"""Regular-expression processor module.

Provides batch text operations based on regular expressions:
replace, extract, validate, split, batch replace, and match statistics.
"""

import re
from typing import Dict, List, Any, Tuple
class RegexProcessor:
    """Regular-expression text processor.

    Provides regex-based text utilities: substitution, extraction,
    validation, splitting, batch replacement, and match statistics.
    Compiled patterns are cached per (pattern, flags) pair.
    """

    # Single-letter flag characters accepted by the public API, mapped to
    # the corresponding `re` module flags. Unknown letters are ignored.
    # NOTE(review): re.LOCALE ('l') is only valid for bytes patterns in
    # Python 3 — compiling a str pattern with it raises ValueError. Kept
    # for backward compatibility with existing callers.
    _FLAG_MAP = {
        'i': re.IGNORECASE,
        'm': re.MULTILINE,
        's': re.DOTALL,
        'x': re.VERBOSE,
        'u': re.UNICODE,
        'l': re.LOCALE,
    }

    def __init__(self):
        """Initialize the processor with an empty compiled-pattern cache."""
        # Cache of compiled patterns keyed by f"{pattern}:{flags}".
        self.compiled_patterns = {}

    def _compile_pattern(self, pattern: str, flags: str = '') -> re.Pattern:
        """Compile a regex pattern, caching the result for reuse.

        Args:
            pattern (str): Regular-expression pattern.
            flags (str): Flag letters (any of 'imsxul', case-insensitive).
                Unrecognized letters are silently ignored.

        Returns:
            re.Pattern: The compiled pattern object.
        """
        cache_key = f"{pattern}:{flags}"

        if cache_key not in self.compiled_patterns:
            flag_value = 0
            for flag in flags.lower():
                flag_value |= self._FLAG_MAP.get(flag, 0)
            self.compiled_patterns[cache_key] = re.compile(pattern, flag_value)

        return self.compiled_patterns[cache_key]

    def replace(self, content: str, pattern: str, replacement: str, flags: str = '') -> Dict[str, Any]:
        """Replace every regex match in *content* and report details.

        Args:
            content (str): Original text.
            pattern (str): Regular-expression pattern.
            replacement (str): Replacement template; supports backreferences
                such as ``\\1`` and ``\\g<name>`` (expanded via Match.expand).
            flags (str): Flag letters (see :meth:`_compile_pattern`).

        Returns:
            Dict[str, Any]: Replacement result:
                {
                    'result': str,        # text after replacement
                    'matches': int,       # number of matches
                    'groups': List[tuple],# captured groups of matches that have any
                    'replacements': List[Dict[str, Any]]  # per-match details
                }
        """
        regex = self._compile_pattern(pattern, flags)

        # Collect match information up front so counts/groups reflect the
        # original text even though sub() consumes the matches.
        matches = list(regex.finditer(content))
        replacements = []

        def replace_func(match):
            groups = match.groups()
            # expand() resolves backreferences in the replacement template.
            replacement_result = match.expand(replacement)

            replacements.append({
                'original': match.group(0),
                'replaced': replacement_result,
                'start': match.start(),
                'end': match.end(),
                'groups': list(groups) if groups else []
            })

            return replacement_result

        # Perform the substitution.
        result = regex.sub(replace_func, content)

        return {
            'result': result,
            'matches': len(matches),
            'groups': [match.groups() for match in matches if match.groups()],
            'replacements': replacements
        }

    def extract(self, content: str, pattern: str, flags: str = '') -> List[Dict[str, Any]]:
        """Extract all regex matches from *content*.

        Args:
            content (str): Text to search.
            pattern (str): Regular-expression pattern.
            flags (str): Flag letters (see :meth:`_compile_pattern`).

        Returns:
            List[Dict[str, Any]]: One entry per match:
                {
                    'match': str,                 # full match
                    'groups': List[str],          # positional capture groups
                    'start': int,                 # start offset
                    'end': int,                   # end offset
                    'named_groups': Dict[str, str]  # named capture groups
                }
        """
        regex = self._compile_pattern(pattern, flags)
        matches = []

        for match in regex.finditer(content):
            match_info = {
                'match': match.group(0),
                'groups': list(match.groups()) if match.groups() else [],
                'start': match.start(),
                'end': match.end(),
                'named_groups': match.groupdict()
            }
            matches.append(match_info)

        return matches

    def validate(self, content: str, pattern: str, flags: str = '') -> bool:
        """Return True if *content* contains a match for *pattern*.

        Args:
            content (str): Text to test.
            pattern (str): Regular-expression pattern.
            flags (str): Flag letters (see :meth:`_compile_pattern`).

        Returns:
            bool: Whether the pattern matches anywhere in the text.
        """
        regex = self._compile_pattern(pattern, flags)
        return bool(regex.search(content))

    def split(self, content: str, pattern: str, flags: str = '', maxsplit: int = 0) -> Dict[str, Any]:
        """Split *content* on regex matches.

        Args:
            content (str): Text to split.
            pattern (str): Regular-expression pattern used as the separator.
            flags (str): Flag letters (see :meth:`_compile_pattern`).
            maxsplit (int): Maximum number of splits (0 means unlimited).

        Returns:
            Dict[str, Any]: Split result:
                {
                    'parts': List[str],       # resulting fragments
                    'separators': List[str],  # separators actually consumed
                    'count': int              # number of splits performed
                }
        """
        regex = self._compile_pattern(pattern, flags)

        if maxsplit > 0:
            parts = regex.split(content, maxsplit)
        else:
            parts = regex.split(content)

        # Collect the separator texts.
        separators = [match.group(0) for match in regex.finditer(content)]

        # BUGFIX: when maxsplit limits the split, only the first `maxsplit`
        # separators are actually consumed; previously `count`/`separators`
        # reported every match in the text regardless of maxsplit.
        if maxsplit > 0:
            separators = separators[:maxsplit]

        return {
            'parts': parts,
            'separators': separators,
            'count': len(separators)
        }

    def find_and_replace_batch(self, content: str, operations: List[Dict[str, str]]) -> Dict[str, Any]:
        """Apply a sequence of regex replacements, each on the previous result.

        Args:
            content (str): Original text.
            operations (List[Dict[str, str]]): Replacement operations:
                [
                    {
                        'pattern': str,      # regular expression
                        'replacement': str,  # replacement template
                        'flags': str         # optional flag letters
                    }
                ]

        Returns:
            Dict[str, Any]: Batch result:
                {
                    'result': str,                      # final text
                    'operations': List[Dict[str, Any]]  # per-operation details
                }
        """
        result = content
        operation_results = []

        for operation in operations:
            pattern = operation['pattern']
            replacement = operation['replacement']
            flags = operation.get('flags', '')

            # Each operation runs on the output of the previous one.
            replace_result = self.replace(result, pattern, replacement, flags)
            result = replace_result['result']

            operation_results.append({
                'pattern': pattern,
                'replacement': replacement,
                'flags': flags,
                'matches': replace_result['matches'],
                'replacements': replace_result['replacements']
            })

        return {
            'result': result,
            'operations': operation_results
        }

    def get_statistics(self, content: str, pattern: str, flags: str = '') -> Dict[str, Any]:
        """Compute statistics over the matches of *pattern* in *content*.

        Args:
            content (str): Text to analyze.
            pattern (str): Regular-expression pattern.
            flags (str): Flag letters (see :meth:`_compile_pattern`).

        Returns:
            Dict[str, Any]: Match statistics:
                {
                    'total_matches': int,              # total match count
                    'unique_matches': int,             # distinct match texts
                    'average_length': float,           # mean match length
                    'longest_match': str,              # longest match text
                    'shortest_match': str,             # shortest match text
                    'positions': List[Tuple[int, int]] # (start, end) pairs
                }
        """
        matches = self.extract(content, pattern, flags)

        # Empty result keeps the same shape so callers need no special case.
        if not matches:
            return {
                'total_matches': 0,
                'unique_matches': 0,
                'average_length': 0.0,
                'longest_match': '',
                'shortest_match': '',
                'positions': []
            }

        match_texts = [m['match'] for m in matches]
        lengths = [len(text) for text in match_texts]

        return {
            'total_matches': len(matches),
            'unique_matches': len(set(match_texts)),
            'average_length': sum(lengths) / len(lengths),
            'longest_match': max(match_texts, key=len),
            'shortest_match': min(match_texts, key=len),
            'positions': [(m['start'], m['end']) for m in matches]
        }

    def common_patterns(self) -> Dict[str, str]:
        """Return a dictionary of commonly used regex patterns.

        Returns:
            Dict[str, str]: Pattern name -> regex string.
        """
        return {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
            'url': r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[\w/_.])*(?:\?(?:[\w&=%.])*)?(?:#(?:[\w.])*)?)?',
            'phone': r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            'markdown_headers': r'^#{1,6}\s+(.+)$',
            'markdown_links': r'\[([^\]]+)\]\(([^)]+)\)',
            'markdown_images': r'!\[([^\]]*)\]\(([^)]+)\)',
            'markdown_bold': r'\*\*([^*]+)\*\*|__([^_]+)__',
            'markdown_italic': r'\*([^*]+)\*|_([^_]+)_',
            'code_blocks': r'```(\w*)\n(.*?)\n```',
            'inline_code': r'`([^`]+)`',
            'numbers': r'\d+',
            'words': r'\b\w+\b',
            'whitespace': r'\s+',
            'lines': r'.*(?:\n|$)',
            'markdown_tables': r'\|(.+)\|\n\|[-:\| ]+\|\n((?:\|.+\|\n?)*)',
            'markdown_lists': r'^(\s*)[-*+]\s+(.+)$',
            'markdown_quotes': r'^\s*>\s*(.+)',
            'html_tags': r'</?[a-zA-Z][^>]*\u003e',
            'ip_address': r'\b(?:\d{1,3}\.){3}\d{1,3}\b',
            'date_ymd': r'\d{4}-\d{2}-\d{2}',
            'date_dmy': r'\d{2}/\d{2}/\d{4}',
            'time_hms': r'\d{2}:\d{2}:\d{2}',
            'hex_color': r'#(?:[0-9a-fA-F]{3}){1,2}\b',
            'markdown_task_lists': r'^\s*[-*+]\s+\[([ x])\]\s+(.+)$'
        }