优化PDF文本查找功能,支持列表类型查询,新增预处理选项以提高模糊匹配准确性,修复多个匹配结果的处理逻辑
This commit is contained in:
		| @@ -1,7 +1,22 @@ | ||||
| import fitz  # pymupdf | ||||
| import regex   # 支持多行正则 | ||||
| from rapidfuzz import fuzz | ||||
| import re | ||||
def normalize_text(text):
    """Collapse all runs of whitespace in *text* into single spaces.

    Newlines, tabs and repeated spaces are merged into one space, and
    leading/trailing whitespace is removed.

    Args:
        text: Arbitrary input string.

    Returns:
        The normalized string ("" for empty or whitespace-only input).
    """
    # `re` is already imported at module level; the previous
    # function-local `import re` was redundant and has been removed.
    return re.sub(r'\s+', ' ', text.strip())
|  | ||||
|  | ||||
| def clean_text_for_fuzzy_match(text): | ||||
|     """清理文本用于模糊匹配,移除特殊字符,只保留字母数字和空格""" | ||||
|     # 移除标点符号和特殊字符,只保留字母、数字、中文字符和空格 | ||||
|     cleaned = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text) | ||||
|     # 标准化空白字符 | ||||
|     cleaned = re.sub(r'\s+', ' ', cleaned.strip()) | ||||
|     return cleaned | ||||
| def _merge_lines(lines): | ||||
|     """ | ||||
|     把多行文本合并成一段,同时记录每行 bbox 的并集。 | ||||
| @@ -43,16 +58,25 @@ def _collect_lines(page): | ||||
|     return lines | ||||
|  | ||||
| def find_text_in_pdf(pdf_path, | ||||
|                      query, | ||||
|                      query,  # 修改为支持list类型 | ||||
|                      use_regex=False, | ||||
|                      threshold=80,      # rapidfuzz 默认 0~100 | ||||
|                      page_range=None):  # 例如 (1,5) 只搜 1-4 页 | ||||
|                      page_range=None, | ||||
|                      preprocess=True):  # 添加预处理选项 | ||||
|     """ | ||||
|     高级查找函数 | ||||
|     query: 正则表达式字符串 或 普通字符串 | ||||
|     query: 正则表达式字符串 或 普通字符串,或它们的列表 | ||||
|     preprocess: 是否对文本进行预处理以提高模糊匹配准确性 | ||||
|     返回: list[dict] 每个 dict 含 page, bbox, matched_text | ||||
|     """ | ||||
|     results = [] | ||||
|     # 处理单个查询字符串的情况 | ||||
|     if isinstance(query, str): | ||||
|         queries = [query] | ||||
|     else: | ||||
|         queries = query  # 假设已经是列表 | ||||
|         # 初始化结果列表 | ||||
|     batch_results = [[] for _ in queries] | ||||
|      | ||||
|     doc = fitz.open(pdf_path) | ||||
|     pages = range(len(doc)) if page_range is None else range(page_range[0]-1, page_range[1]) | ||||
|  | ||||
| @@ -63,58 +87,88 @@ def find_text_in_pdf(pdf_path, | ||||
|             continue | ||||
|  | ||||
|         full_text, _ = _merge_lines(lines)    # 整页纯文本 | ||||
|         positions = []                        # 记录匹配区间在 full_text 中的起止字符索引 | ||||
|          | ||||
|         # 如果启用预处理,则对整页文本进行预处理 | ||||
|         processed_full_text = full_text | ||||
|         if preprocess and not use_regex: | ||||
|             processed_full_text = clean_text_for_fuzzy_match(full_text) | ||||
|          | ||||
|         # 一次性计算所有查询的匹配结果 | ||||
|         for idx ,q in enumerate(queries): | ||||
|             positions = []                        # 记录匹配区间在 full_text 中的起止字符索引 | ||||
|             results = [] | ||||
|             if use_regex: | ||||
|                 # regex 支持 (?s) 使 . 匹配换行 | ||||
|                 pattern = regex.compile(q) | ||||
|                 for match in pattern.finditer(full_text): | ||||
|                     positions.append((match.start(), match.end(), match.group())) | ||||
|             else: | ||||
|                 # 模糊匹配:滑动窗口(整页 vs 查询) | ||||
|                 # 修改:支持多个匹配结果并计算相似度分数 | ||||
|                 potential_matches = [] | ||||
|                 query_text = q | ||||
|                 # 如果启用预处理,则对查询文本也进行预处理 | ||||
|                 if preprocess: | ||||
|                     query_text = clean_text_for_fuzzy_match(q) | ||||
|                 score = fuzz.partial_ratio(processed_full_text, query_text, score_cutoff=threshold) | ||||
|                 if score >= threshold: | ||||
|                     # 这里简单返回整页;如需精确定位,可再做二次对齐 | ||||
|                     positions.append((0, len(full_text), full_text)) | ||||
|                  | ||||
|                 # query_len = len(query_text) | ||||
|                 # text_len = len(processed_full_text) | ||||
|                  | ||||
|                 # # 优化:只在合理范围内进行滑动窗口匹配 | ||||
|                 # # 添加早期终止机制,一旦找到足够高的匹配就停止搜索 | ||||
|                 # best_score = 0 | ||||
|                 # for i in range(text_len - query_len + 1): | ||||
|                 #     window_text = processed_full_text[i:i + query_len] | ||||
|                 #     # 优化:只处理非空文本 | ||||
|                 #     if window_text.strip(): | ||||
|                 #         # 优化:使用更快速的相似度计算方法 | ||||
|                 #         score = fuzz.partial_ratio(query_text, window_text) | ||||
|                 #         if score >= threshold: | ||||
|                 #             # 优化:记录当前最佳分数 | ||||
|                 #             if score > best_score: | ||||
|                 #                 best_score = score | ||||
|                 #             potential_matches.append((i, i + query_len, window_text, score)) | ||||
|                 #             # 优化:如果找到非常高分的匹配,可以提前终止 | ||||
|                 #             if score >= 95:  # 如果匹配度已经很高,可以提前结束 | ||||
|                 #                 break | ||||
|                  | ||||
|                 # 如果找到了潜在匹配,按分数排序并只取最高分的匹配 | ||||
|                 # if potential_matches: | ||||
|                 #     # 按分数降序排序 | ||||
|                 #     potential_matches.sort(key=lambda x: x[3], reverse=True) | ||||
|                 #     # 只取分数最高的匹配 | ||||
|                 #     best_match = potential_matches[0] | ||||
|                 #     positions.append((best_match[0], best_match[1], best_match[2])) | ||||
|  | ||||
|         if use_regex: | ||||
|             # regex 支持 (?s) 使 . 匹配换行 | ||||
|             pattern = regex.compile(query) | ||||
|             for match in pattern.finditer(full_text): | ||||
|                 positions.append((match.start(), match.end(), match.group())) | ||||
|         else: | ||||
|             # 模糊匹配:滑动窗口(整页 vs 查询) | ||||
|             # 修改:支持多个匹配结果并计算相似度分数 | ||||
|             potential_matches = [] | ||||
|             # 使用不同的方法获取多个可能的匹配 | ||||
|             for i in range(len(full_text) - len(query) + 1): | ||||
|                 if i < 0: | ||||
|                     continue | ||||
|                 window_text = full_text[i:i + len(query)] | ||||
|                 if window_text.strip():  # 只处理非空文本 | ||||
|                     score = fuzz.partial_ratio(query, window_text) | ||||
|                     if score >= threshold: | ||||
|                         potential_matches.append((i, i + len(query), window_text, score)) | ||||
|              | ||||
|             # 如果找到了潜在匹配,按分数排序并只取最高分的匹配 | ||||
|             if potential_matches: | ||||
|                 # 按分数降序排序 | ||||
|                 potential_matches.sort(key=lambda x: x[3], reverse=True) | ||||
|                 # 只取分数最高的匹配 | ||||
|                 best_match = potential_matches[0] | ||||
|                 positions.append((best_match[0], best_match[1], best_match[2])) | ||||
|  | ||||
|         # 将字符区间映射回行 | ||||
|         for start, end, matched_text in positions: | ||||
|             # 计算每一行在 full_text 中的起止字符偏移 | ||||
|             offset = 0 | ||||
|             matched_lines = [] | ||||
|             for text, bbox in lines: | ||||
|                 line_start = offset | ||||
|                 line_end = offset + len(text) | ||||
|                 # 检查该行是否与匹配区间有重叠 - 更严格的条件 | ||||
|                 if line_start < end and line_end > start: | ||||
|                     matched_lines.append((text, bbox)) | ||||
|                 # 修正:正确计算偏移量,包括换行符 | ||||
|                 offset += len(text) + 1  # 加上换行符的长度 | ||||
|             # 修正:只有当确实匹配到文本时才添加结果 | ||||
|             if matched_lines: | ||||
|                 _, merged_bbox = _merge_lines(matched_lines) | ||||
|                 results.append({ | ||||
|                     "page": p + 1, | ||||
|                     "bbox": merged_bbox, | ||||
|                     "matched_text": matched_text | ||||
|                 }) | ||||
|             # 将字符区间映射回行 | ||||
|             for start, end, matched_text in positions: | ||||
|                 # 计算每一行在 full_text 中的起止字符偏移 | ||||
|                 offset = 0 | ||||
|                 matched_lines = [] | ||||
|                 for text, bbox in lines: | ||||
|                     line_start = offset | ||||
|                     line_end = offset + len(text) | ||||
|                     # 检查该行是否与匹配区间有重叠 - 更严格的条件 | ||||
|                     if line_start < end and line_end > start: | ||||
|                         matched_lines.append((text, bbox)) | ||||
|                     # 修正:正确计算偏移量,包括换行符 | ||||
|                     offset += len(text) + 1  # 加上换行符的长度 | ||||
|                 # 修正:只有当确实匹配到文本时才添加结果 | ||||
|                 if matched_lines: | ||||
|                     _, merged_bbox = _merge_lines(matched_lines) | ||||
|                     results.append({ | ||||
|                         "page": p + 1, | ||||
|                         "bbox": merged_bbox, | ||||
|                         "matched_text": matched_text | ||||
|                     }) | ||||
|             if results: | ||||
|                 batch_results[idx].append(results) | ||||
|     doc.close() | ||||
|     return results | ||||
|     return batch_results | ||||
|  | ||||
| def highlight_matches(pdf_path, matches, output_path="highlighted.pdf"): | ||||
|     """ | ||||
| @@ -161,8 +215,19 @@ if __name__ == "__main__": | ||||
|         threshold=75 | ||||
|          | ||||
|     ) | ||||
|     for match in matches: | ||||
|         print(f"第 {match['page']} 页 匹配: {match['matched_text'][:50]}... 位置: {match['bbox']}") | ||||
|     # 修改:正确处理二维列表结果 | ||||
|     print(matches) | ||||
|     print("------------------") | ||||
|     for idx,query_matches in enumerate(matches): | ||||
|         for m_item in query_matches: | ||||
|             highlight_matches(pdf_path, m_item, "example_highlighted.pdf") | ||||
|             for m in m_item: | ||||
|             # 输出匹配结果 | ||||
|             #print(m) | ||||
|  | ||||
|                 print(f"第 {m['page']} 页 匹配: {m['matched_text'][:50]}... 位置: {m['bbox']}") | ||||
|  | ||||
|     # 2. 高亮并保存 | ||||
|     highlight_matches(pdf_path, matches, "example_highlighted.pdf") | ||||
|     # 修改:展平二维列表用于高亮 | ||||
|     # flattened_matches = [match for query_matches in matches for match in query_matches] | ||||
|     # highlight_matches(pdf_path, flattened_matches, "example_highlighted.pdf") | ||||
|   | ||||
		Reference in New Issue
	
	Block a user