优化PDF文本查找功能，支持列表类型查询，新增预处理选项以提高模糊匹配准确性，修复多个匹配结果的处理逻辑

2025-08-05 18:24:49 +08:00
parent 020de8da5d
commit c8f96ee41e
1 changed file with 122 additions and 57 deletions
--- a/src/find_text_in_pdf_enhanced.py
+++ b/src/find_text_in_pdf_enhanced.py
@@ -1,7 +1,22 @@
 import fitz  # pymupdf
 import regex   # 支持多行正则
 from rapidfuzz import fuzz
+import re
+def normalize_text(text):
+    """标准化文本，移除多余空白字符"""
+    # 将换行符、制表符等替换为空格，然后合并多个空格为一个
+    import re
+    normalized = re.sub(r'\s+', ' ', text.strip())
+    return normalized

+
+def clean_text_for_fuzzy_match(text):
+    """清理文本用于模糊匹配，移除特殊字符，只保留字母数字和空格"""
+    # 移除标点符号和特殊字符，只保留字母、数字、中文字符和空格
+    cleaned = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)
+    # 标准化空白字符
+    cleaned = re.sub(r'\s+', ' ', cleaned.strip())
+    return cleaned
 def _merge_lines(lines):
    """
    把多行文本合并成一段，同时记录每行 bbox 的并集。
@@ -43,16 +58,25 @@ def _collect_lines(page):
    return lines

 def find_text_in_pdf(pdf_path,
-                     query,
+                     query,  # 修改为支持list类型
                     use_regex=False,
                     threshold=80,      # rapidfuzz 默认 0~100
-                     page_range=None):  # 例如 (1,5) 只搜 1-4 页
+                     page_range=None,
+                     preprocess=True):  # 添加预处理选项
    """
    高级查找函数
-    query: 正则表达式字符串 或 普通字符串
+    query: 正则表达式字符串 或 普通字符串，或它们的列表
+    preprocess: 是否对文本进行预处理以提高模糊匹配准确性
    返回: list[dict] 每个 dict 含 page, bbox, matched_text
    """
-    results = []
+    # 处理单个查询字符串的情况
+    if isinstance(query, str):
+        queries = [query]
+    else:
+        queries = query  # 假设已经是列表
+        # 初始化结果列表
+    batch_results = [[] for _ in queries]
+    
    doc = fitz.open(pdf_path)
    pages = range(len(doc)) if page_range is None else range(page_range[0]-1, page_range[1])

@@ -63,58 +87,88 @@ def find_text_in_pdf(pdf_path,
            continue

        full_text, _ = _merge_lines(lines)    # 整页纯文本
-        positions = []                        # 记录匹配区间在 full_text 中的起止字符索引
+        
+        # 如果启用预处理，则对整页文本进行预处理
+        processed_full_text = full_text
+        if preprocess and not use_regex:
+            processed_full_text = clean_text_for_fuzzy_match(full_text)
+        
+        # 一次性计算所有查询的匹配结果
+        for idx ,q in enumerate(queries):
+            positions = []                        # 记录匹配区间在 full_text 中的起止字符索引
+            results = []
+            if use_regex:
+                # regex 支持 (?s) 使 . 匹配换行
+                pattern = regex.compile(q)
+                for match in pattern.finditer(full_text):
+                    positions.append((match.start(), match.end(), match.group()))
+            else:
+                # 模糊匹配：滑动窗口（整页 vs 查询）
+                # 修改：支持多个匹配结果并计算相似度分数
+                potential_matches = []
+                query_text = q
+                # 如果启用预处理，则对查询文本也进行预处理
+                if preprocess:
+                    query_text = clean_text_for_fuzzy_match(q)
+                score = fuzz.partial_ratio(processed_full_text, query_text, score_cutoff=threshold)
+                if score >= threshold:
+                    # 这里简单返回整页；如需精确定位，可再做二次对齐
+                    positions.append((0, len(full_text), full_text))
+                
+                # query_len = len(query_text)
+                # text_len = len(processed_full_text)
+                
+                # # 优化：只在合理范围内进行滑动窗口匹配
+                # # 添加早期终止机制，一旦找到足够高的匹配就停止搜索
+                # best_score = 0
+                # for i in range(text_len - query_len + 1):
+                #     window_text = processed_full_text[i:i + query_len]
+                #     # 优化：只处理非空文本
+                #     if window_text.strip():
+                #         # 优化：使用更快速的相似度计算方法
+                #         score = fuzz.partial_ratio(query_text, window_text)
+                #         if score >= threshold:
+                #             # 优化：记录当前最佳分数
+                #             if score > best_score:
+                #                 best_score = score
+                #             potential_matches.append((i, i + query_len, window_text, score))
+                #             # 优化：如果找到非常高分的匹配，可以提前终止
+                #             if score >= 95:  # 如果匹配度已经很高，可以提前结束
+                #                 break
+                
+                # 如果找到了潜在匹配，按分数排序并只取最高分的匹配
+                # if potential_matches:
+                #     # 按分数降序排序
+                #     potential_matches.sort(key=lambda x: x[3], reverse=True)
+                #     # 只取分数最高的匹配
+                #     best_match = potential_matches[0]
+                #     positions.append((best_match[0], best_match[1], best_match[2]))

-        if use_regex:
-            # regex 支持 (?s) 使 . 匹配换行
-            pattern = regex.compile(query)
-            for match in pattern.finditer(full_text):
-                positions.append((match.start(), match.end(), match.group()))
-        else:
-            # 模糊匹配：滑动窗口（整页 vs 查询）
-            # 修改：支持多个匹配结果并计算相似度分数
-            potential_matches = []
-            # 使用不同的方法获取多个可能的匹配
-            for i in range(len(full_text) - len(query) + 1):
-                if i < 0:
-                    continue
-                window_text = full_text[i:i + len(query)]
-                if window_text.strip():  # 只处理非空文本
-                    score = fuzz.partial_ratio(query, window_text)
-                    if score >= threshold:
-                        potential_matches.append((i, i + len(query), window_text, score))
-            
-            # 如果找到了潜在匹配，按分数排序并只取最高分的匹配
-            if potential_matches:
-                # 按分数降序排序
-                potential_matches.sort(key=lambda x: x[3], reverse=True)
-                # 只取分数最高的匹配
-                best_match = potential_matches[0]
-                positions.append((best_match[0], best_match[1], best_match[2]))
-
-        # 将字符区间映射回行
-        for start, end, matched_text in positions:
-            # 计算每一行在 full_text 中的起止字符偏移
-            offset = 0
-            matched_lines = []
-            for text, bbox in lines:
-                line_start = offset
-                line_end = offset + len(text)
-                # 检查该行是否与匹配区间有重叠 - 更严格的条件
-                if line_start < end and line_end > start:
-                    matched_lines.append((text, bbox))
-                # 修正：正确计算偏移量，包括换行符
-                offset += len(text) + 1  # 加上换行符的长度
-            # 修正：只有当确实匹配到文本时才添加结果
-            if matched_lines:
-                _, merged_bbox = _merge_lines(matched_lines)
-                results.append({
-                    "page": p + 1,
-                    "bbox": merged_bbox,
-                    "matched_text": matched_text
-                })
+            # 将字符区间映射回行
+            for start, end, matched_text in positions:
+                # 计算每一行在 full_text 中的起止字符偏移
+                offset = 0
+                matched_lines = []
+                for text, bbox in lines:
+                    line_start = offset
+                    line_end = offset + len(text)
+                    # 检查该行是否与匹配区间有重叠 - 更严格的条件
+                    if line_start < end and line_end > start:
+                        matched_lines.append((text, bbox))
+                    # 修正：正确计算偏移量，包括换行符
+                    offset += len(text) + 1  # 加上换行符的长度
+                # 修正：只有当确实匹配到文本时才添加结果
+                if matched_lines:
+                    _, merged_bbox = _merge_lines(matched_lines)
+                    results.append({
+                        "page": p + 1,
+                        "bbox": merged_bbox,
+                        "matched_text": matched_text
+                    })
+            if results:
+                batch_results[idx].append(results)
    doc.close()
-    return results
+    return batch_results

 def highlight_matches(pdf_path, matches, output_path="highlighted.pdf"):
    """
@@ -161,8 +215,19 @@ if __name__ == "__main__":
        threshold=75
        
    )
-    for match in matches:
-        print(f"第 {match['page']} 页 匹配: {match['matched_text'][:50]}... 位置: {match['bbox']}")
+    # 修改：正确处理二维列表结果
+    print(matches)
+    print("------------------")
+    for idx,query_matches in enumerate(matches):
+        for m_item in query_matches:
+            highlight_matches(pdf_path, m_item, "example_highlighted.pdf")
+            for m in m_item:
+            # 输出匹配结果
+            #print(m)
+
+                print(f"第 {m['page']} 页 匹配: {m['matched_text'][:50]}... 位置: {m['bbox']}")

    # 2. 高亮并保存
-    highlight_matches(pdf_path, matches, "example_highlighted.pdf")
+    # 修改：展平二维列表用于高亮
+    # flattened_matches = [match for query_matches in matches for match in query_matches]
+    # highlight_matches(pdf_path, flattened_matches, "example_highlighted.pdf")