新增PDF文本查找功能,支持多行正则和模糊匹配,优化匹配结果返回逻辑
This commit is contained in:
		
							
								
								
									
										168
									
								
								src/find_text_in_pdf_enhanced.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										168
									
								
								src/find_text_in_pdf_enhanced.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,168 @@ | ||||
| import fitz  # pymupdf | ||||
| import regex   # 支持多行正则 | ||||
| from rapidfuzz import fuzz | ||||
|  | ||||
| def _merge_lines(lines): | ||||
|     """ | ||||
|     把多行文本合并成一段,同时记录每行 bbox 的并集。 | ||||
|     lines: list of (text, bbox) | ||||
|     return: (merged_text, merged_bbox) | ||||
|     """ | ||||
|     if not lines: | ||||
|         return "", None | ||||
|     texts, bboxes = zip(*lines) | ||||
|     merged_text = "\n".join(texts) | ||||
|  | ||||
|     # 合并 bbox:取所有 bbox 的最小 x0,y0 和最大 x1,y1 | ||||
|     x0 = min(b[0] for b in bboxes) | ||||
|     y0 = min(b[1] for b in bboxes) | ||||
|     x1 = max(b[2] for b in bboxes) | ||||
|     y1 = max(b[3] for b in bboxes) | ||||
|     # 修改:将坐标转换为整数 | ||||
|     return merged_text, (int(x0), int(y0), int(x1), int(y1)) | ||||
|  | ||||
| def _collect_lines(page): | ||||
|     """ | ||||
|     把一页的所有行按阅读顺序收集起来。 | ||||
|     return: list of (text, bbox) | ||||
|     """ | ||||
|     lines = [] | ||||
|     blocks = page.get_text("dict")["blocks"] | ||||
|     for blk in blocks: | ||||
|         if "lines" not in blk: | ||||
|             continue | ||||
|         for line in blk["lines"]: | ||||
|             line_text = "".join(span["text"] for span in line["spans"]) | ||||
|             # 行级 bbox | ||||
|             x0 = min(span["bbox"][0] for span in line["spans"]) | ||||
|             y0 = min(span["bbox"][1] for span in line["spans"]) | ||||
|             x1 = max(span["bbox"][2] for span in line["spans"]) | ||||
|             y1 = max(span["bbox"][3] for span in line["spans"]) | ||||
|             # 修改:将坐标转换为整数 | ||||
|             lines.append((line_text, (int(x0), int(y0), int(x1), int(y1)))) | ||||
|     return lines | ||||
|  | ||||
def find_text_in_pdf(pdf_path,
                     query,
                     use_regex=False,
                     threshold=80,      # rapidfuzz score scale is 0-100
                     page_range=None):  # e.g. (1, 5) searches pages 1 through 5
    """
    Search a PDF for text, either by regular expression or by fuzzy matching.

    pdf_path:   path handed to fitz.open().
    query:      a regular expression (use_regex=True) or a plain string
                (use_regex=False).
    use_regex:  when true, every match of the pattern on a page is reported.
                The page text is the page's lines joined with "\n", so a
                pattern can span several lines.
                When false, a window of len(query) characters slides over
                the page text, each non-blank window is scored with
                fuzz.partial_ratio, and only the first best-scoring window
                per page (score >= threshold) is reported.
    threshold:  minimum fuzzy score for a window to count as a match.
    page_range: optional (first, last) pair of 1-based page numbers, both
                inclusive. Values outside the document are clamped to the
                existing pages. None searches the whole document.

    return: list of dicts with keys "page" (1-based), "bbox" (union of the
            bboxes of all lines overlapping the match) and "matched_text".

    An invalid regular expression raises from regex.compile() before the
    document is opened. The document is closed even if processing fails.
    """
    # Compile once up front: the pattern is the same for every page, and a
    # bad pattern should fail before any file handle exists.
    pattern = regex.compile(query) if use_regex else None
    query_len = len(query)

    results = []
    doc = fitz.open(pdf_path)
    try:
        if page_range is None:
            pages = range(len(doc))
        else:
            # Clamp to the document: a start below 1 would otherwise turn
            # into a negative index (which load_page treats as counting from
            # the end), and an end past the last page would raise.
            first_index = max(page_range[0] - 1, 0)
            stop_index = min(page_range[1], len(doc))
            pages = range(first_index, stop_index)

        for p in pages:
            page = doc.load_page(p)
            lines = _collect_lines(page)          # [(text, bbox), ...]
            if not lines:
                continue

            full_text, _ = _merge_lines(lines)    # whole page as plain text
            positions = []                        # (start, end, matched_text) in full_text

            if pattern is not None:
                for match in pattern.finditer(full_text):
                    positions.append((match.start(), match.end(), match.group()))
            else:
                # Fuzzy search: keep only the first window with the highest
                # score. The range is empty when the query is longer than
                # the page text.
                best = None
                for i in range(len(full_text) - query_len + 1):
                    window_text = full_text[i:i + query_len]
                    if not window_text.strip():   # skip blank windows
                        continue
                    score = fuzz.partial_ratio(query, window_text)
                    if score >= threshold and (best is None or score > best[3]):
                        best = (i, i + query_len, window_text, score)
                if best is not None:
                    positions.append(best[:3])

            if not positions:
                continue

            # Character span of each line inside full_text. The +1 accounts
            # for the "\n" that _merge_lines puts between lines.
            line_spans = []
            offset = 0
            for text, bbox in lines:
                line_spans.append((offset, offset + len(text), text, bbox))
                offset += len(text) + 1

            # Map each matched character range back to the lines it overlaps.
            for start, end, matched_text in positions:
                matched_lines = [
                    (text, bbox)
                    for line_start, line_end, text, bbox in line_spans
                    if line_start < end and line_end > start
                ]
                # Only report a match that actually touches some line text.
                if matched_lines:
                    _, merged_bbox = _merge_lines(matched_lines)
                    results.append({
                        "page": p + 1,
                        "bbox": merged_bbox,
                        "matched_text": matched_text
                    })
    finally:
        doc.close()
    return results
|  | ||||
def highlight_matches(pdf_path, matches, output_path="highlighted.pdf"):
    """
    Add a highlight annotation for every match and save the result as a new PDF.

    pdf_path:    path of the source PDF, handed to fitz.open().
    matches:     return value of find_text_in_pdf(...): an iterable of dicts
                 with a 1-based "page" number and a "bbox" indexable as
                 (x0, y0, x1, y1).
    output_path: where the annotated copy is written.

    The document is closed even if annotating or saving fails. The success
    message is printed only after the file has been saved.
    """
    doc = fitz.open(pdf_path)
    try:
        for m in matches:
            page = doc.load_page(m["page"] - 1)  # load_page is 0-based
            # Coordinates from find_text_in_pdf are already ints; int() is
            # kept as a safeguard for matches built elsewhere.
            bbox = m["bbox"]
            rect = fitz.Rect(int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]))
            page.add_highlight_annot(rect)       # default highlight colour
        doc.save(output_path)
    finally:
        doc.close()
    print(f"已保存高亮 PDF:{output_path}")
|  | ||||
|  | ||||
|  | ||||
| # ----------------- DEMO ----------------- | ||||
| # if __name__ == "__main__": | ||||
| #     pdf_path = "example.pdf" | ||||
| #     # 例1:正则跨行匹配 | ||||
| #     query_regex = r"条款\s*\d+\.?\s*[\s\S]*?责任限制" | ||||
| #     res = find_text_in_pdf(pdf_path, query_regex, use_regex=True) | ||||
| #     for r in res: | ||||
| #         print(r) | ||||
|  | ||||
| #     # 例2:模糊匹配一句话 | ||||
| #     res2 = find_text_in_pdf(pdf_path, "这是一段可能不完全一样的文本", threshold=75) | ||||
| #     for r in res2: | ||||
| #         print(r) | ||||
|  | ||||
|  | ||||
if __name__ == "__main__":
    # Sample run: locate a long passage in a local PDF and highlight it.
    pdf_path = 'F:\\2\\2024深化智慧城市发展推进城市全域数字化转型的指导意见.pdf'
    query = '''2222二、全领域推进城市数字化转型
(四)建立城市数字化共性基础。深化完善统一规划、统一架构、统一标准、统一运维的城市智能中枢体系,打造线上线下联动、服务管理协同的城市共性支撑平台,构建开放兼容、共性赋能、安全可靠的综合性基础环境,推进算法、模型等数字资源一体集成部署,探索建立共性组件、模块等共享协作机制。鼓励发展基于人工智能等技术的智能分析、智能调度、智能监管、辅助决策,全面支撑赋能城市数字化转型场景建设与发展。鼓励有条件的地方推进城市信息模型、时空大数据、国土空间基础信息、实景三维中国等基础平台功能整合、协同发展、应用赋能,为城市数字化转型提供统一的时空框架,因地制宜有序推进数字孪生城市建设,推动虚实共生、仿真推演、迭代优化的数字孪生场景落地。'''

    # 1. Search. use_regex keeps its default (False), so this is the fuzzy
    #    path with a score threshold of 75.
    matches = find_text_in_pdf(pdf_path, query, threshold=75)
    for hit in matches:
        page_no = hit['page']
        snippet = hit['matched_text'][:50]
        box = hit['bbox']
        print(f"第 {page_no} 页 匹配: {snippet}... 位置: {box}")

    # 2. Highlight the matches and save an annotated copy.
    highlight_matches(pdf_path, matches, "example_highlighted.pdf")
		Reference in New Issue
	
	Block a user