Compare commits
	
		
			2 Commits
		
	
	
		
			657e3cb9e5
			...
			c8f96ee41e
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| c8f96ee41e | |||
| 020de8da5d | 
							
								
								
									
										233
									
								
								src/find_text_in_pdf_enhanced.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										233
									
								
								src/find_text_in_pdf_enhanced.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,233 @@
 | 
				
			|||||||
 | 
					import math
					import re

					import fitz  # pymupdf
					import regex  # supports multi-line regular expressions
					from rapidfuzz import fuzz
 | 
				
			||||||
 | 
					def normalize_text(text):
					    """Collapse every run of whitespace in *text* into a single space.

					    Leading and trailing whitespace is stripped first, so the result never
					    starts or ends with a space.

					    Args:
					        text: The string to normalize.

					    Returns:
					        The stripped string with each run of newlines, tabs and spaces
					        replaced by exactly one space character.
					    """
					    # Uses the module-level ``re`` import; the former function-local
					    # ``import re`` was redundant and has been dropped.
					    return re.sub(r'\s+', ' ', text.strip())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def clean_text_for_fuzzy_match(text):
					    """Strip punctuation and special characters ahead of fuzzy matching.

					    Only letters, digits, CJK ideographs and whitespace survive; every run
					    of whitespace is then collapsed into a single space and the result is
					    stripped at both ends.

					    Args:
					        text: The string to clean.

					    Returns:
					        The cleaned string.
					    """
					    # ``\w`` also matches the underscore, which is neither a letter nor a
					    # digit, so it is removed explicitly to honour the contract above.
					    without_symbols = re.sub(r'[^\w\s\u4e00-\u9fff]|_', '', text)
					    # Normalize whitespace after removal so gaps left by dropped symbols
					    # do not produce double spaces.
					    return re.sub(r'\s+', ' ', without_symbols.strip())
 | 
				
			||||||
 | 
					def _merge_lines(lines):
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    把多行文本合并成一段,同时记录每行 bbox 的并集。
 | 
				
			||||||
 | 
					    lines: list of (text, bbox)
 | 
				
			||||||
 | 
					    return: (merged_text, merged_bbox)
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    if not lines:
 | 
				
			||||||
 | 
					        return "", None
 | 
				
			||||||
 | 
					    texts, bboxes = zip(*lines)
 | 
				
			||||||
 | 
					    merged_text = "\n".join(texts)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # 合并 bbox:取所有 bbox 的最小 x0,y0 和最大 x1,y1
 | 
				
			||||||
 | 
					    x0 = min(b[0] for b in bboxes)
 | 
				
			||||||
 | 
					    y0 = min(b[1] for b in bboxes)
 | 
				
			||||||
 | 
					    x1 = max(b[2] for b in bboxes)
 | 
				
			||||||
 | 
					    y1 = max(b[3] for b in bboxes)
 | 
				
			||||||
 | 
					    # 修改:将坐标转换为整数
 | 
				
			||||||
 | 
					    return merged_text, (int(x0), int(y0), int(x1), int(y1))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _collect_lines(page):
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    把一页的所有行按阅读顺序收集起来。
 | 
				
			||||||
 | 
					    return: list of (text, bbox)
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    lines = []
 | 
				
			||||||
 | 
					    blocks = page.get_text("dict")["blocks"]
 | 
				
			||||||
 | 
					    for blk in blocks:
 | 
				
			||||||
 | 
					        if "lines" not in blk:
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					        for line in blk["lines"]:
 | 
				
			||||||
 | 
					            line_text = "".join(span["text"] for span in line["spans"])
 | 
				
			||||||
 | 
					            # 行级 bbox
 | 
				
			||||||
 | 
					            x0 = min(span["bbox"][0] for span in line["spans"])
 | 
				
			||||||
 | 
					            y0 = min(span["bbox"][1] for span in line["spans"])
 | 
				
			||||||
 | 
					            x1 = max(span["bbox"][2] for span in line["spans"])
 | 
				
			||||||
 | 
					            y1 = max(span["bbox"][3] for span in line["spans"])
 | 
				
			||||||
 | 
					            # 修改:将坐标转换为整数
 | 
				
			||||||
 | 
					            lines.append((line_text, (int(x0), int(y0), int(x1), int(y1))))
 | 
				
			||||||
 | 
					    return lines
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def find_text_in_pdf(pdf_path,
					                     query,
					                     use_regex=False,
					                     threshold=80,
					                     page_range=None,
					                     preprocess=True):
					    """Search the text of a PDF for one or more queries.

					    Two modes are supported:

					    * ``use_regex=True``: each query is compiled with the ``regex`` module
					      and matched against the page text (lines joined by newlines), so a
					      pattern may span several lines. Every match is reported with the
					      union of the boxes of the lines it overlaps.
					    * ``use_regex=False``: each query is scored against the whole page
					      with ``rapidfuzz.fuzz.partial_ratio``. When the score reaches
					      ``threshold`` the entire page is reported as a single match (full
					      page text, union box of all lines); no finer localisation is done.

					    Args:
					        pdf_path: Path of the PDF, passed to ``fitz.open``.
					        query: A single query string, or an iterable of query strings.
					        use_regex: Treat queries as regular expressions instead of
					            fuzzy-matching them.
					        threshold: Minimum ``partial_ratio`` score (0-100) for a fuzzy hit.
					            Not used in regex mode.
					        page_range: Optional ``(first, last)`` pair of 1-based, inclusive
					            page numbers; ``None`` searches every page. Page numbers
					            outside the document are ignored.
					        preprocess: In fuzzy mode, pass both the page text and the query
					            through ``clean_text_for_fuzzy_match`` before scoring. Not
					            used in regex mode.

					    Returns:
					        A list with one entry per query, in query order. Each entry is a
					        list holding one item per page that produced matches, and each
					        such item is a list of dicts with the keys ``"page"`` (1-based page
					        number), ``"bbox"`` and ``"matched_text"``.
					    """
					    # Accept a bare string or any iterable of strings (list, tuple, ...).
					    queries = [query] if isinstance(query, str) else list(query)
					    batch_results = [[] for _ in queries]

					    # Per-query work that does not depend on the page is done once here
					    # instead of once per page.
					    if use_regex:
					        patterns = [regex.compile(q) for q in queries]
					        needles = None
					    else:
					        patterns = None
					        needles = [
					            clean_text_for_fuzzy_match(q) if preprocess else q
					            for q in queries
					        ]

					    doc = fitz.open(pdf_path)
					    try:
					        page_count = len(doc)
					        if page_range is None:
					            pages = range(page_count)
					        else:
					            # Clamp to the document: a start of 0 would otherwise become
					            # index -1, and an end past the last page would fail to load.
					            first_index = max(page_range[0] - 1, 0)
					            stop_index = min(page_range[1], page_count)
					            pages = range(first_index, stop_index)

					        for p in pages:
					            lines = _collect_lines(doc.load_page(p))  # [(text, bbox), ...]
					            if not lines:
					                continue

					            full_text, _ = _merge_lines(lines)  # page text, newline-joined

					            # Character interval occupied by each line inside full_text.
					            line_spans = []
					            offset = 0
					            for text, bbox in lines:
					                line_spans.append((offset, offset + len(text), text, bbox))
					                offset += len(text) + 1  # +1 for the joining newline

					            if use_regex:
					                haystack = None
					            elif preprocess:
					                haystack = clean_text_for_fuzzy_match(full_text)
					            else:
					                haystack = full_text

					            for idx in range(len(queries)):
					                # (start, end, matched_text) intervals inside full_text.
					                if use_regex:
					                    positions = [
					                        (m.start(), m.end(), m.group())
					                        for m in patterns[idx].finditer(full_text)
					                    ]
					                else:
					                    score = fuzz.partial_ratio(
					                        haystack, needles[idx], score_cutoff=threshold
					                    )
					                    # A fuzzy hit is reported as the whole page.
					                    if score >= threshold:
					                        positions = [(0, len(full_text), full_text)]
					                    else:
					                        positions = []

					                results = []
					                for start, end, matched_text in positions:
					                    # Keep the lines whose interval overlaps the match.
					                    matched_lines = [
					                        (text, bbox)
					                        for line_start, line_end, text, bbox in line_spans
					                        if line_start < end and line_end > start
					                    ]
					                    if not matched_lines:
					                        continue
					                    _, merged_bbox = _merge_lines(matched_lines)
					                    results.append({
					                        "page": p + 1,
					                        "bbox": merged_bbox,
					                        "matched_text": matched_text,
					                    })

					                if results:
					                    batch_results[idx].append(results)
					    finally:
					        # Release the document even when a page fails to load or parse.
					        doc.close()

					    return batch_results
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def highlight_matches(pdf_path, matches, output_path="highlighted.pdf"):
					    """Write a copy of a PDF with a highlight annotation over every match.

					    Args:
					        pdf_path: Path of the source PDF, passed to ``fitz.open``.
					        matches: Flat iterable of dicts, each with a 1-based ``"page"``
					            number and a ``"bbox"`` of four coordinates. These are the
					            innermost dicts produced by ``find_text_in_pdf``; its nested
					            return value has to be flattened before it is passed here.
					        output_path: Path the annotated document is saved to.
					    """
					    doc = fitz.open(pdf_path)
					    try:
					        for m in matches:
					            page = doc.load_page(m["page"] - 1)  # load_page is 0-based
					            bbox = m["bbox"]
					            # Coordinates are coerced to int so boxes from other callers
					            # are handled the same way as the ones built in this module.
					            rect = fitz.Rect(int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]))
					            page.add_highlight_annot(rect)
					        doc.save(output_path)
					    finally:
					        # Close the document even if a page lookup or the save fails.
					        doc.close()
					    print(f"已保存高亮 PDF:{output_path}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ----------------- DEMO -----------------
 | 
				
			||||||
 | 
					# if __name__ == "__main__":
 | 
				
			||||||
 | 
					#     pdf_path = "example.pdf"
 | 
				
			||||||
 | 
					#     # 例1:正则跨行匹配
 | 
				
			||||||
 | 
					#     query_regex = r"条款\s*\d+\.?\s*[\s\S]*?责任限制"
 | 
				
			||||||
 | 
					#     res = find_text_in_pdf(pdf_path, query_regex, use_regex=True)
 | 
				
			||||||
 | 
					#     for r in res:
 | 
				
			||||||
 | 
					#         print(r)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#     # 例2:模糊匹配一句话
 | 
				
			||||||
 | 
					#     res2 = find_text_in_pdf(pdf_path, "这是一段可能不完全一样的文本", threshold=75)
 | 
				
			||||||
 | 
					#     for r in res2:
 | 
				
			||||||
 | 
					#         print(r)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == "__main__":
					    # Demo: fuzzy-search one long paragraph in a local PDF, print the hits
					    # and write a highlighted copy of the document.
					    pdf_path = 'F:\\2\\2024深化智慧城市发展推进城市全域数字化转型的指导意见.pdf'
					    query = '''2222二、全领域推进城市数字化转型
					(四)建立城市数字化共性基础。深化完善统一规划、统一架构、统一标准、统一运维的城市智能中枢体系,打造线上线下联动、服务管理协同的城市共性支撑平台,构建开放兼容、共性赋能、安全可靠的综合性基础环境,推进算法、模型等数字资源一体集成部署,探索建立共性组件、模块等共享协作机制。鼓励发展基于人工智能等技术的智能分析、智能调度、智能监管、辅助决策,全面支撑赋能城市数字化转型场景建设与发展。鼓励有条件的地方推进城市信息模型、时空大数据、国土空间基础信息、实景三维中国等基础平台功能整合、协同发展、应用赋能,为城市数字化转型提供统一的时空框架,因地制宜有序推进数字孪生城市建设,推动虚实共生、仿真推演、迭代优化的数字孪生场景落地。'''

					    # 1. Fuzzy search: use_regex keeps its default (False), so the query
					    #    is scored against each page rather than compiled as a pattern.
					    matches = find_text_in_pdf(
					        pdf_path,
					        query,
					        threshold=75,
					    )
					    print(matches)
					    print("------------------")

					    # The result is nested as query -> page -> match; flatten it once so
					    # that reporting and highlighting work on the same flat list.
					    flat_matches = [
					        m
					        for query_matches in matches
					        for page_matches in query_matches
					        for m in page_matches
					    ]
					    for m in flat_matches:
					        print(f"第 {m['page']} 页 匹配: {m['matched_text'][:50]}... 位置: {m['bbox']}")

					    # 2. Highlight and save in a single pass. Calling highlight_matches
					    #    once per page group reopened the source PDF every time and
					    #    overwrote the output, leaving only the last group's highlights.
					    if flat_matches:
					        highlight_matches(pdf_path, flat_matches, "example_highlighted.pdf")
 | 
				
			||||||
		Reference in New Issue
	
	Block a user