新增PDF文本查找功能,支持多行正则和模糊匹配,优化匹配结果返回逻辑
This commit is contained in:
168
src/find_text_in_pdf_enhanced.py
Normal file
168
src/find_text_in_pdf_enhanced.py
Normal file
@@ -0,0 +1,168 @@
|
|||||||
|
import fitz # pymupdf
|
||||||
|
import regex # 支持多行正则
|
||||||
|
from rapidfuzz import fuzz
|
||||||
|
|
||||||
|
def _merge_lines(lines):
|
||||||
|
"""
|
||||||
|
把多行文本合并成一段,同时记录每行 bbox 的并集。
|
||||||
|
lines: list of (text, bbox)
|
||||||
|
return: (merged_text, merged_bbox)
|
||||||
|
"""
|
||||||
|
if not lines:
|
||||||
|
return "", None
|
||||||
|
texts, bboxes = zip(*lines)
|
||||||
|
merged_text = "\n".join(texts)
|
||||||
|
|
||||||
|
# 合并 bbox:取所有 bbox 的最小 x0,y0 和最大 x1,y1
|
||||||
|
x0 = min(b[0] for b in bboxes)
|
||||||
|
y0 = min(b[1] for b in bboxes)
|
||||||
|
x1 = max(b[2] for b in bboxes)
|
||||||
|
y1 = max(b[3] for b in bboxes)
|
||||||
|
# 修改:将坐标转换为整数
|
||||||
|
return merged_text, (int(x0), int(y0), int(x1), int(y1))
|
||||||
|
|
||||||
|
def _collect_lines(page):
|
||||||
|
"""
|
||||||
|
把一页的所有行按阅读顺序收集起来。
|
||||||
|
return: list of (text, bbox)
|
||||||
|
"""
|
||||||
|
lines = []
|
||||||
|
blocks = page.get_text("dict")["blocks"]
|
||||||
|
for blk in blocks:
|
||||||
|
if "lines" not in blk:
|
||||||
|
continue
|
||||||
|
for line in blk["lines"]:
|
||||||
|
line_text = "".join(span["text"] for span in line["spans"])
|
||||||
|
# 行级 bbox
|
||||||
|
x0 = min(span["bbox"][0] for span in line["spans"])
|
||||||
|
y0 = min(span["bbox"][1] for span in line["spans"])
|
||||||
|
x1 = max(span["bbox"][2] for span in line["spans"])
|
||||||
|
y1 = max(span["bbox"][3] for span in line["spans"])
|
||||||
|
# 修改:将坐标转换为整数
|
||||||
|
lines.append((line_text, (int(x0), int(y0), int(x1), int(y1))))
|
||||||
|
return lines
|
||||||
|
|
||||||
|
def _best_fuzzy_window(full_text, query, threshold):
    """Return (start, end, text) of the best fuzzy window in full_text, or None.

    Slides a window of len(query) characters over full_text, scores each
    non-blank window with fuzz.partial_ratio and keeps the highest score that
    is >= threshold. On equal scores the earliest window wins.
    """
    width = len(query)
    best = None
    best_score = None
    # When the query is longer than the text the range is empty -> no match.
    for i in range(len(full_text) - width + 1):
        window_text = full_text[i:i + width]
        if not window_text.strip():  # ignore whitespace-only windows
            continue
        score = fuzz.partial_ratio(query, window_text)
        if score >= threshold and (best_score is None or score > best_score):
            best = (i, i + width, window_text)
            best_score = score
    return best


def _lines_overlapping(line_spans, start, end):
    """Return the (text, bbox) lines whose character span intersects [start, end)."""
    return [
        line
        for line_start, line_end, line in line_spans
        if line_start < end and line_end > start
    ]


def find_text_in_pdf(pdf_path,
                     query,
                     use_regex=False,
                     threshold=80,
                     page_range=None):
    """
    Search a PDF page by page and report where the query was found.

    Each page's lines are joined with newlines into one text. In regex mode
    every match of the pattern on that text is reported. In fuzzy mode only
    the single best-scoring window per page (if it reaches the threshold) is
    reported.

    Args:
        pdf_path: path of the PDF to open.
        query: a regular expression (when use_regex is true) or a plain
            string to match approximately. For regex queries, inline flags
            such as (?s) can be used to let "." match across line breaks.
        use_regex: treat query as a pattern for the `regex` module.
        threshold: minimum rapidfuzz score (0-100 scale) for a fuzzy hit.
        page_range: optional (first, last) pair of 1-based page numbers,
            both inclusive -- (1, 5) searches pages 1 through 5. The range is
            clamped to the pages that exist. None searches every page.

    Returns:
        list of dicts with keys "page" (1-based page number), "bbox"
        (integer box covering every line the match touches) and
        "matched_text".
    """
    # Compile once instead of once per page; an invalid pattern fails before
    # the document is opened.
    pattern = regex.compile(query) if use_regex else None

    results = []
    doc = fitz.open(pdf_path)
    try:
        if page_range is None:
            pages = range(len(doc))
        else:
            first = max(page_range[0] - 1, 0)
            last = min(page_range[1], len(doc))
            pages = range(first, last)

        for p in pages:
            page = doc.load_page(p)
            lines = _collect_lines(page)  # [(text, bbox), ...]
            if not lines:
                continue

            full_text, _ = _merge_lines(lines)  # plain text of the whole page

            # Character span of every line inside full_text. The +1 accounts
            # for the newline inserted between consecutive lines.
            line_spans = []
            offset = 0
            for line in lines:
                line_length = len(line[0])
                line_spans.append((offset, offset + line_length, line))
                offset += line_length + 1

            # (start, end, matched_text) character intervals in full_text.
            if pattern is not None:
                positions = [
                    (match.start(), match.end(), match.group())
                    for match in pattern.finditer(full_text)
                ]
            else:
                best = _best_fuzzy_window(full_text, query, threshold)
                positions = [] if best is None else [best]

            # Map each character interval back to the lines it touches.
            for start, end, matched_text in positions:
                matched_lines = _lines_overlapping(line_spans, start, end)
                if not matched_lines:
                    # e.g. a zero-width regex match overlaps no line
                    continue
                _, merged_bbox = _merge_lines(matched_lines)
                results.append({
                    "page": p + 1,
                    "bbox": merged_bbox,
                    "matched_text": matched_text
                })
    finally:
        # Release the document even if a page fails to load or parse.
        doc.close()
    return results
|
||||||
|
|
||||||
|
def highlight_matches(pdf_path, matches, output_path="highlighted.pdf"):
    """
    Add a highlight annotation for every match and save the result as a new PDF.

    Args:
        pdf_path: path of the source PDF.
        matches: iterable of dicts as returned by find_text_in_pdf(...); each
            needs a 1-based "page" number and a "bbox" indexable as
            (x0, y0, x1, y1).
        output_path: where the annotated copy is written.

    Prints a confirmation message once the file has been saved.
    """
    doc = fitz.open(pdf_path)
    try:
        for m in matches:
            page = doc.load_page(m["page"] - 1)  # load_page is 0-based
            bbox = m["bbox"]
            # Coordinates are coerced to int, matching what find_text_in_pdf emits.
            rect = fitz.Rect(int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]))
            page.add_highlight_annot(rect)
        doc.save(output_path)
    finally:
        # Close the document even if annotating or saving fails.
        doc.close()
    print(f"已保存高亮 PDF:{output_path}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------- DEMO -----------------
|
||||||
|
# if __name__ == "__main__":
|
||||||
|
# pdf_path = "example.pdf"
|
||||||
|
# # 例1:正则跨行匹配
|
||||||
|
# query_regex = r"条款\s*\d+\.?\s*[\s\S]*?责任限制"
|
||||||
|
# res = find_text_in_pdf(pdf_path, query_regex, use_regex=True)
|
||||||
|
# for r in res:
|
||||||
|
# print(r)
|
||||||
|
|
||||||
|
# # 例2:模糊匹配一句话
|
||||||
|
# res2 = find_text_in_pdf(pdf_path, "这是一段可能不完全一样的文本", threshold=75)
|
||||||
|
# for r in res2:
|
||||||
|
# print(r)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Sample run against a local document.
    pdf_path = 'F:\\2\\2024深化智慧城市发展推进城市全域数字化转型的指导意见.pdf'
    query = '''2222二、全领域推进城市数字化转型
(四)建立城市数字化共性基础。深化完善统一规划、统一架构、统一标准、统一运维的城市智能中枢体系,打造线上线下联动、服务管理协同的城市共性支撑平台,构建开放兼容、共性赋能、安全可靠的综合性基础环境,推进算法、模型等数字资源一体集成部署,探索建立共性组件、模块等共享协作机制。鼓励发展基于人工智能等技术的智能分析、智能调度、智能监管、辅助决策,全面支撑赋能城市数字化转型场景建设与发展。鼓励有条件的地方推进城市信息模型、时空大数据、国土空间基础信息、实景三维中国等基础平台功能整合、协同发展、应用赋能,为城市数字化转型提供统一的时空框架,因地制宜有序推进数字孪生城市建设,推动虚实共生、仿真推演、迭代优化的数字孪生场景落地。'''

    # 1. Locate the passage. use_regex is left at its default, so this is a
    #    fuzzy search with a score threshold of 75.
    matches = find_text_in_pdf(pdf_path, query, threshold=75)
    for match in matches:
        print(f"第 {match['page']} 页 匹配: {match['matched_text'][:50]}... 位置: {match['bbox']}")

    # 2. Highlight the hits and write the annotated copy.
    highlight_matches(pdf_path, matches, "example_highlighted.pdf")
|
Reference in New Issue
Block a user