Add PDF text search: supports full-document and page-by-page search and returns text coordinates

2025-07-30 08:51:30 +08:00
parent 466fae53c9
commit 44ef61daab
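The new module exposes two entry points: find_text_positions searches the whole document and reports every match, while find_text_in_pdf_per_page reports the first match on each page. A minimal usage sketch (the import assumes src/ is on the Python path; the file name and search string below are placeholders, not part of the commit):

from get_pos_pdf import find_text_positions, find_text_in_pdf_per_page

matches = find_text_positions('example.pdf', 'text to locate')
for m in matches:
    # x/y are pdfminer character coordinates, measured from the bottom-left of the page
    print(m['page'], m['start_x'], m['start_y'], m['end_x'], m['end_y'])

per_page_matches = find_text_in_pdf_per_page('example.pdf', 'text to locate')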

src/get_pos_pdf.py (new file, 219 lines)

@@ -0,0 +1,219 @@
import os
import re

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTText, LTChar, LTAnno

def parse_char_layout(layout):
"""解析页面内容,一个字母一个字母的解析"""
# bbox:
# x0从页面左侧到框左边缘的距离。
# y0从页面底部到框的下边缘的距离。
# x1从页面左侧到方框右边缘的距离。
# y1从页面底部到框的上边缘的距离
char_list = []
for textbox in layout:
if isinstance(textbox, LTText):
for line in textbox:
for char in line:
                    # LTAnno is a virtual character (a line break or space inserted by
                    # the layout analysis); it has no real bbox, so fall back to 0.
                    if isinstance(char, LTAnno):
char_info = {
'x': char.bbox[0] if hasattr(char, 'bbox') else 0,
'y': char.bbox[3] if hasattr(char, 'bbox') else 0,
'char': char.get_text()
}
char_list.append(char_info)
elif isinstance(char, LTChar):
char_info = {
'x': char.bbox[0],
'y': char.bbox[3],
'char': char.get_text()
}
char_list.append(char_info)
return char_list

def normalize_text(text):
    """Normalize text: strip it and collapse any run of whitespace into one space.

    Newlines, tabs and repeated spaces are all treated the same, so the search
    string and the text extracted from the PDF can be compared reliably.
    """
    return re.sub(r'\s+', ' ', text.strip())

def find_text_positions(pdf_path, target_text):
    """Find the given text anywhere in a PDF and return its coordinates.

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to search for

    Returns:
        list: one dict per match, with the page number and start/end coordinates
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
    # Normalize the target text
    normalized_target = normalize_text(target_text)

    # Open the local PDF file
with open(pdf_path, 'rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
        all_chars = []            # characters collected from every page
        page_start_indices = []   # index in all_chars at which each page starts

        # Process every page and collect its characters
for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
page_start_indices.append(len(all_chars))
interpreter.process_page(page)
layout = device.get_result()
char_list = parse_char_layout(layout)
all_chars.extend(char_list)
        # Build a whitespace-collapsed version of the full text together with a map
        # from every character of that normalized text back to its index in
        # all_chars, so a match position can be turned into real coordinates.
        norm_chars = []
        index_map = []
        for idx, char_info in enumerate(all_chars):
            if char_info['char'].isspace():
                # collapse runs of whitespace (including LTAnno line breaks)
                if norm_chars and norm_chars[-1] != ' ':
                    norm_chars.append(' ')
                    index_map.append(idx)
            else:
                norm_chars.append(char_info['char'])
                index_map.append(idx)
        normalized_full_text = ''.join(norm_chars)

        # Find every occurrence of the target in the normalized text
        found_positions = []
        start = 0
        while True:
            pos = normalized_full_text.find(normalized_target, start)
            if pos == -1:
                break
            # Map the match back to its first and last real characters
            start_idx = index_map[pos]
            end_idx = index_map[pos + len(normalized_target) - 1]
            start_char = all_chars[start_idx]
            end_char = all_chars[end_idx]
            # Work out which page the match starts on
            page_num = 1
            for i, page_start in enumerate(page_start_indices):
                if start_idx >= page_start:
                    page_num = i + 1
            position_info = {
                'page': page_num,
                'text': normalized_target,
                'start_x': start_char['x'],
                'start_y': start_char['y'],
                'end_x': end_char['x'],
                'end_y': end_char['y']
            }
            found_positions.append(position_info)
            start = pos + 1

        return found_positions

def find_text_in_pdf_per_page(pdf_path, target_text):
    """Find the given text page by page and return its coordinates.

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to search for

    Returns:
        list: one dict per page containing the text, with start/end coordinates
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
    # Normalize the target text
    normalized_target = normalize_text(target_text)

    # Open the local PDF file
with open(pdf_path, 'rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
        found_positions = []

        # Process each page independently
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)

            # Build a whitespace-collapsed version of the page text together with
            # a map from every normalized character back to its index in char_list
            norm_chars = []
            index_map = []
            for idx, char_info in enumerate(char_list):
                if char_info['char'].isspace():
                    if norm_chars and norm_chars[-1] != ' ':
                        norm_chars.append(' ')
                        index_map.append(idx)
                else:
                    norm_chars.append(char_info['char'])
                    index_map.append(idx)
            normalized_page_text = ''.join(norm_chars)

            # Report the first occurrence of the target on this page
            pos = normalized_page_text.find(normalized_target)
            if pos != -1:
                start_char = char_list[index_map[pos]]
                end_char = char_list[index_map[pos + len(normalized_target) - 1]]
                position_info = {
                    'page': page_num,
                    'text': normalized_target,
                    'start_x': start_char['x'],
                    'start_y': start_char['y'],
                    'end_x': end_char['x'],
                    'end_y': end_char['y']
                }
                found_positions.append(position_info)

        return found_positions

if __name__ == '__main__':
    # Use a local PDF file
    pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'  # change to your PDF file path
    target_text = '''执行方式:
• 在当前 chat 中,已有上下文,但可能混乱
• 新开一个 chat干净的上下文需要填充'''

    try:
        print("Method 1: full-document search")
        positions = find_text_positions(pdf_file_path, target_text)
        if positions:
            print("Text found at the following positions:")
            for pos in positions:
                print(f"Page: {pos['page']}, start: ({pos['start_x']:.2f}, {pos['start_y']:.2f}), end: ({pos['end_x']:.2f}, {pos['end_y']:.2f})")
        else:
            print("Text not found")

        print("\nMethod 2: page-by-page search")
        positions = find_text_in_pdf_per_page(pdf_file_path, target_text)
        if positions:
            print("Text found at the following positions:")
            for pos in positions:
                print(f"Page: {pos['page']}, start: ({pos['start_x']:.2f}, {pos['start_y']:.2f}), end: ({pos['end_x']:.2f}, {pos['end_y']:.2f})")
        else:
            print("Text not found")
    except FileNotFoundError as e:
        print(e)
    except Exception as e:
        print(f"Error while processing the PDF: {e}")