Add PDF text search: full-document and per-page search, returning text coordinate information
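The new module src/get_pos_pdf.py exposes find_text_positions (search across the whole document) and find_text_in_pdf_per_page (search each page separately); both return dicts with the page number and the start/end coordinates of each match. A minimal usage sketch, assuming pdfminer.six is installed, the repository root is on the import path, and the file path and search string below are placeholders:

    # Usage sketch: 'sample.pdf' and the search string are placeholders, not files from this repo.
    from src.get_pos_pdf import find_text_positions, find_text_in_pdf_per_page

    matches = find_text_positions('sample.pdf', 'text to locate')        # scans the whole document
    per_page = find_text_in_pdf_per_page('sample.pdf', 'text to locate') # at most one hit per page
    for m in matches + per_page:
        print(m['page'], m['start_x'], m['start_y'], m['end_x'], m['end_y'])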
219 src/get_pos_pdf.py Normal file
@@ -0,0 +1,219 @@
import os
import re

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTText, LTChar, LTAnno


def parse_char_layout(layout):
    """Parse the page layout character by character."""
    # bbox:
    #   x0: distance from the left of the page to the left edge of the box
    #   y0: distance from the bottom of the page to the lower edge of the box
    #   x1: distance from the left of the page to the right edge of the box
    #   y1: distance from the bottom of the page to the upper edge of the box
    char_list = []
    for textbox in layout:
        if isinstance(textbox, LTText):
            for line in textbox:
                for char in line:
                    # If the char is a line break or a space, the word is complete
                    if isinstance(char, LTAnno):
                        # LTAnno (inserted spaces/newlines) carries no bbox, so fall back to 0
                        char_info = {
                            'x': char.bbox[0] if hasattr(char, 'bbox') else 0,
                            'y': char.bbox[3] if hasattr(char, 'bbox') else 0,
                            'char': char.get_text()
                        }
                        char_list.append(char_info)
                    elif isinstance(char, LTChar):
                        char_info = {
                            'x': char.bbox[0],
                            'y': char.bbox[3],
                            'char': char.get_text()
                        }
                        char_list.append(char_info)
    return char_list


def normalize_text(text):
    """Normalize text by collapsing extra whitespace."""
    # Replace newlines, tabs, etc. with spaces, then collapse runs of whitespace into a single space
    normalized = re.sub(r'\s+', ' ', text.strip())
    return normalized


def find_text_positions(pdf_path, target_text):
    """
    Search the whole PDF for the given text and return its coordinates.

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to search for

    Returns:
        list: coordinate information for each match
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Normalize the target text
    normalized_target = normalize_text(target_text)

    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        all_chars = []           # characters collected from every page
        page_start_indices = []  # index in all_chars where each page starts

        # Process each page and collect its characters
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            page_start_indices.append(len(all_chars))
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)
            all_chars.extend(char_list)

        # Join all characters into one string and normalize it
        full_text = ''.join([char_info['char'] for char_info in all_chars])
        normalized_full_text = normalize_text(full_text)

        # Search the normalized text for the target text
        found_positions = []
        start = 0
        while True:
            pos = normalized_full_text.find(normalized_target, start)
            if pos == -1:
                break

            # Match found: look up the corresponding coordinates.
            # Note: offsets in the normalized text are used directly as indices
            # into all_chars, which is only approximate once normalization has
            # collapsed whitespace.
            if pos < len(all_chars):
                start_char = all_chars[pos]
                end_pos = pos + len(normalized_target) - 1
                if end_pos < len(all_chars):
                    end_char = all_chars[end_pos]
                    # Work out which page the match starts on
                    page_num = 1
                    for i, page_start in enumerate(page_start_indices):
                        if pos >= page_start:
                            page_num = i + 1

                    position_info = {
                        'page': page_num,
                        'text': normalized_target,
                        'start_x': start_char['x'],
                        'start_y': start_char['y'],
                        'end_x': end_char['x'],
                        'end_y': end_char['y']
                    }
                    found_positions.append(position_info)

            start = pos + 1

        return found_positions


def find_text_in_pdf_per_page(pdf_path, target_text):
    """
    Search the PDF page by page for the given text and return its coordinates.

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to search for

    Returns:
        list: coordinate information for each match
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    # Normalize the target text
    normalized_target = normalize_text(target_text)

    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        found_positions = []

        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)

            # Join the page's characters into one string and normalize it
            page_text = ''.join([char_info['char'] for char_info in char_list])
            normalized_page_text = normalize_text(page_text)

            # Search the page text for the target text
            pos = normalized_page_text.find(normalized_target)
            if pos != -1:
                # Match found: look up the corresponding coordinates
                if pos < len(char_list):
                    start_char = char_list[pos]
                    end_pos = pos + len(normalized_target) - 1
                    if end_pos < len(char_list):
                        end_char = char_list[end_pos]

                        position_info = {
                            'page': page_num,
                            'text': normalized_target,
                            'start_x': start_char['x'],
                            'start_y': start_char['y'],
                            'end_x': end_char['x'],
                            'end_y': end_char['y']
                        }
                        found_positions.append(position_info)

        return found_positions


if __name__ == '__main__':
    # Use a local PDF file
    pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'  # change to your own PDF path
    target_text = '''执行方式:
• 在当前 chat 中,已有上下文,但可能混乱
• 新开一个 chat,干净的上下文,需要填充'''

    try:
        print("Method 1: full-document search")
        positions = find_text_positions(pdf_file_path, target_text)
        if positions:
            print("Text found at the following positions:")
            for pos in positions:
                print(f"Page: {pos['page']}, start: ({pos['start_x']:.2f}, {pos['start_y']:.2f}), end: ({pos['end_x']:.2f}, {pos['end_y']:.2f})")
        else:
            print("Text not found")

        print("\nMethod 2: page-by-page search")
        positions = find_text_in_pdf_per_page(pdf_file_path, target_text)
        if positions:
            print("Text found at the following positions:")
            for pos in positions:
                print(f"Page: {pos['page']}, start: ({pos['start_x']:.2f}, {pos['start_y']:.2f}), end: ({pos['end_x']:.2f}, {pos['end_y']:.2f})")
        else:
            print("Text not found")

    except FileNotFoundError as e:
        print(e)
    except Exception as e:
        print(f"Error while processing the PDF: {e}")