新增PDF文本查找功能,支持全文和逐页搜索,返回文本坐标信息
This commit is contained in:
219
src/get_pos_pdf.py
Normal file
219
src/get_pos_pdf.py
Normal file
@@ -0,0 +1,219 @@
|
||||
import requests
|
||||
import io
|
||||
import os
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.converter import PDFPageAggregator
|
||||
from pdfminer.layout import LAParams, LTText, LTChar, LTAnno
|
||||
|
||||
def parse_char_layout(layout):
    """Collect every character of a page layout together with its position.

    Each top-level element of ``layout`` that is an ``LTText`` is walked
    recursively until the individual ``LTChar`` / ``LTAnno`` objects are
    reached. Top-level elements that are not ``LTText`` are skipped.

    The walk is recursive rather than a fixed "box -> line -> char" triple
    loop because an ``LTText`` element is not guaranteed to be nested exactly
    that deep (for example a text line or a single character placed directly
    in the layout); the fixed loop raised ``TypeError`` when it tried to
    iterate a character.

    Bounding boxes are read as ``bbox = (x0, y0, x1, y1)``:
        x0: distance from the left of the page to the left edge of the box.
        y0: distance from the bottom of the page to the lower edge of the box.
        x1: distance from the left of the page to the right edge of the box.
        y1: distance from the bottom of the page to the upper edge of the box.

    Args:
        layout: Iterable of layout elements for one page.

    Returns:
        list[dict]: One dict per character, in traversal order, with keys
        ``'x'`` (``bbox[0]``), ``'y'`` (``bbox[3]``) and ``'char'``
        (``get_text()``). ``LTAnno`` entries without a ``bbox`` attribute get
        ``0`` for both coordinates.
    """
    char_list = []

    def _collect(node):
        if isinstance(node, LTChar):
            char_list.append({
                'x': node.bbox[0],
                'y': node.bbox[3],
                'char': node.get_text(),
            })
        elif isinstance(node, LTAnno):
            # Annotations (inserted spaces / line breaks) may carry no bbox.
            has_bbox = hasattr(node, 'bbox')
            char_list.append({
                'x': node.bbox[0] if has_bbox else 0,
                'y': node.bbox[3] if has_bbox else 0,
                'char': node.get_text(),
            })
        elif isinstance(node, LTText):
            try:
                children = iter(node)
            except TypeError:
                # A text item that is neither a character nor a container.
                return
            for child in children:
                _collect(child)

    for element in layout:
        _collect(element)
    return char_list
|
||||
|
||||
def normalize_text(text):
    """Return ``text`` with its whitespace normalized.

    Leading and trailing whitespace is removed, and every run of whitespace
    characters (spaces, tabs, line breaks, ...) is replaced by one space.
    """
    # split() without arguments cuts on whitespace runs and drops the empty
    # pieces at both ends, so re-joining with one space does both steps.
    words = text.split()
    return ' '.join(words)
|
||||
|
||||
def _iter_page_chars(pdf_path):
    """Yield ``(page_number, char_list)`` for each page of the PDF, in order.

    Page numbers start at 1 and ``char_list`` is the result of
    ``parse_char_layout`` for that page. The file stays open while the
    generator is consumed because pages are read from it as they are iterated.
    """
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            yield page_num, parse_char_layout(device.get_result())


def _normalize_with_index(chars):
    """Normalize the text of ``chars`` and map it back to the source entries.

    The text is normalized with the same rules as ``normalize_text``: leading
    and trailing whitespace is dropped and every whitespace run becomes one
    space.

    Args:
        chars (list): Dicts with a ``'char'`` key, as produced by
            ``parse_char_layout``.

    Returns:
        tuple: ``(normalized_text, index_map)`` where ``index_map[i]`` is the
        index in ``chars`` of the entry that produced ``normalized_text[i]``.
        A collapsed whitespace run maps to the first entry of the run.
    """
    pieces = []
    index_map = []
    pending_space = None  # source index of the first char of an open whitespace run
    for idx, info in enumerate(chars):
        # One layout character may carry several text characters.
        for ch in info['char']:
            if ch.isspace():
                # Leading whitespace (nothing emitted yet) is dropped.
                if pending_space is None and pieces:
                    pending_space = idx
                continue
            if pending_space is not None:
                # Emit the collapsed run only once real text follows it, so
                # trailing whitespace never reaches the output.
                pieces.append(' ')
                index_map.append(pending_space)
                pending_space = None
            pieces.append(ch)
            index_map.append(idx)
    return ''.join(pieces), index_map


def _locate_matches(chars, char_pages, normalized_target):
    """Return position dicts for every occurrence of the target in ``chars``.

    Args:
        chars (list): Character dicts (``'x'``, ``'y'``, ``'char'``).
        char_pages (list): Page number of each entry of ``chars``.
        normalized_target (str): Target text, already normalized.

    Returns:
        list: One dict per (possibly overlapping) occurrence with keys
        ``'page'``, ``'text'``, ``'start_x'``, ``'start_y'``, ``'end_x'`` and
        ``'end_y'``. The page is that of the first matched character. An
        empty target yields an empty list.
    """
    matches = []
    if not normalized_target:
        return matches

    text, index_map = _normalize_with_index(chars)
    last_offset = len(normalized_target) - 1
    pos = text.find(normalized_target)
    while pos != -1:
        # Translate offsets in the normalized text back to the raw character
        # list; the two differ as soon as any whitespace was collapsed.
        first_idx = index_map[pos]
        last_idx = index_map[pos + last_offset]
        start_char = chars[first_idx]
        end_char = chars[last_idx]
        matches.append({
            'page': char_pages[first_idx],
            'text': normalized_target,
            'start_x': start_char['x'],
            'start_y': start_char['y'],
            'end_x': end_char['x'],
            'end_y': end_char['y'],
        })
        pos = text.find(normalized_target, pos + 1)
    return matches


def find_text_positions(pdf_path, target_text):
    """Find the target text in the whole PDF and return its coordinates.

    The characters of all pages are concatenated before searching, so a match
    may start on one page and end on the next; it is reported on the page of
    its first character.

    Args:
        pdf_path (str): Path of the PDF file.
        target_text (str): Text to look for. Whitespace is normalized before
            searching.

    Returns:
        list: One dict per occurrence with keys ``'page'``, ``'text'``,
        ``'start_x'``, ``'start_y'``, ``'end_x'`` and ``'end_y'``. Empty when
        nothing matches or the normalized target is empty.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    normalized_target = normalize_text(target_text)

    all_chars = []   # characters of every page, in reading order
    char_pages = []  # page number of each entry of all_chars
    for page_num, char_list in _iter_page_chars(pdf_path):
        all_chars.extend(char_list)
        char_pages.extend([page_num] * len(char_list))

    return _locate_matches(all_chars, char_pages, normalized_target)


def find_text_in_pdf_per_page(pdf_path, target_text):
    """Find the target text page by page and return its coordinates.

    Every page is searched on its own, so a match never spans two pages. All
    occurrences on a page are reported.

    Args:
        pdf_path (str): Path of the PDF file.
        target_text (str): Text to look for. Whitespace is normalized before
            searching.

    Returns:
        list: One dict per occurrence with keys ``'page'``, ``'text'``,
        ``'start_x'``, ``'start_y'``, ``'end_x'`` and ``'end_y'``. Empty when
        nothing matches or the normalized target is empty.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    normalized_target = normalize_text(target_text)

    found_positions = []
    for page_num, char_list in _iter_page_chars(pdf_path):
        page_numbers = [page_num] * len(char_list)
        found_positions.extend(
            _locate_matches(char_list, page_numbers, normalized_target)
        )
    return found_positions
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Demo: run both search strategies against a local PDF file.
    pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'  # change this to the path of your PDF file
    target_text = '''执行方式:
• 在当前 chat 中,已有上下文,但可能混乱
• 新开一个 chat,干净的上下文,需要填充'''

    try:
        # (heading printed before the search, search function), in run order.
        searches = (
            ("方法1:全文搜索", find_text_positions),
            ("\n方法2:逐页搜索", find_text_in_pdf_per_page),
        )
        for heading, search in searches:
            print(heading)
            positions = search(pdf_file_path, target_text)
            if not positions:
                print("未找到文本")
                continue
            print("找到文本在以下位置:")
            for pos in positions:
                print(f"页面: {pos['page']}, 起始坐标: ({pos['start_x']:.2f}, {pos['start_y']:.2f}), 结束坐标: ({pos['end_x']:.2f}, {pos['end_y']:.2f})")
    except FileNotFoundError as e:
        print(e)
    except Exception as e:
        # Top-level boundary of the demo script: report the failure and exit.
        print(f"处理PDF时出错: {e}")
|
Reference in New Issue
Block a user