Implement smart fuzzy text search with exact, fuzzy, and partial matching; refine the text-coordinate return logic

2025-07-30 12:48:11 +08:00
parent 44ef61daab
commit 73557a272d


@@ -1,6 +1,8 @@
import requests
import io
import os
import re
from difflib import SequenceMatcher
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
@@ -44,6 +46,386 @@ def normalize_text(text):
    normalized = re.sub(r'\s+', ' ', text.strip())
    return normalized
def clean_text_for_fuzzy_match(text):
    """Clean text for fuzzy matching: drop special characters, keeping only letters, digits, CJK characters, and spaces."""
    # Remove punctuation and special characters
    cleaned = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)
    # Normalize whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned.strip())
    return cleaned
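
# Quick sanity check of the cleaning behavior (hypothetical inputs):
# punctuation is stripped, whitespace is collapsed, CJK characters survive.
#   clean_text_for_fuzzy_match("Hello,  world!")       -> "Hello world"
#   clean_text_for_fuzzy_match("执行方式: chat 上下文")   -> "执行方式 chat 上下文"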
def find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold=0.8):
    """
    Fuzzily search a PDF for the given text and return its coordinates.

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to search for
        similarity_threshold (float): similarity threshold (0-1), default 0.8

    Returns:
        list: coordinate entries for each match
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    # Clean the target text
    cleaned_target = clean_text_for_fuzzy_match(target_text)

    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        found_positions = []

        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)

            # Join the page's characters into a single string
            page_text = ''.join([char_info['char'] for char_info in char_list])
            cleaned_page_text = clean_text_for_fuzzy_match(page_text)

            # Slide a window over the page text looking for similar spans
            target_len = len(cleaned_target)
            if target_len == 0:
                continue

            # Collect every window that clears the threshold
            matches = []
            for i in range(len(cleaned_page_text) - target_len + 1):
                window_text = cleaned_page_text[i:i + target_len]
                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
                if similarity >= similarity_threshold:
                    # Record the position and similarity of the match.
                    # NOTE: i indexes the cleaned text; reusing it against
                    # char_list is approximate once cleaning removed characters.
                    if i < len(char_list):
                        matches.append({
                            'start_idx': i,
                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
                            'similarity': similarity
                        })

            # Merge adjacent match windows
            if matches:
                # Sort by start position
                matches.sort(key=lambda x: x['start_idx'])

                # Merge adjacent or overlapping windows
                merged_matches = []
                current_match = matches[0].copy()  # work on a copy

                for i in range(1, len(matches)):
                    next_match = matches[i]
                    # Merge when the next window is adjacent to or overlaps the
                    # current one: its start <= current end + a small buffer
                    if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
                        # Merge the index ranges
                        current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
                        # Length-weighted average similarity
                        total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
                                       (next_match['end_idx'] - next_match['start_idx'] + 1)
                        current_match['similarity'] = (
                            current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
                            next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
                        ) / total_length
                    else:
                        # Not adjacent: store the current window, start a new one
                        merged_matches.append(current_match)
                        current_match = next_match.copy()  # work on a copy

                # Store the last window
                merged_matches.append(current_match)

                # Produce coordinate info for each merged window
                for match in merged_matches:
                    start_idx = match['start_idx']
                    end_idx = match['end_idx']
                    if start_idx < len(char_list) and end_idx < len(char_list):
                        # Characters covered by the match
                        matched_chars = char_list[start_idx:end_idx + 1]

                        # Drop characters whose coordinates are 0 (usually specials)
                        valid_chars = [char for char in matched_chars
                                       if char['x'] > 0 and char['y'] > 0]
                        # Fall back to all characters if none are valid
                        chars_to_use = valid_chars if valid_chars else matched_chars

                        # Compute the bounding box (left, right, top, bottom)
                        if chars_to_use:
                            left = min([char['x'] for char in chars_to_use])
                            right = max([char['x'] for char in chars_to_use])
                            bottom = min([char['y'] for char in chars_to_use])
                            top = max([char['y'] for char in chars_to_use])

                            # The matched text content
                            matched_text = ''.join([char_info['char'] for char_info in chars_to_use])

                            # Only keep results with a valid bounding box
                            if left >= 0 and right > left and top > bottom:
                                position = [
                                    page_num,
                                    left,    # left
                                    right,   # right
                                    top,     # top
                                    bottom,  # bottom
                                    matched_text,        # matched content
                                    match['similarity']  # similarity score
                                ]
                                found_positions.append(position)

    return found_positions
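
# Usage sketch (assumed sample path and query text, not part of the commit):
def _demo_fuzzy_find(pdf_path='sample.pdf'):
    # Each hit is [page, left, right, top, bottom, matched_text, similarity].
    for page, left, right, top, bottom, text, score in find_fuzzy_text_positions(pdf_path, '要查找的文本', similarity_threshold=0.8):
        print(f"page {page}: bbox=({left:.1f}, {bottom:.1f})-({right:.1f}, {top:.1f}) score={score:.2f}")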
"""
在PDF中模糊查找指定文本并返回坐标
Args:
pdf_path (str): PDF文件路径
target_text (str): 要查找的文本
similarity_threshold (float): 相似度阈值 (0-1)默认0.8
Returns:
list: 包含匹配文本坐标信息的列表
"""
if not os.path.exists(pdf_path):
raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
# 清理目标文本
cleaned_target = clean_text_for_fuzzy_match(target_text)
# 打开本地PDF文件
with open(pdf_path, 'rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
found_positions = []
# 处理每一页
for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
interpreter.process_page(page)
layout = device.get_result()
char_list = parse_char_layout(layout)
# 将页面字符组合成文本
page_text = ''.join([char_info['char'] for char_info in char_list])
cleaned_page_text = clean_text_for_fuzzy_match(page_text)
# 滑动窗口查找相似文本
target_len = len(cleaned_target)
if target_len == 0:
continue
# 存储所有匹配的块
matches = []
for i in range(len(cleaned_page_text) - target_len + 1):
window_text = cleaned_page_text[i:i + target_len]
similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
if similarity >= similarity_threshold:
# 找到匹配项,记录位置和相似度
if i < len(char_list):
matches.append({
'start_idx': i,
'end_idx': min(i + target_len - 1, len(char_list) - 1),
'similarity': similarity
})
# 合并相邻的匹配块
if matches:
# 按起始位置排序
matches.sort(key=lambda x: x['start_idx'])
# 合并相邻或重叠的匹配块
merged_matches = []
current_match = matches[0].copy() # 创建副本
for i in range(1, len(matches)):
next_match = matches[i]
# 如果下一个匹配块与当前块相邻或重叠,则合并
# 判断条件:下一个块的起始位置 <= 当前块的结束位置 + 一些缓冲距离
if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
# 合并索引范围
current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
# 计算加权平均相似度
total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
(next_match['end_idx'] - next_match['start_idx'] + 1)
current_match['similarity'] = (
current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
) / total_length
else:
# 不相邻,保存当前块,开始新的块
merged_matches.append(current_match)
current_match = next_match.copy() # 创建副本
# 添加最后一个块
merged_matches.append(current_match)
# 为每个合并后的块生成坐标信息
for match in merged_matches:
start_idx = match['start_idx']
end_idx = match['end_idx']
if start_idx < len(char_list) and end_idx < len(char_list):
# 获取匹配区域的所有字符
matched_chars = char_list[start_idx:end_idx+1]
# 计算边界框 (left, right, top, bottom)
if matched_chars:
# 计算边界值
left = min([char['x'] for char in matched_chars])
right = max([char['x'] for char in matched_chars])
bottom = min([char['y'] for char in matched_chars])
top = max([char['y'] for char in matched_chars])
# 获取匹配的文本内容
matched_text = ''.join([char_info['char'] for char_info in matched_chars])
position = [
page_num,
left, # left
right, # right
top, # top
bottom, # bottom
matched_text, # 添加匹配的内容
match['similarity'] # 添加相似度信息
]
found_positions.append(position)
return found_positions
"""
在PDF中模糊查找指定文本并返回坐标
Args:
pdf_path (str): PDF文件路径
target_text (str): 要查找的文本
similarity_threshold (float): 相似度阈值 (0-1)默认0.8
Returns:
list: 包含匹配文本坐标信息的列表
"""
if not os.path.exists(pdf_path):
raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
# 清理目标文本
cleaned_target = clean_text_for_fuzzy_match(target_text)
# 打开本地PDF文件
with open(pdf_path, 'rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
found_positions = []
# 处理每一页
for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
interpreter.process_page(page)
layout = device.get_result()
char_list = parse_char_layout(layout)
# 将页面字符组合成文本
page_text = ''.join([char_info['char'] for char_info in char_list])
cleaned_page_text = clean_text_for_fuzzy_match(page_text)
# 滑动窗口查找相似文本
target_len = len(cleaned_target)
if target_len == 0:
continue
# 存储所有匹配的块
matches = []
for i in range(len(cleaned_page_text) - target_len + 1):
window_text = cleaned_page_text[i:i + target_len]
similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
if similarity >= similarity_threshold:
# 找到匹配项,记录位置和相似度
if i < len(char_list):
matches.append({
'start_idx': i,
'end_idx': min(i + target_len - 1, len(char_list) - 1),
'similarity': similarity
})
# 合并相邻的匹配块
if matches:
# 按起始位置排序
matches.sort(key=lambda x: x['start_idx'])
# 合并相邻或重叠的匹配块
merged_matches = []
current_match = matches[0]
for i in range(1, len(matches)):
next_match = matches[i]
# 如果下一个匹配块与当前块相邻或重叠,则合并
if next_match['start_idx'] <= current_match['end_idx'] + target_len:
# 合并索引范围
current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
# 平均相似度
current_match['similarity'] = (current_match['similarity'] + next_match['similarity']) / 2
else:
# 不相邻,保存当前块,开始新的块
merged_matches.append(current_match)
current_match = next_match
# 添加最后一个块
merged_matches.append(current_match)
# 为每个合并后的块生成坐标信息
for match in merged_matches:
start_idx = match['start_idx']
end_idx = match['end_idx']
if start_idx < len(char_list) and end_idx < len(char_list):
# 获取匹配区域的所有字符
matched_chars = char_list[start_idx:end_idx+1]
# 计算边界框 (left, right, top, bottom)
if matched_chars:
# 计算边界值
left = min([char['x'] for char in matched_chars])
right = max([char['x'] for char in matched_chars])
bottom = min([char['y'] for char in matched_chars])
top = max([char['y'] for char in matched_chars])
# 获取匹配的文本内容
matched_text = ''.join([char_info['char'] for char_info in matched_chars])
position = [
page_num,
left, # left
right, # right
top, # top
bottom, # bottom
matched_text, # 添加匹配的内容
match['similarity'] # 添加相似度信息
]
found_positions.append(position)
return found_positions
def find_text_positions(pdf_path, target_text):
    """
    Find the given text in a PDF and return its coordinates.
@@ -106,15 +488,25 @@ def find_text_positions(pdf_path, target_text):
if pos >= page_start: if pos >= page_start:
page_num = i + 1 page_num = i + 1
position_info = { # 获取匹配的文本内容
'page': page_num, matched_text = ''.join([char_info['char'] for char_info in all_chars[pos:pos+len(normalized_target)]])
'text': normalized_target,
'start_x': start_char['x'], # 计算边界框 (left, right, top, bottom)
'start_y': start_char['y'], left = min(start_char['x'], end_char['x'])
'end_x': end_char['x'], right = max(start_char['x'], end_char['x'])
'end_y': end_char['y'] bottom = min(start_char['y'], end_char['y'])
} top = max(start_char['y'], end_char['y'])
found_positions.append(position_info)
position=[
page_num,
left, # left
right, # right
top, # top
bottom, # bottom
matched_text, # 添加匹配的内容
1.0 # 添加相似度信息精确匹配为1.0
]
found_positions.append(position)
start = pos + 1 start = pos + 1
@@ -169,47 +561,153 @@ def find_text_in_pdf_per_page(pdf_path, target_text):
if end_pos < len(char_list): if end_pos < len(char_list):
end_char = char_list[end_pos] end_char = char_list[end_pos]
position_info = { # 获取匹配的文本内容
'page': page_num, matched_text = ''.join([char_info['char'] for char_info in char_list[pos:pos+len(normalized_target)]])
'text': normalized_target,
'start_x': start_char['x'], # 计算边界框 (left, right, top, bottom)
'start_y': start_char['y'], left = min(start_char['x'], end_char['x'])
'end_x': end_char['x'], right = max(start_char['x'], end_char['x'])
'end_y': end_char['y'] bottom = min(start_char['y'], end_char['y'])
} top = max(start_char['y'], end_char['y'])
found_positions.append(position_info)
position=[
page_num,
left, # left
right, # right
top, # top
bottom, # bottom
matched_text, # 添加匹配的内容
1.0 # 添加相似度信息精确匹配为1.0
]
found_positions.append(position)
return found_positions return found_positions
def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
    """
    Find partial matches of the text (suited to longer passages).

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to search for
        min_match_ratio (float): minimum keyword match ratio (0-1)

    Returns:
        list: coordinate entries for each match
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    # Split the target text into keywords or phrases
    normalized_target = normalize_text(target_text)
    # Keep the longer words as keywords (a rough stop-word filter)
    keywords = [word for word in normalized_target.split() if len(word) > 2]
    if not keywords:
        keywords = normalized_target.split()  # no long words: use all words

    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        found_positions = []

        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)

            # Join the page's characters into a string and normalize it
            page_text = ''.join([char_info['char'] for char_info in char_list])
            normalized_page_text = normalize_text(page_text)

            # Count how many keywords appear on the page
            matched_keywords = 0
            for keyword in keywords:
                if keyword in normalized_page_text:
                    matched_keywords += 1

            # Treat the page as a match if enough keywords appear
            if len(keywords) > 0 and (matched_keywords / len(keywords)) >= min_match_ratio:
                # For simplicity, return the coordinates of the page's
                # first and last characters
                if char_list:
                    start_char = char_list[0]
                    end_char = char_list[-1]
                    match_ratio = matched_keywords / len(keywords)

                    # Use the page text as the matched content
                    matched_text = ''.join([char_info['char'] for char_info in char_list])

                    # Compute the bounding box (left, right, top, bottom)
                    left = min(start_char['x'], end_char['x'])
                    right = max(start_char['x'], end_char['x'])
                    bottom = min(start_char['y'], end_char['y'])
                    top = max(start_char['y'], end_char['y'])

                    position = [
                        page_num,
                        left,    # left
                        right,   # right
                        top,     # top
                        bottom,  # bottom
                        (matched_text[:100] + "...") if len(matched_text) > 100 else matched_text,  # matched content (truncated)
                        match_ratio  # keyword match ratio
                    ]
                    found_positions.append(position)

    return found_positions
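
# Worked example of the keyword-ratio test (hypothetical values): a target
# "machine learning model evaluation" yields keywords ['machine', 'learning',
# 'model', 'evaluation']; a page containing 3 of the 4 gives a ratio of
# 3/4 = 0.75, which passes the default min_match_ratio of 0.7.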
def smart_fuzzy_find_text(pdf_path, target_text, similarity_threshold=0.8):
    """
    Smart fuzzy text search combining several strategies.

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to search for
        similarity_threshold (float): similarity threshold

    Returns:
        list: coordinate entries for each match
    """
    # Strategy 1: exact match
    exact_results = find_text_in_pdf_per_page(pdf_path, target_text)
    if exact_results:
        return exact_results

    # Strategy 2: fuzzy match
    fuzzy_results = find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold)
    if fuzzy_results:
        return fuzzy_results

    # Strategy 3: partial (keyword) match
    partial_results = find_partial_text_positions(pdf_path, target_text, 0.5)
    return partial_results
if __name__ == '__main__':
    # Use a local PDF file
    pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'  # change this to your PDF file path
    target_text = '''创建 `plan` 文件: 固化和锁定最终的"怎么做"
基于 `plan` 执行: 精准驱动 AI 完成任务'''

    try:
        print("Smart fuzzy search:")
        positions = smart_fuzzy_find_text(pdf_file_path, target_text, similarity_threshold=0.7)
        if positions:
            print("Found text at the following positions:")
            for pos in positions:
                if len(pos) >= 7:  # includes matched content and similarity info
                    print(f"Page: {pos[0]}, BBox: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f}), Similarity: {pos[6]:.2f}")
                    print(f"Matched content: {pos[5][:50]}{'...' if len(pos[5]) > 50 else ''}")
                    print("-" * 50)
                else:
                    print(f"Page: {pos[0]}, BBox: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
        else:
            print("Text not found")