实现智能模糊文本查找功能,支持精确、模糊和部分匹配,优化文本坐标返回逻辑

This commit is contained in:
2025-07-30 12:48:11 +08:00
parent 44ef61daab
commit 73557a272d

View File

@@ -1,6 +1,8 @@
import requests
import io
import os
import re
from difflib import SequenceMatcher
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
@@ -44,6 +46,386 @@ def normalize_text(text):
normalized = re.sub(r'\s+', ' ', text.strip())
return normalized
def clean_text_for_fuzzy_match(text):
    """Reduce *text* to a canonical form suitable for fuzzy comparison.

    Every character that is neither a word character (letters, digits,
    underscore, CJK ideographs) nor whitespace is deleted. The remainder is
    trimmed and each run of whitespace is collapsed to a single space.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The cleaned text; may be empty.
    """
    # Step 1: drop punctuation and other symbols.
    without_symbols = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)
    # Step 2: trim the ends, then squeeze interior whitespace runs.
    return re.sub(r'\s+', ' ', without_symbols.strip())
def find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold=0.8):
    """Fuzzy-search a PDF for *target_text* and return the matching regions.

    Both the target and each page's text are reduced to the same cleaned form
    (punctuation removed, whitespace collapsed). A window the length of the
    cleaned target slides over the cleaned page text, and every window whose
    ``SequenceMatcher`` ratio reaches ``similarity_threshold`` is recorded.
    Nearby windows are merged, and each merged span is mapped back to the
    page's original characters to compute its coordinates.

    Args:
        pdf_path (str): Path of the PDF file to search.
        target_text (str): Text to look for.
        similarity_threshold (float): Minimum similarity ratio (0-1) a window
            must reach to count as a match. Defaults to 0.8.

    Returns:
        list: One entry per merged match, each a list of the form
        ``[page_num, left, right, top, bottom, matched_text, similarity]``.
        Page numbers start at 1. Empty when nothing matches or when the
        target contains no searchable characters after cleaning.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    # The cleaned target is the same for every page, so compute it once and
    # bail out before parsing the PDF when there is nothing to search for.
    cleaned_target = clean_text_for_fuzzy_match(target_text)
    target_len = len(cleaned_target)
    if target_len == 0:
        return []

    found_positions = []
    # The file must stay open while pages are being interpreted.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            # Each entry is read below through its 'char', 'x' and 'y' keys.
            char_list = parse_char_layout(layout)

            # Offsets in the cleaned text differ from offsets in char_list as
            # soon as any character is removed, so keep an explicit mapping.
            cleaned_page_text, index_map = _clean_chars_with_index_map(char_list)

            matches = []
            for i in range(len(cleaned_page_text) - target_len + 1):
                window_text = cleaned_page_text[i:i + target_len]
                matcher = SequenceMatcher(None, cleaned_target, window_text)
                # quick_ratio() is an upper bound on ratio(), so a window that
                # fails it can be rejected without the expensive comparison.
                if matcher.quick_ratio() < similarity_threshold:
                    continue
                similarity = matcher.ratio()
                if similarity >= similarity_threshold:
                    matches.append({
                        'start_idx': i,
                        'end_idx': i + target_len - 1,
                        'similarity': similarity,
                    })

            if not matches:
                continue

            for match in _merge_match_windows(matches, min(target_len, 10)):
                # Translate cleaned-text offsets back to char_list indices.
                first_char = index_map[match['start_idx']]
                last_char = index_map[match['end_idx']]
                position = _build_match_position(
                    page_num,
                    char_list[first_char:last_char + 1],
                    match['similarity'],
                )
                if position is not None:
                    found_positions.append(position)

    return found_positions


# Character classes mirroring the patterns in clean_text_for_fuzzy_match.
_FUZZY_SPACE_RE = re.compile(r'\s')
_FUZZY_KEEP_RE = re.compile(r'[\w\u4e00-\u9fff]')


def _clean_chars_with_index_map(char_list):
    """Clean a page's characters while tracking where each kept one came from.

    Applies the same rules as ``clean_text_for_fuzzy_match`` (symbols dropped,
    leading/trailing whitespace removed, whitespace runs collapsed to one
    space) and returns ``(cleaned_text, index_map)`` where ``index_map[k]`` is
    the index in ``char_list`` that produced ``cleaned_text[k]``.
    """
    cleaned_chars = []
    index_map = []
    pending_space_idx = None
    for idx, char_info in enumerate(char_list):
        for ch in char_info['char']:
            if _FUZZY_SPACE_RE.match(ch):
                # Leading whitespace is dropped; a run keeps its first index.
                if cleaned_chars and pending_space_idx is None:
                    pending_space_idx = idx
                continue
            if not _FUZZY_KEEP_RE.match(ch):
                continue
            # Emit the collapsed space only once a kept character follows it,
            # which also discards trailing whitespace.
            if pending_space_idx is not None:
                cleaned_chars.append(' ')
                index_map.append(pending_space_idx)
                pending_space_idx = None
            cleaned_chars.append(ch)
            index_map.append(idx)
    return ''.join(cleaned_chars), index_map


def _merge_match_windows(matches, gap):
    """Merge match windows whose start lies within *gap* of the running block's end."""
    ordered = sorted(matches, key=lambda m: m['start_idx'])
    merged = []
    current = dict(ordered[0])
    for nxt in ordered[1:]:
        if nxt['start_idx'] <= current['end_idx'] + gap:
            current['start_idx'] = min(current['start_idx'], nxt['start_idx'])
            current['end_idx'] = max(current['end_idx'], nxt['end_idx'])
            # Length-weighted average; the running block is weighted by its
            # already-extended span, the incoming window by its own length.
            current_len = current['end_idx'] - current['start_idx'] + 1
            next_len = nxt['end_idx'] - nxt['start_idx'] + 1
            current['similarity'] = (
                current['similarity'] * current_len
                + nxt['similarity'] * next_len
            ) / (current_len + next_len)
        else:
            merged.append(current)
            current = dict(nxt)
    merged.append(current)
    return merged


def _build_match_position(page_num, matched_chars, similarity):
    """Build one result entry from the matched characters, or None if the box is degenerate."""
    # Prefer characters with strictly positive coordinates; fall back to all
    # of them when none qualify.
    positive_chars = [c for c in matched_chars if c['x'] > 0 and c['y'] > 0]
    chars_to_use = positive_chars or matched_chars
    if not chars_to_use:
        return None

    xs = [c['x'] for c in chars_to_use]
    ys = [c['y'] for c in chars_to_use]
    left, right = min(xs), max(xs)
    bottom, top = min(ys), max(ys)

    # NOTE(review): the strict inequalities discard any match whose characters
    # all share a single x or a single y value -- confirm this is intended.
    if not (left >= 0 and right > left and top > bottom):
        return None

    matched_text = ''.join(c['char'] for c in chars_to_use)
    return [page_num, left, right, top, bottom, matched_text, similarity]
def find_text_positions(pdf_path, target_text):
"""
在PDF中查找指定文本并返回坐标
@@ -106,15 +488,25 @@ def find_text_positions(pdf_path, target_text):
if pos >= page_start:
page_num = i + 1
position_info = {
'page': page_num,
'text': normalized_target,
'start_x': start_char['x'],
'start_y': start_char['y'],
'end_x': end_char['x'],
'end_y': end_char['y']
}
found_positions.append(position_info)
# 获取匹配的文本内容
matched_text = ''.join([char_info['char'] for char_info in all_chars[pos:pos+len(normalized_target)]])
# 计算边界框 (left, right, top, bottom)
left = min(start_char['x'], end_char['x'])
right = max(start_char['x'], end_char['x'])
bottom = min(start_char['y'], end_char['y'])
top = max(start_char['y'], end_char['y'])
position=[
page_num,
left, # left
right, # right
top, # top
bottom, # bottom
matched_text, # 添加匹配的内容
1.0 # 添加相似度信息精确匹配为1.0
]
found_positions.append(position)
start = pos + 1
@@ -169,47 +561,153 @@ def find_text_in_pdf_per_page(pdf_path, target_text):
if end_pos < len(char_list):
end_char = char_list[end_pos]
position_info = {
'page': page_num,
'text': normalized_target,
'start_x': start_char['x'],
'start_y': start_char['y'],
'end_x': end_char['x'],
'end_y': end_char['y']
}
found_positions.append(position_info)
# 获取匹配的文本内容
matched_text = ''.join([char_info['char'] for char_info in char_list[pos:pos+len(normalized_target)]])
# 计算边界框 (left, right, top, bottom)
left = min(start_char['x'], end_char['x'])
right = max(start_char['x'], end_char['x'])
bottom = min(start_char['y'], end_char['y'])
top = max(start_char['y'], end_char['y'])
position=[
page_num,
left, # left
right, # right
top, # top
bottom, # bottom
matched_text, # 添加匹配的内容
1.0 # 添加相似度信息精确匹配为1.0
]
found_positions.append(position)
return found_positions
def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
    """Find pages that contain enough of the target's keywords.

    The normalized target is split on whitespace; words longer than two
    characters are the keywords (all words are used when none is that long).
    A page counts as a hit when the fraction of keywords occurring in its
    normalized text is at least ``min_match_ratio``.

    Args:
        pdf_path (str): Path of the PDF file to search.
        target_text (str): Text to look for.
        min_match_ratio (float): Minimum fraction (0-1) of keywords that must
            appear on a page.

    Returns:
        list: One entry per matching page, each a list of the form
        ``[page_num, left, right, top, bottom, text_preview, match_ratio]``.
        The coordinates are derived only from the page's first and last
        characters, and the preview is capped at 100 characters plus "...".

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    normalized_target = normalize_text(target_text)
    words = normalized_target.split()
    # Prefer the longer words; fall back to every word when none qualifies.
    keywords = [word for word in words if len(word) > 2] or words

    with open(pdf_path, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

        results = []
        for page_num, page in enumerate(PDFPage.create_pages(document), 1):
            page_interpreter.process_page(page)
            char_list = parse_char_layout(aggregator.get_result())

            page_text = ''.join(entry['char'] for entry in char_list)
            normalized_page_text = normalize_text(page_text)

            hit_count = sum(1 for kw in keywords if kw in normalized_page_text)
            if not (keywords and hit_count / len(keywords) >= min_match_ratio):
                continue
            if not char_list:
                continue

            # Simplification: only the first and last characters of the page
            # contribute to the reported coordinates.
            first_char, last_char = char_list[0], char_list[-1]
            left = min(first_char['x'], last_char['x'])
            right = max(first_char['x'], last_char['x'])
            bottom = min(first_char['y'], last_char['y'])
            top = max(first_char['y'], last_char['y'])

            if len(page_text) > 100:
                preview = page_text[:100] + "..."
            else:
                preview = page_text

            results.append([
                page_num,
                left,
                right,
                top,
                bottom,
                preview,
                hit_count / len(keywords),
            ])

    return results
def smart_fuzzy_find_text(pdf_path, target_text, similarity_threshold=0.8):
    """Locate text in a PDF by trying progressively looser strategies.

    Strategies run in order and the first non-empty result is returned:
    exact per-page search, then fuzzy search with ``similarity_threshold``,
    then keyword-based partial search with a fixed ratio of 0.5.

    Args:
        pdf_path (str): Path of the PDF file to search.
        target_text (str): Text to look for.
        similarity_threshold (float): Threshold passed to the fuzzy search.

    Returns:
        list: The result list of the first strategy that found something, or
        the (possibly empty) result of the partial search.
    """
    # Wrapped in callables so a later strategy only runs if the earlier ones
    # came back empty.
    strict_strategies = (
        lambda: find_text_in_pdf_per_page(pdf_path, target_text),
        lambda: find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold),
    )
    for run_strategy in strict_strategies:
        hits = run_strategy()
        if hits:
            return hits

    # Last resort: keyword matching, returned even when empty.
    return find_partial_text_positions(pdf_path, target_text, 0.5)
if __name__ == '__main__':
# 使用本地PDF文件
pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf' # 修改为你的PDF文件路径
target_text = '''执行方式:
在当前 chat 中,已有上下文,但可能混乱
• 新开一个 chat干净的上下文需要填充'''
target_text = '''创建 `plan` 文件: 固化和锁定最终的"怎么做"
基于 `plan` 执行: 精准驱动 AI 完成任务'''
try:
print("方法1全文搜索")
positions = find_text_positions(pdf_file_path, target_text)
print("智能模糊查找:")
positions = smart_fuzzy_find_text(pdf_file_path, target_text, similarity_threshold=0.7)
if positions:
print(f"找到文本在以下位置:")
for pos in positions:
print(f"页面: {pos['page']}, 起始坐标: ({pos['start_x']:.2f}, {pos['start_y']:.2f}), 结束坐标: ({pos['end_x']:.2f}, {pos['end_y']:.2f})")
else:
print("未找到文本")
print("\n方法2逐页搜索")
positions = find_text_in_pdf_per_page(pdf_file_path, target_text)
if positions:
print(f"找到文本在以下位置:")
for pos in positions:
print(f"页面: {pos['page']}, 起始坐标: ({pos['start_x']:.2f}, {pos['start_y']:.2f}), 结束坐标: ({pos['end_x']:.2f}, {pos['end_y']:.2f})")
if len(pos) >= 7: # 包含匹配内容和相似度信息
print(f"页面: {pos[0]}, 边界框: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f}), 相似度: {pos[6]:.2f}")
print(f"匹配内容: {pos[5][:50]}{'...' if len(pos[5]) > 50 else ''}")
print("-" * 50)
else:
print(f"页面: {pos[0]}, 边界框: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
else:
print("未找到文本")