Implement batch fuzzy text search in PDFs: support similarity matching for multiple target texts and refine the text-coordinate return logic
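Every single-target search helper gains a batch counterpart that takes a list of target texts and returns a dict keyed by each target text instead of a flat list. A minimal usage sketch of the new entry point follows; the module name and PDF path are hypothetical, and each position entry is [page_num, left, right, top, bottom], extended with matched_text and similarity for fuzzy matches:

    # Hypothetical module name for the file changed in this commit
    from pdf_text_search import smart_fuzzy_find_text_batch

    batch_positions = smart_fuzzy_find_text_batch(
        "sample.pdf",  # hypothetical path
        ["first target text", "second target text"],
        similarity_threshold=0.7,
    )
    for target_text, positions in batch_positions.items():
        for pos in positions:
            # pos[0] is the 1-based page number; pos[1:5] is the bounding box
            print(target_text, pos[0], pos[1], pos[2], pos[3], pos[4])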
@@ -54,23 +54,23 @@ def clean_text_for_fuzzy_match(text):
     # Normalize whitespace
     cleaned = re.sub(r'\s+', ' ', cleaned.strip())
     return cleaned
-def find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold=0.8):
+def find_fuzzy_text_positions_batch(pdf_path, target_texts, similarity_threshold=0.8):
     """
-    Fuzzily find the given text in a PDF and return its coordinates
+    Fuzzily find the given texts in a PDF in batch and return their coordinates
 
     Args:
         pdf_path (str): path to the PDF file
-        target_text (str): the text to search for
+        target_texts (list): the list of texts to search for
         similarity_threshold (float): similarity threshold (0-1), default 0.8
 
     Returns:
-        list: a list of matched-text coordinate entries
+        dict: a dict keyed by target_text whose values are lists of matched-text coordinate entries
     """
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
 
-    # Clean the target text
-    cleaned_target = clean_text_for_fuzzy_match(target_text)
+    # Initialize the results dict
+    batch_results = {text: [] for text in target_texts}
 
     # Open the local PDF file
     with open(pdf_path, 'rb') as fp:
@@ -82,366 +82,138 @@ def find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold=0.8):
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
 
-        found_positions = []
 
         # Process each page
+        pages_chars = []
         for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
             interpreter.process_page(page)
             layout = device.get_result()
             char_list = parse_char_layout(layout)
+            pages_chars.append((page_num, char_list))
 
-            # Join the page characters into a single text
-            page_text = ''.join([char_info['char'] for char_info in char_list])
-            cleaned_page_text = clean_text_for_fuzzy_match(page_text)
-
-            # Sliding-window search for similar text
+        # Search for each target text
+        for target_text in target_texts:
+            # Clean the target text
+            cleaned_target = clean_text_for_fuzzy_match(target_text)
             target_len = len(cleaned_target)
 
             if target_len == 0:
                 continue
 
+            found_positions = []
 
-            # Store all matched blocks
-            matches = []
-            for i in range(len(cleaned_page_text) - target_len + 1):
-                window_text = cleaned_page_text[i:i + target_len]
-                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
-
-                if similarity >= similarity_threshold:
-                    # Match found: record its position and similarity
-                    if i < len(char_list):
-                        matches.append({
-                            'start_idx': i,
-                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
-                            'similarity': similarity
-                        })
-
-            # Merge adjacent matched blocks
-            if matches:
-                # Sort by start position
-                matches.sort(key=lambda x: x['start_idx'])
-
-                # Merge adjacent or overlapping matched blocks
-                merged_matches = []
-                current_match = matches[0].copy()  # make a copy
-
-                for i in range(1, len(matches)):
-                    next_match = matches[i]
-                    # If the next block is adjacent to or overlaps the current block, merge them
-                    # Condition: the next block's start <= the current block's end + a small buffer
-                    if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
-                        # Merge the index ranges
-                        current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
-                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
-                        # Compute a length-weighted average similarity
-                        total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
-                                       (next_match['end_idx'] - next_match['start_idx'] + 1)
-                        current_match['similarity'] = (
-                            current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
-                            next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
-                        ) / total_length
-                    else:
-                        # Not adjacent: save the current block and start a new one
-                        merged_matches.append(current_match)
-                        current_match = next_match.copy()  # make a copy
-
-                # Append the last block
-                merged_matches.append(current_match)
-
-                # Build coordinate info for each merged block
-                for match in merged_matches:
-                    start_idx = match['start_idx']
-                    end_idx = match['end_idx']
-
-                    if start_idx < len(char_list) and end_idx < len(char_list):
-                        # Collect all characters in the matched region
-                        matched_chars = char_list[start_idx:end_idx+1]
-
-                        # Filter out characters whose coordinates are 0 (usually special characters)
-                        valid_chars = [char for char in matched_chars
-                                       if char['x'] > 0 and char['y'] > 0]
-
-                        # If there are no valid characters, fall back to all of them
-                        chars_to_use = valid_chars if valid_chars else matched_chars
-
-                        # Compute the bounding box (left, right, top, bottom)
-                        if chars_to_use:
-                            # Compute the boundary values
-                            left = min([char['x'] for char in chars_to_use])
-                            right = max([char['x'] for char in chars_to_use])
-                            bottom = min([char['y'] for char in chars_to_use])
-                            top = max([char['y'] for char in chars_to_use])
-
-                            # Get the matched text content
-                            matched_text = ''.join([char_info['char'] for char_info in chars_to_use])
-
-                            # Only add the result when the bounding box is valid
-                            if left >= 0 and right > left and top > bottom:
-                                position = [
-                                    page_num,
-                                    left,    # left
-                                    right,   # right
-                                    top,     # top
-                                    bottom,  # bottom
-                                    matched_text,        # the matched content
-                                    match['similarity']  # similarity info
-                                ]
-                                found_positions.append(position)
-
-        return found_positions
+            # Search within each page
+            for page_num, char_list in pages_chars:
+                # Join the page characters into a single text
+                page_text = ''.join([char_info['char'] for char_info in char_list])
+                cleaned_page_text = clean_text_for_fuzzy_match(page_text)
+
+                # Sliding-window search for similar text
+                matches = []
+                for i in range(len(cleaned_page_text) - target_len + 1):
+                    window_text = cleaned_page_text[i:i + target_len]
+                    similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
+
+                    if similarity >= similarity_threshold:
+                        # Match found: record its position and similarity
+                        if i < len(char_list):
+                            matches.append({
+                                'start_idx': i,
+                                'end_idx': min(i + target_len - 1, len(char_list) - 1),
+                                'similarity': similarity
+                            })
+
+                # Merge adjacent matched blocks
+                if matches:
+                    # Sort by start position
+                    matches.sort(key=lambda x: x['start_idx'])
+
+                    # Merge adjacent or overlapping matched blocks
+                    merged_matches = []
+                    current_match = matches[0].copy()  # make a copy
+
+                    for i in range(1, len(matches)):
+                        next_match = matches[i]
+                        # If the next block is adjacent to or overlaps the current block, merge them
+                        # Condition: the next block's start <= the current block's end + a small buffer
+                        if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
+                            # Merge the index ranges
+                            current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
+                            current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
+                            # Compute a length-weighted average similarity
+                            total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
+                                           (next_match['end_idx'] - next_match['start_idx'] + 1)
+                            current_match['similarity'] = (
+                                current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
+                                next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
+                            ) / total_length
+                        else:
+                            # Not adjacent: save the current block and start a new one
+                            merged_matches.append(current_match)
+                            current_match = next_match.copy()  # make a copy
+
+                    # Append the last block
+                    merged_matches.append(current_match)
+
+                    # Build coordinate info for each merged block
+                    for match in merged_matches:
+                        start_idx = match['start_idx']
+                        end_idx = match['end_idx']
+
+                        if start_idx < len(char_list) and end_idx < len(char_list):
+                            # Collect all characters in the matched region
+                            matched_chars = char_list[start_idx:end_idx+1]
+
+                            # Filter out characters whose coordinates are 0 (usually special characters)
+                            valid_chars = [char for char in matched_chars
+                                           if char['x'] > 0 and char['y'] > 0]
+
+                            # If there are no valid characters, fall back to all of them
+                            chars_to_use = valid_chars if valid_chars else matched_chars
+
+                            # Compute the bounding box (left, right, top, bottom)
+                            if chars_to_use:
+                                # Compute the boundary values
+                                left = min([char['x'] for char in chars_to_use])
+                                right = max([char['x'] for char in chars_to_use])
+                                bottom = min([char['y'] for char in chars_to_use])
+                                top = max([char['y'] for char in chars_to_use])
+
+                                # Get the matched text content
+                                matched_text = ''.join([char_info['char'] for char_info in chars_to_use])
+
+                                # Only add the result when the bounding box is valid
+                                if left >= 0 and right > left and top > bottom:
+                                    position = [
+                                        page_num,
+                                        left,    # left
+                                        right,   # right
+                                        top,     # top
+                                        bottom,  # bottom
+                                        matched_text,        # the matched content
+                                        match['similarity']  # similarity info
+                                    ]
+                                    found_positions.append(position)
+
+            batch_results[target_text] = found_positions
+
+    return batch_results
+
+def find_text_positions_batch(pdf_path, target_texts):
     """
-    Fuzzily find the given text in a PDF and return its coordinates
+    Find the given texts in a PDF in batch and return their coordinates
 
     Args:
         pdf_path (str): path to the PDF file
-        target_text (str): the text to search for
-        similarity_threshold (float): similarity threshold (0-1), default 0.8
+        target_texts (list): the list of texts to search for
 
     Returns:
-        list: a list of matched-text coordinate entries
+        dict: a dict keyed by target_text whose values are lists of matched-text coordinate entries
     """
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
 
-    # Clean the target text
-    cleaned_target = clean_text_for_fuzzy_match(target_text)
+    # Initialize the results dict
+    batch_results = {text: [] for text in target_texts}
 
-    # Open the local PDF file
-    with open(pdf_path, 'rb') as fp:
-        parser = PDFParser(fp)
-        doc = PDFDocument(parser)
-
-        rsrcmgr = PDFResourceManager()
-        laparams = LAParams()
-        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-        interpreter = PDFPageInterpreter(rsrcmgr, device)
-
-        found_positions = []
-
-        # Process each page
-        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
-            interpreter.process_page(page)
-            layout = device.get_result()
-            char_list = parse_char_layout(layout)
-
-            # Join the page characters into a single text
-            page_text = ''.join([char_info['char'] for char_info in char_list])
-            cleaned_page_text = clean_text_for_fuzzy_match(page_text)
-
-            # Sliding-window search for similar text
-            target_len = len(cleaned_target)
-            if target_len == 0:
-                continue
-
-            # Store all matched blocks
-            matches = []
-            for i in range(len(cleaned_page_text) - target_len + 1):
-                window_text = cleaned_page_text[i:i + target_len]
-                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
-
-                if similarity >= similarity_threshold:
-                    # Match found: record its position and similarity
-                    if i < len(char_list):
-                        matches.append({
-                            'start_idx': i,
-                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
-                            'similarity': similarity
-                        })
-
-            # Merge adjacent matched blocks
-            if matches:
-                # Sort by start position
-                matches.sort(key=lambda x: x['start_idx'])
-
-                # Merge adjacent or overlapping matched blocks
-                merged_matches = []
-                current_match = matches[0].copy()  # make a copy
-
-                for i in range(1, len(matches)):
-                    next_match = matches[i]
-                    # If the next block is adjacent to or overlaps the current block, merge them
-                    # Condition: the next block's start <= the current block's end + a small buffer
-                    if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
-                        # Merge the index ranges
-                        current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
-                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
-                        # Compute a length-weighted average similarity
-                        total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
-                                       (next_match['end_idx'] - next_match['start_idx'] + 1)
-                        current_match['similarity'] = (
-                            current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
-                            next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
-                        ) / total_length
-                    else:
-                        # Not adjacent: save the current block and start a new one
-                        merged_matches.append(current_match)
-                        current_match = next_match.copy()  # make a copy
-
-                # Append the last block
-                merged_matches.append(current_match)
-
-                # Build coordinate info for each merged block
-                for match in merged_matches:
-                    start_idx = match['start_idx']
-                    end_idx = match['end_idx']
-
-                    if start_idx < len(char_list) and end_idx < len(char_list):
-                        # Collect all characters in the matched region
-                        matched_chars = char_list[start_idx:end_idx+1]
-
-                        # Compute the bounding box (left, right, top, bottom)
-                        if matched_chars:
-                            # Compute the boundary values
-                            left = min([char['x'] for char in matched_chars])
-                            right = max([char['x'] for char in matched_chars])
-                            bottom = min([char['y'] for char in matched_chars])
-                            top = max([char['y'] for char in matched_chars])
-
-                            # Get the matched text content
-                            matched_text = ''.join([char_info['char'] for char_info in matched_chars])
-
-                            position = [
-                                page_num,
-                                left,    # left
-                                right,   # right
-                                top,     # top
-                                bottom,  # bottom
-                                matched_text,        # the matched content
-                                match['similarity']  # similarity info
-                            ]
-                            found_positions.append(position)
-
-        return found_positions
-    """
-    Fuzzily find the given text in a PDF and return its coordinates
-
-    Args:
-        pdf_path (str): path to the PDF file
-        target_text (str): the text to search for
-        similarity_threshold (float): similarity threshold (0-1), default 0.8
-
-    Returns:
-        list: a list of matched-text coordinate entries
-    """
-    if not os.path.exists(pdf_path):
-        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
-
-    # Clean the target text
-    cleaned_target = clean_text_for_fuzzy_match(target_text)
-
-    # Open the local PDF file
-    with open(pdf_path, 'rb') as fp:
-        parser = PDFParser(fp)
-        doc = PDFDocument(parser)
-
-        rsrcmgr = PDFResourceManager()
-        laparams = LAParams()
-        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-        interpreter = PDFPageInterpreter(rsrcmgr, device)
-
-        found_positions = []
-
-        # Process each page
-        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
-            interpreter.process_page(page)
-            layout = device.get_result()
-            char_list = parse_char_layout(layout)
-
-            # Join the page characters into a single text
-            page_text = ''.join([char_info['char'] for char_info in char_list])
-            cleaned_page_text = clean_text_for_fuzzy_match(page_text)
-
-            # Sliding-window search for similar text
-            target_len = len(cleaned_target)
-            if target_len == 0:
-                continue
-
-            # Store all matched blocks
-            matches = []
-            for i in range(len(cleaned_page_text) - target_len + 1):
-                window_text = cleaned_page_text[i:i + target_len]
-                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
-
-                if similarity >= similarity_threshold:
-                    # Match found: record its position and similarity
-                    if i < len(char_list):
-                        matches.append({
-                            'start_idx': i,
-                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
-                            'similarity': similarity
-                        })
-
-            # Merge adjacent matched blocks
-            if matches:
-                # Sort by start position
-                matches.sort(key=lambda x: x['start_idx'])
-
-                # Merge adjacent or overlapping matched blocks
-                merged_matches = []
-                current_match = matches[0]
-
-                for i in range(1, len(matches)):
-                    next_match = matches[i]
-                    # If the next block is adjacent to or overlaps the current block, merge them
-                    if next_match['start_idx'] <= current_match['end_idx'] + target_len:
-                        # Merge the index ranges
-                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
-                        # Average the similarity
-                        current_match['similarity'] = (current_match['similarity'] + next_match['similarity']) / 2
-                    else:
-                        # Not adjacent: save the current block and start a new one
-                        merged_matches.append(current_match)
-                        current_match = next_match
-
-                # Append the last block
-                merged_matches.append(current_match)
-
-                # Build coordinate info for each merged block
-                for match in merged_matches:
-                    start_idx = match['start_idx']
-                    end_idx = match['end_idx']
-
-                    if start_idx < len(char_list) and end_idx < len(char_list):
-                        # Collect all characters in the matched region
-                        matched_chars = char_list[start_idx:end_idx+1]
-
-                        # Compute the bounding box (left, right, top, bottom)
-                        if matched_chars:
-                            # Compute the boundary values
-                            left = min([char['x'] for char in matched_chars])
-                            right = max([char['x'] for char in matched_chars])
-                            bottom = min([char['y'] for char in matched_chars])
-                            top = max([char['y'] for char in matched_chars])
-
-                            # Get the matched text content
-                            matched_text = ''.join([char_info['char'] for char_info in matched_chars])
-
-                            position = [
-                                page_num,
-                                left,    # left
-                                right,   # right
-                                top,     # top
-                                bottom,  # bottom
-                                matched_text,        # the matched content
-                                match['similarity']  # similarity info
-                            ]
-                            found_positions.append(position)
-
-        return found_positions
-
-def find_text_positions(pdf_path, target_text):
-    """
-    Find the given text in a PDF and return its coordinates
-
-    Args:
-        pdf_path (str): path to the PDF file
-        target_text (str): the text to search for
-
-    Returns:
-        list: a list of matched-text coordinate entries
-    """
-    if not os.path.exists(pdf_path):
-        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
-
-    # Normalize the target text
-    normalized_target = normalize_text(target_text)
-
     # Open the local PDF file
     with open(pdf_path, 'rb') as fp:
@@ -468,101 +240,32 @@ def find_text_positions(pdf_path, target_text):
         full_text = ''.join([char_info['char'] for char_info in all_chars])
         normalized_full_text = normalize_text(full_text)
 
-        # Search for the target text in the normalized full text
-        found_positions = []
-        start = 0
-        while True:
-            pos = normalized_full_text.find(normalized_target, start)
-            if pos == -1:
-                break
+        # Find positions for each target text
+        for target_text in target_texts:
+            # Normalize the target text
+            normalized_target = normalize_text(target_text)
 
-            # Match found: fetch the corresponding coordinate info
-            if pos < len(all_chars):
-                start_char = all_chars[pos]
-                end_pos = pos + len(normalized_target) - 1
-                if end_pos < len(all_chars):
-                    end_char = all_chars[end_pos]
-                    # Work out which page the match is on
-                    page_num = 1
-                    for i, page_start in enumerate(page_start_indices):
-                        if pos >= page_start:
-                            page_num = i + 1
-
-                    # Get the matched text content
-                    matched_text = ''.join([char_info['char'] for char_info in all_chars[pos:pos+len(normalized_target)]])
-
-                    # Compute the bounding box (left, right, top, bottom)
-                    left = min(start_char['x'], end_char['x'])
-                    right = max(start_char['x'], end_char['x'])
-                    bottom = min(start_char['y'], end_char['y'])
-                    top = max(start_char['y'], end_char['y'])
-
-                    position = [
-                        page_num,
-                        left,    # left
-                        right,   # right
-                        top,     # top
-                        bottom,  # bottom
-                        matched_text,  # the matched content
-                        1.0  # similarity info (1.0 for an exact match)
-                    ]
-                    found_positions.append(position)
-
-            start = pos + 1
-
-        return found_positions
-
-def find_text_in_pdf_per_page(pdf_path, target_text):
-    """
-    Find the given text in a PDF page by page and return its coordinates
-
-    Args:
-        pdf_path (str): path to the PDF file
-        target_text (str): the text to search for
-
-    Returns:
-        list: a list of matched-text coordinate entries
-    """
-    if not os.path.exists(pdf_path):
-        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
-
-    # Normalize the target text
-    normalized_target = normalize_text(target_text)
-
-    # Open the local PDF file
-    with open(pdf_path, 'rb') as fp:
-        parser = PDFParser(fp)
-        doc = PDFDocument(parser)
-
-        rsrcmgr = PDFResourceManager()
-        laparams = LAParams()
-        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-        interpreter = PDFPageInterpreter(rsrcmgr, device)
-
-        found_positions = []
-
-        # Process each page
-        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
-            interpreter.process_page(page)
-            layout = device.get_result()
-            char_list = parse_char_layout(layout)
-
-            # Join the page characters into text and normalize it
-            page_text = ''.join([char_info['char'] for char_info in char_list])
-            normalized_page_text = normalize_text(page_text)
-
-            # Search for the target text in the page text
-            pos = normalized_page_text.find(normalized_target)
-            if pos != -1:
+            found_positions = []
+            start = 0
+            while True:
+                pos = normalized_full_text.find(normalized_target, start)
+                if pos == -1:
+                    break
+
                 # Match found: fetch the corresponding coordinate info
-                if pos < len(char_list):
-                    start_char = char_list[pos]
+                if pos < len(all_chars):
+                    start_char = all_chars[pos]
                     end_pos = pos + len(normalized_target) - 1
-                    if end_pos < len(char_list):
-                        end_char = char_list[end_pos]
+                    if end_pos < len(all_chars):
+                        end_char = all_chars[end_pos]
+                        # Work out which page the match is on
+                        page_num = 1
+                        for i, page_start in enumerate(page_start_indices):
+                            if pos >= page_start:
+                                page_num = i + 1
 
                         # Get the matched text content
-                        matched_text = ''.join([char_info['char'] for char_info in char_list[pos:pos+len(normalized_target)]])
+                        matched_text = ''.join([char_info['char'] for char_info in all_chars[pos:pos+len(normalized_target)]])
 
                         # Compute the bounding box (left, right, top, bottom)
                         left = min(start_char['x'], end_char['x'])
@@ -570,41 +273,37 @@ def find_text_in_pdf_per_page(pdf_path, target_text):
                         bottom = min(start_char['y'], end_char['y'])
                         top = max(start_char['y'], end_char['y'])
 
-                        position=[
+                        position = [
                             page_num,
                             left,    # left
                             right,   # right
                             top,     # top
                             bottom,  # bottom
-                            matched_text,  # the matched content
-                            1.0  # similarity info (1.0 for an exact match)
                         ]
                         found_positions.append(position)
 
-        return found_positions
+                start = pos + 1
+
+            batch_results[target_text] = found_positions
+
+    return batch_results
 
-def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
+def find_text_in_pdf_per_page_batch(pdf_path, target_texts):
     """
-    Find partially matching text (suited to longer texts)
+    Find the given texts in a PDF page by page, in batch, and return their coordinates
 
     Args:
         pdf_path (str): path to the PDF file
-        target_text (str): the text to search for
-        min_match_ratio (float): minimum match ratio (0-1)
+        target_texts (list): the list of texts to search for
 
     Returns:
-        list: a list of matched-text coordinate entries
+        dict: a dict keyed by target_text whose values are lists of matched-text coordinate entries
     """
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
 
-    # Split the target text into keywords or phrases
-    normalized_target = normalize_text(target_text)
-    # Extract keywords (words left after removing common stop words)
-    keywords = [word for word in normalized_target.split() if len(word) > 2]
-
-    if not keywords:
-        keywords = normalized_target.split()  # if there are no long words, use all words
-
+    # Initialize the results dict
+    batch_results = {text: [] for text in target_texts}
+
     # Open the local PDF file
     with open(pdf_path, 'rb') as fp:
@@ -616,7 +315,77 @@ def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
 
-        found_positions = []
+        # Process each page
+        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
+            interpreter.process_page(page)
+            layout = device.get_result()
+            char_list = parse_char_layout(layout)
+
+            # Join the page characters into text and normalize it
+            page_text = ''.join([char_info['char'] for char_info in char_list])
+            normalized_page_text = normalize_text(page_text)
+
+            # Search the current page for each target text
+            for target_text in target_texts:
+                normalized_target = normalize_text(target_text)
+
+                # Search for the target text in the page text
+                pos = normalized_page_text.find(normalized_target)
+                if pos != -1:
+                    # Match found: fetch the corresponding coordinate info
+                    if pos < len(char_list):
+                        start_char = char_list[pos]
+                        end_pos = pos + len(normalized_target) - 1
+                        if end_pos < len(char_list):
+                            end_char = char_list[end_pos]
+
+                            # Get the matched text content
+                            matched_text = ''.join([char_info['char'] for char_info in char_list[pos:pos+len(normalized_target)]])
+
+                            # Compute the bounding box (left, right, top, bottom)
+                            left = min(start_char['x'], end_char['x'])
+                            right = max(start_char['x'], end_char['x'])
+                            bottom = min(start_char['y'], end_char['y'])
+                            top = max(start_char['y'], end_char['y'])
+
+                            position = [
+                                page_num,
+                                left,    # left
+                                right,   # right
+                                top,     # top
+                                bottom,  # bottom
+                            ]
+                            batch_results[target_text].append(position)
+
+    return batch_results
+
+def find_partial_text_positions_batch(pdf_path, target_texts, min_match_ratio=0.7):
+    """
+    Find partially matching texts in batch (suited to longer texts)
+
+    Args:
+        pdf_path (str): path to the PDF file
+        target_texts (list): the list of texts to search for
+        min_match_ratio (float): minimum match ratio (0-1)
+
+    Returns:
+        dict: a dict keyed by target_text whose values are lists of matched-text coordinate entries
+    """
+    if not os.path.exists(pdf_path):
+        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
+
+    # Initialize the results dict
+    batch_results = {text: [] for text in target_texts}
+
+    # Open the local PDF file
+    with open(pdf_path, 'rb') as fp:
+        parser = PDFParser(fp)
+        doc = PDFDocument(parser)
+
+        rsrcmgr = PDFResourceManager()
+        laparams = LAParams()
+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+
         # Process each page
         for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
@@ -628,90 +397,136 @@ def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
             page_text = ''.join([char_info['char'] for char_info in char_list])
             normalized_page_text = normalize_text(page_text)
 
-            # Count how many keywords match
-            matched_keywords = 0
-            for keyword in keywords:
-                if keyword in normalized_page_text:
-                    matched_keywords += 1
-
-            # If the ratio of matched keywords exceeds the threshold, treat it as a match
-            if len(keywords) > 0 and (matched_keywords / len(keywords)) >= min_match_ratio:
-                # For simplicity, return the coordinates of the page's first and last characters
-                if char_list:
-                    start_char = char_list[0]
-                    end_char = char_list[-1]
-                    match_ratio = matched_keywords / len(keywords)
-
-                    # Use the page text as the matched content
-                    matched_text = ''.join([char_info['char'] for char_info in char_list])
-
-                    # Compute the bounding box (left, right, top, bottom)
-                    left = min(start_char['x'], end_char['x'])
-                    right = max(start_char['x'], end_char['x'])
-                    bottom = min(start_char['y'], end_char['y'])
-                    top = max(start_char['y'], end_char['y'])
-
-                    position = [
-                        page_num,
-                        left,    # left
-                        right,   # right
-                        top,     # top
-                        bottom,  # bottom
-                        matched_text[:100] + "..." if len(matched_text) > 100 else matched_text,  # the matched content (length-limited)
-                        match_ratio  # match-ratio info
-                    ]
-                    found_positions.append(position)
-
-        return found_positions
-
-def smart_fuzzy_find_text(pdf_path, target_text, similarity_threshold=0.8):
+            # Compute matches for each target text
+            for target_text in target_texts:
+                # Split the target text into keywords or phrases
+                normalized_target = normalize_text(target_text)
+                # Extract keywords (words left after removing common stop words)
+                keywords = [word for word in normalized_target.split() if len(word) > 2]
+
+                if not keywords:
+                    keywords = normalized_target.split()  # if there are no long words, use all words
+
+                if not keywords:
+                    continue
+
+                # Count how many keywords match
+                matched_keywords = 0
+                for keyword in keywords:
+                    if keyword in normalized_page_text:
+                        matched_keywords += 1
+
+                # If the ratio of matched keywords exceeds the threshold, treat it as a match
+                if len(keywords) > 0 and (matched_keywords / len(keywords)) >= min_match_ratio:
+                    # For simplicity, return the coordinates of the page's first and last characters
+                    if char_list:
+                        start_char = char_list[0]
+                        end_char = char_list[-1]
+                        match_ratio = matched_keywords / len(keywords)
+
+                        # Use the page text as the matched content
+                        matched_text = ''.join([char_info['char'] for char_info in char_list])
+
+                        # Compute the bounding box (left, right, top, bottom)
+                        left = min(start_char['x'], end_char['x'])
+                        right = max(start_char['x'], end_char['x'])
+                        bottom = min(start_char['y'], end_char['y'])
+                        top = max(start_char['y'], end_char['y'])
+
+                        position = [
+                            page_num,
+                            left,    # left
+                            right,   # right
+                            top,     # top
+                            bottom,  # bottom
+                        ]
+                        batch_results[target_text].append(position)
+
+    return batch_results
+
+def smart_fuzzy_find_text_batch(pdf_path, target_texts, similarity_threshold=0.8):
     """
-    Smart fuzzy text search combining several methods
+    Smart batch fuzzy text search combining several methods
 
     Args:
         pdf_path (str): path to the PDF file
-        target_text (str): the text to search for
+        target_texts (list): the list of texts to search for
         similarity_threshold (float): similarity threshold
 
     Returns:
-        list: a list of matched-text coordinate entries
+        dict: a dict keyed by target_text whose values are lists of matched-text coordinate entries
     """
+    # Initialize the results dict
+    batch_results = {text: [] for text in target_texts}
+
     # Method 1: exact matching
-    exact_results = find_text_in_pdf_per_page(pdf_path, target_text)
-    if exact_results:
-        return exact_results
-
-    # Method 2: fuzzy matching
-    fuzzy_results = find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold)
-    if fuzzy_results:
-        return fuzzy_results
-
-    # Method 3: partial matching (keyword matching)
-    partial_results = find_partial_text_positions(pdf_path, target_text, 0.5)
-    return partial_results
+    exact_results = find_text_in_pdf_per_page_batch(pdf_path, target_texts)
+
+    # For texts that already have an exact match, use those results directly
+    remaining_texts = []
+    for text in target_texts:
+        if exact_results.get(text):
+            batch_results[text] = exact_results[text]
+        else:
+            remaining_texts.append(text)
+
+    if not remaining_texts:
+        return batch_results
+
+    # Method 2: fuzzy matching (only for texts without an exact match)
+    fuzzy_results = find_fuzzy_text_positions_batch(pdf_path, remaining_texts, similarity_threshold)
+
+    # Update the results
+    for text in remaining_texts:
+        if fuzzy_results.get(text):
+            batch_results[text] = fuzzy_results[text]
+            remaining_texts = [t for t in remaining_texts if t != text]  # remove it from the remaining texts
+
+    if not remaining_texts:
+        return batch_results
+
+    # Method 3: partial matching (keyword matching, only for texts still unmatched)
+    partial_results = find_partial_text_positions_batch(pdf_path, remaining_texts, 0.5)
+
+    # Update the final results
+    for text in remaining_texts:
+        if partial_results.get(text):
+            batch_results[text] = partial_results[text]
+
+    return batch_results
 
 if __name__ == '__main__':
     # Use a local PDF file
     pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'  # change this to your PDF file path
-    target_text = '''创建 `plan` 文件: 固化和锁定最终的"怎么做"
-• 基于 `plan` 执行: 精准驱动 AI 完成任务'''
+    target_texts = [
+        '''创建 `plan` 文件: 固化和锁定最终的"怎么做"
+• 基于 `plan` 执行: 精准驱动 AI 完成任务''',
+        "其他要查找的文本1",
+        "其他要查找的文本2"
+    ]
 
     try:
-        print("Smart fuzzy search:")
-        positions = smart_fuzzy_find_text(pdf_file_path, target_text, similarity_threshold=0.7)
+        print("Batch smart fuzzy search:")
+        batch_positions = smart_fuzzy_find_text_batch(pdf_file_path, target_texts, similarity_threshold=0.7)
 
-        if positions:
-            print(f"Found the text at the following positions:")
-            for pos in positions:
-                if len(pos) >= 7:  # includes matched content and similarity info
-                    print(f"Page: {pos[0]}, bounding box: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f}), similarity: {pos[6]:.2f}")
-                    print(f"Matched content: {pos[5][:50]}{'...' if len(pos[5]) > 50 else ''}")
-                    print("-" * 50)
-                else:
-                    print(f"Page: {pos[0]}, bounding box: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
-        else:
-            print("Text not found")
+        for target_text, positions in batch_positions.items():
+            print(f"\nSearching for: {target_text[:50]}{'...' if len(target_text) > 50 else ''}")
+            if positions:
+                print(f"Found the text at the following positions:")
+                for pos in positions:
+                    if len(pos) >= 6:  # includes matched content and similarity info
+                        print(f"Page: {pos[0]}, bounding box: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
+                        if len(pos) >= 7:  # includes similarity info
+                            print(f"Similarity: {pos[6]:.2f}")
+                        if len(pos) >= 6:  # includes matched content
+                            print(f"Matched content: {pos[5][:50]}{'...' if len(pos[5]) > 50 else ''}")
+                        print("-" * 50)
+                    else:
+                        print(f"Page: {pos[0]}, bounding box: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
+            else:
+                print("Text not found")
 
     except FileNotFoundError as e:
         print(e)
     except Exception as e:
         print(f"Error while processing the PDF: {e}")