diff --git a/src/get_pos_pdf.py b/src/get_pos_pdf.py
index 3ecf2f4..3c44bd7 100644
--- a/src/get_pos_pdf.py
+++ b/src/get_pos_pdf.py
@@ -54,23 +54,23 @@ def clean_text_for_fuzzy_match(text):
     # Normalize whitespace characters
    cleaned = re.sub(r'\s+', ' ', cleaned.strip())
     return cleaned
-def find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold=0.8):
+def find_fuzzy_text_positions_batch(pdf_path, target_texts, similarity_threshold=0.8):
     """
-    Fuzzy-search a PDF for the given text and return its coordinates.
+    Fuzzy-search a PDF for a batch of texts and return their coordinates.
 
     Args:
         pdf_path (str): Path to the PDF file
-        target_text (str): Text to search for
+        target_texts (list): List of texts to search for
         similarity_threshold (float): Similarity threshold (0-1), default 0.8
 
     Returns:
-        list: List of coordinate records for the matched text
+        dict: Dict keyed by target_text; each value is a list of coordinate records
     """
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
 
-    # Clean the target text
-    cleaned_target = clean_text_for_fuzzy_match(target_text)
+    # Initialize the result dict
+    batch_results = {text: [] for text in target_texts}
 
     # Open the local PDF file
     with open(pdf_path, 'rb') as fp:
@@ -82,366 +82,138 @@ def find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold=0.8):
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
 
-        found_positions = []
-        # Process each page
+        # Parse every page once so the layout pass is shared by all targets
+        pages_chars = []
         for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
             interpreter.process_page(page)
             layout = device.get_result()
             char_list = parse_char_layout(layout)
-
-            # Join the page characters into a single string
-            page_text = ''.join([char_info['char'] for char_info in char_list])
-            cleaned_page_text = clean_text_for_fuzzy_match(page_text)
-
-            # Sliding-window search for similar text
+            pages_chars.append((page_num, char_list))
+
+        # Search for each target text
+        for target_text in target_texts:
+            # Clean the target text
+            cleaned_target = clean_text_for_fuzzy_match(target_text)
             target_len = len(cleaned_target)
+            if target_len == 0:
+                continue
+
+            found_positions = []
 
-            # Collect all matching blocks
-            matches = []
-            for i in range(len(cleaned_page_text) - target_len + 1):
-                window_text = cleaned_page_text[i:i + target_len]
-                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
+            # Search every page
+            for page_num, char_list in pages_chars:
+                # Join the page characters into a single string
+                page_text = ''.join([char_info['char'] for char_info in char_list])
+                cleaned_page_text = clean_text_for_fuzzy_match(page_text)
 
-                if similarity >= similarity_threshold:
-                    # Match found: record its position and similarity
-                    if i < len(char_list):
-                        matches.append({
-                            'start_idx': i,
-                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
-                            'similarity': similarity
-                        })
-
-            # Merge adjacent matching blocks
-            if matches:
-                # Sort by start position
-                matches.sort(key=lambda x: x['start_idx'])
-
-                # Merge adjacent or overlapping blocks
-                merged_matches = []
-                current_match = matches[0].copy()  # work on a copy
-
-                for i in range(1, len(matches)):
-                    next_match = matches[i]
-                    # Merge if the next block is adjacent to or overlaps the current one
-                    # Condition: next start <= current end + a small buffer
-                    if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
-                        # Merge the index ranges
-                        current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
-                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
-                        # Length-weighted average similarity
-                        total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
-                                       (next_match['end_idx'] - next_match['start_idx'] + 1)
-                        current_match['similarity'] = (
-                            current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
-                            next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
-                        ) / total_length
-                    else:
-                        # Not adjacent: store the current block and start a new one
-                        merged_matches.append(current_match)
-                        current_match = next_match.copy()  # work on a copy
-
-                # Append the final block
-                merged_matches.append(current_match)
-
-                # Produce coordinate info for each merged block
-                for match in merged_matches:
-                    start_idx = match['start_idx']
-                    end_idx = match['end_idx']
+                # Sliding-window search for similar text
+                matches = []
+                for i in range(len(cleaned_page_text) - target_len + 1):
+                    window_text = cleaned_page_text[i:i + target_len]
+                    similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
 
-                    if start_idx < len(char_list) and end_idx < len(char_list):
-                        # Grab all characters in the matched range
-                        matched_chars = char_list[start_idx:end_idx+1]
+                    if similarity >= similarity_threshold:
+                        # Match found: record its position and similarity.
+                        # NOTE: i indexes the cleaned page text; mapping it onto
+                        # char_list assumes cleaning removed few characters.
+                        if i < len(char_list):
+                            matches.append({
+                                'start_idx': i,
+                                'end_idx': min(i + target_len - 1, len(char_list) - 1),
+                                'similarity': similarity
+                            })
+
+                # Merge adjacent matching blocks
+                if matches:
+                    # Sort by start position
+                    matches.sort(key=lambda x: x['start_idx'])
+
+                    # Merge adjacent or overlapping blocks
+                    merged_matches = []
+                    current_match = matches[0].copy()  # work on a copy
+
+                    for i in range(1, len(matches)):
+                        next_match = matches[i]
+                        # Merge if the next block is adjacent to or overlaps the current one
+                        # Condition: next start <= current end + a small buffer
+                        if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
+                            # Merge the index ranges
+                            current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
+                            current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
+                            # Length-weighted average similarity
+                            total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
+                                           (next_match['end_idx'] - next_match['start_idx'] + 1)
+                            current_match['similarity'] = (
+                                current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
+                                next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
+                            ) / total_length
+                        else:
+                            # Not adjacent: store the current block and start a new one
+                            merged_matches.append(current_match)
+                            current_match = next_match.copy()  # work on a copy
+
+                    # Append the final block
+                    merged_matches.append(current_match)
+
+                    # Produce coordinate info for each merged block
+                    for match in merged_matches:
+                        start_idx = match['start_idx']
+                        end_idx = match['end_idx']
 
-                        # Filter out characters with zero coordinates (usually special characters)
-                        valid_chars = [char for char in matched_chars
-                                       if char['x'] > 0 and char['y'] > 0]
-
-                        # If no characters survive the filter, use them all
-                        chars_to_use = valid_chars if valid_chars else matched_chars
-
-                        # Compute the bounding box (left, right, top, bottom)
-                        if chars_to_use:
-                            # Boundary values
-                            left = min([char['x'] for char in chars_to_use])
-                            right = max([char['x'] for char in chars_to_use])
-                            bottom = min([char['y'] for char in chars_to_use])
-                            top = max([char['y'] for char in chars_to_use])
+                        if start_idx < len(char_list) and end_idx < len(char_list):
+                            # Grab all characters in the matched range
+                            matched_chars = char_list[start_idx:end_idx+1]
 
-                            # Text content of the match
-                            matched_text = ''.join([char_info['char'] for char_info in chars_to_use])
+                            # Filter out characters with zero coordinates (usually special characters)
+                            valid_chars = [char for char in matched_chars
+                                           if char['x'] > 0 and char['y'] > 0]
 
-                            # Only add the result when the bounding box is valid
-                            if left >= 0 and right > left and top > bottom:
-                                position = [
-                                    page_num,
-                                    left,    # left
-                                    right,   # right
-                                    top,     # top
-                                    bottom,  # bottom
-                                    matched_text,        # matched content
-                                    match['similarity']  # similarity
-                                ]
-                                found_positions.append(position)
-
-    return found_positions
+                            # If no characters survive the filter, use them all
+                            chars_to_use = valid_chars if valid_chars else matched_chars
+
+                            # Compute the bounding box (left, right, top, bottom)
+                            if chars_to_use:
+                                # Boundary values
+                                left = min([char['x'] for char in chars_to_use])
+                                right = max([char['x'] for char in chars_to_use])
+                                bottom = min([char['y'] for char in chars_to_use])
+                                top = max([char['y'] for char in chars_to_use])
+
+                                # Text content of the match
+                                matched_text = ''.join([char_info['char'] for char_info in chars_to_use])
+
+                                # Only add the result when the bounding box is valid
+                                if left >= 0 and right > left and top > bottom:
+                                    position = [
+                                        page_num,
+                                        left,    # left
+                                        right,   # right
+                                        top,     # top
+                                        bottom,  # bottom
+                                        matched_text,        # matched content
+                                        match['similarity']  # similarity
+                                    ]
+                                    found_positions.append(position)
+
+            batch_results[target_text] = found_positions
+
+    return batch_results
+
+def find_text_positions_batch(pdf_path, target_texts):
     """
-    Fuzzy-search a PDF for the given text and return its coordinates.
+    Search a PDF for a batch of texts and return their coordinates.
 
     Args:
         pdf_path (str): Path to the PDF file
-        target_text (str): Text to search for
-        similarity_threshold (float): Similarity threshold (0-1), default 0.8
+        target_texts (list): List of texts to search for
 
     Returns:
-        list: List of coordinate records for the matched text
+        dict: Dict keyed by target_text; each value is a list of coordinate records
     """
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
 
-    # Clean the target text
-    cleaned_target = clean_text_for_fuzzy_match(target_text)
-
-    # Open the local PDF file
-    with open(pdf_path, 'rb') as fp:
-        parser = PDFParser(fp)
-        doc = PDFDocument(parser)
-
-        rsrcmgr = PDFResourceManager()
-        laparams = LAParams()
-        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-        interpreter = PDFPageInterpreter(rsrcmgr, device)
-
-        found_positions = []
-
-        # Process each page
-        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
-            interpreter.process_page(page)
-            layout = device.get_result()
-            char_list = parse_char_layout(layout)
-
-            # Join the page characters into a single string
-            page_text = ''.join([char_info['char'] for char_info in char_list])
-            cleaned_page_text = clean_text_for_fuzzy_match(page_text)
-
-            # Sliding-window search for similar text
-            target_len = len(cleaned_target)
-            if target_len == 0:
-                continue
-
-            # Collect all matching blocks
-            matches = []
-            for i in range(len(cleaned_page_text) - target_len + 1):
-                window_text = cleaned_page_text[i:i + target_len]
-                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
-
-                if similarity >= similarity_threshold:
-                    # Match found: record its position and similarity
-                    if i < len(char_list):
-                        matches.append({
-                            'start_idx': i,
-                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
-                            'similarity': similarity
-                        })
-
-            # Merge adjacent matching blocks
-            if matches:
-                # Sort by start position
-                matches.sort(key=lambda x: x['start_idx'])
-
-                # Merge adjacent or overlapping blocks
-                merged_matches = []
-                current_match = matches[0].copy()  # work on a copy
-
-                for i in range(1, len(matches)):
-                    next_match = matches[i]
-                    # Merge if the next block is adjacent to or overlaps the current one
-                    # Condition: next start <= current end + a small buffer
-                    if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
-                        # Merge the index ranges
-                        current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
-                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
-                        # Length-weighted average similarity
-                        total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
-                                       (next_match['end_idx'] - next_match['start_idx'] + 1)
-                        current_match['similarity'] = (
-                            current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
-                            next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
-                        ) / total_length
-                    else:
-                        # Not adjacent: store the current block and start a new one
-                        merged_matches.append(current_match)
-                        current_match = next_match.copy()  # work on a copy
-
-                # Append the final block
-                merged_matches.append(current_match)
-
-                # Produce coordinate info for each merged block
-                for match in merged_matches:
-                    start_idx = match['start_idx']
-                    end_idx = match['end_idx']
-
-                    if start_idx < len(char_list) and end_idx < len(char_list):
-                        # Grab all characters in the matched range
-                        matched_chars = char_list[start_idx:end_idx+1]
-
-                        # Compute the bounding box (left, right, top, bottom)
-                        if matched_chars:
-                            # Boundary values
-                            left = min([char['x'] for char in matched_chars])
-                            right = max([char['x'] for char in matched_chars])
-                            bottom = min([char['y'] for char in matched_chars])
-                            top = max([char['y'] for char in matched_chars])
-
-                            # Text content of the match
-                            matched_text = ''.join([char_info['char'] for char_info in matched_chars])
-
-                            position = [
-                                page_num,
-                                left,    # left
-                                right,   # right
-                                top,     # top
-                                bottom,  # bottom
-                                matched_text,        # matched content
-                                match['similarity']  # similarity
-                            ]
-                            found_positions.append(position)
-
-    return found_positions
-    """
-    Fuzzy-search a PDF for the given text and return its coordinates.
-
-    Args:
-        pdf_path (str): Path to the PDF file
-        target_text (str): Text to search for
-        similarity_threshold (float): Similarity threshold (0-1), default 0.8
-
-    Returns:
-        list: List of coordinate records for the matched text
-    """
-    if not os.path.exists(pdf_path):
-        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
-
-    # Clean the target text
-    cleaned_target = clean_text_for_fuzzy_match(target_text)
-
-    # Open the local PDF file
-    with open(pdf_path, 'rb') as fp:
-        parser = PDFParser(fp)
-        doc = PDFDocument(parser)
-
-        rsrcmgr = PDFResourceManager()
-        laparams = LAParams()
-        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-        interpreter = PDFPageInterpreter(rsrcmgr, device)
-
-        found_positions = []
-
-        # Process each page
-        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
-            interpreter.process_page(page)
-            layout = device.get_result()
-            char_list = parse_char_layout(layout)
-
-            # Join the page characters into a single string
-            page_text = ''.join([char_info['char'] for char_info in char_list])
-            cleaned_page_text = clean_text_for_fuzzy_match(page_text)
-
-            # Sliding-window search for similar text
-            target_len = len(cleaned_target)
-            if target_len == 0:
-                continue
-
-            # Collect all matching blocks
-            matches = []
-            for i in range(len(cleaned_page_text) - target_len + 1):
-                window_text = cleaned_page_text[i:i + target_len]
-                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
-
-                if similarity >= similarity_threshold:
-                    # Match found: record its position and similarity
-                    if i < len(char_list):
-                        matches.append({
-                            'start_idx': i,
-                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
-                            'similarity': similarity
-                        })
-
-            # Merge adjacent matching blocks
-            if matches:
-                # Sort by start position
-                matches.sort(key=lambda x: x['start_idx'])
-
-                # Merge adjacent or overlapping blocks
-                merged_matches = []
-                current_match = matches[0]
-
-                for i in range(1, len(matches)):
-                    next_match = matches[i]
-                    # Merge if the next block is adjacent to or overlaps the current one
-                    if next_match['start_idx'] <= current_match['end_idx'] + target_len:
-                        # Merge the index ranges
-                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
-                        # Average the similarities
-                        current_match['similarity'] = (current_match['similarity'] + next_match['similarity']) / 2
-                    else:
-                        # Not adjacent: store the current block and start a new one
-                        merged_matches.append(current_match)
-                        current_match = next_match
-
-                # Append the final block
-                merged_matches.append(current_match)
-
-                # Produce coordinate info for each merged block
-                for match in merged_matches:
-                    start_idx = match['start_idx']
-                    end_idx = match['end_idx']
-
-                    if start_idx < len(char_list) and end_idx < len(char_list):
-                        # Grab all characters in the matched range
-                        matched_chars = char_list[start_idx:end_idx+1]
-
-                        # Compute the bounding box (left, right, top, bottom)
-                        if matched_chars:
-                            # Boundary values
-                            left = min([char['x'] for char in matched_chars])
-                            right = max([char['x'] for char in matched_chars])
-                            bottom = min([char['y'] for char in matched_chars])
-                            top = max([char['y'] for char in matched_chars])
-
-                            # Text content of the match
-                            matched_text = ''.join([char_info['char'] for char_info in matched_chars])
-
-                            position = [
-                                page_num,
-                                left,    # left
-                                right,   # right
-                                top,     # top
-                                bottom,  # bottom
-                                matched_text,        # matched content
-                                match['similarity']  # similarity
-                            ]
-                            found_positions.append(position)
-
-    return found_positions
-def find_text_positions(pdf_path, target_text):
-    """
-    Search a PDF for the given text and return its coordinates.
-
-    Args:
-        pdf_path (str): Path to the PDF file
-        target_text (str): Text to search for
-
-    Returns:
-        list: List of coordinate records for the matched text
-    """
-    if not os.path.exists(pdf_path):
-        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
-
-    # Normalize the target text
-    normalized_target = normalize_text(target_text)
+    # Initialize the result dict
+    batch_results = {text: [] for text in target_texts}
 
     # Open the local PDF file
     with open(pdf_path, 'rb') as fp:
@@ -468,101 +240,32 @@ def find_text_positions(pdf_path, target_text):
         full_text = ''.join([char_info['char'] for char_info in all_chars])
         normalized_full_text = normalize_text(full_text)
 
-    # Search the normalized text for the target
-    found_positions = []
-    start = 0
-    while True:
-        pos = normalized_full_text.find(normalized_target, start)
-        if pos == -1:
-            break
+    # Look up positions for each target text
+    for target_text in target_texts:
+        # Normalize the target text
+        normalized_target = normalize_text(target_text)
 
-        # Match found: fetch the corresponding coordinates
-        if pos < len(all_chars):
-            start_char = all_chars[pos]
-            end_pos = pos + len(normalized_target) - 1
-            if end_pos < len(all_chars):
-                end_char = all_chars[end_pos]
-                # Work out which page the match is on
-                page_num = 1
-                for i, page_start in enumerate(page_start_indices):
-                    if pos >= page_start:
-                        page_num = i + 1
-
-                # Text content of the match
-                matched_text = ''.join([char_info['char'] for char_info in all_chars[pos:pos+len(normalized_target)]])
-
-                # Compute the bounding box (left, right, top, bottom)
-                left = min(start_char['x'], end_char['x'])
-                right = max(start_char['x'], end_char['x'])
-                bottom = min(start_char['y'], end_char['y'])
-                top = max(start_char['y'], end_char['y'])
-
-                position=[
-                    page_num,
-                    left,    # left
-                    right,   # right
-                    top,     # top
-                    bottom,  # bottom
-                    matched_text,  # matched content
-                    1.0  # similarity (1.0 for an exact match)
-                ]
-                found_positions.append(position)
-
-        start = pos + 1
-
-    return found_positions
-
-def find_text_in_pdf_per_page(pdf_path, target_text):
-    """
-    Search a PDF page by page for the given text and return its coordinates.
-
-    Args:
-        pdf_path (str): Path to the PDF file
-        target_text (str): Text to search for
-
-    Returns:
-        list: List of coordinate records for the matched text
-    """
-    if not os.path.exists(pdf_path):
-        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
-
-    # Normalize the target text
-    normalized_target = normalize_text(target_text)
-
-    # Open the local PDF file
-    with open(pdf_path, 'rb') as fp:
-        parser = PDFParser(fp)
-        doc = PDFDocument(parser)
-
-        rsrcmgr = PDFResourceManager()
-        laparams = LAParams()
-        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-        interpreter = PDFPageInterpreter(rsrcmgr, device)
-
-        found_positions = []
-
-        # Process each page
-        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
-            interpreter.process_page(page)
-            layout = device.get_result()
-            char_list = parse_char_layout(layout)
-
-            # Join the page characters into a single string and normalize it
-            page_text = ''.join([char_info['char'] for char_info in char_list])
-            normalized_page_text = normalize_text(page_text)
-
-            # Search the page text for the target
-            pos = normalized_page_text.find(normalized_target)
-            if pos != -1:
+        found_positions = []
+        start = 0
+        while True:
+            pos = normalized_full_text.find(normalized_target, start)
+            if pos == -1:
+                break
+
             # Match found: fetch the corresponding coordinates
-            if pos < len(char_list):
-                start_char = char_list[pos]
+            if pos < len(all_chars):
+                start_char = all_chars[pos]
                 end_pos = pos + len(normalized_target) - 1
-            if end_pos < len(char_list):
-                end_char = char_list[end_pos]
+                if end_pos < len(all_chars):
+                    end_char = all_chars[end_pos]
+                    # Work out which page the match is on
+                    page_num = 1
+                    for i, page_start in enumerate(page_start_indices):
+                        if pos >= page_start:
+                            page_num = i + 1
 
                 # Text content of the match
-                matched_text = ''.join([char_info['char'] for char_info in char_list[pos:pos+len(normalized_target)]])
+                matched_text = ''.join([char_info['char'] for char_info in all_chars[pos:pos+len(normalized_target)]])
 
                 # Compute the bounding box (left, right, top, bottom)
                 left = min(start_char['x'], end_char['x'])
@@ -570,41 +273,37 @@ def find_text_in_pdf_per_page(pdf_path, target_text):
                 bottom = min(start_char['y'], end_char['y'])
                 top = max(start_char['y'], end_char['y'])
 
-                position=[
+                position = [
                     page_num,
                     left,    # left
                     right,   # right
                     top,     # top
                     bottom,  # bottom
-                    matched_text,  # matched content
-                    1.0  # similarity (1.0 for an exact match)
+                    matched_text,  # matched content (kept so records stay consistent)
+                    1.0            # similarity (1.0 for an exact match)
                 ]
                 found_positions.append(position)
-
-        return found_positions
+
+            start = pos + 1
+
+        batch_results[target_text] = found_positions
+
+    return batch_results
 
-def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
+def find_text_in_pdf_per_page_batch(pdf_path, target_texts):
     """
-    Find partially matching text (suited to longer passages)
+    Search a PDF page by page for a batch of texts and return their coordinates.
 
     Args:
         pdf_path (str): Path to the PDF file
-        target_text (str): Text to search for
-        min_match_ratio (float): Minimum match ratio (0-1)
+        target_texts (list): List of texts to search for
 
     Returns:
-        list: List of coordinate records for the matched text
+        dict: Dict keyed by target_text; each value is a list of coordinate records
     """
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
 
-    # Split the target text into keywords or phrases
-    normalized_target = normalize_text(target_text)
-    # Extract keywords (words left after dropping common stop words)
-    keywords = [word for word in normalized_target.split() if len(word) > 2]
-
-    if not keywords:
-        keywords = normalized_target.split()  # fall back to all words when there are no long ones
+    # Initialize the result dict
+    batch_results = {text: [] for text in target_texts}
 
     # Open the local PDF file
     with open(pdf_path, 'rb') as fp:
@@ -616,7 +315,77 @@ def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
 
-        found_positions = []
+        # Process each page
+        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
+            interpreter.process_page(page)
+            layout = device.get_result()
+            char_list = parse_char_layout(layout)
+
+            # Join the page characters into a single string and normalize it
+            page_text = ''.join([char_info['char'] for char_info in char_list])
+            normalized_page_text = normalize_text(page_text)
+
+            # Search the current page for each target text
+            for target_text in target_texts:
+                normalized_target = normalize_text(target_text)
+
+                # Search the page text for the target
+                pos = normalized_page_text.find(normalized_target)
+                if pos != -1:
+                    # Match found: fetch the corresponding coordinates
+                    if pos < len(char_list):
+                        start_char = char_list[pos]
+                        end_pos = pos + len(normalized_target) - 1
+                        if end_pos < len(char_list):
+                            end_char = char_list[end_pos]
+
+                            # Text content of the match
+                            matched_text = ''.join([char_info['char'] for char_info in char_list[pos:pos+len(normalized_target)]])
+
+                            # Compute the bounding box (left, right, top, bottom)
+                            left = min(start_char['x'], end_char['x'])
+                            right = max(start_char['x'], end_char['x'])
+                            bottom = min(start_char['y'], end_char['y'])
+                            top = max(start_char['y'], end_char['y'])
+
+                            position = [
+                                page_num,
+                                left,    # left
+                                right,   # right
+                                top,     # top
+                                bottom,  # bottom
+                                matched_text,  # matched content (kept so records stay consistent)
+                                1.0            # similarity (1.0 for an exact match)
+                            ]
+                            batch_results[target_text].append(position)
+
+    return batch_results
+
+def find_partial_text_positions_batch(pdf_path, target_texts, min_match_ratio=0.7):
+    """
+    Batch-find partially matching text (suited to longer passages)
+
+    Args:
+        pdf_path (str): Path to the PDF file
+        target_texts (list): List of texts to search for
+        min_match_ratio (float): Minimum match ratio (0-1)
+
+    Returns:
+        dict: Dict keyed by target_text; each value is a list of coordinate records
+    """
+    if not os.path.exists(pdf_path):
+        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
+
+    # Initialize the result dict
+    batch_results = {text: [] for text in target_texts}
+
+    # Open the local PDF file
+    with open(pdf_path, 'rb') as fp:
+        parser = PDFParser(fp)
+        doc = PDFDocument(parser)
+
+        rsrcmgr = PDFResourceManager()
+        laparams = LAParams()
+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
 
         # Process each page
         for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
@@ -628,90 +397,136 @@ def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
             page_text = ''.join([char_info['char'] for char_info in char_list])
             normalized_page_text = normalize_text(page_text)
 
-        # Count how many keywords match
-        matched_keywords = 0
-        for keyword in keywords:
-            if keyword in normalized_page_text:
-                matched_keywords += 1
-
-        # Treat it as a hit when the matched-keyword ratio clears the threshold
-        if len(keywords) > 0 and (matched_keywords / len(keywords)) >= min_match_ratio:
-            # For simplicity, return the coordinates of the page's first and last characters
-            if char_list:
-                start_char = char_list[0]
-                end_char = char_list[-1]
-                match_ratio = matched_keywords / len(keywords)
+            # Compute the match for each target text
+            for target_text in target_texts:
+                # Split the target text into keywords or phrases
+                normalized_target = normalize_text(target_text)
+                # Extract keywords (words left after dropping common stop words)
+                keywords = [word for word in normalized_target.split() if len(word) > 2]
+
+                if not keywords:
+                    keywords = normalized_target.split()  # fall back to all words when there are no long ones
+
+                if not keywords:
+                    continue
 
-            # Use the page text as the matched content
-            matched_text = ''.join([char_info['char'] for char_info in char_list])
-
-            # Compute the bounding box (left, right, top, bottom)
-            left = min(start_char['x'], end_char['x'])
-            right = max(start_char['x'], end_char['x'])
-            bottom = min(start_char['y'], end_char['y'])
-            top = max(start_char['y'], end_char['y'])
-
-            position = [
-                page_num,
-                left,    # left
-                right,   # right
-                top,     # top
-                bottom,  # bottom
-                matched_text[:100] + "..." if len(matched_text) > 100 else matched_text,  # matched content (truncated)
-                match_ratio  # match ratio
-            ]
-            found_positions.append(position)
-
-    return found_positions
-def smart_fuzzy_find_text(pdf_path, target_text, similarity_threshold=0.8):
+                # Count how many keywords match
+                matched_keywords = 0
+                for keyword in keywords:
+                    if keyword in normalized_page_text:
+                        matched_keywords += 1
+
+                # Treat it as a hit when the matched-keyword ratio clears the threshold
+                if len(keywords) > 0 and (matched_keywords / len(keywords)) >= min_match_ratio:
+                    # For simplicity, return the coordinates of the page's first and last characters
+                    if char_list:
+                        start_char = char_list[0]
+                        end_char = char_list[-1]
+                        match_ratio = matched_keywords / len(keywords)
+
+                        # Use the page text as the matched content
+                        matched_text = ''.join([char_info['char'] for char_info in char_list])
+
+                        # Compute the bounding box (left, right, top, bottom)
+                        left = min(start_char['x'], end_char['x'])
+                        right = max(start_char['x'], end_char['x'])
+                        bottom = min(start_char['y'], end_char['y'])
+                        top = max(start_char['y'], end_char['y'])
+
+                        position = [
+                            page_num,
+                            left,    # left
+                            right,   # right
+                            top,     # top
+                            bottom,  # bottom
+                            matched_text[:100] + "..." if len(matched_text) > 100 else matched_text,  # matched content (kept, truncated)
+                            match_ratio  # match ratio
+                        ]
+                        batch_results[target_text].append(position)
+
+    return batch_results
+
+def smart_fuzzy_find_text_batch(pdf_path, target_texts, similarity_threshold=0.8):
     """
-    Smart fuzzy text search that combines several methods
+    Smart batch fuzzy text search that combines several methods
 
     Args:
         pdf_path (str): Path to the PDF file
-        target_text (str): Text to search for
+        target_texts (list): List of texts to search for
         similarity_threshold (float): Similarity threshold
 
     Returns:
-        list: List of coordinate records for the matched text
+        dict: Dict keyed by target_text; each value is a list of coordinate records
     """
+    # Initialize the result dict
+    batch_results = {text: [] for text in target_texts}
+
     # Method 1: exact matching
-    exact_results = find_text_in_pdf_per_page(pdf_path, target_text)
-    if exact_results:
-        return exact_results
+    exact_results = find_text_in_pdf_per_page_batch(pdf_path, target_texts)
 
-    # Method 2: fuzzy matching
-    fuzzy_results = find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold)
-    if fuzzy_results:
-        return fuzzy_results
+    # Texts that already have an exact match can use those results directly
+    remaining_texts = []
+    for text in target_texts:
+        if exact_results.get(text):
+            batch_results[text] = exact_results[text]
+        else:
+            remaining_texts.append(text)
 
-    # Method 3: partial matching (keyword matching)
-    partial_results = find_partial_text_positions(pdf_path, target_text, 0.5)
-    return partial_results
+    if not remaining_texts:
+        return batch_results
+
+    # Method 2: fuzzy matching (only for texts without an exact match)
+    fuzzy_results = find_fuzzy_text_positions_batch(pdf_path, remaining_texts, similarity_threshold)
+
+    # Record the fuzzy hits
+    for text in remaining_texts:
+        if fuzzy_results.get(text):
+            batch_results[text] = fuzzy_results[text]
+    # Rebuild the remaining list instead of mutating it while iterating over it
+    remaining_texts = [t for t in remaining_texts if not fuzzy_results.get(t)]
+
+    if not remaining_texts:
+        return batch_results
+
+    # Method 3: partial matching (keywords, only for texts still without a match)
+    partial_results = find_partial_text_positions_batch(pdf_path, remaining_texts, 0.5)
+
+    # Fold in the final results
+    for text in remaining_texts:
+        if partial_results.get(text):
+            batch_results[text] = partial_results[text]
+
+    return batch_results
 
 if __name__ == '__main__':
     # Use a local PDF file
     pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'  # change this to your PDF file path
-    target_text = '''创建 `plan` 文件: 固化和锁定最终的"怎么做"
-• 基于 `plan` 执行: 精准驱动 AI 完成任务'''
+    target_texts = [
+        '''创建 `plan` 文件: 固化和锁定最终的"怎么做"
+• 基于 `plan` 执行: 精准驱动 AI 完成任务''',
+        "其他要查找的文本1",
+        "其他要查找的文本2"
+    ]
 
     try:
-        print("Smart fuzzy search:")
-        positions = smart_fuzzy_find_text(pdf_file_path, target_text, similarity_threshold=0.7)
+        print("Batch smart fuzzy search:")
+        batch_positions = smart_fuzzy_find_text_batch(pdf_file_path, target_texts, similarity_threshold=0.7)
 
-        if positions:
-            print(f"Found the text at the following positions:")
-            for pos in positions:
-                if len(pos) >= 7:  # record carries matched content and similarity
-                    print(f"Page: {pos[0]}, bounding box: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f}), similarity: {pos[6]:.2f}")
-                    print(f"Matched content: {pos[5][:50]}{'...' if len(pos[5]) > 50 else ''}")
-                    print("-" * 50)
-                else:
-                    print(f"Page: {pos[0]}, bounding box: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
-        else:
-            print("Text not found")
-
+        for target_text, positions in batch_positions.items():
+            print(f"\nSearch text: {target_text[:50]}{'...' if len(target_text) > 50 else ''}")
+            if positions:
+                print("Found the text at the following positions:")
+                for pos in positions:
+                    print(f"Page: {pos[0]}, bounding box: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
+                    if len(pos) >= 7:  # record carries a similarity score
+                        print(f"Similarity: {pos[6]:.2f}")
+                    if len(pos) >= 6:  # record carries the matched content
+                        print(f"Matched content: {pos[5][:50]}{'...' if len(pos[5]) > 50 else ''}")
+                    print("-" * 50)
            else:
+                print("Text not found")
+
     except FileNotFoundError as e:
         print(e)
     except Exception as e:
-        print(f"Error while processing the PDF: {e}")
\ No newline at end of file
+        print(f"Error while processing the PDF: {e}")
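
A minimal usage sketch of the batch API introduced by this diff, assuming the module is importable as get_pos_pdf; the PDF path and target strings below are placeholders, not values from the PR:

    from get_pos_pdf import smart_fuzzy_find_text_batch

    # Placeholder inputs: point these at a real PDF and the texts to locate.
    results = smart_fuzzy_find_text_batch(
        'example.pdf',
        ['first phrase', 'second phrase'],
        similarity_threshold=0.7,
    )
    for text, positions in results.items():
        # Each record is [page, left, right, top, bottom] plus, depending on
        # the method that matched, [matched_text, score]; *extra tolerates both.
        for page, left, right, top, bottom, *extra in positions:
            print(text[:30], page, left, right, top, bottom, extra)

Returning a dict keyed by target text lets callers issue one call per document instead of one per string; the page layout pass is parsed once and shared across all targets, which is the point of the batch refactor.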