Implement batch fuzzy text search for PDFs: support similarity matching for multiple target texts and refine the text-coordinate return logic
@@ -54,23 +54,23 @@ def clean_text_for_fuzzy_match(text):
    # Normalize whitespace characters
    cleaned = re.sub(r'\s+', ' ', cleaned.strip())
    return cleaned
def find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold=0.8):
def find_fuzzy_text_positions_batch(pdf_path, target_texts, similarity_threshold=0.8):
    """
    Fuzzy-search a PDF for the given text and return its coordinates
    Batch fuzzy-search a PDF for the given texts and return their coordinates
    
    Args:
        pdf_path (str): path to the PDF file
        target_text (str): the text to search for
        target_texts (list): the list of texts to search for
        similarity_threshold (float): similarity threshold (0-1), default 0.8
    
    Returns:
        list: a list of coordinate entries for the matched text
        dict: a dict keyed by target_text whose values are lists of match-coordinate entries
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
    
    # Clean the target text
    cleaned_target = clean_text_for_fuzzy_match(target_text)
    # Initialize the results dict
    batch_results = {text: [] for text in target_texts}
    
    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
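
The switch to a batch API changes the return shape: instead of a flat list, callers get one entry per query string. A minimal illustration of the shape the docstring describes (every number and string below is made-up sample data, not output from a real document):

# Hypothetical result of find_fuzzy_text_positions_batch(pdf_path, ["first query", "second query"])
batch_results = {
    "first query": [
        # [page_num, left, right, top, bottom, matched_text, similarity]
        [1, 72.0, 318.5, 701.2, 688.4, "first query", 0.92],
    ],
    "second query": [],  # queries with no hits map to an empty list
}
for query, hits in batch_results.items():
    print(f"{query!r}: {len(hits)} match(es)")
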
@@ -82,366 +82,138 @@ def find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold=0.8):
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        
        found_positions = []
        
        # Process each page
        pages_chars = []
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)
            
            # Join the page characters into a single text
            page_text = ''.join([char_info['char'] for char_info in char_list])
            cleaned_page_text = clean_text_for_fuzzy_match(page_text)
            
            # Sliding-window search for similar text
            pages_chars.append((page_num, char_list))
        
        # Search for each target text
        for target_text in target_texts:
            # Clean the target text
            cleaned_target = clean_text_for_fuzzy_match(target_text)
            target_len = len(cleaned_target)
            
            if target_len == 0:
                continue
                
            found_positions = []
            
            # Store all matched blocks
            matches = []
            for i in range(len(cleaned_page_text) - target_len + 1):
                window_text = cleaned_page_text[i:i + target_len]
                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
            # Search within each page
            for page_num, char_list in pages_chars:
                # Join the page characters into a single text
                page_text = ''.join([char_info['char'] for char_info in char_list])
                cleaned_page_text = clean_text_for_fuzzy_match(page_text)
                
                if similarity >= similarity_threshold:
                    # Match found; record its position and similarity
                    if i < len(char_list):
                        matches.append({
                            'start_idx': i,
                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
                            'similarity': similarity
                        })
            
            # Merge adjacent matched blocks
            if matches:
                # Sort by start position
                matches.sort(key=lambda x: x['start_idx'])
                
                # Merge adjacent or overlapping matched blocks
                merged_matches = []
                current_match = matches[0].copy()  # make a copy
                
                for i in range(1, len(matches)):
                    next_match = matches[i]
                    # Merge if the next block is adjacent to or overlaps the current one
                    # Condition: the next block's start <= the current block's end + a small buffer
                    if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
                        # Merge the index ranges
                        current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
                        # Compute the length-weighted average similarity
                        total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
                                      (next_match['end_idx'] - next_match['start_idx'] + 1)
                        current_match['similarity'] = (
                            current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
                            next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
                        ) / total_length
                    else:
                        # Not adjacent; save the current block and start a new one
                        merged_matches.append(current_match)
                        current_match = next_match.copy()  # make a copy
                
                # Append the last block
                merged_matches.append(current_match)
                
                # Generate coordinate info for each merged block
                for match in merged_matches:
                    start_idx = match['start_idx']
                    end_idx = match['end_idx']
                # Sliding-window search for similar text
                matches = []
                for i in range(len(cleaned_page_text) - target_len + 1):
                    window_text = cleaned_page_text[i:i + target_len]
                    similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
                    
                    if start_idx < len(char_list) and end_idx < len(char_list):
                        # Get all characters in the matched region
                        matched_chars = char_list[start_idx:end_idx+1]
                    if similarity >= similarity_threshold:
                        # Match found; record its position and similarity
                        if i < len(char_list):
                            matches.append({
                                'start_idx': i,
                                'end_idx': min(i + target_len - 1, len(char_list) - 1),
                                'similarity': similarity
                            })
                
                # Merge adjacent matched blocks
                if matches:
                    # Sort by start position
                    matches.sort(key=lambda x: x['start_idx'])
                    
                    # Merge adjacent or overlapping matched blocks
                    merged_matches = []
                    current_match = matches[0].copy()  # make a copy
                    
                    for i in range(1, len(matches)):
                        next_match = matches[i]
                        # Merge if the next block is adjacent to or overlaps the current one
                        # Condition: the next block's start <= the current block's end + a small buffer
                        if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
                            # Merge the index ranges
                            current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
                            current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
                            # Compute the length-weighted average similarity
                            total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
                                          (next_match['end_idx'] - next_match['start_idx'] + 1)
                            current_match['similarity'] = (
                                current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
                                next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
                            ) / total_length
                        else:
                            # Not adjacent; save the current block and start a new one
                            merged_matches.append(current_match)
                            current_match = next_match.copy()  # make a copy
                    
                    # Append the last block
                    merged_matches.append(current_match)
                    
                    # Generate coordinate info for each merged block
                    for match in merged_matches:
                        start_idx = match['start_idx']
                        end_idx = match['end_idx']
                        
                        # Filter out characters with zero coordinates (usually special characters)
                        valid_chars = [char for char in matched_chars 
                                     if char['x'] > 0 and char['y'] > 0]
                        
                        # If there are no valid characters, fall back to all characters
                        chars_to_use = valid_chars if valid_chars else matched_chars
                        
                        # Compute the bounding box (left, right, top, bottom)
                        if chars_to_use:
                            # Compute the boundary values
                            left = min([char['x'] for char in chars_to_use])
                            right = max([char['x'] for char in chars_to_use])
                            bottom = min([char['y'] for char in chars_to_use])
                            top = max([char['y'] for char in chars_to_use])
                        if start_idx < len(char_list) and end_idx < len(char_list):
                            # Get all characters in the matched region
                            matched_chars = char_list[start_idx:end_idx+1]
                            
                            # Get the matched text content
                            matched_text = ''.join([char_info['char'] for char_info in chars_to_use])
                            # Filter out characters with zero coordinates (usually special characters)
                            valid_chars = [char for char in matched_chars 
                                         if char['x'] > 0 and char['y'] > 0]
                            
                            # Only add the result when the bounding box is valid
                            if left >= 0 and right > left and top > bottom:
                                position = [
                                    page_num,
                                    left,    # left
                                    right,   # right
                                    top,     # top
                                    bottom,  # bottom
                                    matched_text,  # the matched content
                                    match['similarity']  # the similarity score
                                ]
                                found_positions.append(position)
        
        return found_positions
                            # If there are no valid characters, fall back to all characters
                            chars_to_use = valid_chars if valid_chars else matched_chars
                            
                            # Compute the bounding box (left, right, top, bottom)
                            if chars_to_use:
                                # Compute the boundary values
                                left = min([char['x'] for char in chars_to_use])
                                right = max([char['x'] for char in chars_to_use])
                                bottom = min([char['y'] for char in chars_to_use])
                                top = max([char['y'] for char in chars_to_use])
                                
                                # Get the matched text content
                                matched_text = ''.join([char_info['char'] for char_info in chars_to_use])
                                
                                # Only add the result when the bounding box is valid
                                if left >= 0 and right > left and top > bottom:
                                    position = [
                                        page_num,
                                        left,    # left
                                        right,   # right
                                        top,     # top
                                        bottom,  # bottom
                                        matched_text,  # the matched content
                                        match['similarity']  # the similarity score
                                    ]
                                    found_positions.append(position)
            
            batch_results[target_text] = found_positions
    
    return batch_results
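The heart of the fuzzy pass is the sliding window above: every window of the cleaned page text with the same length as the cleaned query is scored with difflib.SequenceMatcher, and windows at or above the threshold are kept. A self-contained sketch of that scan, stripped of the PDF plumbing (find_windows and the sample strings are illustrative, not part of the module):

from difflib import SequenceMatcher

def find_windows(haystack, needle, threshold=0.8):
    """Return (start_index, score) for every window scoring >= threshold."""
    n = len(needle)
    hits = []
    for i in range(len(haystack) - n + 1):
        score = SequenceMatcher(None, needle, haystack[i:i + n]).ratio()
        if score >= threshold:
            hits.append((i, round(score, 3)))
    return hits

print(find_windows("the quick brovn fox", "brown fox"))
# [(10, 0.889)] -- the OCR-style typo "brovn" still clears the 0.8 threshold.

Note this scan is O(len(page) * len(query)) per page, which is why the batch version extracts each page's characters once and reuses them for every query.
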
def find_text_positions_batch(pdf_path, target_texts):
    """
    Fuzzy-search a PDF for the given text and return its coordinates
    Batch-search a PDF for the given texts and return their coordinates
    
    Args:
        pdf_path (str): path to the PDF file
        target_text (str): the text to search for
        similarity_threshold (float): similarity threshold (0-1), default 0.8
        target_texts (list): the list of texts to search for
    
    Returns:
        list: a list of coordinate entries for the matched text
        dict: a dict keyed by target_text whose values are lists of match-coordinate entries
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
    
    # Clean the target text
    cleaned_target = clean_text_for_fuzzy_match(target_text)
    
    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        
        found_positions = []
        
        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)
            
            # Join the page characters into a single text
            page_text = ''.join([char_info['char'] for char_info in char_list])
            cleaned_page_text = clean_text_for_fuzzy_match(page_text)
            
            # Sliding-window search for similar text
            target_len = len(cleaned_target)
            if target_len == 0:
                continue
            
            # Store all matched blocks
            matches = []
            for i in range(len(cleaned_page_text) - target_len + 1):
                window_text = cleaned_page_text[i:i + target_len]
                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
                
                if similarity >= similarity_threshold:
                    # Match found; record its position and similarity
                    if i < len(char_list):
                        matches.append({
                            'start_idx': i,
                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
                            'similarity': similarity
                        })
            
            # Merge adjacent matched blocks
            if matches:
                # Sort by start position
                matches.sort(key=lambda x: x['start_idx'])
                
                # Merge adjacent or overlapping matched blocks
                merged_matches = []
                current_match = matches[0].copy()  # make a copy
                
                for i in range(1, len(matches)):
                    next_match = matches[i]
                    # Merge if the next block is adjacent to or overlaps the current one
                    # Condition: the next block's start <= the current block's end + a small buffer
                    if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
                        # Merge the index ranges
                        current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
                        # Compute the length-weighted average similarity
                        total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
                                      (next_match['end_idx'] - next_match['start_idx'] + 1)
                        current_match['similarity'] = (
                            current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
                            next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
                        ) / total_length
                    else:
                        # Not adjacent; save the current block and start a new one
                        merged_matches.append(current_match)
                        current_match = next_match.copy()  # make a copy
                
                # Append the last block
                merged_matches.append(current_match)
                
                # Generate coordinate info for each merged block
                for match in merged_matches:
                    start_idx = match['start_idx']
                    end_idx = match['end_idx']
                    
                    if start_idx < len(char_list) and end_idx < len(char_list):
                        # Get all characters in the matched region
                        matched_chars = char_list[start_idx:end_idx+1]
                        
                        # Compute the bounding box (left, right, top, bottom)
                        if matched_chars:
                            # Compute the boundary values
                            left = min([char['x'] for char in matched_chars])
                            right = max([char['x'] for char in matched_chars])
                            bottom = min([char['y'] for char in matched_chars])
                            top = max([char['y'] for char in matched_chars])
                            
                            # Get the matched text content
                            matched_text = ''.join([char_info['char'] for char_info in matched_chars])
                            
                            position = [
                                page_num,
                                left,    # left
                                right,   # right
                                top,     # top
                                bottom,  # bottom
                                matched_text,  # the matched content
                                match['similarity']  # the similarity score
                            ]
                            found_positions.append(position)
        
        return found_positions
    """
    Fuzzy-search a PDF for the given text and return its coordinates
    
    Args:
        pdf_path (str): path to the PDF file
        target_text (str): the text to search for
        similarity_threshold (float): similarity threshold (0-1), default 0.8
    
    Returns:
        list: a list of coordinate entries for the matched text
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
    
    # Clean the target text
    cleaned_target = clean_text_for_fuzzy_match(target_text)
    
    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        
        found_positions = []
        
        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)
            
            # Join the page characters into a single text
            page_text = ''.join([char_info['char'] for char_info in char_list])
            cleaned_page_text = clean_text_for_fuzzy_match(page_text)
            
            # Sliding-window search for similar text
            target_len = len(cleaned_target)
            if target_len == 0:
                continue
            
            # Store all matched blocks
            matches = []
            for i in range(len(cleaned_page_text) - target_len + 1):
                window_text = cleaned_page_text[i:i + target_len]
                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
                
                if similarity >= similarity_threshold:
                    # Match found; record its position and similarity
                    if i < len(char_list):
                        matches.append({
                            'start_idx': i,
                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
                            'similarity': similarity
                        })
            
            # Merge adjacent matched blocks
            if matches:
                # Sort by start position
                matches.sort(key=lambda x: x['start_idx'])
                
                # Merge adjacent or overlapping matched blocks
                merged_matches = []
                current_match = matches[0]
                
                for i in range(1, len(matches)):
                    next_match = matches[i]
                    # Merge if the next block is adjacent to or overlaps the current one
                    if next_match['start_idx'] <= current_match['end_idx'] + target_len:
                        # Merge the index ranges
                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
                        # Average the similarity
                        current_match['similarity'] = (current_match['similarity'] + next_match['similarity']) / 2
                    else:
                        # Not adjacent; save the current block and start a new one
                        merged_matches.append(current_match)
                        current_match = next_match
                
                # Append the last block
                merged_matches.append(current_match)
                
                # Generate coordinate info for each merged block
                for match in merged_matches:
                    start_idx = match['start_idx']
                    end_idx = match['end_idx']
                    
                    if start_idx < len(char_list) and end_idx < len(char_list):
                        # Get all characters in the matched region
                        matched_chars = char_list[start_idx:end_idx+1]
                        
                        # Compute the bounding box (left, right, top, bottom)
                        if matched_chars:
                            # Compute the boundary values
                            left = min([char['x'] for char in matched_chars])
                            right = max([char['x'] for char in matched_chars])
                            bottom = min([char['y'] for char in matched_chars])
                            top = max([char['y'] for char in matched_chars])
                            
                            # Get the matched text content
                            matched_text = ''.join([char_info['char'] for char_info in matched_chars])
                            
                            position = [
                                page_num,
                                left,    # left
                                right,   # right
                                top,     # top
                                bottom,  # bottom
                                matched_text,  # the matched content
                                match['similarity']  # the similarity score
                            ]
                            found_positions.append(position)
        
        return found_positions
def find_text_positions(pdf_path, target_text):
    """
    Search a PDF for the given text and return its coordinates
    
    Args:
        pdf_path (str): path to the PDF file
        target_text (str): the text to search for
    
    Returns:
        list: a list of coordinate entries for the matched text
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
    
    # Normalize the target text
    normalized_target = normalize_text(target_text)
    # Initialize the results dict
    batch_results = {text: [] for text in target_texts}
    
    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
@@ -468,101 +240,32 @@ def find_text_positions(pdf_path, target_text):
        full_text = ''.join([char_info['char'] for char_info in all_chars])
        normalized_full_text = normalize_text(full_text)
        
        # Search the normalized text for the target text
        found_positions = []
        start = 0
        while True:
            pos = normalized_full_text.find(normalized_target, start)
            if pos == -1:
                break
        # Find positions for each target text
        for target_text in target_texts:
            # Normalize the target text
            normalized_target = normalize_text(target_text)
            
            # Match found; fetch the corresponding coordinate info
            if pos < len(all_chars):
                start_char = all_chars[pos]
                end_pos = pos + len(normalized_target) - 1
                if end_pos < len(all_chars):
                    end_char = all_chars[end_pos]
                    # Determine which page it is on
                    page_num = 1
                    for i, page_start in enumerate(page_start_indices):
                        if pos >= page_start:
                            page_num = i + 1
                    
                    # Get the matched text content
                    matched_text = ''.join([char_info['char'] for char_info in all_chars[pos:pos+len(normalized_target)]])
                    
                    # Compute the bounding box (left, right, top, bottom)
                    left = min(start_char['x'], end_char['x'])
                    right = max(start_char['x'], end_char['x'])
                    bottom = min(start_char['y'], end_char['y'])
                    top = max(start_char['y'], end_char['y'])
                    
                    position=[
                        page_num,
                        left,    # left
                        right,   # right
                        top,     # top
                        bottom,  # bottom
                        matched_text,  # the matched content
                        1.0  # the similarity score (1.0 for an exact match)
                    ]
                    found_positions.append(position)
            
            start = pos + 1
        
        return found_positions

def find_text_in_pdf_per_page(pdf_path, target_text):
    """
    Search each page of a PDF for the given text and return coordinates
    
    Args:
        pdf_path (str): path to the PDF file
        target_text (str): the text to search for
    
    Returns:
        list: a list of coordinate entries for the matched text
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
    
    # Normalize the target text
    normalized_target = normalize_text(target_text)
    
    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        
        found_positions = []
        
        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)
            
            # Join the page characters into text and normalize it
            page_text = ''.join([char_info['char'] for char_info in char_list])
            normalized_page_text = normalize_text(page_text)
            
            # Search the page text for the target text
            pos = normalized_page_text.find(normalized_target)
            if pos != -1:
            found_positions = []
            start = 0
            while True:
                pos = normalized_full_text.find(normalized_target, start)
                if pos == -1:
                    break
                
                # Match found; fetch the corresponding coordinate info
                if pos < len(char_list):
                    start_char = char_list[pos]
                if pos < len(all_chars):
                    start_char = all_chars[pos]
                    end_pos = pos + len(normalized_target) - 1
                    if end_pos < len(char_list):
                        end_char = char_list[end_pos]
                    if end_pos < len(all_chars):
                        end_char = all_chars[end_pos]
                        # Determine which page it is on
                        page_num = 1
                        for i, page_start in enumerate(page_start_indices):
                            if pos >= page_start:
                                page_num = i + 1
                        
                        # Get the matched text content
                        matched_text = ''.join([char_info['char'] for char_info in char_list[pos:pos+len(normalized_target)]])
                        matched_text = ''.join([char_info['char'] for char_info in all_chars[pos:pos+len(normalized_target)]])
                        
                        # Compute the bounding box (left, right, top, bottom)
                        left = min(start_char['x'], end_char['x'])
@@ -570,41 +273,37 @@ def find_text_in_pdf_per_page(pdf_path, target_text):
                        bottom = min(start_char['y'], end_char['y'])
                        top = max(start_char['y'], end_char['y'])
                        
                        position=[
                        position = [
                            page_num,
                            left,    # left
                            right,   # right
                            top,     # top
                            bottom,  # bottom
                            matched_text,  # the matched content
                            1.0  # the similarity score (1.0 for an exact match)
                        ]
                        found_positions.append(position)
        
        return found_positions
                
                start = pos + 1
            
            batch_results[target_text] = found_positions
    
    return batch_results
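find_text_positions_batch leans on plain str.find with an advancing offset; because the search resumes at pos + 1 rather than pos + len(needle), overlapping occurrences are reported too. The loop in isolation (find_all and the sample strings are illustrative):

def find_all(haystack, needle):
    """Return every start index of needle in haystack, including overlaps."""
    positions, start = [], 0
    while True:
        pos = haystack.find(needle, start)
        if pos == -1:
            break
        positions.append(pos)
        start = pos + 1  # advance by one so overlapping hits are not skipped
    return positions

print(find_all("aaaa", "aa"))  # [0, 1, 2]
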
def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
def find_text_in_pdf_per_page_batch(pdf_path, target_texts):
    """
    Find partially matching text (suited to longer texts)
    Batch-search each page of a PDF for the given texts and return coordinates
    
    Args:
        pdf_path (str): path to the PDF file
        target_text (str): the text to search for
        min_match_ratio (float): minimum match ratio (0-1)
        target_texts (list): the list of texts to search for
    
    Returns:
        list: a list of coordinate entries for the matched text
        dict: a dict keyed by target_text whose values are lists of match-coordinate entries
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
    
    # Split the target text into keywords or phrases
    normalized_target = normalize_text(target_text)
    # Extract keywords (the words left after removing common stopwords)
    keywords = [word for word in normalized_target.split() if len(word) > 2]
    
    if not keywords:
        keywords = normalized_target.split()  # if there are no long words, use all of them
    # Initialize the results dict
    batch_results = {text: [] for text in target_texts}
    
    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
@@ -616,7 +315,77 @@ def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        
        found_positions = []
        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)
            
            # Join the page characters into text and normalize it
            page_text = ''.join([char_info['char'] for char_info in char_list])
            normalized_page_text = normalize_text(page_text)
            
            # Search the current page for each target text
            for target_text in target_texts:
                normalized_target = normalize_text(target_text)
                
                # Search the page text for the target text
                pos = normalized_page_text.find(normalized_target)
                if pos != -1:
                    # Match found; fetch the corresponding coordinate info
                    if pos < len(char_list):
                        start_char = char_list[pos]
                        end_pos = pos + len(normalized_target) - 1
                        if end_pos < len(char_list):
                            end_char = char_list[end_pos]
                            
                            # Get the matched text content
                            matched_text = ''.join([char_info['char'] for char_info in char_list[pos:pos+len(normalized_target)]])
                            
                            # Compute the bounding box (left, right, top, bottom)
                            left = min(start_char['x'], end_char['x'])
                            right = max(start_char['x'], end_char['x'])
                            bottom = min(start_char['y'], end_char['y'])
                            top = max(start_char['y'], end_char['y'])
                            
                            position = [
                                page_num,
                                left,    # left
                                right,   # right
                                top,     # top
                                bottom,  # bottom
                            ]
                            batch_results[target_text].append(position)
    
    return batch_results
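One caveat when consuming these [page, left, right, top, bottom] boxes: pdfminer reports layout coordinates in PDF user space, whose origin is the bottom-left corner of the page, so top is numerically larger than bottom. A renderer that draws from a top-left origin has to flip the y values against the page height. A sketch of that conversion (to_top_left_origin is a hypothetical helper; in pdfminer the height would come from the page's mediabox):

def to_top_left_origin(position, page_height):
    """Flip a [page, left, right, top, bottom] box from PDF user space
    (origin at bottom-left) into a top-left-origin coordinate system."""
    page, left, right, top, bottom = position[:5]
    return [page, left, right, page_height - top, page_height - bottom]

# A box near the top of an A4 page (about 842 pt tall):
print(to_top_left_origin([1, 72.0, 200.0, 800.0, 788.0], 842.0))
# -> [1, 72.0, 200.0, 42.0, 54.0]
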
def find_partial_text_positions_batch(pdf_path, target_texts, min_match_ratio=0.7):
    """
    Batch-find partially matching text (suited to longer texts)
    
    Args:
        pdf_path (str): path to the PDF file
        target_texts (list): the list of texts to search for
        min_match_ratio (float): minimum match ratio (0-1)
    
    Returns:
        dict: a dict keyed by target_text whose values are lists of match-coordinate entries
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
    
    # Initialize the results dict
    batch_results = {text: [] for text in target_texts}
    
    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        
        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
@@ -628,90 +397,136 @@ def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
            page_text = ''.join([char_info['char'] for char_info in char_list])
            normalized_page_text = normalize_text(page_text)
            
            # Count the matched keywords
            matched_keywords = 0
            for keyword in keywords:
                if keyword in normalized_page_text:
                    matched_keywords += 1
            
            # Treat it as a match if the ratio of matched keywords exceeds the threshold
            if len(keywords) > 0 and (matched_keywords / len(keywords)) >= min_match_ratio:
                # For simplicity, return the coordinates of the page's first and last characters
                if char_list:
                    start_char = char_list[0]
                    end_char = char_list[-1]
                    match_ratio = matched_keywords / len(keywords)
            # Compute matches for each target text
            for target_text in target_texts:
                # Split the target text into keywords or phrases
                normalized_target = normalize_text(target_text)
                # Extract keywords (the words left after removing common stopwords)
                keywords = [word for word in normalized_target.split() if len(word) > 2]
                
                if not keywords:
                    keywords = normalized_target.split()  # if there are no long words, use all of them
                
                if not keywords:
                    continue
                    
                    # Use the page text as the matched content
                    matched_text = ''.join([char_info['char'] for char_info in char_list])
                    
                    # Compute the bounding box (left, right, top, bottom)
                    left = min(start_char['x'], end_char['x'])
                    right = max(start_char['x'], end_char['x'])
                    bottom = min(start_char['y'], end_char['y'])
                    top = max(start_char['y'], end_char['y'])
                    
                    position = [
                        page_num,
                        left,    # left
                        right,   # right
                        top,     # top
                        bottom,  # bottom
                        matched_text[:100] + "..." if len(matched_text) > 100 else matched_text,  # the matched content (length-limited)
                        match_ratio  # the match-ratio score
                    ]
                    found_positions.append(position)
        
        return found_positions
def smart_fuzzy_find_text(pdf_path, target_text, similarity_threshold=0.8):
                # Count the matched keywords
                matched_keywords = 0
                for keyword in keywords:
                    if keyword in normalized_page_text:
                        matched_keywords += 1
                
                # Treat it as a match if the ratio of matched keywords exceeds the threshold
                if len(keywords) > 0 and (matched_keywords / len(keywords)) >= min_match_ratio:
                    # For simplicity, return the coordinates of the page's first and last characters
                    if char_list:
                        start_char = char_list[0]
                        end_char = char_list[-1]
                        match_ratio = matched_keywords / len(keywords)
                        
                        # Use the page text as the matched content
                        matched_text = ''.join([char_info['char'] for char_info in char_list])
                        
                        # Compute the bounding box (left, right, top, bottom)
                        left = min(start_char['x'], end_char['x'])
                        right = max(start_char['x'], end_char['x'])
                        bottom = min(start_char['y'], end_char['y'])
                        top = max(start_char['y'], end_char['y'])
                        
                        position = [
                            page_num,
                            left,    # left
                            right,   # right
                            top,     # top
                            bottom,  # bottom
                        ]
                        batch_results[target_text].append(position)
    
    return batch_results
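The partial matcher works at page granularity: the normalized query is split into keywords (words longer than two characters), and a page qualifies once the fraction of keywords it contains reaches min_match_ratio. The scoring step in isolation (keyword_match_ratio is illustrative and uses lower() where the module uses normalize_text):

def keyword_match_ratio(page_text, query):
    """Fraction of the query's keywords (len > 2) that occur in the page text."""
    keywords = [w for w in query.lower().split() if len(w) > 2]
    if not keywords:
        keywords = query.lower().split()
    if not keywords:
        return 0.0
    matched = sum(1 for kw in keywords if kw in page_text.lower())
    return matched / len(keywords)

print(keyword_match_ratio("annual report on market trends", "market trends report"))
# 1.0 -- all three keywords appear, so the page qualifies at any threshold.

Because the returned box spans the page's first and last characters, a partial match locates a page rather than a phrase, which is why this method sits last in the cascade.
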
def smart_fuzzy_find_text_batch(pdf_path, target_texts, similarity_threshold=0.8):
    """
    Smart fuzzy text search combining several methods
    Smart batch fuzzy text search combining several methods
    
    Args:
        pdf_path (str): path to the PDF file
        target_text (str): the text to search for
        target_texts (list): the list of texts to search for
        similarity_threshold (float): similarity threshold
    
    Returns:
        list: a list of coordinate entries for the matched text
        dict: a dict keyed by target_text whose values are lists of match-coordinate entries
    """
    # Initialize the results dict
    batch_results = {text: [] for text in target_texts}
    
    # Method 1: exact match
    exact_results = find_text_in_pdf_per_page(pdf_path, target_text)
    if exact_results:
        return exact_results
    exact_results = find_text_in_pdf_per_page_batch(pdf_path, target_texts)
    
    # Method 2: fuzzy match
    fuzzy_results = find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold)
    if fuzzy_results:
        return fuzzy_results
    # For texts that already have an exact match, use those results directly
    remaining_texts = []
    for text in target_texts:
        if exact_results.get(text):
            batch_results[text] = exact_results[text]
        else:
            remaining_texts.append(text)
    
    # Method 3: partial match (keyword matching)
    partial_results = find_partial_text_positions(pdf_path, target_text, 0.5)
    return partial_results
    if not remaining_texts:
        return batch_results
    
    # Method 2: fuzzy match (only for texts without an exact match)
    fuzzy_results = find_fuzzy_text_positions_batch(pdf_path, remaining_texts, similarity_threshold)
    
    # Update the results and keep only the texts that still have no match
    still_unmatched = []
    for text in remaining_texts:
        if fuzzy_results.get(text):
            batch_results[text] = fuzzy_results[text]
        else:
            still_unmatched.append(text)
    remaining_texts = still_unmatched
    
    if not remaining_texts:
        return batch_results
    
    # Method 3: partial match (keyword matching, only for texts still without a match)
    partial_results = find_partial_text_positions_batch(pdf_path, remaining_texts, 0.5)
    
    # Update the final results
    for text in remaining_texts:
        if partial_results.get(text):
            batch_results[text] = partial_results[text]
    
    return batch_results
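smart_fuzzy_find_text_batch is a three-stage cascade: every query gets the cheap exact pass first, and only the still-unmatched queries are forwarded to the fuzzy stage and then to the keyword stage. Narrowing remaining_texts between stages is what keeps the expensive sliding-window scan from re-running for queries that are already satisfied. Note the mixed result shapes: the fuzzy stage appends seven-element entries (with matched text and a similarity score), while the exact per-page and keyword stages append five-element boxes, which is why the demo below length-checks each entry. A usage sketch (sample.pdf and the query strings are placeholders):

queries = ["Payment due within 30 days", "Authorized signature"]
results = smart_fuzzy_find_text_batch("sample.pdf", queries, similarity_threshold=0.75)
for query, hits in results.items():
    print(f"{query!r}: {len(hits)} hit(s)")
# An empty list means none of the three stages (exact, fuzzy, keyword) matched the query.
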
if __name__ == '__main__':
    # Use a local PDF file
    pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'  # change this to your PDF file path
    target_text = '''创建 `plan` 文件: 固化和锁定最终的"怎么做"
• 基于 `plan` 执行: 精准驱动 AI 完成任务'''
    target_texts = [
        '''创建 `plan` 文件: 固化和锁定最终的"怎么做"
• 基于 `plan` 执行: 精准驱动 AI 完成任务''',
        "其他要查找的文本1",
        "其他要查找的文本2"
    ]

    try:
        print("Smart fuzzy search:")
        positions = smart_fuzzy_find_text(pdf_file_path, target_text, similarity_threshold=0.7)
        print("Batch smart fuzzy search:")
        batch_positions = smart_fuzzy_find_text_batch(pdf_file_path, target_texts, similarity_threshold=0.7)
        
        if positions:
            print("Text found at the following positions:")
            for pos in positions:
                if len(pos) >= 7:  # includes matched content and a similarity score
                    print(f"Page: {pos[0]}, bounding box: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f}), similarity: {pos[6]:.2f}")
                    print(f"Matched content: {pos[5][:50]}{'...' if len(pos[5]) > 50 else ''}")
                    print("-" * 50)
                else:
                    print(f"Page: {pos[0]}, bounding box: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
        else:
            print("No matching text found")
            
        for target_text, positions in batch_positions.items():
            print(f"\nSearching for text: {target_text[:50]}{'...' if len(target_text) > 50 else ''}")
            if positions:
                print("Text found at the following positions:")
                for pos in positions:
                    if len(pos) >= 6:  # includes the matched content
                        print(f"Page: {pos[0]}, bounding box: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
                        if len(pos) >= 7:  # includes a similarity score
                            print(f"Similarity: {pos[6]:.2f}")
                        print(f"Matched content: {pos[5][:50]}{'...' if len(pos[5]) > 50 else ''}")
                        print("-" * 50)
                    else:
                        print(f"Page: {pos[0]}, bounding box: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
            else:
                print("No matching text found")
                
    except FileNotFoundError as e:
        print(e)
    except Exception as e:
        print(f"Error while processing the PDF: {e}")