Implement smart fuzzy text search with exact, fuzzy, and partial matching; improve the text-coordinate return logic
@@ -1,6 +1,8 @@
import requests
import io
import os
import re
from difflib import SequenceMatcher
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
@@ -44,6 +46,386 @@ def normalize_text(text):
    normalized = re.sub(r'\s+', ' ', text.strip())
    return normalized


def clean_text_for_fuzzy_match(text):
    """Clean text for fuzzy matching: strip punctuation and special characters, keeping only letters, digits, CJK characters, and spaces."""
    # Remove punctuation and special characters; keep letters, digits, CJK characters, and whitespace
    cleaned = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)
    # Normalize runs of whitespace to a single space
    cleaned = re.sub(r'\s+', ' ', cleaned.strip())
    return cleaned
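

# A quick illustration of the cleaning behaviour (not part of the module):
# punctuation is dropped, word and CJK characters survive, and runs of
# whitespace collapse to a single space.
#   clean_text_for_fuzzy_match('Hello,  world! 你好,世界')  ->  'Hello world 你好世界'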
def find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold=0.8):
    """
    Fuzzily search a PDF for the given text and return its coordinates.

    Args:
        pdf_path (str): Path to the PDF file.
        target_text (str): Text to search for.
        similarity_threshold (float): Similarity threshold (0-1), default 0.8.

    Returns:
        list: A list of coordinate records for the matched text.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")

    # Clean the target text
    cleaned_target = clean_text_for_fuzzy_match(target_text)

    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        found_positions = []

        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)

            # Join the page's characters into one string
            page_text = ''.join([char_info['char'] for char_info in char_list])
            cleaned_page_text = clean_text_for_fuzzy_match(page_text)

            # Slide a window over the page text looking for similar spans
            target_len = len(cleaned_target)
            if target_len == 0:
                continue

            # Collect every window that clears the threshold
            matches = []
            for i in range(len(cleaned_page_text) - target_len + 1):
                window_text = cleaned_page_text[i:i + target_len]
                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()

                if similarity >= similarity_threshold:
                    # Record the position and similarity. Note: indices into the
                    # cleaned text only approximate indices into char_list, since
                    # cleaning drops characters; the guards keep them in range.
                    if i < len(char_list):
                        matches.append({
                            'start_idx': i,
                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
                            'similarity': similarity
                        })

            # Merge adjacent match windows
            if matches:
                # Sort by start position
                matches.sort(key=lambda x: x['start_idx'])

                # Merge adjacent or overlapping windows
                merged_matches = []
                current_match = matches[0].copy()  # work on a copy

                for i in range(1, len(matches)):
                    next_match = matches[i]
                    # Merge when the next window is adjacent to or overlaps the current
                    # one: its start lies within the current end plus a small buffer.
                    if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
                        # Widen the index range
                        current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
                        # Length-weighted average of the similarities
                        total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
                                      (next_match['end_idx'] - next_match['start_idx'] + 1)
                        current_match['similarity'] = (
                            current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
                            next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
                        ) / total_length
                    else:
                        # Not adjacent: flush the current block and start a new one
                        merged_matches.append(current_match)
                        current_match = next_match.copy()  # work on a copy

                # Flush the last block
                merged_matches.append(current_match)

                # Produce coordinate info for each merged block
                for match in merged_matches:
                    start_idx = match['start_idx']
                    end_idx = match['end_idx']

                    if start_idx < len(char_list) and end_idx < len(char_list):
                        # All characters in the matched region
                        matched_chars = char_list[start_idx:end_idx+1]

                        # Drop characters with zero coordinates (usually special characters)
                        valid_chars = [char for char in matched_chars
                                     if char['x'] > 0 and char['y'] > 0]

                        # Fall back to all characters if none are valid
                        chars_to_use = valid_chars if valid_chars else matched_chars

                        # Compute the bounding box (left, right, top, bottom)
                        if chars_to_use:
                            left = min([char['x'] for char in chars_to_use])
                            right = max([char['x'] for char in chars_to_use])
                            bottom = min([char['y'] for char in chars_to_use])
                            top = max([char['y'] for char in chars_to_use])

                            # Matched text content
                            matched_text = ''.join([char_info['char'] for char_info in chars_to_use])

                            # Only keep results whose bounding box is valid
                            if left >= 0 and right > left and top > bottom:
                                position = [
                                    page_num,
                                    left,    # left
                                    right,   # right
                                    top,     # top
                                    bottom,  # bottom
                                    matched_text,  # matched content
                                    match['similarity']  # similarity score
                                ]
                                found_positions.append(position)

        return found_positions
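

# Minimal usage sketch (file name and query are hypothetical); each returned row
# has the shape [page, left, right, top, bottom, matched_text, similarity]:
#   rows = find_fuzzy_text_positions('sample.pdf', '要查找的文本', similarity_threshold=0.8)
#   for page, left, right, top, bottom, text, score in rows:
#       print(page, (left, right, top, bottom), round(score, 2))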


def find_text_positions(pdf_path, target_text):
    """
    Search the PDF for the given text and return its coordinates.
@@ -106,15 +488,25 @@ def find_text_positions(pdf_path, target_text):
                        if pos >= page_start:
                            page_num = i + 1

                    # Get the matched text content
                    matched_text = ''.join([char_info['char'] for char_info in all_chars[pos:pos+len(normalized_target)]])

                    # Compute the bounding box (left, right, top, bottom)
                    left = min(start_char['x'], end_char['x'])
                    right = max(start_char['x'], end_char['x'])
                    bottom = min(start_char['y'], end_char['y'])
                    top = max(start_char['y'], end_char['y'])

                    position = [
                        page_num,
                        left,    # left
                        right,   # right
                        top,     # top
                        bottom,  # bottom
                        matched_text,  # matched content
                        1.0  # similarity score (1.0 for an exact match)
                    ]
                    found_positions.append(position)

            start = pos + 1

@@ -169,47 +561,153 @@ def find_text_in_pdf_per_page(pdf_path, target_text):
                    if end_pos < len(char_list):
                        end_char = char_list[end_pos]

                        # Get the matched text content
                        matched_text = ''.join([char_info['char'] for char_info in char_list[pos:pos+len(normalized_target)]])

                        # Compute the bounding box (left, right, top, bottom)
                        left = min(start_char['x'], end_char['x'])
                        right = max(start_char['x'], end_char['x'])
                        bottom = min(start_char['y'], end_char['y'])
                        top = max(start_char['y'], end_char['y'])

                        position = [
                            page_num,
                            left,    # left
                            right,   # right
                            top,     # top
                            bottom,  # bottom
                            matched_text,  # matched content
                            1.0  # similarity score (1.0 for an exact match)
                        ]
                        found_positions.append(position)

        return found_positions
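

# Exact per-page search returns the same seven-field rows with the similarity
# pinned to 1.0 (illustrative call; the file name is hypothetical):
#   rows = find_text_in_pdf_per_page('sample.pdf', '目标文本')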


def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
    """
    Find partial matches of the text (intended for longer passages).

    Args:
        pdf_path (str): Path to the PDF file.
        target_text (str): Text to search for.
        min_match_ratio (float): Minimum ratio of matching keywords (0-1).

    Returns:
        list: A list of coordinate records for the matched text.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")

    # Split the target text into keywords or phrases
    normalized_target = normalize_text(target_text)
    # Keep words longer than two characters as keywords
    keywords = [word for word in normalized_target.split() if len(word) > 2]

    if not keywords:
        keywords = normalized_target.split()  # fall back to all words if none are long enough

    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        found_positions = []

        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)

            # Join the page's characters and normalize
            page_text = ''.join([char_info['char'] for char_info in char_list])
            normalized_page_text = normalize_text(page_text)

            # Count how many keywords appear on the page
            matched_keywords = 0
            for keyword in keywords:
                if keyword in normalized_page_text:
                    matched_keywords += 1

            # Treat the page as a match once the keyword ratio clears the threshold
            if len(keywords) > 0 and (matched_keywords / len(keywords)) >= min_match_ratio:
                # For simplicity, span from the page's first to its last character
                if char_list:
                    start_char = char_list[0]
                    end_char = char_list[-1]
                    match_ratio = matched_keywords / len(keywords)

                    # Use the page text as the matched content
                    matched_text = ''.join([char_info['char'] for char_info in char_list])

                    # Compute the bounding box (left, right, top, bottom)
                    left = min(start_char['x'], end_char['x'])
                    right = max(start_char['x'], end_char['x'])
                    bottom = min(start_char['y'], end_char['y'])
                    top = max(start_char['y'], end_char['y'])

                    position = [
                        page_num,
                        left,    # left
                        right,   # right
                        top,     # top
                        bottom,  # bottom
                        matched_text[:100] + "..." if len(matched_text) > 100 else matched_text,  # matched content (truncated)
                        match_ratio  # keyword match ratio
                    ]
                    found_positions.append(position)

        return found_positions
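

# Illustrative call (hypothetical file name): with min_match_ratio=0.7 a page is
# reported once at least 70% of the extracted keywords occur anywhere on it, so
# the returned box spans the whole page rather than an exact phrase:
#   pages = find_partial_text_positions('sample.pdf', 'some longer query text', 0.7)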


def smart_fuzzy_find_text(pdf_path, target_text, similarity_threshold=0.8):
    """
    Smart fuzzy text search combining several strategies.

    Args:
        pdf_path (str): Path to the PDF file.
        target_text (str): Text to search for.
        similarity_threshold (float): Similarity threshold.

    Returns:
        list: A list of coordinate records for the matched text.
    """
    # Strategy 1: exact match
    exact_results = find_text_in_pdf_per_page(pdf_path, target_text)
    if exact_results:
        return exact_results

    # Strategy 2: fuzzy match
    fuzzy_results = find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold)
    if fuzzy_results:
        return fuzzy_results

    # Strategy 3: partial match (keyword matching)
    partial_results = find_partial_text_positions(pdf_path, target_text, 0.5)
    return partial_results
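

# All three strategies emit rows of the same shape, so callers can unpack the
# result uniformly whichever method produced it (names here are illustrative):
#   for page, left, right, top, bottom, text, score in smart_fuzzy_find_text(path, query):
#       print(page, score)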


if __name__ == '__main__':
    # Use a local PDF file
    pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'  # change this to your PDF file path
    target_text = '''创建 `plan` 文件: 固化和锁定最终的"怎么做"
• 基于 `plan` 执行: 精准驱动 AI 完成任务'''

    try:
        print("Smart fuzzy search:")
        positions = smart_fuzzy_find_text(pdf_file_path, target_text, similarity_threshold=0.7)
        if positions:
            print("Found the text at the following positions:")
            for pos in positions:
                if len(pos) >= 7:  # row carries matched content and similarity
                    print(f"Page: {pos[0]}, bbox: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f}), similarity: {pos[6]:.2f}")
                    print(f"Matched content: {pos[5][:50]}{'...' if len(pos[5]) > 50 else ''}")
                    print("-" * 50)
                else:
                    print(f"Page: {pos[0]}, bbox: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
        else:
            print("Text not found")