新增PDF文本查找功能,支持全文和逐页搜索,返回文本坐标信息
This commit is contained in:
		
							
								
								
									
										219
									
								
								src/get_pos_pdf.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										219
									
								
								src/get_pos_pdf.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,219 @@ | ||||
| import requests | ||||
| import io | ||||
| import os | ||||
| from pdfminer.pdfdocument import PDFDocument | ||||
| from pdfminer.pdfpage import PDFPage | ||||
| from pdfminer.pdfparser import PDFParser | ||||
| from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | ||||
| from pdfminer.converter import PDFPageAggregator | ||||
| from pdfminer.layout import LAParams, LTText, LTChar, LTAnno | ||||
|  | ||||
def parse_char_layout(layout):
    """Flatten a page layout into a list of per-character records.

    Only top-level layout elements that are ``LTText`` instances are
    inspected; everything else on the page is ignored.

    Each record is a dict with three keys:

    * ``'x'``    -- ``bbox[0]``: distance from the left edge of the page to
      the left edge of the character box.
    * ``'y'``    -- ``bbox[3]``: distance from the bottom of the page to the
      top edge of the character box.
    * ``'char'`` -- the text returned by ``get_text()``.

    ``LTAnno`` items (line breaks / spaces) are recorded too; when such an
    item has no ``bbox`` attribute its coordinates fall back to ``0``.

    Args:
        layout: An iterable of layout elements for one page, as returned by
            ``PDFPageAggregator.get_result()``.

    Returns:
        list: The character records in layout iteration order.
    """
    char_list = []

    def collect(item):
        # Leaf: a real glyph with a bounding box.
        if isinstance(item, LTChar):
            char_list.append({
                'x': item.bbox[0],
                'y': item.bbox[3],
                'char': item.get_text(),
            })
        # Leaf: a virtual character (space / line break).
        elif isinstance(item, LTAnno):
            bbox = getattr(item, 'bbox', None)
            char_list.append({
                'x': bbox[0] if bbox is not None else 0,
                'y': bbox[3] if bbox is not None else 0,
                'char': item.get_text(),
            })
        # Container (text box or text line): descend into its children.
        # Walking by type instead of assuming a fixed box -> line -> char
        # depth avoids a TypeError when a bare text line or a bare character
        # sits directly at the top level of the layout.
        elif isinstance(item, LTText):
            for child in item:
                collect(child)

    for element in layout:
        if isinstance(element, LTText):
            collect(element)
    return char_list
|  | ||||
def normalize_text(text):
    """Return *text* with whitespace normalized.

    Every run of whitespace (spaces, tabs, line breaks, ...) is collapsed
    into a single space, and leading/trailing whitespace is removed.
    """
    # str.split() without a separator splits on whitespace runs and discards
    # the empty pieces at both ends, so re-joining with one space yields the
    # collapsed and stripped form.
    return ' '.join(text.split())
|  | ||||
def _iter_page_chars(pdf_path):
    """Yield ``(page_number, char_list)`` for each page of the PDF, 1-based."""
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            yield page_num, parse_char_layout(device.get_result())


def _normalize_with_index(char_list):
    """Build the whitespace-normalized text of *char_list* plus an index map.

    Returns ``(text, index_map)`` where ``index_map[i]`` is the index in
    *char_list* of the record that produced ``text[i]``. Whitespace runs are
    collapsed to one space and leading/trailing whitespace is dropped, the
    same normalization ``normalize_text`` applies to the joined text.
    """
    pieces = []
    index_map = []
    pending_space = None  # char_list index of the first char of a pending whitespace run
    for idx, info in enumerate(char_list):
        # A single record may hold more than one character of text, so walk
        # the string rather than assuming one character per record.
        for ch in info['char']:
            if ch.isspace():
                # Leading whitespace (nothing emitted yet) is dropped.
                if pieces and pending_space is None:
                    pending_space = idx
                continue
            if pending_space is not None:
                pieces.append(' ')
                index_map.append(pending_space)
                pending_space = None
            pieces.append(ch)
            index_map.append(idx)
    # A whitespace run still pending here is trailing and is dropped.
    return ''.join(pieces), index_map


def _find_char_spans(char_list, normalized_target):
    """Return ``(start_index, end_index)`` pairs into *char_list* for every match."""
    if not normalized_target:
        # An empty needle would "match" at every offset; report nothing.
        return []
    text, index_map = _normalize_with_index(char_list)
    last_offset = len(normalized_target) - 1
    spans = []
    pos = text.find(normalized_target)
    while pos != -1:
        spans.append((index_map[pos], index_map[pos + last_offset]))
        # Advance by one so overlapping occurrences are reported as well.
        pos = text.find(normalized_target, pos + 1)
    return spans


def _position_info(page_num, text, start_char, end_char):
    """Build the result dict describing one match."""
    return {
        'page': page_num,
        'text': text,
        'start_x': start_char['x'],
        'start_y': start_char['y'],
        'end_x': end_char['x'],
        'end_y': end_char['y'],
    }


def find_text_positions(pdf_path, target_text):
    """Search the whole PDF for a text and return the coordinates of each match.

    The characters of all pages are concatenated before searching, so a match
    may span a page boundary. Both the PDF text and *target_text* are
    whitespace-normalized (runs collapsed to one space, ends stripped) before
    comparison. Match offsets are translated back to the original character
    records, so the reported coordinates stay correct even when whitespace
    was collapsed ahead of the match.

    Args:
        pdf_path (str): Path of the PDF file.
        target_text (str): Text to look for.

    Returns:
        list: One dict per match with the keys ``page`` (1-based page of the
        first matched character), ``text`` (the normalized target),
        ``start_x``/``start_y`` (``x``/``y`` of the first matched character
        record) and ``end_x``/``end_y`` (``x``/``y`` of the last matched
        character record). Empty when nothing matches or when the target is
        empty after normalization.

    Raises:
        FileNotFoundError: If *pdf_path* does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    normalized_target = normalize_text(target_text)

    all_chars = []   # character records of every page, in order
    char_pages = []  # page number of each entry in all_chars
    for page_num, char_list in _iter_page_chars(pdf_path):
        all_chars.extend(char_list)
        char_pages.extend([page_num] * len(char_list))

    return [
        _position_info(char_pages[start], normalized_target,
                       all_chars[start], all_chars[end])
        for start, end in _find_char_spans(all_chars, normalized_target)
    ]


def find_text_in_pdf_per_page(pdf_path, target_text):
    """Search the PDF page by page and return the coordinates of each match.

    Each page is searched on its own, so matches never span a page boundary.
    Every occurrence on a page is reported. Both the page text and
    *target_text* are whitespace-normalized before comparison, and match
    offsets are translated back to the original character records.

    Args:
        pdf_path (str): Path of the PDF file.
        target_text (str): Text to look for.

    Returns:
        list: One dict per match with the keys ``page``, ``text``,
        ``start_x``, ``start_y``, ``end_x`` and ``end_y``, in page order.
        Empty when nothing matches or when the target is empty after
        normalization.

    Raises:
        FileNotFoundError: If *pdf_path* does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    normalized_target = normalize_text(target_text)

    found_positions = []
    for page_num, char_list in _iter_page_chars(pdf_path):
        for start, end in _find_char_spans(char_list, normalized_target):
            found_positions.append(
                _position_info(page_num, normalized_target,
                               char_list[start], char_list[end])
            )
    return found_positions
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
if __name__ == '__main__':
    # Demo run against a local PDF file; change the path to your own PDF.
    pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'
    target_text = '''执行方式:
• 在当前 chat 中,已有上下文,但可能混乱
• 新开一个 chat,干净的上下文,需要填充'''

    # Heading to print and search function to run, in execution order.
    searches = (
        ("方法1:全文搜索", find_text_positions),
        ("\n方法2:逐页搜索", find_text_in_pdf_per_page),
    )

    try:
        for heading, search in searches:
            print(heading)
            positions = search(pdf_file_path, target_text)
            if not positions:
                print("未找到文本")
                continue
            print("找到文本在以下位置:")
            for pos in positions:
                print(f"页面: {pos['page']}, 起始坐标: ({pos['start_x']:.2f}, {pos['start_y']:.2f}), 结束坐标: ({pos['end_x']:.2f}, {pos['end_y']:.2f})")
    except FileNotFoundError as e:
        print(e)
    except Exception as e:
        # Top-level boundary of the demo script: report the error and exit.
        print(f"处理PDF时出错: {e}")
		Reference in New Issue
	
	Block a user