新增PDF文本查找功能，支持全文和逐页搜索，返回文本坐标信息

2025-07-30 08:51:30 +08:00
parent 466fae53c9
commit 44ef61daab
1 changed files with 219 additions and 0 deletions
--- a/src/get_pos_pdf.py
+++ b/src/get_pos_pdf.py
@@ -0,0 +1,219 @@
+import requests
+import io
+import os
+from pdfminer.pdfdocument import PDFDocument
+from pdfminer.pdfpage import PDFPage
+from pdfminer.pdfparser import PDFParser
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.converter import PDFPageAggregator
+from pdfminer.layout import LAParams, LTText, LTChar, LTAnno
+
+def parse_char_layout(layout):
+    """Collect every character of a page layout together with its position.
+
+    Only text objects (``LTText``) found in ``layout`` are visited. They are
+    walked recursively, so characters are collected whether they sit in a
+    text box -> text line -> character hierarchy or in a text line that
+    appears directly among the page objects. The previous fixed two-level
+    loop tried to iterate over a single character in that second case and
+    raised ``TypeError``.
+
+    pdfminer bounding boxes are ``(x0, y0, x1, y1)``:
+        x0: distance from the left of the page to the left edge of the box.
+        y0: distance from the bottom of the page to the lower edge of the box.
+        x1: distance from the left of the page to the right edge of the box.
+        y1: distance from the bottom of the page to the upper edge of the box.
+
+    Args:
+        layout: Iterable of pdfminer layout objects, e.g. the page returned
+            by ``PDFPageAggregator.get_result()``.
+
+    Returns:
+        list: One dict per ``LTChar``/``LTAnno`` in traversal order with the
+        keys ``'x'`` (``bbox[0]``), ``'y'`` (``bbox[3]``) and ``'char'``
+        (``get_text()``). Objects without a ``bbox`` attribute get ``0``
+        for both coordinates.
+    """
+    char_list = []
+
+    def collect(item):
+        # Leaves: real glyphs (LTChar) and the virtual spaces / line breaks
+        # that layout analysis inserts (LTAnno).
+        if isinstance(item, (LTChar, LTAnno)):
+            bbox = getattr(item, 'bbox', None)
+            char_list.append({
+                'x': bbox[0] if bbox is not None else 0,
+                'y': bbox[3] if bbox is not None else 0,
+                'char': item.get_text()
+            })
+        # Containers: descend only into iterable text objects so that
+        # non-text children are skipped instead of crashing the walk.
+        elif isinstance(item, LTText) and hasattr(item, '__iter__'):
+            for child in item:
+                collect(child)
+
+    for element in layout:
+        # Same top-level filter as before: non-text objects are ignored.
+        if isinstance(element, LTText):
+            collect(element)
+    return char_list
+
+def normalize_text(text):
+    """Collapse all whitespace in ``text`` to single spaces.
+
+    Leading and trailing whitespace is removed first; every remaining run of
+    whitespace characters (spaces, tabs, line breaks, ...) is then replaced
+    by exactly one space.
+    """
+    import re
+
+    trimmed = text.strip()
+    whitespace_run = re.compile(r'\s+')
+    return whitespace_run.sub(' ', trimmed)
+
+def _iter_page_chars(pdf_path):
+    """Yield ``(page_number, char_list)`` for each page of the PDF.
+
+    Page numbers start at 1 and ``char_list`` is what ``parse_char_layout``
+    returns for that page. The file stays open while the generator is
+    being consumed.
+    """
+    with open(pdf_path, 'rb') as fp:
+        parser = PDFParser(fp)
+        doc = PDFDocument(parser)
+
+        rsrcmgr = PDFResourceManager()
+        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
+            interpreter.process_page(page)
+            yield page_num, parse_char_layout(device.get_result())
+
+
+def _normalize_with_index(char_list):
+    """Normalize the text of ``char_list`` and remember where it came from.
+
+    Produces the same text as ``normalize_text`` applied to the joined
+    characters (outer whitespace dropped, inner whitespace runs collapsed to
+    one space) plus a list that maps every offset of that text to the index
+    of the ``char_list`` entry it originates from.
+    """
+    pieces = []
+    index_map = []
+    pending_space = None  # entry index of the first char of an open whitespace run
+    for idx, info in enumerate(char_list):
+        # One entry may carry more than one character of text.
+        for ch in info['char']:
+            if ch.isspace():
+                # Leading whitespace (nothing emitted yet) is dropped.
+                if pieces and pending_space is None:
+                    pending_space = idx
+                continue
+            # Emit the collapsed space only once real text follows, so
+            # trailing whitespace never reaches the output.
+            if pending_space is not None:
+                pieces.append(' ')
+                index_map.append(pending_space)
+                pending_space = None
+            pieces.append(ch)
+            index_map.append(idx)
+    return ''.join(pieces), index_map
+
+
+def _match_spans(char_list, normalized_target):
+    """Return ``(first, last)`` ``char_list`` indices of every occurrence.
+
+    Occurrences may overlap. An empty target yields no matches.
+    """
+    if not normalized_target:
+        return []
+    text, index_map = _normalize_with_index(char_list)
+    spans = []
+    start = 0
+    while True:
+        pos = text.find(normalized_target, start)
+        if pos == -1:
+            break
+        last = pos + len(normalized_target) - 1
+        # Translate offsets in the normalized text back to character entries;
+        # the two sequences are not aligned once whitespace was collapsed.
+        spans.append((index_map[pos], index_map[last]))
+        start = pos + 1
+    return spans
+
+
+def _position_info(page_num, normalized_target, start_char, end_char):
+    """Build the result dict for one match."""
+    return {
+        'page': page_num,
+        'text': normalized_target,
+        'start_x': start_char['x'],
+        'start_y': start_char['y'],
+        'end_x': end_char['x'],
+        'end_y': end_char['y']
+    }
+
+
+def find_text_positions(pdf_path, target_text):
+    """Find a text anywhere in a PDF and return the coordinates of each hit.
+
+    The characters of all pages are concatenated before searching, so a
+    match may span a page break; it is reported on the page of its first
+    character. Whitespace differences between the target and the PDF text
+    (line breaks, repeated spaces) are ignored.
+
+    Args:
+        pdf_path (str): Path of the PDF file.
+        target_text (str): Text to look for.
+
+    Returns:
+        list: One dict per occurrence with the keys ``'page'`` (1-based),
+        ``'text'`` (the normalized target), ``'start_x'``/``'start_y'`` and
+        ``'end_x'``/``'end_y'``. The coordinates are the ``'x'``/``'y'``
+        values of the first and last matched character as produced by
+        ``parse_char_layout``. Empty when the target is empty or only
+        whitespace, or when nothing matches.
+
+    Raises:
+        FileNotFoundError: If ``pdf_path`` does not exist.
+    """
+    if not os.path.exists(pdf_path):
+        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
+
+    normalized_target = normalize_text(target_text)
+    if not normalized_target:
+        return []
+
+    all_chars = []   # characters of every page, in order
+    char_pages = []  # page number of each entry in all_chars
+    for page_num, char_list in _iter_page_chars(pdf_path):
+        all_chars.extend(char_list)
+        char_pages.extend([page_num] * len(char_list))
+
+    return [
+        _position_info(char_pages[first], normalized_target,
+                       all_chars[first], all_chars[last])
+        for first, last in _match_spans(all_chars, normalized_target)
+    ]
+
+
+def find_text_in_pdf_per_page(pdf_path, target_text):
+    """Find a text page by page in a PDF and return the coordinates.
+
+    Each page is searched on its own, so matches never span a page break.
+    Every occurrence on a page is reported. Whitespace differences between
+    the target and the PDF text are ignored.
+
+    Args:
+        pdf_path (str): Path of the PDF file.
+        target_text (str): Text to look for.
+
+    Returns:
+        list: One dict per occurrence with the keys ``'page'`` (1-based),
+        ``'text'`` (the normalized target), ``'start_x'``/``'start_y'`` and
+        ``'end_x'``/``'end_y'``. The coordinates are the ``'x'``/``'y'``
+        values of the first and last matched character as produced by
+        ``parse_char_layout``. Empty when the target is empty or only
+        whitespace, or when nothing matches.
+
+    Raises:
+        FileNotFoundError: If ``pdf_path`` does not exist.
+    """
+    if not os.path.exists(pdf_path):
+        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
+
+    normalized_target = normalize_text(target_text)
+    if not normalized_target:
+        return []
+
+    found_positions = []
+    for page_num, char_list in _iter_page_chars(pdf_path):
+        for first, last in _match_spans(char_list, normalized_target):
+            found_positions.append(
+                _position_info(page_num, normalized_target,
+                               char_list[first], char_list[last])
+            )
+    return found_positions
+
+
+
+
+
+
+
+if __name__ == '__main__':
+    # Demo: search one local PDF with both strategies and print every hit.
+    pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'  # change this to your PDF path
+    target_text = '''执行方式：
+• 在当前 chat 中，已有上下文，但可能混乱
+• 新开一个 chat，干净的上下文，需要填充'''
+
+    try:
+        # Each entry: heading printed first, then the search function to run.
+        for heading, finder in (
+            ("方法1：全文搜索", find_text_positions),
+            ("\n方法2：逐页搜索", find_text_in_pdf_per_page),
+        ):
+            print(heading)
+            positions = finder(pdf_file_path, target_text)
+            if not positions:
+                print("未找到文本")
+                continue
+            print("找到文本在以下位置:")
+            for hit in positions:
+                print(f"页面: {hit['page']}, 起始坐标: ({hit['start_x']:.2f}, {hit['start_y']:.2f}), 结束坐标: ({hit['end_x']:.2f}, {hit['end_y']:.2f})")
+
+    except FileNotFoundError as e:
+        print(e)
+    except Exception as e:
+        print(f"处理PDF时出错: {e}")