Implement smart fuzzy text search with exact, fuzzy, and partial matching; refine the text-coordinate return logic

2025-07-30 12:48:11 +08:00
parent 44ef61daab
commit 73557a272d


@@ -1,6 +1,8 @@
import requests
import io
import os
import re
from difflib import SequenceMatcher
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
@@ -44,6 +46,386 @@ def normalize_text(text):
    normalized = re.sub(r'\s+', ' ', text.strip())
    return normalized
def clean_text_for_fuzzy_match(text):
    """Clean text for fuzzy matching: drop special characters, keeping only letters, digits, CJK characters, and spaces."""
    # Remove punctuation and special characters
    cleaned = re.sub(r'[^\w\s\u4e00-\u9fff]', '', text)
    # Normalize whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned.strip())
    return cleaned
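
# Quick sanity check of the cleaning behavior (hypothetical inputs):
# punctuation is stripped, whitespace is collapsed, CJK characters survive.
#   clean_text_for_fuzzy_match("Hello,  world!")       -> "Hello world"
#   clean_text_for_fuzzy_match("执行方式: chat 上下文")   -> "执行方式 chat 上下文"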
def find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold=0.8):
    """
    Fuzzily search a PDF for the given text and return its coordinates.

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to search for
        similarity_threshold (float): similarity threshold (0-1), default 0.8

    Returns:
        list: coordinate entries for each match
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    # Clean the target text
    cleaned_target = clean_text_for_fuzzy_match(target_text)

    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        found_positions = []

        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)

            # Join the page's characters into a single string
            page_text = ''.join([char_info['char'] for char_info in char_list])
            cleaned_page_text = clean_text_for_fuzzy_match(page_text)

            # Slide a window over the page text looking for similar spans
            target_len = len(cleaned_target)
            if target_len == 0:
                continue

            # Collect every window that clears the threshold
            matches = []
            for i in range(len(cleaned_page_text) - target_len + 1):
                window_text = cleaned_page_text[i:i + target_len]
                similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
                if similarity >= similarity_threshold:
                    # Record the position and similarity of the match.
                    # NOTE: i indexes the cleaned text; reusing it against
                    # char_list is approximate once cleaning removed characters.
                    if i < len(char_list):
                        matches.append({
                            'start_idx': i,
                            'end_idx': min(i + target_len - 1, len(char_list) - 1),
                            'similarity': similarity
                        })

            # Merge adjacent match windows
            if matches:
                # Sort by start position
                matches.sort(key=lambda x: x['start_idx'])

                # Merge adjacent or overlapping windows
                merged_matches = []
                current_match = matches[0].copy()  # work on a copy

                for i in range(1, len(matches)):
                    next_match = matches[i]
                    # Merge when the next window is adjacent to or overlaps the
                    # current one: its start <= current end + a small buffer
                    if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
                        # Merge the index ranges
                        current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
                        current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
                        # Length-weighted average similarity
                        total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
                                       (next_match['end_idx'] - next_match['start_idx'] + 1)
                        current_match['similarity'] = (
                            current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
                            next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
                        ) / total_length
                    else:
                        # Not adjacent: store the current window, start a new one
                        merged_matches.append(current_match)
                        current_match = next_match.copy()  # work on a copy

                # Store the last window
                merged_matches.append(current_match)

                # Produce coordinate info for each merged window
                for match in merged_matches:
                    start_idx = match['start_idx']
                    end_idx = match['end_idx']
                    if start_idx < len(char_list) and end_idx < len(char_list):
                        # Characters covered by the match
                        matched_chars = char_list[start_idx:end_idx + 1]

                        # Drop characters whose coordinates are 0 (usually specials)
                        valid_chars = [char for char in matched_chars
                                       if char['x'] > 0 and char['y'] > 0]
                        # Fall back to all characters if none are valid
                        chars_to_use = valid_chars if valid_chars else matched_chars

                        # Compute the bounding box (left, right, top, bottom)
                        if chars_to_use:
                            left = min([char['x'] for char in chars_to_use])
                            right = max([char['x'] for char in chars_to_use])
                            bottom = min([char['y'] for char in chars_to_use])
                            top = max([char['y'] for char in chars_to_use])

                            # The matched text content
                            matched_text = ''.join([char_info['char'] for char_info in chars_to_use])

                            # Only keep results with a valid bounding box
                            if left >= 0 and right > left and top > bottom:
                                position = [
                                    page_num,
                                    left,    # left
                                    right,   # right
                                    top,     # top
                                    bottom,  # bottom
                                    matched_text,        # matched content
                                    match['similarity']  # similarity score
                                ]
                                found_positions.append(position)

    return found_positions
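
# Usage sketch (assumed sample path and query text, not part of the commit):
def _demo_fuzzy_find(pdf_path='sample.pdf'):
    # Each hit is [page, left, right, top, bottom, matched_text, similarity].
    for page, left, right, top, bottom, text, score in find_fuzzy_text_positions(pdf_path, '要查找的文本', similarity_threshold=0.8):
        print(f"page {page}: bbox=({left:.1f}, {bottom:.1f})-({right:.1f}, {top:.1f}) score={score:.2f}")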
"""
在PDF中模糊查找指定文本并返回坐标
Args:
pdf_path (str): PDF文件路径
target_text (str): 要查找的文本
similarity_threshold (float): 相似度阈值 (0-1)默认0.8
Returns:
list: 包含匹配文本坐标信息的列表
"""
if not os.path.exists(pdf_path):
raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
# 清理目标文本
cleaned_target = clean_text_for_fuzzy_match(target_text)
# 打开本地PDF文件
with open(pdf_path, 'rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
found_positions = []
# 处理每一页
for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
interpreter.process_page(page)
layout = device.get_result()
char_list = parse_char_layout(layout)
# 将页面字符组合成文本
page_text = ''.join([char_info['char'] for char_info in char_list])
cleaned_page_text = clean_text_for_fuzzy_match(page_text)
# 滑动窗口查找相似文本
target_len = len(cleaned_target)
if target_len == 0:
continue
# 存储所有匹配的块
matches = []
for i in range(len(cleaned_page_text) - target_len + 1):
window_text = cleaned_page_text[i:i + target_len]
similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
if similarity >= similarity_threshold:
# 找到匹配项,记录位置和相似度
if i < len(char_list):
matches.append({
'start_idx': i,
'end_idx': min(i + target_len - 1, len(char_list) - 1),
'similarity': similarity
})
# 合并相邻的匹配块
if matches:
# 按起始位置排序
matches.sort(key=lambda x: x['start_idx'])
# 合并相邻或重叠的匹配块
merged_matches = []
current_match = matches[0].copy() # 创建副本
for i in range(1, len(matches)):
next_match = matches[i]
# 如果下一个匹配块与当前块相邻或重叠,则合并
# 判断条件:下一个块的起始位置 <= 当前块的结束位置 + 一些缓冲距离
if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
# 合并索引范围
current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
# 计算加权平均相似度
total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
(next_match['end_idx'] - next_match['start_idx'] + 1)
current_match['similarity'] = (
current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
) / total_length
else:
# 不相邻,保存当前块,开始新的块
merged_matches.append(current_match)
current_match = next_match.copy() # 创建副本
# 添加最后一个块
merged_matches.append(current_match)
# 为每个合并后的块生成坐标信息
for match in merged_matches:
start_idx = match['start_idx']
end_idx = match['end_idx']
if start_idx < len(char_list) and end_idx < len(char_list):
# 获取匹配区域的所有字符
matched_chars = char_list[start_idx:end_idx+1]
# 计算边界框 (left, right, top, bottom)
if matched_chars:
# 计算边界值
left = min([char['x'] for char in matched_chars])
right = max([char['x'] for char in matched_chars])
bottom = min([char['y'] for char in matched_chars])
top = max([char['y'] for char in matched_chars])
# 获取匹配的文本内容
matched_text = ''.join([char_info['char'] for char_info in matched_chars])
position = [
page_num,
left, # left
right, # right
top, # top
bottom, # bottom
matched_text, # 添加匹配的内容
match['similarity'] # 添加相似度信息
]
found_positions.append(position)
return found_positions
"""
在PDF中模糊查找指定文本并返回坐标
Args:
pdf_path (str): PDF文件路径
target_text (str): 要查找的文本
similarity_threshold (float): 相似度阈值 (0-1)默认0.8
Returns:
list: 包含匹配文本坐标信息的列表
"""
if not os.path.exists(pdf_path):
raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")
# 清理目标文本
cleaned_target = clean_text_for_fuzzy_match(target_text)
# 打开本地PDF文件
with open(pdf_path, 'rb') as fp:
parser = PDFParser(fp)
doc = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
found_positions = []
# 处理每一页
for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
interpreter.process_page(page)
layout = device.get_result()
char_list = parse_char_layout(layout)
# 将页面字符组合成文本
page_text = ''.join([char_info['char'] for char_info in char_list])
cleaned_page_text = clean_text_for_fuzzy_match(page_text)
# 滑动窗口查找相似文本
target_len = len(cleaned_target)
if target_len == 0:
continue
# 存储所有匹配的块
matches = []
for i in range(len(cleaned_page_text) - target_len + 1):
window_text = cleaned_page_text[i:i + target_len]
similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
if similarity >= similarity_threshold:
# 找到匹配项,记录位置和相似度
if i < len(char_list):
matches.append({
'start_idx': i,
'end_idx': min(i + target_len - 1, len(char_list) - 1),
'similarity': similarity
})
# 合并相邻的匹配块
if matches:
# 按起始位置排序
matches.sort(key=lambda x: x['start_idx'])
# 合并相邻或重叠的匹配块
merged_matches = []
current_match = matches[0]
for i in range(1, len(matches)):
next_match = matches[i]
# 如果下一个匹配块与当前块相邻或重叠,则合并
if next_match['start_idx'] <= current_match['end_idx'] + target_len:
# 合并索引范围
current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
# 平均相似度
current_match['similarity'] = (current_match['similarity'] + next_match['similarity']) / 2
else:
# 不相邻,保存当前块,开始新的块
merged_matches.append(current_match)
current_match = next_match
# 添加最后一个块
merged_matches.append(current_match)
# 为每个合并后的块生成坐标信息
for match in merged_matches:
start_idx = match['start_idx']
end_idx = match['end_idx']
if start_idx < len(char_list) and end_idx < len(char_list):
# 获取匹配区域的所有字符
matched_chars = char_list[start_idx:end_idx+1]
# 计算边界框 (left, right, top, bottom)
if matched_chars:
# 计算边界值
left = min([char['x'] for char in matched_chars])
right = max([char['x'] for char in matched_chars])
bottom = min([char['y'] for char in matched_chars])
top = max([char['y'] for char in matched_chars])
# 获取匹配的文本内容
matched_text = ''.join([char_info['char'] for char_info in matched_chars])
position = [
page_num,
left, # left
right, # right
top, # top
bottom, # bottom
matched_text, # 添加匹配的内容
match['similarity'] # 添加相似度信息
]
found_positions.append(position)
return found_positions
def find_text_positions(pdf_path, target_text):
    """
    Find the given text in a PDF and return its coordinates.
@@ -106,15 +488,25 @@ def find_text_positions(pdf_path, target_text):
if pos >= page_start: if pos >= page_start:
page_num = i + 1 page_num = i + 1
position_info = { # 获取匹配的文本内容
'page': page_num, matched_text = ''.join([char_info['char'] for char_info in all_chars[pos:pos+len(normalized_target)]])
'text': normalized_target,
'start_x': start_char['x'], # 计算边界框 (left, right, top, bottom)
'start_y': start_char['y'], left = min(start_char['x'], end_char['x'])
'end_x': end_char['x'], right = max(start_char['x'], end_char['x'])
'end_y': end_char['y'] bottom = min(start_char['y'], end_char['y'])
} top = max(start_char['y'], end_char['y'])
found_positions.append(position_info)
position=[
page_num,
left, # left
right, # right
top, # top
bottom, # bottom
matched_text, # 添加匹配的内容
1.0 # 添加相似度信息精确匹配为1.0
]
found_positions.append(position)
start = pos + 1 start = pos + 1
@@ -169,47 +561,153 @@ def find_text_in_pdf_per_page(pdf_path, target_text):
if end_pos < len(char_list): if end_pos < len(char_list):
end_char = char_list[end_pos] end_char = char_list[end_pos]
position_info = { # 获取匹配的文本内容
'page': page_num, matched_text = ''.join([char_info['char'] for char_info in char_list[pos:pos+len(normalized_target)]])
'text': normalized_target,
'start_x': start_char['x'], # 计算边界框 (left, right, top, bottom)
'start_y': start_char['y'], left = min(start_char['x'], end_char['x'])
'end_x': end_char['x'], right = max(start_char['x'], end_char['x'])
'end_y': end_char['y'] bottom = min(start_char['y'], end_char['y'])
} top = max(start_char['y'], end_char['y'])
found_positions.append(position_info)
position=[
page_num,
left, # left
right, # right
top, # top
bottom, # bottom
matched_text, # 添加匹配的内容
1.0 # 添加相似度信息精确匹配为1.0
]
found_positions.append(position)
return found_positions return found_positions
def find_partial_text_positions(pdf_path, target_text, min_match_ratio=0.7):
    """
    Find partial matches of the text (suited to longer passages).

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to search for
        min_match_ratio (float): minimum keyword match ratio (0-1)

    Returns:
        list: coordinate entries for each match
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    # Split the target text into keywords or phrases
    normalized_target = normalize_text(target_text)
    # Keep the longer words as keywords (a rough stop-word filter)
    keywords = [word for word in normalized_target.split() if len(word) > 2]
    if not keywords:
        keywords = normalized_target.split()  # no long words: use all words

    # Open the local PDF file
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        found_positions = []

        # Process each page
        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            char_list = parse_char_layout(layout)

            # Join the page's characters into a string and normalize it
            page_text = ''.join([char_info['char'] for char_info in char_list])
            normalized_page_text = normalize_text(page_text)

            # Count how many keywords appear on the page
            matched_keywords = 0
            for keyword in keywords:
                if keyword in normalized_page_text:
                    matched_keywords += 1

            # Treat the page as a match if enough keywords appear
            if len(keywords) > 0 and (matched_keywords / len(keywords)) >= min_match_ratio:
                # For simplicity, return the coordinates of the page's
                # first and last characters
                if char_list:
                    start_char = char_list[0]
                    end_char = char_list[-1]
                    match_ratio = matched_keywords / len(keywords)

                    # Use the page text as the matched content
                    matched_text = ''.join([char_info['char'] for char_info in char_list])

                    # Compute the bounding box (left, right, top, bottom)
                    left = min(start_char['x'], end_char['x'])
                    right = max(start_char['x'], end_char['x'])
                    bottom = min(start_char['y'], end_char['y'])
                    top = max(start_char['y'], end_char['y'])

                    position = [
                        page_num,
                        left,    # left
                        right,   # right
                        top,     # top
                        bottom,  # bottom
                        (matched_text[:100] + "...") if len(matched_text) > 100 else matched_text,  # matched content (truncated)
                        match_ratio  # keyword match ratio
                    ]
                    found_positions.append(position)

    return found_positions
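
# Worked example of the keyword-ratio test (hypothetical values): a target
# "machine learning model evaluation" yields keywords ['machine', 'learning',
# 'model', 'evaluation']; a page containing 3 of the 4 gives a ratio of
# 3/4 = 0.75, which passes the default min_match_ratio of 0.7.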
def smart_fuzzy_find_text(pdf_path, target_text, similarity_threshold=0.8):
    """
    Smart fuzzy text search combining several strategies.

    Args:
        pdf_path (str): path to the PDF file
        target_text (str): text to search for
        similarity_threshold (float): similarity threshold

    Returns:
        list: coordinate entries for each match
    """
    # Strategy 1: exact match
    exact_results = find_text_in_pdf_per_page(pdf_path, target_text)
    if exact_results:
        return exact_results

    # Strategy 2: fuzzy match
    fuzzy_results = find_fuzzy_text_positions(pdf_path, target_text, similarity_threshold)
    if fuzzy_results:
        return fuzzy_results

    # Strategy 3: partial (keyword) match
    partial_results = find_partial_text_positions(pdf_path, target_text, 0.5)
    return partial_results
if __name__ == '__main__':
    # Use a local PDF file
    pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'  # change this to your PDF file path
    target_text = '''创建 `plan` 文件: 固化和锁定最终的"怎么做"
基于 `plan` 执行: 精准驱动 AI 完成任务'''

    try:
        print("Smart fuzzy search:")
        positions = smart_fuzzy_find_text(pdf_file_path, target_text, similarity_threshold=0.7)
        if positions:
            print("Found text at the following positions:")
            for pos in positions:
                if len(pos) >= 7:  # includes matched content and similarity info
                    print(f"Page: {pos[0]}, BBox: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f}), Similarity: {pos[6]:.2f}")
                    print(f"Matched content: {pos[5][:50]}{'...' if len(pos[5]) > 50 else ''}")
                    print("-" * 50)
                else:
                    print(f"Page: {pos[0]}, BBox: Left({pos[1]:.2f}), Right({pos[2]:.2f}), Top({pos[3]:.2f}), Bottom({pos[4]:.2f})")
        else:
            print("Text not found")