Optimize the PDF text lookup: add a detailed lookup method, support matching by block and by line, and return the positions of matched text
@@ -10,6 +10,8 @@ from elasticsearch import Elasticsearch
 from minio import Minio
 from minio.error import S3Error
 
+from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch
+
 
 from dotenv import load_dotenv  # new
 # Load environment variables from the .env file
@@ -45,14 +47,15 @@ MINIO_CONFIG = {
     "secure": False
 }
 
-def update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, new_img_id):
+def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position, new_img_id):
     """
-    Update the img_id of the specified document chunk in Elasticsearch.
+    Update the position and img_id of the specified document chunk in Elasticsearch.
 
     :param tenant_id: tenant ID
     :param doc_id: document ID
     :param chunk_id: chunk ID
     :param new_img_id: new img_id
+    :param position: position info
     :return: update result
     """
     try:
@@ -81,12 +84,21 @@ def update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, new_img_id):
         hit = result['hits']['hits'][0]
         doc_id_in_es = hit['_id']
 
-        # Build the update request
-        update_body = {
-            "doc": {
-                "img_id": new_img_id
-            }
-        }
+        # Build the update request - only update fields that are present
+        update_body = {"doc": {}}
+
+        # Only update img_id when new_img_id is provided
+        if new_img_id is not None:
+            update_body["doc"]["img_id"] = new_img_id
+
+        # Only update positions when position is provided
+        if position is not None:
+            update_body["doc"]["positions"] = position
+
+        # If there is nothing to update, return success immediately
+        if not update_body["doc"]:
+            print("No fields to update")
+            return {"code": 0, "message": "No fields to update"}
 
         # Update the document
         update_result = es.update(
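Note on the hunk above: the handler now performs a partial update, sending only the fields that are actually present. A standalone sketch of that pattern, assuming an elasticsearch-py client with the older `body=` calling style used by this script; the endpoint, index name, document id, and sample values are all hypothetical:

```python
# Sketch of the partial-update pattern above; endpoint/index/id are made up.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed endpoint

new_img_id = "dataset-1-chunk-1"             # hypothetical values
position = [[1, 100, 200, 50, 80]]

update_body = {"doc": {}}
if new_img_id is not None:
    update_body["doc"]["img_id"] = new_img_id
if position is not None:
    update_body["doc"]["positions"] = position

if update_body["doc"]:
    # Partial update: only the fields listed under "doc" are touched.
    es.update(index="ragflow_example_index", id="es-doc-id", body=update_body)
```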
@@ -100,19 +112,37 @@ def update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, new_img_id):
 
         # Verify the update
         verify_doc = es.get(index=index_name, id=doc_id_in_es)
-        if verify_doc['_source'].get('img_id') == new_img_id:
-            print(f"Successfully updated img_id to: {new_img_id}")
+
+        # Check whether img_id was updated (if new_img_id was provided)
+        img_id_updated = True
+        if new_img_id is not None:
+            img_id_updated = verify_doc['_source'].get('img_id') == new_img_id
+            if img_id_updated:
+                print(f"Successfully updated img_id to: {new_img_id}")
+            else:
+                print(f"Update verification failed, current img_id: {verify_doc['_source'].get('img_id')}")
+
+        # Check whether position was updated (if position was provided)
+        position_updated = True
+        if position is not None:
+            position_updated = verify_doc['_source'].get('positions') == position
+            if position_updated:
+                print(f"Successfully updated position to: {position}")
+            else:
+                print(f"Update verification failed, current position: {verify_doc['_source'].get('positions')}")
+
+        # Unified return value
+        if img_id_updated and position_updated:
             return {"code": 0, "message": ""}
         else:
-            print(f"Update verification failed, current img_id: {verify_doc['_source'].get('img_id')}")
-            return {"code": 100, "message": "Failed to verify img_id update"}
+            return {"code": 100, "message": "Failed to verify update"}
 
 
     except Exception as e:
         print(f"Error while updating Elasticsearch: {str(e)}")
         return {"code": 101, "message": f"Error updating img_id: {str(e)}"}
 
 
 
 def get_minio_client():
     """Create a MinIO client"""
     return Minio(
@@ -295,7 +325,10 @@ def process_txt_chunks(dataset_id, document, txt_path):
     try:
         with open(txt_path, 'r', encoding='utf-8') as file:
             file_content = file.read()
-        img_chunk_ids = []
+
+        # Use a list of dicts instead of three separate lists
+        chunks_info = []
+
         for num, txt_chunk in enumerate(file_content.split('\n\n')):
             if txt_chunk.strip():
                 print(f"Processing text chunk: {txt_chunk[:30]}...")
@@ -307,6 +340,16 @@ def process_txt_chunks(dataset_id, document, txt_path):
                     clean_chunk = remove_images_from_content(txt_chunk)
                     chunk = document.add_chunk(content=clean_chunk)
 
+                    # Initialize the chunk info
+                    chunk_info = {
+                        'id': chunk.id,
+                        'text': chunk.content,
+                        'has_image': False,  # defaults to False
+                        'img_url': img_url
+                    }
+
+                    upload_success = False
+
                     # Check whether the image is a web image (new logic)
                     if img_url.startswith(('http://', 'https://')):
                         # Download the web image to a temporary file
@@ -321,10 +364,10 @@ def process_txt_chunks(dataset_id, document, txt_path):
 
                             # Upload the temporary file
                             if upload_file2minio(dataset_id, chunk.id, tmp_path):
-                                img_chunk_ids.append(chunk.id)
-                                # new_img_id = f"{dataset_id}-{chunk.id}"
-                                # print(f"Web image {img_url} downloaded and uploaded, new img_id: {new_img_id}")
-                                # update_img_id_in_elasticsearch(elastic_tenant_id, document.id, chunk.id, new_img_id)
+                                upload_success = True
+                                new_img_id = f"{dataset_id}-{chunk.id}"
+                                print(f"Web image {img_url} downloaded and uploaded, new img_id: {new_img_id}")
+                                # update_positon_img_id_in_elasticsearch(elastic_tenant_id, document.id, chunk.id, [], new_img_id)
 
                             # Delete the temporary file
                             os.unlink(tmp_path)
@@ -340,23 +383,84 @@ def process_txt_chunks(dataset_id, document, txt_path):
                         print(f"Absolute image path: {img_abs_path}")
                         if os.path.exists(img_abs_path):
                             if upload_file2minio(dataset_id, chunk.id, img_abs_path):
-                                img_chunk_ids.append(chunk.id)
-                                # new_img_id = f"{dataset_id}-{chunk.id}"
-                                # print(f"Image {img_abs_path} uploaded, new img_id: {new_img_id}")
-                                # update_img_id_in_elasticsearch(elastic_tenant_id, document.id, chunk.id, new_img_id)
+                                upload_success = True
+                                new_img_id = f"{dataset_id}-{chunk.id}"
+                                print(f"Image {img_abs_path} uploaded, new img_id: {new_img_id}")
+                                # update_positon_img_id_in_elasticsearch(elastic_tenant_id, document.id, chunk.id, [], new_img_id)
                         else:
                             print(f"Image not found: {img_abs_path}, skipping.")
 
+                    # Only set has_image to True after a successful upload
+                    if upload_success:
+                        chunk_info['has_image'] = True
+
+                    chunks_info.append(chunk_info)
                 else:
                     print("No image link detected, adding the text chunk directly.")
                     chunk = document.add_chunk(content=txt_chunk)
+                    # Add info for the chunk without an image
+                    chunk_info = {
+                        'id': chunk.id,
+                        'text': chunk.content,
+                        'has_image': False,
+                        'img_url': None
+                    }
+                    chunks_info.append(chunk_info)
 
                 print(f"Chunk {num+1} added successfully! ID: {chunk.id}")
-        for img_chunk_id in img_chunk_ids:
-            update_img_id_in_elasticsearch(elastic_tenant_id, document.id, img_chunk_id, f"{dataset_id}-{img_chunk_id}")
 
+        return chunks_info  # return chunks_info
 
     except Exception as e:
         print(f"Error while processing text file: {txt_path}, error: {e}")
+        return []  # return an empty list on error
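For reference, each element of the chunks_info list built by process_txt_chunks has this shape (field names taken from the code above; the values here are hypothetical):

```python
# One element of chunks_info as built above (values are made up).
chunk_info = {
    'id': 'chunk-123',                         # id returned by document.add_chunk
    'text': 'Cleaned chunk text...',           # content stored in the chunk
    'has_image': True,                         # True only after a successful MinIO upload
    'img_url': 'https://example.com/fig.png',  # None for text-only chunks
}
```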
 
+def get_positions_from_chunk(pdf_path, chunks_info):
+    """
+    Get position info for text chunks from a PDF.
+
+    :param pdf_path: path to the PDF file
+    :param chunks_info: list of chunk info dicts, each with 'id' and 'text' keys
+    :return: the list with position info attached
+    """
+    try:
+        # Extract the text of all chunks for batch lookup
+        chunk_texts = [chunk_info['text'] for chunk_info in chunks_info]
+
+        # Use smart fuzzy search to get position info
+        batch_positions = smart_fuzzy_find_text_batch(pdf_path, chunk_texts, similarity_threshold=0.7)
+
+        # Attach the positions to chunks_info and make sure the data types are correct
+        for i, chunk_info in enumerate(chunks_info):
+            positions = batch_positions[i] if i < len(batch_positions) else []
+
+            # Process the position info
+            processed_positions = []
+            for pos in positions:
+                if isinstance(pos, dict):
+                    # Build a new position dict, making sure all coordinates are integers
+                    processed_pos = {
+                        'x0': int(round(float(pos['x0']))) if pos.get('x0') is not None else 0,
+                        'y0': int(round(float(pos['y0']))) if pos.get('y0') is not None else 0,
+                        'x1': int(round(float(pos['x1']))) if pos.get('x1') is not None else 0,
+                        'y1': int(round(float(pos['y1']))) if pos.get('y1') is not None else 0,
+                        'page': int(pos['page']) if pos.get('page') is not None else 0
+                    }
+                    processed_positions.append(processed_pos)
+                elif isinstance(pos, (list, tuple)) and len(pos) >= 5:
+                    # Bug fix: the find_* helpers return list positions of the form
+                    # [page, left, right, top, bottom, ...]; keep the first five
+                    # values, coerced to int, so they are not silently dropped
+                    processed_positions.append([int(round(float(v))) for v in pos[:5]])
+
+            # Update positions in chunk_info
+            chunk_info['positions'] = processed_positions
+
+        return chunks_info
+
+    except Exception as e:
+        print(f"Error while getting PDF text positions: {str(e)}")
+        # On error, attach empty positions to every chunk
+        for chunk_info in chunks_info:
+            chunk_info['positions'] = []
+        return chunks_info
 
 
 
 
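The coercion above guards against float or string coordinates before they are written to Elasticsearch. A standalone demo with made-up values:

```python
# Demo of the coordinate coercion used above (sample values are made up).
pos = {'x0': 10.6, 'y0': '20.2', 'x1': 300.0, 'y1': 330.9, 'page': 2}
processed = {
    'x0': int(round(float(pos['x0']))) if pos.get('x0') is not None else 0,
    'y0': int(round(float(pos['y0']))) if pos.get('y0') is not None else 0,
    'x1': int(round(float(pos['x1']))) if pos.get('x1') is not None else 0,
    'y1': int(round(float(pos['y1']))) if pos.get('y1') is not None else 0,
    'page': int(pos['page']) if pos.get('page') is not None else 0,
}
print(processed)  # {'x0': 11, 'y0': 20, 'x1': 300, 'y1': 331, 'page': 2}

# List-shaped positions from the find_* helpers reduce the same way.
list_pos = [2, 10.6, 300.0, 330.9, 20.2, "matched text", 0.93]
print([int(round(float(v))) for v in list_pos[:5]])  # [2, 11, 300, 331, 20]
```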
@@ -371,7 +475,19 @@ def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
 
         txt_path = txt_dict.get(name)
         if txt_path:
-            process_txt_chunks(dataset.id, document, txt_path)
+            chunks_info = process_txt_chunks(dataset.id, document, txt_path)
+            if chunks_info:
+                chunks_info = get_positions_from_chunk(pdf_path, chunks_info)
+                for chunk_info in chunks_info:
+                    print(f"Chunk ID: {chunk_info['id']}, Text: {chunk_info['text'][:30]}..., Has Image: {chunk_info['has_image']}, Positions: {chunk_info['positions']}")
+                    if chunk_info['has_image']:
+                        # If there is an image, update img_id in Elasticsearch
+                        update_positon_img_id_in_elasticsearch(elastic_tenant_id, document.id, chunk_info['id'], chunk_info['positions'], f"{dataset.id}-{chunk_info['id']}")
+                    else:
+                        # If there is no image, still update the position info
+                        update_positon_img_id_in_elasticsearch(elastic_tenant_id, document.id, chunk_info['id'], chunk_info['positions'], None)
 
 
 
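The per-document flow this hunk wires together, reduced to a runnable skeleton; everything except the control flow is stubbed with fakes:

```python
# Control-flow skeleton of the hunk above; all names besides the flow are fakes.
def process_txt_chunks(dataset_id, document, txt_path):
    return [{'id': 'c1', 'text': 'hello', 'has_image': False}]

def get_positions_from_chunk(pdf_path, chunks_info):
    for ci in chunks_info:
        ci['positions'] = [[1, 10, 100, 20, 40]]
    return chunks_info

def update_positon_img_id_in_elasticsearch(tenant, doc_id, chunk_id, positions, img_id):
    print(f"update chunk={chunk_id} img_id={img_id} positions={positions}")

chunks_info = process_txt_chunks('ds1', 'doc', 'a.txt')        # create the chunks
if chunks_info:
    chunks_info = get_positions_from_chunk('a.pdf', chunks_info)  # locate them in the PDF
    for info in chunks_info:
        img_id = 'ds1-' + info['id'] if info['has_image'] else None
        update_positon_img_id_in_elasticsearch('tenant', 'doc-id', info['id'],
                                               info['positions'], img_id)
```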
 def main():
 
 
@@ -58,6 +58,151 @@ def find_fuzzy_text_positions_batch(pdf_path, target_texts, similarity_threshold
+    """
+    Fuzzy-search a batch of target texts in a PDF and return their coordinates.
+
+    Args:
+        pdf_path (str): path to the PDF file
+        target_texts (list): list of texts to search for
+        similarity_threshold (float): similarity threshold (0-1), default 0.8
+
+    Returns:
+        list: one list per target text, containing the coordinates of matched text
+    """
+    if not os.path.exists(pdf_path):
+        raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
+
+    # Initialize the result list
+    batch_results = [[] for _ in target_texts]
+
+    # Open the local PDF file
+    with open(pdf_path, 'rb') as fp:
+        parser = PDFParser(fp)
+        doc = PDFDocument(parser)
+
+        rsrcmgr = PDFResourceManager()
+        laparams = LAParams()
+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+
+        # Process every page
+        pages_chars = []
+        for page_num, page in enumerate(PDFPage.create_pages(doc), 1):
+            interpreter.process_page(page)
+            layout = device.get_result()
+            char_list = parse_char_layout(layout)
+            pages_chars.append((page_num, char_list))
+
+        # Preprocess the text of all pages
+        pages_cleaned_text = []
+        for page_num, char_list in pages_chars:
+            page_text = ''.join([char_info['char'] for char_info in char_list])
+            cleaned_page_text = clean_text_for_fuzzy_match(page_text)
+            pages_cleaned_text.append((page_num, cleaned_page_text, char_list))
+
+        # Search for each target text
+        for idx, target_text in enumerate(target_texts):
+            # Clean the target text
+            cleaned_target = clean_text_for_fuzzy_match(target_text)
+            target_len = len(cleaned_target)
+
+            if target_len == 0:
+                continue
+
+            found_positions = []
+
+            # Search every page
+            for page_num, cleaned_page_text, char_list in pages_cleaned_text:
+                # Sliding-window search for similar text
+                matches = []
+                for i in range(len(cleaned_page_text) - target_len + 1):
+                    window_text = cleaned_page_text[i:i + target_len]
+                    similarity = SequenceMatcher(None, cleaned_target, window_text).ratio()
+
+                    if similarity >= similarity_threshold:
+                        # Match found: record its position and similarity
+                        if i < len(char_list):
+                            matches.append({
+                                'start_idx': i,
+                                'end_idx': min(i + target_len - 1, len(char_list) - 1),
+                                'similarity': similarity
+                            })
+
+                # Merge adjacent matches
+                if matches:
+                    # Sort by start position
+                    matches.sort(key=lambda x: x['start_idx'])
+
+                    # Merge adjacent or overlapping matches
+                    merged_matches = []
+                    current_match = matches[0].copy()  # make a copy
+
+                    for i in range(1, len(matches)):
+                        next_match = matches[i]
+                        # If the next match is adjacent to or overlaps the current one, merge them
+                        # Condition: next start <= current end + a small buffer distance
+                        if next_match['start_idx'] <= current_match['end_idx'] + min(target_len, 10):
+                            # Merge the index ranges
+                            current_match['start_idx'] = min(current_match['start_idx'], next_match['start_idx'])
+                            current_match['end_idx'] = max(current_match['end_idx'], next_match['end_idx'])
+                            # Compute the length-weighted average similarity
+                            total_length = (current_match['end_idx'] - current_match['start_idx'] + 1) + \
+                                          (next_match['end_idx'] - next_match['start_idx'] + 1)
+                            current_match['similarity'] = (
+                                current_match['similarity'] * (current_match['end_idx'] - current_match['start_idx'] + 1) +
+                                next_match['similarity'] * (next_match['end_idx'] - next_match['start_idx'] + 1)
+                            ) / total_length
+                        else:
+                            # Not adjacent: save the current match and start a new one
+                            merged_matches.append(current_match)
+                            current_match = next_match.copy()  # make a copy
+
+                    # Append the last match
+                    merged_matches.append(current_match)
+
+                    # Generate coordinates for each merged match
+                    for match in merged_matches:
+                        start_idx = match['start_idx']
+                        end_idx = match['end_idx']
+
+                        if start_idx < len(char_list) and end_idx < len(char_list):
+                            # Get all characters in the matched region
+                            matched_chars = char_list[start_idx:end_idx+1]
+
+                            # Filter out characters with zero coordinates (usually special characters)
+                            valid_chars = [char for char in matched_chars
+                                         if char['x'] > 0 and char['y'] > 0]
+
+                            # If there are no valid characters, use all of them
+                            chars_to_use = valid_chars if valid_chars else matched_chars
+
+                            # Compute the bounding box (left, right, top, bottom)
+                            if chars_to_use:
+                                # Compute the bounds
+                                left = min([char['x'] for char in chars_to_use])
+                                right = max([char['x'] for char in chars_to_use])
+                                bottom = min([char['y'] for char in chars_to_use])
+                                top = max([char['y'] for char in chars_to_use])
+
+                                # Get the matched text
+                                matched_text = ''.join([char_info['char'] for char_info in chars_to_use])
+
+                                # Only add the result when the bounding box is valid
+                                if left >= 0 and right > left and top > bottom:
+                                    position = [
+                                        page_num,
+                                        left,    # left
+                                        right,   # right
+                                        top,     # top
+                                        bottom,  # bottom
+                                        matched_text,  # the matched content
+                                        match['similarity']  # the similarity score
+                                    ]
+                                    found_positions.append(position)
+
+            batch_results[idx] = found_positions
+
+    return batch_results
     """
     Fuzzy-search a batch of target texts in a PDF and return their coordinates.
 
     Args:
         pdf_path (str): path to the PDF file
         target_texts (list): list of texts to search for
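The core of the new implementation is a character-level sliding window scored with difflib.SequenceMatcher, followed by a merge of adjacent hits. A self-contained toy illustration of the windowing step (strings and threshold invented for the example):

```python
# Toy demonstration of the sliding-window fuzzy matching used above.
from difflib import SequenceMatcher

page_text = "Intro... the quick brown fox jumps over the lazy dog ...outro"
target = "quick brawn fox"  # note the typo: it still matches fuzzily
threshold = 0.8

matches = []
for i in range(len(page_text) - len(target) + 1):
    window = page_text[i:i + len(target)]
    ratio = SequenceMatcher(None, target, window).ratio()
    if ratio >= threshold:
        matches.append((i, i + len(target) - 1, ratio))

# Overlapping windows produce runs of hits; the real code merges adjacent
# index ranges and keeps a length-weighted average similarity.
print(matches[:3])
```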
@@ -90,6 +235,13 @@ def find_fuzzy_text_positions_batch(pdf_path, target_texts, similarity_threshold
             char_list = parse_char_layout(layout)
             pages_chars.append((page_num, char_list))
 
+        # Preprocess the text of all pages
+        pages_cleaned_text = []
+        for page_num, char_list in pages_chars:
+            page_text = ''.join([char_info['char'] for char_info in char_list])
+            cleaned_page_text = clean_text_for_fuzzy_match(page_text)
+            pages_cleaned_text.append((page_num, cleaned_page_text, char_list))
+
         # Search for each target text
         for target_text in target_texts:
             # Clean the target text
@@ -102,11 +254,7 @@ def find_fuzzy_text_positions_batch(pdf_path, target_texts, similarity_threshold
             found_positions = []
 
             # Search every page
-            for page_num, char_list in pages_chars:
-                # Join the page characters into text
-                page_text = ''.join([char_info['char'] for char_info in char_list])
-                cleaned_page_text = clean_text_for_fuzzy_match(page_text)
-
+            for page_num, cleaned_page_text, char_list in pages_cleaned_text:
                 # Sliding-window search for similar text
                 matches = []
                 for i in range(len(cleaned_page_text) - target_len + 1):
@@ -197,7 +345,6 @@ def find_fuzzy_text_positions_batch(pdf_path, target_texts, similarity_threshold
             batch_results[target_text] = found_positions
 
     return batch_results
-
 def find_text_positions_batch(pdf_path, target_texts):
     """
     Search a batch of target texts in a PDF and return their coordinates.
@@ -207,13 +354,13 @@ def find_text_positions_batch(pdf_path, target_texts):
         target_texts (list): list of texts to search for
 
     Returns:
-        dict: keyed by target_text, with a list of matched-text coordinates per key
+        list: one list per target text, containing the coordinates of matched text
     """
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
 
-    # Initialize the result dict
-    batch_results = {text: [] for text in target_texts}
+    # Initialize the result list
+    batch_results = [[] for _ in target_texts]
 
     # Open the local PDF file
     with open(pdf_path, 'rb') as fp:
@@ -241,7 +388,7 @@ def find_text_positions_batch(pdf_path, target_texts):
         normalized_full_text = normalize_text(full_text)
 
         # Find positions for each target text
-        for target_text in target_texts:
+        for idx, target_text in enumerate(target_texts):
            # Normalize the target text
            normalized_target = normalize_text(target_text)
 
@@ -284,7 +431,7 @@ def find_text_positions_batch(pdf_path, target_texts):
 
                 start = pos + 1
 
-            batch_results[target_text] = found_positions
+            batch_results[idx] = found_positions
 
     return batch_results
 
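Design note on the dict-to-list change running through these hunks: index-aligned results stay correct when target_texts contains duplicate strings, which a text-keyed dict would silently collapse, and they preserve input order. A tiny consumer-side sketch with made-up data:

```python
# Results are now index-aligned with target_texts (hypothetical data).
target_texts = ["Chapter 1", "Summary", "Chapter 1"]  # a duplicate is fine
batch_results = [[[1, 50, 200, 70, 90]], [], [[9, 50, 200, 70, 90]]]

for idx, positions in enumerate(batch_results):
    print(target_texts[idx], "->", positions)
# A dict keyed by text would collapse the two "Chapter 1" entries into one.
```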
@@ -297,13 +444,13 @@ def find_text_in_pdf_per_page_batch(pdf_path, target_texts):
         target_texts (list): list of texts to search for
 
     Returns:
-        dict: keyed by target_text, with a list of matched-text coordinates per key
+        list: one list per target text, containing the coordinates of matched text
     """
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
 
-    # Initialize the result dict
-    batch_results = {text: [] for text in target_texts}
+    # Initialize the result list
+    batch_results = [[] for _ in target_texts]
 
     # Open the local PDF file
     with open(pdf_path, 'rb') as fp:
@@ -325,10 +472,11 @@ def find_text_in_pdf_per_page_batch(pdf_path, target_texts):
             page_text = ''.join([char_info['char'] for char_info in char_list])
             normalized_page_text = normalize_text(page_text)
 
+            # Preprocess all target texts
+            normalized_targets = [normalize_text(text) for text in target_texts]
+
             # Search the current page for each target text
-            for target_text in target_texts:
-                normalized_target = normalize_text(target_text)
-
+            for idx, normalized_target in enumerate(normalized_targets):
                 # Find the target text in the page text
                 pos = normalized_page_text.find(normalized_target)
                 if pos != -1:
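This hunk hoists normalize_text out of the per-page loop so each target is normalized once and every page only pays for the find() calls. A toy illustration; normalize_text is approximated here as lowercase-with-whitespace-removed, which is an assumption:

```python
# Toy illustration: normalize once, then search every page.
# normalize_text is approximated here (an assumption about its behavior).
def normalize_text(s):
    return ''.join(s.lower().split())

target_texts = ["Hello World", "Missing Text"]
pages = ["...helLo woRld...", "nothing here"]

normalized_targets = [normalize_text(t) for t in target_texts]  # hoisted
for page_num, page_text in enumerate(pages, 1):
    normalized_page = normalize_text(page_text)
    for idx, norm_target in enumerate(normalized_targets):
        pos = normalized_page.find(norm_target)
        if pos != -1:
            print(f"target {idx} found on page {page_num} at offset {pos}")
```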
@@ -349,13 +497,13 @@ def find_text_in_pdf_per_page_batch(pdf_path, target_texts):
                             top = max(start_char['y'], end_char['y'])
 
                             position = [
-                                page_num,
-                                left,    # left
-                                right,   # right
-                                top,     # top
-                                bottom,  # bottom
+                                int(page_num),
+                                int(left),    # left
+                                int(right),   # right
+                                int(top),     # top
+                                int(bottom),  # bottom
                             ]
-                            batch_results[target_text].append(position)
+                            batch_results[idx].append(position)
 
     return batch_results
 
@@ -369,13 +517,13 @@ def find_partial_text_positions_batch(pdf_path, target_texts, min_match_ratio=0.
         min_match_ratio (float): minimum match ratio (0-1)
 
     Returns:
-        dict: keyed by target_text, with a list of matched-text coordinates per key
+        list: one list per target text, containing the coordinates of matched text
     """
     if not os.path.exists(pdf_path):
         raise FileNotFoundError(f"PDF file does not exist: {pdf_path}")
 
-    # Initialize the result dict
-    batch_results = {text: [] for text in target_texts}
+    # Initialize the result list
+    batch_results = [[] for _ in target_texts]
 
     # Open the local PDF file
     with open(pdf_path, 'rb') as fp:
@@ -397,9 +545,10 @@ def find_partial_text_positions_batch(pdf_path, target_texts, min_match_ratio=0.
             page_text = ''.join([char_info['char'] for char_info in char_list])
             normalized_page_text = normalize_text(page_text)
 
-            # Compute matches for each target text
+            # Preprocess all target texts
+            normalized_targets = []
+            keywords_list = []
             for target_text in target_texts:
                 # Split the target text into keywords or phrases
                 normalized_target = normalize_text(target_text)
                 # Extract keywords (the words left after removing common stop words)
                 keywords = [word for word in normalized_target.split() if len(word) > 2]
@@ -407,6 +556,11 @@ def find_partial_text_positions_batch(pdf_path, target_texts, min_match_ratio=0.
                 if not keywords:
                     keywords = normalized_target.split()  # if there are no long words, use all words
 
+                normalized_targets.append(normalized_target)
+                keywords_list.append(keywords if keywords else [])
+
+            # Compute matches for each target text
+            for idx, (normalized_target, keywords) in enumerate(zip(normalized_targets, keywords_list)):
                 if not keywords:
                     continue
 
@@ -440,7 +594,7 @@ def find_partial_text_positions_batch(pdf_path, target_texts, min_match_ratio=0.
                             top,     # top
                             bottom,  # bottom
                         ]
-                        batch_results[target_text].append(position)
+                        batch_results[idx].append(position)
 
     return batch_results
 
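The keyword extraction behind the partial-match fallback, shown standalone. Note it tokenizes on whitespace, so it mainly benefits Latin-script text (an observation based on the split() calls above):

```python
# Standalone keyword extraction as used by the partial-match fallback.
normalized_target = "smart city digital transformation of it"
keywords = [w for w in normalized_target.split() if len(w) > 2]
if not keywords:                      # no long words: fall back to all words
    keywords = normalized_target.split()
print(keywords)  # ['smart', 'city', 'digital', 'transformation']
```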
@@ -454,62 +608,71 @@ def smart_fuzzy_find_text_batch(pdf_path, target_texts, similarity_threshold=0.8
         similarity_threshold (float): similarity threshold
 
     Returns:
-        dict: keyed by target_text, with a list of matched-text coordinates per key
+        list: one list per target text, containing the coordinates of matched text
     """
-    # Initialize the result dict
-    batch_results = {text: [] for text in target_texts}
+    # Initialize the result list
+    batch_results = [[] for _ in target_texts]
 
     # Method 1: exact matching
     exact_results = find_text_in_pdf_per_page_batch(pdf_path, target_texts)
 
     # For texts that already have an exact match, use the result directly
-    remaining_texts = []
-    for text in target_texts:
-        if exact_results.get(text):
-            batch_results[text] = exact_results[text]
+    remaining_indices = []
+    for idx, results in enumerate(exact_results):
+        if results:
+            batch_results[idx] = results
         else:
-            remaining_texts.append(text)
+            remaining_indices.append(idx)
 
-    if not remaining_texts:
+    if not remaining_indices:
         return batch_results
 
+    # Build the list of remaining texts
+    remaining_texts = [target_texts[idx] for idx in remaining_indices]
+
     # Method 2: fuzzy matching (only for texts without an exact match)
     fuzzy_results = find_fuzzy_text_positions_batch(pdf_path, remaining_texts, similarity_threshold)
 
     # Update the results
-    for text in remaining_texts:
-        if fuzzy_results.get(text):
-            batch_results[text] = fuzzy_results[text]
-            remaining_texts = [t for t in remaining_texts if t != text]  # remove from the remaining texts
+    for i, idx in enumerate(remaining_indices):
+        if fuzzy_results[i]:
+            batch_results[idx] = fuzzy_results[i]
+            remaining_indices = [ri for ri in remaining_indices if ri != idx]  # remove from the remaining indices
 
-    if not remaining_texts:
+    if not remaining_indices:
         return batch_results
 
+    # Build the list of remaining texts
+    remaining_texts = [target_texts[idx] for idx in remaining_indices]
+
     # Method 3: partial matching (keyword matching, only for texts still unmatched)
     partial_results = find_partial_text_positions_batch(pdf_path, remaining_texts, 0.5)
 
     # Update the final results
-    for text in remaining_texts:
-        if partial_results.get(text):
-            batch_results[text] = partial_results[text]
+    for i, idx in enumerate(remaining_indices):
+        if partial_results[i]:
+            batch_results[idx] = partial_results[i]
 
     return batch_results
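After this hunk, smart_fuzzy_find_text_batch degrades gracefully: exact per-page search first, then sliding-window fuzzy search, then keyword-based partial matching, with each later stage re-querying only the still-unmatched indices. The control flow, reduced to a runnable skeleton with stand-in stages:

```python
# The three-stage fallback of smart_fuzzy_find_text_batch, reduced to its
# control flow; the stage functions here are trivial stand-ins.
def cascade(texts, stages):
    results = [[] for _ in texts]
    pending = list(range(len(texts)))      # indices still unmatched
    for stage in stages:
        stage_out = stage([texts[i] for i in pending])
        still_pending = []
        for k, i in enumerate(pending):
            if stage_out[k]:
                results[i] = stage_out[k]  # the earliest stage that matches wins
            else:
                still_pending.append(i)
        pending = still_pending
        if not pending:
            break
    return results

exact   = lambda ts: [[t] if t == "a" else [] for t in ts]  # stand-in stages
fuzzy   = lambda ts: [[t] if t == "b" else [] for t in ts]
partial = lambda ts: [[t] for t in ts]

print(cascade(["a", "b", "c"], [exact, fuzzy, partial]))
# [['a'], ['b'], ['c']] - each text is resolved by the first stage that finds it
```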
 
 if __name__ == '__main__':
     # Use a local PDF file
-    pdf_file_path = 'F:\\gitea\\ragflow_api_test\\ai协作方式.pdf'  # change to your PDF file path
+    pdf_file_path = 'F:\\2\\2024深化智慧城市发展推进城市全域数字化转型的指导意见.pdf'  # change to your PDF file path
     target_texts = [
-        '''创建 `plan` 文件: 固化和锁定最终的"怎么做"
-• 基于 `plan` 执行: 精准驱动 AI 完成任务''',
-        "其他要查找的文本1",
-        "其他要查找的文本2"
+        '''一、总体要求
+以习近平新时代中国特色社会主义思想为指导,完整、准确、全面贯彻新发展理念,统筹发展和安全,充分发挥数据的基础资源和创新引擎作用,整体性重塑智慧城市技术架构、系统性变革城市管理流程、一体化推动产城深度融合,全面提升城市全域数字化转型的整体性、系统性、协同性,不断满足人民日益增长的美好生活需要,为全面建设社会主义现代化国家提供强大动力。到2027年,全国城市全域数字化转型取得明显成效,形成一批横向打通、纵向贯通、各具特色的宜居、韧性、智慧城市,有力支撑数字中国建设。到2030年,全国城市全域数字化转型全面突破,人民群众的获得感、幸福感、安全感全面提升,涌现一批数字文明时代具有全球竞争力的中国式现代化城市。''',
+        '''二、全领域推进城市数字化转型
+(一)建立城市数字化共性基础。构建统一规划、统一架构、统一标准、统一运维的城市运行和治理智能中枢,打造线上线下联动、服务管理协同的城市共性支撑平台,构建开放兼容、共性赋能、安全可靠的综合性基础环境,推进算法、模型等数字资源一体集成部署,探索建立共性组件、模块等共享协作机制。鼓励发展基于人工智能等技术的智能分析、智能调度、智能监管、辅助决策,全面支撑赋能城市数字化转型场景建设与发展。鼓励有条件的地方推进城市信息模型、时空大数据、国土空间基础信息、实景三维中国等基础平台功能整合、协同发展、应用赋能,为城市数字化转型提供统一的时空框架,因地制宜有序探索推进数字孪生城市建设,推动虚实共生、仿真推演、迭代优化的数字孪生场景落地。
+(二)培育壮大城市数字经济。深入推进数字技术与一二三产业深度融合,鼓励平台企业构建多层次产业互联网服务平台。因地制宜发展智慧农业,加快工业互联网规模化应用,推动金融、物流等生产性服务业和商贸、文旅、康养等生活性服务业数字化转型,提升“上云用数赋智”水平。深化数字化转型促进中心建设,促进城市数字化转型和大中小企业融合创新协同发展。因地制宜发展新兴数字产业,加强大数据、人工智能、区块链、先进计算、未来网络、卫星遥感、三维建模等关键数字技术在城市场景中集成应用,加快技术创新成果转化,打造具有国际竞争力的数字产业集群。培育壮大数据产业,发展一批数据商和第三方专业服务机构,提高数据要素应用支撑与服务能力。''',
+        """(三)促进新型产城融合发展。创新生产空间和生活空间融合的数字化场景,加强城市空间开发利用大数据分析,推进数字化赋能郊区新城,实现城市多中心、网络化、组团式发展。推动城市“数字更新”,加快街区、商圈等城市微单元基础设施智能化升级,探索利用数字技术创新应用场景,激发产城融合服务能级与数字活力。深化城市场景开放促进以城带产,提升产业聚合力。加速创新资源共享助力以产促城,发展虚拟园区和跨区域协同创新平台,增强城市数字经济就业吸附力。"""
     ]
 
     try:
         print("Batch smart fuzzy search:")
         batch_positions = smart_fuzzy_find_text_batch(pdf_file_path, target_texts, similarity_threshold=0.7)
 
-        for target_text, positions in batch_positions.items():
+        # batch_positions is now a list; use enumerate to get the index and positions together
+        for idx, positions in enumerate(batch_positions):
+            target_text = target_texts[idx]
             print(f"\nSearching for text: {target_text[:50]}{'...' if len(target_text) > 50 else ''}")
             if positions:
                 print(f"Text found at the following positions:")
 
src/get_pos_pdf_.py (new file, 53 lines)
@@ -0,0 +1,53 @@
+import fitz  # PyMuPDF
+import difflib
+
+def find_text_in_pdf_detailed(pdf_path, query_text, threshold=0.8):
+    """
+    Search a PDF for text in detail, matching by block and by line.
+    """
+    results = []
+    doc = fitz.open(pdf_path)
+
+    # Clean the query text
+    cleaned_query = ' '.join(query_text.split())
+    print(f"Searching for text: {cleaned_query[:100]}...")
+
+    for page_num in range(len(doc)):
+        page = doc.load_page(page_num)
+        blocks = page.get_text("dict")["blocks"]
+
+        for block in blocks:
+            if "lines" not in block:
+                continue
+
+            # Assemble the text of the whole block
+            block_text = ""
+            for line in block["lines"]:
+                for span in line["spans"]:
+                    block_text += span["text"]
+
+            if block_text.strip():
+                similarity = difflib.SequenceMatcher(None, cleaned_query.strip(), block_text.strip()).ratio()
+                if similarity >= threshold:
+                    # Use the block's bounding box
+                    bbox = block["bbox"] if "bbox" in block else None
+                    if bbox:
+                        results.append((page_num + 1, bbox))
+                        print(f"Block match on page {page_num + 1}, similarity: {similarity:.2f}")
+                elif similarity >= 0.1:  # debug output
+                    print(f"Page {page_num + 1} block similarity: {similarity:.2f}")
+
+    doc.close()
+    return results
+
+# Example usage
+if __name__ == "__main__":
+    pdf_path = 'F:\\2\\2024深化智慧城市发展推进城市全域数字化转型的指导意见.pdf'
+    query = '''一、总体要求
+以习近平新时代中国特色社会主义思想为指导,完'''
+
+    print("Starting detailed search...")
+    matches = find_text_in_pdf_detailed(pdf_path, query, threshold=0.3)
+    print(f"Found {len(matches)} matches")
+    for page, bbox in matches:
+        print(f"Match found on page {page}, position: {bbox}")
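The commit message mentions matching by line as well as by block, while find_text_in_pdf_detailed above only scores whole blocks. A hedged sketch of the line-level variant, using the same PyMuPDF get_text("dict") layout; the function name and default threshold are hypothetical:

```python
# Line-level variant (sketch): score each line's text instead of whole blocks.
import difflib
import fitz  # PyMuPDF

def find_text_lines(pdf_path, query_text, threshold=0.6):
    cleaned_query = ' '.join(query_text.split())
    results = []
    doc = fitz.open(pdf_path)
    for page_num in range(len(doc)):
        for block in doc.load_page(page_num).get_text("dict")["blocks"]:
            for line in block.get("lines", []):   # image blocks have no "lines"
                line_text = ''.join(span["text"] for span in line["spans"])
                if not line_text.strip():
                    continue
                sim = difflib.SequenceMatcher(None, cleaned_query, line_text.strip()).ratio()
                if sim >= threshold:
                    # line["bbox"] is the line's bounding box in page coordinates
                    results.append((page_num + 1, line["bbox"], sim))
    doc.close()
    return results
```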