Compare commits
3 Commits
e5ac523bd9
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 51f24ced05 | |||
| 1c23d272bb | |||
| c1d66237e6 |
256
chunk_pos.py
Normal file
256
chunk_pos.py
Normal file
@@ -0,0 +1,256 @@
|
||||
from elasticsearch import Elasticsearch
#from src.add_chunk_cli_pdf_img import update_positon_img_id_in_elasticsearch
# Elasticsearch bootstrap; credentials come from the environment
# (historically user "elastic" / password "infini_rag_flow").

from dotenv import load_dotenv  # newly added: read ELASTIC_* settings from .env
import os
import json

# Load environment variables from the .env file before they are read below.
load_dotenv()

# Shared Elasticsearch client used by every function in this module.
# NOTE(review): int(...) raises TypeError if ELASTIC_PORT is unset — this
# assumes a complete .env; confirm deployments always provide it.
es = Elasticsearch(
    [{
        'host': os.getenv("ELASTIC_HOST"),
        'port': int(os.getenv("ELASTIC_PORT")),
        'scheme': 'http'
    }],
    basic_auth=(
        os.getenv("ELASTIC_USERNAME"),
        os.getenv("ELASTIC_PASSWORD")
    )
)
|
||||
|
||||
|
||||
def get_index_mapping(tenant_id):
    """Fetch the field mapping of the tenant's RAGFlow index.

    :param tenant_id: tenant ID (index is ``ragflow_{tenant_id}``)
    :return: dict with ``code``/``message``/``data``; ``data`` holds the
        mapping on success (code 0) and is empty on failure (code 500)
    """
    index = f"ragflow_{tenant_id}"
    try:
        response = es.indices.get_mapping(index=index)
    except Exception as err:
        return {"code": 500, "message": str(err), "data": {}}
    # Convert ObjectApiResponse to a plain dict so it serializes cleanly.
    return {"code": 0, "message": "", "data": dict(response)}
|
||||
|
||||
def update_positon_in_elasticsearch(tenant_id, doc_id, chunk_id, positions):
    """Update the precise position fields of one chunk document in Elasticsearch.

    Converts ``positions`` (list of ``[page, left, right, top, bottom]``
    entries with 0-based page numbers) into the tuple format used by stock
    RAGFlow and writes ``position_int``, ``page_num_int`` and ``top_int``.

    :param tenant_id: tenant ID (index is ``ragflow_{tenant_id}``)
    :param doc_id: document ID the chunk belongs to
    :param chunk_id: chunk ID (the ES ``_id``) to update
    :param positions: list of 5-element position lists; entries of any other
        length are skipped
    :return: dict with ``code``/``message`` — 0 on success, 101 on ES error,
        102 when the chunk cannot be found
    """
    # Fixed docstring (previously described a nonexistent ``new_img_id``
    # param), added try/except consistent with the sibling updater, and made
    # every path return a result dict instead of returning None on success.
    if not positions:
        return {"code": 0, "message": "No positions supplied"}

    position_int = []
    for pos in positions:
        if len(pos) != 5:
            continue  # Skip invalid positions
        pn, left, right, top, bottom = pos
        # Tuple format, consistent with stock RAGFlow; pages become 1-based.
        position_int.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))

    if not position_int:
        # Nothing valid to write; do not touch the document.
        return {"code": 0, "message": "No valid positions supplied"}

    index_name = f"ragflow_{tenant_id}"
    query = {
        "bool": {
            "must": [
                {"term": {"doc_id": doc_id}},
                {"term": {"_id": chunk_id}}
            ]
        }
    }

    try:
        # Locate the target chunk document.
        result = es.search(index=index_name, body={"query": query})

        if result['hits']['total']['value'] == 0:
            print(f"在 Elasticsearch 中未找到文档: index={index_name}, doc_id={doc_id}, chunk_id={chunk_id}")
            return {"code": 102, "message": f"Can't find this chunk {chunk_id}"}

        doc_id_in_es = result['hits']['hits'][0]['_id']

        # Only add the precise position fields; sort helpers are derived
        # from the first position entry.
        update_body = {
            "doc": {
                "position_int": position_int,
                "page_num_int": [position_int[0][0]],
                "top_int": [position_int[0][3]],
            }
        }

        update_result = es.update(
            index=index_name,
            id=doc_id_in_es,
            body=update_body,
            refresh=True  # make the update immediately visible
        )

        print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")
        return {"code": 0, "message": ""}

    except Exception as e:
        # Mirror the error handling of update_positon_img_id_in_elasticsearch.
        print(f"更新 Elasticsearch 时发生错误: {str(e)}")
        return {"code": 101, "message": f"Error updating position: {str(e)}"}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position, new_img_id):
    """Update the ``positions`` and/or ``img_id`` of one chunk document.

    Only fields whose value is not ``None`` are written; the document is then
    read back to verify the write.

    :param tenant_id: tenant ID (index is ``ragflow_{tenant_id}``)
    :param doc_id: document ID the chunk belongs to
    :param chunk_id: chunk ID (the ES ``_id``) to update
    :param position: position payload for the ``positions`` field, or None
    :param new_img_id: new ``img_id`` value, or None
    :return: dict with ``code``/``message`` — 0 on success, 100 when the
        read-back verification fails, 101 on error, 102 when not found
    """
    try:
        index_name = f"ragflow_{tenant_id}"

        # Locate the chunk by doc_id + _id.
        search_body = {
            "query": {
                "bool": {
                    "must": [
                        {"term": {"doc_id": doc_id}},
                        {"term": {"_id": chunk_id}},
                    ]
                }
            }
        }
        found = es.search(index=index_name, body=search_body)

        if found['hits']['total']['value'] == 0:
            print(f"在 Elasticsearch 中未找到文档: index={index_name}, doc_id={doc_id}, chunk_id={chunk_id}")
            return {"code": 102, "message": f"Can't find this chunk {chunk_id}"}

        doc_id_in_es = found['hits']['hits'][0]['_id']

        # Collect only the fields the caller actually supplied.
        fields = {}
        if new_img_id is not None:
            fields["img_id"] = new_img_id
        if position is not None:
            fields["positions"] = position

        # Nothing to write — report success without touching the document.
        if not fields:
            print("没有需要更新的字段")
            return {"code": 0, "message": "No fields to update"}

        update_result = es.update(
            index=index_name,
            id=doc_id_in_es,
            body={"doc": fields},
            refresh=True,  # make the update immediately visible
        )
        print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")

        # Read the document back and verify what was written.
        verify_doc = es.get(index=index_name, id=doc_id_in_es)
        source = verify_doc['_source']

        img_id_updated = True
        if new_img_id is not None:
            img_id_updated = source.get('img_id') == new_img_id
            if img_id_updated:
                print(f"成功更新 img_id 为: {new_img_id}")
            else:
                print(f"更新验证失败,当前 img_id: {source.get('img_id')}")

        position_updated = True
        if position is not None:
            position_updated = source.get('positions') == position
            if position_updated:
                print(f"成功更新 position 为: {position}")
            else:
                print(f"更新验证失败,当前 position: {source.get('positions')}")

        if img_id_updated and position_updated:
            return {"code": 0, "message": ""}
        return {"code": 100, "message": "Failed to verify update"}

    except Exception as e:
        print(f"更新 Elasticsearch 时发生错误: {str(e)}")
        return {"code": 101, "message": f"Error updating img_id: {str(e)}"}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Example invocation — pushes a hand-crafted position onto one known chunk.
if __name__ == "__main__":
    # Smoke-test the Elasticsearch connection first.
    try:
        print(es.info())
    except Exception as e:
        print("连接失败:", e)


    # Work-machine test fixtures (hard-coded tenant/doc/chunk IDs).
    tenant_id = "d669205e57a211f0b9e7324e7f243034"
    new_img_id ="10345832587311f0919f3a2728512a4b-bd04866cd05337281"
    doc_id="ea8d75966df811f0925ac6e8db75f472"
    chunk_id="4a4927560a7e6d80"
    # Uncomment to inspect the index mapping:
    # mapping_result = get_index_mapping(tenant_id)
    # print("Positions field mapping:", mapping_result["data"][f"ragflow_{tenant_id}"]["mappings"]["properties"]["positions"])


    # Coordinate convention: left, right increase rightwards;
    # top, bottom increase downwards (top is the smaller value).

    # [page(0-based), left, right, top, bottom]
    pos = [[4, 0, 100, 200, 510]]
    #pos_string = json.dumps(pos)  # convert to a JSON string (unused)
    update_positon_in_elasticsearch(tenant_id, doc_id, chunk_id, pos)


    #update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, pos, "")
|
||||
@@ -9,8 +9,10 @@ import tempfile
|
||||
from elasticsearch import Elasticsearch
|
||||
from minio import Minio
|
||||
from minio.error import S3Error
|
||||
from find_text_in_pdf_enhanced import find_text_in_pdf
|
||||
import time
|
||||
|
||||
|
||||
# from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch
|
||||
|
||||
|
||||
from dotenv import load_dotenv # 新增
|
||||
@@ -47,7 +49,149 @@ MINIO_CONFIG = {
|
||||
"secure": False
|
||||
}
|
||||
|
||||
def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position, new_img_id):
|
||||
from elasticsearch.helpers import bulk
|
||||
|
||||
def bulk_update_elasticsearch(tenant_id, updates):
    """Bulk-update chunk documents (positions / img_id) in Elasticsearch.

    :param tenant_id: tenant ID (index is ``ragflow_{tenant_id}``)
    :param updates: list of dicts, each with ``doc_id``, ``chunk_id`` and
        optional ``positions`` / ``new_img_id``
    :return: dict with ``code``/``message`` — 0 on success, 101 on error
    """
    try:
        index_name = f"ragflow_{tenant_id}"
        actions = []

        for item in updates:
            doc_id = item['doc_id']
            chunk_id = item['chunk_id']
            positions = item.get('positions', [])
            new_img_id = item.get('new_img_id')

            # Locate the chunk first; the bulk update needs its ES _id.
            search_body = {
                "query": {
                    "bool": {
                        "must": [
                            {"term": {"doc_id": doc_id}},
                            {"term": {"_id": chunk_id}},
                        ]
                    }
                }
            }
            found = es.search(index=index_name, body=search_body)

            if found['hits']['total']['value'] == 0:
                print(f"在 Elasticsearch 中未找到文档: index={index_name}, doc_id={doc_id}, chunk_id={chunk_id}")
                continue

            es_id = found['hits']['hits'][0]['_id']

            # Only include fields the caller actually supplied.
            doc_update = {}
            if new_img_id is not None:
                doc_update["img_id"] = new_img_id

            if positions:
                # Tuple format, consistent with stock RAGFlow (1-based page);
                # malformed entries (len != 5) are skipped.
                position_int = [
                    (int(p[0] + 1), int(p[1]), int(p[2]), int(p[3]), int(p[4]))
                    for p in positions
                    if len(p) == 5
                ]
                if position_int:
                    doc_update["position_int"] = position_int
                    doc_update["page_num_int"] = [position_int[0][0]]
                    doc_update["top_int"] = [position_int[0][3]]

            if not doc_update:
                print(f"没有需要更新的字段 for chunk {chunk_id}")
                continue

            actions.append({
                "_op_type": "update",
                "_index": index_name,
                "_id": es_id,
                "doc": doc_update,
            })

        if not actions:
            print("没有需要执行的更新操作")
            return {"code": 0, "message": "No updates to perform"}

        # Single round trip for all collected updates.
        results = bulk(es, actions, refresh=True)
        print(f"批量更新完成,成功处理 {results[0]} 个操作")
        return {"code": 0, "message": f"Successfully updated {results[0]} documents"}

    except Exception as e:
        print(f"批量更新 Elasticsearch 时发生错误: {str(e)}")
        return {"code": 101, "message": f"Error in bulk update: {str(e)}"}
|
||||
|
||||
# 修改 process_pdf_txt_pairs 函数以使用批量更新
|
||||
def process_pdf_txt_pairs_bulk(pdf_dict, txt_dict, dataset):
    """Process PDF-TXT file pairs, pushing all position/img_id changes to
    Elasticsearch in a single bulk request for efficiency.

    :param pdf_dict: mapping of base name -> PDF file path
    :param txt_dict: mapping of base name -> TXT file path
    :param dataset: RAGFlow dataset the documents belong to
        (``dataset.id`` is used in the img_id and as the chunk owner)
    """
    # Collect every pending update, then flush once at the end.
    all_updates = []

    for name, pdf_path in pdf_dict.items():
        display_name = os.path.basename(pdf_path)
        document = upload_or_get_document(dataset, pdf_path, display_name)
        # Fix: guard BEFORE dereferencing — the original printed
        # document.name first, which raised AttributeError when
        # upload_or_get_document returned None.
        if not document:
            continue
        print(f"选择的文档: {document.name},ID: {document.id}")

        txt_path = txt_dict.get(name)
        if not txt_path:
            continue

        chunks_info = process_txt_chunks(dataset.id, document, txt_path)

        time.sleep(1)  # wait for chunk processing to finish
        if not chunks_info:
            continue

        chunks_info = get_positions_from_chunk(pdf_path, chunks_info)

        # Collect update payloads instead of updating immediately.
        for chunk_info in chunks_info:
            print(f"Chunk ID: {chunk_info['id']}, Text: {chunk_info['text'][:30]}..., Has Image: {chunk_info['has_image']}, Positions: {chunk_info['positions']}")

            update_info = {
                'doc_id': document.id,
                'chunk_id': chunk_info['id'],
                'positions': chunk_info['positions']
            }

            if chunk_info['has_image']:
                # Image chunks also get a new img_id; text-only chunks leave
                # new_img_id unset so the img_id field is not touched.
                update_info['new_img_id'] = f"{dataset.id}-{chunk_info['id']}"

            all_updates.append(update_info)

    # Flush all collected updates in one bulk call.
    if all_updates:
        result = bulk_update_elasticsearch(elastic_tenant_id, all_updates)
        print(f"批量更新结果: {result}")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, positions, new_img_id):
|
||||
"""
|
||||
在 Elasticsearch 中更新指定文档块的position and img_id。
|
||||
|
||||
@@ -88,29 +232,27 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
|
||||
# 构建更新请求 - 只更新存在的字段
|
||||
update_body = {"doc": {}}
|
||||
|
||||
# 只有当 new_img_id 存在时才更新 img_id
|
||||
#只有当 new_img_id 存在时才更新 img_id
|
||||
if new_img_id is not None:
|
||||
update_body["doc"]["img_id"] = new_img_id
|
||||
|
||||
# 只有当 position 存在时才更新 positions
|
||||
if position is not None:
|
||||
# 如果传入的是嵌套字典格式的 position
|
||||
if isinstance(position, list) and all(isinstance(p, dict) for p in position):
|
||||
# 将字典格式转换为整数列表格式
|
||||
formatted_positions = []
|
||||
for pos in position:
|
||||
pos_list = [
|
||||
pos.get('page', 0), # 页码
|
||||
int(round(float(pos.get('x0', 0)))), # x0
|
||||
int(round(float(pos.get('x1', 0)))), # x1
|
||||
int(round(float(pos.get('y0', 0)))), # y0
|
||||
int(round(float(pos.get('y1', 0)))) # y1
|
||||
]
|
||||
formatted_positions.append(pos_list)
|
||||
update_body["doc"]["positions"] = formatted_positions
|
||||
# 如果已经是整数列表格式
|
||||
elif isinstance(position, list):
|
||||
update_body["doc"]["positions"] = position
|
||||
if positions :
|
||||
|
||||
position_int = []
|
||||
|
||||
for pos in positions:
|
||||
if len(pos) != 5:
|
||||
continue # Skip invalid positions
|
||||
|
||||
pn, left, right, top, bottom = pos
|
||||
# 使用元组格式,与原始RAGFlow保持一致
|
||||
position_int.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))
|
||||
if position_int:
|
||||
update_body["doc"]["position_int"] = position_int
|
||||
update_body["doc"]["page_num_int"] = [position_int[0][0]]
|
||||
update_body["doc"]["top_int"] = [position_int[0][3]]
|
||||
|
||||
|
||||
# 如果没有需要更新的字段,直接返回成功
|
||||
if not update_body["doc"]:
|
||||
@@ -127,32 +269,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
|
||||
|
||||
print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")
|
||||
|
||||
# 验证更新
|
||||
verify_doc = es.get(index=index_name, id=doc_id_in_es)
|
||||
|
||||
# 检查 img_id 是否已更新(如果提供了 new_img_id)
|
||||
img_id_updated = True
|
||||
if new_img_id is not None:
|
||||
img_id_updated = verify_doc['_source'].get('img_id') == new_img_id
|
||||
if img_id_updated:
|
||||
print(f"成功更新 img_id 为: {new_img_id}")
|
||||
else:
|
||||
print(f"更新验证失败,当前 img_id: {verify_doc['_source'].get('img_id')}")
|
||||
|
||||
# 检查 position 是否已更新(如果提供了 position)
|
||||
position_updated = True
|
||||
if position is not None:
|
||||
position_updated = verify_doc['_source'].get('positions') == position
|
||||
if position_updated:
|
||||
print(f"成功更新 position 为: {position}")
|
||||
else:
|
||||
print(f"更新验证失败,当前 position: {verify_doc['_source'].get('positions')}")
|
||||
|
||||
# 统一返回结果
|
||||
if img_id_updated and position_updated:
|
||||
return {"code": 0, "message": ""}
|
||||
else:
|
||||
return {"code": 100, "message": "Failed to verify update"}
|
||||
|
||||
|
||||
except Exception as e:
|
||||
@@ -160,6 +277,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
|
||||
return {"code": 101, "message": f"Error updating img_id: {str(e)}"}
|
||||
|
||||
|
||||
|
||||
def get_minio_client():
|
||||
"""创建MinIO客户端"""
|
||||
return Minio(
|
||||
@@ -444,43 +562,57 @@ def get_positions_from_chunk(pdf_path, chunks_info):
|
||||
try:
|
||||
# 提取所有chunk的文本内容用于批量查找
|
||||
chunk_texts = [chunk_info['text'] for chunk_info in chunks_info]
|
||||
print(f"批量查找文本块: {chunk_texts}")
|
||||
|
||||
# 使用智能模糊查找获取位置信息
|
||||
batch_positions = smart_fuzzy_find_text_batch(pdf_path, chunk_texts, similarity_threshold=0.7)
|
||||
matches = find_text_in_pdf(
|
||||
pdf_path,
|
||||
chunk_texts,
|
||||
threshold=60
|
||||
)
|
||||
print(f"匹配结果: {matches}")
|
||||
|
||||
# 将位置信息与chunks_info关联,并确保数据类型正确
|
||||
for i, chunk_info in enumerate(chunks_info):
|
||||
positions = batch_positions[i] if i < len(batch_positions) else []
|
||||
|
||||
# 处理位置信息
|
||||
processed_positions = []
|
||||
for pos in positions:
|
||||
if isinstance(pos, dict):
|
||||
# 创建新的位置字典,确保所有坐标都是整数
|
||||
processed_pos = {
|
||||
'x0': int(round(float(pos['x0']))) if pos.get('x0') is not None else 0,
|
||||
'y0': int(round(float(pos['y0']))) if pos.get('y0') is not None else 0,
|
||||
'x1': int(round(float(pos['x1']))) if pos.get('x1') is not None else 0,
|
||||
'y1': int(round(float(pos['y1']))) if pos.get('y1') is not None else 0,
|
||||
'page': int(pos['page']) if pos.get('page') is not None else 0
|
||||
}
|
||||
processed_positions.append(processed_pos)
|
||||
|
||||
# 更新chunk_info中的positions
|
||||
chunk_info['positions'] = processed_positions
|
||||
# 确保 chunk_info 包含 'positions' 键
|
||||
if 'positions' not in chunk_info:
|
||||
chunk_info['positions'] = []
|
||||
|
||||
print(f"处理第 {i+1} 个chunk: {chunk_info['text']}")
|
||||
print(f"更新前位置: {chunk_info['positions']}")
|
||||
|
||||
if isinstance(matches, list) and i < len(matches):
|
||||
chunk_info['positions']=[mat['position_int'] for mat in matches[i] if 'position_int' in mat]
|
||||
|
||||
# # 如果matches是列表且索引有效
|
||||
# if isinstance(matches[i], dict) and 'position_int' in matches[i]:
|
||||
# chunk_info['positions'] = matches[i]['position_int']
|
||||
# print(f"更新后位置: {chunk_info['positions']}")
|
||||
# else:
|
||||
# chunk_info['positions'] = []
|
||||
# print(f"未找到有效位置信息,设置为空列表")
|
||||
else:
|
||||
chunk_info['positions'] = []
|
||||
print(f"匹配结果无效或索引越界,设置为空列表")
|
||||
|
||||
# 验证更新结果
|
||||
print("最终chunks_info状态:")
|
||||
for i, chunk_info in enumerate(chunks_info):
|
||||
print(f" Chunk {i+1}: ID={chunk_info['id']}, Positions={chunk_info['positions']}")
|
||||
|
||||
return chunks_info
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取PDF文本位置信息时出错: {str(e)}")
|
||||
# 出错时为每个chunk添加空的位置信息
|
||||
for chunk_info in chunks_info:
|
||||
chunk_info['positions'] = []
|
||||
# 确保 chunk_info 包含 'positions' 键
|
||||
if 'positions' not in chunk_info:
|
||||
chunk_info['positions'] = []
|
||||
return chunks_info
|
||||
|
||||
|
||||
|
||||
|
||||
def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
|
||||
"""处理PDF-TXT文件对"""
|
||||
for name, pdf_path in pdf_dict.items():
|
||||
@@ -493,6 +625,8 @@ def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
|
||||
txt_path = txt_dict.get(name)
|
||||
if txt_path:
|
||||
chunks_info=process_txt_chunks(dataset.id,document, txt_path)
|
||||
|
||||
time.sleep(1)
|
||||
if chunks_info:
|
||||
chunks_info=get_positions_from_chunk(pdf_path, chunks_info)
|
||||
for chunk_info in chunks_info:
|
||||
@@ -507,7 +641,6 @@ def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
"""主函数,处理PDF和TXT文件对
|
||||
|
||||
dataset.id = bucket_name
|
||||
@@ -528,8 +661,8 @@ def main():
|
||||
print("未选择数据集。")
|
||||
return
|
||||
|
||||
process_pdf_txt_pairs(pdf_dict, txt_dict, dataset)
|
||||
|
||||
# 使用批量处理函数替代原来的处理函数
|
||||
process_pdf_txt_pairs_bulk(pdf_dict, txt_dict, dataset)
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -161,9 +161,10 @@ def find_text_in_pdf(pdf_path,
|
||||
if matched_lines:
|
||||
_, merged_bbox = _merge_lines(matched_lines)
|
||||
results.append({
|
||||
"page": p + 1,
|
||||
"page": p,
|
||||
"bbox": merged_bbox,
|
||||
"matched_text": matched_text
|
||||
"matched_text": matched_text,
|
||||
"position_int":[p, merged_bbox[0], merged_bbox[2], merged_bbox[1], merged_bbox[3]]
|
||||
})
|
||||
if results:
|
||||
batch_results[idx].extend(results)
|
||||
@@ -206,6 +207,7 @@ def highlight_matches(pdf_path, matches, output_path="highlighted.pdf"):
|
||||
if __name__ == "__main__":
|
||||
pdf_path = 'e:\\2\\2024深化智慧城市发展推进城市全域数字化转型的指导意见.pdf'
|
||||
pdf_path = 'G:\\SynologyDrive\\大模型\\RAG\\20250805党建\\中国共产党领导干部廉洁从业若干准则.pdf'
|
||||
pdf_path ="F:\\Synology_nas\\SynologyDrive\\大模型\\RAG\\20250805党建\\中国共产党领导干部廉洁从业若干准则.pdf"
|
||||
query = [
|
||||
'''一、总体要求
|
||||
以习近平新时代中国特色社会主义思想为指导,完整、准确、全面贯彻新发展理念,统筹发展和安全,充分发挥数据的基础资源和创新引擎作用,整体性重塑智慧城市技术架构、系统性变革城市管理流程、一体化推动产城深度融合,全面提升城市全域数字化转型的整体性、系统性、协同性,不断满足人民日益增长的美好生活需要,为全面建设社会主义现代化国家提供强大动力。到2027年,全国城市全域数字化转型取得明显成效,形成一批横向打通、纵向贯通、各具特色的宜居、韧性、智慧城市,有力支撑数字中国建设。到2030年,全国城市全域数字化转型全面突破,人民群众的获得感、幸福感、安全感全面提升,涌现一批数字文明时代具有全球竞争力的中国式现代化城市。''',
|
||||
@@ -271,7 +273,7 @@ if __name__ == "__main__":
|
||||
# 1. 找跨行正则匹配
|
||||
matches = find_text_in_pdf(
|
||||
pdf_path,
|
||||
query, # 你的正则
|
||||
query,
|
||||
threshold=60
|
||||
|
||||
)
|
||||
@@ -284,7 +286,7 @@ if __name__ == "__main__":
|
||||
|
||||
#highlight_matches(pdf_path, query_matches, "example_highlighted.pdf")
|
||||
for m in query_matches:
|
||||
print(f"第 {m['page']} 页 匹配: {m['matched_text'][:50]}... 位置: {m['bbox']}")
|
||||
print(f"第 {m['page']} 页 匹配: {m['matched_text'][:50]}... 位置: {m['bbox']}, 位置_int: {m['position_int']}")
|
||||
print("------------------")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user