新增批量更新Elasticsearch文档的功能,优化process_pdf_txt_pairs函数以提高处理效率
This commit is contained in:
		| @@ -12,7 +12,7 @@ from minio.error import S3Error | ||||
| from find_text_in_pdf_enhanced import find_text_in_pdf | ||||
| import time | ||||
|  | ||||
| # from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch | ||||
|  | ||||
|  | ||||
|  | ||||
| from dotenv import load_dotenv  # 新增 | ||||
| @@ -49,6 +49,148 @@ MINIO_CONFIG = { | ||||
|     "secure": False | ||||
| } | ||||
|  | ||||
| from elasticsearch.helpers import bulk | ||||
|  | ||||
def bulk_update_elasticsearch(tenant_id, updates):
    """Apply a batch of chunk updates to Elasticsearch in one bulk call.

    Each entry of *updates* is a dict with keys ``doc_id``, ``chunk_id``,
    optional ``positions`` (iterable of 5-element sequences:
    page, left, right, top, bottom) and optional ``new_img_id``.

    :param tenant_id: tenant identifier; the target index is ``ragflow_<tenant_id>``.
    :param updates: list of update descriptors as described above.
    :return: dict with ``code`` (0 on success, 101 on error) and ``message``.
    """
    try:
        index_name = f"ragflow_{tenant_id}"
        actions = []  # accumulated bulk "update" operations

        for item in updates:
            target_doc = item['doc_id']
            target_chunk = item['chunk_id']
            pos_list = item.get('positions', [])
            img_id = item.get('new_img_id')

            # Locate the chunk: its ES _id must equal chunk_id and its
            # doc_id field must match the owning document.
            query = {
                "bool": {
                    "must": [
                        {"term": {"doc_id": target_doc}},
                        {"term": {"_id": target_chunk}},
                    ]
                }
            }
            result = es.search(index=index_name, body={"query": query})

            if result['hits']['total']['value'] == 0:
                print(f"在 Elasticsearch 中未找到文档: index={index_name}, doc_id={target_doc}, chunk_id={target_chunk}")
                continue

            es_id = result['hits']['hits'][0]['_id']

            # Partial document for the update: only fields actually supplied.
            partial = {}
            if img_id is not None:
                partial["img_id"] = img_id

            if pos_list:
                # Keep only well-formed 5-element positions; page numbers are
                # stored 1-based, as tuples, matching the original RAGFlow layout.
                converted = [
                    (int(pn + 1), int(left), int(right), int(top), int(bottom))
                    for pn, left, right, top, bottom in
                    (p for p in pos_list if len(p) == 5)
                ]
                if converted:
                    partial["position_int"] = converted
                    partial["page_num_int"] = [converted[0][0]]
                    partial["top_int"] = [converted[0][3]]

            # Nothing to change for this chunk — skip it.
            if not partial:
                print(f"没有需要更新的字段 for chunk {target_chunk}")
                continue

            actions.append({
                "_op_type": "update",
                "_index": index_name,
                "_id": es_id,
                "doc": partial,
            })

        if not actions:
            print("没有需要执行的更新操作")
            return {"code": 0, "message": "No updates to perform"}

        # Single round-trip for all collected operations.
        results = bulk(es, actions, refresh=True)
        print(f"批量更新完成,成功处理 {results[0]} 个操作")
        return {"code": 0, "message": f"Successfully updated {results[0]} documents"}

    except Exception as e:
        print(f"批量更新 Elasticsearch 时发生错误: {str(e)}")
        return {"code": 101, "message": f"Error in bulk update: {str(e)}"}
|  | ||||
| # 修改 process_pdf_txt_pairs 函数以使用批量更新 | ||||
| def process_pdf_txt_pairs_bulk(pdf_dict, txt_dict, dataset): | ||||
|     """处理PDF-TXT文件对,使用批量更新提高效率""" | ||||
|     # 收集所有需要更新的信息 | ||||
|     all_updates = [] | ||||
|      | ||||
|     for name, pdf_path in pdf_dict.items(): | ||||
|         display_name = os.path.basename(pdf_path) | ||||
|         document = upload_or_get_document(dataset, pdf_path, display_name) | ||||
|         print(f"选择的文档: {document.name},ID: {document.id}") | ||||
|         if not document: | ||||
|             continue | ||||
|              | ||||
|         txt_path = txt_dict.get(name) | ||||
|         if txt_path: | ||||
|             chunks_info = process_txt_chunks(dataset.id, document, txt_path) | ||||
|              | ||||
|             time.sleep(1)  # 等待chunk处理完成 | ||||
|             if chunks_info: | ||||
|                 chunks_info = get_positions_from_chunk(pdf_path, chunks_info) | ||||
|                  | ||||
|                 # 收集更新信息而不是立即更新 | ||||
|                 for chunk_info in chunks_info: | ||||
|                     print(f"Chunk ID: {chunk_info['id']}, Text: {chunk_info['text'][:30]}..., Has Image: {chunk_info['has_image']}, Positions: {chunk_info['positions']}") | ||||
|                      | ||||
|                     update_info = { | ||||
|                         'doc_id': document.id, | ||||
|                         'chunk_id': chunk_info['id'], | ||||
|                         'positions': chunk_info['positions'] | ||||
|                     } | ||||
|                      | ||||
|                     if chunk_info['has_image']: | ||||
|                         # 如果有图片,准备更新img_id | ||||
|                         update_info['new_img_id'] = f"{dataset.id}-{chunk_info['id']}" | ||||
|                     # 如果没有图片,new_img_id为None,不会更新img_id字段 | ||||
|                      | ||||
|                     all_updates.append(update_info) | ||||
|      | ||||
|     # 执行批量更新 | ||||
|     if all_updates: | ||||
|         result = bulk_update_elasticsearch(elastic_tenant_id, all_updates) | ||||
|         print(f"批量更新结果: {result}") | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, positions, new_img_id): | ||||
|     """ | ||||
|     在 Elasticsearch 中更新指定文档块的position and img_id。 | ||||
| @@ -112,11 +254,6 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position | ||||
|                 update_body["doc"]["top_int"] = [position_int[0][3]]           | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|              | ||||
|  | ||||
|  | ||||
|         # 如果没有需要更新的字段,直接返回成功 | ||||
|         if not update_body["doc"]: | ||||
|             print("没有需要更新的字段") | ||||
| @@ -132,32 +269,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position | ||||
|          | ||||
|         print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}") | ||||
|  | ||||
|     #     # 验证更新 | ||||
|     #     verify_doc = es.get(index=index_name, id=doc_id_in_es) | ||||
|  | ||||
|     #     # 检查 img_id 是否已更新(如果提供了 new_img_id) | ||||
|     #     img_id_updated = True | ||||
|     #     if new_img_id is not None: | ||||
|     #         img_id_updated = verify_doc['_source'].get('img_id') == new_img_id | ||||
|     #         if img_id_updated: | ||||
|     #             print(f"成功更新 img_id 为: {new_img_id}") | ||||
|     #         else: | ||||
|     #             print(f"更新验证失败,当前 img_id: {verify_doc['_source'].get('img_id')}") | ||||
|  | ||||
|     #     # 检查 position 是否已更新(如果提供了 position) | ||||
|     #     position_updated = True | ||||
|     #     if position is not None: | ||||
|     #         position_updated = verify_doc['_source'].get('positions') == position | ||||
|     #         if position_updated: | ||||
|     #             print(f"成功更新 position 为: {position}") | ||||
|     #         else: | ||||
|     #             print(f"更新验证失败,当前 position: {verify_doc['_source'].get('positions')}") | ||||
|  | ||||
|     #     # 统一返回结果 | ||||
|     #     if img_id_updated and position_updated: | ||||
|     #         return {"code": 0, "message": ""} | ||||
|     #     else: | ||||
|     #         return {"code": 100, "message": "Failed to verify update"} | ||||
|          | ||||
|  | ||||
|     except Exception as e: | ||||
| @@ -529,7 +641,6 @@ def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset): | ||||
|  | ||||
|  | ||||
| def main(): | ||||
|  | ||||
|     """主函数,处理PDF和TXT文件对 | ||||
|      | ||||
|     dataset.id = bucket_name | ||||
| @@ -550,8 +661,8 @@ def main(): | ||||
|         print("未选择数据集。") | ||||
|         return | ||||
|          | ||||
|     process_pdf_txt_pairs(pdf_dict, txt_dict, dataset) | ||||
|  | ||||
|     # 使用批量处理函数替代原来的处理函数 | ||||
|     process_pdf_txt_pairs_bulk(pdf_dict, txt_dict, dataset) | ||||
|  | ||||
|  | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user