新增批量更新Elasticsearch文档的功能,优化process_pdf_txt_pairs函数以提高处理效率
This commit is contained in:
@@ -12,7 +12,7 @@ from minio.error import S3Error
|
|||||||
from find_text_in_pdf_enhanced import find_text_in_pdf
|
from find_text_in_pdf_enhanced import find_text_in_pdf
|
||||||
import time
|
import time
|
||||||
|
|
||||||
# from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch
|
|
||||||
|
|
||||||
|
|
||||||
from dotenv import load_dotenv # 新增
|
from dotenv import load_dotenv # 新增
|
||||||
@@ -49,6 +49,148 @@ MINIO_CONFIG = {
|
|||||||
"secure": False
|
"secure": False
|
||||||
}
|
}
|
||||||
|
|
||||||
|
from elasticsearch.helpers import bulk
|
||||||
|
|
||||||
|
def _build_chunk_doc_update(new_img_id, positions):
    """Build the partial-document body for a single chunk update.

    :param new_img_id: new ``img_id`` value, or ``None`` to leave the field untouched
    :param positions: iterable of 5-tuples ``(pn, left, right, top, bottom)``;
        malformed entries (wrong length) are skipped
    :return: dict of fields to update; empty when there is nothing to change
    """
    doc_update = {}

    # Only update img_id when a replacement was actually supplied.
    if new_img_id is not None:
        doc_update["img_id"] = new_img_id

    # Only update position fields when positions were supplied.
    if positions:
        position_int = []
        for pos in positions:
            if len(pos) != 5:
                continue  # skip invalid position entries
            pn, left, right, top, bottom = pos
            # Tuple layout kept consistent with the original RAGFlow format;
            # page numbers are stored 1-based, hence pn + 1.
            position_int.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))

        if position_int:
            doc_update["position_int"] = position_int
            # page_num_int / top_int are derived from the first position only,
            # matching the single-update code path elsewhere in this file.
            doc_update["page_num_int"] = [position_int[0][0]]
            doc_update["top_int"] = [position_int[0][3]]

    return doc_update


def bulk_update_elasticsearch(tenant_id, updates):
    """Bulk-update chunk documents in Elasticsearch.

    :param tenant_id: tenant id; the target index is ``ragflow_{tenant_id}``
    :param updates: list of dicts, each containing ``doc_id`` and ``chunk_id``
        and optionally ``positions`` and ``new_img_id``
    :return: result dict with ``code`` (0 on success, 101 on error) and ``message``
    """
    try:
        index_name = f"ragflow_{tenant_id}"

        # Collect all update actions first, then send them in one bulk request.
        actions = []

        for update_info in updates:
            doc_id = update_info['doc_id']
            chunk_id = update_info['chunk_id']
            positions = update_info.get('positions', [])
            new_img_id = update_info.get('new_img_id')

            # Locate the target chunk document. Only the first hit is ever
            # used, so size=1 avoids fetching the default 10 hits per lookup.
            query = {
                "bool": {
                    "must": [
                        {"term": {"doc_id": doc_id}},
                        {"term": {"_id": chunk_id}}
                    ]
                }
            }
            result = es.search(index=index_name, body={"query": query, "size": 1})

            # Skip (with a log line) when the chunk cannot be found.
            if result['hits']['total']['value'] == 0:
                print(f"在 Elasticsearch 中未找到文档: index={index_name}, doc_id={doc_id}, chunk_id={chunk_id}")
                continue

            # The ES-internal id of the matched document.
            doc_id_in_es = result['hits']['hits'][0]['_id']

            # Build the partial update; skip when no field needs changing.
            doc_update = _build_chunk_doc_update(new_img_id, positions)
            if not doc_update:
                print(f"没有需要更新的字段 for chunk {chunk_id}")
                continue

            actions.append({
                "_op_type": "update",
                "_index": index_name,
                "_id": doc_id_in_es,
                "doc": doc_update
            })

        # Execute the accumulated actions in a single bulk call.
        if actions:
            results = bulk(es, actions, refresh=True)
            print(f"批量更新完成,成功处理 {results[0]} 个操作")
            return {"code": 0, "message": f"Successfully updated {results[0]} documents"}
        else:
            print("没有需要执行的更新操作")
            return {"code": 0, "message": "No updates to perform"}

    except Exception as e:
        # Broad catch is deliberate here: callers consume the error via the
        # returned code/message instead of an exception.
        print(f"批量更新 Elasticsearch 时发生错误: {str(e)}")
        return {"code": 101, "message": f"Error in bulk update: {str(e)}"}
|
||||||
|
|
||||||
|
# 修改 process_pdf_txt_pairs 函数以使用批量更新
|
||||||
|
def process_pdf_txt_pairs_bulk(pdf_dict, txt_dict, dataset):
    """Process PDF/TXT file pairs, collecting all Elasticsearch updates
    and applying them in a single bulk call for efficiency.

    :param pdf_dict: mapping of base name -> PDF file path
    :param txt_dict: mapping of base name -> TXT file path
    :param dataset: dataset object (provides ``id``) the documents belong to
    """
    # Collect every pending update; they are flushed in one bulk call below.
    all_updates = []

    for name, pdf_path in pdf_dict.items():
        display_name = os.path.basename(pdf_path)
        document = upload_or_get_document(dataset, pdf_path, display_name)
        # Bug fix: guard BEFORE dereferencing. The original printed
        # document.name first, which raised AttributeError whenever
        # upload_or_get_document returned None.
        if not document:
            continue
        print(f"选择的文档: {document.name},ID: {document.id}")

        txt_path = txt_dict.get(name)
        if not txt_path:
            continue

        chunks_info = process_txt_chunks(dataset.id, document, txt_path)

        time.sleep(1)  # wait for chunk processing to finish
        if not chunks_info:
            continue

        chunks_info = get_positions_from_chunk(pdf_path, chunks_info)

        # Collect update info instead of updating immediately.
        for chunk_info in chunks_info:
            print(f"Chunk ID: {chunk_info['id']}, Text: {chunk_info['text'][:30]}..., Has Image: {chunk_info['has_image']}, Positions: {chunk_info['positions']}")

            update_info = {
                'doc_id': document.id,
                'chunk_id': chunk_info['id'],
                'positions': chunk_info['positions']
            }

            # Only chunks with an image get a new_img_id; otherwise the
            # img_id field is left untouched by the bulk updater.
            if chunk_info['has_image']:
                update_info['new_img_id'] = f"{dataset.id}-{chunk_info['id']}"

            all_updates.append(update_info)

    # One bulk round-trip for everything collected above.
    if all_updates:
        result = bulk_update_elasticsearch(elastic_tenant_id, all_updates)
        print(f"批量更新结果: {result}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, positions, new_img_id):
|
def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, positions, new_img_id):
|
||||||
"""
|
"""
|
||||||
在 Elasticsearch 中更新指定文档块的position and img_id。
|
在 Elasticsearch 中更新指定文档块的position and img_id。
|
||||||
@@ -109,12 +251,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
|
|||||||
if position_int:
|
if position_int:
|
||||||
update_body["doc"]["position_int"] = position_int
|
update_body["doc"]["position_int"] = position_int
|
||||||
update_body["doc"]["page_num_int"] = [position_int[0][0]]
|
update_body["doc"]["page_num_int"] = [position_int[0][0]]
|
||||||
update_body["doc"]["top_int"] = [position_int[0][3]]
|
update_body["doc"]["top_int"] = [position_int[0][3]]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# 如果没有需要更新的字段,直接返回成功
|
# 如果没有需要更新的字段,直接返回成功
|
||||||
@@ -132,32 +269,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
|
|||||||
|
|
||||||
print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")
|
print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")
|
||||||
|
|
||||||
# # 验证更新
|
|
||||||
# verify_doc = es.get(index=index_name, id=doc_id_in_es)
|
|
||||||
|
|
||||||
# # 检查 img_id 是否已更新(如果提供了 new_img_id)
|
|
||||||
# img_id_updated = True
|
|
||||||
# if new_img_id is not None:
|
|
||||||
# img_id_updated = verify_doc['_source'].get('img_id') == new_img_id
|
|
||||||
# if img_id_updated:
|
|
||||||
# print(f"成功更新 img_id 为: {new_img_id}")
|
|
||||||
# else:
|
|
||||||
# print(f"更新验证失败,当前 img_id: {verify_doc['_source'].get('img_id')}")
|
|
||||||
|
|
||||||
# # 检查 position 是否已更新(如果提供了 position)
|
|
||||||
# position_updated = True
|
|
||||||
# if position is not None:
|
|
||||||
# position_updated = verify_doc['_source'].get('positions') == position
|
|
||||||
# if position_updated:
|
|
||||||
# print(f"成功更新 position 为: {position}")
|
|
||||||
# else:
|
|
||||||
# print(f"更新验证失败,当前 position: {verify_doc['_source'].get('positions')}")
|
|
||||||
|
|
||||||
# # 统一返回结果
|
|
||||||
# if img_id_updated and position_updated:
|
|
||||||
# return {"code": 0, "message": ""}
|
|
||||||
# else:
|
|
||||||
# return {"code": 100, "message": "Failed to verify update"}
|
|
||||||
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -529,7 +641,6 @@ def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
"""主函数,处理PDF和TXT文件对
|
"""主函数,处理PDF和TXT文件对
|
||||||
|
|
||||||
dataset.id = bucket_name
|
dataset.id = bucket_name
|
||||||
@@ -550,8 +661,8 @@ def main():
|
|||||||
print("未选择数据集。")
|
print("未选择数据集。")
|
||||||
return
|
return
|
||||||
|
|
||||||
process_pdf_txt_pairs(pdf_dict, txt_dict, dataset)
|
# 使用批量处理函数替代原来的处理函数
|
||||||
|
process_pdf_txt_pairs_bulk(pdf_dict, txt_dict, dataset)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user