优化 Elasticsearch 更新逻辑,支持批量位置更新,调整匹配结果处理,新增位置整数格式返回

This commit is contained in:
2025-08-08 10:38:24 +08:00
parent c1d66237e6
commit 1c23d272bb
2 changed files with 92 additions and 68 deletions

View File

@@ -9,6 +9,8 @@ import tempfile
from elasticsearch import Elasticsearch
from minio import Minio
from minio.error import S3Error
from find_text_in_pdf_enhanced import find_text_in_pdf
import time
# from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch
@@ -47,7 +49,7 @@ MINIO_CONFIG = {
"secure": False
}
def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position, new_img_id):
def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, positions, new_img_id):
"""
在 Elasticsearch 中更新指定文档块的position and img_id。
@@ -93,24 +95,27 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
update_body["doc"]["img_id"] = new_img_id
# 只有当 positions 非空时才更新 position_int、page_num_int 和 top_int
if position is not None:
# 如果传入的是嵌套字典格式的 position
if isinstance(position, list) and all(isinstance(p, dict) for p in position):
# 将字典格式转换为整数列表格式
formatted_positions = []
for pos in position:
pos_list = [
pos.get('page', 0), # 页码
int(round(float(pos.get('x0', 0)))), # x0
int(round(float(pos.get('x1', 0)))), # x1
int(round(float(pos.get('y0', 0)))), # y0
int(round(float(pos.get('y1', 0)))) # y1
]
formatted_positions.append(pos_list)
update_body["doc"]["positions"] = formatted_positions
# 如果已经是整数列表格式
elif isinstance(position, list):
update_body["doc"]["positions"] = position
if positions :
position_int = []
for pos in positions:
if len(pos) != 5:
continue # Skip invalid positions
pn, left, right, top, bottom = pos
# 使用元组格式与原始RAGFlow保持一致
position_int.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))
if position_int:
update_body["doc"]["position_int"] = position_int
update_body["doc"]["page_num_int"] = [position_int[0][0]]
update_body["doc"]["top_int"] = [position_int[0][3]]
# 如果没有需要更新的字段,直接返回成功
if not update_body["doc"]:
@@ -127,32 +132,32 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")
# 验证更新
verify_doc = es.get(index=index_name, id=doc_id_in_es)
# # 验证更新
# verify_doc = es.get(index=index_name, id=doc_id_in_es)
# 检查 img_id 是否已更新(如果提供了 new_img_id)
img_id_updated = True
if new_img_id is not None:
img_id_updated = verify_doc['_source'].get('img_id') == new_img_id
if img_id_updated:
print(f"成功更新 img_id 为: {new_img_id}")
else:
print(f"更新验证失败,当前 img_id: {verify_doc['_source'].get('img_id')}")
# # 检查 img_id 是否已更新(如果提供了 new_img_id)
# img_id_updated = True
# if new_img_id is not None:
# img_id_updated = verify_doc['_source'].get('img_id') == new_img_id
# if img_id_updated:
# print(f"成功更新 img_id 为: {new_img_id}")
# else:
# print(f"更新验证失败,当前 img_id: {verify_doc['_source'].get('img_id')}")
# 检查 position 是否已更新(如果提供了 position)
position_updated = True
if position is not None:
position_updated = verify_doc['_source'].get('positions') == position
if position_updated:
print(f"成功更新 position 为: {position}")
else:
print(f"更新验证失败,当前 position: {verify_doc['_source'].get('positions')}")
# # 检查 position 是否已更新(如果提供了 position)
# position_updated = True
# if position is not None:
# position_updated = verify_doc['_source'].get('positions') == position
# if position_updated:
# print(f"成功更新 position 为: {position}")
# else:
# print(f"更新验证失败,当前 position: {verify_doc['_source'].get('positions')}")
# 统一返回结果
if img_id_updated and position_updated:
return {"code": 0, "message": ""}
else:
return {"code": 100, "message": "Failed to verify update"}
# # 统一返回结果
# if img_id_updated and position_updated:
# return {"code": 0, "message": ""}
# else:
# return {"code": 100, "message": "Failed to verify update"}
except Exception as e:
@@ -160,6 +165,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
return {"code": 101, "message": f"Error updating img_id: {str(e)}"}
def get_minio_client():
"""创建MinIO客户端"""
return Minio(
@@ -444,30 +450,43 @@ def get_positions_from_chunk(pdf_path, chunks_info):
try:
# 提取所有chunk的文本内容用于批量查找
chunk_texts = [chunk_info['text'] for chunk_info in chunks_info]
print(f"批量查找文本块: {chunk_texts}")
# 使用智能模糊查找获取位置信息
batch_positions = smart_fuzzy_find_text_batch(pdf_path, chunk_texts, similarity_threshold=0.7)
matches = find_text_in_pdf(
pdf_path,
chunk_texts,
threshold=60
)
print(f"匹配结果: {matches}")
# 将位置信息与chunks_info关联并确保数据类型正确
for i, chunk_info in enumerate(chunks_info):
positions = batch_positions[i] if i < len(batch_positions) else []
# 确保 chunk_info 包含 'positions' 键
if 'positions' not in chunk_info:
chunk_info['positions'] = []
# 处理位置信息
processed_positions = []
for pos in positions:
if isinstance(pos, dict):
# 创建新的位置字典,确保所有坐标都是整数
processed_pos = {
'x0': int(round(float(pos['x0']))) if pos.get('x0') is not None else 0,
'y0': int(round(float(pos['y0']))) if pos.get('y0') is not None else 0,
'x1': int(round(float(pos['x1']))) if pos.get('x1') is not None else 0,
'y1': int(round(float(pos['y1']))) if pos.get('y1') is not None else 0,
'page': int(pos['page']) if pos.get('page') is not None else 0
}
processed_positions.append(processed_pos)
print(f"处理第 {i+1} 个chunk: {chunk_info['text']}")
print(f"更新前位置: {chunk_info['positions']}")
# 更新chunk_info中的positions
chunk_info['positions'] = processed_positions
if isinstance(matches, list) and i < len(matches):
chunk_info['positions']=[mat['position_int'] for mat in matches[i] if 'position_int' in mat]
# # 如果matches是列表且索引有效
# if isinstance(matches[i], dict) and 'position_int' in matches[i]:
# chunk_info['positions'] = matches[i]['position_int']
# print(f"更新后位置: {chunk_info['positions']}")
# else:
# chunk_info['positions'] = []
# print(f"未找到有效位置信息,设置为空列表")
else:
chunk_info['positions'] = []
print(f"匹配结果无效或索引越界,设置为空列表")
# 验证更新结果
print("最终chunks_info状态:")
for i, chunk_info in enumerate(chunks_info):
print(f" Chunk {i+1}: ID={chunk_info['id']}, Positions={chunk_info['positions']}")
return chunks_info
@@ -475,12 +494,13 @@ def get_positions_from_chunk(pdf_path, chunks_info):
print(f"获取PDF文本位置信息时出错: {str(e)}")
# 出错时为每个chunk添加空的位置信息
for chunk_info in chunks_info:
# 确保 chunk_info 包含 'positions' 键
if 'positions' not in chunk_info:
chunk_info['positions'] = []
return chunks_info
def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
"""处理PDF-TXT文件对"""
for name, pdf_path in pdf_dict.items():
@@ -493,6 +513,8 @@ def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
txt_path = txt_dict.get(name)
if txt_path:
chunks_info=process_txt_chunks(dataset.id,document, txt_path)
time.sleep(1)
if chunks_info:
chunks_info=get_positions_from_chunk(pdf_path, chunks_info)
for chunk_info in chunks_info:

View File

@@ -161,9 +161,10 @@ def find_text_in_pdf(pdf_path,
if matched_lines:
_, merged_bbox = _merge_lines(matched_lines)
results.append({
"page": p + 1,
"page": p,
"bbox": merged_bbox,
"matched_text": matched_text
"matched_text": matched_text,
"position_int":[p, merged_bbox[0], merged_bbox[2], merged_bbox[1], merged_bbox[3]]
})
if results:
batch_results[idx].extend(results)
@@ -206,6 +207,7 @@ def highlight_matches(pdf_path, matches, output_path="highlighted.pdf"):
if __name__ == "__main__":
pdf_path = 'e:\\2\\2024深化智慧城市发展推进城市全域数字化转型的指导意见.pdf'
pdf_path = 'G:\\SynologyDrive\\大模型\\RAG\\20250805党建\\中国共产党领导干部廉洁从业若干准则.pdf'
pdf_path ="F:\\Synology_nas\\SynologyDrive\\大模型\\RAG\\20250805党建\\中国共产党领导干部廉洁从业若干准则.pdf"
query = [
'''一、总体要求
以习近平新时代中国特色社会主义思想为指导完整、准确、全面贯彻新发展理念统筹发展和安全充分发挥数据的基础资源和创新引擎作用整体性重塑智慧城市技术架构、系统性变革城市管理流程、一体化推动产城深度融合全面提升城市全域数字化转型的整体性、系统性、协同性不断满足人民日益增长的美好生活需要为全面建设社会主义现代化国家提供强大动力。到2027年全国城市全域数字化转型取得明显成效形成一批横向打通、纵向贯通、各具特色的宜居、韧性、智慧城市有力支撑数字中国建设。到2030年全国城市全域数字化转型全面突破人民群众的获得感、幸福感、安全感全面提升涌现一批数字文明时代具有全球竞争力的中国式现代化城市。''',
@@ -271,7 +273,7 @@ if __name__ == "__main__":
# 1. 找跨行正则匹配
matches = find_text_in_pdf(
pdf_path,
query, # 你的正则
query,
threshold=60
)
@@ -284,7 +286,7 @@ if __name__ == "__main__":
#highlight_matches(pdf_path, query_matches, "example_highlighted.pdf")
for m in query_matches:
print(f"{m['page']} 页 匹配: {m['matched_text'][:50]}... 位置: {m['bbox']}")
print(f"{m['page']} 页 匹配: {m['matched_text'][:50]}... 位置: {m['bbox']}, 位置_int: {m['position_int']}")
print("------------------")