Compare commits
3 Commits
e5ac523bd9
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 51f24ced05 | |||
| 1c23d272bb | |||
| c1d66237e6 |
256
chunk_pos.py
Normal file
256
chunk_pos.py
Normal file
@@ -0,0 +1,256 @@
|
|||||||
|
from elasticsearch import Elasticsearch
|
||||||
|
#from src.add_chunk_cli_pdf_img import update_positon_img_id_in_elasticsearch
|
||||||
|
# 初始化 Elasticsearch 用户名elastic,密码infini_rag_flow
|
||||||
|
|
||||||
|
from dotenv import load_dotenv # 新增
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
# 加载 .env 文件中的环境变量
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Initialise the Elasticsearch client.
# Connection settings come from the .env file loaded above via load_dotenv():
# ELASTIC_HOST / ELASTIC_PORT for the endpoint, ELASTIC_USERNAME /
# ELASTIC_PASSWORD for HTTP basic auth. Plain HTTP (no TLS) is used.
# NOTE(review): int(os.getenv("ELASTIC_PORT")) raises TypeError if the
# variable is missing — presumably the .env file always defines it; verify.
es = Elasticsearch(
    [{
        'host': os.getenv("ELASTIC_HOST"),
        'port': int(os.getenv("ELASTIC_PORT")),
        'scheme': 'http'
    }],
    basic_auth=(
        os.getenv("ELASTIC_USERNAME"),
        os.getenv("ELASTIC_PASSWORD")
    )
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_index_mapping(tenant_id):
    """Fetch the field mapping of the tenant's RAGFlow index.

    :param tenant_id: tenant ID; the index queried is ``ragflow_<tenant_id>``
    :return: dict with ``code``/``message``/``data`` keys — ``data`` holds the
        mapping as a plain dict on success, an empty dict on failure
    """
    index_name = f"ragflow_{tenant_id}"
    try:
        response = es.indices.get_mapping(index=index_name)
    except Exception as exc:  # report any client/transport error as code 500
        return {"code": 500, "message": str(exc), "data": {}}
    # The client returns an ObjectApiResponse; hand callers a plain dict.
    return {"code": 0, "message": "", "data": dict(response)}
|
||||||
|
|
||||||
|
def update_positon_in_elasticsearch(tenant_id, doc_id, chunk_id, positions):
    """Update the precise position fields of one chunk document in Elasticsearch.

    Converts ``positions`` into RAGFlow's ``position_int`` tuple format and
    writes it together with the derived ``page_num_int`` / ``top_int`` sort
    fields onto the chunk identified by ``doc_id`` + ``chunk_id``.

    :param tenant_id: tenant ID; the index used is ``ragflow_<tenant_id>``
    :param doc_id: ID of the document the chunk belongs to
    :param chunk_id: chunk (ES ``_id``) to update
    :param positions: iterable of 5-element ``(page, left, right, top, bottom)``
        sequences; pages are 0-based on input and stored 1-based
    :return: ``None`` when there is nothing valid to update, otherwise a
        dict with ``code``/``message`` describing the outcome
    """
    if not positions:
        return

    # Use tuple format to stay consistent with upstream RAGFlow; pages are
    # stored 1-based (hence pn + 1).
    position_int = []
    for pos in positions:
        if len(pos) != 5:
            continue  # skip malformed entries
        pn, left, right, top, bottom = pos
        position_int.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))

    # Bug fix: previously position_int[0] was indexed without this guard, so a
    # list containing only malformed positions raised IndexError.
    if not position_int:
        return

    index_name = f"ragflow_{tenant_id}"

    try:
        # Locate the chunk by its owning doc_id plus its ES _id.
        query = {
            "bool": {
                "must": [
                    {"term": {"doc_id": doc_id}},
                    {"term": {"_id": chunk_id}}
                ]
            }
        }
        result = es.search(index=index_name, body={"query": query})

        if result['hits']['total']['value'] == 0:
            print(f"在 Elasticsearch 中未找到文档: index={index_name}, doc_id={doc_id}, chunk_id={chunk_id}")
            return {"code": 102, "message": f"Can't find this chunk {chunk_id}"}

        doc_id_in_es = result['hits']['hits'][0]['_id']

        # Only the precise-position fields are written; the sort fields derive
        # from the first position (index 0 = page, index 3 = top).
        update_body = {
            "doc": {
                "position_int": position_int,
                "page_num_int": [position_int[0][0]],
                "top_int": [position_int[0][3]],
            }
        }

        update_result = es.update(
            index=index_name,
            id=doc_id_in_es,
            body=update_body,
            refresh=True  # make the update immediately visible
        )

        print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")
        return {"code": 0, "message": ""}
    except Exception as e:
        # Mirror the error-handling style of the sibling update function so
        # callers get a result dict instead of an unhandled exception.
        print(f"更新 Elasticsearch 时发生错误: {str(e)}")
        return {"code": 101, "message": f"Error updating position: {str(e)}"}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position, new_img_id):
    """Update the ``img_id`` and/or ``positions`` fields of one chunk document.

    Fields are only written when the corresponding argument is not ``None``;
    after the update the document is read back and each written field is
    verified.

    :param tenant_id: tenant ID; the index used is ``ragflow_<tenant_id>``
    :param doc_id: ID of the document the chunk belongs to
    :param chunk_id: chunk (ES ``_id``) to update
    :param position: position payload to store, or ``None`` to leave untouched
    :param new_img_id: new img_id value, or ``None`` to leave untouched
    :return: dict with ``code``/``message`` describing the outcome
    """
    try:
        index_name = f"ragflow_{tenant_id}"

        # Locate the chunk by its owning doc_id plus its ES _id.
        query = {
            "bool": {
                "must": [
                    {"term": {"doc_id": doc_id}},
                    {"term": {"_id": chunk_id}}
                ]
            }
        }
        result = es.search(index=index_name, body={"query": query})

        if result['hits']['total']['value'] == 0:
            print(f"在 Elasticsearch 中未找到文档: index={index_name}, doc_id={doc_id}, chunk_id={chunk_id}")
            return {"code": 102, "message": f"Can't find this chunk {chunk_id}"}

        hit = result['hits']['hits'][0]
        doc_id_in_es = hit['_id']

        # Build a partial-update body containing only the provided fields.
        update_body = {"doc": {}}
        if new_img_id is not None:
            update_body["doc"]["img_id"] = new_img_id
        if position is not None:
            update_body["doc"]["positions"] = position

        # Nothing to write — report success without touching Elasticsearch.
        if not update_body["doc"]:
            print("没有需要更新的字段")
            return {"code": 0, "message": "No fields to update"}

        update_result = es.update(
            index=index_name,
            id=doc_id_in_es,
            body=update_body,
            refresh=True  # make the update immediately visible
        )
        print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")

        # Read the document back and verify every field we attempted to change.
        verify_doc = es.get(index=index_name, id=doc_id_in_es)

        img_id_updated = True
        if new_img_id is not None:
            img_id_updated = verify_doc['_source'].get('img_id') == new_img_id
            if img_id_updated:
                print(f"成功更新 img_id 为: {new_img_id}")
            else:
                print(f"更新验证失败,当前 img_id: {verify_doc['_source'].get('img_id')}")

        position_updated = True
        if position is not None:
            position_updated = verify_doc['_source'].get('positions') == position
            if position_updated:
                print(f"成功更新 position 为: {position}")
            else:
                print(f"更新验证失败,当前 position: {verify_doc['_source'].get('positions')}")

        if img_id_updated and position_updated:
            return {"code": 0, "message": ""}
        return {"code": 100, "message": "Failed to verify update"}

    except Exception as e:
        print(f"更新 Elasticsearch 时发生错误: {str(e)}")
        return {"code": 101, "message": f"Error updating img_id: {str(e)}"}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Example invocation: push hand-written positions onto one known chunk.
if __name__ == "__main__":
    # Smoke-test the connection first so auth/endpoint problems surface early.
    try:
        print(es.info())
    except Exception as e:
        print("连接失败:", e)

    # Hard-coded identifiers for the test environment (office machine).
    tenant_id = "d669205e57a211f0b9e7324e7f243034"
    new_img_id = "10345832587311f0919f3a2728512a4b-bd04866cd05337281"
    doc_id = "ea8d75966df811f0925ac6e8db75f472"
    chunk_id = "4a4927560a7e6d80"

    # Uncomment to inspect the index mapping (e.g. the "positions" field):
    # mapping_result = get_index_mapping(tenant_id)
    # print("Positions field mapping:", mapping_result["data"][f"ragflow_{tenant_id}"]["mappings"]["properties"]["positions"])

    # One position: (page, left, right, top, bottom) — left < right, top < bottom.
    pos = [[4, 0, 100, 200, 510]]
    update_positon_in_elasticsearch(tenant_id, doc_id, chunk_id, pos)

    # update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, pos, "")
|
||||||
@@ -9,8 +9,10 @@ import tempfile
|
|||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
from minio import Minio
|
from minio import Minio
|
||||||
from minio.error import S3Error
|
from minio.error import S3Error
|
||||||
|
from find_text_in_pdf_enhanced import find_text_in_pdf
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
# from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch
|
|
||||||
|
|
||||||
|
|
||||||
from dotenv import load_dotenv # 新增
|
from dotenv import load_dotenv # 新增
|
||||||
@@ -47,7 +49,149 @@ MINIO_CONFIG = {
|
|||||||
"secure": False
|
"secure": False
|
||||||
}
|
}
|
||||||
|
|
||||||
def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position, new_img_id):
|
from elasticsearch.helpers import bulk
|
||||||
|
|
||||||
|
def bulk_update_elasticsearch(tenant_id, updates):
    """Apply many chunk updates to Elasticsearch in a single bulk request.

    :param tenant_id: tenant ID; the index used is ``ragflow_<tenant_id>``
    :param updates: list of dicts, each with ``doc_id``, ``chunk_id`` and
        optional ``positions`` / ``new_img_id`` entries
    :return: dict with ``code``/``message`` describing the outcome
    """
    try:
        index_name = f"ragflow_{tenant_id}"
        actions = []

        for update_info in updates:
            doc_id = update_info['doc_id']
            chunk_id = update_info['chunk_id']
            positions = update_info.get('positions', [])
            new_img_id = update_info.get('new_img_id')

            # Confirm the chunk exists before queueing an update for it.
            query = {
                "bool": {
                    "must": [
                        {"term": {"doc_id": doc_id}},
                        {"term": {"_id": chunk_id}}
                    ]
                }
            }
            result = es.search(index=index_name, body={"query": query})
            if result['hits']['total']['value'] == 0:
                print(f"在 Elasticsearch 中未找到文档: index={index_name}, doc_id={doc_id}, chunk_id={chunk_id}")
                continue

            doc_id_in_es = result['hits']['hits'][0]['_id']

            # Build a partial update containing only the provided fields.
            doc_update = {}
            if new_img_id is not None:
                doc_update["img_id"] = new_img_id

            if positions:
                # Tuple format, 1-based pages — consistent with upstream RAGFlow.
                position_int = []
                for pos in positions:
                    if len(pos) != 5:
                        continue  # skip malformed entries
                    pn, left, right, top, bottom = pos
                    position_int.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))
                if position_int:
                    doc_update["position_int"] = position_int
                    doc_update["page_num_int"] = [position_int[0][0]]
                    doc_update["top_int"] = [position_int[0][3]]

            if not doc_update:
                print(f"没有需要更新的字段 for chunk {chunk_id}")
                continue

            actions.append({
                "_op_type": "update",
                "_index": index_name,
                "_id": doc_id_in_es,
                "doc": doc_update
            })

        if actions:
            results = bulk(es, actions, refresh=True)
            print(f"批量更新完成,成功处理 {results[0]} 个操作")
            return {"code": 0, "message": f"Successfully updated {results[0]} documents"}

        print("没有需要执行的更新操作")
        return {"code": 0, "message": "No updates to perform"}

    except Exception as e:
        print(f"批量更新 Elasticsearch 时发生错误: {str(e)}")
        return {"code": 101, "message": f"Error in bulk update: {str(e)}"}
|
||||||
|
|
||||||
|
# 修改 process_pdf_txt_pairs 函数以使用批量更新
|
||||||
|
def process_pdf_txt_pairs_bulk(pdf_dict, txt_dict, dataset):
    """Process PDF/TXT file pairs, collecting all chunk updates and applying
    them with one bulk Elasticsearch request for efficiency.

    :param pdf_dict: mapping of base name -> PDF path
    :param txt_dict: mapping of base name -> TXT path (chunks source)
    :param dataset: dataset object providing ``id`` (used for index/img_id)
    """
    # Collect every update first; Elasticsearch is touched once at the end.
    all_updates = []

    for name, pdf_path in pdf_dict.items():
        display_name = os.path.basename(pdf_path)
        document = upload_or_get_document(dataset, pdf_path, display_name)
        # Bug fix: the None-check must run BEFORE dereferencing document —
        # previously document.name was printed first, crashing on None.
        if not document:
            continue
        print(f"选择的文档: {document.name},ID: {document.id}")

        txt_path = txt_dict.get(name)
        if txt_path:
            chunks_info = process_txt_chunks(dataset.id, document, txt_path)

            time.sleep(1)  # wait for chunk processing to settle
            if chunks_info:
                chunks_info = get_positions_from_chunk(pdf_path, chunks_info)

                # Queue updates instead of writing them one by one.
                for chunk_info in chunks_info:
                    print(f"Chunk ID: {chunk_info['id']}, Text: {chunk_info['text'][:30]}..., Has Image: {chunk_info['has_image']}, Positions: {chunk_info['positions']}")

                    update_info = {
                        'doc_id': document.id,
                        'chunk_id': chunk_info['id'],
                        'positions': chunk_info['positions']
                    }

                    if chunk_info['has_image']:
                        # Image chunks also get their img_id refreshed; chunks
                        # without an image leave new_img_id unset so the field
                        # is not touched.
                        update_info['new_img_id'] = f"{dataset.id}-{chunk_info['id']}"

                    all_updates.append(update_info)

    # Single bulk write for everything collected above.
    if all_updates:
        # NOTE(review): elastic_tenant_id is a module-level global not defined
        # in this block — confirm it is set before this function runs.
        result = bulk_update_elasticsearch(elastic_tenant_id, all_updates)
        print(f"批量更新结果: {result}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, positions, new_img_id):
|
||||||
"""
|
"""
|
||||||
在 Elasticsearch 中更新指定文档块的position and img_id。
|
在 Elasticsearch 中更新指定文档块的position and img_id。
|
||||||
|
|
||||||
@@ -93,24 +237,22 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
|
|||||||
update_body["doc"]["img_id"] = new_img_id
|
update_body["doc"]["img_id"] = new_img_id
|
||||||
|
|
||||||
# 只有当 position 存在时才更新 positions
|
# 只有当 position 存在时才更新 positions
|
||||||
if position is not None:
|
if positions :
|
||||||
# 如果传入的是嵌套字典格式的 position
|
|
||||||
if isinstance(position, list) and all(isinstance(p, dict) for p in position):
|
position_int = []
|
||||||
# 将字典格式转换为整数列表格式
|
|
||||||
formatted_positions = []
|
for pos in positions:
|
||||||
for pos in position:
|
if len(pos) != 5:
|
||||||
pos_list = [
|
continue # Skip invalid positions
|
||||||
pos.get('page', 0), # 页码
|
|
||||||
int(round(float(pos.get('x0', 0)))), # x0
|
pn, left, right, top, bottom = pos
|
||||||
int(round(float(pos.get('x1', 0)))), # x1
|
# 使用元组格式,与原始RAGFlow保持一致
|
||||||
int(round(float(pos.get('y0', 0)))), # y0
|
position_int.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))
|
||||||
int(round(float(pos.get('y1', 0)))) # y1
|
if position_int:
|
||||||
]
|
update_body["doc"]["position_int"] = position_int
|
||||||
formatted_positions.append(pos_list)
|
update_body["doc"]["page_num_int"] = [position_int[0][0]]
|
||||||
update_body["doc"]["positions"] = formatted_positions
|
update_body["doc"]["top_int"] = [position_int[0][3]]
|
||||||
# 如果已经是整数列表格式
|
|
||||||
elif isinstance(position, list):
|
|
||||||
update_body["doc"]["positions"] = position
|
|
||||||
|
|
||||||
# 如果没有需要更新的字段,直接返回成功
|
# 如果没有需要更新的字段,直接返回成功
|
||||||
if not update_body["doc"]:
|
if not update_body["doc"]:
|
||||||
@@ -127,32 +269,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
|
|||||||
|
|
||||||
print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")
|
print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")
|
||||||
|
|
||||||
# 验证更新
|
|
||||||
verify_doc = es.get(index=index_name, id=doc_id_in_es)
|
|
||||||
|
|
||||||
# 检查 img_id 是否已更新(如果提供了 new_img_id)
|
|
||||||
img_id_updated = True
|
|
||||||
if new_img_id is not None:
|
|
||||||
img_id_updated = verify_doc['_source'].get('img_id') == new_img_id
|
|
||||||
if img_id_updated:
|
|
||||||
print(f"成功更新 img_id 为: {new_img_id}")
|
|
||||||
else:
|
|
||||||
print(f"更新验证失败,当前 img_id: {verify_doc['_source'].get('img_id')}")
|
|
||||||
|
|
||||||
# 检查 position 是否已更新(如果提供了 position)
|
|
||||||
position_updated = True
|
|
||||||
if position is not None:
|
|
||||||
position_updated = verify_doc['_source'].get('positions') == position
|
|
||||||
if position_updated:
|
|
||||||
print(f"成功更新 position 为: {position}")
|
|
||||||
else:
|
|
||||||
print(f"更新验证失败,当前 position: {verify_doc['_source'].get('positions')}")
|
|
||||||
|
|
||||||
# 统一返回结果
|
|
||||||
if img_id_updated and position_updated:
|
|
||||||
return {"code": 0, "message": ""}
|
|
||||||
else:
|
|
||||||
return {"code": 100, "message": "Failed to verify update"}
|
|
||||||
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -160,6 +277,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
|
|||||||
return {"code": 101, "message": f"Error updating img_id: {str(e)}"}
|
return {"code": 101, "message": f"Error updating img_id: {str(e)}"}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def get_minio_client():
|
def get_minio_client():
|
||||||
"""创建MinIO客户端"""
|
"""创建MinIO客户端"""
|
||||||
return Minio(
|
return Minio(
|
||||||
@@ -444,30 +562,43 @@ def get_positions_from_chunk(pdf_path, chunks_info):
|
|||||||
try:
|
try:
|
||||||
# 提取所有chunk的文本内容用于批量查找
|
# 提取所有chunk的文本内容用于批量查找
|
||||||
chunk_texts = [chunk_info['text'] for chunk_info in chunks_info]
|
chunk_texts = [chunk_info['text'] for chunk_info in chunks_info]
|
||||||
|
print(f"批量查找文本块: {chunk_texts}")
|
||||||
|
|
||||||
# 使用智能模糊查找获取位置信息
|
# 使用智能模糊查找获取位置信息
|
||||||
batch_positions = smart_fuzzy_find_text_batch(pdf_path, chunk_texts, similarity_threshold=0.7)
|
matches = find_text_in_pdf(
|
||||||
|
pdf_path,
|
||||||
|
chunk_texts,
|
||||||
|
threshold=60
|
||||||
|
)
|
||||||
|
print(f"匹配结果: {matches}")
|
||||||
|
|
||||||
# 将位置信息与chunks_info关联,并确保数据类型正确
|
# 将位置信息与chunks_info关联,并确保数据类型正确
|
||||||
for i, chunk_info in enumerate(chunks_info):
|
for i, chunk_info in enumerate(chunks_info):
|
||||||
positions = batch_positions[i] if i < len(batch_positions) else []
|
# 确保 chunk_info 包含 'positions' 键
|
||||||
|
if 'positions' not in chunk_info:
|
||||||
|
chunk_info['positions'] = []
|
||||||
|
|
||||||
# 处理位置信息
|
print(f"处理第 {i+1} 个chunk: {chunk_info['text']}")
|
||||||
processed_positions = []
|
print(f"更新前位置: {chunk_info['positions']}")
|
||||||
for pos in positions:
|
|
||||||
if isinstance(pos, dict):
|
|
||||||
# 创建新的位置字典,确保所有坐标都是整数
|
|
||||||
processed_pos = {
|
|
||||||
'x0': int(round(float(pos['x0']))) if pos.get('x0') is not None else 0,
|
|
||||||
'y0': int(round(float(pos['y0']))) if pos.get('y0') is not None else 0,
|
|
||||||
'x1': int(round(float(pos['x1']))) if pos.get('x1') is not None else 0,
|
|
||||||
'y1': int(round(float(pos['y1']))) if pos.get('y1') is not None else 0,
|
|
||||||
'page': int(pos['page']) if pos.get('page') is not None else 0
|
|
||||||
}
|
|
||||||
processed_positions.append(processed_pos)
|
|
||||||
|
|
||||||
# 更新chunk_info中的positions
|
if isinstance(matches, list) and i < len(matches):
|
||||||
chunk_info['positions'] = processed_positions
|
chunk_info['positions']=[mat['position_int'] for mat in matches[i] if 'position_int' in mat]
|
||||||
|
|
||||||
|
# # 如果matches是列表且索引有效
|
||||||
|
# if isinstance(matches[i], dict) and 'position_int' in matches[i]:
|
||||||
|
# chunk_info['positions'] = matches[i]['position_int']
|
||||||
|
# print(f"更新后位置: {chunk_info['positions']}")
|
||||||
|
# else:
|
||||||
|
# chunk_info['positions'] = []
|
||||||
|
# print(f"未找到有效位置信息,设置为空列表")
|
||||||
|
else:
|
||||||
|
chunk_info['positions'] = []
|
||||||
|
print(f"匹配结果无效或索引越界,设置为空列表")
|
||||||
|
|
||||||
|
# 验证更新结果
|
||||||
|
print("最终chunks_info状态:")
|
||||||
|
for i, chunk_info in enumerate(chunks_info):
|
||||||
|
print(f" Chunk {i+1}: ID={chunk_info['id']}, Positions={chunk_info['positions']}")
|
||||||
|
|
||||||
return chunks_info
|
return chunks_info
|
||||||
|
|
||||||
@@ -475,12 +606,13 @@ def get_positions_from_chunk(pdf_path, chunks_info):
|
|||||||
print(f"获取PDF文本位置信息时出错: {str(e)}")
|
print(f"获取PDF文本位置信息时出错: {str(e)}")
|
||||||
# 出错时为每个chunk添加空的位置信息
|
# 出错时为每个chunk添加空的位置信息
|
||||||
for chunk_info in chunks_info:
|
for chunk_info in chunks_info:
|
||||||
|
# 确保 chunk_info 包含 'positions' 键
|
||||||
|
if 'positions' not in chunk_info:
|
||||||
chunk_info['positions'] = []
|
chunk_info['positions'] = []
|
||||||
return chunks_info
|
return chunks_info
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
|
def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
|
||||||
"""处理PDF-TXT文件对"""
|
"""处理PDF-TXT文件对"""
|
||||||
for name, pdf_path in pdf_dict.items():
|
for name, pdf_path in pdf_dict.items():
|
||||||
@@ -493,6 +625,8 @@ def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
|
|||||||
txt_path = txt_dict.get(name)
|
txt_path = txt_dict.get(name)
|
||||||
if txt_path:
|
if txt_path:
|
||||||
chunks_info=process_txt_chunks(dataset.id,document, txt_path)
|
chunks_info=process_txt_chunks(dataset.id,document, txt_path)
|
||||||
|
|
||||||
|
time.sleep(1)
|
||||||
if chunks_info:
|
if chunks_info:
|
||||||
chunks_info=get_positions_from_chunk(pdf_path, chunks_info)
|
chunks_info=get_positions_from_chunk(pdf_path, chunks_info)
|
||||||
for chunk_info in chunks_info:
|
for chunk_info in chunks_info:
|
||||||
@@ -507,7 +641,6 @@ def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
"""主函数,处理PDF和TXT文件对
|
"""主函数,处理PDF和TXT文件对
|
||||||
|
|
||||||
dataset.id = bucket_name
|
dataset.id = bucket_name
|
||||||
@@ -528,8 +661,8 @@ def main():
|
|||||||
print("未选择数据集。")
|
print("未选择数据集。")
|
||||||
return
|
return
|
||||||
|
|
||||||
process_pdf_txt_pairs(pdf_dict, txt_dict, dataset)
|
# 使用批量处理函数替代原来的处理函数
|
||||||
|
process_pdf_txt_pairs_bulk(pdf_dict, txt_dict, dataset)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -161,9 +161,10 @@ def find_text_in_pdf(pdf_path,
|
|||||||
if matched_lines:
|
if matched_lines:
|
||||||
_, merged_bbox = _merge_lines(matched_lines)
|
_, merged_bbox = _merge_lines(matched_lines)
|
||||||
results.append({
|
results.append({
|
||||||
"page": p + 1,
|
"page": p,
|
||||||
"bbox": merged_bbox,
|
"bbox": merged_bbox,
|
||||||
"matched_text": matched_text
|
"matched_text": matched_text,
|
||||||
|
"position_int":[p, merged_bbox[0], merged_bbox[2], merged_bbox[1], merged_bbox[3]]
|
||||||
})
|
})
|
||||||
if results:
|
if results:
|
||||||
batch_results[idx].extend(results)
|
batch_results[idx].extend(results)
|
||||||
@@ -206,6 +207,7 @@ def highlight_matches(pdf_path, matches, output_path="highlighted.pdf"):
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
pdf_path = 'e:\\2\\2024深化智慧城市发展推进城市全域数字化转型的指导意见.pdf'
|
pdf_path = 'e:\\2\\2024深化智慧城市发展推进城市全域数字化转型的指导意见.pdf'
|
||||||
pdf_path = 'G:\\SynologyDrive\\大模型\\RAG\\20250805党建\\中国共产党领导干部廉洁从业若干准则.pdf'
|
pdf_path = 'G:\\SynologyDrive\\大模型\\RAG\\20250805党建\\中国共产党领导干部廉洁从业若干准则.pdf'
|
||||||
|
pdf_path ="F:\\Synology_nas\\SynologyDrive\\大模型\\RAG\\20250805党建\\中国共产党领导干部廉洁从业若干准则.pdf"
|
||||||
query = [
|
query = [
|
||||||
'''一、总体要求
|
'''一、总体要求
|
||||||
以习近平新时代中国特色社会主义思想为指导,完整、准确、全面贯彻新发展理念,统筹发展和安全,充分发挥数据的基础资源和创新引擎作用,整体性重塑智慧城市技术架构、系统性变革城市管理流程、一体化推动产城深度融合,全面提升城市全域数字化转型的整体性、系统性、协同性,不断满足人民日益增长的美好生活需要,为全面建设社会主义现代化国家提供强大动力。到2027年,全国城市全域数字化转型取得明显成效,形成一批横向打通、纵向贯通、各具特色的宜居、韧性、智慧城市,有力支撑数字中国建设。到2030年,全国城市全域数字化转型全面突破,人民群众的获得感、幸福感、安全感全面提升,涌现一批数字文明时代具有全球竞争力的中国式现代化城市。''',
|
以习近平新时代中国特色社会主义思想为指导,完整、准确、全面贯彻新发展理念,统筹发展和安全,充分发挥数据的基础资源和创新引擎作用,整体性重塑智慧城市技术架构、系统性变革城市管理流程、一体化推动产城深度融合,全面提升城市全域数字化转型的整体性、系统性、协同性,不断满足人民日益增长的美好生活需要,为全面建设社会主义现代化国家提供强大动力。到2027年,全国城市全域数字化转型取得明显成效,形成一批横向打通、纵向贯通、各具特色的宜居、韧性、智慧城市,有力支撑数字中国建设。到2030年,全国城市全域数字化转型全面突破,人民群众的获得感、幸福感、安全感全面提升,涌现一批数字文明时代具有全球竞争力的中国式现代化城市。''',
|
||||||
@@ -271,7 +273,7 @@ if __name__ == "__main__":
|
|||||||
# 1. 找跨行正则匹配
|
# 1. 找跨行正则匹配
|
||||||
matches = find_text_in_pdf(
|
matches = find_text_in_pdf(
|
||||||
pdf_path,
|
pdf_path,
|
||||||
query, # 你的正则
|
query,
|
||||||
threshold=60
|
threshold=60
|
||||||
|
|
||||||
)
|
)
|
||||||
@@ -284,7 +286,7 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
#highlight_matches(pdf_path, query_matches, "example_highlighted.pdf")
|
#highlight_matches(pdf_path, query_matches, "example_highlighted.pdf")
|
||||||
for m in query_matches:
|
for m in query_matches:
|
||||||
print(f"第 {m['page']} 页 匹配: {m['matched_text'][:50]}... 位置: {m['bbox']}")
|
print(f"第 {m['page']} 页 匹配: {m['matched_text'][:50]}... 位置: {m['bbox']}, 位置_int: {m['position_int']}")
|
||||||
print("------------------")
|
print("------------------")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user