test positions
This commit is contained in:
@@ -1,5 +1,5 @@
|
|||||||
from elasticsearch import Elasticsearch
|
from elasticsearch import Elasticsearch
|
||||||
|
from src.add_chunk_cli_pdf_img import update_positon_img_id_in_elasticsearch
|
||||||
# 初始化 Elasticsearch 用户名elastic,密码infini_rag_flow
|
# 初始化 Elasticsearch 用户名elastic,密码infini_rag_flow
|
||||||
es = Elasticsearch(
|
es = Elasticsearch(
|
||||||
[{'host': '127.0.0.1', 'port': 1200, 'scheme': 'http'}],
|
[{'host': '127.0.0.1', 'port': 1200, 'scheme': 'http'}],
|
||||||
@@ -58,6 +58,39 @@ def update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, new_img_id):
|
|||||||
else:
|
else:
|
||||||
return {"code": 100, "message": "Failed to update img_id"}
|
return {"code": 100, "message": "Failed to update img_id"}
|
||||||
|
|
||||||
|
def get_index_mapping(tenant_id):
    """
    Fetch the mapping of the Elasticsearch index belonging to a tenant.

    The index name is built as ``ragflow_<tenant_id>`` and looked up through
    the module-level ``es`` client.

    :param tenant_id: tenant ID used to build the index name
    :return: dict with the keys "code", "message" and "data". On success
        "code" is 0, "message" is empty and "data" holds the mapping as a
        plain dict. If any exception is raised, "code" is 500, "message" is
        the exception text and "data" is an empty dict.
    """
    target_index = f"ragflow_{tenant_id}"

    try:
        # Turn the client's response object into an ordinary dict
        payload = dict(es.indices.get_mapping(index=target_index))
    except Exception as err:
        outcome = {"code": 500, "message": str(err), "data": {}}
    else:
        outcome = {"code": 0, "message": "", "data": payload}
    return outcome
|
||||||
|
|
||||||
|
# Example call from the script entry point
if __name__ == "__main__":
    # ... existing code ...

    # Look up the index mapping for a fixed tenant and print the outcome
    tenant_id = "9c73df5a3ebc11f08410c237296aa408"
    mapping_result = get_index_mapping(tenant_id)
    if mapping_result["code"] != 0:
        print(f"获取 mapping 失败: {mapping_result['message']}")
    else:
        print("索引 mapping 信息:")
        import json
        # default=str stringifies any value json cannot serialize directly
        print(json.dumps(mapping_result["data"], indent=2, ensure_ascii=False, default=str))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def list_chunk_information(tenant_id, dataset_id, doc_id=None, chunk_id=None, size=1000):
|
def list_chunk_information(tenant_id, dataset_id, doc_id=None, chunk_id=None, size=1000):
|
||||||
"""
|
"""
|
||||||
@@ -122,16 +155,32 @@ if __name__ == "__main__":
|
|||||||
dataset_id = "0e6127da574a11f0a59c7e7439a490f8" # dataset_id = kb_id
|
dataset_id = "0e6127da574a11f0a59c7e7439a490f8" # dataset_id = kb_id
|
||||||
doc_id = "cbf576385bc911f08f23fedc3996e479"
|
doc_id = "cbf576385bc911f08f23fedc3996e479"
|
||||||
doc_id = "323113d8670c11f0b4255ea1d23c381a"
|
doc_id = "323113d8670c11f0b4255ea1d23c381a"
|
||||||
|
doc_id = "5cdab2fa67cb11f0a21592edb0e63cad" #
|
||||||
chunk_id = "f035247f7de579b0" #
|
chunk_id = "f035247f7de579b0" #
|
||||||
chunk_id = "b2d53baddbfde97c" #
|
chunk_id = "b2d53baddbfde97c" #
|
||||||
|
chunk_id = "e46a067c1edf939a"
|
||||||
new_img_id = "10345832587311f0919f3a2728512a4b-f035247f7de579b0" #"new_img_id_12345"
|
new_img_id = "10345832587311f0919f3a2728512a4b-f035247f7de579b0" #"new_img_id_12345"
|
||||||
new_img_id = "0e6127da574a11f0a59c7e7439a490f8-b2d53baddbfde97c"
|
#new_img_id = "0e6127da574a11f0a59c7e7439a490f8-b2d53baddbfde97c"
|
||||||
#new_img_id ="c5142bce5ac611f0ae707a8b5ba029cb-thumbnail_fb3cbc165ac611f0b5897a8b5ba029cb.png"
|
#new_img_id ="c5142bce5ac611f0ae707a8b5ba029cb-thumbnail_fb3cbc165ac611f0b5897a8b5ba029cb.png"
|
||||||
|
pos= [3, 317, 397, 123, 182]
|
||||||
|
|
||||||
|
# 获取 mapping 信息
|
||||||
|
tenant_id = "9c73df5a3ebc11f08410c237296aa408"
|
||||||
|
mapping_result = get_index_mapping(tenant_id)
|
||||||
|
if mapping_result["code"] == 0:
|
||||||
|
print("索引 mapping 信息:")
|
||||||
|
import json
|
||||||
|
print(json.dumps(mapping_result["data"], indent=2, ensure_ascii=False))
|
||||||
|
else:
|
||||||
|
print(f"获取 mapping 失败: {mapping_result['message']}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#chunk_list = list_chunk_information(tenant_id, dataset_id, doc_id=doc_id)
|
#chunk_list = list_chunk_information(tenant_id, dataset_id, doc_id=doc_id)
|
||||||
update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id,new_img_id)
|
# update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id,new_img_id)
|
||||||
|
update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, pos, new_img_id)
|
||||||
# if chunk_list["code"] == 0:
|
# if chunk_list["code"] == 0:
|
||||||
# print(f"找到 {len(chunk_list['data'])} 个 chunks")
|
# print(f"找到 {len(chunk_list['data'])} 个 chunks")
|
||||||
# for chunk in chunk_list['data']:
|
# for chunk in chunk_list['data']:
|
||||||
|
@@ -10,7 +10,7 @@ from elasticsearch import Elasticsearch
|
|||||||
from minio import Minio
|
from minio import Minio
|
||||||
from minio.error import S3Error
|
from minio.error import S3Error
|
||||||
|
|
||||||
from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch
|
# from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch
|
||||||
|
|
||||||
|
|
||||||
from dotenv import load_dotenv # 新增
|
from dotenv import load_dotenv # 新增
|
||||||
@@ -59,6 +59,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
|
|||||||
:return: 更新结果
|
:return: 更新结果
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
|
||||||
# 构建索引名称
|
# 构建索引名称
|
||||||
index_name = f"ragflow_{tenant_id}"
|
index_name = f"ragflow_{tenant_id}"
|
||||||
|
|
||||||
@@ -93,6 +94,22 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position
|
|||||||
|
|
||||||
# 只有当 position 存在时才更新 positions
|
# 只有当 position 存在时才更新 positions
|
||||||
if position is not None:
|
if position is not None:
|
||||||
|
# 如果传入的是嵌套字典格式的 position
|
||||||
|
if isinstance(position, list) and all(isinstance(p, dict) for p in position):
|
||||||
|
# 将字典格式转换为整数列表格式
|
||||||
|
formatted_positions = []
|
||||||
|
for pos in position:
|
||||||
|
pos_list = [
|
||||||
|
pos.get('page', 0), # 页码
|
||||||
|
int(round(float(pos.get('x0', 0)))), # x0
|
||||||
|
int(round(float(pos.get('x1', 0)))), # x1
|
||||||
|
int(round(float(pos.get('y0', 0)))), # y0
|
||||||
|
int(round(float(pos.get('y1', 0)))) # y1
|
||||||
|
]
|
||||||
|
formatted_positions.append(pos_list)
|
||||||
|
update_body["doc"]["positions"] = formatted_positions
|
||||||
|
# 如果已经是整数列表格式
|
||||||
|
elif isinstance(position, list):
|
||||||
update_body["doc"]["positions"] = position
|
update_body["doc"]["positions"] = position
|
||||||
|
|
||||||
# 如果没有需要更新的字段,直接返回成功
|
# 如果没有需要更新的字段,直接返回成功
|
||||||
|
Reference in New Issue
Block a user