diff --git a/chunk_operations.py b/chunk_operations.py index 7d1f209..98e3403 100644 --- a/chunk_operations.py +++ b/chunk_operations.py @@ -1,5 +1,5 @@ from elasticsearch import Elasticsearch - +from src.add_chunk_cli_pdf_img import update_positon_img_id_in_elasticsearch # 初始化 Elasticsearch 用户名elastic,密码infini_rag_flow es = Elasticsearch( [{'host': '127.0.0.1', 'port': 1200, 'scheme': 'http'}], @@ -58,6 +58,39 @@ def update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, new_img_id): else: return {"code": 100, "message": "Failed to update img_id"} +def get_index_mapping(tenant_id): + """ + 获取指定索引的 mapping 信息 + + :param tenant_id: 租户 ID + :return: mapping 信息 + """ + index_name = f"ragflow_{tenant_id}" + + try: + mapping = es.indices.get_mapping(index=index_name) + # 将 ObjectApiResponse 转换为普通字典 + mapping_dict = dict(mapping) + return {"code": 0, "message": "", "data": mapping_dict} + except Exception as e: + return {"code": 500, "message": str(e), "data": {}} + +# 在主函数中调用示例 +if __name__ == "__main__": + # ... 现有代码 ... + + # 获取 mapping 信息 + tenant_id = "9c73df5a3ebc11f08410c237296aa408" + mapping_result = get_index_mapping(tenant_id) + if mapping_result["code"] == 0: + print("索引 mapping 信息:") + import json + # 使用 default=str 处理不能直接序列化的对象 + print(json.dumps(mapping_result["data"], indent=2, ensure_ascii=False, default=str)) + else: + print(f"获取 mapping 失败: {mapping_result['message']}") + + def list_chunk_information(tenant_id, dataset_id, doc_id=None, chunk_id=None, size=1000): """ @@ -121,17 +154,33 @@ if __name__ == "__main__": tenant_id = "9c73df5a3ebc11f08410c237296aa408" dataset_id = "0e6127da574a11f0a59c7e7439a490f8" # dataset_id = kb_id doc_id = "cbf576385bc911f08f23fedc3996e479" - doc_id = "323113d8670c11f0b4255ea1d23c381a" + doc_id = "323113d8670c11f0b4255ea1d23c381a" + doc_id = "5cdab2fa67cb11f0a21592edb0e63cad" # chunk_id = "f035247f7de579b0" # chunk_id = "b2d53baddbfde97c" # + chunk_id = "e46a067c1edf939a" new_img_id = "10345832587311f0919f3a2728512a4b-f035247f7de579b0" #"new_img_id_12345" - new_img_id = "0e6127da574a11f0a59c7e7439a490f8-b2d53baddbfde97c" + #new_img_id = "0e6127da574a11f0a59c7e7439a490f8-b2d53baddbfde97c" #new_img_id ="c5142bce5ac611f0ae707a8b5ba029cb-thumbnail_fb3cbc165ac611f0b5897a8b5ba029cb.png" + pos= [3, 317, 397, 123, 182] + + # 获取 mapping 信息 + tenant_id = "9c73df5a3ebc11f08410c237296aa408" + mapping_result = get_index_mapping(tenant_id) + if mapping_result["code"] == 0: + print("索引 mapping 信息:") + import json + print(json.dumps(mapping_result["data"], indent=2, ensure_ascii=False)) + else: + print(f"获取 mapping 失败: {mapping_result['message']}") + + #chunk_list = list_chunk_information(tenant_id, dataset_id, doc_id=doc_id) - update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id,new_img_id) + # update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id,new_img_id) + update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, pos, new_img_id) # if chunk_list["code"] == 0: # print(f"找到 {len(chunk_list['data'])} 个 chunks") # for chunk in chunk_list['data']: diff --git a/src/add_chunk_cli_pdf_img.py b/src/add_chunk_cli_pdf_img.py index c73de95..7067dcb 100644 --- a/src/add_chunk_cli_pdf_img.py +++ b/src/add_chunk_cli_pdf_img.py @@ -10,7 +10,7 @@ from elasticsearch import Elasticsearch from minio import Minio from minio.error import S3Error -from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch +# from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch from dotenv import load_dotenv # 新增 @@ -59,6 +59,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position :return: 更新结果 """ try: + # 构建索引名称 index_name = f"ragflow_{tenant_id}" @@ -93,7 +94,23 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position # 只有当 position 存在时才更新 positions if position is not None: - update_body["doc"]["positions"] = position + # 如果传入的是嵌套字典格式的 position + if isinstance(position, list) and all(isinstance(p, dict) for p in position): + # 将字典格式转换为整数列表格式 + formatted_positions = [] + for pos in position: + pos_list = [ + pos.get('page', 0), # 页码 + int(round(float(pos.get('x0', 0)))), # x0 + int(round(float(pos.get('x1', 0)))), # x1 + int(round(float(pos.get('y0', 0)))), # y0 + int(round(float(pos.get('y1', 0)))) # y1 + ] + formatted_positions.append(pos_list) + update_body["doc"]["positions"] = formatted_positions + # 如果已经是整数列表格式 + elif isinstance(position, list): + update_body["doc"]["positions"] = position # 如果没有需要更新的字段,直接返回成功 if not update_body["doc"]: