test positions
This commit is contained in:
		| @@ -1,5 +1,5 @@ | |||||||
| from elasticsearch import Elasticsearch | from elasticsearch import Elasticsearch | ||||||
|  | from src.add_chunk_cli_pdf_img import update_positon_img_id_in_elasticsearch | ||||||
| # 初始化 Elasticsearch   用户名elastic,密码infini_rag_flow | # 初始化 Elasticsearch   用户名elastic,密码infini_rag_flow | ||||||
| es = Elasticsearch( | es = Elasticsearch( | ||||||
|     [{'host': '127.0.0.1', 'port': 1200, 'scheme': 'http'}], |     [{'host': '127.0.0.1', 'port': 1200, 'scheme': 'http'}], | ||||||
| @@ -58,6 +58,39 @@ def update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, new_img_id): | |||||||
|     else: |     else: | ||||||
|         return {"code": 100, "message": "Failed to update img_id"} |         return {"code": 100, "message": "Failed to update img_id"} | ||||||
|      |      | ||||||
def get_index_mapping(tenant_id):
    """
    Fetch the Elasticsearch mapping of a tenant's index.

    The index queried is ``ragflow_<tenant_id>``.

    :param tenant_id: tenant ID used to build the index name
    :return: dict with ``code`` (0 on success, 500 on failure), ``message``
             (empty on success, the exception text on failure) and ``data``
             (the mapping as a plain dict, or an empty dict on failure)
    """
    target_index = f"ragflow_{tenant_id}"

    try:
        response = es.indices.get_mapping(index=target_index)
        # Convert the client's response object into an ordinary dictionary.
        payload = dict(response)
    except Exception as exc:
        # Any failure is reported through the result envelope, never raised.
        return {"code": 500, "message": str(exc), "data": {}}

    return {"code": 0, "message": "", "data": payload}
|  |  | ||||||
# Example of calling get_index_mapping from the script entry point
if __name__ == "__main__":
    # ... existing code ...

    # Fetch the mapping information for one tenant's index
    tenant_id = "9c73df5a3ebc11f08410c237296aa408"
    mapping_result = get_index_mapping(tenant_id)
    if mapping_result["code"] != 0:
        # Failure path: show the message carried in the result envelope.
        print(f"获取 mapping 失败: {mapping_result['message']}")
    else:
        print("索引 mapping 信息:")
        import json
        # default=str stringifies any value json cannot serialise natively
        pretty_mapping = json.dumps(
            mapping_result["data"],
            indent=2,
            ensure_ascii=False,
            default=str,
        )
        print(pretty_mapping)
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def list_chunk_information(tenant_id, dataset_id, doc_id=None, chunk_id=None, size=1000): | def list_chunk_information(tenant_id, dataset_id, doc_id=None, chunk_id=None, size=1000): | ||||||
|     """ |     """ | ||||||
| @@ -121,17 +154,33 @@ if __name__ == "__main__": | |||||||
|     tenant_id = "9c73df5a3ebc11f08410c237296aa408" |     tenant_id = "9c73df5a3ebc11f08410c237296aa408" | ||||||
|     dataset_id = "0e6127da574a11f0a59c7e7439a490f8"  #  dataset_id = kb_id |     dataset_id = "0e6127da574a11f0a59c7e7439a490f8"  #  dataset_id = kb_id | ||||||
|     doc_id = "cbf576385bc911f08f23fedc3996e479" |     doc_id = "cbf576385bc911f08f23fedc3996e479" | ||||||
|     doc_id = "323113d8670c11f0b4255ea1d23c381a"   |     doc_id = "323113d8670c11f0b4255ea1d23c381a"  | ||||||
|  |     doc_id = "5cdab2fa67cb11f0a21592edb0e63cad"  #  | ||||||
|     chunk_id = "f035247f7de579b0"  #   |     chunk_id = "f035247f7de579b0"  #   | ||||||
|     chunk_id = "b2d53baddbfde97c"  #  |     chunk_id = "b2d53baddbfde97c"  #  | ||||||
|  |     chunk_id = "e46a067c1edf939a" | ||||||
|     new_img_id =    "10345832587311f0919f3a2728512a4b-f035247f7de579b0" #"new_img_id_12345" |     new_img_id =    "10345832587311f0919f3a2728512a4b-f035247f7de579b0" #"new_img_id_12345" | ||||||
|     new_img_id =  "0e6127da574a11f0a59c7e7439a490f8-b2d53baddbfde97c" |     #new_img_id =  "0e6127da574a11f0a59c7e7439a490f8-b2d53baddbfde97c" | ||||||
|     #new_img_id ="c5142bce5ac611f0ae707a8b5ba029cb-thumbnail_fb3cbc165ac611f0b5897a8b5ba029cb.png" |     #new_img_id ="c5142bce5ac611f0ae707a8b5ba029cb-thumbnail_fb3cbc165ac611f0b5897a8b5ba029cb.png" | ||||||
|  |     pos=    [3, 317, 397, 123, 182] | ||||||
|  |  | ||||||
|  |     # 获取 mapping 信息 | ||||||
|  |     tenant_id = "9c73df5a3ebc11f08410c237296aa408" | ||||||
|  |     mapping_result = get_index_mapping(tenant_id) | ||||||
|  |     if mapping_result["code"] == 0: | ||||||
|  |         print("索引 mapping 信息:") | ||||||
|  |         import json | ||||||
|  |         print(json.dumps(mapping_result["data"], indent=2, ensure_ascii=False)) | ||||||
|  |     else: | ||||||
|  |         print(f"获取 mapping 失败: {mapping_result['message']}") | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     #chunk_list = list_chunk_information(tenant_id, dataset_id, doc_id=doc_id) |     #chunk_list = list_chunk_information(tenant_id, dataset_id, doc_id=doc_id) | ||||||
|     update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id,new_img_id) |     # update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id,new_img_id) | ||||||
|  |     update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, pos, new_img_id) | ||||||
|     # if chunk_list["code"] == 0: |     # if chunk_list["code"] == 0: | ||||||
|     #     print(f"找到 {len(chunk_list['data'])} 个 chunks") |     #     print(f"找到 {len(chunk_list['data'])} 个 chunks") | ||||||
|     #     for chunk in chunk_list['data']: |     #     for chunk in chunk_list['data']: | ||||||
|   | |||||||
| @@ -10,7 +10,7 @@ from elasticsearch import Elasticsearch | |||||||
| from minio import Minio | from minio import Minio | ||||||
| from minio.error import S3Error | from minio.error import S3Error | ||||||
|  |  | ||||||
| from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch | # from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch | ||||||
|  |  | ||||||
|  |  | ||||||
| from dotenv import load_dotenv  # 新增 | from dotenv import load_dotenv  # 新增 | ||||||
| @@ -59,6 +59,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position | |||||||
|     :return: 更新结果 |     :return: 更新结果 | ||||||
|     """ |     """ | ||||||
|     try: |     try: | ||||||
|  |          | ||||||
|         # 构建索引名称 |         # 构建索引名称 | ||||||
|         index_name = f"ragflow_{tenant_id}" |         index_name = f"ragflow_{tenant_id}" | ||||||
|  |  | ||||||
| @@ -93,7 +94,23 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position | |||||||
|              |              | ||||||
|         # 只有当 position 存在时才更新 positions |         # 只有当 position 存在时才更新 positions | ||||||
|         if position is not None: |         if position is not None: | ||||||
|             update_body["doc"]["positions"] = position |             # 如果传入的是嵌套字典格式的 position | ||||||
|  |             if isinstance(position, list) and all(isinstance(p, dict) for p in position): | ||||||
|  |                 # 将字典格式转换为整数列表格式 | ||||||
|  |                 formatted_positions = [] | ||||||
|  |                 for pos in position: | ||||||
|  |                     pos_list = [ | ||||||
|  |                         pos.get('page', 0),  # 页码 | ||||||
|  |                         int(round(float(pos.get('x0', 0)))),  # x0 | ||||||
|  |                         int(round(float(pos.get('x1', 0)))),  # x1 | ||||||
|  |                         int(round(float(pos.get('y0', 0)))),  # y0 | ||||||
|  |                         int(round(float(pos.get('y1', 0))))   # y1 | ||||||
|  |                     ] | ||||||
|  |                     formatted_positions.append(pos_list) | ||||||
|  |                 update_body["doc"]["positions"] = formatted_positions | ||||||
|  |             # 如果已经是整数列表格式 | ||||||
|  |             elif isinstance(position, list): | ||||||
|  |                 update_body["doc"]["positions"] = position | ||||||
|  |  | ||||||
|         # 如果没有需要更新的字段,直接返回成功 |         # 如果没有需要更新的字段,直接返回成功 | ||||||
|         if not update_body["doc"]: |         if not update_body["doc"]: | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user