Compare commits
	
		
			2 Commits
		
	
	
		
			e5ac523bd9
			...
			1c23d272bb
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 1c23d272bb | |||
| c1d66237e6 | 
							
								
								
									
										256
									
								
								chunk_pos.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										256
									
								
								chunk_pos.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,256 @@ | |||||||
|  | from elasticsearch import Elasticsearch | ||||||
|  | #from src.add_chunk_cli_pdf_img import update_positon_img_id_in_elasticsearch | ||||||
|  | # 初始化 Elasticsearch   用户名elastic,密码infini_rag_flow | ||||||
|  |  | ||||||
|  | from dotenv import load_dotenv  # 新增 | ||||||
|  | import os | ||||||
|  | import json | ||||||
|  | # 加载 .env 文件中的环境变量 | ||||||
|  | load_dotenv() | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
# Build the module-wide Elasticsearch client from environment variables
# (the .env file has already been loaded by load_dotenv() above).
# ELASTIC_PORT must be set: os.getenv() returns None when it is missing and
# int(None) raises TypeError at import time.
_es_node = {
    'host': os.getenv("ELASTIC_HOST"),
    'port': int(os.getenv("ELASTIC_PORT")),
    'scheme': 'http',
}
_es_credentials = (
    os.getenv("ELASTIC_USERNAME"),
    os.getenv("ELASTIC_PASSWORD"),
)
es = Elasticsearch([_es_node], basic_auth=_es_credentials)
|  |  | ||||||
|  |  | ||||||
def get_index_mapping(tenant_id):
    """
    Fetch the mapping of the index ``ragflow_<tenant_id>``.

    :param tenant_id: tenant ID, used to build the index name
    :return: ``{"code": 0, "message": "", "data": <mapping as dict>}`` on
             success; ``{"code": 500, "message": <error text>, "data": {}}``
             when the lookup or the conversion raises
    """
    target_index = f"ragflow_{tenant_id}"

    try:
        response = es.indices.get_mapping(index=target_index)
        # Turn the client's response object into a plain dict.
        outcome = {"code": 0, "message": "", "data": dict(response)}
    except Exception as e:
        outcome = {"code": 500, "message": str(e), "data": {}}
    return outcome
|  |  | ||||||
def _normalize_positions(positions):
    """Convert raw position entries into 5-tuples of ints, skipping invalid ones."""
    normalized = []
    for pos in positions:
        try:
            if len(pos) != 5:
                continue  # wrong number of fields: skip
        except TypeError:
            continue  # not a sized sequence (e.g. None or a bare number): skip

        pn, left, right, top, bottom = pos
        # The page number is shifted by one before storing (presumably 0-based
        # input to 1-based storage -- confirm with the producer of `positions`).
        # Tuples are used because the original author noted this matches
        # RAGFlow's own format.
        normalized.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))
    return normalized


def update_positon_in_elasticsearch(tenant_id, doc_id, chunk_id, positions):
    """
    Update the position fields of one chunk in Elasticsearch.

    Writes ``position_int`` (all valid positions), ``page_num_int`` (page of the
    first valid position) and ``top_int`` (top of the first valid position) to
    the chunk stored in the index ``ragflow_<tenant_id>``.

    :param tenant_id: tenant ID, used to build the index name
    :param doc_id: document ID the chunk must belong to
    :param chunk_id: chunk ID (matched against the Elasticsearch ``_id``)
    :param positions: iterable of ``[pn, left, right, top, bottom]`` entries;
                      entries that are not sized or do not have exactly five
                      fields are skipped
    :return: ``None`` when there is nothing to write (empty input or no valid
             entry) and also after a successful update;
             ``{"code": 102, "message": ...}`` when the chunk is not found.
             Exceptions raised by the Elasticsearch client are not caught.
    """
    if not positions:
        return None

    position_int = _normalize_positions(positions)
    if not position_int:
        # Nothing valid to store; leave the document untouched.
        return None

    index_name = f"ragflow_{tenant_id}"

    # Look the chunk up by both document ID and chunk ID.
    query = {
        "bool": {
            "must": [
                {"term": {"doc_id": doc_id}},
                {"term": {"_id": chunk_id}}
            ]
        }
    }
    result = es.search(index=index_name, body={"query": query})

    if result['hits']['total']['value'] == 0:
        print(f"在 Elasticsearch 中未找到文档: index={index_name}, doc_id={doc_id}, chunk_id={chunk_id}")
        return {"code": 102, "message": f"Can't find this chunk {chunk_id}"}

    doc_id_in_es = result['hits']['hits'][0]['_id']

    # Only the precise position fields are written; nothing else is modified.
    first = position_int[0]
    update_body = {
        "doc": {
            "position_int": position_int,
            "page_num_int": [first[0]],
            "top_int": [first[3]],
        }
    }

    update_result = es.update(
        index=index_name,
        id=doc_id_in_es,
        body=update_body,
        refresh=True  # make the update visible to searches immediately
    )

    print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")
    return None
|  |  | ||||||
|  |  | ||||||
|  |          | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
def _tuples_to_lists(value):
    """Recursively replace tuples/lists with lists, as a JSON round trip would."""
    if isinstance(value, (list, tuple)):
        return [_tuples_to_lists(item) for item in value]
    return value


def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position, new_img_id):
    """
    Update the ``positions`` and/or ``img_id`` fields of one chunk in Elasticsearch.

    The chunk is looked up in the index ``ragflow_<tenant_id>``, updated, then
    read back to verify that the stored values match what was sent.

    :param tenant_id: tenant ID, used to build the index name
    :param doc_id: document ID the chunk must belong to
    :param chunk_id: chunk ID (matched against the Elasticsearch ``_id``)
    :param position: value to store in ``positions``; ``None`` leaves the field alone
    :param new_img_id: value to store in ``img_id``; ``None`` leaves the field alone
    :return: dict with ``code`` and ``message``:
             0 = updated and verified (or nothing to update),
             100 = read-back verification failed,
             101 = an exception was raised,
             102 = chunk not found
    """
    try:
        index_name = f"ragflow_{tenant_id}"

        # Look the chunk up by both document ID and chunk ID.
        query = {
            "bool": {
                "must": [
                    {"term": {"doc_id": doc_id}},
                    {"term": {"_id": chunk_id}}
                ]
            }
        }
        result = es.search(index=index_name, body={"query": query})

        if result['hits']['total']['value'] == 0:
            print(f"在 Elasticsearch 中未找到文档: index={index_name}, doc_id={doc_id}, chunk_id={chunk_id}")
            return {"code": 102, "message": f"Can't find this chunk {chunk_id}"}

        doc_id_in_es = result['hits']['hits'][0]['_id']

        # Build a partial update containing only the fields that were supplied.
        update_body = {"doc": {}}
        if new_img_id is not None:
            update_body["doc"]["img_id"] = new_img_id
        if position is not None:
            update_body["doc"]["positions"] = position

        if not update_body["doc"]:
            print("没有需要更新的字段")
            return {"code": 0, "message": "No fields to update"}

        update_result = es.update(
            index=index_name,
            id=doc_id_in_es,
            body=update_body,
            refresh=True  # make the update visible immediately
        )

        print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}")

        # Read the document back to verify the update.
        stored = es.get(index=index_name, id=doc_id_in_es)['_source']

        img_id_updated = True
        if new_img_id is not None:
            img_id_updated = stored.get('img_id') == new_img_id
            if img_id_updated:
                print(f"成功更新 img_id 为: {new_img_id}")
            else:
                print(f"更新验证失败,当前 img_id: {stored.get('img_id')}")

        position_updated = True
        if position is not None:
            # The stored value comes back from JSON, where tuples have become
            # lists; normalise the expected value the same way so tuple input
            # does not produce a spurious verification failure.
            position_updated = stored.get('positions') == _tuples_to_lists(position)
            if position_updated:
                print(f"成功更新 position 为: {position}")
            else:
                print(f"更新验证失败,当前 position: {stored.get('positions')}")

        if img_id_updated and position_updated:
            return {"code": 0, "message": ""}
        return {"code": 100, "message": "Failed to verify update"}

    except Exception as e:
        print(f"更新 Elasticsearch 时发生错误: {str(e)}")
        return {"code": 101, "message": f"Error updating img_id: {str(e)}"}
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
# Example run: check the connection, then update the positions of one chunk.
if __name__ == "__main__":
    try:
        print(es.info())
    except Exception as exc:
        print("连接失败:", exc)

    # Sample IDs from the author's work machine.
    tenant_id = "d669205e57a211f0b9e7324e7f243034"
    new_img_id = "10345832587311f0919f3a2728512a4b-bd04866cd05337281"
    doc_id = "ea8d75966df811f0925ac6e8db75f472"
    chunk_id = "4a4927560a7e6d80"

    # To inspect the index mapping, uncomment:
    # mapping_result = get_index_mapping(tenant_id)
    # print("Positions field mapping:", mapping_result["data"][f"ragflow_{tenant_id}"]["mappings"]["properties"]["positions"])

    # Each entry is [pn, left, right, top, bottom]:
    # left/right run horizontally; top/bottom run vertically, with top being
    # the smallest value and bottom the largest.
    pos = [[4, 0, 100, 200, 510]]
    # pos_string = json.dumps(pos)  # convert to a JSON string
    update_positon_in_elasticsearch(tenant_id, doc_id, chunk_id, pos)

    # update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, pos, "")
| @@ -9,6 +9,8 @@ import tempfile | |||||||
| from elasticsearch import Elasticsearch | from elasticsearch import Elasticsearch | ||||||
| from minio import Minio | from minio import Minio | ||||||
| from minio.error import S3Error | from minio.error import S3Error | ||||||
|  | from find_text_in_pdf_enhanced import find_text_in_pdf | ||||||
|  | import time | ||||||
|  |  | ||||||
| # from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch | # from get_pos_pdf import smart_fuzzy_find_text_batch, find_text_positions_batch | ||||||
|  |  | ||||||
| @@ -47,7 +49,7 @@ MINIO_CONFIG = { | |||||||
|     "secure": False |     "secure": False | ||||||
| } | } | ||||||
|  |  | ||||||
| def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position, new_img_id): | def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, positions, new_img_id): | ||||||
|     """ |     """ | ||||||
|     在 Elasticsearch 中更新指定文档块的position and img_id。 |     在 Elasticsearch 中更新指定文档块的position and img_id。 | ||||||
|      |      | ||||||
| @@ -88,29 +90,32 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position | |||||||
|         # 构建更新请求 - 只更新存在的字段 |         # 构建更新请求 - 只更新存在的字段 | ||||||
|         update_body = {"doc": {}} |         update_body = {"doc": {}} | ||||||
|          |          | ||||||
|         # 只有当 new_img_id 存在时才更新 img_id |         #只有当 new_img_id 存在时才更新 img_id | ||||||
|         if new_img_id is not None: |         if new_img_id is not None: | ||||||
|             update_body["doc"]["img_id"] = new_img_id |             update_body["doc"]["img_id"] = new_img_id | ||||||
|              |              | ||||||
|         # 只有当 position 存在时才更新 positions |         # 只有当 position 存在时才更新 positions | ||||||
|         if position is not None: |         if positions : | ||||||
|             # 如果传入的是嵌套字典格式的 position |  | ||||||
|             if isinstance(position, list) and all(isinstance(p, dict) for p in position): |             position_int = [] | ||||||
|                 # 将字典格式转换为整数列表格式 |  | ||||||
|                 formatted_positions = [] |             for pos in positions: | ||||||
|                 for pos in position: |                 if len(pos) != 5: | ||||||
|                     pos_list = [ |                     continue  # Skip invalid positions | ||||||
|                         pos.get('page', 0),  # 页码 |                      | ||||||
|                         int(round(float(pos.get('x0', 0)))),  # x0 |                 pn, left, right, top, bottom = pos | ||||||
|                         int(round(float(pos.get('x1', 0)))),  # x1 |                 # 使用元组格式,与原始RAGFlow保持一致 | ||||||
|                         int(round(float(pos.get('y0', 0)))),  # y0 |                 position_int.append((int(pn + 1), int(left), int(right), int(top), int(bottom))) | ||||||
|                         int(round(float(pos.get('y1', 0))))   # y1 |             if position_int: | ||||||
|                     ] |                 update_body["doc"]["position_int"] = position_int | ||||||
|                     formatted_positions.append(pos_list) |                 update_body["doc"]["page_num_int"] = [position_int[0][0]] | ||||||
|                 update_body["doc"]["positions"] = formatted_positions |                 update_body["doc"]["top_int"] = [position_int[0][3]] | ||||||
|             # 如果已经是整数列表格式 |  | ||||||
|             elif isinstance(position, list): |  | ||||||
|                 update_body["doc"]["positions"] = position |  | ||||||
|  |  | ||||||
|  |              | ||||||
|  |  | ||||||
|  |  | ||||||
|         # 如果没有需要更新的字段,直接返回成功 |         # 如果没有需要更新的字段,直接返回成功 | ||||||
|         if not update_body["doc"]: |         if not update_body["doc"]: | ||||||
| @@ -127,32 +132,32 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position | |||||||
|          |          | ||||||
|         print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}") |         print(f"Elasticsearch 更新结果: index={index_name}, id={doc_id_in_es}, result={update_result}") | ||||||
|  |  | ||||||
|         # 验证更新 |     #     # 验证更新 | ||||||
|         verify_doc = es.get(index=index_name, id=doc_id_in_es) |     #     verify_doc = es.get(index=index_name, id=doc_id_in_es) | ||||||
|  |  | ||||||
|         # 检查 img_id 是否已更新(如果提供了 new_img_id) |     #     # 检查 img_id 是否已更新(如果提供了 new_img_id) | ||||||
|         img_id_updated = True |     #     img_id_updated = True | ||||||
|         if new_img_id is not None: |     #     if new_img_id is not None: | ||||||
|             img_id_updated = verify_doc['_source'].get('img_id') == new_img_id |     #         img_id_updated = verify_doc['_source'].get('img_id') == new_img_id | ||||||
|             if img_id_updated: |     #         if img_id_updated: | ||||||
|                 print(f"成功更新 img_id 为: {new_img_id}") |     #             print(f"成功更新 img_id 为: {new_img_id}") | ||||||
|             else: |     #         else: | ||||||
|                 print(f"更新验证失败,当前 img_id: {verify_doc['_source'].get('img_id')}") |     #             print(f"更新验证失败,当前 img_id: {verify_doc['_source'].get('img_id')}") | ||||||
|  |  | ||||||
|         # 检查 position 是否已更新(如果提供了 position) |     #     # 检查 position 是否已更新(如果提供了 position) | ||||||
|         position_updated = True |     #     position_updated = True | ||||||
|         if position is not None: |     #     if position is not None: | ||||||
|             position_updated = verify_doc['_source'].get('positions') == position |     #         position_updated = verify_doc['_source'].get('positions') == position | ||||||
|             if position_updated: |     #         if position_updated: | ||||||
|                 print(f"成功更新 position 为: {position}") |     #             print(f"成功更新 position 为: {position}") | ||||||
|             else: |     #         else: | ||||||
|                 print(f"更新验证失败,当前 position: {verify_doc['_source'].get('positions')}") |     #             print(f"更新验证失败,当前 position: {verify_doc['_source'].get('positions')}") | ||||||
|  |  | ||||||
|         # 统一返回结果 |     #     # 统一返回结果 | ||||||
|         if img_id_updated and position_updated: |     #     if img_id_updated and position_updated: | ||||||
|             return {"code": 0, "message": ""} |     #         return {"code": 0, "message": ""} | ||||||
|         else: |     #     else: | ||||||
|             return {"code": 100, "message": "Failed to verify update"} |     #         return {"code": 100, "message": "Failed to verify update"} | ||||||
|          |          | ||||||
|  |  | ||||||
|     except Exception as e: |     except Exception as e: | ||||||
| @@ -160,6 +165,7 @@ def update_positon_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, position | |||||||
|         return {"code": 101, "message": f"Error updating img_id: {str(e)}"} |         return {"code": 101, "message": f"Error updating img_id: {str(e)}"} | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_minio_client(): | def get_minio_client(): | ||||||
|     """创建MinIO客户端""" |     """创建MinIO客户端""" | ||||||
|     return Minio( |     return Minio( | ||||||
| @@ -444,30 +450,43 @@ def get_positions_from_chunk(pdf_path, chunks_info): | |||||||
|     try: |     try: | ||||||
|         # 提取所有chunk的文本内容用于批量查找 |         # 提取所有chunk的文本内容用于批量查找 | ||||||
|         chunk_texts = [chunk_info['text'] for chunk_info in chunks_info] |         chunk_texts = [chunk_info['text'] for chunk_info in chunks_info] | ||||||
|  |         print(f"批量查找文本块: {chunk_texts}") | ||||||
|          |          | ||||||
|         # 使用智能模糊查找获取位置信息 |         # 使用智能模糊查找获取位置信息 | ||||||
|         batch_positions = smart_fuzzy_find_text_batch(pdf_path, chunk_texts, similarity_threshold=0.7) |         matches = find_text_in_pdf( | ||||||
|  |             pdf_path, | ||||||
|  |             chunk_texts,   | ||||||
|  |             threshold=60 | ||||||
|  |         ) | ||||||
|  |         print(f"匹配结果: {matches}") | ||||||
|          |          | ||||||
|         # 将位置信息与chunks_info关联,并确保数据类型正确 |         # 将位置信息与chunks_info关联,并确保数据类型正确 | ||||||
|         for i, chunk_info in enumerate(chunks_info): |         for i, chunk_info in enumerate(chunks_info): | ||||||
|             positions = batch_positions[i] if i < len(batch_positions) else [] |             # 确保 chunk_info 包含 'positions' 键 | ||||||
|  |             if 'positions' not in chunk_info: | ||||||
|  |                 chunk_info['positions'] = [] | ||||||
|                  |                  | ||||||
|             # 处理位置信息 |             print(f"处理第 {i+1} 个chunk: {chunk_info['text']}") | ||||||
|             processed_positions = [] |             print(f"更新前位置: {chunk_info['positions']}") | ||||||
|             for pos in positions: |  | ||||||
|                 if isinstance(pos, dict): |  | ||||||
|                     # 创建新的位置字典,确保所有坐标都是整数 |  | ||||||
|                     processed_pos = { |  | ||||||
|                         'x0': int(round(float(pos['x0']))) if pos.get('x0') is not None else 0, |  | ||||||
|                         'y0': int(round(float(pos['y0']))) if pos.get('y0') is not None else 0, |  | ||||||
|                         'x1': int(round(float(pos['x1']))) if pos.get('x1') is not None else 0, |  | ||||||
|                         'y1': int(round(float(pos['y1']))) if pos.get('y1') is not None else 0, |  | ||||||
|                         'page': int(pos['page']) if pos.get('page') is not None else 0 |  | ||||||
|                     } |  | ||||||
|                     processed_positions.append(processed_pos) |  | ||||||
|              |              | ||||||
|             # 更新chunk_info中的positions |             if isinstance(matches, list) and i < len(matches): | ||||||
|             chunk_info['positions'] = processed_positions |                 chunk_info['positions']=[mat['position_int'] for mat in matches[i] if 'position_int' in mat] | ||||||
|  |  | ||||||
|  |                 # # 如果matches是列表且索引有效 | ||||||
|  |                 # if isinstance(matches[i], dict) and 'position_int' in matches[i]: | ||||||
|  |                 #     chunk_info['positions'] = matches[i]['position_int'] | ||||||
|  |                 #     print(f"更新后位置: {chunk_info['positions']}") | ||||||
|  |                 # else: | ||||||
|  |                 #     chunk_info['positions'] = [] | ||||||
|  |                 #     print(f"未找到有效位置信息,设置为空列表") | ||||||
|  |             else: | ||||||
|  |                 chunk_info['positions'] = [] | ||||||
|  |                 print(f"匹配结果无效或索引越界,设置为空列表") | ||||||
|  |                  | ||||||
|  |         # 验证更新结果 | ||||||
|  |         print("最终chunks_info状态:") | ||||||
|  |         for i, chunk_info in enumerate(chunks_info): | ||||||
|  |             print(f"  Chunk {i+1}: ID={chunk_info['id']}, Positions={chunk_info['positions']}") | ||||||
|                  |                  | ||||||
|         return chunks_info |         return chunks_info | ||||||
|          |          | ||||||
| @@ -475,12 +494,13 @@ def get_positions_from_chunk(pdf_path, chunks_info): | |||||||
|         print(f"获取PDF文本位置信息时出错: {str(e)}") |         print(f"获取PDF文本位置信息时出错: {str(e)}") | ||||||
|         # 出错时为每个chunk添加空的位置信息 |         # 出错时为每个chunk添加空的位置信息 | ||||||
|         for chunk_info in chunks_info: |         for chunk_info in chunks_info: | ||||||
|  |             # 确保 chunk_info 包含 'positions' 键 | ||||||
|  |             if 'positions' not in chunk_info: | ||||||
|                 chunk_info['positions'] = [] |                 chunk_info['positions'] = [] | ||||||
|         return chunks_info |         return chunks_info | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset): | def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset): | ||||||
|     """处理PDF-TXT文件对""" |     """处理PDF-TXT文件对""" | ||||||
|     for name, pdf_path in pdf_dict.items(): |     for name, pdf_path in pdf_dict.items(): | ||||||
| @@ -493,6 +513,8 @@ def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset): | |||||||
|         txt_path = txt_dict.get(name) |         txt_path = txt_dict.get(name) | ||||||
|         if txt_path: |         if txt_path: | ||||||
|             chunks_info=process_txt_chunks(dataset.id,document, txt_path) |             chunks_info=process_txt_chunks(dataset.id,document, txt_path) | ||||||
|  |  | ||||||
|  |             time.sleep(1) | ||||||
|             if  chunks_info: |             if  chunks_info: | ||||||
|                 chunks_info=get_positions_from_chunk(pdf_path, chunks_info) |                 chunks_info=get_positions_from_chunk(pdf_path, chunks_info) | ||||||
|                 for chunk_info in chunks_info: |                 for chunk_info in chunks_info: | ||||||
|   | |||||||
| @@ -161,9 +161,10 @@ def find_text_in_pdf(pdf_path, | |||||||
|                 if matched_lines: |                 if matched_lines: | ||||||
|                     _, merged_bbox = _merge_lines(matched_lines) |                     _, merged_bbox = _merge_lines(matched_lines) | ||||||
|                     results.append({ |                     results.append({ | ||||||
|                         "page": p + 1, |                         "page": p, | ||||||
|                         "bbox": merged_bbox, |                         "bbox": merged_bbox, | ||||||
|                         "matched_text": matched_text |                         "matched_text": matched_text, | ||||||
|  |                         "position_int":[p, merged_bbox[0], merged_bbox[2], merged_bbox[1], merged_bbox[3]] | ||||||
|                     }) |                     }) | ||||||
|             if results: |             if results: | ||||||
|                 batch_results[idx].extend(results) |                 batch_results[idx].extend(results) | ||||||
| @@ -206,6 +207,7 @@ def highlight_matches(pdf_path, matches, output_path="highlighted.pdf"): | |||||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||||
|     pdf_path = 'e:\\2\\2024深化智慧城市发展推进城市全域数字化转型的指导意见.pdf' |     pdf_path = 'e:\\2\\2024深化智慧城市发展推进城市全域数字化转型的指导意见.pdf' | ||||||
|     pdf_path = 'G:\\SynologyDrive\\大模型\\RAG\\20250805党建\\中国共产党领导干部廉洁从业若干准则.pdf' |     pdf_path = 'G:\\SynologyDrive\\大模型\\RAG\\20250805党建\\中国共产党领导干部廉洁从业若干准则.pdf' | ||||||
|  |     pdf_path ="F:\\Synology_nas\\SynologyDrive\\大模型\\RAG\\20250805党建\\中国共产党领导干部廉洁从业若干准则.pdf" | ||||||
|     query = [ |     query = [ | ||||||
|         '''一、总体要求 |         '''一、总体要求 | ||||||
| 以习近平新时代中国特色社会主义思想为指导,完整、准确、全面贯彻新发展理念,统筹发展和安全,充分发挥数据的基础资源和创新引擎作用,整体性重塑智慧城市技术架构、系统性变革城市管理流程、一体化推动产城深度融合,全面提升城市全域数字化转型的整体性、系统性、协同性,不断满足人民日益增长的美好生活需要,为全面建设社会主义现代化国家提供强大动力。到2027年,全国城市全域数字化转型取得明显成效,形成一批横向打通、纵向贯通、各具特色的宜居、韧性、智慧城市,有力支撑数字中国建设。到2030年,全国城市全域数字化转型全面突破,人民群众的获得感、幸福感、安全感全面提升,涌现一批数字文明时代具有全球竞争力的中国式现代化城市。''', | 以习近平新时代中国特色社会主义思想为指导,完整、准确、全面贯彻新发展理念,统筹发展和安全,充分发挥数据的基础资源和创新引擎作用,整体性重塑智慧城市技术架构、系统性变革城市管理流程、一体化推动产城深度融合,全面提升城市全域数字化转型的整体性、系统性、协同性,不断满足人民日益增长的美好生活需要,为全面建设社会主义现代化国家提供强大动力。到2027年,全国城市全域数字化转型取得明显成效,形成一批横向打通、纵向贯通、各具特色的宜居、韧性、智慧城市,有力支撑数字中国建设。到2030年,全国城市全域数字化转型全面突破,人民群众的获得感、幸福感、安全感全面提升,涌现一批数字文明时代具有全球竞争力的中国式现代化城市。''', | ||||||
| @@ -271,7 +273,7 @@ if __name__ == "__main__": | |||||||
|     # 1. 找跨行正则匹配 |     # 1. 找跨行正则匹配 | ||||||
|     matches = find_text_in_pdf( |     matches = find_text_in_pdf( | ||||||
|         pdf_path, |         pdf_path, | ||||||
|         query,  # 你的正则 |         query,   | ||||||
|         threshold=60 |         threshold=60 | ||||||
|          |          | ||||||
|     ) |     ) | ||||||
| @@ -284,7 +286,7 @@ if __name__ == "__main__": | |||||||
|          |          | ||||||
|         #highlight_matches(pdf_path, query_matches, "example_highlighted.pdf") |         #highlight_matches(pdf_path, query_matches, "example_highlighted.pdf") | ||||||
|         for m in query_matches: |         for m in query_matches: | ||||||
|             print(f"第 {m['page']} 页 匹配: {m['matched_text'][:50]}... 位置: {m['bbox']}") |             print(f"第 {m['page']} 页 匹配: {m['matched_text'][:50]}... 位置: {m['bbox']}, 位置_int: {m['position_int']}") | ||||||
|         print("------------------")    |         print("------------------")    | ||||||
|             |             | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user