更新 Elasticsearch 和 MinIO 配置,重构文件上传逻辑,添加图片链接处理功能
This commit is contained in:
		| @@ -2,7 +2,7 @@ from elasticsearch import Elasticsearch | ||||
|  | ||||
| # 初始化 Elasticsearch   用户名elastic,密码infini_rag_flow | ||||
| es = Elasticsearch( | ||||
|     [{'host': '192.168.107.165', 'port': 1200, 'scheme': 'http'}], | ||||
|     [{'host': '127.0.0.1', 'port': 1200, 'scheme': 'http'}], | ||||
|     basic_auth=('elastic', 'infini_rag_flow') | ||||
| ) | ||||
|  | ||||
| @@ -118,12 +118,14 @@ if __name__ == "__main__": | ||||
|         print("连接失败:", e) | ||||
|  | ||||
|     tenant_id = "d669205e57a211f0b9e7324e7f243034" | ||||
|     dataset_id = "10345832587311f0919f3a2728512a4b"  #  dataset_id = kb_id | ||||
|     tenant_id = "9c73df5a3ebc11f08410c237296aa408" | ||||
|     dataset_id = "0e6127da574a11f0a59c7e7439a490f8"  #  dataset_id = kb_id | ||||
|     doc_id = "cbf576385bc911f08f23fedc3996e479" | ||||
|     doc_id = "4300e558609511f08b8bde4a87f78768"   | ||||
|     doc_id = "323113d8670c11f0b4255ea1d23c381a"   | ||||
|     chunk_id = "f035247f7de579b0"  #   | ||||
|     chunk_id = "5b3445f7861ff772"  #  | ||||
|     chunk_id = "b2d53baddbfde97c"  #  | ||||
|     new_img_id =    "10345832587311f0919f3a2728512a4b-f035247f7de579b0" #"new_img_id_12345" | ||||
|     new_img_id =  "0e6127da574a11f0a59c7e7439a490f8-b2d53baddbfde97c" | ||||
|     #new_img_id ="c5142bce5ac611f0ae707a8b5ba029cb-thumbnail_fb3cbc165ac611f0b5897a8b5ba029cb.png" | ||||
|  | ||||
|  | ||||
|   | ||||
							
								
								
									
										108
									
								
								minio_api.py
									
									
									
									
									
								
							
							
						
						
									
										108
									
								
								minio_api.py
									
									
									
									
									
								
							| @@ -2,7 +2,7 @@ from minio import Minio | ||||
| from minio.error import S3Error | ||||
| import os | ||||
| from datetime import timedelta | ||||
| MINIO_HOST=os.getenv("MINIO_HOST", "127.0.0.1") | ||||
| MINIO_HOST="127.0.0.1" | ||||
|  | ||||
| MINIO_CONFIG = { | ||||
|     "endpoint": f"{MINIO_HOST}:{os.getenv('MINIO_PORT', '9000')}", | ||||
| @@ -37,35 +37,6 @@ def get_minio_client(): | ||||
| #         'data': upload_result['data'] | ||||
| #     }) | ||||
|  | ||||
| def upload_files_to_server(files, parent_id=None, user_id=None): | ||||
|     """ | ||||
|     上传文件到MinIO服务器 | ||||
|     :param files: 文件路径列表 | ||||
|     :param parent_id: 父级ID(可选) | ||||
|     :param user_id: 用户ID(可选) | ||||
|     """ | ||||
|     for file_path in files: | ||||
|         if not os.path.isfile(file_path): | ||||
|             print(f"文件不存在: {file_path}") | ||||
|             continue | ||||
|          | ||||
|         # 获取文件名 | ||||
|         file_name = os.path.basename(file_path) | ||||
|          | ||||
|         # 上传文件到MinIO | ||||
|         try: | ||||
|             with open(file_path, 'rb') as file_data: | ||||
|                 minio_client.put_object ( | ||||
|                     bucket_name=parent_id, | ||||
|                     object_name=file_name, | ||||
|                     data=file_data, | ||||
|                     length=os.path.getsize(file_path) | ||||
|                 ) | ||||
|             print(f"文件已上传到MinIO: {parent_id}/{file_name}") | ||||
|         except S3Error as e: | ||||
|             print(f"上传文件 '{file_name}' 失败: {e}") | ||||
|         except Exception as e: | ||||
|             print(f"发生错误: {e}") | ||||
|  | ||||
|  | ||||
|  | ||||
| @@ -81,48 +52,55 @@ def upload_files_to_server(files, parent_id=None, user_id=None): | ||||
| #     secure=False  # 使用HTTPS(如果是本地测试且未配置SSL,可设置为False) | ||||
| # ) | ||||
|  | ||||
| minio_client= get_minio_client() | ||||
| def upload_file2minio(bucket_name, object_name, file_path): | ||||
|     """上传文件到MinIO | ||||
|     # 通过fput_object上传时: | ||||
|  | ||||
|     # 如果object_name为image\image.jpg,则上传后的名字就是image\image.jpg; | ||||
|  | ||||
| # 要上传的存储桶信息 | ||||
| bucket_name = "my-bucket"  # 替换为你的存储桶名称 | ||||
| object_name = "image/1.jpg"  # 文件在MinIO中存储的名称 | ||||
| file_path = "G:\\11\\ragflow_api_test\\2.jpg"  # 本地文件路径 | ||||
|  | ||||
|  | ||||
| # 通过fput_object上传时: | ||||
|  | ||||
| # 如果object_name为image\image.jpg,则上传后的名字就是image\image.jpg; | ||||
|  | ||||
| # 如果object_name为image/image.jpg,则上传后image为文件夹,文件名为image.jpg; | ||||
|  | ||||
|  | ||||
| try: | ||||
|     # 检查存储桶是否存在,如果不存在则创建(可选) | ||||
|     if not minio_client.bucket_exists(bucket_name): | ||||
|         minio_client.make_bucket(bucket_name) | ||||
|         print(f"Bucket '{bucket_name}' created") | ||||
|     # 如果object_name为image/image.jpg,则上传后image为文件夹,文件名为image.jpg; | ||||
|      | ||||
|     # 上传文件 | ||||
|     minio_client.fput_object( | ||||
|         bucket_name=bucket_name, | ||||
|         object_name=object_name, | ||||
|         file_path=file_path | ||||
|     ) | ||||
|     """ | ||||
|  | ||||
|     # 获取文件的预签名URL(可选) | ||||
|     res = minio_client.get_presigned_url("GET", bucket_name, object_name, expires=timedelta(days=7)) | ||||
|     minio_client= get_minio_client() | ||||
|  | ||||
|     try: | ||||
|         # 检查存储桶是否存在,如果不存在则创建(可选) | ||||
|         if not minio_client.bucket_exists(bucket_name): | ||||
|             minio_client.make_bucket(bucket_name) | ||||
|             print(f"Bucket '{bucket_name}' created") | ||||
|          | ||||
|         # 上传文件 | ||||
|         minio_client.fput_object( | ||||
|             bucket_name=bucket_name, | ||||
|             object_name=object_name, | ||||
|             file_path=file_path | ||||
|         ) | ||||
|  | ||||
|         # 获取文件的预签名URL(可选) | ||||
|         #res = minio_client.get_presigned_url("GET", bucket_name, object_name, expires=timedelta(days=7)) | ||||
|  | ||||
|         #res = "http://127.0.0.1:9000" + "/"+bucket_name+"/"  + object_name | ||||
|  | ||||
|          | ||||
|         #print(res) | ||||
|         print(f"文件 '{file_path}' 成功上传到存储桶 '{bucket_name}' 为 '{object_name}'") | ||||
|          | ||||
|     except S3Error as exc: | ||||
|         print("MinIO错误:", exc) | ||||
|     except Exception as e: | ||||
|         print("发生错误:", e) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|         # 要上传的存储桶信息 | ||||
|     bucket_name = "my-bucket"  # 替换为你的存储桶名称 | ||||
|     object_name = "image/1.jpg"  # 文件在MinIO中存储的名称 | ||||
|     file_path = "G:\\11\\ragflow_api_test\\2.jpg"  # 本地文件路径 | ||||
|     upload_file2minio(bucket_name, object_name, file_path) | ||||
|  | ||||
|     #res = "http://127.0.0.1:9000" + "/"+bucket_name+"/"  + object_name | ||||
|  | ||||
|      | ||||
|     print(res) | ||||
|     print(f"文件 '{file_path}' 成功上传到存储桶 '{bucket_name}' 为 '{object_name}'") | ||||
|      | ||||
| except S3Error as exc: | ||||
|     print("MinIO错误:", exc) | ||||
| except Exception as e: | ||||
|     print("发生错误:", e) | ||||
|  | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -7,10 +7,148 @@ base_url = "http://127.0.0.1:8099" | ||||
|  | ||||
|  | ||||
| ## 公司内网 | ||||
| base_url = "http://192.168.107.165:8099" | ||||
| api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT" | ||||
| # base_url = "http://192.168.107.165:8099" | ||||
| # api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT" | ||||
|  | ||||
|  | ||||
| elastic_tenant_id = "9c73df5a3ebc11f08410c237296aa408" | ||||
|  | ||||
| rag_object = RAGFlow(api_key=api_key, base_url=base_url) | ||||
|  | ||||
| elastic_url = "127.0.0.1" | ||||
|  | ||||
| from elasticsearch import Elasticsearch | ||||
|  | ||||
| # 初始化 Elasticsearch   用户名elastic,密码infini_rag_flow | ||||
| es = Elasticsearch( | ||||
|     [{'host': elastic_url, 'port': 1200, 'scheme': 'http'}], | ||||
|     basic_auth=('elastic', 'infini_rag_flow') | ||||
| ) | ||||
|  | ||||
| def update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, new_img_id): | ||||
|     """ | ||||
|     在 Elasticsearch 中更新指定文档块的 img_id。 | ||||
|     如果img_id不存在,则增加一个新的 img_id。 | ||||
|  | ||||
|     :param tenant_id: 租户 ID | ||||
|     :param doc_id: 文档 ID | ||||
|     :param chunk_id: 文档块 ID | ||||
|     :param new_img_id: 新的 img_id | ||||
|     :return: 更新结果 | ||||
|  | ||||
|     """ | ||||
|     # 构建索引名称 | ||||
|     index_name = f"ragflow_{tenant_id}"  # 这里需要替换为实际的索引名称生成逻辑 | ||||
|  | ||||
|     # 构建查询条件 | ||||
|     query = { | ||||
|         "bool": { | ||||
|             "must": [ | ||||
|                 {"term": {"doc_id": doc_id}}, | ||||
|                 {"term": {"_id": chunk_id}} | ||||
|             ] | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     # 搜索目标文档 | ||||
|     result = es.search(index=index_name, body={"query": query}) | ||||
|  | ||||
|     # 检查是否找到目标文档 | ||||
|     if result['hits']['total']['value'] == 0: | ||||
|         return {"code": 102, "message": f"Can't find this chunk {chunk_id}"} | ||||
|      | ||||
|  | ||||
|     # 获取目标文档的 ID | ||||
|     hit = result['hits']['hits'][0] | ||||
|     doc_id_in_es = hit['_id'] | ||||
|  | ||||
|     update_body = { | ||||
|         "doc": { | ||||
|             "img_id": new_img_id | ||||
|         } | ||||
|         } | ||||
|  | ||||
|     # 更新文档 | ||||
|     update_result = es.update(index=index_name, id=doc_id_in_es, body=update_body) | ||||
|     print("更新结果:", update_result) | ||||
|      | ||||
|      | ||||
|  | ||||
|     if update_result['result'] == 'updated': | ||||
|         return {"code": 0, "message": ""} | ||||
|     else: | ||||
|         return {"code": 100, "message": "Failed to update img_id"} | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| from minio import Minio | ||||
| from minio.error import S3Error | ||||
|  | ||||
|  | ||||
| MINIO_HOST="127.0.0.1" | ||||
|  | ||||
| MINIO_CONFIG = { | ||||
|     "endpoint": f"{MINIO_HOST}:{os.getenv('MINIO_PORT', '9000')}", | ||||
|     "access_key": os.getenv("MINIO_USER", "rag_flow"), | ||||
|     "secret_key": os.getenv("MINIO_PASSWORD", "infini_rag_flow"), | ||||
|     "secure": False | ||||
| } | ||||
|  | ||||
| def get_minio_client(): | ||||
|     """创建MinIO客户端""" | ||||
|     return Minio( | ||||
|         endpoint=MINIO_CONFIG["endpoint"], | ||||
|         access_key=MINIO_CONFIG["access_key"], | ||||
|         secret_key=MINIO_CONFIG["secret_key"], | ||||
|         secure=MINIO_CONFIG["secure"] | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def upload_file2minio(bucket_name, object_name, file_path): | ||||
|     """上传文件到MinIO | ||||
|     # 通过fput_object上传时: | ||||
|  | ||||
|     # 如果object_name为image\image.jpg,则上传后的名字就是image\image.jpg; | ||||
|  | ||||
|     # 如果object_name为image/image.jpg,则上传后image为文件夹,文件名为image.jpg; | ||||
|      | ||||
|     """ | ||||
|  | ||||
|     minio_client= get_minio_client() | ||||
|  | ||||
|     try: | ||||
|         # 检查存储桶是否存在,如果不存在则创建(可选) | ||||
|         if not minio_client.bucket_exists(bucket_name): | ||||
|             minio_client.make_bucket(bucket_name) | ||||
|             print(f"Bucket '{bucket_name}' created") | ||||
|          | ||||
|         # 上传文件 | ||||
|         minio_client.fput_object( | ||||
|             bucket_name=bucket_name, | ||||
|             object_name=object_name, | ||||
|             file_path=file_path | ||||
|         ) | ||||
|  | ||||
|         # 获取文件的预签名URL(可选) | ||||
|         #res = minio_client.get_presigned_url("GET", bucket_name, object_name, expires=timedelta(days=7)) | ||||
|  | ||||
|         #res = "http://127.0.0.1:9000" + "/"+bucket_name+"/"  + object_name | ||||
|  | ||||
|          | ||||
|         #print(res) | ||||
|         print(f"文件 '{file_path}' 成功上传到存储桶 '{bucket_name}' 为 '{object_name}'") | ||||
|         return True | ||||
|          | ||||
|     except S3Error as exc: | ||||
|         print("MinIO错误:", exc) | ||||
|         return False | ||||
|     except Exception as e: | ||||
|         print("发生错误:", e) | ||||
|         return False | ||||
|  | ||||
|  | ||||
|  | ||||
| @@ -120,17 +258,29 @@ def divid_txt_chunk_img(txt_chunk): | ||||
|      | ||||
|     return clean_text, image_paths | ||||
|  | ||||
| def upload_images_to_minio(image_paths, document): | ||||
|     """ | ||||
|     上传图片到MinIO, | ||||
| def extract_images_from_chunk( content): | ||||
|     """从chunk内容中提取图片链接""" | ||||
|     img_pattern = r'!\[.*?\]\((.*?)\)' | ||||
|     return re.findall(img_pattern, content) | ||||
|  | ||||
| def remove_images_from_content( content): | ||||
|     """从内容中移除图片链接""" | ||||
|     # 移除markdown图片语法  | ||||
|     content = re.sub(r'!\[.*?\]\(.*?\)', '', content) | ||||
|     # 清理多余的空行 | ||||
|     content = re.sub(r'\n\s*\n\s*\n', '\n\n', content) | ||||
|     return content.strip() | ||||
|      | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| def process_txt_chunks( dataset_id, document, txt_path): | ||||
|     """处理文本分块并添加到文档 | ||||
|     dataset_id = kb_id | ||||
|      | ||||
|      | ||||
|     """ | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| def process_txt_chunks(document, txt_path): | ||||
|     """处理文本分块并添加到文档""" | ||||
|     try: | ||||
|         with open(txt_path, 'r', encoding='utf-8') as file: | ||||
|             file_content = file.read() | ||||
| @@ -138,23 +288,51 @@ def process_txt_chunks(document, txt_path): | ||||
|         for num, txt_chunk in enumerate(file_content.split('\n\n')): | ||||
|             if txt_chunk.strip(): | ||||
|                 print(f"处理文本块: {txt_chunk[:30]}...") | ||||
|                 chunk = document.add_chunk(content=txt_chunk) | ||||
|                 img_urls= extract_images_from_chunk(txt_chunk) | ||||
|                 img_url = img_urls[0] if img_urls else None | ||||
|                 if img_url: | ||||
|                     print(f"检测到图片链接: {img_url}") | ||||
|                     # 清楚图片链接 | ||||
|                     clean_chunk = remove_images_from_content(txt_chunk) | ||||
|                     chunk = document.add_chunk(content=clean_chunk) | ||||
|                      | ||||
|                     # 判断是相对路径还是绝对路径 | ||||
|                     if not os.path.isabs(img_url): | ||||
|                         img_abs_path = os.path.join(os.path.dirname(txt_path), img_url) | ||||
|                     else: | ||||
|                         img_abs_path = img_url | ||||
|                         print(f"图片绝对路径: {img_abs_path}") | ||||
|                     if not os.path.exists(img_abs_path): | ||||
|                         print(f"图片未找到: {img_abs_path},跳过。") | ||||
|                         continue | ||||
|                     else: | ||||
|                         if(upload_file2minio(dataset_id, chunk.id, img_abs_path)): | ||||
|                             new_img_id = f"{dataset_id}-{chunk.id}" | ||||
|                             print(f"图片 {img_abs_path} 已上传,新的 img_id: {new_img_id}") | ||||
|  | ||||
|                             update_img_id_in_elasticsearch(elastic_tenant_id, document.id, chunk.id, new_img_id) | ||||
|                 else: | ||||
|                     print("未检测到图片链接,直接添加文本块。") | ||||
|                     chunk = document.add_chunk(content=txt_chunk) | ||||
|                 print(f"第{num+1} Chunk添加成功! ID: {chunk.id}") | ||||
|                  | ||||
|     except Exception as e: | ||||
|         print(f"处理文本文件时出错: {txt_path},错误: {e}") | ||||
|  | ||||
|  | ||||
|  | ||||
| def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset): | ||||
|     """处理PDF-TXT文件对""" | ||||
|     for name, pdf_path in pdf_dict.items(): | ||||
|         display_name = os.path.basename(pdf_path) | ||||
|         document = upload_or_get_document(dataset, pdf_path, display_name) | ||||
|         print(f"选择的文档: {document.name},ID: {document.id}") | ||||
|         if not document: | ||||
|             continue | ||||
|              | ||||
|         txt_path = txt_dict.get(name) | ||||
|         if txt_path: | ||||
|             process_txt_chunks(document, txt_path) | ||||
|             process_txt_chunks(dataset.id,document, txt_path) | ||||
|  | ||||
| def main(): | ||||
|  | ||||
| @@ -163,12 +341,12 @@ def main(): | ||||
|     dataset.id = bucket_name | ||||
|     chunk_id = object_name | ||||
|     """ | ||||
|     file_path = "g:\\11\\22\\规范\\" | ||||
|     #pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path) | ||||
|     file_path = "g:\\11\\22\\test\\" | ||||
|     pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path) | ||||
|      | ||||
|     # if not pdf_dict: | ||||
|     #     print("未选择任何文件。") | ||||
|     #     return | ||||
|     if not pdf_dict: | ||||
|         print("未选择任何文件。") | ||||
|         return | ||||
|          | ||||
|     dataset = select_dataset(rag_object) | ||||
|     print(f"选择的数据集: {dataset.name}") | ||||
| @@ -177,7 +355,7 @@ def main(): | ||||
|         print("未选择数据集。") | ||||
|         return | ||||
|          | ||||
|     #process_pdf_txt_pairs(pdf_dict, txt_dict, dataset) | ||||
|     process_pdf_txt_pairs(pdf_dict, txt_dict, dataset) | ||||
|  | ||||
|  | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user