更新文档处理逻辑，添加从文件读取文本块并上传至知识库的功能
This commit is contained in:
		
							
								
								
									
										40
									
								
								chunk.py
									
									
									
									
									
								
							
							
						
						
									
										40
									
								
								chunk.py
									
									
									
									
									
								
							| @@ -1,28 +1,48 @@ | |||||||
| from ragflow_sdk import RAGFlow | from ragflow_sdk import RAGFlow | ||||||
|  |  | ||||||
| api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT" | #api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT" | ||||||
| base_url = "http://192.168.107.165:8099" | #base_url = "http://192.168.107.165:8099" | ||||||
|  | base_url = "http://localhost" | ||||||
|  | api_key = "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj" | ||||||
|  |  | ||||||
| rag_object = RAGFlow(api_key=api_key, base_url=base_url) | rag_object = RAGFlow(api_key=api_key, base_url=base_url) | ||||||
| #dataset = rag_object.create_dataset(name="kb_1") | #dataset = rag_object.create_dataset(name="kb_1") | ||||||
|  |  | ||||||
|  | datasets = rag_object.list_datasets() | ||||||
|  | #dataset = rag_object.list_datasets(name="kb_1") | ||||||
|  | dataset = rag_object.list_datasets(name="制度") | ||||||
|  |  | ||||||
| dataset = rag_object.list_datasets(name="kb_1") |  | ||||||
| dataset = dataset[0] | dataset = dataset[0] | ||||||
|  |  | ||||||
| # filename1 = "ragflow.txt" | # filename1 = "ragflow.txt" | ||||||
| # blob = open(filename1 , "rb").read() | # blob = open(filename1 , "rb").read() | ||||||
| # dataset.upload_documents([{"display_name":filename1,"blob":blob}]) | # dataset.upload_documents([{"display_name":filename1,"blob":blob}]) | ||||||
| for doc in dataset.list_documents( page=0, page_size=12): | # for doc in dataset.list_documents( page=0, page_size=12): | ||||||
|     print(doc) | #     print(doc) | ||||||
|     print("=========================================") | #     print("=========================================") | ||||||
|  |  | ||||||
|  |  | ||||||
| doc = dataset.list_documents(name= 'ragflow.txt') | doc = dataset.list_documents(name= '科技创新管理办法(试行).pdf') | ||||||
| doc = doc[0] | doc = doc[0] | ||||||
| doc.update({"parser_config": {"chunk_token_count": 256}}) | # doc.update({"parser_config": {"chunk_token_count": 256}}) | ||||||
| chunk = doc.add_chunk(content="xxxxxxx") | file_path ="G:\\11\\ragflow_api_test\\1.txt" | ||||||
| print(doc) | with open(file_path, 'r', encoding='utf-8') as file: | ||||||
|  |     file_content = file.read() | ||||||
|  | for num,txt_chunk in enumerate(file_content.split('\n\n')): | ||||||
|  |     print(f"处理文本块: {txt_chunk[:30]}...")  # 打印前30个字符以示例 | ||||||
|  |  | ||||||
|  |     if txt_chunk.strip():  # 确保不是空行 | ||||||
|  |         chunk = doc.add_chunk(content=txt_chunk) | ||||||
|  |         print(f"第{num+1} Chunk添加成功! ID: {chunk.id}") | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # content = ''' | ||||||
|  | # 第二章 部门职责>第六条 【财务管理部】 | ||||||
|  | # (一) 配合投标相关费用的办理工作; | ||||||
|  | # (二) 负责提供投标所需的相关财务资料。''' | ||||||
|  | # chunk = doc.add_chunk(content=content) | ||||||
|  | # print(f"Chunk添加成功! ID: {chunk.id}") | ||||||
| #dataset.upload_documents([{"display_name": "1.txt", "blob": open('1.txt',"rb").read()}]) | #dataset.upload_documents([{"display_name": "1.txt", "blob": open('1.txt',"rb").read()}]) | ||||||
|  |  | ||||||
| # 查询所有知识库 | # 查询所有知识库 | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user