Compare commits: 587305f070 ... 5b940d5070
3 commits: 5b940d5070, 68444ad7ff, bc2aac4eea
chunk.py  40
@@ -1,28 +1,48 @@
 from ragflow_sdk import RAGFlow
 
-api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
-base_url = "http://192.168.107.165:8099"
+#api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
+#base_url = "http://192.168.107.165:8099"
+base_url = "http://localhost:8099"
+api_key = "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj"
 
 rag_object = RAGFlow(api_key=api_key, base_url=base_url)
 #dataset = rag_object.create_dataset(name="kb_1")
 
-dataset = rag_object.list_datasets(name="kb_1")
+datasets = rag_object.list_datasets()
+#dataset = rag_object.list_datasets(name="kb_1")
+dataset = rag_object.list_datasets(name="制度")
 dataset = dataset[0]
 
 # filename1 = "ragflow.txt"
 # blob = open(filename1 , "rb").read()
 # dataset.upload_documents([{"display_name":filename1,"blob":blob}])
-for doc in dataset.list_documents( page=0, page_size=12):
-    print(doc)
-    print("=========================================")
+# for doc in dataset.list_documents( page=0, page_size=12):
+#     print(doc)
+#     print("=========================================")
 
 
-doc = dataset.list_documents(name= 'ragflow.txt')
+doc = dataset.list_documents(name= '科技创新管理办法(试行).pdf')
 doc = doc[0]
-doc.update({"parser_config": {"chunk_token_count": 256}})
-chunk = doc.add_chunk(content="xxxxxxx")
-print(doc)
+# doc.update({"parser_config": {"chunk_token_count": 256}})
+file_path = "G:\\11\\ragflow_api_test\\1.txt"
+with open(file_path, 'r', encoding='utf-8') as file:
+    file_content = file.read()
+
+for num, txt_chunk in enumerate(file_content.split('\n\n')):
+    print(f"Processing text chunk: {txt_chunk[:30]}...")  # show the first 30 characters as a sample
+
+    if txt_chunk.strip():  # make sure it is not an empty line
+        chunk = doc.add_chunk(content=txt_chunk)
+        print(f"Chunk {num+1} added successfully! ID: {chunk.id}")
+
+
+# content = '''
+# 第二章 部门职责>第六条 【财务管理部】
+# (一) 配合投标相关费用的办理工作;
+# (二) 负责提供投标所需的相关财务资料。'''
+# chunk = doc.add_chunk(content=content)
+# print(f"Chunk added successfully! ID: {chunk.id}")
 #dataset.upload_documents([{"display_name": "1.txt", "blob": open('1.txt',"rb").read()}])
 
 # List all knowledge bases
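Taken together, the new chunk.py replaces the single hard-coded add_chunk call with blank-line ('\n\n') paragraph chunking: read a text file, split it on empty lines, and push each non-empty block through doc.add_chunk. Below is a minimal, cleaned-up sketch of that flow, using only SDK calls that appear in this diff; the key, URL, file name, and dataset/document names are placeholders.

from ragflow_sdk import RAGFlow

# Placeholders -- substitute your own deployment values.
rag_object = RAGFlow(api_key="ragflow-<your-key>", base_url="http://localhost:8099")

# list_datasets/list_documents filter by name and return lists; take the first hit.
dataset = rag_object.list_datasets(name="<dataset-name>")[0]
doc = dataset.list_documents(name="<document-name>")[0]

with open("chunks.txt", "r", encoding="utf-8") as f:
    text = f.read()

# One chunk per blank-line-separated paragraph; skip empty blocks.
for num, block in enumerate(text.split("\n\n")):
    if block.strip():
        chunk = doc.add_chunk(content=block)
        print(f"Chunk {num + 1} added, ID: {chunk.id}")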
@@ -1,20 +1,50 @@
 from tkinter import Tk, StringVar, Label, OptionMenu, Button, filedialog
 from ragflow_sdk import RAGFlow
 
-api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
-base_url = "http://192.168.107.165:8099"
+base_url = "http://127.0.0.1"
+api_key = "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj"
 
 rag_object = RAGFlow(api_key=api_key, base_url=base_url)
 
 def add_chunk_to_document():
-    dataset_id = dataset_var.get()
-    document_id = document_var.get()
-    file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
-
-    if file_path:
-        with open(file_path, 'r') as file:
+    try:
+        dataset_name = dataset_var.get()
+        document_name = document_var.get()
+        if not dataset_name or not document_name:
+            print("Please select a dataset and a document!")
+            return
+
+        file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
+        if not file_path:
+            print("No file selected")
+            return
+
+        with open(file_path, 'r', encoding='utf-8') as file:
             content = file.read()
-        rag_object.add_chunk(dataset_id, document_id, content)
+
+        datasets = rag_object.list_datasets(name=dataset_name)
+        if not datasets:
+            print(f"Dataset '{dataset_name}' does not exist!")
+            return
+        dataset = datasets[0]
+
+        documents = dataset.list_documents(name=document_name)
+        if not documents:
+            print(f"Document '{document_name}' does not exist!")
+            return
+        document = documents[0]
+
+        content="test chunk"
+        chunk = document.add_chunk(content=content)
+        print(f"Chunk added successfully! ID: {chunk.id}")
+
+    except Exception as e:
+        print(f"Error while adding chunk: {e}")
+        import traceback
+        traceback.print_exc()
 
 
 def update_documents(*args):
     dataset_name = dataset_var.get()

@@ -28,11 +58,15 @@ def update_documents(*args):
 root = Tk()
 root.title("Add Chunk to Document")
 
-dataset_var = StringVar(root)
-document_var = StringVar(root)
+# Set default values when initializing the variables
 
 datasets = rag_object.list_datasets()
-dataset_menu = OptionMenu(root, dataset_var, *[ds.name for ds in datasets], command=update_documents)
+dataset_names = [ds.name for ds in datasets]
+dataset_var = StringVar(root)
+dataset_var.set(dataset_names[0] if dataset_names else "")
+document_var = StringVar(root)
+document_var.set("")
+
+dataset_menu = OptionMenu(root, dataset_var, *dataset_names, command=update_documents)
 dataset_menu.pack()
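The second hunk of this file addresses a tkinter initialization-order issue: OptionMenu(root, dataset_var, *dataset_names, ...) unpacks the name list positionally, so the names must be fetched, and the StringVars created and given defaults, before the menu widget is built; with an empty list the unpack leaves OptionMenu's required value argument missing and raises TypeError. A minimal sketch of the pattern, independent of RAGFlow (the names below are stand-ins):

from tkinter import Tk, StringVar, OptionMenu

root = Tk()
names = ["kb_1", "kb_2"]  # stand-in for [ds.name for ds in datasets]

var = StringVar(root)
var.set(names[0] if names else "")  # default selection before the menu is built

# OptionMenu needs at least one positional value; guard the empty case.
menu = OptionMenu(root, var, *(names or [""]))
menu.pack()
root.mainloop()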
src/add_chunk_cli.py  142  (new file)

@@ -0,0 +1,142 @@
+from ragflow_sdk import RAGFlow
+import os
+
+## home
+api_key = "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj"
+base_url = "http://127.0.0.1:8099"
+
+## company intranet
+# base_url = "https://ddyy.iepose.cn"
+# api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
+rag_object = RAGFlow(api_key=api_key, base_url=base_url)
+
+
+def choose_from_list(options, prompt):
+    for idx, item in enumerate(options):
+        print(f"{idx + 1}. {item}")
+    while True:
+        choice = input(prompt)
+        if choice.isdigit() and 1 <= int(choice) <= len(options):
+            return options[int(choice) - 1]
+        else:
+            print("Invalid input, please enter a valid number.")
+
+
+def select_files(file_path, file_type="pdf"):
+    """
+    Collect every file of the given type (pdf by default) under file_path;
+    returns a list of file paths.
+    """
+    file_list = []
+    for root, dirs, files in os.walk(file_path):
+        for file in files:
+            if file.lower().endswith(f".{file_type.lower()}"):
+                file_list.append(os.path.join(root, file))
+    return file_list
+
+
+def pair_pdf_and_txt(pdf_path, txt_path):
+    """
+    Align pdf files with txt files.
+    Returns the aligned pdf_dict and txt_dict, keyed by file name
+    (without extension) with the file path as the value.
+    txt_dict only includes names that also exist in pdf_dict;
+    a pdf whose name has no matching txt file is left out of both dicts.
+    """
+    pdf_files = select_files(pdf_path, "pdf")
+    txt_files = select_files(txt_path, "txt")
+
+    # Build file-name-to-path mappings
+    pdf_dict = {os.path.splitext(os.path.basename(f))[0]: f for f in pdf_files}
+    txt_dict_all = {os.path.splitext(os.path.basename(f))[0]: f for f in txt_files}
+
+    # Keep only the pdfs that have a matching txt
+    pdf_dict_aligned = {}
+    txt_dict_aligned = {}
+    for name in pdf_dict:
+        if name in txt_dict_all:
+            pdf_dict_aligned[name] = pdf_dict[name]
+            txt_dict_aligned[name] = txt_dict_all[name]
+
+    return pdf_dict_aligned, txt_dict_aligned
+
+
+def main():
+    file_path = "g:\\11\\22\\"
+    pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
+
+    if not pdf_dict:
+        print("No files selected.")
+        return
+    # Choose a dataset
+    datasets = rag_object.list_datasets()
+    if not datasets:
+        print("No datasets available.")
+        return
+    dataset_names = [ds.name for ds in datasets]
+    dataset_name = choose_from_list(dataset_names, "Please enter a dataset number: ")
+    dataset = [ds for ds in datasets if ds.name == dataset_name][0]
+
+    # # Choose a document
+    # documents = dataset.list_documents()
+    # if not documents:
+    #     print("This dataset has no documents.")
+    #     return
+    # document_names = [doc.name for doc in documents]
+    # document_name = choose_from_list(document_names, "Please enter a document number: ")
+    # document = [doc for doc in documents if doc.name == document_name][0]
+
+    # Upload every pdf file to the dataset
+    for name, pdf_path in pdf_dict.items():
+        display_name = os.path.basename(pdf_path)
+        document = None
+        try:
+            document = dataset.list_documents(name=display_name)[0]
+            print(f"Document already exists: {display_name}, skipping upload.")
+        except Exception as e:
+            print(f"{display_name} does not exist yet")
+        if not document:
+            try:
+                with open(pdf_path, "rb") as f:
+                    blob = f.read()
+                dataset.upload_documents([{"display_name": display_name, "blob": blob}])
+                document = dataset.list_documents(name=display_name)[0]
+                print(f"Uploaded PDF: {pdf_path}")
+            except Exception as e:
+                print(f"Failed to upload PDF: {pdf_path}, error: {e}")
+                continue
+
+        # Add the txt content to the document as chunks
+        txt_path = txt_dict.get(name)
+        if txt_path:
+            try:
+                with open(txt_path, 'r', encoding='utf-8') as file:
+                    file_content = file.read()
+            except Exception as e:
+                print(f"Failed to read file: {txt_path}, error: {e}")
+                continue
+
+            try:
+                for num, txt_chunk in enumerate(file_content.split('\n\n')):
+                    print(f"Processing text chunk: {txt_chunk[:30]}...")  # show the first 30 characters as a sample
+
+                    if txt_chunk.strip():  # make sure it is not an empty block
+                        chunk = document.add_chunk(content=txt_chunk)
+                        print(f"Chunk {num+1} added successfully! ID: {chunk.id}")
+
+                # chunk = document.add_chunk(content=content)
+                # print(f"Chunk added successfully! file: {txt_path}, ID: {chunk.id}")
+            except Exception as e:
+                print(f"Error while adding chunks: {txt_path}, error: {e}")
+
+
+if __name__ == "__main__":
+    main()
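For reference, here is the pairing contract of pair_pdf_and_txt under a hypothetical directory layout (the file names below are made up):

# Suppose g:\11\22\ contains a.pdf, a.txt, and b.pdf (no b.txt).
pdf_dict, txt_dict = pair_pdf_and_txt("g:\\11\\22\\", "g:\\11\\22\\")
# pdf_dict == {"a": "g:\\11\\22\\a.pdf"}
# txt_dict == {"a": "g:\\11\\22\\a.txt"}
# "b" is dropped from both dicts because it has no matching txt file.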