Compare commits: 587305f070 ... 5b940d5070
3 commits: 5b940d5070, 68444ad7ff, bc2aac4eea
chunk.py  40
@@ -1,28 +1,48 @@
 from ragflow_sdk import RAGFlow
 
-api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
-base_url = "http://192.168.107.165:8099"
+#api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
+#base_url = "http://192.168.107.165:8099"
+base_url = "http://localhost:8099"
+api_key = "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj"
 
 rag_object = RAGFlow(api_key=api_key, base_url=base_url)
 #dataset = rag_object.create_dataset(name="kb_1")
 
-dataset = rag_object.list_datasets(name="kb_1")
+datasets = rag_object.list_datasets()
+#dataset = rag_object.list_datasets(name="kb_1")
+dataset = rag_object.list_datasets(name="制度")
 dataset = dataset[0]
 
 # filename1 = "ragflow.txt"
 # blob = open(filename1 , "rb").read()
 # dataset.upload_documents([{"display_name":filename1,"blob":blob}])
-for doc in dataset.list_documents( page=0, page_size=12):
-    print(doc)
-    print("=========================================")
+# for doc in dataset.list_documents( page=0, page_size=12):
+#     print(doc)
+#     print("=========================================")
 
 
-doc = dataset.list_documents(name= 'ragflow.txt')
+doc = dataset.list_documents(name= '科技创新管理办法(试行).pdf')
 doc = doc[0]
-doc.update({"parser_config": {"chunk_token_count": 256}})
-chunk = doc.add_chunk(content="xxxxxxx")
-print(doc)
+# doc.update({"parser_config": {"chunk_token_count": 256}})
+file_path = "G:\\11\\ragflow_api_test\\1.txt"
+with open(file_path, 'r', encoding='utf-8') as file:
+    file_content = file.read()
+
+for num, txt_chunk in enumerate(file_content.split('\n\n')):
+    print(f"Processing text chunk: {txt_chunk[:30]}...")  # show the first 30 characters as a sample
+
+    if txt_chunk.strip():  # make sure it is not an empty line
+        chunk = doc.add_chunk(content=txt_chunk)
+        print(f"Chunk {num+1} added successfully! ID: {chunk.id}")
+
+
+# content = '''
+# 第二章 部门职责>第六条 【财务管理部】
+# (一) 配合投标相关费用的办理工作;
+# (二) 负责提供投标所需的相关财务资料。'''
+# chunk = doc.add_chunk(content=content)
+# print(f"Chunk added successfully! ID: {chunk.id}")
 #dataset.upload_documents([{"display_name": "1.txt", "blob": open('1.txt',"rb").read()}])
 
 # List all knowledge bases
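Taken together, the new chunk.py replaces the single hard-coded add_chunk call with blank-line ('\n\n') paragraph chunking: read a text file, split it on empty lines, and push each non-empty block through doc.add_chunk. Below is a minimal, cleaned-up sketch of that flow, using only SDK calls that appear in this diff; the key, URL, file name, and dataset/document names are placeholders.

from ragflow_sdk import RAGFlow

# Placeholders -- substitute your own deployment values.
rag_object = RAGFlow(api_key="ragflow-<your-key>", base_url="http://localhost:8099")

# list_datasets/list_documents filter by name and return lists; take the first hit.
dataset = rag_object.list_datasets(name="<dataset-name>")[0]
doc = dataset.list_documents(name="<document-name>")[0]

with open("chunks.txt", "r", encoding="utf-8") as f:
    text = f.read()

# One chunk per blank-line-separated paragraph; skip empty blocks.
for num, block in enumerate(text.split("\n\n")):
    if block.strip():
        chunk = doc.add_chunk(content=block)
        print(f"Chunk {num + 1} added, ID: {chunk.id}")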
@@ -1,20 +1,50 @@
 from tkinter import Tk, StringVar, Label, OptionMenu, Button, filedialog
 from ragflow_sdk import RAGFlow
 
-api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
-base_url = "http://192.168.107.165:8099"
+base_url = "http://127.0.0.1"
+api_key = "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj"
 
 rag_object = RAGFlow(api_key=api_key, base_url=base_url)
 
 def add_chunk_to_document():
-    dataset_id = dataset_var.get()
-    document_id = document_var.get()
-    file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
-
-    if file_path:
-        with open(file_path, 'r') as file:
+    try:
+        dataset_name = dataset_var.get()
+        document_name = document_var.get()
+        if not dataset_name or not document_name:
+            print("Please select a dataset and a document!")
+            return
+
+        file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
+        if not file_path:
+            print("No file selected")
+            return
+
+        with open(file_path, 'r', encoding='utf-8') as file:
             content = file.read()
-        rag_object.add_chunk(dataset_id, document_id, content)
+
+        datasets = rag_object.list_datasets(name=dataset_name)
+        if not datasets:
+            print(f"Dataset '{dataset_name}' does not exist!")
+            return
+        dataset = datasets[0]
+
+        documents = dataset.list_documents(name=document_name)
+        if not documents:
+            print(f"Document '{document_name}' does not exist!")
+            return
+        document = documents[0]
+
+        content="test chunk"
+        chunk = document.add_chunk(content=content)
+        print(f"Chunk added successfully! ID: {chunk.id}")
+
+    except Exception as e:
+        print(f"Error while adding chunk: {e}")
+        import traceback
+        traceback.print_exc()
 
 
 def update_documents(*args):
     dataset_name = dataset_var.get()

@@ -28,11 +58,15 @@ def update_documents(*args):
 root = Tk()
 root.title("Add Chunk to Document")
 
-dataset_var = StringVar(root)
-document_var = StringVar(root)
+# Set default values when initializing the variables
 
 datasets = rag_object.list_datasets()
-dataset_menu = OptionMenu(root, dataset_var, *[ds.name for ds in datasets], command=update_documents)
+dataset_names = [ds.name for ds in datasets]
+dataset_var = StringVar(root)
+dataset_var.set(dataset_names[0] if dataset_names else "")
+document_var = StringVar(root)
+document_var.set("")
+
+dataset_menu = OptionMenu(root, dataset_var, *dataset_names, command=update_documents)
 dataset_menu.pack()
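The second hunk of this file addresses a tkinter initialization-order issue: OptionMenu(root, dataset_var, *dataset_names, ...) unpacks the name list positionally, so the names must be fetched, and the StringVars created and given defaults, before the menu widget is built; with an empty list the unpack leaves OptionMenu's required value argument missing and raises TypeError. A minimal sketch of the pattern, independent of RAGFlow (the names below are stand-ins):

from tkinter import Tk, StringVar, OptionMenu

root = Tk()
names = ["kb_1", "kb_2"]  # stand-in for [ds.name for ds in datasets]

var = StringVar(root)
var.set(names[0] if names else "")  # default selection before the menu is built

# OptionMenu needs at least one positional value; guard the empty case.
menu = OptionMenu(root, var, *(names or [""]))
menu.pack()
root.mainloop()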
src/add_chunk_cli.py  142  (new file)

@@ -0,0 +1,142 @@
+from ragflow_sdk import RAGFlow
+import os
+
+## home
+api_key = "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj"
+base_url = "http://127.0.0.1:8099"
+
+## company intranet
+# base_url = "https://ddyy.iepose.cn"
+# api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
+rag_object = RAGFlow(api_key=api_key, base_url=base_url)
+
+
+def choose_from_list(options, prompt):
+    for idx, item in enumerate(options):
+        print(f"{idx + 1}. {item}")
+    while True:
+        choice = input(prompt)
+        if choice.isdigit() and 1 <= int(choice) <= len(options):
+            return options[int(choice) - 1]
+        else:
+            print("Invalid input, please enter a valid number.")
+
+
+def select_files(file_path, file_type="pdf"):
+    """
+    Collect every file of the given type (pdf by default) under file_path;
+    returns a list of file paths.
+    """
+    file_list = []
+    for root, dirs, files in os.walk(file_path):
+        for file in files:
+            if file.lower().endswith(f".{file_type.lower()}"):
+                file_list.append(os.path.join(root, file))
+    return file_list
+
+
+def pair_pdf_and_txt(pdf_path, txt_path):
+    """
+    Align pdf files with txt files.
+    Returns the aligned pdf_dict and txt_dict, keyed by file name
+    (without extension) with the file path as the value.
+    txt_dict only includes names that also exist in pdf_dict;
+    a pdf whose name has no matching txt file is left out of both dicts.
+    """
+    pdf_files = select_files(pdf_path, "pdf")
+    txt_files = select_files(txt_path, "txt")
+
+    # Build file-name-to-path mappings
+    pdf_dict = {os.path.splitext(os.path.basename(f))[0]: f for f in pdf_files}
+    txt_dict_all = {os.path.splitext(os.path.basename(f))[0]: f for f in txt_files}
+
+    # Keep only the pdfs that have a matching txt
+    pdf_dict_aligned = {}
+    txt_dict_aligned = {}
+    for name in pdf_dict:
+        if name in txt_dict_all:
+            pdf_dict_aligned[name] = pdf_dict[name]
+            txt_dict_aligned[name] = txt_dict_all[name]
+
+    return pdf_dict_aligned, txt_dict_aligned
+
+
+def main():
+    file_path = "g:\\11\\22\\"
+    pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
+
+    if not pdf_dict:
+        print("No files selected.")
+        return
+    # Choose a dataset
+    datasets = rag_object.list_datasets()
+    if not datasets:
+        print("No datasets available.")
+        return
+    dataset_names = [ds.name for ds in datasets]
+    dataset_name = choose_from_list(dataset_names, "Please enter a dataset number: ")
+    dataset = [ds for ds in datasets if ds.name == dataset_name][0]
+
+    # # Choose a document
+    # documents = dataset.list_documents()
+    # if not documents:
+    #     print("This dataset has no documents.")
+    #     return
+    # document_names = [doc.name for doc in documents]
+    # document_name = choose_from_list(document_names, "Please enter a document number: ")
+    # document = [doc for doc in documents if doc.name == document_name][0]
+
+    # Upload every pdf file to the dataset
+    for name, pdf_path in pdf_dict.items():
+        display_name = os.path.basename(pdf_path)
+        document = None
+        try:
+            document = dataset.list_documents(name=display_name)[0]
+            print(f"Document already exists: {display_name}, skipping upload.")
+        except Exception as e:
+            print(f"{display_name} does not exist yet")
+        if not document:
+            try:
+                with open(pdf_path, "rb") as f:
+                    blob = f.read()
+                dataset.upload_documents([{"display_name": display_name, "blob": blob}])
+                document = dataset.list_documents(name=display_name)[0]
+                print(f"Uploaded PDF: {pdf_path}")
+            except Exception as e:
+                print(f"Failed to upload PDF: {pdf_path}, error: {e}")
+                continue
+
+        # Add the txt content to the document as chunks
+        txt_path = txt_dict.get(name)
+        if txt_path:
+            try:
+                with open(txt_path, 'r', encoding='utf-8') as file:
+                    file_content = file.read()
+            except Exception as e:
+                print(f"Failed to read file: {txt_path}, error: {e}")
+                continue
+
+            try:
+                for num, txt_chunk in enumerate(file_content.split('\n\n')):
+                    print(f"Processing text chunk: {txt_chunk[:30]}...")  # show the first 30 characters as a sample
+
+                    if txt_chunk.strip():  # make sure it is not an empty block
+                        chunk = document.add_chunk(content=txt_chunk)
+                        print(f"Chunk {num+1} added successfully! ID: {chunk.id}")
+
+                # chunk = document.add_chunk(content=content)
+                # print(f"Chunk added successfully! file: {txt_path}, ID: {chunk.id}")
+            except Exception as e:
+                print(f"Error while adding chunks: {txt_path}, error: {e}")
+
+
+if __name__ == "__main__":
+    main()
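For reference, here is the pairing contract of pair_pdf_and_txt under a hypothetical directory layout (the file names below are made up):

# Suppose g:\11\22\ contains a.pdf, a.txt, and b.pdf (no b.txt).
pdf_dict, txt_dict = pair_pdf_and_txt("g:\\11\\22\\", "g:\\11\\22\\")
# pdf_dict == {"a": "g:\\11\\22\\a.pdf"}
# txt_dict == {"a": "g:\\11\\22\\a.txt"}
# "b" is dropped from both dicts because it has no matching txt file.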