From 5b940d5070ae14e09e2ac3529a65a606d96ec13e Mon Sep 17 00:00:00 2001
From: glowz <24627181@qq.com>
Date: Sun, 6 Jul 2025 10:47:10 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E6=9C=AC=E5=9C=B0=E5=9C=B0?=
 =?UTF-8?q?=E5=9D=80=E9=85=8D=E7=BD=AE=EF=BC=8C=E4=BF=AE=E5=A4=8D=E8=BF=9E?=
 =?UTF-8?q?=E6=8E=A5=E9=97=AE=E9=A2=98=EF=BC=9B=E6=96=B0=E5=A2=9E=E5=91=BD?=
 =?UTF-8?q?=E4=BB=A4=E8=A1=8C=E6=8E=A5=E5=8F=A3=E4=BB=A5=E6=94=AF=E6=8C=81?=
 =?UTF-8?q?=E6=96=87=E6=A1=A3=E4=B8=8A=E4=BC=A0=E5=92=8C=E5=A4=84=E7=90=86?=
 =?UTF-8?q?=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 chunk.py             |   2 +-
 src/add_chunk_cli.py | 142 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 src/add_chunk_cli.py

diff --git a/chunk.py b/chunk.py
index 1ee2dfd..aa96b5a 100644
--- a/chunk.py
+++ b/chunk.py
@@ -2,7 +2,7 @@ from ragflow_sdk import RAGFlow
 
 #api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
 #base_url = "http://192.168.107.165:8099"
-base_url = "http://localhost"
+base_url = "http://localhost:8099"
 api_key = "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj"
 
 rag_object = RAGFlow(api_key=api_key, base_url=base_url)
diff --git a/src/add_chunk_cli.py b/src/add_chunk_cli.py
new file mode 100644
index 0000000..c78cd85
--- /dev/null
+++ b/src/add_chunk_cli.py
@@ -0,0 +1,142 @@
+from ragflow_sdk import RAGFlow
+import os
+
+## home (local deployment); RAGFLOW_API_KEY / RAGFLOW_BASE_URL override the defaults below
+api_key = os.environ.get("RAGFLOW_API_KEY") or "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj"
+base_url = os.environ.get("RAGFLOW_BASE_URL") or "http://127.0.0.1:8099"
+
+# NOTE(review): the fallback key above is committed in plain text; move it out of the source and rotate it.
+## company intranet (disabled alternative)
+# base_url = "https://ddyy.iepose.cn"
+# api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
+rag_object = RAGFlow(api_key=api_key, base_url=base_url)
+
+
+
+
+
+
+def choose_from_list(options, prompt):
+    """Print options numbered from 1 and keep prompting until a listed number is typed; return that option (options must be non-empty)."""
+    for idx, item in enumerate(options, start=1):
+        print(f"{idx}. {item}")
+    while True:
+        choice = input(prompt).strip()
+        if choice.isdecimal() and 1 <= int(choice) <= len(options):  # isdigit() also accepts "²", which int() rejects
+            return options[int(choice) - 1]
+        print("输入无效，请重新输入编号。")
+
+
+def select_files(file_path, file_type="pdf"):
+    """
+    Recursively collect the files under file_path that have the given
+    extension (case-insensitive, pdf by default) and return their paths as a list.
+    """
+    return [
+        os.path.join(folder, filename)
+        for folder, _subdirs, filenames in os.walk(file_path)
+        for filename in filenames
+        if filename.lower().endswith(f".{file_type.lower()}")
+    ]
+
+
+def pair_pdf_and_txt(pdf_path, txt_path):
+    """
+    Match pdf files with the txt files that share their base name.
+
+    Both directories are searched with select_files. Returns the tuple
+    (pdf_dict, txt_dict); each maps a file name without extension to that
+    file's path. The two dicts hold exactly the same keys: only names that
+    exist both as pdf and as txt are kept. When a base name occurs more than
+    once among the found files, the path found last wins.
+    """
+    def index_by_stem(paths):
+        # "some/dir/name.ext" is stored under the key "name".
+        return {os.path.splitext(os.path.basename(p))[0]: p for p in paths}
+
+    pdf_paths = select_files(pdf_path, "pdf")
+    txt_paths = select_files(txt_path, "txt")
+    pdf_by_stem = index_by_stem(pdf_paths)
+    txt_by_stem = index_by_stem(txt_paths)
+
+    # Keep only the names present on both sides, in pdf discovery order.
+    shared = [stem for stem in pdf_by_stem if stem in txt_by_stem]
+    paired_pdfs = {stem: pdf_by_stem[stem] for stem in shared}
+    paired_txts = {stem: txt_by_stem[stem] for stem in shared}
+    return paired_pdfs, paired_txts
+
+
+
+
+
+def main(file_path="g:\\11\\22\\"):
+    """
+    Upload each pdf that has a same-named txt to a chosen dataset and add the txt as chunks.
+
+    The pdf/txt pairs come from pair_pdf_and_txt(file_path, file_path) and the
+    dataset is picked on the console. A document already listed under the pdf's
+    file name is reused, otherwise the pdf is uploaded first. The txt is split on
+    empty lines and every non-blank piece becomes one chunk. A failure on one
+    file is printed and the loop moves on to the next file.
+
+    Args:
+        file_path: directory holding the pdf and txt files; the default is the
+            path that was previously hard-coded.
+    """
+    pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
+
+    if not pdf_dict:
+        print("未选择任何文件。")
+        return
+    # Let the user pick the target dataset.
+    datasets = rag_object.list_datasets()
+    if not datasets:
+        print("没有可用的数据集。")
+        return
+    dataset_names = [ds.name for ds in datasets]
+    dataset_name = choose_from_list(dataset_names, "请选择数据集编号：")
+    dataset = [ds for ds in datasets if ds.name == dataset_name][0]
+
+    # Make sure each pdf exists as a document in the dataset, then add its chunks.
+    for name, pdf_path in pdf_dict.items():
+        display_name = os.path.basename(pdf_path)
+        document = None
+        try:
+            document = dataset.list_documents(name=display_name)[0]
+            print(f"文档已存在: {display_name}，跳过上传。")
+        except Exception:
+            # Any lookup failure (API error or empty result) means "not uploaded yet".
+            print(f"{display_name}不存在")
+        if not document:
+            try:
+                with open(pdf_path, "rb") as f:
+                    blob = f.read()
+                dataset.upload_documents([{"display_name": display_name, "blob": blob}])
+                document = dataset.list_documents(name=display_name)[0]
+                print(f"已上传PDF: {pdf_path}")
+            except Exception as e:
+                print(f"上传PDF失败: {pdf_path}，错误: {e}")
+                continue
+
+        # Add the txt content to the document, one chunk per piece.
+        txt_path = txt_dict.get(name)
+        if txt_path:
+            try:
+                with open(txt_path, 'r', encoding='utf-8') as file:
+                    file_content = file.read()
+            except Exception as e:
+                print(f"读取文件失败: {txt_path}，错误: {e}")
+                continue
+
+            try:
+                for num, txt_chunk in enumerate(file_content.split('\n\n')):
+                    if not txt_chunk.strip():
+                        continue  # blank piece: nothing to add or report
+                    print(f"处理文本块: {txt_chunk[:30]}...")
+                    chunk = document.add_chunk(content=txt_chunk)
+                    print(f"第{num+1} Chunk添加成功! ID: {chunk.id}")
+            except Exception as e:
+                print(f"添加chunk时发生错误: {txt_path}，错误: {e}")
+
+if __name__ == "__main__":  # run the CLI only when this file is executed as a script
+    main()
\ No newline at end of file