"""Upload PDFs to a RAGFlow dataset and add chunks from same-named TXT files.

Recovered from a line-collapsed ``git format-patch`` mail:

    commit  5b940d5070ae14e09e2ac3529a65a606d96ec13e
    author  glowz <24627181@qq.com>
    date    Sun, 6 Jul 2025 10:47:10 +0800
    subject Update the local address configuration to fix a connection
            problem; add a command-line interface that supports uploading
            and processing documents.

That patch created this file as ``src/add_chunk_cli.py`` and, in ``chunk.py``,
changed ``base_url`` from ``"http://localhost"`` to ``"http://localhost:8099"``.

Usage: ``python add_chunk_cli.py [DIRECTORY]``
"""
import os
import sys

try:
    from ragflow_sdk import RAGFlow
except ImportError:
    # Keep the file-system helpers importable without the SDK; main() raises
    # ImportError as soon as a server connection is actually needed.
    RAGFlow = None

# Home setup. Both values can be overridden through the environment, e.g.
# RAGFLOW_BASE_URL=https://ddyy.iepose.cn together with the matching
# RAGFLOW_API_KEY for the company intranet server.
# NOTE(review): an API key is committed here as the fallback value; prefer
# supplying it through RAGFLOW_API_KEY and rotating this one.
api_key = os.environ.get("RAGFLOW_API_KEY", "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj")
base_url = os.environ.get("RAGFLOW_BASE_URL", "http://127.0.0.1:8099")

# Directory scanned when no path is given on the command line.
DEFAULT_FILE_PATH = "g:\\11\\22\\"

rag_object = RAGFlow(api_key=api_key, base_url=base_url) if RAGFlow is not None else None


def choose_from_list(options, prompt, input_fn=input):
    """Print a numbered menu and return the option the user picks.

    Args:
        options: Non-empty sequence of items to choose from.
        prompt: Text shown each time a number is requested.
        input_fn: Callable taking the prompt and returning the typed text;
            defaults to the built-in ``input``.

    Returns:
        The element of ``options`` at the chosen 1-based position.

    Raises:
        ValueError: If ``options`` is empty (no answer could ever be valid).
    """
    if not options:
        raise ValueError("options must not be empty")
    for idx, item in enumerate(options, start=1):
        print(f"{idx}. {item}")
    while True:
        choice = input_fn(prompt).strip()
        try:
            # isdecimal(), unlike isdigit(), rejects characters such as "²"
            # that int() cannot parse.
            number = int(choice) if choice.isdecimal() else 0
        except ValueError:
            # e.g. a digit string longer than int()'s conversion limit
            number = 0
        if 1 <= number <= len(options):
            return options[number - 1]
        print("输入无效,请重新输入编号。")


def select_files(file_path, file_type="pdf"):
    """Return the paths of all files of one type below a directory.

    The directory tree is walked recursively and the extension is compared
    case-insensitively.

    Args:
        file_path: Root directory to search; a missing directory yields ``[]``.
        file_type: Extension to match, with or without the leading dot.

    Returns:
        Sorted list of matching file paths.
    """
    suffix = "." + file_type.lower().lstrip(".")
    matches = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(file_path)
        for name in names
        if name.lower().endswith(suffix)
    ]
    # os.walk order depends on the file system; sort for a stable result.
    matches.sort()
    return matches


def _index_by_stem(paths):
    """Map each file name without directory and extension to its path."""
    # For duplicate stems the path that comes last in ``paths`` wins.
    return {os.path.splitext(os.path.basename(p))[0]: p for p in paths}


def pair_pdf_and_txt(pdf_path, txt_path):
    """Pair PDF files with the TXT files that share their base name.

    Args:
        pdf_path: Directory searched recursively for ``.pdf`` files.
        txt_path: Directory searched recursively for ``.txt`` files.

    Returns:
        ``(pdf_dict, txt_dict)``: two dicts with identical keys (file name
        without extension) mapping to the PDF path and the TXT path. PDFs
        without a TXT partner, and TXTs without a PDF partner, are left out.
    """
    pdf_by_name = _index_by_stem(select_files(pdf_path, "pdf"))
    txt_by_name = _index_by_stem(select_files(txt_path, "txt"))
    shared = [name for name in pdf_by_name if name in txt_by_name]
    pdf_dict_aligned = {name: pdf_by_name[name] for name in shared}
    txt_dict_aligned = {name: txt_by_name[name] for name in shared}
    return pdf_dict_aligned, txt_dict_aligned


def split_chunks(text, separator="\n\n"):
    """Split text on blank lines and drop whitespace-only pieces.

    Args:
        text: Full text to split.
        separator: Delimiter between chunks; defaults to one blank line.

    Returns:
        List of non-blank chunks, unstripped, in their original order.
    """
    return [part for part in text.split(separator) if part.strip()]


def _find_document(dataset, display_name):
    """Return the dataset's document called ``display_name``, or None."""
    try:
        matches = dataset.list_documents(name=display_name)
    except Exception:
        # NOTE(review): the SDK appears to signal an unknown name with a plain
        # Exception, so this cannot be narrowed -- confirm against the SDK.
        return None
    return matches[0] if matches else None


def _upload_pdf(dataset, pdf_path, display_name):
    """Upload one PDF and return the document the dataset lists for it."""
    with open(pdf_path, "rb") as f:
        blob = f.read()
    dataset.upload_documents([{"display_name": display_name, "blob": blob}])
    return dataset.list_documents(name=display_name)[0]


def _add_txt_chunks(document, txt_path):
    """Add every non-blank chunk of a TXT file to ``document``."""
    try:
        # utf-8-sig reads plain UTF-8 too, and keeps a leading BOM out of the
        # first chunk.
        with open(txt_path, "r", encoding="utf-8-sig") as file:
            file_content = file.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"读取文件失败: {txt_path},错误: {e}")
        return

    try:
        # The first failing chunk stops the rest of this file.
        for num, txt_chunk in enumerate(split_chunks(file_content), start=1):
            print(f"处理文本块: {txt_chunk[:30]}...")
            chunk = document.add_chunk(content=txt_chunk)
            print(f"第{num} Chunk添加成功! ID: {chunk.id}")
    except Exception as e:
        print(f"添加chunk时发生错误: {txt_path},错误: {e}")


def main(file_path=None):
    """Upload each paired PDF to a chosen dataset and add its TXT as chunks.

    Args:
        file_path: Directory holding the PDF/TXT pairs; defaults to
            ``DEFAULT_FILE_PATH``.

    Raises:
        ImportError: If files were found but ``ragflow_sdk`` is not installed.
    """
    if file_path is None:
        file_path = DEFAULT_FILE_PATH
    pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)

    if not pdf_dict:
        print("未选择任何文件。")
        return
    if rag_object is None:
        raise ImportError("ragflow_sdk is not installed; it is required to reach the RAGFlow server")

    # Let the user pick the target dataset.
    datasets = rag_object.list_datasets()
    if not datasets:
        print("没有可用的数据集。")
        return
    dataset_names = [ds.name for ds in datasets]
    dataset_name = choose_from_list(dataset_names, "请选择数据集编号:")
    dataset = next(ds for ds in datasets if ds.name == dataset_name)

    for name, pdf_path in pdf_dict.items():
        display_name = os.path.basename(pdf_path)
        document = _find_document(dataset, display_name)
        if document is not None:
            # Chunks are still added below, also for existing documents.
            print(f"文档已存在: {display_name},跳过上传。")
        else:
            print(f"{display_name}不存在")
            try:
                document = _upload_pdf(dataset, pdf_path, display_name)
            except Exception as e:
                print(f"上传PDF失败: {pdf_path},错误: {e}")
                continue
            print(f"已上传PDF: {pdf_path}")

        txt_path = txt_dict.get(name)
        if txt_path:
            _add_txt_chunks(document, txt_path)


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else None)