"""Upload PDF files to a RAGFlow dataset and attach paired TXT text as chunks.

The script walks a folder, pairs every PDF with the TXT file that has the same
base name, lets the user pick a dataset interactively, uploads each PDF unless
a document with the same file name can already be listed, and finally adds the
TXT content to that document one blank-line-separated block at a time.
"""
import os

from ragflow_sdk import RAGFlow

# NOTE(review): the API keys below are hard-coded in source. Consider loading
# them from the environment or an untracked config file instead.

# Home environment.
api_key = "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj"
base_url = "http://127.0.0.1:8099"

# Company intranet. These assignments come last, so they are the ones in effect.
base_url = "http://192.168.107.165:8099"
api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"

rag_object = RAGFlow(api_key=api_key, base_url=base_url)


def choose_from_list(options, prompt):
    """Print a numbered menu and return the option the user picks.

    Args:
        options: Non-empty sequence of items to choose from.
        prompt: Text passed to ``input`` when asking for the number.

    Returns:
        The element of ``options`` whose 1-based number was entered.

    Raises:
        ValueError: If ``options`` is empty; no input could ever be valid, so
            the prompt would otherwise repeat forever.
    """
    if not options:
        raise ValueError("options must not be empty")

    for idx, item in enumerate(options, start=1):
        print(f"{idx}. {item}")

    while True:
        choice = input(prompt).strip()
        # isdecimal() only accepts characters that int() can parse, whereas
        # isdigit() also accepts e.g. "²", which would make int() raise.
        if choice.isdecimal() and 1 <= int(choice) <= len(options):
            return options[int(choice) - 1]
        print("输入无效,请重新输入编号。")


def select_files(file_path, file_type="pdf"):
    """Recursively collect the files of one type below a directory.

    Args:
        file_path: Root directory to walk.
        file_type: Extension to match, compared case-insensitively. A leading
            dot is optional ("pdf" and ".pdf" are equivalent).

    Returns:
        List of file paths (directory joined with file name), in the order
        ``os.walk`` yields them.
    """
    suffix = "." + file_type.lower().lstrip(".")
    matches = []
    for root, _dirs, files in os.walk(file_path):
        matches.extend(
            os.path.join(root, file_name)
            for file_name in files
            if file_name.lower().endswith(suffix)
        )
    return matches


def pair_pdf_and_txt(pdf_path, txt_path):
    """Pair PDF and TXT files that share the same base name.

    Args:
        pdf_path: Directory searched recursively for ``.pdf`` files.
        txt_path: Directory searched recursively for ``.txt`` files.

    Returns:
        A ``(pdf_dict, txt_dict)`` tuple. Both dicts are keyed by file name
        without extension and map to the file path. Only names that have both
        a PDF and a TXT appear, so the two dicts always have the same keys.
        If several files share a base name, the one found last is kept.
    """
    pdf_by_name = {
        os.path.splitext(os.path.basename(path))[0]: path
        for path in select_files(pdf_path, "pdf")
    }
    txt_by_name = {
        os.path.splitext(os.path.basename(path))[0]: path
        for path in select_files(txt_path, "txt")
    }

    # Keep only the PDFs that have a matching TXT file.
    shared_names = [name for name in pdf_by_name if name in txt_by_name]
    pdf_dict_aligned = {name: pdf_by_name[name] for name in shared_names}
    txt_dict_aligned = {name: txt_by_name[name] for name in shared_names}
    return pdf_dict_aligned, txt_dict_aligned


def _get_or_upload_document(dataset, pdf_path):
    """Return the dataset document for ``pdf_path``, uploading it if needed.

    The document is looked up by the PDF's file name. Any failure of that
    lookup is treated as "document does not exist" and triggers an upload.

    Returns:
        The document object, or ``None`` when the upload (or the lookup that
        follows it) fails; the error is printed in that case.
    """
    display_name = os.path.basename(pdf_path)

    try:
        document = dataset.list_documents(name=display_name)[0]
    except Exception:
        # Best effort: every lookup failure is handled as "not found".
        print(f"{display_name}不存在")
    else:
        print(f"文档已存在: {display_name},跳过上传。")
        return document

    try:
        with open(pdf_path, "rb") as pdf_file:
            blob = pdf_file.read()
        dataset.upload_documents([{"display_name": display_name, "blob": blob}])
        # Fetch the freshly uploaded document so chunks can be added to it.
        document = dataset.list_documents(name=display_name)[0]
    except Exception as exc:
        print(f"上传PDF失败: {pdf_path},错误: {exc}")
        return None

    print(f"已上传PDF: {pdf_path}")
    return document


def _add_txt_chunks(document, txt_path):
    """Add the text of ``txt_path`` to ``document`` as chunks.

    The text is split on blank lines (``"\\n\\n"``) and every non-blank block
    becomes one chunk. The first failing chunk stops processing of this file;
    the error is printed rather than raised.
    """
    try:
        # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM, which
        # would otherwise end up inside the first chunk.
        with open(txt_path, "r", encoding="utf-8-sig") as txt_file:
            file_content = txt_file.read()
    except Exception as exc:
        print(f"读取文件失败: {txt_path},错误: {exc}")
        return

    try:
        for num, txt_chunk in enumerate(file_content.split("\n\n"), start=1):
            if not txt_chunk.strip():
                # Skip blank blocks; numbering still counts them.
                continue
            # Show the first 30 characters as a preview.
            print(f"处理文本块: {txt_chunk[:30]}...")
            chunk = document.add_chunk(content=txt_chunk)
            print(f"第{num} Chunk添加成功! ID: {chunk.id}")
    except Exception as exc:
        print(f"添加chunk时发生错误: {txt_path},错误: {exc}")


def main():
    """Pair PDF/TXT files, let the user pick a dataset, then upload and chunk."""
    # Candidate source folders; the last assignment is the one in effect.
    file_path = "g:\\11\\22\\路桥设计党建\\"
    file_path = "F:\\2\\"
    file_path = "g:\\11\\22\\规范\\"

    pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
    if not pdf_dict:
        print("未选择任何文件。")
        return

    # Choose the target dataset.
    datasets = rag_object.list_datasets()
    if not datasets:
        print("没有可用的数据集。")
        return
    dataset_names = [ds.name for ds in datasets]
    dataset_name = choose_from_list(dataset_names, "请选择数据集编号:")
    # With duplicate names the first matching dataset is used.
    dataset = next(ds for ds in datasets if ds.name == dataset_name)

    # Upload every PDF, then add its paired TXT content as chunks.
    for name, pdf_path in pdf_dict.items():
        document = _get_or_upload_document(dataset, pdf_path)
        if document is None:
            continue

        txt_path = txt_dict.get(name)
        if txt_path:
            _add_txt_chunks(document, txt_path)


if __name__ == "__main__":
    main()