Files
ragflow_api_test/src/add_chunk_cli.py

144 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from ragflow_sdk import RAGFlow
import os
# Connection settings for the RAGFlow server.
#
# Two hard-coded profiles are listed below. The assignments run top to
# bottom, so the later ("company intranet") values are the ones in effect
# unless an environment override is present.
#
# SECURITY NOTE(review): the API keys are committed in plain text. Prefer
# supplying them through the environment variables read below, and rotate
# the keys that have already been checked in.

# Profile: home (local server) -- superseded by the intranet profile below.
api_key = "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj"
base_url = "http://127.0.0.1:8099"
# Profile: company intranet.
base_url = "http://192.168.107.165:8099"
api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"

# Optional overrides: a non-empty RAGFLOW_API_KEY / RAGFLOW_BASE_URL in the
# environment wins over the hard-coded profile, so switching servers or keys
# no longer requires editing this file. When unset (or empty) the values
# above are used unchanged.
api_key = os.environ.get("RAGFLOW_API_KEY") or api_key
base_url = os.environ.get("RAGFLOW_BASE_URL") or base_url

# Shared SDK client used by main().
rag_object = RAGFlow(api_key=api_key, base_url=base_url)
def choose_from_list(options, prompt):
    """Print a numbered menu of ``options`` and return the one the user picks.

    The options are listed as ``1. item``, ``2. item``, ... and the user is
    asked (via ``input(prompt)``) for a number until a valid one is entered.
    Leading and trailing whitespace around the answer is ignored.

    Args:
        options: Non-empty sequence of items to choose from.
        prompt: Text passed to ``input()`` for every attempt.

    Returns:
        The element of ``options`` at the chosen (1-based) position.

    Raises:
        ValueError: If ``options`` is empty; without this check no answer
            could ever be valid and the prompt would loop forever.
    """
    if len(options) == 0:
        raise ValueError("options must not be empty")

    for position, item in enumerate(options, start=1):
        print(f"{position}. {item}")

    while True:
        answer = input(prompt).strip()
        # isdecimal() only accepts characters that int() can parse, whereas
        # isdigit() also accepts e.g. superscript digits ("²") on which
        # int() raises ValueError.
        try:
            number = int(answer) if answer.isdecimal() else 0
        except ValueError:
            # int() can still refuse extremely long digit strings.
            number = 0
        if 1 <= number <= len(options):
            return options[number - 1]
        print("输入无效,请重新输入编号。")
def select_files(file_path, file_type="pdf"):
    """Recursively collect the files under ``file_path`` with a given extension.

    A file matches when its lower-cased name ends with ``"."`` followed by
    the lower-cased ``file_type``, so the comparison is case-insensitive.

    Args:
        file_path: Root directory that is walked with ``os.walk``.
        file_type: Extension without the leading dot (default ``"pdf"``).

    Returns:
        A list of matching paths, each formed by joining the directory the
        file was found in with the file name, in ``os.walk`` order.
    """

    def has_wanted_extension(filename):
        return filename.lower().endswith(f".{file_type.lower()}")

    matches = []
    for folder, _subdirs, filenames in os.walk(file_path):
        matches.extend(
            os.path.join(folder, filename)
            for filename in filenames
            if has_wanted_extension(filename)
        )
    return matches
def pair_pdf_and_txt(pdf_path, txt_path):
    """Pair PDF files with the TXT files that share their base name.

    PDFs are collected under ``pdf_path`` and TXT files under ``txt_path``
    (both recursively, via ``select_files``). Files are keyed by their file
    name without the extension; if several files map to the same key, the
    one found last replaces the earlier ones.

    Args:
        pdf_path: Directory searched for ``.pdf`` files.
        txt_path: Directory searched for ``.txt`` files.

    Returns:
        A tuple ``(pdf_dict, txt_dict)``. Both dictionaries have the same
        keys (base names present on both sides, in the order the PDFs were
        found) and map them to the PDF path and the TXT path respectively.
        PDFs without a TXT counterpart, and TXT files without a PDF, are
        left out.
    """

    def index_by_stem(paths):
        # Map "name" (file name without extension) -> full path.
        return {os.path.splitext(os.path.basename(p))[0]: p for p in paths}

    pdf_by_stem = index_by_stem(select_files(pdf_path, "pdf"))
    txt_by_stem = index_by_stem(select_files(txt_path, "txt"))

    # Keep only the names that exist on both sides, in PDF order.
    shared_stems = [stem for stem in pdf_by_stem if stem in txt_by_stem]
    pdf_dict_aligned = {stem: pdf_by_stem[stem] for stem in shared_stems}
    txt_dict_aligned = {stem: txt_by_stem[stem] for stem in shared_stems}
    return pdf_dict_aligned, txt_dict_aligned
def main() -> None:
    """Upload paired PDFs to a RAGFlow dataset and attach their TXT text as chunks.

    Steps, in order:
      1. Pair every PDF under ``file_path`` with the TXT file of the same
         base name (the same directory is searched for both).
      2. List the datasets available through ``rag_object`` and ask the
         user to pick one by number.
      3. For each paired PDF, reuse the document found under the same
         display name, or upload the PDF if the lookup fails.
      4. Read the paired TXT file as UTF-8, split it on blank lines (two
         consecutive newlines) and add every non-blank piece to the
         document as a chunk.

    Progress and errors are reported with ``print``; nothing is returned.
    """
    # Candidate input directories. Only the last assignment takes effect;
    # the earlier ones are overwritten before they are used.
    file_path = "g:\\11\\22\\路桥设计党建\\"
    file_path = "F:\\2\\"
    file_path = "g:\\11\\22\\规范\\"
    pdf_dict, txt_dict = pair_pdf_and_txt(file_path,file_path)
    if not pdf_dict:
        print("未选择任何文件。")
        return
    # Choose the target dataset.
    datasets = rag_object.list_datasets()
    if not datasets:
        print("没有可用的数据集。")
        return
    dataset_names = [ds.name for ds in datasets]
    dataset_name = choose_from_list(dataset_names, "请选择数据集编号:")
    # If several datasets share the chosen name, the first one is used.
    dataset = [ds for ds in datasets if ds.name == dataset_name][0]
    # (Disabled) Interactive selection of a single existing document: list
    # the dataset's documents, return early if there are none, prompt with
    # choose_from_list, and pick the document whose name matches.
    # Upload every paired PDF to the dataset.
    for name, pdf_path in pdf_dict.items():
        display_name = os.path.basename(pdf_path)
        document = None
        try:
            # Taking [0] raises if the returned list is empty, which lands in
            # the handler below just like an error raised by the SDK call.
            document= dataset.list_documents(name=display_name)[0]
            print(f"文档已存在: {display_name},跳过上传。")
        except Exception as e:
            # NOTE(review): every failure (not only "no such document") is
            # reported as "does not exist" and the error itself is discarded.
            print(f"{display_name}不存在")
        if not document:
            try:
                with open(pdf_path, "rb") as f:
                    blob = f.read()
                dataset.upload_documents([{"display_name": display_name, "blob": blob}])
                # Look the document up again to obtain the uploaded document object.
                document= dataset.list_documents(name=display_name)[0]
                print(f"已上传PDF: {pdf_path}")
            except Exception as e:
                print(f"上传PDF失败: {pdf_path},错误: {e}")
                # Without a document there is nothing to attach chunks to.
                continue
        # Add the TXT content to the document as chunks.
        txt_path = txt_dict.get(name)
        if txt_path:
            try:
                with open(txt_path, 'r', encoding='utf-8') as file:
                    file_content = file.read()
            except Exception as e:
                print(f"读取文件失败: {txt_path},错误: {e}")
                continue
            try:
                # The whole loop sits in one try: the first failing chunk
                # stops processing of the remaining chunks of this file.
                for num,txt_chunk in enumerate(file_content.split('\n\n')):
                    print(f"处理文本块: {txt_chunk[:30]}...") # print the first 30 characters as a preview
                    if txt_chunk.strip(): # skip pieces that are empty or whitespace-only
                        chunk = document.add_chunk(content=txt_chunk)
                        # num counts every piece, including skipped blank
                        # ones, so the printed numbers may have gaps.
                        print(f"{num+1} Chunk添加成功! ID: {chunk.id}")
                # (Disabled) Earlier variant: add the whole file content as
                # one chunk and print the file path together with the chunk ID.
            except Exception as e:
                print(f"添加chunk时发生错误: {txt_path},错误: {e}")
# Run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()