更新本地地址配置,修复连接问题;新增命令行接口以支持文档上传和处理功能

This commit is contained in:
2025-07-06 10:47:10 +08:00
parent 68444ad7ff
commit 5b940d5070
2 changed files with 143 additions and 1 deletions

View File

@@ -2,7 +2,7 @@ from ragflow_sdk import RAGFlow
# Earlier/alternative connection settings, kept commented out for reference.
#api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
#base_url = "http://192.168.107.165:8099"
# NOTE(review): base_url is assigned twice in a row; the second assignment
# (with the explicit port 8099) is the one that takes effect. This looks like
# the removed and added lines of a diff shown together - confirm which line
# the real file contains.
base_url = "http://localhost"
base_url = "http://localhost:8099"
# NOTE(review): the API key is hard-coded in source - consider loading it
# from configuration instead of committing it.
api_key = "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj"
# Module-level RAGFlow client built from the settings above.
rag_object = RAGFlow(api_key=api_key, base_url=base_url)

142
src/add_chunk_cli.py Normal file
View File

@@ -0,0 +1,142 @@
from ragflow_sdk import RAGFlow
import os
# Connection settings for the RAGFlow server.
# The RAGFLOW_API_KEY and RAGFLOW_BASE_URL environment variables, when set,
# override the built-in defaults so that credentials and endpoints can be
# switched without editing the source.
## home
api_key = os.environ.get("RAGFLOW_API_KEY", "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj")
base_url = os.environ.get("RAGFLOW_BASE_URL", "http://127.0.0.1:8099")
## company intranet
# base_url = "https://ddyy.iepose.cn"
# api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
# Module-level RAGFlow client shared by the functions below.
rag_object = RAGFlow(api_key=api_key, base_url=base_url)
def choose_from_list(options, prompt):
    """Print a numbered menu and return the option the user picks.

    Each entry of *options* is printed as ``<number>. <item>`` (numbering
    starts at 1). *prompt* is then shown repeatedly until the user enters a
    number within range. Whitespace around the answer is ignored.

    Args:
        options: Non-empty sequence of choices to display.
        prompt: Text passed to ``input()`` when asking for the number.

    Returns:
        The element of *options* that matches the entered number.

    Raises:
        ValueError: If *options* is empty, because no answer could ever be
            valid and the prompt would otherwise repeat forever.
    """
    if not options:
        raise ValueError("options must not be empty")
    for number, item in enumerate(options, start=1):
        print(f"{number}. {item}")
    while True:
        choice = input(prompt).strip()
        # isdecimal() only accepts characters that int() can parse, whereas
        # isdigit() also accepts e.g. "²", which would make int() raise.
        if choice.isdecimal() and 1 <= int(choice) <= len(options):
            return options[int(choice) - 1]
        print("输入无效,请重新输入编号。")
def select_files(file_path, file_type="pdf"):
    """
    Recursively collect the files under file_path that have the given extension.

    The extension comparison is case-insensitive; file_type is given without
    the leading dot and defaults to "pdf".
    Returns a list of file paths (directory joined with file name).
    """
    def has_wanted_extension(filename):
        return filename.lower().endswith("." + file_type.lower())

    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(file_path)
        for filename in filenames
        if has_wanted_extension(filename)
    ]
def pair_pdf_and_txt(pdf_path, txt_path):
    """
    Pair PDF files with the TXT files that share their base name.

    Both directory trees are scanned with select_files. Files are keyed by
    their file name without extension, and only names found among both the
    PDFs and the TXTs are kept.
    Returns (pdf_dict, txt_dict): two dicts with identical keys (the shared
    base names, in PDF discovery order) whose values are the file paths.
    """
    def stem_of(path):
        return os.path.splitext(os.path.basename(path))[0]

    pdf_by_stem = {stem_of(path): path for path in select_files(pdf_path, "pdf")}
    txt_by_stem = {stem_of(path): path for path in select_files(txt_path, "txt")}
    # Walk the PDFs in order and drop those without a matching TXT file.
    shared_stems = [stem for stem in pdf_by_stem if stem in txt_by_stem]
    paired_pdfs = {stem: pdf_by_stem[stem] for stem in shared_stems}
    paired_txts = {stem: txt_by_stem[stem] for stem in shared_stems}
    return paired_pdfs, paired_txts
def _get_or_upload_document(dataset, pdf_path):
    """Return the dataset document for *pdf_path*, uploading the PDF if needed.

    The document is looked up by the PDF's file name. Returns None when the
    upload (or the lookup that follows it) fails; the error is printed.
    """
    display_name = os.path.basename(pdf_path)
    document = None
    try:
        document = dataset.list_documents(name=display_name)[0]
        print(f"文档已存在: {display_name},跳过上传。")
    except Exception:
        # NOTE(review): the lookup appears to signal a missing document by
        # raising (an empty result raises IndexError here as well), so every
        # failure is treated as "not found" - confirm against the SDK.
        print(f"{display_name}不存在")
    if document is not None:
        return document
    try:
        with open(pdf_path, "rb") as f:
            blob = f.read()
        dataset.upload_documents([{"display_name": display_name, "blob": blob}])
        # Fetch the document object again by name after uploading.
        document = dataset.list_documents(name=display_name)[0]
        print(f"已上传PDF: {pdf_path}")
    except Exception as e:
        print(f"上传PDF失败: {pdf_path},错误: {e}")
        return None
    return document


def _add_text_chunks(document, txt_path):
    """Add each blank-line-separated paragraph of *txt_path* to *document*.

    The file is read as UTF-8 and split on empty lines; whitespace-only
    pieces are skipped. The first failing chunk aborts the remaining chunks
    of this file; errors are printed rather than raised.
    """
    try:
        with open(txt_path, 'r', encoding='utf-8') as file:
            file_content = file.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"读取文件失败: {txt_path},错误: {e}")
        return
    try:
        for num, txt_chunk in enumerate(file_content.split('\n\n')):
            if not txt_chunk.strip():
                # Skip blank pieces without logging them.
                continue
            print(f"处理文本块: {txt_chunk[:30]}...")  # preview: first 30 characters
            chunk = document.add_chunk(content=txt_chunk)
            print(f"{num+1} Chunk添加成功! ID: {chunk.id}")
    except Exception as e:
        print(f"添加chunk时发生错误: {txt_path},错误: {e}")


def main(file_path="g:\\11\\22\\"):
    """Upload paired PDF/TXT files from *file_path* into a chosen dataset.

    Steps:
      1. Find PDFs under *file_path* that have a TXT file with the same base
         name (both are searched in the same directory tree).
      2. Let the user pick one of the available datasets interactively.
      3. For every pair, reuse or upload the PDF as a document, then add the
         TXT content to it as chunks.

    Args:
        file_path: Directory to scan. Defaults to the path that was
            previously hard-coded, so calling ``main()`` behaves as before.
    """
    pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
    if not pdf_dict:
        print("未选择任何文件。")
        return

    # Choose the target dataset.
    datasets = rag_object.list_datasets()
    if not datasets:
        print("没有可用的数据集。")
        return
    dataset_names = [ds.name for ds in datasets]
    dataset_name = choose_from_list(dataset_names, "请选择数据集编号:")
    # If several datasets share a name, the first one is used.
    dataset = next(ds for ds in datasets if ds.name == dataset_name)

    for name, pdf_path in pdf_dict.items():
        document = _get_or_upload_document(dataset, pdf_path)
        if document is None:
            continue
        txt_path = txt_dict.get(name)
        if txt_path:
            _add_text_chunks(document, txt_path)


if __name__ == "__main__":
    import sys

    # Optional first command-line argument: the directory to scan. Without
    # it, main() falls back to its default path.
    main(*sys.argv[1:2])