Compare commits
2 Commits
b4769d2ec1
...
40211521a2
Author | SHA1 | Date | |
---|---|---|---|
40211521a2 | |||
2cc9dbfcd0 |
192
src/add_chunk_cli_pdf_img.py
Normal file
192
src/add_chunk_cli_pdf_img.py
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
from ragflow_sdk import RAGFlow
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
# Connection settings for the RAGFlow server.
#
# Two deployments are known: "home" (local machine) and "company" (intranet).
# "company" stays the default because its values were the ones that actually
# took effect before (they were assigned last).  Set RAGFLOW_PROFILE=home to
# switch, or override either value with RAGFLOW_BASE_URL / RAGFLOW_API_KEY.
#
# NOTE(review): API keys committed to source control should be treated as
# leaked -- rotate them and supply the real key through RAGFLOW_API_KEY.
_RAGFLOW_PROFILES = {
    "home": {
        "base_url": "http://127.0.0.1:8099",
        "api_key": "ragflow-MyMjM2ODE2NThlMTExZjBiMzJlNzY5Mj",
    },
    "company": {
        "base_url": "http://192.168.107.165:8099",
        "api_key": "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT",
    },
}

_profile_name = os.environ.get("RAGFLOW_PROFILE", "company")
if _profile_name not in _RAGFLOW_PROFILES:
    raise ValueError(
        f"Unknown RAGFLOW_PROFILE {_profile_name!r}; "
        f"expected one of {sorted(_RAGFLOW_PROFILES)}"
    )
_profile = _RAGFLOW_PROFILES[_profile_name]

base_url = os.environ.get("RAGFLOW_BASE_URL", _profile["base_url"])
api_key = os.environ.get("RAGFLOW_API_KEY", _profile["api_key"])

# Shared client used by main(); created at import time, as before.
rag_object = RAGFlow(api_key=api_key, base_url=base_url)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def choose_from_list(options, prompt):
    """Print a numbered menu of *options* and return the entry the user picks.

    The menu is 1-based.  The user is re-prompted until a valid number is
    entered; surrounding whitespace in the answer is ignored.

    Args:
        options: Non-empty sequence of items to choose from.
        prompt: Text passed to ``input()`` on every attempt.

    Returns:
        The selected element of *options*.

    Raises:
        ValueError: If *options* is empty (no answer could ever be valid, so
            prompting would loop forever).
    """
    if not options:
        raise ValueError("options must not be empty")

    for idx, item in enumerate(options, start=1):
        print(f"{idx}. {item}")

    while True:
        choice = input(prompt).strip()
        # isdecimal() only accepts characters int() can parse; isdigit() also
        # accepts e.g. superscripts ("²"), which made int() raise ValueError.
        if choice.isdecimal() and 1 <= int(choice) <= len(options):
            return options[int(choice) - 1]
        print("输入无效,请重新输入编号。")
|
||||||
|
|
||||||
|
|
||||||
|
def select_files(file_path, file_type="pdf"):
    """Recursively collect files under *file_path* with the given extension.

    The extension comparison is case-insensitive.

    Args:
        file_path: Directory to walk (sub-directories included).
        file_type: Extension to match, with or without a leading dot
            (``"pdf"`` and ``".pdf"`` are equivalent).  Defaults to ``"pdf"``.

    Returns:
        A list of matching file paths, in ``os.walk`` order.  The list is
        empty when nothing matches or *file_path* does not exist.
    """
    # Normalise so that ".pdf" does not turn into the never-matching "..pdf".
    suffix = "." + file_type.lower().lstrip(".")
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(file_path)
        for name in names
        if name.lower().endswith(suffix)
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def pair_pdf_and_txt(pdf_path, txt_path):
    """Match PDF files with TXT files that share the same base name.

    Files are discovered with ``select_files``.  Each file is keyed by its
    name without directory and extension; if several files share a key, the
    one found last wins.

    Args:
        pdf_path: Directory searched for ``.pdf`` files.
        txt_path: Directory searched for ``.txt`` files.

    Returns:
        A ``(pdf_dict, txt_dict)`` tuple.  Both dicts have the same keys
        (base names present as both PDF and TXT) and map to the file paths.
        PDFs without a matching TXT, and TXTs without a matching PDF, are
        left out.
    """
    def index_by_stem(paths):
        # Map "dir/name.ext" -> {"name": "dir/name.ext"}.
        return {os.path.splitext(os.path.basename(p))[0]: p for p in paths}

    pdf_by_stem = index_by_stem(select_files(pdf_path, "pdf"))
    txt_by_stem = index_by_stem(select_files(txt_path, "txt"))

    # Keep PDF discovery order; only stems that also have a TXT survive.
    shared_stems = [stem for stem in pdf_by_stem if stem in txt_by_stem]

    pdf_dict_aligned = {stem: pdf_by_stem[stem] for stem in shared_stems}
    txt_dict_aligned = {stem: txt_by_stem[stem] for stem in shared_stems}
    return pdf_dict_aligned, txt_dict_aligned
|
||||||
|
|
||||||
|
|
||||||
|
def select_dataset(rag_object):
    """Ask the user to pick one of the datasets known to *rag_object*.

    Args:
        rag_object: Client exposing ``list_datasets()``; each returned item
            must have a ``name`` attribute.

    Returns:
        The first dataset whose name equals the user's choice, or ``None``
        (after printing a notice) when ``list_datasets()`` returns nothing.
    """
    available = rag_object.list_datasets()
    if available:
        names = [entry.name for entry in available]
        picked = choose_from_list(names, "请选择数据集编号:")
        return next(entry for entry in available if entry.name == picked)

    print("没有可用的数据集。")
    return None
|
||||||
|
|
||||||
|
def upload_or_get_document(dataset, pdf_path, display_name):
    """Return the document named *display_name*, uploading the PDF if needed.

    The dataset is first queried by name.  Any exception raised by that
    lookup (including indexing an empty result) is taken to mean the
    document is not there yet, and the file at *pdf_path* is uploaded.

    Args:
        dataset: Object exposing ``list_documents(name=...)`` and
            ``upload_documents([...])``.
        pdf_path: Path of the PDF to read and upload when it is missing.
        display_name: Name used both for the lookup and for the upload.

    Returns:
        The first document returned by ``list_documents`` for that name, or
        ``None`` if reading/uploading failed (the error is printed).
    """
    try:
        existing = dataset.list_documents(name=display_name)[0]
        print(f"文档已存在: {display_name},跳过上传。")
        return existing
    except Exception:
        # Lookup failed: fall through to the upload path below.
        pass

    try:
        with open(pdf_path, "rb") as pdf_file:
            payload = pdf_file.read()
        dataset.upload_documents([{"display_name": display_name, "blob": payload}])
        # Re-query so the caller gets the stored document object.
        return dataset.list_documents(name=display_name)[0]
    except Exception as e:
        print(f"上传PDF失败: {pdf_path},错误: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def divid_txt_chunk_img(txt_chunk):
    """Split a text chunk into plain text and the image paths it references.

    Images are recognised in Markdown form, ``![alt](path)``.

    Example input:
        "some text![figure](images/fig1.png)more text"

    Args:
        txt_chunk: Text that may contain Markdown image references.

    Returns:
        A ``(clean_text, image_paths)`` tuple:
        clean_text: the text with every image reference removed, runs of
            blank lines collapsed to a single blank line, and leading and
            trailing whitespace stripped.
        image_paths: the parenthesised targets of the image references, in
            order of appearance.
    """
    # Markdown image: ![alt](target); the target is the captured group.
    image_re = re.compile(r'!\[.*?\]\((.*?)\)')

    image_paths = [match.group(1) for match in image_re.finditer(txt_chunk)]

    without_images = image_re.sub('', txt_chunk)
    # Removing images can leave stacked empty lines; squeeze and trim them.
    clean_text = re.sub(r'\n\s*\n', '\n\n', without_images).strip()

    return clean_text, image_paths
|
||||||
|
|
||||||
|
def upload_images_to_minio(image_paths, document):
    """
    Upload images to MinIO.

    NOTE(review): not implemented yet -- the body consists of this docstring
    only, so calling the function does nothing and returns None.
    """
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def process_txt_chunks(document, txt_path):
    """Split a text file on blank lines and add every piece to *document*.

    The file is read as UTF-8 and split on ``"\\n\\n"``.  Pieces that are empty
    or whitespace-only are skipped; each remaining piece is passed unchanged
    to ``document.add_chunk(content=...)``.  The number printed for a chunk
    is its 1-based position among all pieces, skipped ones included.

    Any exception (reading the file or adding a chunk) is printed and stops
    processing of the remaining pieces; nothing is raised to the caller.

    Args:
        document: Object exposing ``add_chunk(content=...)`` whose result has
            an ``id`` attribute.
        txt_path: Path of the UTF-8 text file to process.
    """
    try:
        with open(txt_path, 'r', encoding='utf-8') as text_file:
            pieces = text_file.read().split('\n\n')

        for position, piece in enumerate(pieces, start=1):
            if not piece.strip():
                continue
            print(f"处理文本块: {piece[:30]}...")
            added = document.add_chunk(content=piece)
            print(f"第{position} Chunk添加成功! ID: {added.id}")
    except Exception as e:
        print(f"处理文本文件时出错: {txt_path},错误: {e}")
|
||||||
|
|
||||||
|
def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
    """Upload every PDF and feed its matching TXT file in as chunks.

    For each entry of *pdf_dict* the PDF is uploaded (or looked up) under its
    file name via ``upload_or_get_document``.  When that yields a document
    and *txt_dict* has a path under the same key, the text file is handed to
    ``process_txt_chunks``.

    Args:
        pdf_dict: Mapping of base name -> PDF path.
        txt_dict: Mapping of base name -> TXT path.
        dataset: Target dataset, passed through to ``upload_or_get_document``.
    """
    for stem, pdf_file in pdf_dict.items():
        shown_as = os.path.basename(pdf_file)
        document = upload_or_get_document(dataset, pdf_file, shown_as)
        if document:
            txt_file = txt_dict.get(stem)
            if txt_file:
                process_txt_chunks(document, txt_file)
|
||||||
|
|
||||||
|
def main():
    """Entry point: let the user pick a dataset and print its name and id.

    The PDF/TXT pairing and upload steps are currently disabled (kept below
    as comments), so ``file_path`` is not used yet.

    Note carried over from the original docstring about the intended storage
    mapping: ``dataset.id`` = bucket_name, ``chunk_id`` = object_name.
    """
    file_path = "g:\\11\\22\\规范\\"

    # Disabled pipeline step (work in progress):
    # pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
    # if not pdf_dict:
    #     print("未选择任何文件。")
    #     return

    dataset = select_dataset(rag_object)
    # Check before touching attributes: select_dataset returns None when no
    # dataset is available, and dataset.name would raise AttributeError.
    if not dataset:
        print("未选择数据集。")
        return

    print(f"选择的数据集: {dataset.name}")
    print(f"选择的数据集id: {dataset.id}")

    # Disabled pipeline step (work in progress):
    # process_pdf_txt_pairs(pdf_dict, txt_dict, dataset)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Run the interactive workflow only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user