Files
ragflow_api_test/markdown_image2minio.py

130 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

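# Based on the required functions from minio_api.py.
# Pick a Markdown file (originally through a file dialog) and upload the images it references to MinIO; image links have the form ![](image.jpg).
# MinIO bucket_name = "md-img" (MINIO_BUCKET); object_name = "{markdown file name}/{order of appearance}{extension}", where the order is a zero-padded sequence number such as 0001.jpg (".jpg" is used when the link has no extension).
# Update the image links in the Markdown file to the MinIO links, in the form http://127.0.0.1:9000/md-img/{markdown file name}/{order of appearance}.jpg
# Save the updated Markdown file as markdown_image_processed/{markdown file name}.md, inside the directory of the source file.
# Print the necessary processing information.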
from minio import Minio
import os
import re
from tkinter import Tk, filedialog
from minio.error import S3Error
# Bucket that receives every uploaded image.
MINIO_BUCKET = "md-img"
# Base URL written into the rewritten Markdown links.
MINIO_URL = "http://127.0.0.1:9000"
# Sub-directory (created next to the source file) for processed Markdown.
PROCESSED_DIR = "markdown_image_processed"
# Host of the MinIO server; overridable through the MINIO_HOST variable.
MINIO_HOST = os.environ.get("MINIO_HOST", "127.0.0.1")
# Keyword arguments used to build the MinIO client. Each connection value
# falls back to a built-in default when its environment variable is unset.
MINIO_CONFIG = dict(
    endpoint=":".join((MINIO_HOST, os.environ.get("MINIO_PORT", "9000"))),
    access_key=os.environ.get("MINIO_USER", "rag_flow"),
    secret_key=os.environ.get("MINIO_PASSWORD", "infini_rag_flow"),
    secure=False,
)
def get_minio_client():
    """Create a MinIO client from the module-level ``MINIO_CONFIG``.

    The active configuration is printed to help with troubleshooting. The
    secret key is masked in that output so the credential does not end up
    in the console or in captured logs.

    Returns:
        A ``Minio`` client built from the ``endpoint``, ``access_key``,
        ``secret_key`` and ``secure`` entries of ``MINIO_CONFIG``.
    """
    # Print a copy with the secret replaced; the real value is only passed
    # to the client constructor below.
    printable_config = {**MINIO_CONFIG, "secret_key": "******"}
    print("当前MinIO配置:", printable_config)
    return Minio(
        endpoint=MINIO_CONFIG["endpoint"],
        access_key=MINIO_CONFIG["access_key"],
        secret_key=MINIO_CONFIG["secret_key"],
        secure=MINIO_CONFIG["secure"],
    )
def upload_file_to_minio(client, bucket_name, object_name, file_path):
    """Upload a local file to MinIO, creating the bucket first if needed.

    Errors are reported on stdout instead of being raised, so one failed
    upload does not abort a batch run. The outcome is returned so callers
    can tell success from failure.

    Args:
        client: Object exposing ``bucket_exists``, ``make_bucket`` and
            ``fput_object`` (a MinIO client).
        bucket_name: Name of the destination bucket.
        object_name: Key of the object inside the bucket.
        file_path: Path of the local file to upload.

    Returns:
        True when the upload completed, False when any error occurred.
    """
    try:
        if not client.bucket_exists(bucket_name):
            client.make_bucket(bucket_name)
            print(f"Bucket '{bucket_name}' created")
        client.fput_object(bucket_name, object_name, file_path)
        print(f"文件 '{file_path}' 成功上传到存储桶 '{bucket_name}''{object_name}'")
        return True
    except S3Error as exc:
        print("MinIO错误:", exc)
    except Exception as e:
        # Best-effort by design: report the problem and signal failure.
        print("发生错误:", e)
    return False
def select_markdown_file():
    """Ask the user to pick a Markdown file through a file dialog.

    A hidden Tk root window is created so that only the dialog is shown,
    and it is destroyed afterwards so repeated calls do not accumulate
    hidden root windows.

    Returns:
        The path chosen in the dialog; a falsy value when the dialog is
        cancelled.
    """
    root = Tk()
    root.withdraw()  # hide the empty root window, show only the dialog
    try:
        return filedialog.askopenfilename(
            title="选择Markdown文件",
            filetypes=[("Markdown files", "*.md")],
        )
    finally:
        root.destroy()
def return_markdown_files(md_path):
    """Return the paths of the Markdown files directly inside a directory.

    Only regular files whose name ends with ``.md`` are returned, so a
    sub-directory that happens to be named ``something.md`` is not handed
    on to be opened as a file. The result is sorted by file name so
    repeated runs process files in the same order.

    Args:
        md_path: Directory to scan (not recursive).

    Returns:
        A list of ``md_path``-joined file paths; empty when ``md_path`` is
        not a directory or contains no Markdown files. A message is printed
        in both of those cases.
    """
    if not os.path.isdir(md_path):
        print(f"路径 {md_path} 不是一个有效的目录。")
        return []
    candidates = (
        os.path.join(md_path, name)
        for name in sorted(os.listdir(md_path))
        if name.endswith('.md')
    )
    md_files = [path for path in candidates if os.path.isfile(path)]
    if not md_files:
        print(f"在目录 {md_path} 中未找到Markdown文件。")
    return md_files
def process_markdown_images(md_path, minio_client):
    """Upload the local images of a Markdown file and rewrite their links.

    Every ``![alt](target)`` reference is numbered in order of appearance
    (0001, 0002, ...). A target that resolves to an existing file is
    uploaded to ``MINIO_BUCKET`` as ``{markdown name}/{sequence}{ext}`` and
    its target is replaced with ``{MINIO_URL}/{MINIO_BUCKET}/{object name}``.
    References that cannot be resolved keep their original target. The
    rewritten text is saved as
    ``{markdown directory}/{PROCESSED_DIR}/{markdown file name}``; the
    source file itself is not modified.

    Args:
        md_path: Path of the Markdown file; a falsy value means that no
            file was selected.
        minio_client: Client object passed on to ``upload_file_to_minio``.

    Returns:
        The path of the processed Markdown file, or None when ``md_path``
        is falsy.
    """
    if not md_path:
        print("未选择文件,程序退出。")
        return None

    md_filename = os.path.basename(md_path)
    md_dir = os.path.dirname(md_path)
    md_name, _ = os.path.splitext(md_filename)
    print(f"处理文件: {md_filename}")
    with open(md_path, "r", encoding="utf-8") as f:
        content = f.read()

    img_pattern = r'!\[.*?\]\((.*?)\)'
    img_matches = list(re.finditer(img_pattern, content))
    print(f"检测到{len(img_matches)}张图片。")

    # The output is assembled from slices of the original text so that each
    # replacement lands exactly on the image reference it belongs to. A plain
    # text search for "](target)" could hit an ordinary link with the same
    # target that appears earlier in the document.
    pieces = []
    cursor = 0
    for idx, img_match in enumerate(img_matches):
        img_path = img_match.group(1)
        img_ext = os.path.splitext(img_path)[1] or ".jpg"
        img_seq = f"{idx+1:04d}{img_ext}"
        print(f"处理图片: {img_path} -> {img_seq}")
        object_name = f"{md_name}/{img_seq}"
        # Relative targets are resolved against the Markdown file's directory.
        if os.path.isabs(img_path):
            img_abs_path = img_path
        else:
            img_abs_path = os.path.join(md_dir, img_path)
        print(f"图片绝对路径: {img_abs_path}")
        # isfile, not exists: an empty target resolves to the directory
        # itself, which exists but cannot be uploaded.
        if not os.path.isfile(img_abs_path):
            print(f"图片未找到: {img_abs_path},跳过。")
            continue
        uploaded = upload_file_to_minio(
            minio_client, MINIO_BUCKET, object_name, img_abs_path
        )
        # Only an explicit False is treated as a failed upload; any other
        # return value (including None) counts as done.
        if uploaded is False:
            print(f"图片上传失败: {img_abs_path},跳过。")
            continue
        minio_link = f"{MINIO_URL}/{MINIO_BUCKET}/{object_name}"
        print(f"图片 {img_path} 已上传为 {object_name}")
        target_start, target_end = img_match.span(1)
        pieces.append(content[cursor:target_start])
        pieces.append(minio_link)
        cursor = target_end
    pieces.append(content[cursor:])
    new_content = "".join(pieces)

    processed_dir = os.path.join(md_dir, PROCESSED_DIR)
    os.makedirs(processed_dir, exist_ok=True)
    processed_path = os.path.join(processed_dir, md_filename)
    with open(processed_path, "w", encoding="utf-8") as f:
        f.write(new_content)
    print(f"处理后的Markdown已保存到: {processed_path}")
    return processed_path
def main(md_dir="G:\\11\\ragflow_api_test\\markdown_files"):
    """Process every Markdown file found directly inside ``md_dir``.

    Each file returned by ``return_markdown_files`` is passed to
    ``process_markdown_images`` together with one shared MinIO client.

    Args:
        md_dir: Directory to scan for Markdown files. The default is the
            location this script was originally written for; pass another
            directory to run it elsewhere. ``select_markdown_file()`` can be
            used instead to pick a single file interactively.
    """
    md_files = return_markdown_files(md_dir)
    minio_client = get_minio_client()
    for md_file in md_files:
        process_markdown_images(md_file, minio_client)


if __name__ == "__main__":
    main()