Merge branch 'main' of https://git.lqsjy.cn/glowz/ragflow_api_test
This commit is contained in:
14
.gitignore
vendored
Normal file
14
.gitignore
vendored
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
markdown_files/
|
||||||
|
|
||||||
|
# 忽略所有 .log 文件
|
||||||
|
*.log
|
||||||
|
|
||||||
|
# 忽略特定文件
|
||||||
|
debug.ini
|
||||||
|
|
||||||
|
# 忽略目录
|
||||||
|
tmp/
|
||||||
|
build/
|
||||||
|
|
||||||
|
# 不忽略重要的.log文件
|
||||||
|
!important.log
|
@@ -12,7 +12,6 @@ def update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, new_img_id):
|
|||||||
如果img_id不存在,则增加一个新的 img_id。
|
如果img_id不存在,则增加一个新的 img_id。
|
||||||
|
|
||||||
:param tenant_id: 租户 ID
|
:param tenant_id: 租户 ID
|
||||||
:param dataset_id: 数据集 ID
|
|
||||||
:param doc_id: 文档 ID
|
:param doc_id: 文档 ID
|
||||||
:param chunk_id: 文档块 ID
|
:param chunk_id: 文档块 ID
|
||||||
:param new_img_id: 新的 img_id
|
:param new_img_id: 新的 img_id
|
||||||
@@ -43,25 +42,7 @@ def update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, new_img_id):
|
|||||||
# 获取目标文档的 ID
|
# 获取目标文档的 ID
|
||||||
hit = result['hits']['hits'][0]
|
hit = result['hits']['hits'][0]
|
||||||
doc_id_in_es = hit['_id']
|
doc_id_in_es = hit['_id']
|
||||||
# print(doc_id_in_es)
|
|
||||||
#print(hit)
|
|
||||||
#print(len(hit['_source']['img_id']))
|
|
||||||
# image_id = hit['_source'].get('img_id', None)
|
|
||||||
|
|
||||||
# if (image_id):
|
|
||||||
# mapping = es.indices.get_mapping(index=index_name)
|
|
||||||
# print(mapping)
|
|
||||||
|
|
||||||
# else:
|
|
||||||
# # img_id 不存在,添加新的 img_id
|
|
||||||
# # 获取索引的映射
|
|
||||||
# mapping = es.indices.get_mapping(index=index_name)
|
|
||||||
# mapping[index_name]['mappings']['properties']['img_id'] = {'type': 'text'}
|
|
||||||
# es.indices.put_mapping(index="my_index", body=mapping)
|
|
||||||
# print(mapping)
|
|
||||||
|
|
||||||
|
|
||||||
# # 构建更新请求
|
|
||||||
update_body = {
|
update_body = {
|
||||||
"doc": {
|
"doc": {
|
||||||
"img_id": new_img_id
|
"img_id": new_img_id
|
||||||
|
130
markdown_image2minio.py
Normal file
130
markdown_image2minio.py
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
# 引用minio_api.py中的需要的函数
|
||||||
|
# 对话框选择一个markdown文件,将其中的图片上传到MinIO,其中图片的链接格式为,
|
||||||
|
# MinIO的bucket_name = "md-img"(见下方 MINIO_BUCKET),object_name="{markdown的文件名}/{图片出现的顺序}{原扩展名}",图片出现的顺序号如0001.jpg(无扩展名时默认为.jpg)
|
||||||
|
|
||||||
|
# 更新markdown文件中的图片链接为MinIO的链接, 链接格式为http://127.0.0.1:9000/md-img/{markdown的文件名}/{图片出现的顺序}.jpg
|
||||||
|
# 将更新后的markdown文件保存/markdown_image_processed/{markdown的文件名}.md
|
||||||
|
# 输出必要的处理信息
|
||||||
|
from minio import Minio
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from tkinter import Tk, filedialog
|
||||||
|
from minio.error import S3Error
|
||||||
|
|
||||||
|
# Bucket that receives every image extracted from the Markdown files.
MINIO_BUCKET = "md-img"
# Sub-directory (created next to each source file) that receives the
# rewritten Markdown.
PROCESSED_DIR = "markdown_image_processed"

# Connection settings are read from the environment and fall back to
# local-development defaults.
MINIO_HOST = os.getenv("MINIO_HOST", "127.0.0.1")
MINIO_CONFIG = {
    "endpoint": f"{MINIO_HOST}:{os.getenv('MINIO_PORT', '9000')}",
    "access_key": os.getenv("MINIO_USER", "rag_flow"),
    "secret_key": os.getenv("MINIO_PASSWORD", "infini_rag_flow"),
    "secure": False,
}

# Base URL written into the rewritten image links. It follows the configured
# endpoint so links stay valid when MINIO_HOST / MINIO_PORT are overridden;
# set the MINIO_URL environment variable when the public address differs
# from the endpoint used for uploads. With no overrides this evaluates to
# "http://127.0.0.1:9000".
MINIO_URL = os.getenv("MINIO_URL", f"http://{MINIO_CONFIG['endpoint']}")
|
||||||
|
|
||||||
|
def get_minio_client():
    """Create a MinIO client from the module-level ``MINIO_CONFIG``.

    The active configuration is printed for troubleshooting. The secret key
    is masked in that output so credentials do not end up in console logs.

    Returns:
        A ``Minio`` client built from the configured endpoint, access key,
        secret key and ``secure`` flag.
    """
    # Copy the config and hide the credential before printing it.
    printable_config = {**MINIO_CONFIG, "secret_key": "******"}
    print("当前MinIO配置:", printable_config)
    return Minio(
        endpoint=MINIO_CONFIG["endpoint"],
        access_key=MINIO_CONFIG["access_key"],
        secret_key=MINIO_CONFIG["secret_key"],
        secure=MINIO_CONFIG["secure"],
    )
|
||||||
|
|
||||||
|
def upload_file_to_minio(client, bucket_name, object_name, file_path):
    """Upload a local file to MinIO, creating the bucket when it is missing.

    Errors are reported on stdout instead of being raised, so a batch run
    keeps going after a single failed upload. The outcome is returned so
    callers can tell whether the object was actually stored.

    Args:
        client: Object exposing ``bucket_exists``, ``make_bucket`` and
            ``fput_object`` (a ``Minio`` client in this script).
        bucket_name: Target bucket.
        object_name: Key of the object inside the bucket.
        file_path: Path of the local file to upload.

    Returns:
        bool: ``True`` when the upload completed, ``False`` when an error
        was caught and printed.
    """
    try:
        if not client.bucket_exists(bucket_name):
            client.make_bucket(bucket_name)
            print(f"Bucket '{bucket_name}' created")
        client.fput_object(bucket_name, object_name, file_path)
        print(f"文件 '{file_path}' 成功上传到存储桶 '{bucket_name}' 为 '{object_name}'")
    except S3Error as exc:
        print("MinIO错误:", exc)
        return False
    except Exception as e:
        # Deliberate best-effort boundary: report and let the caller decide.
        print("发生错误:", e)
        return False
    return True
|
||||||
|
|
||||||
|
def select_markdown_file():
    """Show a file-open dialog restricted to ``*.md`` and return the choice.

    A hidden Tk root window is created only to host the dialog and is
    destroyed afterwards, so no stray window or Tcl interpreter outlives
    the call.

    Returns:
        Whatever ``filedialog.askopenfilename`` returns: the selected path,
        or a falsy value when the dialog is cancelled.
    """
    root = Tk()
    root.withdraw()  # keep the empty root window off screen
    try:
        return filedialog.askopenfilename(
            title="选择Markdown文件",
            filetypes=[("Markdown files", "*.md")],
        )
    finally:
        root.destroy()
|
||||||
|
|
||||||
|
def return_markdown_files(md_path):
    """Return the paths of the Markdown files directly inside a directory.

    Only regular files whose name ends with ``.md`` are returned; a
    sub-directory that happens to be named ``something.md`` is ignored
    because it cannot be opened as a file later on. The result is sorted so
    the processing order is the same on every run. The directory is not
    searched recursively.

    Args:
        md_path: Directory to scan.

    Returns:
        list[str]: Sorted file paths (``md_path`` joined with each file
        name). Empty when ``md_path`` is not a directory or holds no
        Markdown files; a message is printed in both cases.
    """
    if not os.path.isdir(md_path):
        print(f"路径 {md_path} 不是一个有效的目录。")
        return []

    md_files = []
    for name in sorted(os.listdir(md_path)):
        candidate = os.path.join(md_path, name)
        if name.endswith('.md') and os.path.isfile(candidate):
            md_files.append(candidate)

    if not md_files:
        print(f"在目录 {md_path} 中未找到Markdown文件。")
    return md_files
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def process_markdown_images(md_path, minio_client):
    """Upload the local images of one Markdown file and rewrite their links.

    Every ``![alt](target)`` occurrence is numbered in order of appearance
    (``0001``, ``0002``, ...). Its file is uploaded to ``MINIO_BUCKET`` as
    ``{markdown name}/{number}{extension}`` (``.jpg`` when the target has no
    extension), and the target is replaced by
    ``{MINIO_URL}/{MINIO_BUCKET}/{object name}``. Relative targets are
    resolved against the directory of the Markdown file. Images that cannot
    be found on disk, or whose upload is reported as failed, keep their
    original link. The rewritten text is saved as
    ``{markdown dir}/{PROCESSED_DIR}/{markdown file name}``.

    Args:
        md_path: Path of the Markdown file; a falsy value means nothing was
            selected.
        minio_client: Client handed to ``upload_file_to_minio``.

    Returns:
        The path of the rewritten Markdown file, or ``None`` when
        ``md_path`` is falsy.
    """
    if not md_path:
        print("未选择文件,程序退出。")
        return None

    md_filename = os.path.basename(md_path)
    md_dir = os.path.dirname(md_path)
    md_name, _ = os.path.splitext(md_filename)
    print(f"处理文件: {md_filename}")

    with open(md_path, "r", encoding="utf-8") as f:
        content = f.read()

    img_pattern = r'!\[.*?\]\((.*?)\)'
    matches = list(re.finditer(img_pattern, content))
    print(f"检测到{len(matches)}张图片。")

    # Rebuild the text by splicing at the exact offsets of each image target.
    # A plain string replace could hit an ordinary "[text](same/path)" link
    # that appears earlier in the file instead of the image itself.
    pieces = []
    cursor = 0
    for idx, match in enumerate(matches):
        img_path = match.group(1)
        img_ext = os.path.splitext(img_path)[1] or ".jpg"
        img_seq = f"{idx+1:04d}{img_ext}"
        print(f"处理图片: {img_path} -> {img_seq}")
        object_name = f"{md_name}/{img_seq}"

        # Resolve relative targets against the Markdown file's directory.
        if os.path.isabs(img_path):
            img_abs_path = img_path
        else:
            img_abs_path = os.path.join(md_dir, img_path)
        print(f"图片绝对路径: {img_abs_path}")

        if not os.path.exists(img_abs_path):
            print(f"图片未找到: {img_abs_path},跳过。")
            continue

        uploaded = upload_file_to_minio(minio_client, MINIO_BUCKET, object_name, img_abs_path)
        if uploaded is False:
            # Only an explicit False counts as failure; an uploader that
            # returns nothing is treated as having succeeded.
            continue
        minio_link = f"{MINIO_URL}/{MINIO_BUCKET}/{object_name}"
        print(f"图片 {img_path} 已上传为 {object_name}")

        pieces.append(content[cursor:match.start(1)])
        pieces.append(minio_link)
        cursor = match.end(1)
    pieces.append(content[cursor:])
    new_content = "".join(pieces)

    processed_dir = os.path.join(md_dir, PROCESSED_DIR)
    os.makedirs(processed_dir, exist_ok=True)
    processed_path = os.path.join(processed_dir, md_filename)
    with open(processed_path, "w", encoding="utf-8") as f:
        f.write(new_content)
    print(f"处理后的Markdown已保存到: {processed_path}")
    return processed_path
|
||||||
|
|
||||||
|
def main():
    """Process every Markdown file found in one directory.

    The directory is taken from the first command-line argument when one is
    given; otherwise the original development path is used, so running the
    script without arguments behaves as before. One MinIO client is created
    and shared by all files.
    """
    import sys  # local import: only the script entry point needs argv

    default_dir = "G:\\11\\ragflow_api_test\\markdown_files"
    source_dir = sys.argv[1] if len(sys.argv) > 1 else default_dir
    md_files = return_markdown_files(source_dir)

    minio_client = get_minio_client()
    for md_file in md_files:
        process_markdown_images(md_file, minio_client)


if __name__ == "__main__":
    main()
|
@@ -123,3 +123,7 @@ except S3Error as exc:
|
|||||||
print("MinIO错误:", exc)
|
print("MinIO错误:", exc)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("发生错误:", e)
|
print("发生错误:", e)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -73,6 +73,7 @@ def main():
|
|||||||
|
|
||||||
file_path = "g:\\11\\22\\路桥设计党建\\"
|
file_path = "g:\\11\\22\\路桥设计党建\\"
|
||||||
file_path = "F:\\2\\"
|
file_path = "F:\\2\\"
|
||||||
|
file_path = "g:\\11\\22\\规范\\"
|
||||||
pdf_dict, txt_dict = pair_pdf_and_txt(file_path,file_path)
|
pdf_dict, txt_dict = pair_pdf_and_txt(file_path,file_path)
|
||||||
|
|
||||||
if not pdf_dict:
|
if not pdf_dict:
|
||||||
|
Reference in New Issue
Block a user