更新 Elasticsearch 和 MinIO 配置,重构文件上传逻辑,添加图片链接处理功能
This commit is contained in:
@@ -7,10 +7,148 @@ base_url = "http://127.0.0.1:8099"
|
||||
|
||||
|
||||
## 公司内网
|
||||
base_url = "http://192.168.107.165:8099"
|
||||
api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
|
||||
# base_url = "http://192.168.107.165:8099"
|
||||
# api_key = "ragflow-I5ZDNjMWNhNTdlMjExZjBiOTEwMzI0ZT"
|
||||
|
||||
|
||||
elastic_tenant_id = "9c73df5a3ebc11f08410c237296aa408"
|
||||
|
||||
rag_object = RAGFlow(api_key=api_key, base_url=base_url)
|
||||
|
||||
elastic_url = "127.0.0.1"
|
||||
|
||||
from elasticsearch import Elasticsearch
|
||||
|
||||
# 初始化 Elasticsearch 用户名elastic,密码infini_rag_flow
|
||||
es = Elasticsearch(
|
||||
[{'host': elastic_url, 'port': 1200, 'scheme': 'http'}],
|
||||
basic_auth=('elastic', 'infini_rag_flow')
|
||||
)
|
||||
|
||||
def update_img_id_in_elasticsearch(tenant_id, doc_id, chunk_id, new_img_id):
|
||||
"""
|
||||
在 Elasticsearch 中更新指定文档块的 img_id。
|
||||
如果img_id不存在,则增加一个新的 img_id。
|
||||
|
||||
:param tenant_id: 租户 ID
|
||||
:param doc_id: 文档 ID
|
||||
:param chunk_id: 文档块 ID
|
||||
:param new_img_id: 新的 img_id
|
||||
:return: 更新结果
|
||||
|
||||
"""
|
||||
# 构建索引名称
|
||||
index_name = f"ragflow_{tenant_id}" # 这里需要替换为实际的索引名称生成逻辑
|
||||
|
||||
# 构建查询条件
|
||||
query = {
|
||||
"bool": {
|
||||
"must": [
|
||||
{"term": {"doc_id": doc_id}},
|
||||
{"term": {"_id": chunk_id}}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
# 搜索目标文档
|
||||
result = es.search(index=index_name, body={"query": query})
|
||||
|
||||
# 检查是否找到目标文档
|
||||
if result['hits']['total']['value'] == 0:
|
||||
return {"code": 102, "message": f"Can't find this chunk {chunk_id}"}
|
||||
|
||||
|
||||
# 获取目标文档的 ID
|
||||
hit = result['hits']['hits'][0]
|
||||
doc_id_in_es = hit['_id']
|
||||
|
||||
update_body = {
|
||||
"doc": {
|
||||
"img_id": new_img_id
|
||||
}
|
||||
}
|
||||
|
||||
# 更新文档
|
||||
update_result = es.update(index=index_name, id=doc_id_in_es, body=update_body)
|
||||
print("更新结果:", update_result)
|
||||
|
||||
|
||||
|
||||
if update_result['result'] == 'updated':
|
||||
return {"code": 0, "message": ""}
|
||||
else:
|
||||
return {"code": 100, "message": "Failed to update img_id"}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
from minio import Minio
|
||||
from minio.error import S3Error
|
||||
|
||||
|
||||
MINIO_HOST="127.0.0.1"
|
||||
|
||||
MINIO_CONFIG = {
|
||||
"endpoint": f"{MINIO_HOST}:{os.getenv('MINIO_PORT', '9000')}",
|
||||
"access_key": os.getenv("MINIO_USER", "rag_flow"),
|
||||
"secret_key": os.getenv("MINIO_PASSWORD", "infini_rag_flow"),
|
||||
"secure": False
|
||||
}
|
||||
|
||||
def get_minio_client():
|
||||
"""创建MinIO客户端"""
|
||||
return Minio(
|
||||
endpoint=MINIO_CONFIG["endpoint"],
|
||||
access_key=MINIO_CONFIG["access_key"],
|
||||
secret_key=MINIO_CONFIG["secret_key"],
|
||||
secure=MINIO_CONFIG["secure"]
|
||||
)
|
||||
|
||||
|
||||
def upload_file2minio(bucket_name, object_name, file_path):
|
||||
"""上传文件到MinIO
|
||||
# 通过fput_object上传时:
|
||||
|
||||
# 如果object_name为image\image.jpg,则上传后的名字就是image\image.jpg;
|
||||
|
||||
# 如果object_name为image/image.jpg,则上传后image为文件夹,文件名为image.jpg;
|
||||
|
||||
"""
|
||||
|
||||
minio_client= get_minio_client()
|
||||
|
||||
try:
|
||||
# 检查存储桶是否存在,如果不存在则创建(可选)
|
||||
if not minio_client.bucket_exists(bucket_name):
|
||||
minio_client.make_bucket(bucket_name)
|
||||
print(f"Bucket '{bucket_name}' created")
|
||||
|
||||
# 上传文件
|
||||
minio_client.fput_object(
|
||||
bucket_name=bucket_name,
|
||||
object_name=object_name,
|
||||
file_path=file_path
|
||||
)
|
||||
|
||||
# 获取文件的预签名URL(可选)
|
||||
#res = minio_client.get_presigned_url("GET", bucket_name, object_name, expires=timedelta(days=7))
|
||||
|
||||
#res = "http://127.0.0.1:9000" + "/"+bucket_name+"/" + object_name
|
||||
|
||||
|
||||
#print(res)
|
||||
print(f"文件 '{file_path}' 成功上传到存储桶 '{bucket_name}' 为 '{object_name}'")
|
||||
return True
|
||||
|
||||
except S3Error as exc:
|
||||
print("MinIO错误:", exc)
|
||||
return False
|
||||
except Exception as e:
|
||||
print("发生错误:", e)
|
||||
return False
|
||||
|
||||
|
||||
|
||||
@@ -120,17 +258,29 @@ def divid_txt_chunk_img(txt_chunk):
|
||||
|
||||
return clean_text, image_paths
|
||||
|
||||
def upload_images_to_minio(image_paths, document):
|
||||
"""
|
||||
上传图片到MinIO,
|
||||
def extract_images_from_chunk( content):
|
||||
"""从chunk内容中提取图片链接"""
|
||||
img_pattern = r'!\[.*?\]\((.*?)\)'
|
||||
return re.findall(img_pattern, content)
|
||||
|
||||
def remove_images_from_content( content):
|
||||
"""从内容中移除图片链接"""
|
||||
# 移除markdown图片语法 
|
||||
content = re.sub(r'!\[.*?\]\(.*?\)', '', content)
|
||||
# 清理多余的空行
|
||||
content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
|
||||
return content.strip()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def process_txt_chunks( dataset_id, document, txt_path):
|
||||
"""处理文本分块并添加到文档
|
||||
dataset_id = kb_id
|
||||
|
||||
|
||||
"""
|
||||
|
||||
|
||||
|
||||
|
||||
def process_txt_chunks(document, txt_path):
|
||||
"""处理文本分块并添加到文档"""
|
||||
try:
|
||||
with open(txt_path, 'r', encoding='utf-8') as file:
|
||||
file_content = file.read()
|
||||
@@ -138,23 +288,51 @@ def process_txt_chunks(document, txt_path):
|
||||
for num, txt_chunk in enumerate(file_content.split('\n\n')):
|
||||
if txt_chunk.strip():
|
||||
print(f"处理文本块: {txt_chunk[:30]}...")
|
||||
chunk = document.add_chunk(content=txt_chunk)
|
||||
img_urls= extract_images_from_chunk(txt_chunk)
|
||||
img_url = img_urls[0] if img_urls else None
|
||||
if img_url:
|
||||
print(f"检测到图片链接: {img_url}")
|
||||
# 清楚图片链接
|
||||
clean_chunk = remove_images_from_content(txt_chunk)
|
||||
chunk = document.add_chunk(content=clean_chunk)
|
||||
|
||||
# 判断是相对路径还是绝对路径
|
||||
if not os.path.isabs(img_url):
|
||||
img_abs_path = os.path.join(os.path.dirname(txt_path), img_url)
|
||||
else:
|
||||
img_abs_path = img_url
|
||||
print(f"图片绝对路径: {img_abs_path}")
|
||||
if not os.path.exists(img_abs_path):
|
||||
print(f"图片未找到: {img_abs_path},跳过。")
|
||||
continue
|
||||
else:
|
||||
if(upload_file2minio(dataset_id, chunk.id, img_abs_path)):
|
||||
new_img_id = f"{dataset_id}-{chunk.id}"
|
||||
print(f"图片 {img_abs_path} 已上传,新的 img_id: {new_img_id}")
|
||||
|
||||
update_img_id_in_elasticsearch(elastic_tenant_id, document.id, chunk.id, new_img_id)
|
||||
else:
|
||||
print("未检测到图片链接,直接添加文本块。")
|
||||
chunk = document.add_chunk(content=txt_chunk)
|
||||
print(f"第{num+1} Chunk添加成功! ID: {chunk.id}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"处理文本文件时出错: {txt_path},错误: {e}")
|
||||
|
||||
|
||||
|
||||
def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
|
||||
"""处理PDF-TXT文件对"""
|
||||
for name, pdf_path in pdf_dict.items():
|
||||
display_name = os.path.basename(pdf_path)
|
||||
document = upload_or_get_document(dataset, pdf_path, display_name)
|
||||
print(f"选择的文档: {document.name},ID: {document.id}")
|
||||
if not document:
|
||||
continue
|
||||
|
||||
txt_path = txt_dict.get(name)
|
||||
if txt_path:
|
||||
process_txt_chunks(document, txt_path)
|
||||
process_txt_chunks(dataset.id,document, txt_path)
|
||||
|
||||
def main():
|
||||
|
||||
@@ -163,12 +341,12 @@ def main():
|
||||
dataset.id = bucket_name
|
||||
chunk_id = object_name
|
||||
"""
|
||||
file_path = "g:\\11\\22\\规范\\"
|
||||
#pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
|
||||
file_path = "g:\\11\\22\\test\\"
|
||||
pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
|
||||
|
||||
# if not pdf_dict:
|
||||
# print("未选择任何文件。")
|
||||
# return
|
||||
if not pdf_dict:
|
||||
print("未选择任何文件。")
|
||||
return
|
||||
|
||||
dataset = select_dataset(rag_object)
|
||||
print(f"选择的数据集: {dataset.name}")
|
||||
@@ -177,7 +355,7 @@ def main():
|
||||
print("未选择数据集。")
|
||||
return
|
||||
|
||||
#process_pdf_txt_pairs(pdf_dict, txt_dict, dataset)
|
||||
process_pdf_txt_pairs(pdf_dict, txt_dict, dataset)
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user