From 40211521a2ffa117ce86dd5c99e38c0d62e20afe Mon Sep 17 00:00:00 2001
From: glowzz <24627181@qq.com>
Date: Tue, 22 Jul 2025 18:29:48 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=B8=BB=E5=87=BD=E6=95=B0?=
 =?UTF-8?q?=EF=BC=8C=E6=B7=BB=E5=8A=A0PDF=E5=92=8CTXT=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E5=A4=84=E7=90=86=E8=AF=B4=E6=98=8E=EF=BC=8C=E6=B3=A8=E9=87=8A?=
 =?UTF-8?q?=E6=8E=89=E6=96=87=E4=BB=B6=E9=80=89=E6=8B=A9=E5=92=8C=E5=A4=84?=
 =?UTF-8?q?=E7=90=86=E9=80=BB=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes:
- The two dataset prints follow the `if not dataset` guard. Placed before
  the guard they read `dataset.name` / `dataset.id` first, so a falsy result
  from select_dataset() (e.g. None) would fail on attribute access instead of
  reaching the guard's message and early return.
- The docstring summary added to main() is written in English.
- NOTE(review): the post-image blob id on the `index` line was not
  regenerated after these changes, and the hunk header declares 19/27 lines
  while only 16/24 are present in this copy -- confirm the trailing context
  before applying.

 src/add_chunk_cli_pdf_img.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/add_chunk_cli_pdf_img.py b/src/add_chunk_cli_pdf_img.py
index 8438f3c..e4ad54b 100644
--- a/src/add_chunk_cli_pdf_img.py
+++ b/src/add_chunk_cli_pdf_img.py
@@ -157,19 +157,27 @@ def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
         process_txt_chunks(document, txt_path)
 
 def main():
-    file_path = "g:\\11\\22\\规范\\"
-    pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
+
+    """Main function: process PDF and TXT file pairs.
 
-    if not pdf_dict:
-        print("未选择任何文件。")
-        return
+    dataset.id = bucket_name
+    chunk_id = object_name
+    """
+    file_path = "g:\\11\\22\\规范\\"
+    #pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
+
+    # if not pdf_dict:
+    #     print("未选择任何文件。")
+    #     return
 
     dataset = select_dataset(rag_object)
     if not dataset:
         print("未选择数据集。")
         return
+    print(f"选择的数据集: {dataset.name}")
+    print(f"选择的数据集id: {dataset.id}")
 
-    process_pdf_txt_pairs(pdf_dict, txt_dict, dataset)
+    #process_pdf_txt_pairs(pdf_dict, txt_dict, dataset)