diff --git a/src/add_chunk_cli_pdf_img.py b/src/add_chunk_cli_pdf_img.py
index 8438f3c..e4ad54b 100644
--- a/src/add_chunk_cli_pdf_img.py
+++ b/src/add_chunk_cli_pdf_img.py
@@ -157,19 +157,27 @@ def process_pdf_txt_pairs(pdf_dict, txt_dict, dataset):
         process_txt_chunks(document, txt_path)
 
 def main():
-    file_path = "g:\\11\\22\\规范\\"
-    pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
+
+    """Entry point: select a dataset and report its name and id.
 
-    if not pdf_dict:
-        print("未选择任何文件。")
-        return
+    PDF/TXT pairing and processing are currently disabled (commented out).
+    Author's mapping note: dataset.id = bucket_name, chunk_id = object_name.
+    """
+    file_path = "g:\\11\\22\\规范\\"
+    #pdf_dict, txt_dict = pair_pdf_and_txt(file_path, file_path)
+
+    # if not pdf_dict:
+    #     print("未选择任何文件。")
+    #     return
 
     dataset = select_dataset(rag_object)
     if not dataset:
         print("未选择数据集。")
         return
+    print(f"选择的数据集: {dataset.name}")
+    print(f"选择的数据集id: {dataset.id}")
 
-    process_pdf_txt_pairs(pdf_dict, txt_dict, dataset)
+    #process_pdf_txt_pairs(pdf_dict, txt_dict, dataset)