更新数据转换功能,支持从新格式提取信息并生成多种问题模板,优化输入输出文件路径
This commit is contained in:
		| @@ -23,10 +23,11 @@ def convert_to_alpaca_format(input_file, output_file): | ||||
|         ] | ||||
|     } | ||||
|     """ | ||||
|     choice_text=", A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE" | ||||
|     print(f"转换数据: {input_file} -> {output_file}") | ||||
|  | ||||
|     converted_data = [] | ||||
|     with open(input_file, "r", encoding="utf-8") as f: | ||||
|     with open(input_file, "r", encoding="utf-8-sig") as f: | ||||
|         csv_reader = csv.DictReader(f) | ||||
|         for row in csv_reader: | ||||
|             try: | ||||
| @@ -44,7 +45,7 @@ def convert_to_alpaca_format(input_file, output_file): | ||||
|                     "system": "你是个优秀的论文分类师", | ||||
|                     "conversation": [ | ||||
|                         { | ||||
|                             "human": row["question"], | ||||
|                             "human": row["question"]+choice_text, | ||||
|                             "assistant": row["answer"] | ||||
|                         } | ||||
|                     ] | ||||
| @@ -62,19 +63,8 @@ def convert_to_alpaca_format(input_file, output_file): | ||||
|     print(f"转换完成! 共转换 {len(converted_data)} 条数据") | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     # parser = argparse.ArgumentParser(description="转换数据到Alpaca格式") | ||||
|     # parser.add_argument( | ||||
|     #     "--input", | ||||
|     #     type=str, | ||||
|     #     required=True, | ||||
|     #     help="输入文件路径 (swift_formatted_sft_train_data.jsonl)", | ||||
|     # ) | ||||
|     # parser.add_argument("--output", type=str, required=True, help="输出文件路径") | ||||
|  | ||||
|     # args = parser.parse_args() | ||||
|  | ||||
|     #input_file = "arxiv-metadata-oai-snapshot--random.json"   # 20000条原始数据文件路径 | ||||
|     input_file = "newformat_sft_test_data.csv" | ||||
|     output_file = "newformat_sft_test_data--swift-sft.jsonl"  # 输出文件路径 | ||||
|     input_file = "G:\\11\\data-prepare\\eval_oc_data-26gai.csv" | ||||
|     output_file = "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26.jsonl"  # 输出文件路径 | ||||
|  | ||||
|     convert_to_alpaca_format(input_file, output_file) | ||||
| @@ -2,6 +2,7 @@ | ||||
| import json | ||||
| import os | ||||
| import argparse | ||||
| import random | ||||
|  | ||||
|  | ||||
|  | ||||
| @@ -99,7 +100,7 @@ def convert_to_alpaca_format(input_file, output_file): | ||||
|  | ||||
|  | ||||
|  | ||||
| def convert_onedata2multi_type(input_file, output_file): | ||||
| def convert_onedata2multi_type(input_file, output_file, num_templates): | ||||
|     """ | ||||
|     读取input_file,将Swift格式的每条数据按随机抽取的num_templates种问题模板(不超过模板总数)转换为相应数量的数据, | ||||
|     并保存为output_file | ||||
| @@ -108,6 +109,7 @@ def convert_onedata2multi_type(input_file, output_file): | ||||
|     input_file: 输入文件路径 | ||||
|     output_file: 输出文件路径 | ||||
|     num_templates: 每条数据随机抽取的问题模板数量(超过模板总数时使用全部模板) | ||||
|     """ | ||||
|     print(f"开始转换数据...每条数据生成{num_templates}条变体") | ||||
|     print(f"开始转换数据: {input_file} -> {output_file}") | ||||
|  | ||||
|     category_text=" A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE\n" | ||||
| @@ -183,45 +185,84 @@ def convert_onedata2multi_type(input_file, output_file): | ||||
|             try: | ||||
|                 data = json.loads(line.strip()) | ||||
|                  | ||||
|                 # 检查数据结构 | ||||
|                 if "system" not in data or "conversation" not in data or not data["conversation"]: | ||||
|                     print(f"警告: 数据缺少必要字段: {data}") | ||||
|                 # 检查新格式的数据结构 | ||||
|                 if "messages" in data and isinstance(data["messages"], list) and len(data["messages"]) >= 3: | ||||
|                     # 提取系统指令 | ||||
|                     system_instruction = "" | ||||
|                     human_content = "" | ||||
|                     assistant_content = "" | ||||
|                      | ||||
|                     for msg in data["messages"]: | ||||
|                         if msg["role"] == "system": | ||||
|                             system_instruction = msg["content"] | ||||
|                         elif msg["role"] == "user": | ||||
|                             human_content = msg["content"] | ||||
|                         elif msg["role"] == "assistant": | ||||
|                             assistant_content = msg["content"] | ||||
|                      | ||||
|                     # 提取标题、作者和摘要 | ||||
|                     extracted = extract_title_author_and_abstract(human_content) | ||||
|                     title = extracted.get("title", "") | ||||
|                     authors = extracted.get("authors", "") | ||||
|                     abstract = extracted.get("abstract", "") | ||||
|                      | ||||
|  | ||||
|                     n = min(num_templates, len(question_templates)) | ||||
|                     selected_templates = random.sample(question_templates, n) | ||||
|                     # 为每个问题模板创建新数据 | ||||
|                     for template in selected_templates: | ||||
|                         formatted_question = template.format( | ||||
|                             title=title, | ||||
|                             authors=authors, | ||||
|                             abstract=abstract, | ||||
|                             category_text=category_text | ||||
|                         ) | ||||
|                          | ||||
|                         # 创建新的数据条目(保持新格式) | ||||
|                         new_data = { | ||||
|                             "messages": [ | ||||
|                                 {"role": "system", "content": system_instruction}, | ||||
|                                 {"role": "user", "content": formatted_question}, | ||||
|                                 {"role": "assistant", "content": assistant_content} | ||||
|                             ] | ||||
|                         } | ||||
|                         multi_type_data.append(new_data) | ||||
|                  | ||||
|                 # 检查旧格式的数据结构 | ||||
|                 elif "system" in data and "conversation" in data and data["conversation"]: | ||||
|                     system_instruction = data.get("system", "根据论文的标题、作者和摘要,确定该论文的科学类别。") | ||||
|                      | ||||
|                     for turn in data["conversation"]: | ||||
|                         if "human" in turn and "assistant" in turn: | ||||
|                             extracted = extract_title_author_and_abstract(turn["human"]) | ||||
|                             title = extracted.get("title", "") | ||||
|                             authors = extracted.get("authors", "") | ||||
|                             abstract = extracted.get("abstract", "") | ||||
|                             n = min(num_templates, len(question_templates)) | ||||
|                             selected_templates = random.sample(question_templates, n) | ||||
|                              | ||||
|                             for template in selected_templates: | ||||
|                                 formatted_question = template.format( | ||||
|                                     title=title, | ||||
|                                     authors=authors, | ||||
|                                     abstract=abstract, | ||||
|                                     category_text=category_text | ||||
|                                 ) | ||||
|                                  | ||||
|                                 new_data = { | ||||
|                                     "system": system_instruction, | ||||
|                                     "conversation": [ | ||||
|                                         { | ||||
|                                             "human": formatted_question, | ||||
|                                             "assistant": turn["assistant"] | ||||
|                                         } | ||||
|                                     ] | ||||
|                                 } | ||||
|                                 multi_type_data.append(new_data) | ||||
|                 else: | ||||
|                     print(f"警告: 数据格式不识别: {data}") | ||||
|                     continue | ||||
|              | ||||
|                 # 获取系统指令 | ||||
|                 system_instruction = data.get("system", "根据论文的标题、作者和摘要,确定该论文的科学类别。") | ||||
|                  | ||||
|                 # 处理对话 | ||||
|                 for turn in data["conversation"]: | ||||
|                     if "human" in turn and "assistant" in turn: | ||||
|                         # 提取标题、作者和摘要 | ||||
|                         extracted = extract_title_author_and_abstract(turn["human"]) | ||||
|                         title = extracted.get("title", "") | ||||
|                         authors = extracted.get("authors", "") | ||||
|                         abstract = extracted.get("abstract", "") | ||||
|                          | ||||
|                         # 为每个问题模板创建新数据 | ||||
|                         for template in question_templates: | ||||
|                             # 格式化问题 | ||||
|                             formatted_question = template.format( | ||||
|                                 title=title, | ||||
|                                 authors=authors, | ||||
|                                 abstract=abstract, | ||||
|                                 category_text=category_text | ||||
|                             ) | ||||
|                              | ||||
|                             # 创建新的数据条目 | ||||
|                             new_data = { | ||||
|                                 "system": system_instruction, | ||||
|                                 "conversation": [ | ||||
|                                     { | ||||
|                                         "human": formatted_question, | ||||
|                                         "assistant": turn["assistant"] | ||||
|                                     } | ||||
|                                 ] | ||||
|                             } | ||||
|                             multi_type_data.append(new_data) | ||||
|              | ||||
|             except json.JSONDecodeError: | ||||
|                 print(f"警告: 无法解析JSON行: {line}") | ||||
|             except Exception as e: | ||||
| @@ -248,10 +289,12 @@ if __name__ == "__main__": | ||||
|     content_text="Based on the title 'The Quantum Primordial Black Holes, Dimensionless Small Parameter,   Inflationary Cosmology and Non-Gaussianity', authors 'Alexander Shalyt-Margolin', and abstract 'In the present work consideration is given to the primordial black holes ({\\bf pbhs}) in the Schwarzschild-de Sitter Metric with small mass (ultralight) in the preinflationary epoch. Within the scope of natural assumptions, it has been shown that the quantum-gravitational corrections ({\\bf qgcs}) to the characteristics of such black holes can contribute to all the cosmological parameters, shifting them compared with the semiclassical consideration. These contributions are determined by a series expansion in terms of a small parameter dependent on the hole mass (radius). For this pattern different cases have been considered (stationary, black hole evaporation...). It has been demonstrated that involvement of ({\\bf qgcs}) leads to a higher probability for the occurrence of such {\\bf pbhs}. Besides, high-energy deformations of Friedmann Equations created on the basis of these corrections have been derived for different patterns. In the last section of this work it is introduced a study into the contributions generated by the above-mentioned {\\bf qgcs} in inflationary cosmological perturbations. Besides, it has been shown that non-Gaussianity of these perturbations is higher as compared to the semi-classical pattern.', please determine the scientific category of this paper. Additional info: 35 pages, Latex , A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE" | ||||
|     extract_title_author_and_abstract(content_text) | ||||
|  | ||||
|     input_file = "G:\\11\\data-prepare\\val_dataset.jsonl" | ||||
|     output_file = "G:\\11\\data-prepare\\val_dataset-m.jsonl"  # 输出文件路径 | ||||
|     # input_file = "G:\\11\\data-prepare\\val_dataset.jsonl" | ||||
|     # output_file = "G:\\11\\data-prepare\\val_dataset-m2.jsonl"  # 输出文件路径 | ||||
|     input_file = "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26.jsonl" | ||||
|     output_file = "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26-m2.jsonl"  # 输出文件路径     | ||||
|  | ||||
|     convert_onedata2multi_type(input_file, output_file) | ||||
|     convert_onedata2multi_type(input_file, output_file, num_templates=2) | ||||
|  | ||||
|  | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user