更新数据转换功能,支持从新格式提取信息并生成多种问题模板,优化输入输出文件路径
This commit is contained in:
@@ -23,10 +23,11 @@ def convert_to_alpaca_format(input_file, output_file):
|
||||
]
|
||||
}
|
||||
"""
|
||||
choice_text=", A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE"
|
||||
print(f"转换数据: {input_file} -> {output_file}")
|
||||
|
||||
converted_data = []
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
with open(input_file, "r", encoding="utf-8-sig") as f:
|
||||
csv_reader = csv.DictReader(f)
|
||||
for row in csv_reader:
|
||||
try:
|
||||
@@ -44,7 +45,7 @@ def convert_to_alpaca_format(input_file, output_file):
|
||||
"system": "你是个优秀的论文分类师",
|
||||
"conversation": [
|
||||
{
|
||||
"human": row["question"],
|
||||
"human": row["question"]+choice_text,
|
||||
"assistant": row["answer"]
|
||||
}
|
||||
]
|
||||
@@ -62,19 +63,8 @@ def convert_to_alpaca_format(input_file, output_file):
|
||||
print(f"转换完成! 共转换 {len(converted_data)} 条数据")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# parser = argparse.ArgumentParser(description="转换数据到Alpaca格式")
|
||||
# parser.add_argument(
|
||||
# "--input",
|
||||
# type=str,
|
||||
# required=True,
|
||||
# help="输入文件路径 (swift_formatted_sft_train_data.jsonl)",
|
||||
# )
|
||||
# parser.add_argument("--output", type=str, required=True, help="输出文件路径")
|
||||
|
||||
# args = parser.parse_args()
|
||||
|
||||
#input_file = "arxiv-metadata-oai-snapshot--random.json" # 20000条原始数据文件路径
|
||||
input_file = "newformat_sft_test_data.csv"
|
||||
output_file = "newformat_sft_test_data--swift-sft.jsonl" # 输出文件路径
|
||||
input_file = "G:\\11\\data-prepare\\eval_oc_data-26gai.csv"
|
||||
output_file = "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26.jsonl" # 输出文件路径
|
||||
|
||||
convert_to_alpaca_format(input_file, output_file)
|
Reference in New Issue
Block a user