Files
data-prepare/05-data-xtuner-swfit.py
2025-07-18 18:00:04 +08:00

75 lines
2.0 KiB
Python

import json
import os
import argparse
def convert_to_alpaca_format(input_file, output_file):
    """Convert Alpaca-style JSONL records into system/conversation records.

    Reads ``input_file`` one line at a time, treating every non-blank line as
    a standalone JSON object, and writes one converted JSON object per line
    to ``output_file``.

    Input record (one per line)::

        {
            "instruction": "...",
            "input": "Based on the title...",
            "output": "D"
        }

    Output record (one per line)::

        {
            "system": "你是个优秀的论文分类师",
            "conversation": [
                {"human": "Based on the title...", "assistant": "D"}
            ]
        }

    The record's ``instruction`` field is not carried over; every output
    record gets the same fixed ``system`` prompt instead.

    NOTE(review): the output key names (``system`` / ``human`` /
    ``assistant``) follow the layout this function has always documented.
    Confirm they match what the downstream trainer actually expects.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to create or overwrite.

    Returns:
        None. Progress and per-line problems are reported with ``print``.

    Raises:
        OSError: If either file cannot be opened.
    """
    print(f"转换数据: {input_file} -> {output_file}")
    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) are not records and
                # should not be reported as JSON errors.
                continue
            try:
                data = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {stripped}")
                continue
            try:
                # Build the converted record; this is the step that actually
                # populates the output.
                item = {
                    "system": "你是个优秀的论文分类师",
                    "conversation": [
                        {
                            "human": data["input"],
                            "assistant": data["output"],
                        }
                    ],
                }
            except (KeyError, TypeError) as e:
                # Missing field, or the line held valid JSON that is not an
                # object: report it and keep going (best-effort conversion).
                print(f"处理行时发生错误: {str(e)}")
                continue
            converted_data.append(item)
    # Write the output file, one JSON object per line.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
if __name__ == "__main__":
    # Command-line entry point. Both paths are optional and default to the
    # file names this script used to hard-code, so running it with no
    # arguments behaves exactly as before.
    parser = argparse.ArgumentParser(description="转换数据到Alpaca格式")
    parser.add_argument(
        "--input",
        type=str,
        default="arxiv-metadata-oai-snapshot--swift.json",
        help="输入文件路径",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="arxiv-metadata-oai-snapshot--xtuner.jsonl",
        help="输出文件路径",
    )
    args = parser.parse_args()
    convert_to_alpaca_format(args.input, args.output)