2025-07-18 18:00:04 +08:00
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
import csv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_to_alpaca_format(input_file, output_file, choice_text=None):
    """Convert a question/answer CSV file into swift-style SFT JSON Lines.

    Despite the function name, each output record uses the swift
    conversation layout, one JSON object per line::

        {
            "system": "你是个优秀的论文分类师",
            "conversation": [
                {"human": "<question><choice_text>", "assistant": "<answer>"}
            ]
        }

    Only the ``question`` and ``answer`` columns of the CSV are read; any
    other columns (for example ``A`` .. ``J``) are ignored.

    Args:
        input_file: Path of the CSV file to read. It is decoded as
            ``utf-8-sig`` so a leading byte-order mark is dropped.
        output_file: Path of the JSON Lines file to write (UTF-8).
        choice_text: Text appended verbatim to every question. When
            ``None`` (the default) the built-in list of 26 category
            choices is used.

    Raises:
        OSError: If either file cannot be opened.
        csv.Error: If the CSV content cannot be parsed.
    """
    if choice_text is None:
        choice_text = ", A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE"

    print(f"转换数据: {input_file} -> {output_file}")

    converted_data = []

    # newline="" is what the csv module expects: it lets the reader handle
    # line endings itself, so newlines inside quoted fields survive intact.
    with open(input_file, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            # DictReader fills fields missing from a short row with None, so
            # a plain `"answer" in row` test would not catch truncated rows.
            # .get() covers both that case and a column absent from the header.
            question = row.get("question")
            answer = row.get("answer")
            if question is None or answer is None:
                print(f"警告: 数据缺少必要列: {row}")
                continue

            converted_data.append(
                {
                    "system": "你是个优秀的论文分类师",
                    "conversation": [
                        {
                            "human": question + choice_text,
                            "assistant": answer,
                        }
                    ],
                }
            )

    # Write one JSON object per line; ensure_ascii=False keeps non-ASCII
    # text (such as the system prompt) human-readable in the output.
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in converted_data
        )

    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the source CSV and the destination JSONL file are
    # fixed local paths; edit them here before running the conversion.
    input_file, output_file = (
        "G:\\11\\data-prepare\\eval_oc_data-26gai.csv",
        "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26.jsonl",  # output file path
    )

    convert_to_alpaca_format(input_file=input_file, output_file=output_file)
|