Files
data-prepare/05-data-csv-swift-sft.py

70 lines
2.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import csv
def convert_to_alpaca_format(input_file, output_file):
    """Convert a question/answer CSV file into swift-style SFT JSONL.

    Despite the function name, the records written are in the swift
    conversation layout, one JSON object per line::

        {
            "system": "你是个优秀的论文分类师",
            "conversation": [
                {"human": "<question><choice list>", "assistant": "<answer>"}
            ]
        }

    Only the ``question`` and ``answer`` columns of the CSV are used; any
    other columns are ignored.

    Args:
        input_file: Path of the CSV file to read. It is decoded as
            ``utf-8-sig`` so a leading BOM is tolerated.
        output_file: Path of the JSONL file to write (UTF-8, overwritten).

    Returns:
        None. Progress and warnings are printed to stdout.

    Rows where ``question`` or ``answer`` is absent are skipped with a
    warning. This covers both a header that lacks the column and a short
    row, for which ``csv.DictReader`` fills the missing fields with ``None``.
    """
    # Fixed list of answer choices appended verbatim to every question
    # (presumably arXiv category labels -- confirm with the dataset owner).
    choice_text = (
        ", A. quant-ph\n"
        "B. physics.chem-ph\n"
        "C. physics.atom-ph\n"
        "D. cond-mat.soft\n"
        "E. cs.RO\n"
        "F. cs.CL\n"
        "G. cs.SE\n"
        "H. cs.IR\n"
        "I. hep-th\n"
        "J. hep-ph\n"
        "K. physics.optics\n"
        "L. cs.AI\n"
        "M. cs.CV\n"
        "N. nucl-th\n"
        "O. astro-ph\n"
        "P. math.PR\n"
        "Q. cs.OS\n"
        "R. eess.SP\n"
        "S. math.OC\n"
        "T. math.DS\n"
        "U. math.DG\n"
        "V. math.MP\n"
        "W. cs.MM\n"
        "X. stat.ME\n"
        "Y. math.CO\n"
        "Z. cs.NE"
    )
    print(f"转换数据: {input_file} -> {output_file}")
    converted_data = []
    # newline="" is what the csv module requires so that line breaks inside
    # quoted fields reach the parser untouched.
    with open(input_file, "r", encoding="utf-8-sig", newline="") as f:
        for row in csv.DictReader(f):
            question = row.get("question")
            answer = row.get("answer")
            # A value of None means the column is missing from the header or
            # the row was too short; writing it would produce a null label.
            if question is None or answer is None:
                print(f"警告: 数据缺少必要列: {row}")
                continue
            converted_data.append(
                {
                    "system": "你是个优秀的论文分类师",
                    "conversation": [
                        {
                            "human": question + choice_text,
                            "assistant": answer,
                        }
                    ],
                }
            )
    # The output is opened only after the whole input has been read, so a
    # failure while reading never leaves a truncated output file behind.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
if __name__ == "__main__":
    import sys

    # Default locations, used when no command-line arguments are given.
    input_file = "G:\\11\\data-prepare\\eval_oc_data-26gai.csv"
    # Output file path.
    output_file = "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26.jsonl"
    # Optional overrides: <script> [input_csv [output_jsonl]]
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    convert_to_alpaca_format(input_file, output_file)