Files
data-prepare/05-data-swfit-pretrain-revise.py

81 lines
2.3 KiB
Python
Raw Normal View History

2025-07-18 18:00:04 +08:00
import json
import os
import argparse
import re
def convert_to_alpaca_format(input_file, output_file, *, trim_tail_chars=180):
    """Clean a Swift-format JSONL file and write it back in the same format.

    Despite its name, this function does not emit Alpaca records: every
    output line keeps the Swift layout ``{"messages": [{"role", "content"}]}``.

    For each input line, only the first entry of ``messages`` is used:

    1. Every ``"with ID <digits>[.<digits>], "`` clause is removed from its
       ``content`` (e.g. ``"with ID 0704.0145, "``).
    2. The last ``trim_tail_chars`` characters of the result are dropped.
       NOTE(review): the default of 180 presumably strips a fixed-length
       trailer appended by an earlier pipeline step -- confirm against the
       data. Content shorter than the trim length becomes an empty string.
    3. A new record holding just that one message is produced; ``role``
       falls back to ``"assistant"`` when absent.

    Blank lines are skipped silently. Lines that are not valid JSON, or
    whose structure does not match the expected shape (including a
    non-string ``content``), are reported on stdout and skipped.

    All converted records are collected before the output file is opened,
    so a failure while reading never leaves a partially written output file.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to (over)write.
        trim_tail_chars: Number of trailing characters to cut from each
            cleaned ``content``. ``0`` disables the cut.

    Raises:
        ValueError: If ``trim_tail_chars`` is negative.
        OSError: If either file cannot be opened.
    """
    if trim_tail_chars < 0:
        raise ValueError("trim_tail_chars must be non-negative")

    print(f"转换数据: {input_file} -> {output_file}")

    # Compiled once; the same pattern is applied to every record.
    id_clause = re.compile(r'with ID \d+\.?\d*,\s*')
    converted_lines = []

    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            raw = line.strip()
            if not raw:
                # A blank line is not a record, so it is not worth a warning.
                continue

            try:
                data = json.loads(raw)
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
                continue

            # Validate the structure explicitly instead of relying on a
            # TypeError surfacing from deeper in the processing.
            messages = data.get("messages") if isinstance(data, dict) else None
            if not isinstance(messages, list):
                print("警告: 数据格式不正确缺少messages字段或格式错误")
                continue

            first = messages[0] if messages else None
            content = first.get("content") if isinstance(first, dict) else None
            if not isinstance(content, str):
                print("警告: messages为空或缺少content字段")
                continue

            # Remove the "with ID xxxx.xxxx," clause.
            content = id_clause.sub('', content)
            # content[:-0] would yield "", so a zero trim must be a no-op.
            if trim_tail_chars:
                content = content[:-trim_tail_chars]

            new_data = {
                "messages": [
                    {
                        "role": first.get("role", "assistant"),
                        "content": content,
                    }
                ]
            }
            converted_lines.append(
                json.dumps(new_data, ensure_ascii=False) + "\n"
            )

    # Write the output only after the whole input has been read.
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(converted_lines)

    print(f"转换完成! 共转换 {len(converted_lines)} 条数据")
if __name__ == "__main__":
    # Script entry point. Both paths can be overridden on the command line;
    # the defaults are the file names this script was originally hard-wired
    # to, so running it without arguments behaves exactly as before.
    parser = argparse.ArgumentParser(
        description=(
            "Strip 'with ID <arXiv id>,' clauses from a Swift-format "
            "pretraining JSONL file."
        )
    )
    parser.add_argument(
        "--input-file",
        default="arxiv-metadata-oai-snapshot--swift-pretrain.jsonl",
        help="path of the JSONL file to read",
    )
    parser.add_argument(
        "--output-file",
        default="arxiv-metadata-oai-snapshot--swift-pretrain-.jsonl",
        help="path of the JSONL file to write",
    )
    args = parser.parse_args()
    convert_to_alpaca_format(args.input_file, args.output_file)