Files
data-prepare/05-data-swfit-pretrain-revise.py
2025-07-18 18:00:04 +08:00

81 lines
2.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import argparse
import re
def convert_to_alpaca_format(input_file, output_file, trim_tail=180):
    """Clean a Swift-format pretraining JSONL file and write the result.

    Despite the function name, records keep their original Swift layout;
    only the text of the first message is rewritten. Expected input, one
    JSON object per line:

        {
            "messages": [
                {
                    "role": "assistant",
                    "content": "This is a paper titled ...."
                }
            ]
        }

    For every valid record the first message's content is processed in
    this order:

    1. every ``"with ID <number>,"`` fragment (for example
       ``"with ID 0704.0145,"``) plus following whitespace is removed;
    2. the last ``trim_tail`` characters are cut off.

    Only the first message is kept in the output; its role defaults to
    ``"assistant"`` when missing. Blank lines are skipped silently. Lines
    that are not valid JSON, lack a ``messages`` list, or lack ``content``
    in the first message are reported on stdout and skipped.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write. It is opened
            only after the whole input has been read.
        trim_tail: Number of trailing characters to drop from each
            content string. Defaults to 180, the value that used to be
            hard-coded. Zero or a negative value disables the cut.
            NOTE(review): presumably 180 matches a fixed-length trailer
            in the upstream data -- confirm before changing the default.
    """
    print(f"转换数据: {input_file} -> {output_file}")
    # Compiled once: the same pattern is applied to every record.
    id_pattern = re.compile(r'with ID \d+\.?\d*,\s*')
    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # An empty line is not a malformed record; without this
                # guard json.loads("") would be reported as a parse error.
                continue
            try:
                data = json.loads(stripped)
                # Validate the record structure before touching it.
                if "messages" not in data or not isinstance(data["messages"], list):
                    print("警告: 数据格式不正确缺少messages字段或格式错误")
                    continue
                if not data["messages"] or "content" not in data["messages"][0]:
                    print("警告: messages为空或缺少content字段")
                    continue
                first_message = data["messages"][0]
                # Remove the "with ID xxxx.xxxx," fragment.
                content = id_pattern.sub('', first_message["content"])
                # Guarded because content[:-0] would yield an empty string
                # and a negative value would keep a prefix instead.
                if trim_tail > 0:
                    content = content[:-trim_tail]
                converted_data.append({
                    "messages": [
                        {
                            "role": first_message.get("role", "assistant"),
                            "content": content
                        }
                    ]
                })
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                # Best effort: one bad record (e.g. non-string content)
                # must not abort the whole conversion.
                print(f"处理行时发生错误: {str(e)}")
    # Write the output file.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
if __name__ == "__main__":
    # Script entry point. Input and output paths may be given on the command
    # line; with no arguments the script falls back to the file names it was
    # originally hard-coded with, so existing invocations behave the same.
    parser = argparse.ArgumentParser(
        description="Strip 'with ID ...,' fragments from a Swift-format pretraining JSONL file."
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        default="arxiv-metadata-oai-snapshot--swift-pretrain.jsonl",
        help="path of the JSONL file to read",
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default="arxiv-metadata-oai-snapshot--swift-pretrain-.jsonl",
        help="path of the JSONL file to write",
    )
    args = parser.parse_args()
    convert_to_alpaca_format(args.input_file, args.output_file)