81 lines
2.3 KiB
Python
81 lines
2.3 KiB
Python
|
import json
|
|||
|
import os
|
|||
|
import argparse
|
|||
|
import re
|
|||
|
|
|||
|
|
|||
|
def convert_to_alpaca_format(input_file, output_file, trim_tail_chars=180):
    """Clean a Swift-format JSONL file and write it back in the same layout.

    Despite its name, this function does not emit Alpaca records: the
    output keeps the ``{"messages": [...]}`` layout of the input.

    Each input line is expected to be a JSON object such as::

        {"messages": [{"role": "assistant",
                       "content": "This is a paper titled ...."}]}

    For every usable line, the content of the FIRST message is cleaned by

    1. removing every ``"with ID 0704.0145,"``-style fragment, and
    2. dropping the last ``trim_tail_chars`` characters,

    and a record holding only that one message is written to the output.
    Messages after the first, and any other top-level keys, are not copied.

    Lines that are blank are skipped silently. Lines that are not valid
    JSON, lack a ``messages`` list, have an empty ``messages`` list, or
    whose first message has no ``content`` are skipped with a printed
    warning. Any other per-line error is printed and the line is skipped,
    so one bad record never aborts the run.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).
        trim_tail_chars: Number of trailing characters removed from each
            cleaned content string. Defaults to 180, the value that used to
            be hard-coded. Content shorter than this becomes an empty
            string. Values <= 0 disable trimming.

    Returns:
        None. Progress and warnings are reported via ``print``.

    Raises:
        OSError: If either file cannot be opened.
    """
    print(f"转换数据: {input_file} -> {output_file}")

    # Compiled once: the pattern does not change between lines.
    id_fragment = re.compile(r'with ID \d+\.?\d*,\s*')

    # All records are collected before the output is opened, so the output
    # file is only created once the whole input has been read.
    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line (typically a trailing newline) holds no
                # record; do not report it as a JSON parse failure.
                continue

            try:
                data = json.loads(stripped)

                # Validate the record structure.
                if "messages" not in data or not isinstance(data["messages"], list):
                    print("警告: 数据格式不正确，缺少messages字段或格式错误")
                    continue

                if not data["messages"] or "content" not in data["messages"][0]:
                    print("警告: messages为空或缺少content字段")
                    continue

                first_message = data["messages"][0]

                # Remove the "with ID xxxx.xxxx," fragment.
                content = id_fragment.sub('', first_message["content"])

                # Guarded because content[:-0] would yield "" rather than
                # leaving the string untouched.
                if trim_tail_chars > 0:
                    content = content[:-trim_tail_chars]

                converted_data.append(
                    {
                        "messages": [
                            {
                                "role": first_message.get("role", "assistant"),
                                "content": content,
                            }
                        ]
                    }
                )

            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                # Deliberately broad: processing is best-effort per line, so
                # an unexpected record shape is reported and skipped.
                print(f"处理行时发生错误: {str(e)}")

    # Write the output file, one JSON object per line.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
|
|||
|
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Command-line entry point. Both paths are optional positional
    # arguments; when omitted they fall back to the file names that used to
    # be hard-coded, so running the script with no arguments is unchanged.
    parser = argparse.ArgumentParser(
        description="Remove 'with ID <id>,' fragments from a Swift-format JSONL file."
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        default="arxiv-metadata-oai-snapshot--swift-pretrain.jsonl",
        help="path of the JSONL file to read",
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default="arxiv-metadata-oai-snapshot--swift-pretrain-.jsonl",
        help="path of the JSONL file to write",
    )
    args = parser.parse_args()

    convert_to_alpaca_format(args.input_file, args.output_file)
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|
|||
|
|