81 lines
2.3 KiB
Python
81 lines
2.3 KiB
Python
import json
|
||
import os
|
||
import argparse
|
||
import re
|
||
|
||
|
||
def convert_to_alpaca_format(input_file, output_file, trim_tail=180):
    """Clean a Swift-style JSONL dataset and write it back in the same shape.

    Despite the function name, no Alpaca conversion takes place: each output
    record has the same layout as the input,
    ``{"messages": [{"role": ..., "content": ...}]}``.

    Only the first message of every record is kept. Every
    ``with ID <number>,`` fragment (for example ``with ID 0704.0145,``) is
    removed from its content, and the content is then shortened by
    ``trim_tail`` characters from the end.

    Blank lines are skipped silently. Lines that are not valid JSON, or
    whose ``messages`` entry is missing, empty or malformed, are reported on
    stdout and skipped. All records are collected in memory and the output
    file is written only after the input has been read completely.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).
        trim_tail: Number of trailing characters dropped from each content
            after the ID fragment is removed. Defaults to 180, the value
            that used to be hard-coded; 0 (or a negative value) disables
            trimming.
            NOTE(review): why 180 characters are dropped is not visible in
            this file -- confirm against the dataset before changing it.
    """
    print(f"转换数据: {input_file} -> {output_file}")

    # Compiled once so the per-line loop does not repeat the pattern lookup.
    id_fragment = re.compile(r'with ID \d+\.?\d*,\s*')

    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line is not a record; skipping it here avoids a
                # misleading "cannot parse JSON" warning.
                continue

            try:
                data = json.loads(stripped)
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
                continue

            # Validate the record structure.
            if not isinstance(data, dict) or not isinstance(data.get("messages"), list):
                print("警告: 数据格式不正确,缺少messages字段或格式错误")
                continue

            messages = data["messages"]
            if not messages or not isinstance(messages[0], dict) or "content" not in messages[0]:
                print("警告: messages为空或缺少content字段")
                continue

            first_message = messages[0]
            try:
                # Remove the "with ID xxxx.xxxx," fragment.
                content = id_fragment.sub('', first_message["content"])
            except TypeError as e:
                # Raised when content is not a string; report and move on.
                print(f"处理行时发生错误: {str(e)}")
                continue

            # Guarded because content[:-0] would yield an empty string
            # instead of leaving the text untouched.
            if trim_tail > 0:
                content = content[:-trim_tail]

            converted_data.append({
                "messages": [
                    {
                        "role": first_message.get("role", "assistant"),
                        "content": content,
                    }
                ]
            })

    # Write the output file, one JSON object per line.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
|
||
|
||
|
||
if __name__ == "__main__":
    # Paths can be overridden on the command line. The defaults are the file
    # names that used to be hard-coded, so running the script without
    # arguments behaves exactly as before.
    parser = argparse.ArgumentParser(
        description="Strip 'with ID ...,' fragments from a Swift-style JSONL dataset."
    )
    parser.add_argument(
        "--input",
        dest="input_file",
        default="arxiv-metadata-oai-snapshot--swift-pretrain.jsonl",
        help="path of the JSONL file to read",
    )
    parser.add_argument(
        "--output",
        dest="output_file",
        default="arxiv-metadata-oai-snapshot--swift-pretrain-.jsonl",
        help="path of the JSONL file to write",
    )
    args = parser.parse_args()

    input_file = args.input_file
    output_file = args.output_file  # output file path

    convert_to_alpaca_format(input_file, output_file)
|
||
|
||
|
||
|
||
|
||
|