swift
This commit is contained in:
80
05-data-swfit-pretrain-revise.py
Normal file
80
05-data-swfit-pretrain-revise.py
Normal file
@@ -0,0 +1,80 @@
|
||||
import json
|
||||
import os
|
||||
import argparse
|
||||
import re
|
||||
|
||||
|
||||
def convert_to_alpaca_format(input_file, output_file, tail_chars=180):
    """Clean a Swift-style ``messages`` JSONL file and write the result.

    Despite the function name, the output keeps the same ``messages``
    layout as the input; no Alpaca-style fields are produced.

    Each input line is expected to be a JSON object shaped like::

        {"messages": [{"role": "assistant",
                       "content": "This is a paper titled ...."}]}

    For every valid record only the first message is kept.  Its content has
    every ``with ID <number>,`` fragment (for example ``with ID 0704.0145,``)
    removed, and then the last ``tail_chars`` characters are dropped.
    Records that are malformed are reported on stdout and skipped; blank
    lines are skipped silently.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).
        tail_chars: Number of trailing characters to cut from the content
            after the ID fragment is removed.  The default of 180 is the
            value that used to be hard-coded here.
            NOTE(review): presumably this strips a fixed-length trailer
            present in this particular dataset -- confirm against the data.
            A value of 0 (or less) disables the trimming.

    Returns:
        None.  Progress and warnings are printed to stdout.
    """
    print(f"转换数据: {input_file} -> {output_file}")

    # Compiled once; the same pattern is applied to every record.
    id_pattern = re.compile(r'with ID \d+\.?\d*,\s*')

    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line is not a record; do not report it as bad JSON.
                continue

            try:
                data = json.loads(stripped)

                # Validate the overall record structure.
                if (
                    not isinstance(data, dict)
                    or "messages" not in data
                    or not isinstance(data["messages"], list)
                ):
                    print(f"警告: 数据格式不正确,缺少messages字段或格式错误")
                    continue

                messages = data["messages"]
                if (
                    not messages
                    or not isinstance(messages[0], dict)
                    or "content" not in messages[0]
                ):
                    print(f"警告: messages为空或缺少content字段")
                    continue

                first = messages[0]

                # Drop the "with ID xxxx.xxxx," fragment.
                content = id_pattern.sub('', first["content"])

                # Cut the fixed-length tail.  Guarded because ``s[:-0]``
                # would yield an empty string instead of leaving ``s`` alone.
                if tail_chars > 0:
                    content = content[:-tail_chars]

                converted_data.append(
                    {
                        "messages": [
                            {
                                "role": first.get("role", "assistant"),
                                "content": content,
                            }
                        ]
                    }
                )

            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                # Best-effort per-line processing: report and move on
                # (e.g. a non-string "content" value).
                print(f"处理行时发生错误: {str(e)}")

    # Write the output only after the input has been fully read, so the
    # input file is closed before the output file is opened.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point.  Both paths are optional positional arguments whose
    # defaults are the file names that used to be hard-coded here, so running
    # the script with no arguments behaves exactly as before.
    parser = argparse.ArgumentParser(
        description="Clean the content field of a Swift-style messages JSONL file."
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        default="arxiv-metadata-oai-snapshot--swift-pretrain.jsonl",
        help="JSONL file to read",
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default="arxiv-metadata-oai-snapshot--swift-pretrain-.jsonl",
        help="JSONL file to write",
    )
    args = parser.parse_args()

    convert_to_alpaca_format(args.input_file, args.output_file)
|
||||
|
||||
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user