# data-prepare/05-data-swfit-pretrain-revise.py

import json
import os
import argparse
import re


def convert_to_alpaca_format(input_file, output_file, tail_chars=180):
    """Clean Swift-format pretraining records and write them back as JSONL.

    Despite its name, this function does not produce Alpaca-style records:
    the output keeps the same ``{"messages": [{"role", "content"}]}`` shape
    as the input.

    Each non-blank input line is expected to be a JSON object such as::

        {"messages": [{"role": "assistant",
                       "content": "This is a paper titled ...."}]}

    For every valid record, the first message's content is cleaned by
    removing each ``"with ID <digits>[.<digits>],"`` fragment (for example
    ``"with ID 0704.0145, "``) and then dropping the last ``tail_chars``
    characters. Only the first message is kept; its role defaults to
    ``"assistant"`` when missing.

    Blank lines are skipped silently. Lines that are not valid JSON, lack a
    usable ``messages`` list, or fail during processing are reported on
    stdout and skipped.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).
        tail_chars: Number of trailing characters cut from each content
            string. Defaults to 180, the value that used to be hard-coded.
            Content no longer than ``tail_chars`` becomes an empty string;
            a value of 0 or less disables the cut.
            NOTE(review): the reason for cutting 180 characters is not
            visible in this file - presumably a fixed-size trailing
            boilerplate in the dataset; confirm against the data.

    Returns:
        None. Progress and warnings are printed to stdout.
    """
    print(f"转换数据: {input_file} -> {output_file}")

    # Compiled once here instead of being looked up again for every line.
    id_fragment = re.compile(r'with ID \d+\.?\d*,\s*')

    # All records are buffered before the output file is opened, so the
    # input is fully read before anything is written.
    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) are not errors.
                continue

            try:
                data = json.loads(stripped)

                # Validate the record structure.
                if "messages" not in data or not isinstance(data["messages"], list):
                    print("警告: 数据格式不正确，缺少messages字段或格式错误")
                    continue

                if not data["messages"] or "content" not in data["messages"][0]:
                    print("警告: messages为空或缺少content字段")
                    continue

                first_message = data["messages"][0]

                # Remove the "with ID xxxx.xxxx," fragment(s).
                content = id_fragment.sub('', first_message["content"])
                # Cut the tail. Computing the end index explicitly (rather
                # than slicing with -tail_chars) keeps tail_chars=0 from
                # emptying the string, while still yielding "" when the
                # content is shorter than the cut.
                content = content[:max(len(content) - tail_chars, 0)]

                converted_data.append({
                    "messages": [
                        {
                            "role": first_message.get("role", "assistant"),
                            "content": content,
                        }
                    ]
                })

            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                # Best-effort conversion: report the bad record and move on.
                print(f"处理行时发生错误: {str(e)}")

    # Write the output file, one JSON object per line.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"转换完成! 共转换 {len(converted_data)} 条数据")


if __name__ == "__main__":
    # Both paths are optional positional arguments. Their defaults are the
    # paths that used to be hard-coded here, so running the script with no
    # arguments behaves exactly as before.
    parser = argparse.ArgumentParser(
        description="Strip arXiv ID fragments from Swift-format pretraining JSONL."
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        default="arxiv-metadata-oai-snapshot--swift-pretrain.jsonl",
        help="input JSONL file path",
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default="arxiv-metadata-oai-snapshot--swift-pretrain-.jsonl",
        help="output JSONL file path",
    )
    args = parser.parse_args()

    convert_to_alpaca_format(args.input_file, args.output_file)