import json
import os
import argparse
import re

# Matches the "with ID 0704.0145, " clause (including trailing whitespace)
# that is stripped from every message body.
_ID_CLAUSE_RE = re.compile(r'with ID \d+\.?\d*,\s*')


def _convert_record(record, tail_chars):
    """Return a cleaned copy of one parsed JSONL record, or None if malformed.

    Only the first entry of ``record["messages"]`` is used. A warning is
    printed for every record that is rejected.

    Args:
        record: The value produced by ``json.loads`` for one input line.
        tail_chars: Number of characters dropped from the end of the content
            after the ID clause has been removed (0 keeps the full text).

    Returns:
        ``{"messages": [{"role": ..., "content": ...}]}`` or ``None``.
    """
    messages = record.get("messages") if isinstance(record, dict) else None
    if not isinstance(messages, list):
        print("警告: 数据格式不正确,缺少messages字段或格式错误")
        return None

    first = messages[0] if messages else None
    if not isinstance(first, dict) or "content" not in first:
        print("警告: messages为空或缺少content字段")
        return None

    content = first["content"]
    if not isinstance(content, str):
        print(
            "处理行时发生错误: content must be a string, "
            f"got {type(content).__name__}"
        )
        return None

    content = _ID_CLAUSE_RE.sub('', content)
    # Guard the zero case: content[:-0] would yield an empty string.
    if tail_chars:
        content = content[:-tail_chars]

    return {
        "messages": [
            {
                "role": first.get("role", "assistant"),
                "content": content,
            }
        ]
    }


def convert_to_alpaca_format(input_file, output_file, *, tail_chars=180):
    """Clean a Swift-style JSONL file and write the result as JSONL.

    Despite the (historical) name, the output keeps the Swift layout; each
    input line is expected to look like::

        {"messages": [{"role": "assistant",
                       "content": "This is a paper titled ...."}]}

    For every valid line the ``"with ID 0704.0145, "`` clause is removed from
    the first message's content, the last ``tail_chars`` characters of the
    content are dropped, and a record containing only that first message is
    written. Blank lines are skipped silently; unparsable or malformed lines
    are skipped with a warning.

    Records are streamed to ``<output_file>.tmp`` and moved into place at the
    end, so memory use stays flat for large dumps, a failed run leaves
    any existing ``output_file`` untouched, and ``input_file`` may equal
    ``output_file``.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write.
        tail_chars: Number of trailing characters to cut from each content
            string. Defaults to 180; 0 disables the truncation.

    Raises:
        ValueError: If ``tail_chars`` is negative.
        OSError: If a file cannot be read, written or replaced.
    """
    if tail_chars < 0:
        raise ValueError("tail_chars must be non-negative")

    print(f"转换数据: {input_file} -> {output_file}")

    tmp_path = f"{output_file}.tmp"
    converted = 0
    try:
        with open(input_file, "r", encoding="utf-8") as src, \
                open(tmp_path, "w", encoding="utf-8") as dst:
            for line in src:
                stripped = line.strip()
                if not stripped:
                    continue
                try:
                    record = json.loads(stripped)
                except json.JSONDecodeError:
                    print(f"警告: 无法解析JSON行: {line}")
                    continue

                cleaned = _convert_record(record, tail_chars)
                if cleaned is None:
                    continue
                dst.write(json.dumps(cleaned, ensure_ascii=False) + "\n")
                converted += 1
    except BaseException:
        # Do not leave a half-written temporary file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    os.replace(tmp_path, output_file)
    print(f"转换完成! 共转换 {converted} 条数据")


if __name__ == "__main__":
    input_file = "arxiv-metadata-oai-snapshot--swift-pretrain.jsonl"
    # Output file path
    output_file = "arxiv-metadata-oai-snapshot--swift-pretrain-.jsonl"
    convert_to_alpaca_format(input_file, output_file)