data-prepare/05-data-swfit-xtuner.py


import argparse
import json
import os


def convert_to_alpaca_format(input_file, output_file):
    """
    Convert Swift-format data to Alpaca format.

    Input format:
    {
        "system": "你是个优秀的论文分类师",
        "conversation": [
            {
                "human": "Based on the title...",
                "assistant": "D"
            }
        ]
    }

    Output format (Alpaca):
    {
        "instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
        "input": "Based on the title...",
        "output": "D"
    }
    """
    print(f"Converting data: {input_file} -> {output_file}")
    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line.strip())
                # Check that the record has the required structure
                if "system" not in data or "conversation" not in data:
                    print(f"Warning: record is missing required fields: {data}")
                    continue
                # Take the instruction from the system prompt
                instruction = data.get("system", "")
                if not instruction:
                    # Fallback instruction: "Determine the scientific category of
                    # the paper based on its title, authors, and abstract."
                    instruction = "根据论文的标题、作者和摘要,确定该论文的科学类别。"
                # Flatten each conversation turn into one Alpaca record
                for turn in data["conversation"]:
                    if "human" in turn and "assistant" in turn:
                        new_data = {
                            "instruction": instruction,
                            "input": turn["human"],
                            "output": turn["assistant"],
                        }
                        converted_data.append(new_data)
            except json.JSONDecodeError:
                print(f"Warning: could not parse JSON line: {line}")
            except Exception as e:
                print(f"Error while processing line: {str(e)}")
    # Write the converted records as JSON Lines
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"Conversion finished! Converted {len(converted_data)} records")


if __name__ == "__main__":
    # parser = argparse.ArgumentParser(description="Convert data to Alpaca format")
    # parser.add_argument(
    #     "--input",
    #     type=str,
    #     required=True,
    #     help="Input file path (swift_formatted_sft_train_data.jsonl)",
    # )
    # parser.add_argument("--output", type=str, required=True, help="Output file path")
    # args = parser.parse_args()
    # input_file = "arxiv-metadata-oai-snapshot--random.json"  # path to the 20000-record raw data file
    input_file = "arxiv-metadata-oai-snapshot--swift.json"
    output_file = "arxiv-metadata-oai-snapshot--xtuner.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)
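
    # Optional spot-check, a minimal sketch (assumes the conversion above has
    # already written at least one record to output_file); uncomment to preview
    # the first converted Alpaca record:
    #
    # with open(output_file, "r", encoding="utf-8") as f:
    #     first_record = json.loads(f.readline())
    #     print(json.dumps(first_record, ensure_ascii=False, indent=2))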