# data-prepare/04-data2swift.py

import json
import random

# Convert arXiv metadata records (one JSON object per line) into
# multiple-choice paper-classification samples for fine-tuning.

input_file = "arxiv-metadata-oai-snapshot--ratio.json"   # path to the raw data file (20,000 records)
output_file = "arxiv-metadata-oai-snapshot--swift.json"

# Maps each supported arXiv category to the letter of its answer option.
label_map = {
    "astro-ph": "A",
    "cond-mat.mes-hall": "B",
    "cond-mat.mtrl-sci": "C",
    "cs.CL": "D",
    "cs.CV": "E",
    "cs.LG": "F",
    "gr-qc": "G",
    "hep-ph": "H",
    "hep-th": "I",
    "quant-ph": "J"
}

# Option list appended to every prompt. It is derived from label_map (dicts
# keep insertion order) so the listed options can never drift away from the
# labels that are actually emitted as answers.
options_text = "\n\n" + "\n".join(
    f"{letter}. {category}" for category, letter in label_map.items()
)


def _read_records(path: str) -> list:
    """Return the parsed JSON value of every non-blank line in *path*.

    Blank lines are ignored. A malformed line still raises
    ``json.JSONDecodeError`` so corrupt input is not silently dropped.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def _clean_text(value) -> str:
    """Return *value* with newlines replaced by spaces and ends stripped.

    Anything that is not a string (e.g. a JSON ``null``) yields ``""`` so
    the caller can treat it like a missing field.
    """
    if not isinstance(value, str):
        return ""
    return value.replace("\n", " ").strip()


def _build_sample(item):
    """Turn one metadata record into a fine-tuning sample.

    Returns the sample dict, or ``None`` when the record must be skipped:
    it is not a JSON object, its last category is not in ``label_map``, or
    its title, authors or abstract is missing or empty.
    """
    if not isinstance(item, dict):
        return None

    # With several categories (whitespace separated), the last one is used.
    categories = item.get("categories")
    if not isinstance(categories, str):
        return None
    parts = categories.split()
    if not parts:
        return None
    last_category = parts[-1]
    if last_category not in label_map:
        return None

    title = _clean_text(item.get("title"))
    authors = _clean_text(item.get("authors"))
    abstract = _clean_text(item.get("abstract"))
    if not title or not authors or not abstract:
        return None

    human_text = (
        f"Based on the title '{title}', authors '{authors}', and abstract '{abstract}', "
        f"please determine the scientific category of this paper.{options_text}"
    )

    return {
        "system": "你是个优秀的论文分类师",
        "conversation": [
            {
                "human": human_text,
                "assistant": label_map[last_category]
            }
        ]
    }


def convert(input_path: str = input_file, output_path: str = output_file) -> int:
    """Convert every usable record of *input_path* into *output_path*.

    The output holds one JSON object per line. All records are processed;
    no sampling is applied at this step.

    Returns:
        The number of samples written.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if a non-blank input line is not valid JSON.
    """
    # Parse the whole input first so that a malformed line aborts the run
    # before the output file is created or truncated.
    records = _read_records(input_path)

    count = 0
    with open(output_path, 'w', encoding='utf-8') as f_out:
        for item in records:
            finetune_sample = _build_sample(item)
            if finetune_sample is None:
                continue
            f_out.write(json.dumps(finetune_sample, ensure_ascii=False) + "\n")
            count += 1
    return count


if __name__ == "__main__":
    count = convert()
    print(f"转换完成，共生成{count}条微调数据，保存到 {output_file}")
添加数据处理脚本，支持从原始数据筛选、抽样到转换为Alpaca格式 2025-06-09 14:39:07 +08:00			`import json`
			`import random`

			`input_file = "arxiv-metadata-oai-snapshot--ratio.json" # 20000条原始数据文件路径`
			`output_file = "arxiv-metadata-oai-snapshot--swift.json"`

			`# 类别对应选项映射`
			`label_map = {`
			`"astro-ph": "A",`
			`"cond-mat.mes-hall": "B",`
			`"cond-mat.mtrl-sci": "C",`
			`"cs.CL": "D",`
			`"cs.CV": "E",`
			`"cs.LG": "F",`
			`"gr-qc": "G",`
			`"hep-ph": "H",`
			`"hep-th": "I",`
			`"quant-ph": "J"`
			`}`

			`options_text = (`
			`"\n\nA. astro-ph\nB. cond-mat.mes-hall\nC. cond-mat.mtrl-sci\nD. cs.CL\n"`
			`"E. cs.CV\nF. cs.LG\nG. gr-qc\nH. hep-ph\nI. hep-th\nJ. quant-ph"`
			`)`

			`# 读取所有数据`
			`with open(input_file, 'r', encoding='utf-8') as f:`
			`data = [json.loads(line) for line in f]`

			`# 随机抽样1000条`
			`#random.seed(42)`
			`sampled = data`

			`with open(output_file, 'w', encoding='utf-8') as f_out:`
			`count = 0`
			`for item in sampled:`
			`# 多类别时取最后一个类别（通常以空格分割）`
			`categories_str = item.get("categories", "").strip()`
			`if not categories_str:`
			`continue`
			`last_category = categories_str.split()[-1]`

			`if last_category not in label_map:`
			`continue`

			`title = item.get("title", "").replace("\n", " ").strip()`
			`authors = item.get("authors", "").replace("\n", " ").strip()`
			`abstract = item.get("abstract", "").replace("\n", " ").strip()`
			`if not title or not authors or not abstract:`
			`continue`

			`human_text = (`
			`f"Based on the title '{title}', authors '{authors}', and abstract '{abstract}', "`
			`f"please determine the scientific category of this paper.{options_text}"`
			`)`

			`finetune_sample = {`
			`"system": "你是个优秀的论文分类师",`
			`"conversation": [`
			`{`
			`"human": human_text,`
			`"assistant": label_map[last_category]`
			`}`
			`]`
			`}`

			`f_out.write(json.dumps(finetune_sample, ensure_ascii=False) + "\n")`
			`count += 1`

			`print(f"转换完成，共生成{count}条微调数据，保存到 {output_file}")`