data-prepare/01-pre.py

import json

# Category tags to keep; a record is compared against these by exact match.
target_categories = {
    # astrophysics / condensed matter
    "astro-ph",
    "cond-mat.mes-hall",
    "cond-mat.mtrl-sci",
    # computer science
    "cs.CL",
    "cs.CV",
    "cs.LG",
    # physics
    "gr-qc",
    "hep-ph",
    "hep-th",
    "quant-ph",
}

# Path of the original (unfiltered) metadata snapshot.
input_path = "arxiv-metadata-oai-snapshot.json"
# Path of the filtered output, written in JSON Lines format.
output_path = "arxiv-metadata-oai-snapshot--.json"

count = 0


def filter_by_last_category(lines, outfile, categories):
    """Copy the JSON records whose last listed category is in *categories*.

    Each element of *lines* is parsed as one JSON document. A record is
    written to *outfile* (re-serialized with ``json.dumps`` plus a trailing
    newline) when its ``"categories"`` value is a whitespace-separated
    string whose final token is a member of *categories*.

    Lines that are not valid JSON, that do not decode to an object, or whose
    ``"categories"`` value is missing, empty, or not a string are skipped
    instead of aborting the whole run.

    Args:
        lines: Iterable of text lines, one JSON document per line.
        outfile: Object with a ``write(str)`` method receiving kept records.
        categories: Container of category tags to keep (exact match).

    Returns:
        The number of records written.
    """
    kept = 0
    for line in lines:
        # Keep the try body minimal: only the parse can raise JSONDecodeError.
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # ignore malformed lines (this includes blank lines)

        # Valid JSON that is not an object (e.g. a list or number) has no
        # "categories" field to inspect.
        if not isinstance(record, dict):
            continue

        raw_categories = record.get("categories", "")
        # A JSON null or other non-string value cannot be split into tags.
        if not isinstance(raw_categories, str):
            continue

        tags = raw_categories.split()
        # NOTE(review): only the last listed tag is tested, not the first --
        # confirm this is the intended selection rule.
        if tags and tags[-1] in categories:
            outfile.write(json.dumps(record) + '\n')
            kept += 1
    return kept


if __name__ == "__main__":
    # Explicit UTF-8 so the result does not depend on the platform's default
    # locale encoding (the input contains non-ASCII text).
    with open(input_path, 'r', encoding='utf-8') as infile, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        count = filter_by_last_category(infile, outfile, target_categories)

    print(f"筛选完成，共保存了 {count} 条记录到 {output_path}")
添加数据处理脚本，支持从原始数据筛选、抽样到转换为Alpaca格式 2025-06-09 14:39:07 +08:00			`import json`

			`# 要保留的类别关键词`
			`target_categories = {`
			`"astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",`
			`"cs.CL", "cs.CV", "cs.LG",`
			`"gr-qc", "hep-ph", "hep-th", "quant-ph"`
			`}`

			`input_path = "arxiv-metadata-oai-snapshot.json"#原数据路径`
			`output_path = "arxiv-metadata-oai-snapshot--.json" # 使用 JSON Lines 格式输出路径`

			`count = 0`

			`with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:`
			`for line in infile:`
			`try:`
			`record = json.loads(line)`
			`record_cats = record.get("categories", "").split()`
			`if record_cats:`
			`last_cat = record_cats[-1]`
			`if last_cat in target_categories:`
			`outfile.write(json.dumps(record) + '\n')`
			`count += 1`
			`except json.JSONDecodeError:`
			`continue # 忽略格式错误的行`

			`print(f"筛选完成，共保存了 {count} 条记录到 {output_path}")`