data-prepare/01-pre.py

import json

# Category identifiers to keep. A record is matched against this set by
# exact string comparison of its category.
#
# Earlier, narrower selection (kept for reference):
#     "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",
#     "cs.CL", "cs.CV", "cs.LG",
#     "gr-qc", "hep-ph", "hep-th", "quant-ph"

target_categories = {
    # Identifiers without a sub-category suffix
    'astro-ph',
    'hep-ph',
    'hep-th',
    'nucl-th',
    'quant-ph',
    # cond-mat.*
    'cond-mat.soft',
    # cs.*
    'cs.AI',
    'cs.CL',
    'cs.CV',
    'cs.IR',
    'cs.MM',
    'cs.NE',
    'cs.OS',
    'cs.RO',
    'cs.SE',
    # eess.*
    'eess.SP',
    # math.*
    'math.CO',
    'math.DG',
    'math.DS',
    'math.MP',
    'math.OC',
    'math.PR',
    # physics.*
    'physics.atom-ph',
    'physics.chem-ph',
    'physics.optics',
    # stat.*
    'stat.ME',
}


input_path = "arxiv-metadata-oai-snapshot.json"  # source file, read one JSON document per line
output_path = "arxiv-metadata-oai-snapshot-single.json"  # destination, written as JSON Lines

# Categories written out without the abstract-length and update-year checks.
no_filter_categories = frozenset({'cs.OS', 'cs.MM', 'cs.NE', 'math.MP'})

# Bounds applied to every other category.
MIN_ABSTRACT_CHARS = 300
MAX_ABSTRACT_CHARS = 1024
MIN_UPDATE_YEAR = 2016


def should_keep(record, categories, unfiltered=no_filter_categories):
    """Return True when *record* passes the selection rules.

    Rules, in order:

    1. ``record["categories"]`` must hold exactly one whitespace-separated
       category, and that category must be in *categories*.
    2. If the category is in *unfiltered*, the record is kept as-is.
    3. Otherwise ``record["abstract"]`` must be between
       ``MIN_ABSTRACT_CHARS`` and ``MAX_ABSTRACT_CHARS`` characters long
       (inclusive), and the first four characters of
       ``record["update_date"]`` must parse as a year that is at least
       ``MIN_UPDATE_YEAR``.

    Missing, ``None`` or wrongly typed fields, an unparseable date, and
    records that are not JSON objects make the record fail the check
    instead of raising.

    Args:
        record: One decoded JSON document.
        categories: Container of category identifiers to accept.
        unfiltered: Container of categories exempt from rule 3.

    Returns:
        bool: Whether the record should be written to the output.
    """
    if not isinstance(record, dict):
        return False

    raw_categories = record.get("categories") or ""
    if not isinstance(raw_categories, str):
        return False
    record_cats = raw_categories.split()
    # Keep only records that carry a single category.
    if len(record_cats) != 1:
        return False

    category = record_cats[0]
    if category not in categories:
        return False
    if category in unfiltered:
        return True

    abstract = record.get("abstract") or ""
    if not isinstance(abstract, str):
        return False
    if not MIN_ABSTRACT_CHARS <= len(abstract) <= MAX_ABSTRACT_CHARS:
        return False

    update_date = record.get("update_date") or ""
    if not isinstance(update_date, str):
        return False
    try:
        year = int(update_date[:4])
    except ValueError:
        # Empty or malformed date: treat as failing the year check.
        return False
    return year >= MIN_UPDATE_YEAR


def main():
    """Copy every record accepted by ``should_keep`` to ``output_path``.

    Reads ``input_path`` line by line, skips lines that are not valid JSON,
    re-serialises each accepted record on its own line, and prints how many
    records were written.
    """
    count = 0

    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(input_path, 'r', encoding='utf-8') as infile, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue  # best effort: ignore malformed lines

            if should_keep(record, target_categories):
                outfile.write(json.dumps(record) + '\n')
                count += 1

    print(f"筛选完成，共保存了 {count} 条记录到 {output_path}")


if __name__ == "__main__":
    main()
添加数据处理脚本，支持从原始数据筛选、抽样到转换为Alpaca格式 2025-06-09 14:39:07 +08:00			`import json`

			`# 要保留的类别关键词`
swift 2025-07-18 18:00:04 +08:00			`# target_categories = {`
			`# "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",`
			`# "cs.CL", "cs.CV", "cs.LG",`
			`# "gr-qc", "hep-ph", "hep-th", "quant-ph"`
			`# }`

添加数据处理脚本，支持从原始数据筛选、抽样到转换为Alpaca格式 2025-06-09 14:39:07 +08:00			`target_categories = {`
swift 2025-07-18 18:00:04 +08:00			`'quant-ph',`
			`'physics.chem-ph',`
			`'physics.atom-ph',`
			`'cond-mat.soft',`
			`'cs.RO',`
			`'cs.CL',`
			`'cs.SE',`
			`'cs.IR',`
			`'hep-th',`
			`'hep-ph',`
			`'physics.optics',`
			`'cs.AI',`
			`'cs.CV',`
			`'nucl-th',`
			`'astro-ph',`
			`'math.PR',`
			`'cs.OS',`
			`'eess.SP',`
			`'math.OC',`
			`'math.DS',`
			`'math.DG',`
			`'math.MP',`
			`'cs.MM',`
			`'stat.ME',`
			`'math.CO',`
			`'cs.NE'`
			`}`



添加数据处理脚本，支持从原始数据筛选、抽样到转换为Alpaca格式 2025-06-09 14:39:07 +08:00
			`input_path = "arxiv-metadata-oai-snapshot.json"#原数据路径`
添加多个类别关键词，优化数据处理逻辑，支持从arXiv提取和筛选论文数据 2025-07-30 23:05:31 +08:00			`output_path = "arxiv-metadata-oai-snapshot-single.json" # 使用 JSON Lines 格式输出路径`
添加数据处理脚本，支持从原始数据筛选、抽样到转换为Alpaca格式 2025-06-09 14:39:07 +08:00
			`count = 0`

			`with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:`
			`for line in infile:`
			`try:`
			`record = json.loads(line)`
			`record_cats = record.get("categories", "").split()`
添加多个类别关键词，优化数据处理逻辑，支持从arXiv提取和筛选论文数据 2025-07-30 23:05:31 +08:00			`# 获取更新日期和摘要`
			`update_date = record.get("update_date", "")`
			`abstract = record.get("abstract", "")`





			`# 只保留一个类别的记录`
			`if len(record_cats) > 1:`
			`continue`
添加数据处理脚本，支持从原始数据筛选、抽样到转换为Alpaca格式 2025-06-09 14:39:07 +08:00			`if record_cats:`
添加多个类别关键词，优化数据处理逻辑，支持从arXiv提取和筛选论文数据 2025-07-30 23:05:31 +08:00			`last_cat = record_cats[0]`
添加数据处理脚本，支持从原始数据筛选、抽样到转换为Alpaca格式 2025-06-09 14:39:07 +08:00			`if last_cat in target_categories:`
添加多个类别关键词，优化数据处理逻辑，支持从arXiv提取和筛选论文数据 2025-07-30 23:05:31 +08:00			`# 定义无需过滤条件的类别`
			`no_filter_categories = {'cs.OS', 'cs.MM', 'cs.NE', 'math.MP'}`

			`# 如果属于无需过滤的类别，直接写入`
			`if last_cat in no_filter_categories:`
			`outfile.write(json.dumps(record) + '\n')`
			`count += 1`
			`else:`
			`# 其他类别需要满足过滤条件`
			`if len(abstract) >= 300 and len(abstract) <= 1024:`
			`if update_date and int(update_date[:4]) >= 2016:`
			`outfile.write(json.dumps(record) + '\n')`
			`count += 1`

添加数据处理脚本，支持从原始数据筛选、抽样到转换为Alpaca格式 2025-06-09 14:39:07 +08:00			`except json.JSONDecodeError:`
			`continue # 忽略格式错误的行`

			`print(f"筛选完成，共保存了 {count} 条记录到 {output_path}")`