# data-prepare/01-pre-multi.py

import json

# Category keywords to keep.
# A smaller alternative selection is left here, commented out, for reference:
# target_categories = {
#     "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",
#     "cs.CL", "cs.CV", "cs.LG",
#     "gr-qc", "hep-ph", "hep-th", "quant-ph"
# }

# Active selection. Membership is tested by exact string match against each
# whitespace-separated token of a record's "categories" field.
target_categories = {
    # cs.*
    "cs.AI", "cs.CL", "cs.CV", "cs.IR", "cs.MM",
    "cs.NE", "cs.OS", "cs.RO", "cs.SE",
    # math.*
    "math.CO", "math.DG", "math.DS", "math.MP", "math.OC", "math.PR",
    # physics.*
    "physics.atom-ph", "physics.chem-ph", "physics.optics",
    # everything else
    "astro-ph", "cond-mat.soft", "eess.SP", "hep-ph", "hep-th",
    "nucl-th", "quant-ph", "stat.ME",
}


input_path = "arxiv-metadata-oai-snapshot.json"  # source data, parsed one JSON object per line
output_path = "arxiv-metadata-oai-snapshot-multi.json"  # destination, written in JSON Lines format

# Records carrying any of these categories are written without applying the
# abstract-length and update-year filters.
no_filter_categories = {'cs.OS'}

# Limits applied to every record that is not exempted above.
MIN_ABSTRACT_LEN = 300
MAX_ABSTRACT_LEN = 1024
MIN_UPDATE_YEAR = 2016


def _text_field(record, key):
    """Return ``record[key]`` when it is a string, otherwise ``""``.

    Treating a missing, ``null`` or non-string value as empty lets the
    filters reject the record instead of raising on ``.split()`` / ``len()``.
    """
    value = record.get(key)
    return value if isinstance(value, str) else ""


def _update_year(update_date):
    """Return the integer parsed from the first four characters, or ``None``.

    ``None`` is returned for an empty or malformed date so that one bad
    record cannot abort the whole run.
    """
    try:
        return int(update_date[:4])
    except ValueError:
        return None


def should_keep(record, targets, unfiltered=None):
    """Decide whether a parsed metadata record belongs in the output.

    A record is kept when all of the following hold:

    * its ``categories`` field lists more than one category;
    * exactly one of those categories is in ``targets``;
    * either one of its categories is in ``unfiltered``, or its abstract is
      between ``MIN_ABSTRACT_LEN`` and ``MAX_ABSTRACT_LEN`` characters long
      (inclusive) and its ``update_date`` year is at least
      ``MIN_UPDATE_YEAR``.

    Args:
        record: The value decoded from one input line. Anything that is not
            a ``dict`` is rejected.
        targets: Container of category strings to match against.
        unfiltered: Categories exempt from the length/year filters; defaults
            to the module-level ``no_filter_categories``.

    Returns:
        ``True`` if the record should be written, ``False`` otherwise.
    """
    if not isinstance(record, dict):
        return False
    if unfiltered is None:
        unfiltered = no_filter_categories

    cats = _text_field(record, "categories").split()

    # Only multi-category records are of interest.
    if len(cats) <= 1:
        return False

    # Exactly one of the record's categories may be a target category.
    if sum(1 for cat in cats if cat in targets) != 1:
        return False

    # Exempt categories bypass the remaining filters.
    if any(cat in unfiltered for cat in cats):
        return True

    abstract = _text_field(record, "abstract")
    if not MIN_ABSTRACT_LEN <= len(abstract) <= MAX_ABSTRACT_LEN:
        return False

    year = _update_year(_text_field(record, "update_date"))
    return year is not None and year >= MIN_UPDATE_YEAR


def filter_records(lines, outfile, targets):
    """Copy the records that pass ``should_keep`` from ``lines`` to ``outfile``.

    Args:
        lines: Iterable of strings, each expected to hold one JSON document.
        outfile: Object with a ``write(str)`` method; each kept record is
            re-serialised with ``json.dumps`` and followed by a newline.
        targets: Container of category strings passed on to ``should_keep``.

    Returns:
        The number of records written.
    """
    kept = 0
    for line in lines:
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # ignore malformed lines

        if should_keep(record, targets):
            outfile.write(json.dumps(record) + '\n')
            kept += 1
    return kept


if __name__ == "__main__":
    # The encoding is stated explicitly so decoding does not depend on the
    # platform's locale default.
    with open(input_path, 'r', encoding='utf-8') as infile, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        count = filter_records(infile, outfile, target_categories)

    print(f"筛选完成，共保存了 {count} 条记录到 {output_path}")