"""Filter the arXiv metadata snapshot down to single-category records.

Reads a JSON Lines file (one JSON object per line), keeps only records that
list exactly one category and whose category is in ``target_categories``,
and writes the survivors to a new JSON Lines file.

Categories in ``no_filter_categories`` are kept unconditionally; every other
target category must additionally have an abstract whose length is within
``[MIN_ABSTRACT_LEN, MAX_ABSTRACT_LEN]`` and an ``update_date`` whose year is
at least ``MIN_UPDATE_YEAR``.
"""
import json

# Categories to keep.
target_categories = {
    'quant-ph', 'physics.chem-ph', 'physics.atom-ph', 'cond-mat.soft',
    'cs.RO', 'cs.CL', 'cs.SE', 'cs.IR', 'hep-th', 'hep-ph',
    'physics.optics', 'cs.AI', 'cs.CV', 'nucl-th', 'astro-ph',
    'math.PR', 'cs.OS', 'eess.SP', 'math.OC', 'math.DS', 'math.DG',
    'math.MP', 'cs.MM', 'stat.ME', 'math.CO', 'cs.NE',
}

# Categories written out without the abstract-length / update-year checks.
# Defined once here instead of being rebuilt for every input line.
no_filter_categories = {'cs.OS', 'cs.MM', 'cs.NE', 'math.MP'}

input_path = "arxiv-metadata-oai-snapshot.json"  # path of the raw snapshot
output_path = "arxiv-metadata-oai-snapshot-single.json"  # JSON Lines output path

# Filter thresholds applied to categories outside ``no_filter_categories``.
MIN_ABSTRACT_LEN = 300
MAX_ABSTRACT_LEN = 1024
MIN_UPDATE_YEAR = 2016


def _update_year(update_date):
    """Return the year encoded in the first four characters of *update_date*.

    Returns ``None`` when the value is missing, not a string, or does not
    start with an integer, so a malformed date rejects the record instead of
    aborting the whole run.
    """
    if not isinstance(update_date, str) or not update_date:
        return None
    try:
        return int(update_date[:4])
    except ValueError:
        return None


def should_keep(record):
    """Return True if the parsed JSON *record* passes the filter.

    A record is kept when it is a JSON object that lists exactly one
    category, that category is in ``target_categories``, and either the
    category is in ``no_filter_categories`` or the abstract length and
    update year satisfy the module-level thresholds.
    """
    if not isinstance(record, dict):
        # A syntactically valid JSON line that is not an object (e.g. ``[]``).
        return False

    # ``or ""`` also covers an explicit JSON null, which ``.get`` would
    # otherwise hand back as None.
    raw_categories = record.get("categories") or ""
    if not isinstance(raw_categories, str):
        return False

    categories = raw_categories.split()
    # Keep only records with exactly one category.
    if len(categories) != 1:
        return False

    category = categories[0]
    if category not in target_categories:
        return False
    if category in no_filter_categories:
        return True

    abstract = record.get("abstract") or ""
    if not MIN_ABSTRACT_LEN <= len(abstract) <= MAX_ABSTRACT_LEN:
        return False

    year = _update_year(record.get("update_date"))
    return year is not None and year >= MIN_UPDATE_YEAR


def iter_kept_records(lines):
    """Yield the parsed records from *lines* that pass ``should_keep``.

    *lines* is any iterable of strings, each expected to hold one JSON
    document. Lines that are not valid JSON are skipped.
    """
    for line in lines:
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # ignore malformed lines
        if should_keep(record):
            yield record


def main():
    """Filter ``input_path`` into ``output_path`` and report the kept count."""
    count = 0
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(input_path, 'r', encoding='utf-8') as infile, \
            open(output_path, 'w', encoding='utf-8') as outfile:
        for record in iter_kept_records(infile):
            outfile.write(json.dumps(record) + '\n')
            count += 1
    print(f"筛选完成,共保存了 {count} 条记录到 {output_path}")


if __name__ == "__main__":
    main()