2025-06-09 14:39:07 +08:00
|
|
|
import json
|
|
|
|
|
|
|
|
# arXiv category identifiers to keep.
#
# A record is compared against this set by exact string equality on its
# category field, so an entry only matches records whose category is spelled
# exactly like this.
# NOTE(review): 'astro-ph' will therefore not match sub-categories such as
# 'astro-ph.GA' -- confirm that this is intended.
#
# An earlier selection, kept here for reference only:
#   "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",
#   "cs.CL", "cs.CV", "cs.LG",
#   "gr-qc", "hep-ph", "hep-th", "quant-ph"
target_categories = {
    # physics
    'quant-ph',
    'physics.chem-ph',
    'physics.atom-ph',
    'physics.optics',
    'cond-mat.soft',
    'hep-th',
    'hep-ph',
    'nucl-th',
    'astro-ph',
    # computer science
    'cs.RO',
    'cs.CL',
    'cs.SE',
    'cs.IR',
    'cs.AI',
    'cs.CV',
    'cs.OS',
    'cs.MM',
    'cs.NE',
    # mathematics
    'math.PR',
    'math.OC',
    'math.DS',
    'math.DG',
    'math.MP',
    'math.CO',
    # electrical engineering / statistics
    'eess.SP',
    'stat.ME',
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2025-06-09 14:39:07 +08:00
|
|
|
|
|
|
|
input_path = "arxiv-metadata-oai-snapshot.json"  # source data; read one JSON document per line
output_path = "arxiv-metadata-oai-snapshot-single.json"  # output path, written in JSON Lines format

# Categories that are written out without the abstract-length / update-year
# filters.  Built once here instead of on every matching record.
no_filter_categories = {'cs.OS', 'cs.MM', 'cs.NE', 'math.MP'}

# Filter thresholds applied to every other target category.
_MIN_ABSTRACT_CHARS = 300
_MAX_ABSTRACT_CHARS = 1024
_MIN_UPDATE_YEAR = 2016


def _keep_record(record):
    """Return True when *record* should be copied to the output file.

    A record is kept when all of the following hold:

    * it is a JSON object whose "categories" string contains exactly one
      whitespace-separated entry, and that entry is in ``target_categories``;
    * the entry is in ``no_filter_categories``, OR the "abstract" length is
      within [300, 1024] characters and the first four characters of
      "update_date" parse as an integer >= 2016.

    Records with missing, null or wrongly typed fields are rejected instead of
    raising, so a single odd line cannot abort the whole run.
    """
    if not isinstance(record, dict):
        return False

    categories = record.get("categories", "")
    if not isinstance(categories, str):
        return False
    record_cats = categories.split()
    # Keep only records that carry exactly one category.
    if len(record_cats) != 1:
        return False

    category = record_cats[0]
    if category not in target_categories:
        return False
    if category in no_filter_categories:
        return True

    abstract = record.get("abstract", "")
    if not isinstance(abstract, str):
        return False
    if not _MIN_ABSTRACT_CHARS <= len(abstract) <= _MAX_ABSTRACT_CHARS:
        return False

    update_date = record.get("update_date", "")
    if not isinstance(update_date, str) or not update_date:
        return False
    try:
        year = int(update_date[:4])
    except ValueError:
        # Date does not start with a parseable year: treat as not matching.
        return False
    return year >= _MIN_UPDATE_YEAR


count = 0

# Explicit UTF-8 so decoding does not depend on the platform default encoding.
with open(input_path, 'r', encoding='utf-8') as infile, \
        open(output_path, 'w', encoding='utf-8') as outfile:
    for line in infile:
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue  # ignore malformed lines

        if _keep_record(record):
            outfile.write(json.dumps(record) + '\n')
            count += 1

print(f"筛选完成,共保存了 {count} 条记录到 {output_path}")
|
|
|
|
|