This commit is contained in:
2025-07-18 18:00:04 +08:00
parent 24abc7aab3
commit 563f16f0c5
15 changed files with 25541 additions and 41 deletions

View File

@@ -1,14 +1,46 @@
import json
# Category keywords to keep when filtering the metadata snapshot.
#
# Earlier, narrower selection (superseded by the set below; kept for reference):
#   "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",
#   "cs.CL", "cs.CV", "cs.LG",
#   "gr-qc", "hep-ph", "hep-th", "quant-ph"
#
# NOTE(review): how these keywords are matched against a record (exact match
# vs. prefix/substring) is decided by code outside this section -- confirm
# there before adding archive-level entries such as "astro-ph".
target_categories = {
    # physics archives
    "astro-ph",
    "cond-mat.soft",
    "hep-ph",
    "hep-th",
    "nucl-th",
    "physics.atom-ph",
    "physics.chem-ph",
    "physics.optics",
    "quant-ph",
    # computer science
    "cs.AI",
    "cs.CL",
    "cs.CV",
    "cs.IR",
    "cs.MM",
    "cs.NE",
    "cs.OS",
    "cs.RO",
    "cs.SE",
    # electrical engineering and systems science
    "eess.SP",
    # mathematics
    "math.CO",
    "math.DG",
    "math.DS",
    "math.MP",
    "math.OC",
    "math.PR",
    # statistics
    "stat.ME",
}

# Path to the original (unfiltered) data.
input_path = "arxiv-metadata-oai-snapshot.json"

# Output path; the result is written in JSON Lines format. The "--26" suffix
# matches the 26 categories selected above.
output_path = "arxiv-metadata-oai-snapshot--26.json"

# Counter initialised to zero; presumably tracks how many records are kept --
# confirm against the processing loop that follows.
count = 0