添加多个类别关键词,优化数据处理逻辑,支持从arXiv提取和筛选论文数据
This commit is contained in:
31
01-pre.py
31
01-pre.py
@@ -40,7 +40,7 @@ target_categories = {
|
||||
|
||||
|
||||
input_path = "arxiv-metadata-oai-snapshot.json"#原数据路径
|
||||
output_path = "arxiv-metadata-oai-snapshot--26.json" # 使用 JSON Lines 格式输出路径
|
||||
output_path = "arxiv-metadata-oai-snapshot-single.json" # 使用 JSON Lines 格式输出路径
|
||||
|
||||
count = 0
|
||||
|
||||
@@ -49,11 +49,34 @@ with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
|
||||
try:
|
||||
record = json.loads(line)
|
||||
record_cats = record.get("categories", "").split()
|
||||
# 获取更新日期和摘要
|
||||
update_date = record.get("update_date", "")
|
||||
abstract = record.get("abstract", "")
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# 只保留一个类别的记录
|
||||
if len(record_cats) > 1:
|
||||
continue
|
||||
if record_cats:
|
||||
last_cat = record_cats[-1]
|
||||
last_cat = record_cats[0]
|
||||
if last_cat in target_categories:
|
||||
outfile.write(json.dumps(record) + '\n')
|
||||
count += 1
|
||||
# 定义无需过滤条件的类别
|
||||
no_filter_categories = {'cs.OS', 'cs.MM', 'cs.NE', 'math.MP'}
|
||||
|
||||
# 如果属于无需过滤的类别,直接写入
|
||||
if last_cat in no_filter_categories:
|
||||
outfile.write(json.dumps(record) + '\n')
|
||||
count += 1
|
||||
else:
|
||||
# 其他类别需要满足过滤条件
|
||||
if len(abstract) >= 300 and len(abstract) <= 1024:
|
||||
if update_date and int(update_date[:4]) >= 2016:
|
||||
outfile.write(json.dumps(record) + '\n')
|
||||
count += 1
|
||||
|
||||
except json.JSONDecodeError:
|
||||
continue # 忽略格式错误的行
|
||||
|
||||
|
||||
Reference in New Issue
Block a user