添加多个类别关键词，优化数据处理逻辑，支持从arXiv提取和筛选论文数据

2025-07-30 23:05:31 +08:00
parent 7d15721f61
commit 40262648c4
6 changed files with 298 additions and 81 deletions
--- a/01-pre.py
+++ b/01-pre.py
@@ -40,7 +40,7 @@ target_categories = {


 input_path = "arxiv-metadata-oai-snapshot.json"#原数据路径
-output_path = "arxiv-metadata-oai-snapshot--26.json"  # 使用 JSON Lines 格式输出路径
+output_path = "arxiv-metadata-oai-snapshot-single.json"  # 使用 JSON Lines 格式输出路径

 count = 0

@@ -49,11 +49,34 @@ with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
        try:
            record = json.loads(line)
            record_cats = record.get("categories", "").split()
+            # 获取更新日期和摘要
+            update_date = record.get("update_date", "")
+            abstract = record.get("abstract", "")
+
+
+
+
+
+            # 只保留一个类别的记录
+            if len(record_cats) > 1:
+                continue
            if record_cats:
-                last_cat = record_cats[-1]
+                last_cat = record_cats[0]
                if last_cat in target_categories:
-                    outfile.write(json.dumps(record) + '\n')
-                    count += 1
+                    # 定义无需过滤条件的类别
+                    no_filter_categories = {'cs.OS', 'cs.MM', 'cs.NE', 'math.MP'}
+                    
+                    # 如果属于无需过滤的类别，直接写入
+                    if last_cat in no_filter_categories:
+                        outfile.write(json.dumps(record) + '\n')
+                        count += 1
+                    else:
+                        # 其他类别需要满足过滤条件
+                        if len(abstract) >= 300 and len(abstract) <= 1024:
+                            if update_date and int(update_date[:4]) >= 2016:
+                                outfile.write(json.dumps(record) + '\n')
+                                count += 1
+
        except json.JSONDecodeError:
            continue  # 忽略格式错误的行