添加多个类别关键词，优化数据处理逻辑，支持从arXiv提取和筛选论文数据

2025-07-30 23:05:31 +08:00
parent 7d15721f61
commit 40262648c4
6 changed files with 298 additions and 81 deletions
--- a/03-data_select_ratio.py
+++ b/03-data_select_ratio.py
@@ -1,93 +1,190 @@
 import json
 import random
+categorys = [
+    'quant-ph',
+    'physics.chem-ph', 
+    'physics.atom-ph',
+    'cond-mat.soft',
+    'cs.RO',
+    'cs.CL',
+    'cs.SE',
+    'cs.IR',
+    'hep-th',
+    'hep-ph',
+    'physics.optics',
+    'cs.AI',
+    'cs.CV',
+    'nucl-th',
+    'astro-ph',
+    'math.PR',
+    'cs.OS' ,
+    'eess.SP',
+    'math.OC',
+    'math.DS',
+    'math.DG',
+    'math.MP',
+    'cs.MM',
+    'stat.ME',
+    'math.CO',
+    'cs.NE'
+]

-input_path = "arxiv-metadata-oai-snapshot--26.json"
-output_path = "arxiv-metadata-oai-snapshot--26-500.json"
-sample_size = 4000  # 你可以改成 10000 等其他数字
+
+def extract_category_mapping():
+    """定义类别到选项的映射"""
+    category_to_option = {
+        'quant-ph': 'A',
+        'physics.chem-ph': 'B', 
+        'physics.atom-ph': 'C',
+        'cond-mat.soft': 'D',
+        'cs.RO': 'E',
+        'cs.CL': 'F',
+        'cs.SE': 'G',
+        'cs.IR': 'H',
+        'hep-th': 'I',
+        'hep-ph': 'J',
+        'physics.optics': 'K',
+        'cs.AI': 'L',
+        'cs.CV': 'M',
+        'nucl-th': 'N',
+        'astro-ph': 'O',
+        'math.PR': 'P',
+        'cs.OS': 'Q',
+        'eess.SP': 'R',
+        'math.OC': 'S',
+        'math.DS': 'T',
+        'math.DG': 'U',
+        'math.MP': 'V',
+        'cs.MM': 'W',
+        'stat.ME': 'X',
+        'math.CO': 'Y',
+        'cs.NE': 'Z'
+    }
+    return category_to_option
+
+def get_category_options_text():
+    """生成选项文本"""
+    options = [
+        "A. quant-ph", "B. physics.chem-ph", "C. physics.atom-ph", "D. cond-mat.soft",
+        "E. cs.RO", "F. cs.CL", "G. cs.SE", "H. cs.IR", "I. hep-th", "J. hep-ph",
+        "K. physics.optics", "L. cs.AI", "M. cs.CV", "N. nucl-th", "O. astro-ph",
+        "P. math.PR", "Q. cs.OS", "R. eess.SP", "S. math.OC", "T. math.DS",
+        "U. math.DG", "V. math.MP", "W. cs.MM", "X. stat.ME", "Y. math.CO", "Z. cs.NE"
+    ]
+    return "\n".join(options)
+
+def process_paper(paper_data, verbose=False):
+    """处理单篇论文数据"""
+    category_mapping = extract_category_mapping()
+    
+    # 提取基本信息
+    paper_id = paper_data.get('id', '')
+    title = paper_data.get('title', '').replace('\n', ' ').strip()
+    authors = paper_data.get('authors', '')
+    abstract = paper_data.get('abstract', '').replace('\n', ' ').strip()
+    categories = paper_data.get('categories', '')
+    
+    # 检查是否包含多个类别（用空格分隔）
+    category_list = categories.split()
+    if len(category_list) > 1:
+        # 如果有多个类别，category_list中第1个满足category_to_option的类别作为目标类别
+        target_category = next((category for category in category_list if category in categorys), None)



-# 先将所有数据加载到内存中（30万条可以接受）
+    else:
+        target_category = category_list[0] if category_list else ''
+
+    # 检查类别是否在我们的目标列表中
+    
+    # if target_category not in category_mapping:
+    #     if verbose:
+    #         print(f"跳过非目标类别论文 {paper_id}: {target_category}")
+    #     return None
+    
+    # 获取对应的选项字母
+    correct_option = category_mapping[target_category]
+    
+    # 构建human问题
+    options_text = get_category_options_text()
+    human_content = f"Based on the title '{title}', authors '{authors}', and abstract '{abstract}', please determine the scientific category of this paper.\n\n{options_text}"
+    
+    # 构建JSONL条目
+    jsonl_entry = {
+        "system": "你是个优秀的论文分类师",
+        "conversation": [
+            {
+                "human": human_content,
+                "assistant": correct_option
+            }
+        ]
+    }
+    
+    if verbose:
+        print(f"处理论文 {paper_id}: {target_category} -> {correct_option}")
+    
+    return jsonl_entry
+
+
+# input_path = "arxiv-metadata-oai-snapshot-single.json"
+# output_path_1 = "arxiv-metadata-oai-snapshot-single-batch1.json"
+# output_path_2 = "arxiv-metadata-oai-snapshot-single-batch2.json"
+# batch1_size_per_category = 400
+# batch2_size_per_category = 600
+
+
+
+
+input_path = "arxiv-metadata-oai-snapshot-multi.json"
+output_path_1 = "arxiv-metadata-oai-snapshot-multi-batch1.json"
+output_path_2 = "arxiv-metadata-oai-snapshot-multi-batch2.json"
+
+batch1_size_per_category = 400
+batch2_size_per_category = 400
+
+# 先将所有数据加载到内存中
 with open(input_path, 'r') as infile:
    data = [json.loads(line) for line in infile]

 print(f"原始数据量：{len(data)} 条")

-## 按类别筛选数据，不是随机
-## 每个类别指定抽取的比例
-# category_proportions = {
-#     'astro-ph': 0.1336,
-#     'cond-mat.mes-hall': 0.0486,
-#     'cond-mat.mtrl-sci': 0.0587,
-#     'cs.CL': 0.085,
-#     'cs.CV': 0.0931,
-#     'cs.LG': 0.0992,
-#     'gr-qc': 0.1174,
-#     'hep-ph': 0.1194,
-#     'hep-th': 0.085,
-#     'quant-ph': 0.1599
-# }
-
-category_proportions = {
-        'quant-ph': 0.1,
-        'physics.chem-ph': 0.1, 
-        'physics.atom-ph': 0.1,
-        'cond-mat.soft': 0.1,
-        'cs.RO': 0.1,
-        'cs.CL': 0.1,
-        'cs.SE': 0.1,
-        'cs.IR': 0.1,
-        'hep-th': 0.1,
-        'hep-ph': 0.1,
-        'physics.optics': 0.1,
-        'cs.AI': 0.1,
-        'cs.CV': 0.1,
-        'nucl-th': 0.1,
-        'astro-ph': 0.1,
-        'math.PR': 0.1,
-        'cs.OS': 0.1,
-        'eess.SP': 0.1,
-        'math.OC': 0.1,
-        'math.DS': 0.1,
-        'math.DG': 0.1,
-        'math.MP': 0.1,
-        'cs.MM': 0.1,
-        'stat.ME': 0.1,
-        'math.CO': 0.1,
-        'cs.NE': 0.1
-    }


+# 存储两个批次的数据
+batch1_data = []
+batch2_data = []

-## print 每个类别的筛选比例和数量
-print("每个类别的筛选比例和数量:")
-for category, proportion in category_proportions.items():
-    count = sample_size * proportion
-    print(f"类别 {category}: 抽取比例 {proportion}, 数量 {count}")
-# 按每个类别的数量筛选数据
-filtered_data = []
-for category, proportion in category_proportions.items():
-    count = int(sample_size * proportion)
+# 按类别处理数据
+for category in categorys:
    # 筛选出当前类别的数据
-    category_data = [item for item in data if item.get('categories', '').strip() == category]
-    # 如果当前类别的数据量小于需要抽取的数量，则全部取出
-    if len(category_data) < count:
-        filtered_data.extend(category_data)
-    else:
-        # 随机抽样指定数量的数据
-        sampled_data = random.sample(category_data, count)
-        filtered_data.extend(sampled_data)
-    print(f"类别 {category}: 抽取数量 {count}")
+    category_data = [item for item in data if category in item.get('categories', '').strip().split()]
+    print(f"类别 {category}: 总共 {len(category_data)} 条")
    
+    # 打乱数据顺序
+    random.shuffle(category_data)
+    
+    # 确定第一批和第二批的数量
+    total_count = len(category_data)
+    batch1_count = min(batch1_size_per_category, total_count)
+    batch2_count = min(batch2_size_per_category, total_count - batch1_count)
+    
+    # 分配数据到两个批次
+    batch1_data.extend(category_data[:batch1_count])
+    batch2_data.extend(category_data[batch1_count:batch1_count + batch2_count])
+    
+    print(f"类别 {category}: 第一批 {batch1_count} 条, 第二批 {batch2_count} 条")

+# 保存第一批数据
+with open(output_path_1, 'w', encoding='utf-8') as outfile:
+    for record in batch1_data:
+        swft_js = process_paper(record, verbose=False)
+        outfile.write(json.dumps(swft_js, ensure_ascii=False) + '\n')

+# 保存第二批数据
+with open(output_path_2, 'w', encoding='utf-8') as outfile:
+    for record in batch2_data:
+        swft_js = process_paper(record, verbose=False)
+        outfile.write(json.dumps(swft_js, ensure_ascii=False) + '\n')

-
-
-
-# 保存结果
-with open(output_path, 'w') as outfile:
-    for record in filtered_data:
-        outfile.write(json.dumps(record) + '\n')
-
-print(f"已按比例抽取 {sample_size} 条数据保存到 {output_path}")
+print(f"第一批数据: {len(batch1_data)} 条，已保存到 {output_path_1}")
+print(f"第二批数据: {len(batch2_data)} 条，已保存到 {output_path_2}")