import json
import random

# Target arXiv categories, in option order: index 0 -> 'A', ..., index 25 -> 'Z'.
# This single list is the source of truth for the letter mapping, the
# multiple-choice option text, and the per-category batch filtering.
categorys = [
    'quant-ph', 'physics.chem-ph', 'physics.atom-ph', 'cond-mat.soft',
    'cs.RO', 'cs.CL', 'cs.SE', 'cs.IR', 'hep-th', 'hep-ph',
    'physics.optics', 'cs.AI', 'cs.CV', 'nucl-th', 'astro-ph',
    'math.PR', 'cs.OS', 'eess.SP', 'math.OC', 'math.DS',
    'math.DG', 'math.MP', 'cs.MM', 'stat.ME', 'math.CO', 'cs.NE',
]


def extract_category_mapping():
    """Return the mapping from category name to option letter ('A'..'Z').

    Derived from ``categorys`` instead of a second hand-written table so the
    mapping and the option text can never drift out of sync.
    """
    return {cat: chr(ord('A') + i) for i, cat in enumerate(categorys)}


def get_category_options_text():
    """Return the multiple-choice option block, one "X. category" per line."""
    return "\n".join(
        f"{chr(ord('A') + i)}. {cat}" for i, cat in enumerate(categorys)
    )


def process_paper(paper_data, verbose=False):
    """Convert one raw arXiv metadata record into a single-turn SFT entry.

    Parameters
    ----------
    paper_data : dict
        Raw record with (at least) 'id', 'title', 'authors', 'abstract' and
        'categories' keys; 'categories' is a space-separated string.
    verbose : bool
        When True, print a progress line per paper.

    Returns
    -------
    dict or None
        The JSONL entry, or ``None`` when the record carries no category from
        the target list.  (The original raised ``KeyError`` in that case:
        the multi-category branch could select ``None`` and the
        single-category branch never validated membership.)
    """
    category_mapping = extract_category_mapping()

    paper_id = paper_data.get('id', '')
    # Titles/abstracts may contain hard line breaks; flatten to one line.
    title = paper_data.get('title', '').replace('\n', ' ').strip()
    authors = paper_data.get('authors', '')
    abstract = paper_data.get('abstract', '').replace('\n', ' ').strip()
    categories = paper_data.get('categories', '')

    # A record may list several space-separated categories; pick the first
    # one that belongs to our target set (same rule for single-category
    # records, which additionally gains the previously missing validation).
    category_list = categories.split()
    target_category = next(
        (cat for cat in category_list if cat in category_mapping), None
    )
    if target_category is None:
        if verbose:
            print(f"跳过非目标类别论文 {paper_id}: {categories}")
        return None

    correct_option = category_mapping[target_category]

    options_text = get_category_options_text()
    human_content = (
        f"Based on the title '{title}', authors '{authors}', and abstract "
        f"'{abstract}', please determine the scientific category of this "
        f"paper.\n\n{options_text}"
    )

    jsonl_entry = {
        "system": "你是个优秀的论文分类师",
        "conversation": [
            {
                "human": human_content,
                "assistant": correct_option,
            }
        ],
    }

    if verbose:
        print(f"处理论文 {paper_id}: {target_category} -> {correct_option}")
    return jsonl_entry


def _write_jsonl(path, records):
    """Serialize *records* through process_paper to *path*, one JSON per line.

    Papers without a target category (process_paper -> None) are skipped
    instead of being written out as ``null`` lines.
    """
    with open(path, 'w', encoding='utf-8') as outfile:
        for record in records:
            entry = process_paper(record, verbose=False)
            if entry is None:
                continue
            outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')


def main():
    """Split the raw metadata dump into two per-category batches and save them."""
    input_path = "arxiv-metadata-oai-snapshot-multi.json"
    output_path_1 = "arxiv-metadata-oai-snapshot-multi-batch1.json"
    output_path_2 = "arxiv-metadata-oai-snapshot-multi-batch2.json"
    batch1_size_per_category = 400
    batch2_size_per_category = 400

    # Load everything up front; the per-category filter below scans the full
    # dataset once per category.
    with open(input_path, 'r', encoding='utf-8') as infile:
        data = [json.loads(line) for line in infile]
    print(f"原始数据量:{len(data)} 条")

    batch1_data = []
    batch2_data = []

    for category in categorys:
        # Papers tagged with this category (a multi-category paper can be
        # picked up by every one of its target categories).
        category_data = [
            item for item in data
            if category in item.get('categories', '').strip().split()
        ]
        print(f"类别 {category}: 总共 {len(category_data)} 条")

        random.shuffle(category_data)

        # Batch 1 takes up to its quota; batch 2 takes up to its quota from
        # whatever remains.
        total_count = len(category_data)
        batch1_count = min(batch1_size_per_category, total_count)
        batch2_count = min(batch2_size_per_category, total_count - batch1_count)

        batch1_data.extend(category_data[:batch1_count])
        batch2_data.extend(category_data[batch1_count:batch1_count + batch2_count])
        print(f"类别 {category}: 第一批 {batch1_count} 条, 第二批 {batch2_count} 条")

    _write_jsonl(output_path_1, batch1_data)
    _write_jsonl(output_path_2, batch2_data)

    print(f"第一批数据: {len(batch1_data)} 条,已保存到 {output_path_1}")
    print(f"第二批数据: {len(batch2_data)} 条,已保存到 {output_path_2}")


if __name__ == "__main__":
    main()