"""Convert arXiv metadata (JSON Lines) into chat-style fine-tuning samples.

Each input line is one JSON object describing a paper. A record is converted
when the last entry of its ``categories`` field is one of the categories in
``label_map`` and its title, authors and abstract are all non-empty. Each
converted record becomes one JSON line in the output file, holding a system
prompt and a single human/assistant exchange whose answer is the category's
option letter.
"""
import json
import random

# Path to the raw input data (the original note says 20,000 records).
input_file = "arxiv-metadata-oai-snapshot--26-500.json"
output_file = "arxiv-metadata-oai-snapshot--swift-26-500.json"

# Maps each supported arXiv category to the option letter used as the answer.
label_map = {
    'quant-ph': 'A',
    'physics.chem-ph': 'B',
    'physics.atom-ph': 'C',
    'cond-mat.soft': 'D',
    'cs.RO': 'E',
    'cs.CL': 'F',
    'cs.SE': 'G',
    'cs.IR': 'H',
    'hep-th': 'I',
    'hep-ph': 'J',
    'physics.optics': 'K',
    'cs.AI': 'L',
    'cs.CV': 'M',
    'nucl-th': 'N',
    'astro-ph': 'O',
    'math.PR': 'P',
    'cs.OS': 'Q',
    'eess.SP': 'R',
    'math.OC': 'S',
    'math.DS': 'T',
    'math.DG': 'U',
    'math.MP': 'V',
    'cs.MM': 'W',
    'stat.ME': 'X',
    'math.CO': 'Y',
    'cs.NE': 'Z',
}

# Answer choices shown in the prompt ("A. quant-ph", "B. physics.chem-ph", ...).
# Derived from label_map so the two can never drift apart.
options = [f"{letter}. {category}" for category, letter in label_map.items()]
options_text = "\n".join(options)


def _clean(value) -> str:
    """Return *value* as stripped single-line text, or "" if it is not a string."""
    if not isinstance(value, str):
        # JSON null (or any other non-string) is treated like a missing field.
        return ""
    return value.replace("\n", " ").strip()


def build_sample(item):
    """Build one fine-tuning sample from a parsed metadata record.

    Args:
        item: The decoded JSON value of one input line; expected to be a dict
            with ``categories``, ``title``, ``authors`` and ``abstract`` keys.

    Returns:
        A dict with ``system`` and ``conversation`` keys, or ``None`` when the
        record cannot be used (not a dict, no categories, last category not in
        ``label_map``, or an empty/non-string title, authors or abstract).
    """
    if not isinstance(item, dict):
        return None

    categories = item.get("categories")
    if not isinstance(categories, str):
        return None
    category_tokens = categories.split()
    if not category_tokens:
        return None

    # With several whitespace-separated categories, the last one is the label.
    last_category = category_tokens[-1]
    if last_category not in label_map:
        return None

    title = _clean(item.get("title"))
    authors = _clean(item.get("authors"))
    abstract = _clean(item.get("abstract"))
    if not title or not authors or not abstract:
        return None

    # NOTE(review): options_text follows the final period with no separator
    # ("...this paper.A. quant-ph"). Kept as-is because changing it alters the
    # generated dataset -- confirm whether a newline was intended.
    human_text = (
        f"Based on the title '{title}', authors '{authors}', and abstract '{abstract}', "
        f"please determine the scientific category of this paper.{options_text}"
    )
    return {
        "system": "你是个优秀的论文分类师",
        "conversation": [
            {
                "human": human_text,
                "assistant": label_map[last_category],
            }
        ],
    }


def convert(input_path, output_path) -> int:
    """Convert every usable record in *input_path* and write it to *output_path*.

    The input is read line by line, so it is never held in memory as a whole.
    Blank lines are skipped; a non-blank line that is not valid JSON still
    raises ``json.JSONDecodeError``. No sampling is performed: every record
    accepted by ``build_sample`` is written, in input order.

    Args:
        input_path: Path of the JSON Lines metadata file (UTF-8).
        output_path: Path of the JSON Lines file to create or overwrite (UTF-8).

    Returns:
        The number of samples written.
    """
    count = 0
    with open(input_path, 'r', encoding='utf-8') as f_in, \
            open(output_path, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            if not line.strip():
                # Tolerate empty lines, e.g. a trailing newline at end of file.
                continue
            sample = build_sample(json.loads(line))
            if sample is None:
                continue
            f_out.write(json.dumps(sample, ensure_ascii=False) + "\n")
            count += 1
    return count


def main():
    """Run the conversion on the configured files and report the sample count."""
    count = convert(input_file, output_file)
    print(f"转换完成,共生成{count}条微调数据,保存到 {output_file}")


if __name__ == "__main__":
    main()