"""Build multi-template paper-classification samples from crawled arXiv data.

The module reads paper records (title / authors / abstract / category) from
JSONL files, renders them through the question templates below and writes
the result as JSONL, either as SFT conversations or as pre-training text.
It also contains helpers that re-template already rendered samples.
"""

import argparse
import json
import os
import random
import re

# Multiple-choice option list that is appended to every question.
CATEGORY_TEXT = """
A. quant-ph
B. physics.chem-ph
C. physics.atom-ph
D. cond-mat.soft
E. cs.RO
F. cs.CL
G. cs.SE
H. cs.IR
I. hep-th
J. hep-ph
K. physics.optics
L. cs.AI
M. cs.CV
N. nucl-th
O. astro-ph
P. math.PR
Q. cs.OS
R. eess.SP
S. math.OC
T. math.DS
U. math.DG
V. math.MP
W. cs.MM
X. stat.ME
Y. math.CO
Z. cs.NE
"""

# Maps a category name to the option letter used as the expected answer.
CATEGORY_DICT = {
    "quant-ph": "A",
    "physics.chem-ph": "B",
    "physics.atom-ph": "C",
    "cond-mat.soft": "D",
    "cs.RO": "E",
    "cs.CL": "F",
    "cs.SE": "G",
    "cs.IR": "H",
    "hep-th": "I",
    "hep-ph": "J",
    "physics.optics": "K",
    "cs.AI": "L",
    "cs.CV": "M",
    "nucl-th": "N",
    "astro-ph": "O",
    "math.PR": "P",
    "cs.OS": "Q",
    "eess.SP": "R",
    "math.OC": "S",
    "math.DS": "T",
    "math.DG": "U",
    "math.MP": "V",
    "cs.MM": "W",
    "stat.ME": "X",
    "math.CO": "Y",
    "cs.NE": "Z",
}

# Pool of alternative phrasings. It is NOT active: QUESTION_TEMPLATES below
# is the list every function samples from. Assign this pool to
# QUESTION_TEMPLATES to enable the variants.
QUESTION_TEMPLATE_VARIANTS = [
    # Direct question
    "{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",
    # Imperative
    "Classify this paper into its scientific category based on title '{title}', authors '{authors}', and abstract '{abstract}'.{category_text}",
    # Descriptive lead-in
    "{category_text}Given a research paper with title '{title}', authors {authors}, and abstract '{abstract}', identify the appropriate discipline.",
    # Formal request
    "Please assign the scientific category for the paper: title '{title}', authors '{authors}', abstract '{abstract}'.{category_text}",
    # Abstract first
    "Using the abstract '{abstract}', title '{title}', and authors '{authors}', determine the paper's category.{category_text}",
    # Author emphasis
    "{category_text}From authors '{authors}', title '{title}', and abstract '{abstract}', what category does this paper fall into?",
    # Chained question
    "Here's a paper: title '{title}', authors {authors}, abstract '{abstract}'. What is its scientific category?{category_text}",
    # Terse
    "Category for: title '{title}', authors '{authors}', abstract '{abstract}'?{category_text}",
    # Embedded context
    "Considering the title '{title}', the authors '{authors}', and the abstract content '{abstract}', please specify the paper's field.{category_text}",
    # Informal
    "Hey, what category is this paper? Title '{title}', by {authors}, abstract '{abstract}'.{category_text}",
    # Itemised
    "{category_text}Title: '{title}'. Authors: '{authors}'. Abstract: '{abstract}'. Now, what's the scientific category?",
    # Hypothetical
    "If a paper has title '{title}', authors '{authors}', and abstract '{abstract}', which scientific category best fits it?{category_text}",
    # Key information emphasis
    "Based solely on the title '{title}', authors list '{authors}', and abstract text '{abstract}', categorize this paper.{category_text}",
    # Indirect question
    "For the paper '{title}' by {authors}, with abstract '{abstract}', could you indicate its scientific discipline?{category_text}",
    # Single integrated sentence
    "Determine the category of the research paper entitled '{title}', written by {authors}, and summarized as '{abstract}'.{category_text}",
    # Abstract-focused question
    "The abstract '{abstract}' describes a paper titled '{title}' by authors '{authors}'. What category is it?{category_text}",
    # Title driven
    "{category_text}Starting from the title '{title}', and considering authors '{authors}' and abstract '{abstract}', what is the paper's category?",
    # Multi-part query
    "Part 1: Title is '{title}'. Part 2: Authors are '{authors}'. Part 3: Abstract is '{abstract}'. Based on this, classify the paper.{category_text}",
    # Comparative
    "Given the details: title '{title}', authors '{authors}', abstract '{abstract}', how would you categorize this paper scientifically?{category_text}",
    # Action oriented
    "Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}",
]

# Templates actually used by the functions in this module.
QUESTION_TEMPLATES = [
    "Based on the title '{title}', authors '{authors}', and abstract '{abstract}', please determine the scientific category of this paper.\n\n{category_text}"
]

# Matches the wording of the active template so that the three fields can be
# recovered even when they contain apostrophes (e.g. "Bell's theorem").
_PROMPT_FIELDS_RE = re.compile(
    r"title\s+'(?P<title>.*?)',\s*authors\s+'(?P<authors>.*?)',"
    r"\s*and\s+abstract\s+'(?P<abstract>.*?)',\s*please\s+determine",
    re.DOTALL,
)


def _format_authors(raw_authors) -> str:
    """Return the authors as one comma-separated string.

    Lists/tuples are joined, ``None`` becomes an empty string and any other
    value is converted with ``str`` (a plain string is returned unchanged).
    """
    if raw_authors is None:
        return ""
    if isinstance(raw_authors, (list, tuple)):
        return ", ".join(str(name) for name in raw_authors)
    return str(raw_authors)


def _quoted_segment(fragment: str) -> str:
    """Return the text between the first two apostrophes of *fragment*."""
    pieces = fragment.split("'")
    return pieces[1].strip() if len(pieces) >= 2 else ""


def _select_templates(num_templates: int) -> list:
    """Randomly pick up to *num_templates* distinct question templates.

    The count is clamped to ``[0, len(QUESTION_TEMPLATES)]`` because
    ``random.sample`` raises ``ValueError`` outside that range.
    """
    count = max(0, min(num_templates, len(QUESTION_TEMPLATES)))
    return random.sample(QUESTION_TEMPLATES, count)


def _render_question(template: str, title: str, authors: str, abstract: str) -> str:
    """Fill one question template with the paper fields and the option list."""
    return template.format(
        title=title,
        authors=authors,
        abstract=abstract,
        category_text=CATEGORY_TEXT,
    )


def _write_jsonl(records: list, output_file: str) -> None:
    """Write *records* to *output_file*, one JSON document per line (UTF-8)."""
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )


def extract_title_author_and_abstract(content_text):
    """Recover title, authors and abstract from a prompt or a JSON record.

    Three strategies are tried in order:

    1. If the text looks like a JSON object with ``title`` and
       ``author_names``/``authors`` keys, it is parsed as JSON
       (abstract is read from ``summary`` or ``abstract``).
    2. If the text follows the prompt wording
       ``... title '<T>', authors '<A>', and abstract '<B>', please determine
       ...``, the three quoted fields are extracted with a regular
       expression, so apostrophes inside a field do not truncate it.
    3. Otherwise the text is split on ``',`` and the first quoted segment of
       each of the first three parts is used.

    Args:
        content_text: Prompt text or JSON string describing one paper.

    Returns:
        dict: ``{"title": ..., "authors": ..., "abstract": ...}``; all values
        are empty strings when nothing could be extracted or an error
        occurred (the error is printed).
    """
    empty = {"title": "", "authors": "", "abstract": ""}
    try:
        stripped = content_text.strip()

        # Strategy 1: the text is itself a JSON paper record.
        looks_like_json = (
            stripped.startswith("{")
            and '"title"' in content_text
            and ('"author_names"' in content_text or '"authors"' in content_text)
        )
        if looks_like_json:
            try:
                paper_data = json.loads(content_text)
                return {
                    "title": paper_data.get("title", ""),
                    "authors": _format_authors(
                        paper_data.get("author_names", paper_data.get("authors", []))
                    ),
                    "abstract": paper_data.get(
                        "summary", paper_data.get("abstract", "")
                    ),
                }
            except (ValueError, TypeError, AttributeError):
                # Not valid JSON after all; fall through to the text parsers.
                pass

        # Strategy 2: text rendered from the known prompt wording.
        match = _PROMPT_FIELDS_RE.search(content_text)
        if match:
            return {
                field: match.group(field).strip()
                for field in ("title", "authors", "abstract")
            }

        # Strategy 3: generic split on "'," (breaks on embedded apostrophes).
        parts = content_text.split("',")
        if len(parts) < 3:
            return dict(empty)
        return {
            "title": _quoted_segment(parts[0]),
            "authors": _quoted_segment(parts[1]),
            "abstract": _quoted_segment(parts[2]),
        }
    except Exception as e:
        # Best effort: any unexpected input yields the empty record.
        print(f"解析内容时出错: {e}")
        return dict(empty)


def parse_new_format_data(data):
    """Split a ``messages``-style record into its three role contents.

    Args:
        data: Record with a ``messages`` list of ``{"role", "content"}``
            dicts; at least three messages are required.

    Returns:
        tuple: ``(system_instruction, human_content, assistant_content)``.
        A role that does not occur yields ``""``; when a role occurs several
        times the last occurrence wins. ``(None, None, None)`` is returned
        when ``messages`` is missing, not a list, or shorter than three.
    """
    if (
        "messages" not in data
        or not isinstance(data["messages"], list)
        or len(data["messages"]) < 3
    ):
        return None, None, None

    contents = {"system": "", "user": "", "assistant": ""}
    for msg in data["messages"]:
        if not isinstance(msg, dict):
            continue  # tolerate malformed entries instead of raising
        role = msg.get("role")
        if role in contents:
            contents[role] = msg.get("content", "")
    return contents["system"], contents["user"], contents["assistant"]


def parse_old_format_data(data):
    """Split a ``system``/``conversation``-style record.

    Args:
        data: Record with a ``system`` string and a non-empty
            ``conversation`` list.

    Returns:
        tuple: ``(system_instruction, conversation)`` or ``(None, None)``
        when either key is missing or the conversation is empty.
    """
    if "system" not in data or "conversation" not in data or not data["conversation"]:
        return None, None

    # NOTE: the default is unreachable because "system" was checked above;
    # it is kept so the statement matches the historical behaviour exactly.
    system_instruction = data.get("system", "根据论文的标题、作者和摘要,确定该论文的科学类别。")
    return system_instruction, data["conversation"]


def generate_multi_type_samples(title, authors, abstract, system_instruction, assistant_content, num_templates):
    """Render one paper through several templates as ``messages`` samples.

    Args:
        title: Paper title.
        authors: Authors as a single string.
        abstract: Paper abstract.
        system_instruction: Content of the system message.
        assistant_content: Content of the assistant message (the answer).
        num_templates: Number of templates to use; clamped to the number of
            available templates, values below zero yield no samples.

    Returns:
        list: One ``{"messages": [system, user, assistant]}`` dict per
        selected template.
    """
    return [
        {
            "messages": [
                {"role": "system", "content": system_instruction},
                {
                    "role": "user",
                    "content": _render_question(template, title, authors, abstract),
                },
                {"role": "assistant", "content": assistant_content},
            ]
        }
        for template in _select_templates(num_templates)
    ]


def process_new_format_data(data, num_templates):
    """Re-template one ``messages``-style record.

    Args:
        data: Record accepted by ``parse_new_format_data``.
        num_templates: Number of templates to render.

    Returns:
        list: Samples from ``generate_multi_type_samples``; empty when the
        record is invalid or has no user content.
    """
    system_instruction, human_content, assistant_content = parse_new_format_data(data)
    if not human_content:
        return []

    fields = extract_title_author_and_abstract(human_content)
    return generate_multi_type_samples(
        fields.get("title", ""),
        fields.get("authors", ""),
        fields.get("abstract", ""),
        system_instruction,
        assistant_content,
        num_templates,
    )


def process_old_format_data(data, num_templates):
    """Re-template one ``system``/``conversation``-style record.

    Args:
        data: Record accepted by ``parse_old_format_data``.
        num_templates: Number of templates to render per conversation turn.

    Returns:
        list: One ``{"system", "conversation": [{"human", "assistant"}]}``
        dict per turn and selected template. Turns lacking ``human`` or
        ``assistant`` are skipped.
    """
    system_instruction, conversation_data = parse_old_format_data(data)
    if not conversation_data:
        return []

    samples = []
    for turn in conversation_data:
        if "human" not in turn or "assistant" not in turn:
            continue

        fields = extract_title_author_and_abstract(turn["human"])
        # Templates are re-sampled for every turn.
        for template in _select_templates(num_templates):
            question = _render_question(
                template,
                fields.get("title", ""),
                fields.get("authors", ""),
                fields.get("abstract", ""),
            )
            samples.append(
                {
                    "system": system_instruction,
                    "conversation": [
                        {"human": question, "assistant": turn["assistant"]}
                    ],
                }
            )
    return samples


def get_paper_data_from_crawl_jason(input_path):
    """Load paper records from one JSONL file or from a directory of them.

    Args:
        input_path: Path of a single file, or of a directory whose
            ``*.jsonl`` files are all read (in sorted name order).

    Returns:
        list: Paper dicts as produced by ``_extract_paper_data_from_file``;
        empty when the path is neither a file nor a directory. Files that
        fail to load are reported and skipped.
    """
    paper_data_list = []

    if os.path.isfile(input_path):
        paper_data_list.extend(_extract_paper_data_from_file(input_path))
        print(f"从文件 {input_path} 中提取了 {len(paper_data_list)} 条数据")
    elif os.path.isdir(input_path):
        files_found = 0
        # Sorted so that the output order is reproducible across runs.
        for filename in sorted(os.listdir(input_path)):
            if not filename.endswith('.jsonl'):
                continue
            file_path = os.path.join(input_path, filename)
            try:
                file_data = _extract_paper_data_from_file(file_path)
            except Exception as e:
                # One unreadable file must not abort the whole directory.
                print(f"处理文件 {filename} 时出错: {e}")
                continue
            paper_data_list.extend(file_data)
            print(f"已从 {filename} 中提取 {len(file_data)} 条数据")
            files_found += 1
        print(f"在目录中找到 {files_found} 个JSON文件")
    else:
        print(f"路径 {input_path} 既不是文件也不是文件夹")

    print(f"总共提取了 {len(paper_data_list)} 条论文数据")
    return paper_data_list


def _extract_paper_data_from_file(file_path):
    """Read paper records from one JSONL file.

    Each non-blank line must be a JSON object. Authors are read from
    ``author_names`` or ``authors``, the abstract from ``summary`` or
    ``abstract``, the category from ``category`` or, failing that, the first
    entry of a ``categories`` list (default ``"Unknown"``).

    Args:
        file_path: Path of the JSONL file.

    Returns:
        list: Dicts with keys ``title``, ``authors``, ``abstract`` and
        ``category``. Lines that are not valid JSON objects are reported
        and skipped.
    """
    paper_data_list = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"解析文件 {file_path} 的第 {line_num} 行时出错: {e}")
                continue

            if not isinstance(item, dict):
                # Valid JSON but not a record (e.g. a list or a number).
                print(f"解析文件 {file_path} 的第 {line_num} 行时出错: 不是JSON对象")
                continue

            category = item.get("category", "Unknown")
            categories = item.get("categories")
            if category == "Unknown" and isinstance(categories, list) and categories:
                category = categories[0]

            paper_data_list.append(
                {
                    "title": item.get("title", ""),
                    "authors": _format_authors(
                        item.get("author_names", item.get("authors", []))
                    ),
                    "abstract": item.get("summary", item.get("abstract", "")),
                    "category": category,
                }
            )

    return paper_data_list


def convert_onedata2multi_type_pre(paper_datas, output_file, num_templates):
    """Write pre-training samples: one rendered question per template.

    Every sample is ``{"messages": [{"role": "assistant", "content": Q}]}``
    where ``Q`` is the rendered question; no answer is included.

    Args:
        paper_datas: Paper dicts with ``title``, ``authors`` and
            ``summary``/``abstract``.
        output_file: Destination path; written as JSONL.
        num_templates: Number of templates rendered per paper.
    """
    print(f"开始转换数据...每条数据生成{num_templates}条变体")
    # The input path is not known here (papers arrive already loaded), so
    # only the destination is reported.
    print(f"开始转换数据 -> {output_file}")

    multi_type_data = []
    for item in paper_datas:
        title = item.get("title", "")
        authors = item.get("authors", "")
        abstract = item.get("summary", item.get("abstract", ""))

        for template in _select_templates(num_templates):
            multi_type_data.append(
                {
                    "messages": [
                        {
                            "role": "assistant",
                            "content": _render_question(
                                template, title, authors, abstract
                            ),
                        }
                    ]
                }
            )

    _write_jsonl(multi_type_data, output_file)
    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")


def convert_onedata2multi_type_sft(paper_datas, output_file, num_templates):
    """Write SFT samples: rendered question plus the category letter.

    Every sample is ``{"system": S, "conversation": [{"human": Q,
    "assistant": A}]}`` where ``A`` is the option letter looked up in
    ``CATEGORY_DICT`` (``"Unknown"`` for categories not in the table).

    Args:
        paper_datas: Paper dicts with ``title``, ``authors``,
            ``summary``/``abstract`` and ``category``.
        output_file: Destination path; written as JSONL.
        num_templates: Number of templates rendered per paper.
    """
    print(f"开始转换数据...每条数据生成{num_templates}条变体")
    # The input path is not known here (papers arrive already loaded), so
    # only the destination is reported.
    print(f"开始转换数据 -> {output_file}")

    system_instruction = "你是个优秀的论文分类师,根据论文的标题、作者和摘要,确定该论文的科学类别。"

    multi_type_data = []
    for item in paper_datas:
        title = item.get("title", "")
        authors = item.get("authors", "")
        abstract = item.get("summary", item.get("abstract", ""))
        answer = CATEGORY_DICT.get(item.get("category", "Unknown"), "Unknown")

        for template in _select_templates(num_templates):
            multi_type_data.append(
                {
                    "system": system_instruction,
                    "conversation": [
                        {
                            "human": _render_question(
                                template, title, authors, abstract
                            ),
                            "assistant": answer,
                        }
                    ],
                }
            )

    _write_jsonl(multi_type_data, output_file)
    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")


if __name__ == "__main__":
    # Example usage.
    input_file = r"G:\\11\data-prepare\\arxiv_papers\\"
    output_file_sft = r"G:\\11\data-prepare\\arxiv_papers-multi_type-sft.json"
    output_file_pre = r"G:\\11\data-prepare\\arxiv_papers-multi_type-pre.json"

    paper_datas = get_paper_data_from_crawl_jason(input_file)
    convert_onedata2multi_type_sft(paper_datas, output_file_sft, num_templates=1)
    # To produce the pre-training variant instead, call:
    # convert_onedata2multi_type_pre(paper_datas, output_file_pre, num_templates=1)