import argparse
import json
import os
import random
import re

# Category menu spliced into every generated prompt. Kept verbatim (including
# the leading space) so regenerated prompts match earlier output.
_CATEGORY_TEXT = " A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE\n"

# Question phrasings used to paraphrase one classification prompt into several.
_QUESTION_TEMPLATES = [
    # Direct question
    "{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",
    # Imperative
    "Classify this paper into its scientific category based on title '{title}', authors '{authors}', and abstract '{abstract}'.{category_text}",
    # Descriptive lead-in
    "{category_text}Given a research paper with title '{title}', authors {authors}, and abstract '{abstract}', identify the appropriate discipline.",
    # Formal request
    "Please assign the scientific category for the paper: title '{title}', authors '{authors}', abstract '{abstract}'.{category_text}",
    # Abstract first
    "Using the abstract '{abstract}', title '{title}', and authors '{authors}', determine the paper's category.{category_text}",
    # Author emphasis
    "{category_text}From authors '{authors}', title '{title}', and abstract '{abstract}', what category does this paper fall into?",
    # Statement followed by question
    "Here's a paper: title '{title}', authors {authors}, abstract '{abstract}'. What is its scientific category?{category_text}",
    # Terse
    "Category for: title '{title}', authors '{authors}', abstract '{abstract}'?{category_text}",
    # Embedded context
    "Considering the title '{title}', the authors '{authors}', and the abstract content '{abstract}', please specify the paper's field.{category_text}",
    # Informal
    "Hey, what category is this paper? Title '{title}', by {authors}, abstract '{abstract}'.{category_text}",
    # Itemized fields
    "{category_text}Title: '{title}'. Authors: '{authors}'. Abstract: '{abstract}'. Now, what's the scientific category?",
    # Hypothetical
    "If a paper has title '{title}', authors '{authors}', and abstract '{abstract}', which scientific category best fits it?{category_text}",
    # Key-information emphasis
    "Based solely on the title '{title}', authors list '{authors}', and abstract text '{abstract}', categorize this paper.{category_text}",
    # Indirect question
    "For the paper '{title}' by {authors}, with abstract '{abstract}', could you indicate its scientific discipline?{category_text}",
    # Single integrated sentence
    "Determine the category of the research paper entitled '{title}', written by {authors}, and summarized as '{abstract}'.{category_text}",
    # Abstract-focused question
    "The abstract '{abstract}' describes a paper titled '{title}' by authors '{authors}'. What category is it?{category_text}",
    # Title-driven
    "{category_text}Starting from the title '{title}', and considering authors '{authors}' and abstract '{abstract}', what is the paper's category?",
    # Multi-part query
    "Part 1: Title is '{title}'. Part 2: Authors are '{authors}'. Part 3: Abstract is '{abstract}'. Based on this, classify the paper.{category_text}",
    # "Given the details" phrasing
    "Given the details: title '{title}', authors '{authors}', abstract '{abstract}', how would you categorize this paper scientifically?{category_text}",
    # Action-oriented
    "Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}",
]

# Anchoring on the fixed connective phrases (instead of splitting on quote
# characters) keeps apostrophes and "'," sequences inside a field intact.
_PROMPT_PATTERN = re.compile(
    r"title '(?P<title>.*?)', authors '(?P<authors>.*?)', "
    r"and abstract '(?P<abstract>.*?)', please determine",
    re.DOTALL,
)

# Paths the script uses when no command-line arguments are given.
_DEFAULT_INPUT_FILE = "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26.jsonl"
_DEFAULT_OUTPUT_FILE = "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26-m2.jsonl"


def extract_title_author_and_abstract(content_text):
    """Pull the title, authors and abstract out of a classification prompt.

    The expected prompt shape is::

        Based on the title '<title>', authors '<authors>', and abstract
        '<abstract>', please determine the scientific category of this paper. ...

    Args:
        content_text: The full prompt text.

    Returns:
        A dict with the keys ``"title"``, ``"authors"`` and ``"abstract"``,
        each stripped of surrounding whitespace.

    Raises:
        IndexError: If the text matches neither the expected shape nor the
            quote-delimited fallback layout.
    """
    match = _PROMPT_PATTERN.search(content_text)
    if match:
        return {key: value.strip() for key, value in match.groupdict().items()}

    # Fallback for prompts without the exact connective phrases: the first
    # quoted run of each "'," separated segment. This cannot cope with
    # apostrophes inside a field, which is why the pattern above is tried first.
    parts = content_text.split("',")
    title = parts[0].split("'")[1].strip()
    authors = parts[1].split("'")[1].strip()
    abstract = parts[2].split("'")[1].strip()
    return {"title": title, "authors": authors, "abstract": abstract}


def _write_jsonl(output_file, records):
    """Write each record as one JSON document per line, keeping non-ASCII text readable."""
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in records)


def _render_variants(prompt, num_templates, rng):
    """Return up to ``num_templates`` distinct rephrasings of ``prompt``."""
    fields = extract_title_author_and_abstract(prompt)
    count = min(num_templates, len(_QUESTION_TEMPLATES))
    return [
        template.format(category_text=_CATEGORY_TEXT, **fields)
        for template in rng.sample(_QUESTION_TEMPLATES, count)
    ]


def convert_to_alpaca_format(input_file, output_file):
    """Convert Swift-style conversation records into single-message records.

    Each input line is a JSON object of the form::

        {"system": "...", "conversation": [{"human": "...", "assistant": "D"}]}

    For every conversation turn that has both ``human`` and ``assistant``
    keys, one output line is written::

        {"messages": [{"role": "assistant",
                       "content": "This is a paper titled <human text>"}]}

    NOTE(review): despite the function name, the output carries neither the
    system prompt nor the assistant label -- confirm this is the intended
    target format.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write (overwritten).
    """
    print(f"转换数据: {input_file} -> {output_file}")

    converted_data = []

    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not malformed records.
            if not line.strip():
                continue
            try:
                data = json.loads(line.strip())

                if "system" not in data or "conversation" not in data:
                    print(f"警告: 数据缺少必要字段: {data}")
                    continue

                for turn in data["conversation"]:
                    if "human" in turn and "assistant" in turn:
                        converted_data.append(
                            {
                                "messages": [
                                    {
                                        "role": "assistant",
                                        "content": "This is a paper titled " + turn["human"],
                                    }
                                ]
                            }
                        )

            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                # Best-effort conversion: report the bad line and keep going.
                print(f"处理行时发生错误: {str(e)}")

    _write_jsonl(output_file, converted_data)

    print(f"转换完成! 共转换 {len(converted_data)} 条数据")


def convert_onedata2multi_type(input_file, output_file, num_templates, seed=None):
    """Expand each classification sample into several rephrased samples.

    Two input layouts are recognised per JSONL line:

    * ``{"messages": [...]}`` with at least three messages. The content of the
      last ``system``, ``user`` and ``assistant`` messages is used, and each
      variant is written in the same ``messages`` layout.
    * ``{"system": ..., "conversation": [{"human": ..., "assistant": ...}]}``.
      Each turn with both keys is expanded, and each variant is written in the
      same ``system`` / ``conversation`` layout.

    Lines in any other layout are reported and skipped. A line that fails to
    parse or to expand is reported and skipped as well.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write (overwritten).
        num_templates: How many distinct question templates to sample per
            prompt; capped at the number of templates available.
        seed: Optional seed for reproducible template sampling. When ``None``
            the module-level ``random`` state is used.
    """
    print(f"开始转换数据...每条数据生成{num_templates}条变体")
    print(f"开始转换数据: {input_file} -> {output_file}")

    # A private generator keeps seeded runs independent of global random state.
    rng = random.Random(seed) if seed is not None else random

    multi_type_data = []

    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not malformed records.
            if not line.strip():
                continue
            try:
                data = json.loads(line.strip())

                if (
                    "messages" in data
                    and isinstance(data["messages"], list)
                    and len(data["messages"]) >= 3
                ):
                    system_instruction = ""
                    human_content = ""
                    assistant_content = ""

                    # If a role appears more than once, the last one wins.
                    for msg in data["messages"]:
                        if msg["role"] == "system":
                            system_instruction = msg["content"]
                        elif msg["role"] == "user":
                            human_content = msg["content"]
                        elif msg["role"] == "assistant":
                            assistant_content = msg["content"]

                    for question in _render_variants(human_content, num_templates, rng):
                        multi_type_data.append(
                            {
                                "messages": [
                                    {"role": "system", "content": system_instruction},
                                    {"role": "user", "content": question},
                                    {"role": "assistant", "content": assistant_content},
                                ]
                            }
                        )

                elif "system" in data and "conversation" in data and data["conversation"]:
                    system_instruction = data.get(
                        "system", "根据论文的标题、作者和摘要,确定该论文的科学类别。"
                    )

                    for turn in data["conversation"]:
                        if "human" in turn and "assistant" in turn:
                            for question in _render_variants(turn["human"], num_templates, rng):
                                multi_type_data.append(
                                    {
                                        "system": system_instruction,
                                        "conversation": [
                                            {
                                                "human": question,
                                                "assistant": turn["assistant"],
                                            }
                                        ],
                                    }
                                )
                else:
                    print(f"警告: 数据格式不识别: {data}")
                    continue

            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                # Best-effort conversion: report the bad line and keep going.
                print(f"处理行时发生错误: {str(e)}")

    _write_jsonl(output_file, multi_type_data)

    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")


def _parse_args(argv=None):
    """Parse command-line options; every option defaults to the script's built-in value."""
    parser = argparse.ArgumentParser(
        description="Expand paper-classification samples into several rephrased prompts."
    )
    parser.add_argument("--input-file", default=_DEFAULT_INPUT_FILE)
    parser.add_argument("--output-file", default=_DEFAULT_OUTPUT_FILE)
    parser.add_argument("--num-templates", type=int, default=2)
    parser.add_argument("--seed", type=int, default=None)
    return parser.parse_args(argv)


if __name__ == "__main__":
    args = _parse_args()
    convert_onedata2multi_type(
        args.input_file,
        args.output_file,
        num_templates=args.num_templates,
        seed=args.seed,
    )