"""Expand Swift-format SFT samples into several differently-worded prompts.

Every input JSONL record describes one arXiv paper-classification example.
The title, authors and abstract are recovered from the original prompt and
re-rendered through randomly chosen question templates, so one source
record becomes ``num_templates`` training records in the same format
(``messages`` records stay ``messages`` records, ``system``/``conversation``
records stay ``system``/``conversation`` records).
"""

import argparse
import json
import os  # kept: imported by the original module
import random
import re

# Multiple-choice list of the 26 candidate arXiv categories.  The leading
# space and the trailing newline are part of the value that is embedded in
# the generated prompts, so they must not be "cleaned up".
CATEGORY_TEXT = """ A. quant-ph
B. physics.chem-ph
C. physics.atom-ph
D. cond-mat.soft
E. cs.RO
F. cs.CL
G. cs.SE
H. cs.IR
I. hep-th
J. hep-ph
K. physics.optics
L. cs.AI
M. cs.CV
N. nucl-th
O. astro-ph
P. math.PR
Q. cs.OS
R. eess.SP
S. math.OC
T. math.DS
U. math.DG
V. math.MP
W. cs.MM
X. stat.ME
Y. math.CO
Z. cs.NE
"""

# System prompt used when an old-format record carries an empty one.
DEFAULT_SYSTEM_INSTRUCTION = "根据论文的标题、作者和摘要,确定该论文的科学类别。"

# Question templates.  Placeholders: {title}, {authors}, {abstract} and
# {category_text} (the option list above).
QUESTION_TEMPLATES = [
    # Direct question
    "{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",
    # Imperative
    "Classify this paper into its scientific category based on title '{title}', authors '{authors}', and abstract '{abstract}'.{category_text}",
    # Descriptive lead-in
    "{category_text}Given a research paper with title '{title}', authors {authors}, and abstract '{abstract}', identify the appropriate discipline.",
    # Formal request
    "Please assign the scientific category for the paper: title '{title}', authors '{authors}', abstract '{abstract}'.{category_text}",
    # Abstract first
    "Using the abstract '{abstract}', title '{title}', and authors '{authors}', determine the paper's category.{category_text}",
    # Authors emphasised
    "{category_text}From authors '{authors}', title '{title}', and abstract '{abstract}', what category does this paper fall into?",
    # Statement followed by question
    "Here's a paper: title '{title}', authors {authors}, abstract '{abstract}'. What is its scientific category?{category_text}",
    # Terse
    "Category for: title '{title}', authors '{authors}', abstract '{abstract}'?{category_text}",
    # Context embedded
    "Considering the title '{title}', the authors '{authors}', and the abstract content '{abstract}', please specify the paper's field.{category_text}",
    # Informal
    "Hey, what category is this paper? Title '{title}', by {authors}, abstract '{abstract}'.{category_text}",
    # Field listing
    "{category_text}Title: '{title}'. Authors: '{authors}'. Abstract: '{abstract}'. Now, what's the scientific category?",
    # Hypothetical
    "If a paper has title '{title}', authors '{authors}', and abstract '{abstract}', which scientific category best fits it?{category_text}",
    # Key information emphasised
    "Based solely on the title '{title}', authors list '{authors}', and abstract text '{abstract}', categorize this paper.{category_text}",
    # Indirect question
    "For the paper '{title}' by {authors}, with abstract '{abstract}', could you indicate its scientific discipline?{category_text}",
    # Single integrated sentence
    "Determine the category of the research paper entitled '{title}', written by {authors}, and summarized as '{abstract}'.{category_text}",
    # Abstract-focused question
    "The abstract '{abstract}' describes a paper titled '{title}' by authors '{authors}'. What category is it?{category_text}",
    # Title driven
    "{category_text}Starting from the title '{title}', and considering authors '{authors}' and abstract '{abstract}', what is the paper's category?",
    # Multi-part query
    "Part 1: Title is '{title}'. Part 2: Authors are '{authors}'. Part 3: Abstract is '{abstract}'. Based on this, classify the paper.{category_text}",
    # "How would you" phrasing
    "Given the details: title '{title}', authors '{authors}', abstract '{abstract}', how would you categorize this paper scientifically?{category_text}",
    # Action oriented
    "Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}",
]

_FIELD_NAMES = ("title", "authors", "abstract")

# Matches the canonical source prompt:
#   "... title '<T>', authors '<A>', and abstract '<B>', please determine ..."
# Anchoring on the fixed wording between the fields (instead of on bare
# quote characters) keeps apostrophes inside a field from cutting it short.
_PROMPT_PATTERN = re.compile(
    r"title\s+'(?P<title>.*?)',\s*authors\s+'(?P<authors>.*?)',"
    r"\s*and\s+abstract\s+'(?P<abstract>.*?)',\s*please\s+determine",
    re.DOTALL,
)


def _empty_fields():
    """Return a fresh result dict with every field set to the empty string."""
    return {name: "" for name in _FIELD_NAMES}


def _extract_by_splitting(content_text):
    """Legacy extraction: split on ``',`` and take the first quoted piece.

    Only used when the text does not follow the canonical prompt wording.
    It cannot cope with apostrophes inside a field.
    """
    parts = content_text.split("',")
    if len(parts) < 3:
        return _empty_fields()

    def first_quoted(fragment):
        pieces = fragment.split("'")
        return pieces[1].strip() if len(pieces) >= 2 else ""

    return {
        name: first_quoted(fragment)
        for name, fragment in zip(_FIELD_NAMES, parts)
    }


def extract_title_author_and_abstract(content_text):
    """Recover title, authors and abstract from a source prompt.

    The expected prompt shape is::

        Based on the title '<title>', authors '<authors>', and abstract
        '<abstract>', please determine the scientific category of this
        paper. ...

    Args:
        content_text: The user prompt of one source record.

    Returns:
        dict with the keys ``title``, ``authors`` and ``abstract``.  All
        three values are empty strings when nothing could be extracted.
        The function never raises.
    """
    if not isinstance(content_text, str):
        print(f"解析内容时出错: expected str, got {type(content_text).__name__}")
        return _empty_fields()

    match = _PROMPT_PATTERN.search(content_text)
    if match:
        return {name: match.group(name).strip() for name in _FIELD_NAMES}

    # Unknown wording: fall back to the historical quote-splitting rule.
    return _extract_by_splitting(content_text)


def parse_new_format_data(data):
    """Pull the three message contents out of a ``messages`` record.

    Args:
        data: Decoded JSON record.

    Returns:
        tuple: ``(system_instruction, human_content, assistant_content)``;
        a role that does not occur yields ``""``.  ``(None, None, None)``
        when ``data`` has no ``messages`` list with at least three entries.
        If a role occurs more than once the last occurrence wins.
    """
    messages = data.get("messages") if isinstance(data, dict) else None
    if not isinstance(messages, list) or len(messages) < 3:
        return None, None, None

    contents = {"system": "", "user": "", "assistant": ""}
    for msg in messages:
        if not isinstance(msg, dict):
            continue  # tolerate malformed entries instead of raising
        role = msg.get("role")
        if role in contents:
            contents[role] = msg.get("content", "")

    return contents["system"], contents["user"], contents["assistant"]


def parse_old_format_data(data):
    """Pull system prompt and turns out of a ``system``/``conversation`` record.

    Args:
        data: Decoded JSON record.

    Returns:
        tuple: ``(system_instruction, conversation_data)``, or
        ``(None, None)`` when a key is missing or the conversation is
        empty.  An empty system prompt is replaced by
        ``DEFAULT_SYSTEM_INSTRUCTION``.
    """
    if not isinstance(data, dict) or "system" not in data:
        return None, None

    conversation = data.get("conversation")
    if not conversation:
        return None, None

    return data["system"] or DEFAULT_SYSTEM_INSTRUCTION, conversation


def _render_questions(title, authors, abstract, num_templates):
    """Fill ``num_templates`` randomly chosen templates (without repeats)."""
    # Clamp to [0, number of templates] so random.sample can never raise.
    count = max(0, min(num_templates, len(QUESTION_TEMPLATES)))
    return [
        template.format(
            title=title,
            authors=authors,
            abstract=abstract,
            category_text=CATEGORY_TEXT,
        )
        for template in random.sample(QUESTION_TEMPLATES, count)
    ]


def generate_multi_type_samples(title, authors, abstract, system_instruction, assistant_content, num_templates):
    """Build ``messages``-format samples from randomly chosen templates.

    Args:
        title: Paper title.
        authors: Author list as text.
        abstract: Paper abstract.
        system_instruction: Content of the system message.
        assistant_content: Expected answer (content of the assistant message).
        num_templates: Number of variants wanted; capped at the number of
            available templates, values below zero are treated as zero.

    Returns:
        list: One ``{"messages": [...]}`` dict per selected template.
    """
    return [
        {
            "messages": [
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": question},
                {"role": "assistant", "content": assistant_content},
            ]
        }
        for question in _render_questions(title, authors, abstract, num_templates)
    ]


def process_new_format_data(data, num_templates):
    """Expand one ``messages`` record into prompt variants.

    Args:
        data: Decoded JSON record in ``messages`` format.
        num_templates: Number of variants to generate.

    Returns:
        list: Generated samples; empty when the record is malformed or
        no field could be extracted from its user prompt.
    """
    system_instruction, human_content, assistant_content = parse_new_format_data(data)
    if not human_content:
        return []

    fields = extract_title_author_and_abstract(human_content)
    if not any(fields.values()):
        # A prompt with every field blank would only add noise to the data.
        print("警告: 无法从内容中提取标题、作者和摘要,已跳过该样本")
        return []

    return generate_multi_type_samples(
        fields["title"],
        fields["authors"],
        fields["abstract"],
        system_instruction,
        assistant_content,
        num_templates,
    )


def process_old_format_data(data, num_templates):
    """Expand one ``system``/``conversation`` record into prompt variants.

    Args:
        data: Decoded JSON record in the old format.
        num_templates: Number of variants to generate per turn.

    Returns:
        list: Old-format samples, each holding a single turn.  Turns
        lacking ``human``/``assistant`` or yielding no extracted field
        are skipped.
    """
    system_instruction, conversation_data = parse_old_format_data(data)
    if not conversation_data:
        return []

    samples = []
    for turn in conversation_data:
        if not isinstance(turn, dict) or "human" not in turn or "assistant" not in turn:
            continue

        fields = extract_title_author_and_abstract(turn["human"])
        if not any(fields.values()):
            print("警告: 无法从内容中提取标题、作者和摘要,已跳过该样本")
            continue

        questions = _render_questions(
            fields["title"], fields["authors"], fields["abstract"], num_templates
        )
        samples.extend(
            {
                "system": system_instruction,
                "conversation": [
                    {"human": question, "assistant": turn["assistant"]}
                ],
            }
            for question in questions
        )

    return samples


def convert_onedata2multi_type(input_file, output_file, num_templates):
    """Read a JSONL file and write the template-expanded variants.

    Args:
        input_file: Path of the source JSONL file (one record per line).
        output_file: Path of the JSONL file to create or overwrite.
        num_templates: Number of prompt variants generated per record.

    Unreadable or unrecognised lines are reported on stdout and skipped;
    they never abort the conversion.
    """
    print(f"开始转换数据...每条数据生成{num_templates}条变体")
    print(f"开始转换数据: {input_file} -> {output_file}")

    multi_type_data = []

    with open(input_file, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            stripped = line.strip()
            if not stripped:
                continue  # blank lines are not worth a warning

            try:
                data = json.loads(stripped)

                if isinstance(data, dict) and "messages" in data:
                    samples = process_new_format_data(data, num_templates)
                elif isinstance(data, dict) and "system" in data and "conversation" in data:
                    samples = process_old_format_data(data, num_templates)
                else:
                    print(f"警告: 第{line_num}行数据格式不识别: {data}")
                    continue

                if not samples and num_templates > 0:
                    # Recognised format, but nothing usable inside.
                    print(f"警告: 第{line_num}行未生成任何样本,已跳过")
                multi_type_data.extend(samples)

            except json.JSONDecodeError:
                print(f"警告: 第{line_num}行无法解析JSON: {line}")
            except Exception as e:
                # Per-record boundary: one bad record must not stop the batch.
                print(f"处理第{line_num}行时发生错误: {str(e)}")

    # Written only after reading has finished, so the result is complete
    # even if output_file and input_file are the same path.
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in multi_type_data
        )

    print(f"转换完成! 共生成 {len(multi_type_data)} 条数据")


def _build_arg_parser():
    """Create the command-line parser; defaults reproduce the old fixed run."""
    parser = argparse.ArgumentParser(
        description="Expand Swift-format SFT samples into multiple prompt variants."
    )
    parser.add_argument(
        "--input",
        default="G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot--swift-26-500.jsonl",
        help="source JSONL file",
    )
    parser.add_argument(
        "--output",
        default="G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot--swift-26-500-m+.jsonl",
        help="destination JSONL file",
    )
    parser.add_argument(
        "--num-templates",
        type=int,
        default=1,
        help="prompt variants generated per record",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="seed for reproducible template selection",
    )
    return parser


if __name__ == "__main__":
    args = _build_arg_parser().parse_args()
    if args.seed is not None:
        random.seed(args.seed)
    convert_onedata2multi_type(args.input, args.output, num_templates=args.num_templates)