添加多种问题模板生成和数据解析功能,优化数据转换流程

This commit is contained in:
2025-07-26 11:16:28 +08:00
parent 2846ebd310
commit ecf6279300

View File

@@ -4,119 +4,37 @@ import os
import argparse import argparse
import random import random
# Category option text shared by every question template:
# the 26 arXiv category choices, labelled A through Z.
CATEGORY_TEXT = """ A. quant-ph
B. physics.chem-ph
C. physics.atom-ph
D. cond-mat.soft
E. cs.RO
F. cs.CL
G. cs.SE
H. cs.IR
I. hep-th
J. hep-ph
K. physics.optics
L. cs.AI
M. cs.CV
N. nucl-th
O. astro-ph
P. math.PR
Q. cs.OS
R. eess.SP
S. math.OC
T. math.DS
U. math.DG
V. math.MP
W. cs.MM
X. stat.ME
Y. math.CO
Z. cs.NE
"""
# 问题模板常量
""" QUESTION_TEMPLATES = [
#content_text.split("',")
parts = content_text.split("',")
title = parts[0].split("'")[1].strip()
authors = parts[1].split("'")[1].strip()
abstract = parts[2].split("'")[1].strip()
# # for part in parts:
# # print(part)
# print(title)
# print("----------------------------------------------------------------------------------------------------------")
# print(authors)
# print("----------------------------------------------------------------------------------------------------------")
# print(abstract)
# print("----------------------------------------------------------------------------------------------------------")
return {"title": title, "authors": authors, "abstract": abstract}
def convert_to_alpaca_format(input_file, output_file):
"""
将 Swift 格式的数据转换为 Alpaca 格式
输入格式:
{
"system": "你是个优秀的论文分类师",
"conversation": [
{
"human": "Based on the title...",
"assistant": "D"
}
]
}
"""
print(f"转换数据: {input_file} -> {output_file}")
converted_data = []
with open(input_file, "r", encoding="utf-8") as f:
for line in f:
try:
data = json.loads(line.strip())
# 检查数据结构
if "system" not in data or "conversation" not in data:
print(f"警告: 数据缺少必要字段: {data}")
continue
# 从 system 提取指令
instruction = data.get("system", "")
if not instruction:
instruction = "根据论文的标题、作者和摘要,确定该论文的科学类别。"
# 处理对话
for turn in data["conversation"]:
if "human" in turn and "assistant" in turn:
# 创建新的 Alpaca 格式数据
new_data = {
"messages": [
{
"role": "assistant",
"content": "This is a paper titled " + turn["human"]
}]}
converted_data.append(new_data)
except json.JSONDecodeError:
print(f"警告: 无法解析JSON行: {line}")
except Exception as e:
print(f"处理行时发生错误: {str(e)}")
# 写入输出文件
with open(output_file, "w", encoding="utf-8") as f:
for item in converted_data:
f.write(json.dumps(item, ensure_ascii=False) + "\n")
print(f"转换完成! 共转换 {len(converted_data)} 条数据")
def convert_onedata2multi_type(input_file, output_file, num_templates):
"""
读取input_file将Swift格式的1条数据按20种问题模板格式转换为20条数据
并保存为output_file
参数:
input_file: 输入文件路径
output_file: 输出文件路径
"""
print(f"开始转换数据...每条数据生成{num_templates}条变体")
print(f"开始转换数据: {input_file} -> {output_file}")
category_text=" A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE\n"
# 定义20种问题模板
question_templates = [
# 直接提问式 # 直接提问式
"{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?", "{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",
@@ -178,16 +96,73 @@ def convert_onedata2multi_type(input_file, output_file, num_templates):
"Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}" "Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}"
] ]
def extract_title_author_and_abstract(content_text):
    """Parse the title, authors and abstract out of a prompt string.

    Expected content_text shape:
        "Based on the title 'T...', authors 'A...', and abstract 'X...',
         please determine the scientific category of this paper. ..."

    The string is split on "'," and the text between the first pair of
    single quotes of each of the first three parts is taken.
    NOTE(review): this is brittle if any field itself contains "'," —
    acceptable for the current data, but worth confirming upstream.

    Args:
        content_text: prompt string in the format shown above.

    Returns:
        dict with keys "title", "authors", "abstract"; any field that
        cannot be parsed is returned as an empty string.
    """
    try:
        parts = content_text.split("',")
        if len(parts) < 3:
            # Fewer than three quoted sections: nothing reliable to extract.
            return {"title": "", "authors": "", "abstract": ""}
        fields = []
        for part in parts[:3]:
            pieces = part.split("'")
            # pieces[1] is the text after the first opening quote.
            fields.append(pieces[1].strip() if len(pieces) >= 2 else "")
        return {"title": fields[0], "authors": fields[1], "abstract": fields[2]}
    except Exception as e:
        # Defensive: a malformed line must never abort the whole conversion.
        print(f"解析内容时出错: {e}")
        return {"title": "", "authors": "", "abstract": ""}
def parse_new_format_data(data):
    """Extract (system, user, assistant) contents from a "messages" record.

    Args:
        data: JSON object expected to contain data["messages"], a list of
            {"role": ..., "content": ...} dicts with at least 3 entries.

    Returns:
        tuple: (system_instruction, human_content, assistant_content),
        or (None, None, None) when the structure does not match.
    """
    messages = data.get("messages")
    # Single structure check (the original duplicated this guard twice).
    if not isinstance(messages, list) or len(messages) < 3:
        return None, None, None
    system_instruction = ""
    human_content = ""
    assistant_content = ""
    # If a role appears more than once, the last occurrence wins.
    for msg in messages:
        role = msg.get("role")
        if role == "system":
            system_instruction = msg["content"]
        elif role == "user":
            human_content = msg["content"]
        elif role == "assistant":
            assistant_content = msg["content"]
    return system_instruction, human_content, assistant_content
def parse_old_format_data(data):
    """Extract the system instruction and conversation from an old-format record.

    Args:
        data: JSON object expected to contain "system" and a non-empty
            "conversation" list of {"human": ..., "assistant": ...} turns.

    Returns:
        tuple: (system_instruction, conversation_list), or (None, None)
        when the structure does not match. An empty "system" value falls
        back to the default Chinese classification instruction.
    """
    if "system" not in data or "conversation" not in data or not data["conversation"]:
        return None, None
    # The guard above guarantees the key exists, so a dict.get default could
    # never fire; use `or` so an *empty* system string also gets the default.
    system_instruction = data["system"] or "根据论文的标题、作者和摘要,确定该论文的科学类别。"
    return system_instruction, data["conversation"]
def generate_multi_type_samples(title, authors, abstract, system_instruction, assistant_content, num_templates):
    """Build one chat-format sample per randomly chosen question template.

    Args:
        title: paper title
        authors: paper authors
        abstract: paper abstract
        system_instruction: content for the "system" message
        assistant_content: expected answer for the "assistant" message
        num_templates: how many template variants to produce (capped at
            the number of available templates)

    Returns:
        list: generated {"messages": [...]} records.
    """
    # Sample without replacement so every variant uses a distinct template.
    n = min(num_templates, len(QUESTION_TEMPLATES))
    selected_templates = random.sample(QUESTION_TEMPLATES, n)
    samples = []
    for template in selected_templates:
        formatted_question = template.format(
            title=title,
            authors=authors,
            abstract=abstract,
            category_text=CATEGORY_TEXT,
        )
        samples.append({
            "messages": [
                {"role": "system", "content": system_instruction},
                # NOTE(review): this user turn is reconstructed from context
                # (the diff hides this line) — confirm role/content fields.
                {"role": "user", "content": formatted_question},
                {"role": "assistant", "content": assistant_content},
            ]
        })
    return samples
# 检查旧格式的数据结构 return samples
elif "system" in data and "conversation" in data and data["conversation"]:
system_instruction = data.get("system", "根据论文的标题、作者和摘要,确定该论文的科学类别。")
def process_new_format_data(data, num_templates):
    """Expand one new-format ("messages") record into template variants.

    Args:
        data: new-format record
        num_templates: number of template variants to generate

    Returns:
        list: generated records; empty when the record cannot be parsed.
    """
    system_instruction, human_content, assistant_content = parse_new_format_data(data)
    if not human_content:
        return []
    fields = extract_title_author_and_abstract(human_content)
    return generate_multi_type_samples(
        fields.get("title", ""),
        fields.get("authors", ""),
        fields.get("abstract", ""),
        system_instruction,
        assistant_content,
        num_templates,
    )
def process_old_format_data(data, num_templates):
    """Expand one old-format ("system"/"conversation") record into template variants.

    Args:
        data: old-format record
        num_templates: number of template variants per conversation turn

    Returns:
        list: generated records; empty when the record cannot be parsed.
    """
    system_instruction, conversation_data = parse_old_format_data(data)
    if not conversation_data:
        return []
    samples = []
    for turn in conversation_data:
        # Skip malformed turns that lack either side of the exchange.
        if "human" not in turn or "assistant" not in turn:
            continue
        extracted = extract_title_author_and_abstract(turn["human"])
        # Delegate to the shared generator instead of duplicating the
        # template-formatting loop inline (keeps both formats consistent).
        samples.extend(generate_multi_type_samples(
            extracted.get("title", ""),
            extracted.get("authors", ""),
            extracted.get("abstract", ""),
            system_instruction,
            turn["assistant"],
            num_templates,
        ))
    return samples
def convert_onedata2multi_type(input_file, output_file, num_templates):
    """Read JSONL records and write multiple template variants per record.

    Supports both the new "messages" chat format and the old
    "system"/"conversation" format; lines matching neither are reported
    with their line number and skipped.

    Args:
        input_file: path of the source JSONL file
        output_file: path of the converted JSONL file
        num_templates: number of variants to generate per record
    """
    print(f"开始转换数据...每条数据生成{num_templates}条变体")
    print(f"开始转换数据: {input_file} -> {output_file}")
    multi_type_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            try:
                data = json.loads(line.strip())
                if "messages" in data:
                    # New chat format.
                    multi_type_data.extend(process_new_format_data(data, num_templates))
                elif "system" in data and "conversation" in data:
                    # Old conversation format.
                    multi_type_data.extend(process_old_format_data(data, num_templates))
                else:
                    print(f"警告: {line_num}数据格式不识别: {data}")
                    continue
            except json.JSONDecodeError:
                print(f"警告: {line_num}无法解析JSON: {line}")
            except Exception as e:
                print(f"处理{line_num}行时发生错误: {str(e)}")
    # Write one JSON object per line, preserving non-ASCII characters.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in multi_type_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")
@@ -278,25 +358,17 @@ def convert_onedata2multi_type(input_file, output_file, num_templates):
if __name__ == "__main__":
    # Example usage: expand every record in the source JSONL into
    # num_templates question-template variants.
    input_file = "G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot--swift-26-500.jsonl"
    # Output file path.
    output_file = "G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot--swift-26-500-m+.jsonl"
    convert_onedata2multi_type(input_file, output_file, num_templates=1)