# data-prepare/05-data-swfit-sft2multi_type-crawl.py


import json
import os
import argparse
import random

# Lettered answer options (A-Z) that are spliced into every prompt template.
# The text starts with a single space before option A and ends with a newline;
# both are part of the emitted prompt text.
CATEGORY_TEXT = (
    " A. quant-ph\n"
    "B. physics.chem-ph\n"
    "C. physics.atom-ph\n"
    "D. cond-mat.soft\n"
    "E. cs.RO\n"
    "F. cs.CL\n"
    "G. cs.SE\n"
    "H. cs.IR\n"
    "I. hep-th\n"
    "J. hep-ph\n"
    "K. physics.optics\n"
    "L. cs.AI\n"
    "M. cs.CV\n"
    "N. nucl-th\n"
    "O. astro-ph\n"
    "P. math.PR\n"
    "Q. cs.OS\n"
    "R. eess.SP\n"
    "S. math.OC\n"
    "T. math.DS\n"
    "U. math.DG\n"
    "V. math.MP\n"
    "W. cs.MM\n"
    "X. stat.ME\n"
    "Y. math.CO\n"
    "Z. cs.NE\n"
)
# Lookup from arXiv category name to its option letter. Names are listed in
# option order, so the first name pairs with "A" and the last with "Z".
CATEGORY_DICT = dict(zip(
    (
        "quant-ph",
        "physics.chem-ph",
        "physics.atom-ph",
        "cond-mat.soft",
        "cs.RO",
        "cs.CL",
        "cs.SE",
        "cs.IR",
        "hep-th",
        "hep-ph",
        "physics.optics",
        "cs.AI",
        "cs.CV",
        "nucl-th",
        "astro-ph",
        "math.PR",
        "cs.OS",
        "eess.SP",
        "math.OC",
        "math.DS",
        "math.DG",
        "math.MP",
        "cs.MM",
        "stat.ME",
        "math.CO",
        "cs.NE",
    ),
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
))
# Prompt templates. Each one is a str.format pattern with the placeholders
# {title}, {authors}, {abstract} and {category_text}.
# NOTE: the name QUESTION_TEMPLATES is assigned again further down in this
# file, so this 20-entry list is replaced before any function reads it.
QUESTION_TEMPLATES = [
    # Direct question
    "{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",

    # Imperative
    "Classify this paper into its scientific category based on title '{title}', authors '{authors}', and abstract '{abstract}'.{category_text}",

    # Descriptive lead-in
    "{category_text}Given a research paper with title '{title}', authors {authors}, and abstract '{abstract}', identify the appropriate discipline.",

    # Formal request
    "Please assign the scientific category for the paper: title '{title}', authors '{authors}', abstract '{abstract}'.{category_text}",

    # Abstract first
    "Using the abstract '{abstract}', title '{title}', and authors '{authors}', determine the paper's category.{category_text}",

    # Author emphasis
    "{category_text}From authors '{authors}', title '{title}', and abstract '{abstract}', what category does this paper fall into?",

    # Statement followed by a question
    "Here's a paper: title '{title}', authors {authors}, abstract '{abstract}'. What is its scientific category?{category_text}",

    # Terse version
    "Category for: title '{title}', authors '{authors}', abstract '{abstract}'?{category_text}",

    # Fields embedded in context
    "Considering the title '{title}', the authors '{authors}', and the abstract content '{abstract}', please specify the paper's field.{category_text}",

    # Informal / colloquial
    "Hey, what category is this paper? Title '{title}', by {authors}, abstract '{abstract}'.{category_text}",

    # Field-by-field listing
    "{category_text}Title: '{title}'. Authors: '{authors}'. Abstract: '{abstract}'. Now, what's the scientific category?",

    # Hypothetical scenario
    "If a paper has title '{title}', authors '{authors}', and abstract '{abstract}', which scientific category best fits it?{category_text}",

    # Emphasis on the key information
    "Based solely on the title '{title}', authors list '{authors}', and abstract text '{abstract}', categorize this paper.{category_text}",

    # Indirect question
    "For the paper '{title}' by {authors}, with abstract '{abstract}', could you indicate its scientific discipline?{category_text}",

    # Single integrated sentence
    "Determine the category of the research paper entitled '{title}', written by {authors}, and summarized as '{abstract}'.{category_text}",

    # Question focused on the abstract
    "The abstract '{abstract}' describes a paper titled '{title}' by authors '{authors}'. What category is it?{category_text}",

    # Title driven
    "{category_text}Starting from the title '{title}', and considering authors '{authors}' and abstract '{abstract}', what is the paper's category?",

    # Multi-part query
    "Part 1: Title is '{title}'. Part 2: Authors are '{authors}'. Part 3: Abstract is '{abstract}'. Based on this, classify the paper.{category_text}",

    # "Given the details" phrasing
    "Given the details: title '{title}', authors '{authors}', abstract '{abstract}', how would you categorize this paper scientifically?{category_text}",

    # Action oriented
    "Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}"
]

# Effective template list: this assignment rebinds QUESTION_TEMPLATES, so the
# functions in this file only ever see this single template. Here the category
# options follow the question, separated from it by a blank line.
QUESTION_TEMPLATES = [
    "Based on the title '{title}', authors '{authors}', and abstract '{abstract}', please determine the scientific category of this paper.\n\n{category_text}"
]


def extract_title_author_and_abstract(content_text):
    """
    Recover the title, authors and abstract embedded in a prompt string.

    Two input shapes are recognised:

    1. A JSON object string that contains "title" plus "author_names" or
       "authors" (and optionally "summary" or "abstract").
    2. A templated prompt in which the three fields appear in the order
       title, authors, abstract, each wrapped in single quotes and followed
       by a comma, for example:
       "Based on the title 'T', authors 'A', and abstract 'S', please ..."

    Apostrophes inside a field (e.g. "Maxwell's demon", "O'Brien") are kept;
    only the quote-plus-comma sequence is treated as a field terminator.

    Args:
        content_text: Prompt text (or JSON string) to parse.

    Returns:
        dict: {"title": ..., "authors": ..., "abstract": ...}. All three
        values are empty strings when the text cannot be parsed.
    """
    try:
        # Fast path: the text looks like a serialized paper record.
        if content_text.strip().startswith('{') and '"title"' in content_text and ('"author_names"' in content_text or '"authors"' in content_text):
            try:
                paper_data = json.loads(content_text)
            except json.JSONDecodeError:
                # Not valid JSON after all; fall through to quote parsing.
                paper_data = None

            if isinstance(paper_data, dict):
                raw_authors = paper_data.get("author_names", paper_data.get("authors", []))
                if raw_authors is None:
                    authors = ""
                elif isinstance(raw_authors, (list, tuple)):
                    authors = ", ".join(str(name) for name in raw_authors)
                else:
                    # A plain string must not be joined character by character.
                    authors = str(raw_authors)
                return {
                    "title": paper_data.get("title", ""),
                    "authors": authors,
                    "abstract": paper_data.get("summary", paper_data.get("abstract", "")),
                }

        # Quote parsing: every "'," closes one quoted field.
        parts = content_text.split("',")
        if len(parts) < 3:
            # Fewer than three terminated fields: nothing reliable to extract.
            return {"title": "", "authors": "", "abstract": ""}

        # For the first two chunks the closing quote was consumed by the split,
        # so everything after the opening quote is the field value.
        title = parts[0].partition("'")[2].strip()
        authors = parts[1].partition("'")[2].strip()

        if len(parts) > 3:
            # The abstract was terminated by "'," as well.
            abstract = parts[2].partition("'")[2].strip()
        else:
            # The third chunk is the tail of the prompt and still contains the
            # closing quote, so take the text between the first two quotes.
            abstract_parts = parts[2].split("'")
            abstract = abstract_parts[1].strip() if len(abstract_parts) >= 2 else ""

        return {"title": title, "authors": authors, "abstract": abstract}
    except Exception as e:
        # Best-effort parser: report the problem and return empty fields.
        print(f"解析内容时出错: {e}")
        return {"title": "", "authors": "", "abstract": ""}

def parse_new_format_data(data):
    """
    Pull the system, user and assistant texts out of a "messages" record.

    Args:
        data: Record expected to look like
            {"messages": [{"role": ..., "content": ...}, ...]} with at least
            three messages.

    Returns:
        tuple: (system_instruction, human_content, assistant_content).
        Roles that do not occur yield "". When the record has no usable
        "messages" list of length >= 3 the result is (None, None, None).
    """
    if not isinstance(data, dict):
        return None, None, None

    messages = data.get("messages")
    if not isinstance(messages, list) or len(messages) < 3:
        return None, None, None

    system_instruction = ""
    human_content = ""
    assistant_content = ""

    for msg in messages:
        # Skip malformed entries instead of aborting the whole conversion.
        if not isinstance(msg, dict):
            continue
        role = msg.get("role")
        content = msg.get("content", "")
        # When a role occurs more than once, the last occurrence wins.
        if role == "system":
            system_instruction = content
        elif role == "user":
            human_content = content
        elif role == "assistant":
            assistant_content = content

    return system_instruction, human_content, assistant_content


def parse_old_format_data(data):
    """
    Split a "system"/"conversation" record into its two components.

    Args:
        data: Record expected to carry a "system" key and a non-empty
            "conversation" value.

    Returns:
        tuple: (system_instruction, conversation_data), or (None, None) when
        either key is missing or the conversation is empty.
    """
    has_required_keys = "system" in data and "conversation" in data
    if not has_required_keys or not data["conversation"]:
        return None, None

    # "system" is known to be present here, so the fallback text is only a
    # defensive default.
    fallback_instruction = "根据论文的标题、作者和摘要，确定该论文的科学类别。"
    conversation = data["conversation"]
    return data.get("system", fallback_instruction), conversation


def generate_multi_type_samples(title, authors, abstract, system_instruction, assistant_content, num_templates):
    """
    Build "messages"-style samples by filling randomly chosen templates.

    Args:
        title: Paper title.
        authors: Author string.
        abstract: Paper abstract.
        system_instruction: Text for the system message.
        assistant_content: Text for the assistant message.
        num_templates: Number of templates to use; capped at the number of
            entries in QUESTION_TEMPLATES.

    Returns:
        list: One {"messages": [system, user, assistant]} dict per chosen
        template.
    """
    template_count = min(num_templates, len(QUESTION_TEMPLATES))
    placeholders = {
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "category_text": CATEGORY_TEXT,
    }

    # Templates are drawn without replacement, so a sample never repeats one.
    return [
        {
            "messages": [
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": chosen.format(**placeholders)},
                {"role": "assistant", "content": assistant_content}
            ]
        }
        for chosen in random.sample(QUESTION_TEMPLATES, template_count)
    ]


def process_new_format_data(data, num_templates):
    """
    Turn one "messages"-style record into its templated variants.

    Args:
        data: Record in the "messages" format.
        num_templates: Number of template variants to generate.

    Returns:
        list: Generated samples; empty when the record has no user content.
    """
    parsed = parse_new_format_data(data)
    system_instruction, human_content, assistant_content = parsed

    if human_content:
        fields = extract_title_author_and_abstract(human_content)
        return generate_multi_type_samples(
            fields.get("title", ""),
            fields.get("authors", ""),
            fields.get("abstract", ""),
            system_instruction,
            assistant_content,
            num_templates,
        )

    return []


def process_old_format_data(data, num_templates):
    """
    Turn one "system"/"conversation" record into its templated variants.

    Args:
        data: Record in the "system"/"conversation" format.
        num_templates: Number of template variants to generate per turn.

    Returns:
        list: One single-turn {"system", "conversation"} dict per turn and
        chosen template; empty when the record has no conversation.
    """
    system_instruction, conversation_data = parse_old_format_data(data)
    if not conversation_data:
        return []

    results = []
    for turn in conversation_data:
        # Only turns that carry both sides of the exchange are usable.
        if not ("human" in turn and "assistant" in turn):
            continue

        parsed = extract_title_author_and_abstract(turn["human"])
        placeholders = {
            "title": parsed.get("title", ""),
            "authors": parsed.get("authors", ""),
            "abstract": parsed.get("abstract", ""),
            "category_text": CATEGORY_TEXT,
        }

        template_count = min(num_templates, len(QUESTION_TEMPLATES))
        results.extend(
            {
                "system": system_instruction,
                "conversation": [
                    {
                        "human": chosen.format(**placeholders),
                        "assistant": turn["assistant"]
                    }
                ]
            }
            for chosen in random.sample(QUESTION_TEMPLATES, template_count)
        )

    return results


def get_paper_data_from_crawl_jason(input_path):
    """
    Collect paper records from one JSONL file or from a directory of them.

    When `input_path` is a directory, only entries whose name ends with
    ".jsonl" are read. They are visited in sorted name order so that the
    resulting list (and any dataset written from it) is reproducible;
    os.listdir alone returns names in arbitrary order.

    Args:
        input_path: Path to a JSONL file or to a directory.

    Returns:
        list: Paper dicts as produced by _extract_paper_data_from_file; empty
        when the path is neither a file nor a directory.
    """
    paper_data_list = []

    if os.path.isfile(input_path):
        # Single file: errors propagate to the caller.
        paper_data_list.extend(_extract_paper_data_from_file(input_path))
        print(f"从文件 {input_path} 中提取了 {len(paper_data_list)} 条数据")
    elif os.path.isdir(input_path):
        files_found = 0
        for filename in sorted(os.listdir(input_path)):
            if not filename.endswith('.jsonl'):
                continue
            file_path = os.path.join(input_path, filename)
            try:
                file_data = _extract_paper_data_from_file(file_path)
                paper_data_list.extend(file_data)
                print(f"已从 {filename} 中提取 {len(file_data)} 条数据")
                files_found += 1
            except Exception as e:
                # One unreadable file must not stop the rest of the directory.
                print(f"处理文件 {filename} 时出错: {e}")
        print(f"在目录中找到 {files_found} 个JSON文件")
    else:
        print(f"路径 {input_path} 既不是文件也不是文件夹")

    print(f"总共提取了 {len(paper_data_list)} 条论文数据")
    return paper_data_list

def _extract_paper_data_from_file(file_path):
    """
    Read paper records from a JSONL file (one JSON object per line).

    Blank lines are skipped. Lines that are not valid JSON, or that decode to
    something other than an object, are reported and skipped so that one bad
    line does not discard the rest of the file.

    Args:
        file_path: Path of the JSONL file.

    Returns:
        list: Dicts with the keys "title", "authors", "abstract" and
        "category". "authors" is always a string; "category" is "Unknown"
        when the record provides neither "category" nor a non-empty
        "categories" list.
    """
    paper_data_list = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"解析文件 {file_path} 的第 {line_num} 行时出错: {e}")
                continue

            if not isinstance(item, dict):
                # e.g. a bare list or number: there are no fields to read.
                print(f"解析文件 {file_path} 的第 {line_num} 行时出错: 不是JSON对象")
                continue

            # Authors may be stored under either key, as a list or a string.
            authors_list = item.get("author_names", item.get("authors", []))
            if isinstance(authors_list, list):
                # str() keeps the join from failing on non-string entries.
                authors = ", ".join(str(name) for name in authors_list)
            else:
                authors = str(authors_list)

            # Prefer "category"; otherwise use the first "categories" entry.
            category = item.get("category", "Unknown")
            categories = item.get("categories")
            if category == "Unknown" and isinstance(categories, list) and categories:
                category = categories[0]

            paper_data_list.append({
                "title": item.get("title", ""),
                "authors": authors,
                "abstract": item.get("summary", item.get("abstract", "")),
                "category": category
            })

    return paper_data_list


def convert_onedata2multi_type_pre(paper_datas, output_file, num_templates):
    """
    Write templated prompts for each paper as single-message JSONL records.

    Every output line has the form
    {"messages": [{"role": "assistant", "content": <filled template>}]}.
    No answer label is included.

    Args:
        paper_datas: Iterable of paper dicts with "title", "authors" and
            "summary" or "abstract".
        output_file: Path of the JSONL file to write (overwritten).
        num_templates: Number of template variants per paper; capped at the
            number of entries in QUESTION_TEMPLATES.
    """
    print(f"开始转换数据...每条数据生成{num_templates}条变体")
    # Only names local to this function are logged; the input path is not a
    # parameter here, so referring to it would fail when imported as a module.
    print(f"开始转换数据: 输出到 {output_file}")

    multi_type_data = []

    for item in paper_datas:
        title = item.get("title", "")
        authors = item.get("authors", "")
        abstract = item.get("summary", item.get("abstract", ""))

        n = min(num_templates, len(QUESTION_TEMPLATES))
        selected_templates = random.sample(QUESTION_TEMPLATES, n)

        for template in selected_templates:
            formatted_question = template.format(
                title=title,
                authors=authors,
                abstract=abstract,
                category_text=CATEGORY_TEXT
            )

            multi_type_data.append({
                "messages": [
                    {
                        "role": "assistant",
                        "content": formatted_question
                    }
                ]
            })

    # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in multi_type_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")


def convert_onedata2multi_type_sft(paper_datas, output_file, num_templates):
    """
    Write templated question/answer pairs for each paper as JSONL records.

    Every output line has the form
    {"system": <instruction>, "conversation": [{"human": <filled template>,
    "assistant": <option letter>}]}.

    Args:
        paper_datas: Iterable of paper dicts with "title", "authors",
            "summary" or "abstract", and "category".
        output_file: Path of the JSONL file to write (overwritten).
        num_templates: Number of template variants per paper; capped at the
            number of entries in QUESTION_TEMPLATES.
    """
    print(f"开始转换数据...每条数据生成{num_templates}条变体")
    # Only names local to this function are logged; the input path is not a
    # parameter here, so referring to it would fail when imported as a module.
    print(f"开始转换数据: 输出到 {output_file}")

    # The same system instruction is used for every sample.
    system_instruction = "你是个优秀的论文分类师,根据论文的标题、作者和摘要，确定该论文的科学类别。"

    multi_type_data = []

    for item in paper_datas:
        title = item.get("title", "")
        authors = item.get("authors", "")
        abstract = item.get("summary", item.get("abstract", ""))
        category = item.get("category", "Unknown")
        # NOTE(review): categories missing from CATEGORY_DICT are labelled
        # "Unknown" rather than skipped - confirm that this is intended.
        answer = CATEGORY_DICT.get(category, "Unknown")

        n = min(num_templates, len(QUESTION_TEMPLATES))
        selected_templates = random.sample(QUESTION_TEMPLATES, n)

        for template in selected_templates:
            formatted_question = template.format(
                title=title,
                authors=authors,
                abstract=abstract,
                category_text=CATEGORY_TEXT
            )

            multi_type_data.append({
                "system": system_instruction,
                "conversation": [
                    {
                        "human": formatted_question,
                        "assistant": answer
                    }
                ]
            })

    # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in multi_type_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")


if __name__ == "__main__":
    # Command-line entry point. Every default reproduces the value that used
    # to be hard-coded, so running the script without arguments behaves as
    # before: read the crawl directory and write the SFT file with one
    # template per paper.
    parser = argparse.ArgumentParser(
        description="Convert crawled arXiv JSONL records into prompt-templated training data."
    )
    parser.add_argument(
        "--input",
        default=r"G:\\11\data-prepare\\arxiv_papers\\",
        help="JSONL file, or directory of .jsonl files, holding the crawled papers",
    )
    parser.add_argument(
        "--output-sft",
        default=r"G:\\11\data-prepare\\arxiv_papers-multi_type-sft.json",
        help="output path used in sft mode",
    )
    parser.add_argument(
        "--output-pre",
        default=r"G:\\11\data-prepare\\arxiv_papers-multi_type-pre.json",
        help="output path used in pre mode",
    )
    parser.add_argument(
        "--num-templates",
        type=int,
        default=1,
        help="number of template variants generated per paper",
    )
    parser.add_argument(
        "--mode",
        choices=("sft", "pre"),
        default="sft",
        help="sft writes question/answer pairs, pre writes prompt-only records",
    )
    args = parser.parse_args()

    # These names stay bound at module level, exactly as before.
    input_file = args.input
    output_file_sft = args.output_sft
    output_file_pre = args.output_pre

    paper_datas = get_paper_data_from_crawl_jason(input_file)
    if args.mode == "pre":
        convert_onedata2multi_type_pre(paper_datas, output_file_pre, num_templates=args.num_templates)
    else:
        convert_onedata2multi_type_sft(paper_datas, output_file_sft, num_templates=args.num_templates)