Files
data-prepare/05-data-swfit-sft2multi_type.py
glowz 87f2756fdf Add validation analysis script for classification results
- Implemented a new script `val_test.py` to analyze classification results from a JSONL file.
- Extracted true labels and predicted responses, handling invalid entries gracefully.
- Generated a classification report with accuracy metrics and detailed statistics for each category.
- Added functionality to export results to CSV and save analysis reports.
- Included visualization of confusion matrix and category accuracy distribution.
- Ensured dynamic handling of categories based on the input data.
2025-07-20 21:04:08 +08:00

303 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import argparse
import json
import os
import random
import re
# Matches the prompt layout
#   "... title '<title>', authors '<authors>', and abstract '<abstract>', please determine ..."
# Each field is delimited by the literal text that follows it rather than by
# the next quote character, so apostrophes and "'," sequences inside a field
# do not cut the field short.
_PROMPT_FIELDS_RE = re.compile(
    r"title '(?P<title>.*?)', authors '(?P<authors>.*?)', "
    r"and abstract '(?P<abstract>.*?)'(?:, please determine|\s*$)",
    re.DOTALL,
)


def extract_title_author_and_abstract(content_text):
    """Extract the title, authors and abstract from a classification prompt.

    The expected prompt layout is::

        Based on the title '<title>', authors '<authors>', and abstract
        '<abstract>', please determine the scientific category of this paper.
        <optional extra info and the lettered category list>

    Args:
        content_text: The full prompt string.

    Returns:
        A dict with the keys ``"title"``, ``"authors"`` and ``"abstract"``;
        every value has surrounding whitespace stripped.

    Raises:
        IndexError: If the text matches neither the layout above nor the
            looser quote-splitting fallback.
    """
    match = _PROMPT_FIELDS_RE.search(content_text)
    if match is not None:
        return {
            "title": match.group("title").strip(),
            "authors": match.group("authors").strip(),
            "abstract": match.group("abstract").strip(),
        }

    # Fallback for prompts that do not use the exact marker wording: split on
    # "'," and take the first quoted run of each piece. This is the historical
    # behaviour; it truncates fields that themselves contain an apostrophe.
    parts = content_text.split("',")
    title = parts[0].split("'")[1].strip()
    authors = parts[1].split("'")[1].strip()
    abstract = parts[2].split("'")[1].strip()
    return {"title": title, "authors": authors, "abstract": abstract}
def convert_to_alpaca_format(input_file, output_file):
    """Rewrite a Swift-style JSONL file as single-message ``messages`` records.

    Each non-blank input line is expected to be a JSON object shaped like::

        {
            "system": "...",
            "conversation": [{"human": "Based on the title...", "assistant": "D"}]
        }

    For every conversation turn that carries both a ``human`` and an
    ``assistant`` key, one line is written to ``output_file``::

        {"messages": [{"role": "assistant",
                       "content": "This is a paper titled <human text>"}]}

    NOTE(review): despite the function name, the emitted records are not
    instruction/input/output triples, and the ``system`` text and the
    ``assistant`` answer are not copied to the output. This looks like a
    pretraining-style layout -- confirm against the consumer of the file.

    Records without ``system`` or ``conversation`` are reported and skipped.
    Lines that fail to parse or process are reported and skipped. Blank lines
    are skipped silently.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8). It is opened
            only after the input has been read completely.
    """
    print(f"转换数据: {input_file} -> {output_file}")
    converted_data = []

    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank (often trailing) line is not a malformed record.
                continue
            try:
                data = json.loads(stripped)
                if "system" not in data or "conversation" not in data:
                    print(f"警告: 数据缺少必要字段: {data}")
                    continue
                for turn in data["conversation"]:
                    if "human" in turn and "assistant" in turn:
                        converted_data.append(
                            {
                                "messages": [
                                    {
                                        "role": "assistant",
                                        "content": "This is a paper titled " + turn["human"],
                                    }
                                ]
                            }
                        )
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                # Best effort: one bad record must not abort the whole file.
                print(f"处理行时发生错误: {str(e)}")

    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
# Lettered category list spliced into every question template.
_CATEGORY_TEXT = " A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE\n"

# The 20 phrasings a prompt can be rewritten into. Every template consumes the
# named fields title / authors / abstract / category_text.
_QUESTION_TEMPLATES = [
    # Direct question
    "{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",
    # Imperative
    "Classify this paper into its scientific category based on title '{title}', authors '{authors}', and abstract '{abstract}'.{category_text}",
    # Descriptive lead-in
    "{category_text}Given a research paper with title '{title}', authors {authors}, and abstract '{abstract}', identify the appropriate discipline.",
    # Formal request
    "Please assign the scientific category for the paper: title '{title}', authors '{authors}', abstract '{abstract}'.{category_text}",
    # Abstract first
    "Using the abstract '{abstract}', title '{title}', and authors '{authors}', determine the paper's category.{category_text}",
    # Author emphasis
    "{category_text}From authors '{authors}', title '{title}', and abstract '{abstract}', what category does this paper fall into?",
    # Statement followed by a question
    "Here's a paper: title '{title}', authors {authors}, abstract '{abstract}'. What is its scientific category?{category_text}",
    # Terse
    "Category for: title '{title}', authors '{authors}', abstract '{abstract}'?{category_text}",
    # Context embedded
    "Considering the title '{title}', the authors '{authors}', and the abstract content '{abstract}', please specify the paper's field.{category_text}",
    # Informal, colloquial
    "Hey, what category is this paper? Title '{title}', by {authors}, abstract '{abstract}'.{category_text}",
    # Itemized fields
    "{category_text}Title: '{title}'. Authors: '{authors}'. Abstract: '{abstract}'. Now, what's the scientific category?",
    # Hypothetical scenario
    "If a paper has title '{title}', authors '{authors}', and abstract '{abstract}', which scientific category best fits it?{category_text}",
    # Emphasis on the key information
    "Based solely on the title '{title}', authors list '{authors}', and abstract text '{abstract}', categorize this paper.{category_text}",
    # Indirect question
    "For the paper '{title}' by {authors}, with abstract '{abstract}', could you indicate its scientific discipline?{category_text}",
    # Single integrated sentence
    "Determine the category of the research paper entitled '{title}', written by {authors}, and summarized as '{abstract}'.{category_text}",
    # Abstract-focused question
    "The abstract '{abstract}' describes a paper titled '{title}' by authors '{authors}'. What category is it?{category_text}",
    # Title driven
    "{category_text}Starting from the title '{title}', and considering authors '{authors}' and abstract '{abstract}', what is the paper's category?",
    # Multi-part query
    "Part 1: Title is '{title}'. Part 2: Authors are '{authors}'. Part 3: Abstract is '{abstract}'. Based on this, classify the paper.{category_text}",
    # Comparative
    "Given the details: title '{title}', authors '{authors}', abstract '{abstract}', how would you categorize this paper scientifically?{category_text}",
    # Action oriented
    "Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}",
]


def _render_question_variants(prompt_text, num_templates):
    """Return the prompt rewritten with ``num_templates`` randomly chosen templates."""
    fields = extract_title_author_and_abstract(prompt_text)
    title = fields.get("title", "")
    authors = fields.get("authors", "")
    abstract = fields.get("abstract", "")
    # Never ask random.sample for more templates than exist.
    count = min(num_templates, len(_QUESTION_TEMPLATES))
    return [
        template.format(
            title=title,
            authors=authors,
            abstract=abstract,
            category_text=_CATEGORY_TEXT,
        )
        for template in random.sample(_QUESTION_TEMPLATES, count)
    ]


def convert_onedata2multi_type(input_file, output_file, num_templates):
    """Expand every record of a JSONL file into several rephrased variants.

    Two record layouts are recognised:

    * ``{"messages": [...]}`` with at least three messages. The content of
      the ``system``, ``user`` and ``assistant`` roles is read (the last
      message of each role wins) and each variant is written in the same
      ``messages`` layout.
    * ``{"system": ..., "conversation": [{"human": ..., "assistant": ...}]}``.
      Every turn holding both keys is expanded and each variant is written
      in the same ``system`` / ``conversation`` layout.

    For each prompt the title, authors and abstract are extracted with
    ``extract_title_author_and_abstract`` and substituted into
    ``num_templates`` templates drawn without replacement from the 20
    built-in ones; the assistant answer is carried over unchanged.

    Unrecognised records, unparsable lines and records whose processing
    raises are reported on stdout and skipped. Blank lines are skipped
    silently.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8). It is opened
            only after the input has been read completely.
        num_templates: Number of variants to generate per prompt; values
            above the number of available templates are capped.
    """
    print(f"开始转换数据...每条数据生成{num_templates}条变体")
    print(f"开始转换数据: {input_file} -> {output_file}")
    multi_type_data = []

    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank (often trailing) line is not a malformed record.
                continue
            try:
                data = json.loads(stripped)

                # Newer "messages" layout.
                if "messages" in data and isinstance(data["messages"], list) and len(data["messages"]) >= 3:
                    system_instruction = ""
                    human_content = ""
                    assistant_content = ""
                    for msg in data["messages"]:
                        if msg["role"] == "system":
                            system_instruction = msg["content"]
                        elif msg["role"] == "user":
                            human_content = msg["content"]
                        elif msg["role"] == "assistant":
                            assistant_content = msg["content"]

                    for question in _render_question_variants(human_content, num_templates):
                        multi_type_data.append(
                            {
                                "messages": [
                                    {"role": "system", "content": system_instruction},
                                    {"role": "user", "content": question},
                                    {"role": "assistant", "content": assistant_content},
                                ]
                            }
                        )

                # Older "system" + "conversation" layout.
                elif "system" in data and "conversation" in data and data["conversation"]:
                    system_instruction = data.get("system", "根据论文的标题、作者和摘要,确定该论文的科学类别。")
                    for turn in data["conversation"]:
                        if "human" not in turn or "assistant" not in turn:
                            continue
                        for question in _render_question_variants(turn["human"], num_templates):
                            multi_type_data.append(
                                {
                                    "system": system_instruction,
                                    "conversation": [
                                        {
                                            "human": question,
                                            "assistant": turn["assistant"],
                                        }
                                    ],
                                }
                            )
                else:
                    print(f"警告: 数据格式不识别: {data}")
                    continue
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                # Best effort: one bad record must not abort the whole file.
                print(f"处理行时发生错误: {str(e)}")

    with open(output_file, "w", encoding="utf-8") as f:
        for item in multi_type_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")
def main(argv=None):
    """Command-line entry point: expand a JSONL dataset into template variants.

    Run without arguments, this processes the same input file, output file
    and template count that were previously hard-coded. Each of them can be
    overridden on the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        description="Rewrite each record of a JSONL dataset with randomly chosen question templates."
    )
    parser.add_argument(
        "--input-file",
        default="G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot--swift-26-500.jsonl",
        help="JSONL file to read",
    )
    parser.add_argument(
        "--output-file",
        default="G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot--swift-26-500-m.jsonl",
        help="JSONL file to write",
    )
    parser.add_argument(
        "--num-templates",
        type=int,
        default=1,
        help="number of template variants generated per prompt",
    )
    args = parser.parse_args(argv)

    # Alternative dataset used previously:
    #   --input-file  "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26.jsonl"
    #   --output-file "G:\\11\\data-prepare\\newformat_sft_test_data--swift-sft-26-m4.jsonl"
    convert_onedata2multi_type(args.input_file, args.output_file, num_templates=args.num_templates)


if __name__ == "__main__":
    main()