Files
data-prepare/05-data-swfit-sft2multi_type.py

423 lines
16 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import argparse
import random
# Scientific-category answer options.
# Lettered choices A-Z, one per line, each followed by an arXiv-style category
# identifier. The whole string is substituted for the {category_text}
# placeholder in every entry of QUESTION_TEMPLATES.
# Note: the literal starts with a single space and ends with a newline; both
# are part of the emitted prompt text.
CATEGORY_TEXT = """ A. quant-ph
B. physics.chem-ph
C. physics.atom-ph
D. cond-mat.soft
E. cs.RO
F. cs.CL
G. cs.SE
H. cs.IR
I. hep-th
J. hep-ph
K. physics.optics
L. cs.AI
M. cs.CV
N. nucl-th
O. astro-ph
P. math.PR
Q. cs.OS
R. eess.SP
S. math.OC
T. math.DS
U. math.DG
V. math.MP
W. cs.MM
X. stat.ME
Y. math.CO
Z. cs.NE
"""
# Question templates.
# Each entry is a str.format template with the placeholders {title},
# {authors}, {abstract} and {category_text}. Some templates put the category
# list before the question and some after it; some wrap {authors} in single
# quotes and some do not.
QUESTION_TEMPLATES = [
    # Direct question
    "{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",
    # Imperative
    "Classify this paper into its scientific category based on title '{title}', authors '{authors}', and abstract '{abstract}'.{category_text}",
    # Descriptive lead-in
    "{category_text}Given a research paper with title '{title}', authors {authors}, and abstract '{abstract}', identify the appropriate discipline.",
    # Formal request
    "Please assign the scientific category for the paper: title '{title}', authors '{authors}', abstract '{abstract}'.{category_text}",
    # Abstract first
    "Using the abstract '{abstract}', title '{title}', and authors '{authors}', determine the paper's category.{category_text}",
    # Author emphasis
    "{category_text}From authors '{authors}', title '{title}', and abstract '{abstract}', what category does this paper fall into?",
    # Chained question
    "Here's a paper: title '{title}', authors {authors}, abstract '{abstract}'. What is its scientific category?{category_text}",
    # Concise version
    "Category for: title '{title}', authors '{authors}', abstract '{abstract}'?{category_text}",
    # Context embedded
    "Considering the title '{title}', the authors '{authors}', and the abstract content '{abstract}', please specify the paper's field.{category_text}",
    # Informal, colloquial
    "Hey, what category is this paper? Title '{title}', by {authors}, abstract '{abstract}'.{category_text}",
    # Itemized elements
    "{category_text}Title: '{title}'. Authors: '{authors}'. Abstract: '{abstract}'. Now, what's the scientific category?",
    # Hypothetical scenario
    "If a paper has title '{title}', authors '{authors}', and abstract '{abstract}', which scientific category best fits it?{category_text}",
    # Emphasis on the key information
    "Based solely on the title '{title}', authors list '{authors}', and abstract text '{abstract}', categorize this paper.{category_text}",
    # Indirect inquiry
    "For the paper '{title}' by {authors}, with abstract '{abstract}', could you indicate its scientific discipline?{category_text}",
    # Single integrated sentence
    "Determine the category of the research paper entitled '{title}', written by {authors}, and summarized as '{abstract}'.{category_text}",
    # Question focused on the abstract
    "The abstract '{abstract}' describes a paper titled '{title}' by authors '{authors}'. What category is it?{category_text}",
    # Title driven
    "{category_text}Starting from the title '{title}', and considering authors '{authors}' and abstract '{abstract}', what is the paper's category?",
    # Multi-part query
    "Part 1: Title is '{title}'. Part 2: Authors are '{authors}'. Part 3: Abstract is '{abstract}'. Based on this, classify the paper.{category_text}",
    # Comparative
    "Given the details: title '{title}', authors '{authors}', abstract '{abstract}', how would you categorize this paper scientifically?{category_text}",
    # Action oriented
    "Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}"
]
def _quoted_field(segment, closed_by_split):
    """Return the single-quoted value that starts inside *segment*.

    Args:
        segment: One piece of the prompt obtained by splitting on "',".
        closed_by_split: True when the "'," delimiter that ended this piece was
            the field's own closing quote, so everything after the opening
            quote belongs to the value.

    Returns:
        The stripped field text, or "" when the segment has no opening quote.
    """
    _, opening, rest = segment.partition("'")
    if not opening:
        return ""
    if not closed_by_split:
        # The closing quote is still inside the segment: cut at the LAST quote
        # so apostrophes inside the value are preserved.
        body, closing, _ = rest.rpartition("'")
        if closing:
            rest = body
    return rest.strip()


def extract_title_author_and_abstract(content_text):
    """Extract the title, authors and abstract from a prompt string.

    Two input shapes are supported:

    * A JSON object string containing "title" and "author_names" (plus
      "summary" or "abstract").
    * A natural-language prompt in which the three fields appear in order,
      each wrapped in single quotes and followed by a comma, e.g.
      "Based on the title 'T', authors 'A', and abstract 'X', please
      determine the scientific category of this paper."

    Apostrophes inside a field (for example "Bell's") are kept. A field that
    itself contains the two-character sequence "'," cannot be told apart from
    a field boundary and will still be cut there.

    Args:
        content_text: The user prompt text to parse.

    Returns:
        dict: {"title": ..., "authors": ..., "abstract": ...}. All three values
        are empty strings when the text cannot be parsed.
    """
    try:
        # JSON-shaped input is parsed directly.
        if content_text.strip().startswith('{') and '"title"' in content_text and '"author_names"' in content_text:
            try:
                paper_data = json.loads(content_text)
                title = paper_data.get("title", "")
                authors = ", ".join(paper_data.get("author_names", []))
                abstract = paper_data.get("summary", paper_data.get("abstract", ""))
                return {"title": title, "authors": authors, "abstract": abstract}
            except (ValueError, TypeError, AttributeError):
                # Not usable as JSON after all; fall back to quote-based parsing.
                pass
        parts = content_text.split("',")
        if len(parts) < 3:
            # Fewer than three quoted fields: nothing reliable to extract.
            return {"title": "", "authors": "", "abstract": ""}
        # The first two segments are always terminated by the split delimiter.
        # The third one is only terminated when more text follows it.
        return {
            "title": _quoted_field(parts[0], True),
            "authors": _quoted_field(parts[1], True),
            "abstract": _quoted_field(parts[2], len(parts) > 3),
        }
    except Exception as e:
        # Best-effort parser: report the problem and return empty fields.
        print(f"解析内容时出错: {e}")
        return {"title": "", "authors": "", "abstract": ""}
def parse_new_format_data(data):
    """Pull the system, user and assistant texts out of a "messages" record.

    Args:
        data: A parsed record expected to hold a "messages" list with at least
            three entries, each providing "role" and "content".

    Returns:
        tuple: (system_instruction, human_content, assistant_content). A role
        that never appears yields ""; when a role appears several times the
        last occurrence wins. (None, None, None) is returned when "messages"
        is missing, is not a list, or has fewer than three entries.
    """
    messages = data["messages"] if "messages" in data else None
    if not isinstance(messages, list) or len(messages) < 3:
        return None, None, None

    # Slot order matches the order of the returned tuple.
    role_order = ("system", "user", "assistant")
    contents = ["", "", ""]
    for message in messages:
        role = message["role"]
        if role in role_order:
            contents[role_order.index(role)] = message["content"]
    return tuple(contents)
def parse_old_format_data(data):
    """Pull the system prompt and conversation turns out of an old-style record.

    Args:
        data: A parsed record expected to hold "system" and a non-empty
            "conversation".

    Returns:
        tuple: (system_instruction, conversation_data), or (None, None) when
        either key is missing or the conversation is empty.
    """
    has_required_keys = "system" in data and "conversation" in data
    if has_required_keys and data["conversation"]:
        system_instruction = data.get("system", "根据论文的标题、作者和摘要,确定该论文的科学类别。")
        return system_instruction, data["conversation"]
    return None, None
def generate_multi_type_samples(title, authors, abstract, system_instruction, assistant_content, num_templates):
    """Build "messages"-format samples by rephrasing one paper with random templates.

    Args:
        title: Paper title.
        authors: Author string.
        abstract: Paper abstract.
        system_instruction: Text for the system message.
        assistant_content: Text for the assistant message (the answer).
        num_templates: How many distinct templates to draw; capped at the
            number of entries in QUESTION_TEMPLATES.

    Returns:
        list: One {"messages": [...]} dict per drawn template.
    """
    template_count = min(num_templates, len(QUESTION_TEMPLATES))
    fields = {
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "category_text": CATEGORY_TEXT,
    }
    return [
        {
            "messages": [
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": chosen.format(**fields)},
                {"role": "assistant", "content": assistant_content},
            ]
        }
        for chosen in random.sample(QUESTION_TEMPLATES, template_count)
    ]
def process_new_format_data(data, num_templates):
    """Expand one "messages"-format record into template variants.

    Args:
        data: A parsed record in the "messages" format.
        num_templates: Number of variants to generate for the record.

    Returns:
        list: The generated samples, or [] when the record has no user text.
    """
    system_instruction, user_text, reply = parse_new_format_data(data)
    if not user_text:
        return []

    fields = extract_title_author_and_abstract(user_text)
    return generate_multi_type_samples(
        fields.get("title", ""),
        fields.get("authors", ""),
        fields.get("abstract", ""),
        system_instruction,
        reply,
        num_templates,
    )
def process_old_format_data(data, num_templates):
    """Expand one "system"/"conversation" record into template variants.

    Every turn that has both "human" and "assistant" is rephrased with a fresh
    random draw of templates; the output keeps the old record layout.

    Args:
        data: A parsed record in the old format.
        num_templates: Number of variants to generate per turn.

    Returns:
        list: The generated samples, or [] when the record is not usable.
    """
    system_instruction, turns = parse_old_format_data(data)
    if not turns:
        return []

    results = []
    for turn in turns:
        if not ("human" in turn and "assistant" in turn):
            continue

        info = extract_title_author_and_abstract(turn["human"])
        fields = {
            "title": info.get("title", ""),
            "authors": info.get("authors", ""),
            "abstract": info.get("abstract", ""),
            "category_text": CATEGORY_TEXT,
        }
        draw_size = min(num_templates, len(QUESTION_TEMPLATES))
        for chosen in random.sample(QUESTION_TEMPLATES, draw_size):
            results.append(
                {
                    "system": system_instruction,
                    "conversation": [
                        {
                            "human": chosen.format(**fields),
                            "assistant": turn["assistant"],
                        }
                    ],
                }
            )
    return results
def _authors_to_text(raw_authors):
    """Render an authors field (string, list of names, or None) as one string."""
    if raw_authors is None:
        return ""
    if isinstance(raw_authors, str):
        # Joining a plain string would insert ", " between every character.
        return raw_authors
    return ", ".join(str(name) for name in raw_authors)


def _samples_from_json_file(input_file, num_templates):
    """Read a .json file of paper records and return "messages"-format samples."""
    with open(input_file, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    if isinstance(json_data, dict):
        # A single top-level object is treated as one record.
        json_data = [json_data]

    system_instruction = "根据论文的标题、作者和摘要,确定该论文的科学类别。"
    samples = []
    for index, item in enumerate(json_data, 1):
        if not isinstance(item, dict):
            print(f"警告: 第{index}条记录不是JSON对象, 已跳过: {item}")
            continue
        title = item.get("title", "")
        authors = _authors_to_text(item.get("author_names", item.get("authors", [])))
        abstract = item.get("summary", item.get("abstract", ""))
        category = item.get("category", "Unknown")
        samples.extend(
            generate_multi_type_samples(title, authors, abstract, system_instruction, category, num_templates)
        )
    return samples


def _samples_from_jsonl_file(input_file, num_templates):
    """Read a JSON-lines file and return samples for every recognised record."""
    samples = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            stripped = line.strip()
            if not stripped:
                # Blank lines are not records and are not errors.
                continue
            try:
                data = json.loads(stripped)
                if "messages" in data:
                    samples.extend(process_new_format_data(data, num_templates))
                elif "system" in data and "conversation" in data:
                    samples.extend(process_old_format_data(data, num_templates))
                else:
                    print(f"警告: 第{line_num}行数据格式不识别: {data}")
            except json.JSONDecodeError:
                print(f"警告: 第{line_num}行无法解析JSON: {line}")
            except Exception as e:
                # Per-line best effort: one bad record must not abort the run.
                print(f"处理第{line_num}行时发生错误: {str(e)}")
    return samples


def convert_onedata2multi_type(input_file, output_file, num_templates):
    """Expand each input record into several differently worded samples.

    A path ending in ".json" is loaded as a whole JSON document holding paper
    records ("title", "author_names" or "authors", "summary" or "abstract",
    "category"); output is in the "messages" format. Any other path is read as
    JSON lines where each line is either a "messages" record or a
    "system"/"conversation" record; each output sample keeps the layout of its
    source record. Blank lines are skipped, and unparseable or unrecognised
    lines are reported and skipped.

    The result is written to output_file as JSON lines (UTF-8, non-ASCII text
    kept as is).

    Args:
        input_file: Path of the input file.
        output_file: Path of the output JSON-lines file (overwritten).
        num_templates: Number of variants to generate per record.
    """
    print(f"开始转换数据...每条数据生成{num_templates}条变体")
    print(f"开始转换数据: {input_file} -> {output_file}")
    if input_file.endswith('.json'):
        multi_type_data = _samples_from_json_file(input_file, num_templates)
    else:
        multi_type_data = _samples_from_jsonl_file(input_file, num_templates)

    with open(output_file, "w", encoding="utf-8") as f:
        for item in multi_type_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")
if __name__ == "__main__":
    # Command-line entry point. Every option defaults to the value that used
    # to be hard-coded here, so running the script without arguments behaves
    # exactly as before.
    parser = argparse.ArgumentParser(
        description="Expand each record of an SFT dataset into several differently worded samples."
    )
    parser.add_argument(
        "--input-file",
        default="G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot--swift-26-500.jsonl",
        help="input file (.json document or JSON-lines file)",
    )
    parser.add_argument(
        "--output-file",
        default="G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot--swift-26-500-m+.jsonl",
        help="output JSON-lines file (overwritten)",
    )
    parser.add_argument(
        "--num-templates",
        type=int,
        default=1,
        help="number of question variants generated per record",
    )
    args = parser.parse_args()
    convert_onedata2multi_type(args.input_file, args.output_file, num_templates=args.num_templates)