添加多种问题模板生成和数据解析功能,优化数据转换流程
This commit is contained in:
@@ -4,269 +4,349 @@ import os
|
|||||||
import argparse
|
import argparse
|
||||||
import random
|
import random
|
||||||
|
|
||||||
|
# arXiv category codes offered as answer options, in label order (A-Z).
_CATEGORY_CODES = (
    "quant-ph",
    "physics.chem-ph",
    "physics.atom-ph",
    "cond-mat.soft",
    "cs.RO",
    "cs.CL",
    "cs.SE",
    "cs.IR",
    "hep-th",
    "hep-ph",
    "physics.optics",
    "cs.AI",
    "cs.CV",
    "nucl-th",
    "astro-ph",
    "math.PR",
    "cs.OS",
    "eess.SP",
    "math.OC",
    "math.DS",
    "math.DG",
    "math.MP",
    "cs.MM",
    "stat.ME",
    "math.CO",
    "cs.NE",
)

# Option list inserted into every prompt: a single leading space, then one
# "<letter>. <code>" entry per line, each terminated by a newline.
CATEGORY_TEXT = " " + "".join(
    f"{chr(ord('A') + index)}. {code}\n"
    for index, code in enumerate(_CATEGORY_CODES)
)
|
||||||
|
|
||||||
|
# Question templates. Each entry is a str.format() pattern with the named
# fields {title}, {authors}, {abstract} and {category_text}; the option list
# ({category_text}) is placed either before or after the question.
QUESTION_TEMPLATES = [
    # Direct question
    "{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",

    # Imperative
    "Classify this paper into its scientific category based on title '{title}', authors '{authors}', and abstract '{abstract}'.{category_text}",

    # Descriptive lead-in
    "{category_text}Given a research paper with title '{title}', authors {authors}, and abstract '{abstract}', identify the appropriate discipline.",

    # Formal request
    "Please assign the scientific category for the paper: title '{title}', authors '{authors}', abstract '{abstract}'.{category_text}",

    # Abstract first
    "Using the abstract '{abstract}', title '{title}', and authors '{authors}', determine the paper's category.{category_text}",

    # Authors emphasized
    "{category_text}From authors '{authors}', title '{title}', and abstract '{abstract}', what category does this paper fall into?",

    # Statement followed by a question
    "Here's a paper: title '{title}', authors {authors}, abstract '{abstract}'. What is its scientific category?{category_text}",

    # Concise
    "Category for: title '{title}', authors '{authors}', abstract '{abstract}'?{category_text}",

    # Context embedded in the sentence
    "Considering the title '{title}', the authors '{authors}', and the abstract content '{abstract}', please specify the paper's field.{category_text}",

    # Informal / colloquial
    "Hey, what category is this paper? Title '{title}', by {authors}, abstract '{abstract}'.{category_text}",

    # Itemized elements
    "{category_text}Title: '{title}'. Authors: '{authors}'. Abstract: '{abstract}'. Now, what's the scientific category?",

    # Hypothetical scenario
    "If a paper has title '{title}', authors '{authors}', and abstract '{abstract}', which scientific category best fits it?{category_text}",

    # Emphasis on the key information
    "Based solely on the title '{title}', authors list '{authors}', and abstract text '{abstract}', categorize this paper.{category_text}",

    # Indirect question
    "For the paper '{title}' by {authors}, with abstract '{abstract}', could you indicate its scientific discipline?{category_text}",

    # Single integrated sentence
    "Determine the category of the research paper entitled '{title}', written by {authors}, and summarized as '{abstract}'.{category_text}",

    # Question focused on the abstract
    "The abstract '{abstract}' describes a paper titled '{title}' by authors '{authors}'. What category is it?{category_text}",

    # Title-driven
    "{category_text}Starting from the title '{title}', and considering authors '{authors}' and abstract '{abstract}', what is the paper's category?",

    # Multi-part query
    "Part 1: Title is '{title}'. Part 2: Authors are '{authors}'. Part 3: Abstract is '{abstract}'. Based on this, classify the paper.{category_text}",

    # Comparative phrasing
    "Given the details: title '{title}', authors '{authors}', abstract '{abstract}', how would you categorize this paper scientifically?{category_text}",

    # Action-oriented
    "Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}"
]
|
||||||
|
|
||||||
def extract_title_author_and_abstract(content_text):
    """Extract the title, authors and abstract from a classification prompt.

    The prompt is expected to look like::

        Based on the title '<title>', authors '<authors>', and abstract
        '<abstract>', please determine the scientific category of this paper.
        Additional info: ... A. quant-ph\\nB. physics.chem-ph\\n...

    Args:
        content_text: The user prompt text.

    Returns:
        dict: ``{"title": ..., "authors": ..., "abstract": ...}`` with
        surrounding whitespace stripped from each value. All three values are
        empty strings when the text cannot be parsed.
    """
    try:
        # Preferred path: cut on the complete field markers. Unlike splitting
        # on a bare quote, this keeps apostrophes (and "'," sequences) that
        # occur inside a title, an author name or the abstract.
        markers = ("title '", "', authors '", "', and abstract '", "', please determine")
        remainder = content_text
        pieces = []
        for marker in markers:
            before, found, remainder = remainder.partition(marker)
            if not found:
                break
            pieces.append(before)
        else:
            # pieces[0] is the text preceding the title marker; discard it.
            _, title, authors, abstract = pieces
            return {
                "title": title.strip(),
                "authors": authors.strip(),
                "abstract": abstract.strip(),
            }

        # Fallback for prompts that do not use the exact markers: split on
        # "'," and take the text after the first quote of each of the first
        # three segments.
        parts = content_text.split("',")
        if len(parts) < 3:
            return {"title": "", "authors": "", "abstract": ""}

        fields = []
        for part in parts[:3]:
            quoted = part.split("'")
            fields.append(quoted[1].strip() if len(quoted) >= 2 else "")
        title, authors, abstract = fields
        return {"title": title, "authors": authors, "abstract": abstract}
    except (AttributeError, TypeError) as e:
        # Raised when content_text is not a string (e.g. None); report it and
        # return the empty result so the caller can carry on.
        print(f"解析内容时出错: {e}")
        return {"title": "", "authors": "", "abstract": ""}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def convert_to_alpaca_format(input_file, output_file):
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def parse_new_format_data(data):
    """Pull the system, user and assistant texts out of a "messages" record.

    Args:
        data: Parsed JSON record. A valid record is a dict whose "messages"
            entry is a list of at least three dicts, each with a "role" key
            and, for the roles "system", "user" and "assistant", a "content"
            key.

    Returns:
        tuple: ``(system_instruction, human_content, assistant_content)``.
        A role that never appears yields an empty string; when a role appears
        more than once the last occurrence wins. ``(None, None, None)`` is
        returned when the record does not have the shape described above.
    """
    invalid = (None, None, None)

    messages = data.get("messages") if isinstance(data, dict) else None
    if not isinstance(messages, list) or len(messages) < 3:
        return invalid

    contents = {"system": "", "user": "", "assistant": ""}
    for msg in messages:
        # A malformed entry makes the whole record invalid; report that
        # through the return value rather than a KeyError/TypeError.
        if not isinstance(msg, dict) or "role" not in msg:
            return invalid
        role = msg["role"]
        if isinstance(role, str) and role in contents:
            if "content" not in msg:
                return invalid
            contents[role] = msg["content"]
        # Messages with any other role are ignored.

    return contents["system"], contents["user"], contents["assistant"]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_old_format_data(data):
    """Pull the system instruction and conversation turns out of an old-format record.

    Args:
        data: Parsed JSON record. A valid record is a dict with a "system"
            key and a non-empty "conversation" list.

    Returns:
        tuple: ``(system_instruction, conversation_data)``. An empty system
        value is replaced by a default instruction. ``(None, None)`` is
        returned when the record does not have the shape described above.
    """
    default_instruction = "根据论文的标题、作者和摘要,确定该论文的科学类别。"

    if not isinstance(data, dict) or "system" not in data:
        return None, None

    conversation = data.get("conversation")
    if not isinstance(conversation, (list, tuple)) or not conversation:
        return None, None

    # The key is guaranteed to exist here, so a .get() default would never be
    # used; fall back on an empty value instead.
    system_instruction = data["system"] or default_instruction
    return system_instruction, conversation
|
||||||
|
|
||||||
|
|
||||||
|
def generate_multi_type_samples(title, authors, abstract, system_instruction, assistant_content, num_templates):
    """Render one paper into several differently worded "messages" samples.

    Args:
        title: Paper title.
        authors: Paper authors.
        abstract: Paper abstract.
        system_instruction: Text of the system message.
        assistant_content: Text of the assistant message.
        num_templates: Number of templates to use; capped at the number of
            entries in QUESTION_TEMPLATES.

    Returns:
        list: One ``{"messages": [...]}`` dict per randomly chosen template,
        each holding a system, a user and an assistant message.
    """
    sample_size = min(num_templates, len(QUESTION_TEMPLATES))
    chosen_templates = random.sample(QUESTION_TEMPLATES, sample_size)

    return [
        {
            "messages": [
                {"role": "system", "content": system_instruction},
                {
                    "role": "user",
                    "content": question_template.format(
                        title=title,
                        authors=authors,
                        abstract=abstract,
                        category_text=CATEGORY_TEXT,
                    ),
                },
                {"role": "assistant", "content": assistant_content},
            ]
        }
        for question_template in chosen_templates
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def process_new_format_data(data, num_templates):
    """Expand one "messages" record into template-based variants.

    Args:
        data: Record in the "messages" format.
        num_templates: Number of templates to apply.

    Returns:
        list: The generated samples, or an empty list when the record yields
        no user message text.
    """
    system_text, user_text, answer_text = parse_new_format_data(data)
    if not user_text:
        return []

    fields = extract_title_author_and_abstract(user_text)
    return generate_multi_type_samples(
        fields.get("title", ""),
        fields.get("authors", ""),
        fields.get("abstract", ""),
        system_text,
        answer_text,
        num_templates,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def process_old_format_data(data, num_templates):
    """Expand one "system"/"conversation" record into template-based variants.

    Args:
        data: Record in the old format.
        num_templates: Number of templates to apply to each conversation turn.

    Returns:
        list: Old-format records, each with the system instruction and a
        single-turn conversation; empty when the record has no usable
        conversation.
    """
    system_text, turns = parse_old_format_data(data)
    if not turns:
        return []

    results = []
    for turn in turns:
        # Only turns that carry both sides of the exchange are usable.
        if not ("human" in turn and "assistant" in turn):
            continue

        fields = extract_title_author_and_abstract(turn["human"])
        fill_values = {
            "title": fields.get("title", ""),
            "authors": fields.get("authors", ""),
            "abstract": fields.get("abstract", ""),
            "category_text": CATEGORY_TEXT,
        }

        sample_size = min(num_templates, len(QUESTION_TEMPLATES))
        results.extend(
            {
                "system": system_text,
                "conversation": [
                    {
                        "human": question_template.format(**fill_values),
                        "assistant": turn["assistant"],
                    }
                ],
            }
            for question_template in random.sample(QUESTION_TEMPLATES, sample_size)
        )

    return results
|
||||||
|
|
||||||
|
|
||||||
def convert_onedata2multi_type(input_file, output_file, num_templates):
|
def convert_onedata2multi_type(input_file, output_file, num_templates):
|
||||||
"""
|
"""
|
||||||
读取input_file,将Swift格式的1条数据按20种问题模板格式转换为20条数据,
|
读取input_file,将Swift格式的1条数据按多种问题模板格式转换为多条数据,
|
||||||
并保存为output_file
|
并保存为output_file
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
input_file: 输入文件路径
|
input_file: 输入文件路径
|
||||||
output_file: 输出文件路径
|
output_file: 输出文件路径
|
||||||
|
num_templates: 每条数据生成的模板数量
|
||||||
"""
|
"""
|
||||||
print(f"开始转换数据...每条数据生成{num_templates}条变体")
|
print(f"开始转换数据...每条数据生成{num_templates}条变体")
|
||||||
print(f"开始转换数据: {input_file} -> {output_file}")
|
print(f"开始转换数据: {input_file} -> {output_file}")
|
||||||
|
|
||||||
category_text=" A. quant-ph\nB. physics.chem-ph\nC. physics.atom-ph\nD. cond-mat.soft\nE. cs.RO\nF. cs.CL\nG. cs.SE\nH. cs.IR\nI. hep-th\nJ. hep-ph\nK. physics.optics\nL. cs.AI\nM. cs.CV\nN. nucl-th\nO. astro-ph\nP. math.PR\nQ. cs.OS\nR. eess.SP\nS. math.OC\nT. math.DS\nU. math.DG\nV. math.MP\nW. cs.MM\nX. stat.ME\nY. math.CO\nZ. cs.NE\n"
|
|
||||||
|
|
||||||
|
|
||||||
# 定义20种问题模板
|
|
||||||
question_templates = [
|
|
||||||
# 直接提问式
|
|
||||||
"{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",
|
|
||||||
|
|
||||||
# 命令式
|
|
||||||
"Classify this paper into its scientific category based on title '{title}', authors '{authors}', and abstract '{abstract}'.{category_text}",
|
|
||||||
|
|
||||||
# 描述性引导
|
|
||||||
"{category_text}Given a research paper with title '{title}', authors {authors}, and abstract '{abstract}', identify the appropriate discipline.",
|
|
||||||
|
|
||||||
# 正式请求
|
|
||||||
"Please assign the scientific category for the paper: title '{title}', authors '{authors}', abstract '{abstract}'.{category_text}",
|
|
||||||
|
|
||||||
# 摘要优先
|
|
||||||
"Using the abstract '{abstract}', title '{title}', and authors '{authors}', determine the paper's category.{category_text}",
|
|
||||||
|
|
||||||
# 作者强调
|
|
||||||
"{category_text}From authors '{authors}', title '{title}', and abstract '{abstract}', what category does this paper fall into?",
|
|
||||||
|
|
||||||
# 问题链式
|
|
||||||
"Here's a paper: title '{title}', authors {authors}, abstract '{abstract}'. What is its scientific category?{category_text}",
|
|
||||||
|
|
||||||
# 简洁版
|
|
||||||
"Category for: title '{title}', authors '{authors}', abstract '{abstract}'?{category_text}",
|
|
||||||
|
|
||||||
# 上下文嵌入
|
|
||||||
"Considering the title '{title}', the authors '{authors}', and the abstract content '{abstract}', please specify the paper's field.{category_text}",
|
|
||||||
|
|
||||||
# 非正式口语
|
|
||||||
"Hey, what category is this paper? Title '{title}', by {authors}, abstract '{abstract}'.{category_text}",
|
|
||||||
|
|
||||||
# 元素罗列
|
|
||||||
"{category_text}Title: '{title}'. Authors: '{authors}'. Abstract: '{abstract}'. Now, what's the scientific category?",
|
|
||||||
|
|
||||||
# 假设场景
|
|
||||||
"If a paper has title '{title}', authors '{authors}', and abstract '{abstract}', which scientific category best fits it?{category_text}",
|
|
||||||
|
|
||||||
# 强调关键信息
|
|
||||||
"Based solely on the title '{title}', authors list '{authors}', and abstract text '{abstract}', categorize this paper.{category_text}",
|
|
||||||
|
|
||||||
# 间接询问
|
|
||||||
"For the paper '{title}' by {authors}, with abstract '{abstract}', could you indicate its scientific discipline?{category_text}",
|
|
||||||
|
|
||||||
# 完整句子整合
|
|
||||||
"Determine the category of the research paper entitled '{title}', written by {authors}, and summarized as '{abstract}'.{category_text}",
|
|
||||||
|
|
||||||
# 问题聚焦摘要
|
|
||||||
"The abstract '{abstract}' describes a paper titled '{title}' by authors '{authors}'. What category is it?{category_text}",
|
|
||||||
|
|
||||||
# 标题驱动
|
|
||||||
"{category_text}Starting from the title '{title}', and considering authors '{authors}' and abstract '{abstract}', what is the paper's category?",
|
|
||||||
|
|
||||||
# 多部分查询
|
|
||||||
"Part 1: Title is '{title}'. Part 2: Authors are '{authors}'. Part 3: Abstract is '{abstract}'. Based on this, classify the paper.{category_text}",
|
|
||||||
|
|
||||||
# 比较式
|
|
||||||
"Given the details: title '{title}', authors '{authors}', abstract '{abstract}', how would you categorize this paper scientifically?{category_text}",
|
|
||||||
|
|
||||||
# 行动导向
|
|
||||||
"Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}"
|
|
||||||
]
|
|
||||||
|
|
||||||
multi_type_data = []
|
multi_type_data = []
|
||||||
|
|
||||||
with open(input_file, "r", encoding="utf-8") as f:
|
with open(input_file, "r", encoding="utf-8") as f:
|
||||||
for line in f:
|
for line_num, line in enumerate(f, 1):
|
||||||
try:
|
try:
|
||||||
data = json.loads(line.strip())
|
data = json.loads(line.strip())
|
||||||
|
|
||||||
# 检查新格式的数据结构
|
# 处理新格式数据
|
||||||
if "messages" in data and isinstance(data["messages"], list) and len(data["messages"]) >= 3:
|
if "messages" in data:
|
||||||
# 提取系统指令
|
samples = process_new_format_data(data, num_templates)
|
||||||
system_instruction = ""
|
multi_type_data.extend(samples)
|
||||||
human_content = ""
|
|
||||||
assistant_content = ""
|
|
||||||
|
|
||||||
for msg in data["messages"]:
|
# 处理旧格式数据
|
||||||
if msg["role"] == "system":
|
elif "system" in data and "conversation" in data:
|
||||||
system_instruction = msg["content"]
|
samples = process_old_format_data(data, num_templates)
|
||||||
elif msg["role"] == "user":
|
multi_type_data.extend(samples)
|
||||||
human_content = msg["content"]
|
|
||||||
elif msg["role"] == "assistant":
|
|
||||||
assistant_content = msg["content"]
|
|
||||||
|
|
||||||
# 提取标题、作者和摘要
|
|
||||||
extracted = extract_title_author_and_abstract(human_content)
|
|
||||||
title = extracted.get("title", "")
|
|
||||||
authors = extracted.get("authors", "")
|
|
||||||
abstract = extracted.get("abstract", "")
|
|
||||||
|
|
||||||
|
|
||||||
n = min(num_templates, len(question_templates))
|
|
||||||
selected_templates = random.sample(question_templates, n)
|
|
||||||
# 为每个问题模板创建新数据
|
|
||||||
for template in selected_templates:
|
|
||||||
formatted_question = template.format(
|
|
||||||
title=title,
|
|
||||||
authors=authors,
|
|
||||||
abstract=abstract,
|
|
||||||
category_text=category_text
|
|
||||||
)
|
|
||||||
|
|
||||||
# 创建新的数据条目(保持新格式)
|
|
||||||
new_data = {
|
|
||||||
"messages": [
|
|
||||||
{"role": "system", "content": system_instruction},
|
|
||||||
{"role": "user", "content": formatted_question},
|
|
||||||
{"role": "assistant", "content": assistant_content}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
multi_type_data.append(new_data)
|
|
||||||
|
|
||||||
# 检查旧格式的数据结构
|
|
||||||
elif "system" in data and "conversation" in data and data["conversation"]:
|
|
||||||
system_instruction = data.get("system", "根据论文的标题、作者和摘要,确定该论文的科学类别。")
|
|
||||||
|
|
||||||
for turn in data["conversation"]:
|
|
||||||
if "human" in turn and "assistant" in turn:
|
|
||||||
extracted = extract_title_author_and_abstract(turn["human"])
|
|
||||||
title = extracted.get("title", "")
|
|
||||||
authors = extracted.get("authors", "")
|
|
||||||
abstract = extracted.get("abstract", "")
|
|
||||||
n = min(num_templates, len(question_templates))
|
|
||||||
selected_templates = random.sample(question_templates, n)
|
|
||||||
|
|
||||||
for template in selected_templates:
|
|
||||||
formatted_question = template.format(
|
|
||||||
title=title,
|
|
||||||
authors=authors,
|
|
||||||
abstract=abstract,
|
|
||||||
category_text=category_text
|
|
||||||
)
|
|
||||||
|
|
||||||
new_data = {
|
|
||||||
"system": system_instruction,
|
|
||||||
"conversation": [
|
|
||||||
{
|
|
||||||
"human": formatted_question,
|
|
||||||
"assistant": turn["assistant"]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
multi_type_data.append(new_data)
|
|
||||||
else:
|
else:
|
||||||
print(f"警告: 数据格式不识别: {data}")
|
print(f"警告: 第{line_num}行数据格式不识别: {data}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
print(f"警告: 无法解析JSON行: {line}")
|
print(f"警告: 第{line_num}行无法解析JSON: {line}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"处理行时发生错误: {str(e)}")
|
print(f"处理第{line_num}行时发生错误: {str(e)}")
|
||||||
|
|
||||||
# 写入输出文件
|
# 写入输出文件
|
||||||
with open(output_file, "w", encoding="utf-8") as f:
|
with open(output_file, "w", encoding="utf-8") as f:
|
||||||
@@ -278,25 +358,17 @@ def convert_onedata2multi_type(input_file, output_file, num_templates):
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Command-line entry point. Every option defaults to the value that was
    # previously hard-coded, so running the script without arguments performs
    # the same conversion as before.
    parser = argparse.ArgumentParser(
        description="Expand each record of a JSONL file into several question-template variants."
    )
    parser.add_argument(
        "--input-file",
        default="G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot--swift-26-500.jsonl",
        help="Path of the JSONL file to read.",
    )
    parser.add_argument(
        "--output-file",
        default="G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot--swift-26-500-m+.jsonl",
        help="Path of the JSONL file to write.",
    )
    parser.add_argument(
        "--num-templates",
        type=int,
        default=1,
        help="Number of template variants to generate per record.",
    )
    args = parser.parse_args()

    convert_onedata2multi_type(args.input_file, args.output_file, num_templates=args.num_templates)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user