2025-07-28 06:11:49 +08:00
import json
import os
import argparse
import random
# 科学类别文本常量
CATEGORY_TEXT = """ A. quant-ph
B . physics . chem - ph
C . physics . atom - ph
D . cond - mat . soft
E . cs . RO
F . cs . CL
G . cs . SE
H . cs . IR
I . hep - th
J . hep - ph
K . physics . optics
L . cs . AI
M . cs . CV
N . nucl - th
O . astro - ph
P . math . PR
Q . cs . OS
R . eess . SP
S . math . OC
T . math . DS
U . math . DG
V . math . MP
W . cs . MM
X . stat . ME
Y . math . CO
Z . cs . NE
"""
# 科学类别字典
CATEGORY_DICT = {
" quant-ph " : " A " ,
" physics.chem-ph " : " B " ,
" physics.atom-ph " : " C " ,
" cond-mat.soft " : " D " ,
" cs.RO " : " E " ,
" cs.CL " : " F " ,
" cs.SE " : " G " ,
" cs.IR " : " H " ,
" hep-th " : " I " ,
" hep-ph " : " J " ,
" physics.optics " : " K " ,
" cs.AI " : " L " ,
" cs.CV " : " M " ,
" nucl-th " : " N " ,
" astro-ph " : " O " ,
" math.PR " : " P " ,
" cs.OS " : " Q " ,
" eess.SP " : " R " ,
" math.OC " : " S " ,
" math.DS " : " T " ,
" math.DG " : " U " ,
" math.MP " : " V " ,
" cs.MM " : " W " ,
" stat.ME " : " X " ,
" math.CO " : " Y " ,
" cs.NE " : " Z "
}
# 问题模板常量
QUESTION_TEMPLATES = [
# 直接提问式
" {category_text} What is the scientific category for a paper titled ' {title} ' , authored by {authors} , with abstract ' {abstract} ' ? " ,
# 命令式
" Classify this paper into its scientific category based on title ' {title} ' , authors ' {authors} ' , and abstract ' {abstract} ' . {category_text} " ,
# 描述性引导
" {category_text} Given a research paper with title ' {title} ' , authors {authors} , and abstract ' {abstract} ' , identify the appropriate discipline. " ,
# 正式请求
" Please assign the scientific category for the paper: title ' {title} ' , authors ' {authors} ' , abstract ' {abstract} ' . {category_text} " ,
# 摘要优先
" Using the abstract ' {abstract} ' , title ' {title} ' , and authors ' {authors} ' , determine the paper ' s category. {category_text} " ,
# 作者强调
" {category_text} From authors ' {authors} ' , title ' {title} ' , and abstract ' {abstract} ' , what category does this paper fall into? " ,
# 问题链式
" Here ' s a paper: title ' {title} ' , authors {authors} , abstract ' {abstract} ' . What is its scientific category? {category_text} " ,
# 简洁版
" Category for: title ' {title} ' , authors ' {authors} ' , abstract ' {abstract} ' ? {category_text} " ,
# 上下文嵌入
" Considering the title ' {title} ' , the authors ' {authors} ' , and the abstract content ' {abstract} ' , please specify the paper ' s field. {category_text} " ,
# 非正式口语
" Hey, what category is this paper? Title ' {title} ' , by {authors} , abstract ' {abstract} ' . {category_text} " ,
# 元素罗列
" {category_text} Title: ' {title} ' . Authors: ' {authors} ' . Abstract: ' {abstract} ' . Now, what ' s the scientific category? " ,
# 假设场景
" If a paper has title ' {title} ' , authors ' {authors} ' , and abstract ' {abstract} ' , which scientific category best fits it? {category_text} " ,
# 强调关键信息
" Based solely on the title ' {title} ' , authors list ' {authors} ' , and abstract text ' {abstract} ' , categorize this paper. {category_text} " ,
# 间接询问
" For the paper ' {title} ' by {authors} , with abstract ' {abstract} ' , could you indicate its scientific discipline? {category_text} " ,
# 完整句子整合
" Determine the category of the research paper entitled ' {title} ' , written by {authors} , and summarized as ' {abstract} ' . {category_text} " ,
# 问题聚焦摘要
" The abstract ' {abstract} ' describes a paper titled ' {title} ' by authors ' {authors} ' . What category is it? {category_text} " ,
# 标题驱动
" {category_text} Starting from the title ' {title} ' , and considering authors ' {authors} ' and abstract ' {abstract} ' , what is the paper ' s category? " ,
# 多部分查询
" Part 1: Title is ' {title} ' . Part 2: Authors are ' {authors} ' . Part 3: Abstract is ' {abstract} ' . Based on this, classify the paper. {category_text} " ,
# 比较式
" Given the details: title ' {title} ' , authors ' {authors} ' , abstract ' {abstract} ' , how would you categorize this paper scientifically? {category_text} " ,
# 行动导向
" Using the provided title ' {title} ' , authors ' {authors} ' , and abstract ' {abstract} ' , output the scientific category for this paper. {category_text} "
]
2025-07-30 23:05:31 +08:00
QUESTION_TEMPLATES = [
" Based on the title ' {title} ' , authors ' {authors} ' , and abstract ' {abstract} ' , please determine the scientific category of this paper. \n \n {category_text} "
]
2025-07-28 06:11:49 +08:00
def extract_title_author_and_abstract ( content_text ) :
"""
content_text : 格式示例 " Based on the title ' The Quantum Primordial Black Holes, Dimensionless Small Parameter, Inflationary Cosmology and Non-Gaussianity ' , authors ' Alexander Shalyt-Margolin ' , and abstract ' In the present work consideration is given to the primordial black holes ( { \\ bf pbhs}) in the Schwarzschild-de Sitter Metric with small mass (ultralight) in the preinflationary epoch. Within the scope of natural assumptions, it has been shown that the quantum-gravitational corrections ( { \\ bf qgcs}) to the characteristics of such black holes can contribute to all the cosmological parameters, shifting them compared with the semiclassical consideration. These contributions are determined by a series expansion in terms of a small parameter dependent on the hole mass (radius). For this pattern different cases have been considered (stationary, black hole evaporation...). It has been demonstrated that involvement of ( { \\ bf qgcs}) leads to a higher probability for the occurrence of such { \\ bf pbhs}. Besides, high-energy deformations of Friedmann Equations created on the basis of these corrections have been derived for different patterns. In the last section of this work it is introduced a study into the contributions generated by the above-mentioned { \\ bf qgcs} in inflationary cosmological perturbations. Besides, it has been shown that non-Gaussianity of these perturbations is higher as compared to the semi-classical pattern. ' , please determine the scientific category of this paper. Additional info: 35 pages, Latex ,
A . quant - ph \nB . physics . chem - ph \nC . physics . atom - ph \nD . cond - mat . soft \nE . cs . RO \nF . cs . CL \nG . cs . SE \nH . cs . IR \nI . hep - th \nJ . hep - ph \nK . physics . optics \nL . cs . AI \nM . cs . CV \nN . nucl - th \nO . astro - ph \nP . math . PR \nQ . cs . OS \nR . eess . SP \nS . math . OC \nT . math . DS \nU . math . DG \nV . math . MP \nW . cs . MM \nX . stat . ME \nY . math . CO \nZ . cs . NE " , " assistant " : " I " }]}}
"""
try :
# 针对可以直接解析的JSON格式数据进行处理
if content_text . strip ( ) . startswith ( ' { ' ) and ' " title " ' in content_text and ( ' " author_names " ' in content_text or ' " authors " ' in content_text ) :
try :
# 尝试解析为JSON对象
paper_data = json . loads ( content_text )
title = paper_data . get ( " title " , " " )
authors = " , " . join ( paper_data . get ( " author_names " , paper_data . get ( " authors " , [ ] ) ) )
abstract = paper_data . get ( " summary " , paper_data . get ( " abstract " , " " ) )
return { " title " : title , " authors " : authors , " abstract " : abstract }
except :
pass
#content_text.split("',")
parts = content_text . split ( " ' , " )
if len ( parts ) < 3 :
# 如果分割后的部分少于3个, 返回默认值
return { " title " : " " , " authors " : " " , " abstract " : " " }
# 安全地提取标题
title_parts = parts [ 0 ] . split ( " ' " )
if len ( title_parts ) > = 2 :
title = title_parts [ 1 ] . strip ( )
else :
title = " "
# 安全地提取作者
authors_parts = parts [ 1 ] . split ( " ' " )
if len ( authors_parts ) > = 2 :
authors = authors_parts [ 1 ] . strip ( )
else :
authors = " "
# 安全地提取摘要
abstract_parts = parts [ 2 ] . split ( " ' " )
if len ( abstract_parts ) > = 2 :
abstract = abstract_parts [ 1 ] . strip ( )
else :
abstract = " "
return { " title " : title , " authors " : authors , " abstract " : abstract }
except Exception as e :
# 如果出现任何异常,返回默认值
print ( f " 解析内容时出错: { e } " )
return { " title " : " " , " authors " : " " , " abstract " : " " }
def parse_new_format_data ( data ) :
"""
解析新格式的数据
Args :
data : 新格式的JSON数据
Returns :
tuple : ( system_instruction , human_content , assistant_content ) 或 ( None , None , None )
"""
if " messages " not in data or not isinstance ( data [ " messages " ] , list ) or len ( data [ " messages " ] ) < 3 :
return None , None , None
system_instruction = " "
human_content = " "
assistant_content = " "
for msg in data [ " messages " ] :
if msg [ " role " ] == " system " :
system_instruction = msg [ " content " ]
elif msg [ " role " ] == " user " :
human_content = msg [ " content " ]
elif msg [ " role " ] == " assistant " :
assistant_content = msg [ " content " ]
return system_instruction , human_content , assistant_content
def parse_old_format_data ( data ) :
"""
解析旧格式的数据
Args :
data : 旧格式的JSON数据
Returns :
tuple : ( system_instruction , conversation_data ) 或 ( None , None )
"""
if " system " not in data or " conversation " not in data or not data [ " conversation " ] :
return None , None
system_instruction = data . get ( " system " , " 根据论文的标题、作者和摘要,确定该论文的科学类别。 " )
return system_instruction , data [ " conversation " ]
def generate_multi_type_samples ( title , authors , abstract , system_instruction , assistant_content , num_templates ) :
"""
根据模板生成多种类型的样本
Args :
title : 论文标题
authors : 作者
abstract : 摘要
system_instruction : 系统指令
assistant_content : 助手回复
num_templates : 使用的模板数量
Returns :
list : 生成的多种类型数据列表
"""
n = min ( num_templates , len ( QUESTION_TEMPLATES ) )
selected_templates = random . sample ( QUESTION_TEMPLATES , n )
samples = [ ]
for template in selected_templates :
formatted_question = template . format (
title = title ,
authors = authors ,
abstract = abstract ,
category_text = CATEGORY_TEXT
)
new_data = {
" messages " : [
{ " role " : " system " , " content " : system_instruction } ,
{ " role " : " user " , " content " : formatted_question } ,
{ " role " : " assistant " , " content " : assistant_content }
]
}
samples . append ( new_data )
return samples
def process_new_format_data ( data , num_templates ) :
"""
处理新格式数据
Args :
data : 新格式数据
num_templates : 模板数量
Returns :
list : 处理后的数据列表
"""
system_instruction , human_content , assistant_content = parse_new_format_data ( data )
if not human_content :
return [ ]
extracted = extract_title_author_and_abstract ( human_content )
title = extracted . get ( " title " , " " )
authors = extracted . get ( " authors " , " " )
abstract = extracted . get ( " abstract " , " " )
return generate_multi_type_samples ( title , authors , abstract , system_instruction , assistant_content , num_templates )
def process_old_format_data ( data , num_templates ) :
"""
处理旧格式数据
Args :
data : 旧格式数据
num_templates : 模板数量
Returns :
list : 处理后的数据列表
"""
system_instruction , conversation_data = parse_old_format_data ( data )
if not conversation_data :
return [ ]
samples = [ ]
for turn in conversation_data :
if " human " not in turn or " assistant " not in turn :
continue
extracted = extract_title_author_and_abstract ( turn [ " human " ] )
title = extracted . get ( " title " , " " )
authors = extracted . get ( " authors " , " " )
abstract = extracted . get ( " abstract " , " " )
n = min ( num_templates , len ( QUESTION_TEMPLATES ) )
selected_templates = random . sample ( QUESTION_TEMPLATES , n )
for template in selected_templates :
formatted_question = template . format (
title = title ,
authors = authors ,
abstract = abstract ,
category_text = CATEGORY_TEXT
)
new_data = {
" system " : system_instruction ,
" conversation " : [
{
" human " : formatted_question ,
" assistant " : turn [ " assistant " ]
}
]
}
samples . append ( new_data )
return samples
def get_paper_data_from_crawl_jason ( input_path ) :
"""
从指定文件夹里的所有JSON文件中获取论文数据
或从单个JSON文件中获取论文数据
"""
paper_data_list = [ ]
# 检查输入路径是文件还是文件夹
if os . path . isfile ( input_path ) :
# 如果是单个文件
paper_data_list . extend ( _extract_paper_data_from_file ( input_path ) )
print ( f " 从文件 { input_path } 中提取了 { len ( paper_data_list ) } 条数据 " )
elif os . path . isdir ( input_path ) :
# 如果是文件夹, 遍历其中所有JSON文件
files_found = 0
for filename in os . listdir ( input_path ) :
if filename . endswith ( ' .jsonl ' ) :
file_path = os . path . join ( input_path , filename )
try :
file_data = _extract_paper_data_from_file ( file_path )
paper_data_list . extend ( file_data )
print ( f " 已从 { filename } 中提取 { len ( file_data ) } 条数据 " )
files_found + = 1
except Exception as e :
print ( f " 处理文件 { filename } 时出错: { e } " )
print ( f " 在目录中找到 { files_found } 个JSON文件 " )
else :
print ( f " 路径 { input_path } 既不是文件也不是文件夹 " )
print ( f " 总共提取了 { len ( paper_data_list ) } 条论文数据 " )
return paper_data_list
def _extract_paper_data_from_file ( file_path ) :
"""
从单个JSON文件中提取论文数据
Args :
file_path : JSON文件路径
Returns :
list : 论文数据列表
"""
paper_data_list = [ ]
# 处理JSONL格式文件
with open ( file_path , " r " , encoding = " utf-8 " ) as f :
for line_num , line in enumerate ( f , 1 ) :
line = line . strip ( )
if not line : # 跳过空行
continue
try :
item = json . loads ( line )
title = item . get ( " title " , " " )
# 处理作者信息的不同可能格式
authors_list = item . get ( " author_names " , item . get ( " authors " , [ ] ) )
if isinstance ( authors_list , list ) :
authors = " , " . join ( authors_list )
else :
authors = str ( authors_list )
# 处理摘要信息的不同可能格式
abstract = item . get ( " summary " , item . get ( " abstract " , " " ) )
# 处理分类信息的不同可能格式
category = item . get ( " category " , " Unknown " )
# 如果没有category字段, 尝试从categories列表中获取第一个
if category == " Unknown " and " categories " in item and isinstance ( item [ " categories " ] , list ) and len ( item [ " categories " ] ) > 0 :
category = item [ " categories " ] [ 0 ]
# 提取论文数据
paper_data_dict = {
" title " : title ,
" authors " : authors ,
" abstract " : abstract ,
" category " : category
}
paper_data_list . append ( paper_data_dict )
except json . JSONDecodeError as e :
print ( f " 解析文件 { file_path } 的第 { line_num } 行时出错: { e } " )
continue
return paper_data_list
def convert_onedata2multi_type_pre ( paper_datas , output_file , num_templates ) :
"""
读取input_file , 将Swift格式的1条数据按多种问题模板格式转换为多条数据 ,
并保存为output_file
参数 :
input_file : 输入文件路径
output_file : 输出文件路径
num_templates : 每条数据生成的模板数量
"""
print ( f " 开始转换数据...每条数据生成 { num_templates } 条变体 " )
print ( f " 开始转换数据: { input_file } -> { output_file } " )
multi_type_data = [ ]
for item in paper_datas :
title = item . get ( " title " , " " )
authors = item . get ( " authors " , " " )
abstract = item . get ( " summary " , item . get ( " abstract " , " " ) )
n = min ( num_templates , len ( QUESTION_TEMPLATES ) )
selected_templates = random . sample ( QUESTION_TEMPLATES , n )
for template in selected_templates :
formatted_question = template . format (
title = title ,
authors = authors ,
abstract = abstract ,
category_text = CATEGORY_TEXT
)
new_data = {
" messages " : [
{
" role " : " assistant " ,
" content " : formatted_question
#"assistant": row["answer"]
}
]
}
multi_type_data . append ( new_data )
# 写入输出文件
with open ( output_file , " w " , encoding = " utf-8 " ) as f :
for item in multi_type_data :
f . write ( json . dumps ( item , ensure_ascii = False ) + " \n " )
print ( f " 转换完成! 共转换 { len ( multi_type_data ) } 条数据 " )
def convert_onedata2multi_type_sft ( paper_datas , output_file , num_templates ) :
"""
读取input_file , 将Swift格式的1条数据按多种问题模板格式转换为多条数据 ,
并保存为output_file
参数 :
input_file : 输入文件路径
output_file : 输出文件路径
num_templates : 每条数据生成的模板数量
"""
print ( f " 开始转换数据...每条数据生成 { num_templates } 条变体 " )
print ( f " 开始转换数据: { input_file } -> { output_file } " )
multi_type_data = [ ]
for item in paper_datas :
title = item . get ( " title " , " " )
authors = item . get ( " authors " , " " )
abstract = item . get ( " summary " , item . get ( " abstract " , " " ) )
category = item . get ( " category " , " Unknown " )
answer = CATEGORY_DICT . get ( category , " Unknown " )
#print(item)
# 生成系统指令
system_instruction = " 你是个优秀的论文分类师,根据论文的标题、作者和摘要,确定该论文的科学类别。 "
n = min ( num_templates , len ( QUESTION_TEMPLATES ) )
selected_templates = random . sample ( QUESTION_TEMPLATES , n )
for template in selected_templates :
formatted_question = template . format (
title = title ,
authors = authors ,
abstract = abstract ,
category_text = CATEGORY_TEXT
)
new_data = {
" system " : system_instruction ,
" conversation " : [
{
" human " : formatted_question ,
" assistant " : answer
}
]
}
multi_type_data . append ( new_data )
# 写入输出文件
with open ( output_file , " w " , encoding = " utf-8 " ) as f :
for item in multi_type_data :
f . write ( json . dumps ( item , ensure_ascii = False ) + " \n " )
print ( f " 转换完成! 共转换 { len ( multi_type_data ) } 条数据 " )
if __name__ == " __main__ " :
# 示例用法
input_file = r " G: \\ 11 \ data-prepare \\ arxiv_papers \\ "
output_file_sft = r " G: \\ 11 \ data-prepare \\ arxiv_papers-multi_type-sft.json "
output_file_pre = r " G: \\ 11 \ data-prepare \\ arxiv_papers-multi_type-pre.json "
paper_datas = get_paper_data_from_crawl_jason ( input_file )
convert_onedata2multi_type_sft ( paper_datas , output_file_sft , num_templates = 1 )
2025-07-30 23:05:31 +08:00
#convert_onedata2multi_type_pre(paper_datas, output_file_pre, num_templates=1)
2025-07-28 06:11:49 +08:00