Files
data-prepare/05-data-swfit-sft2multi_type-crawl.py

567 lines
20 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import argparse
import random
# Multiple-choice menu of scientific categories, one "<letter>. <category>" option per line.
# It is substituted for the {category_text} placeholder of the question templates.
CATEGORY_TEXT = """ A. quant-ph
B. physics.chem-ph
C. physics.atom-ph
D. cond-mat.soft
E. cs.RO
F. cs.CL
G. cs.SE
H. cs.IR
I. hep-th
J. hep-ph
K. physics.optics
L. cs.AI
M. cs.CV
N. nucl-th
O. astro-ph
P. math.PR
Q. cs.OS
R. eess.SP
S. math.OC
T. math.DS
U. math.DG
V. math.MP
W. cs.MM
X. stat.ME
Y. math.CO
Z. cs.NE
"""
# Lookup from category name to its option letter; the letters mirror the menu in CATEGORY_TEXT.
# Used to turn a paper's category into the expected single-letter answer.
CATEGORY_DICT = {
"quant-ph": "A",
"physics.chem-ph": "B",
"physics.atom-ph": "C",
"cond-mat.soft": "D",
"cs.RO": "E",
"cs.CL": "F",
"cs.SE": "G",
"cs.IR": "H",
"hep-th": "I",
"hep-ph": "J",
"physics.optics": "K",
"cs.AI": "L",
"cs.CV": "M",
"nucl-th": "N",
"astro-ph": "O",
"math.PR": "P",
"cs.OS": "Q",
"eess.SP": "R",
"math.OC": "S",
"math.DS": "T",
"math.DG": "U",
"math.MP": "V",
"cs.MM": "W",
"stat.ME": "X",
"math.CO": "Y",
"cs.NE": "Z"
}
# Question templates. Each one is filled with str.format using the placeholders
# {title}, {authors}, {abstract} and {category_text}.
QUESTION_TEMPLATES = [
# Direct question
"{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",
# Imperative
"Classify this paper into its scientific category based on title '{title}', authors '{authors}', and abstract '{abstract}'.{category_text}",
# Descriptive lead-in
"{category_text}Given a research paper with title '{title}', authors {authors}, and abstract '{abstract}', identify the appropriate discipline.",
# Formal request
"Please assign the scientific category for the paper: title '{title}', authors '{authors}', abstract '{abstract}'.{category_text}",
# Abstract first
"Using the abstract '{abstract}', title '{title}', and authors '{authors}', determine the paper's category.{category_text}",
# Author emphasis
"{category_text}From authors '{authors}', title '{title}', and abstract '{abstract}', what category does this paper fall into?",
# Chained question
"Here's a paper: title '{title}', authors {authors}, abstract '{abstract}'. What is its scientific category?{category_text}",
# Concise
"Category for: title '{title}', authors '{authors}', abstract '{abstract}'?{category_text}",
# Context embedded
"Considering the title '{title}', the authors '{authors}', and the abstract content '{abstract}', please specify the paper's field.{category_text}",
# Informal / colloquial
"Hey, what category is this paper? Title '{title}', by {authors}, abstract '{abstract}'.{category_text}",
# Itemized elements
"{category_text}Title: '{title}'. Authors: '{authors}'. Abstract: '{abstract}'. Now, what's the scientific category?",
# Hypothetical scenario
"If a paper has title '{title}', authors '{authors}', and abstract '{abstract}', which scientific category best fits it?{category_text}",
# Emphasis on key information
"Based solely on the title '{title}', authors list '{authors}', and abstract text '{abstract}', categorize this paper.{category_text}",
# Indirect inquiry
"For the paper '{title}' by {authors}, with abstract '{abstract}', could you indicate its scientific discipline?{category_text}",
# Integrated full sentence
"Determine the category of the research paper entitled '{title}', written by {authors}, and summarized as '{abstract}'.{category_text}",
# Abstract-focused question
"The abstract '{abstract}' describes a paper titled '{title}' by authors '{authors}'. What category is it?{category_text}",
# Title driven
"{category_text}Starting from the title '{title}', and considering authors '{authors}' and abstract '{abstract}', what is the paper's category?",
# Multi-part query
"Part 1: Title is '{title}'. Part 2: Authors are '{authors}'. Part 3: Abstract is '{abstract}'. Based on this, classify the paper.{category_text}",
# Comparative
"Given the details: title '{title}', authors '{authors}', abstract '{abstract}', how would you categorize this paper scientifically?{category_text}",
# Action oriented
"Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}"
]
# NOTE(review): this second assignment replaces the 20-entry list above, so only the
# single template below is in effect at runtime. Confirm that the override is intentional.
QUESTION_TEMPLATES = [
"Based on the title '{title}', authors '{authors}', and abstract '{abstract}', please determine the scientific category of this paper.\n\n{category_text}"
]
def extract_title_author_and_abstract(content_text):
    """
    Extract the title, authors and abstract from a prompt string.

    Three input shapes are tried, in this order:

    1. A JSON object string containing a "title" key and an "author_names"
       or "authors" key; the abstract is read from "summary" or "abstract".
    2. The canonical prompt
       "Based on the title 'T', authors 'A', and abstract 'X', please determine ...".
       It is matched on its fixed separators, so apostrophes inside the
       title or abstract do not truncate the extracted text.
    3. Any other text: a best-effort split on "'," that takes the first
       quoted run of each of the first three chunks.

    Args:
        content_text: The raw prompt (or JSON) string.

    Returns:
        dict: Keys "title", "authors" and "abstract". All three values are
        empty strings when nothing could be extracted.
    """
    import re  # local import so the block does not depend on file-level changes

    def _first_quoted(chunk):
        # Text between the first and second single quote of `chunk`, or "".
        pieces = chunk.split("'")
        return pieces[1].strip() if len(pieces) >= 2 else ""

    try:
        # Shape 1: a JSON object that can be parsed directly.
        looks_like_json = (
            content_text.strip().startswith('{')
            and '"title"' in content_text
            and ('"author_names"' in content_text or '"authors"' in content_text)
        )
        if looks_like_json:
            try:
                paper_data = json.loads(content_text)
            except ValueError:
                # Not valid JSON after all; fall through to the text parsers.
                paper_data = None
            if isinstance(paper_data, dict):
                raw_authors = paper_data.get("author_names", paper_data.get("authors", []))
                if isinstance(raw_authors, (list, tuple)):
                    authors = ", ".join(str(name) for name in raw_authors)
                elif raw_authors is None:
                    authors = ""
                else:
                    # A plain string must not be joined character by character.
                    authors = str(raw_authors)
                return {
                    "title": paper_data.get("title", ""),
                    "authors": authors,
                    "abstract": paper_data.get("summary", paper_data.get("abstract", "")),
                }

        # Shape 2: the canonical prompt, anchored on its literal separators.
        canonical = re.search(
            r"title '(.*?)', authors '(.*?)', and abstract '(.*?)', please determine",
            content_text,
            re.DOTALL,
        )
        if canonical:
            title, authors, abstract = (group.strip() for group in canonical.groups())
            return {"title": title, "authors": authors, "abstract": abstract}

        # Shape 3: generic fallback, split on the "'," that closes each quoted field.
        parts = content_text.split("',")
        if len(parts) < 3:
            return {"title": "", "authors": "", "abstract": ""}
        return {
            "title": _first_quoted(parts[0]),
            "authors": _first_quoted(parts[1]),
            "abstract": _first_quoted(parts[2]),
        }
    except Exception as e:
        # Best effort: any unexpected input (e.g. a non-string) yields empty fields.
        print(f"解析内容时出错: {e}")
        return {"title": "", "authors": "", "abstract": ""}
def parse_new_format_data(data):
    """
    Pull the system, user and assistant texts out of a "messages" record.

    Args:
        data: A record expected to hold a "messages" list with at least
            three entries, each carrying "role" and "content".

    Returns:
        tuple: (system_instruction, human_content, assistant_content).
        A role that never appears yields "", and when a role appears more
        than once the last occurrence wins. (None, None, None) is returned
        when "messages" is missing, is not a list, or has fewer than three
        entries.
    """
    has_message_list = "messages" in data and isinstance(data["messages"], list)
    if not (has_message_list and len(data["messages"]) >= 3):
        return None, None, None

    # Latest content seen per recognised role; other roles are ignored.
    latest = {"system": "", "user": "", "assistant": ""}
    for message in data["messages"]:
        role = message["role"]
        if role in ("system", "user", "assistant"):
            latest[role] = message["content"]
    return latest["system"], latest["user"], latest["assistant"]
def parse_old_format_data(data):
    """
    Pull the system prompt and the conversation out of a legacy record.

    Args:
        data: A record expected to hold "system" and a non-empty
            "conversation".

    Returns:
        tuple: (system_instruction, conversation_data), or (None, None)
        when either key is missing or the conversation is empty.
    """
    is_complete = "system" in data and "conversation" in data and bool(data["conversation"])
    if is_complete:
        system_instruction = data.get("system", "根据论文的标题、作者和摘要,确定该论文的科学类别。")
        return system_instruction, data["conversation"]
    return None, None
def generate_multi_type_samples(title, authors, abstract, system_instruction, assistant_content, num_templates):
    """
    Build one "messages" sample per randomly chosen question template.

    Args:
        title: Paper title.
        authors: Author string.
        abstract: Paper abstract.
        system_instruction: Content of the system message.
        assistant_content: Content of the assistant message.
        num_templates: Number of templates to draw; capped at the number
            of entries in QUESTION_TEMPLATES.

    Returns:
        list: Dicts of the form {"messages": [system, user, assistant]}.
    """
    sample_size = min(num_templates, len(QUESTION_TEMPLATES))
    placeholders = {
        "title": title,
        "authors": authors,
        "abstract": abstract,
        "category_text": CATEGORY_TEXT,
    }
    # Templates are drawn without replacement, so each appears at most once.
    return [
        {
            "messages": [
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": chosen.format(**placeholders)},
                {"role": "assistant", "content": assistant_content},
            ]
        }
        for chosen in random.sample(QUESTION_TEMPLATES, sample_size)
    ]
def process_new_format_data(data, num_templates):
    """
    Re-template one "messages" record into several samples.

    Args:
        data: Record in the "messages" format.
        num_templates: Number of templates to draw per record.

    Returns:
        list: Generated samples; empty when the record has no user content.
    """
    system_instruction, human_content, assistant_content = parse_new_format_data(data)
    if human_content:
        fields = extract_title_author_and_abstract(human_content)
        return generate_multi_type_samples(
            fields.get("title", ""),
            fields.get("authors", ""),
            fields.get("abstract", ""),
            system_instruction,
            assistant_content,
            num_templates,
        )
    return []
def process_old_format_data(data, num_templates):
    """
    Re-template every turn of a legacy "system"/"conversation" record.

    Args:
        data: Record in the legacy format.
        num_templates: Number of templates to draw per turn; capped at the
            number of entries in QUESTION_TEMPLATES.

    Returns:
        list: One {"system", "conversation"} dict per (turn, template)
        pair. Turns lacking "human" or "assistant" are skipped.
    """
    system_instruction, conversation_data = parse_old_format_data(data)
    samples = []
    if not conversation_data:
        return samples

    for turn in conversation_data:
        if not ("human" in turn and "assistant" in turn):
            continue
        fields = extract_title_author_and_abstract(turn["human"])
        placeholders = {
            "title": fields.get("title", ""),
            "authors": fields.get("authors", ""),
            "abstract": fields.get("abstract", ""),
            "category_text": CATEGORY_TEXT,
        }
        sample_size = min(num_templates, len(QUESTION_TEMPLATES))
        samples.extend(
            {
                "system": system_instruction,
                "conversation": [
                    {
                        "human": chosen.format(**placeholders),
                        "assistant": turn["assistant"],
                    }
                ],
            }
            for chosen in random.sample(QUESTION_TEMPLATES, sample_size)
        )
    return samples
def get_paper_data_from_crawl_jason(input_path):
    """
    Collect paper records from one JSONL file or from a directory of them.

    Args:
        input_path: Path to a single file, or to a directory whose
            "*.jsonl" entries are read (non-recursively).

    Returns:
        list: Paper dicts as produced by _extract_paper_data_from_file.
        Empty when the path is neither a file nor a directory.

    Directory entries are visited in sorted name order: os.listdir makes
    no ordering guarantee, and sorting keeps the output order reproducible.
    A directory file that fails to load is reported and skipped.
    """
    paper_data_list = []
    if os.path.isfile(input_path):
        # Single file.
        paper_data_list.extend(_extract_paper_data_from_file(input_path))
        print(f"从文件 {input_path} 中提取了 {len(paper_data_list)} 条数据")
    elif os.path.isdir(input_path):
        # Directory: read every *.jsonl entry.
        files_found = 0
        for filename in sorted(os.listdir(input_path)):
            if not filename.endswith('.jsonl'):
                continue
            file_path = os.path.join(input_path, filename)
            try:
                file_data = _extract_paper_data_from_file(file_path)
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole run.
                print(f"处理文件 {filename} 时出错: {e}")
                continue
            paper_data_list.extend(file_data)
            print(f"已从 {filename} 中提取 {len(file_data)} 条数据")
            files_found += 1
        print(f"在目录中找到 {files_found} 个JSON文件")
    else:
        print(f"路径 {input_path} 既不是文件也不是文件夹")
    print(f"总共提取了 {len(paper_data_list)} 条论文数据")
    return paper_data_list
def _extract_paper_data_from_file(file_path):
    """
    Read paper records from a JSONL file (one JSON object per line).

    Args:
        file_path: Path of the JSONL file, read as UTF-8.

    Returns:
        list: One dict per usable line with the keys "title", "authors",
        "abstract" and "category".

    Field fallbacks: authors come from "author_names" or "authors", the
    abstract from "summary" or "abstract", and the category from
    "category" or else the first entry of a "categories" list ("Unknown"
    when neither is present). Blank lines, lines that are not valid JSON
    and lines whose JSON value is not an object are skipped.
    """
    paper_data_list = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:  # skip blank lines
                continue
            try:
                item = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"解析文件 {file_path} 的第 {line_num} 行时出错: {e}")
                continue
            if not isinstance(item, dict):
                # A bare list/number/string has no fields to read; skipping it
                # keeps one odd line from aborting the whole file.
                print(f"跳过文件 {file_path} 的第 {line_num} 行: 不是JSON对象")
                continue

            # Authors may be a list of names or an already-joined string.
            authors_list = item.get("author_names", item.get("authors", []))
            if isinstance(authors_list, list):
                # str() guards against non-string entries breaking the join.
                authors = ", ".join(str(name) for name in authors_list)
            else:
                authors = str(authors_list)

            # Fall back to the first entry of "categories" when "category" is absent.
            category = item.get("category", "Unknown")
            categories = item.get("categories")
            if category == "Unknown" and isinstance(categories, list) and categories:
                category = categories[0]

            paper_data_list.append({
                "title": item.get("title", ""),
                "authors": authors,
                "abstract": item.get("summary", item.get("abstract", "")),
                "category": category,
            })
    return paper_data_list
def convert_onedata2multi_type_pre(paper_datas, output_file, num_templates):
    """
    Write question-only samples for each paper as a JSONL file.

    Every paper is rendered with up to `num_templates` randomly chosen
    question templates. Each rendered question becomes one output line of
    the form {"messages": [{"role": "assistant", "content": <question>}]}.

    Args:
        paper_datas: Iterable of paper dicts with "title", "authors" and
            "abstract" (or "summary").
        output_file: Path of the JSONL file to write (overwritten).
        num_templates: Number of templates to draw per paper; capped at
            the number of entries in QUESTION_TEMPLATES.
    """
    print(f"开始转换数据...每条数据生成{num_templates}条变体")
    # The log line must not read `input_file`: it is not a parameter and only
    # exists as a global when the file is run as a script.
    print(f"开始转换数据,输出到: {output_file}")
    multi_type_data = []
    for item in paper_datas:
        placeholders = {
            "title": item.get("title", ""),
            "authors": item.get("authors", ""),
            "abstract": item.get("summary", item.get("abstract", "")),
            "category_text": CATEGORY_TEXT,
        }
        n = min(num_templates, len(QUESTION_TEMPLATES))
        for template in random.sample(QUESTION_TEMPLATES, n):
            multi_type_data.append({
                "messages": [
                    {
                        "role": "assistant",
                        "content": template.format(**placeholders),
                    }
                ]
            })
    # Write one JSON object per line.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in multi_type_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")
def convert_onedata2multi_type_sft(paper_datas, output_file, num_templates):
    """
    Write supervised question/answer samples for each paper as a JSONL file.

    Every paper is rendered with up to `num_templates` randomly chosen
    question templates. Each output line has the form
    {"system": ..., "conversation": [{"human": <question>, "assistant": <letter>}]},
    where the letter is looked up in CATEGORY_DICT.

    Args:
        paper_datas: Iterable of paper dicts with "title", "authors",
            "abstract" (or "summary") and "category".
        output_file: Path of the JSONL file to write (overwritten).
        num_templates: Number of templates to draw per paper; capped at
            the number of entries in QUESTION_TEMPLATES.
    """
    print(f"开始转换数据...每条数据生成{num_templates}条变体")
    # The log line must not read `input_file`: it is not a parameter and only
    # exists as a global when the file is run as a script.
    print(f"开始转换数据,输出到: {output_file}")
    # System prompt shared by every generated sample.
    system_instruction = "你是个优秀的论文分类师,根据论文的标题、作者和摘要,确定该论文的科学类别。"
    multi_type_data = []
    for item in paper_datas:
        category = item.get("category", "Unknown")
        # NOTE(review): categories missing from CATEGORY_DICT are labelled "Unknown",
        # which is not one of the listed options; confirm such rows are wanted.
        answer = CATEGORY_DICT.get(category, "Unknown")
        placeholders = {
            "title": item.get("title", ""),
            "authors": item.get("authors", ""),
            "abstract": item.get("summary", item.get("abstract", "")),
            "category_text": CATEGORY_TEXT,
        }
        n = min(num_templates, len(QUESTION_TEMPLATES))
        for template in random.sample(QUESTION_TEMPLATES, n):
            multi_type_data.append({
                "system": system_instruction,
                "conversation": [
                    {
                        "human": template.format(**placeholders),
                        "assistant": answer,
                    }
                ]
            })
    # Write one JSON object per line.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in multi_type_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")
if __name__ == "__main__":
    # Script entry point. Every option defaults to the value that used to be
    # hard-coded, so running the script without arguments behaves as before.
    parser = argparse.ArgumentParser(
        description="Convert crawled paper JSONL data into multi-template training samples."
    )
    parser.add_argument(
        "--input",
        default=r"G:\\11\data-prepare\\arxiv_papers\\",
        help="JSONL file, or directory containing .jsonl files, to read",
    )
    parser.add_argument(
        "--output-sft",
        default=r"G:\\11\data-prepare\\arxiv_papers-multi_type-sft.json",
        help="output path for the SFT samples",
    )
    parser.add_argument(
        "--output-pre",
        default=r"G:\\11\data-prepare\\arxiv_papers-multi_type-pre.json",
        help="output path for the question-only samples",
    )
    parser.add_argument(
        "--num-templates",
        type=int,
        default=1,
        help="number of question templates drawn per paper",
    )
    parser.add_argument(
        "--mode",
        choices=("sft", "pre", "both"),
        default="sft",
        help="which output(s) to generate",
    )
    args = parser.parse_args()

    # Assigned at module level under the same names as before.
    input_file = args.input
    output_file_sft = args.output_sft
    output_file_pre = args.output_pre

    paper_datas = get_paper_data_from_crawl_jason(input_file)
    if args.mode in ("sft", "both"):
        convert_onedata2multi_type_sft(paper_datas, output_file_sft, num_templates=args.num_templates)
    if args.mode in ("pre", "both"):
        convert_onedata2multi_type_pre(paper_datas, output_file_pre, num_templates=args.num_templates)