This commit is contained in:
2025-07-18 18:00:04 +08:00
parent 24abc7aab3
commit 563f16f0c5
15 changed files with 25541 additions and 41 deletions

View File

@@ -1,14 +1,46 @@
import json
# 要保留的类别关键词
# target_categories = {
# "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",
# "cs.CL", "cs.CV", "cs.LG",
# "gr-qc", "hep-ph", "hep-th", "quant-ph"
# }
target_categories = {
"astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",
"cs.CL", "cs.CV", "cs.LG",
"gr-qc", "hep-ph", "hep-th", "quant-ph"
}
'quant-ph',
'physics.chem-ph',
'physics.atom-ph',
'cond-mat.soft',
'cs.RO',
'cs.CL',
'cs.SE',
'cs.IR',
'hep-th',
'hep-ph',
'physics.optics',
'cs.AI',
'cs.CV',
'nucl-th',
'astro-ph',
'math.PR',
'cs.OS',
'eess.SP',
'math.OC',
'math.DS',
'math.DG',
'math.MP',
'cs.MM',
'stat.ME',
'math.CO',
'cs.NE'
}
input_path = "arxiv-metadata-oai-snapshot.json"#原数据路径
output_path = "arxiv-metadata-oai-snapshot--.json" # 使用 JSON Lines 格式输出路径
output_path = "arxiv-metadata-oai-snapshot--26.json" # 使用 JSON Lines 格式输出路径
count = 0

View File

@@ -1,9 +1,9 @@
import json
import random
input_path = "arxiv-metadata-oai-snapshot-date-len.json"
output_path = "arxiv-metadata-oai-snapshot--ratio.json"
sample_size = 2000 # 你可以改成 10000 等其他数字
input_path = "arxiv-metadata-oai-snapshot--26.json"
output_path = "arxiv-metadata-oai-snapshot--26-500.json"
sample_size = 4000 # 你可以改成 10000 等其他数字
@@ -15,18 +15,50 @@ print(f"原始数据量:{len(data)} 条")
## 按类别筛选数据,不是随机
## 每个类别指定抽取的比例
# category_proportions = {
# 'astro-ph': 0.1336,
# 'cond-mat.mes-hall': 0.0486,
# 'cond-mat.mtrl-sci': 0.0587,
# 'cs.CL': 0.085,
# 'cs.CV': 0.0931,
# 'cs.LG': 0.0992,
# 'gr-qc': 0.1174,
# 'hep-ph': 0.1194,
# 'hep-th': 0.085,
# 'quant-ph': 0.1599
# }
category_proportions = {
'astro-ph': 0.1,
'cond-mat.mes-hall': 0.1,
'cond-mat.mtrl-sci': 0.1,
'cs.CL': 0.1,
'cs.CV': 0.1,
'cs.LG': 0.1,
'gr-qc': 0.1,
'hep-ph': 0.1,
'hep-th': 0.1,
'quant-ph': 0.1
}
'quant-ph': 0.1,
'physics.chem-ph': 0.1,
'physics.atom-ph': 0.1,
'cond-mat.soft': 0.1,
'cs.RO': 0.1,
'cs.CL': 0.1,
'cs.SE': 0.1,
'cs.IR': 0.1,
'hep-th': 0.1,
'hep-ph': 0.1,
'physics.optics': 0.1,
'cs.AI': 0.1,
'cs.CV': 0.1,
'nucl-th': 0.1,
'astro-ph': 0.1,
'math.PR': 0.1,
'cs.OS': 0.1,
'eess.SP': 0.1,
'math.OC': 0.1,
'math.DS': 0.1,
'math.DG': 0.1,
'math.MP': 0.1,
'cs.MM': 0.1,
'stat.ME': 0.1,
'math.CO': 0.1,
'cs.NE': 0.1
}
## print 每个类别的筛选比例和数量
print("每个类别的筛选比例和数量:")
for category, proportion in category_proportions.items():

View File

@@ -1,27 +1,48 @@
import json
import random
input_file = "arxiv-metadata-oai-snapshot--ratio.json" # 20000条原始数据文件路径
output_file = "arxiv-metadata-oai-snapshot--swift.json"
input_file = "arxiv-metadata-oai-snapshot--26-500.json" # 20000条原始数据文件路径
output_file = "arxiv-metadata-oai-snapshot--swift-26-500.json"
# 类别对应选项映射
label_map = {
"astro-ph": "A",
"cond-mat.mes-hall": "B",
"cond-mat.mtrl-sci": "C",
"cs.CL": "D",
"cs.CV": "E",
"cs.LG": "F",
"gr-qc": "G",
"hep-ph": "H",
"hep-th": "I",
"quant-ph": "J"
'quant-ph': 'A',
'physics.chem-ph': 'B',
'physics.atom-ph': 'C',
'cond-mat.soft': 'D',
'cs.RO': 'E',
'cs.CL': 'F',
'cs.SE': 'G',
'cs.IR': 'H',
'hep-th': 'I',
'hep-ph': 'J',
'physics.optics': 'K',
'cs.AI': 'L',
'cs.CV': 'M',
'nucl-th': 'N',
'astro-ph': 'O',
'math.PR': 'P',
'cs.OS': 'Q',
'eess.SP': 'R',
'math.OC': 'S',
'math.DS': 'T',
'math.DG': 'U',
'math.MP': 'V',
'cs.MM': 'W',
'stat.ME': 'X',
'math.CO': 'Y',
'cs.NE': 'Z'
}
options_text = (
"\n\nA. astro-ph\nB. cond-mat.mes-hall\nC. cond-mat.mtrl-sci\nD. cs.CL\n"
"E. cs.CV\nF. cs.LG\nG. gr-qc\nH. hep-ph\nI. hep-th\nJ. quant-ph"
)
options = [
"A. quant-ph", "B. physics.chem-ph", "C. physics.atom-ph", "D. cond-mat.soft",
"E. cs.RO", "F. cs.CL", "G. cs.SE", "H. cs.IR", "I. hep-th", "J. hep-ph",
"K. physics.optics", "L. cs.AI", "M. cs.CV", "N. nucl-th", "O. astro-ph",
"P. math.PR", "Q. cs.OS", "R. eess.SP", "S. math.OC", "T. math.DS",
"U. math.DG", "V. math.MP", "W. cs.MM", "X. stat.ME", "Y. math.CO", "Z. cs.NE"
]
options_text = "\n".join(options)
# 读取所有数据
with open(input_file, 'r', encoding='utf-8') as f:

View File

@@ -0,0 +1,81 @@
import json
import csv
def convert_to_alpaca_format(input_file, output_file):
    """Convert a CSV of questions into Swift pre-training "messages" JSONL.

    Reads the ``question`` column of each CSV row and writes one JSON
    object per line of the form::

        {"messages": [{"role": "assistant",
                       "content": "This is a paper titled <title...>"}]}

    Note: despite the historical function name, the output is Swift
    "messages" format, not Alpaca format.  The ``answer`` column is only
    checked for presence and is never emitted.

    :param input_file: path to a CSV with at least ``question`` and
        ``answer`` columns
    :param output_file: path of the JSON Lines file to write
    """
    print(f"转换数据: {input_file} -> {output_file}")
    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            try:
                # Rows lacking the required columns are skipped with a warning.
                if "question" not in row or "answer" not in row:
                    print(f"警告: 数据缺少必要列: {row}")
                    continue
                # [19:] drops the fixed prompt prefix "Based on the title "
                # (19 characters) so only the paper title text remains.
                converted_data.append({
                    "messages": [
                        {
                            "role": "assistant",
                            "content": "This is a paper titled " + row["question"][19:],
                        }
                    ]
                })
            except Exception as e:
                print(f"处理行时发生错误: {str(e)}")
    # Emit one JSON object per line (JSON Lines).
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
if __name__ == "__main__":
    # Hard-coded paths; switch to argparse if CLI usage is ever needed.
    input_file = "newformat_sft_test_data.csv"
    output_file = "newformat_sft_test_data--swift-pretrain.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)

80
05-data-csv-swift-sft.py Normal file
View File

@@ -0,0 +1,80 @@
import json
import csv
def convert_to_alpaca_format(input_file, output_file):
    """Read ``question``/``answer`` columns from a CSV and emit Swift SFT JSONL.

    Each CSV row becomes one JSON line::

        {"system": "你是个优秀的论文分类师",
         "conversation": [{"human": <question>, "assistant": <answer>}]}

    Rows missing either required column are skipped with a warning.

    :param input_file: path to a CSV with ``question`` and ``answer`` columns
    :param output_file: path of the JSON Lines file to write
    """
    print(f"转换数据: {input_file} -> {output_file}")
    records = []
    with open(input_file, "r", encoding="utf-8") as src:
        for row in csv.DictReader(src):
            try:
                if "question" in row and "answer" in row:
                    records.append({
                        "system": "你是个优秀的论文分类师",
                        "conversation": [
                            {"human": row["question"], "assistant": row["answer"]},
                        ],
                    })
                else:
                    print(f"警告: 数据缺少必要列: {row}")
            except Exception as err:
                print(f"处理行时发生错误: {str(err)}")
    # One JSON object per output line (JSON Lines).
    with open(output_file, "w", encoding="utf-8") as dst:
        dst.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)
    print(f"转换完成! 共转换 {len(records)} 条数据")
if __name__ == "__main__":
    # Hard-coded paths; switch to argparse if CLI usage is ever needed.
    input_file = "newformat_sft_test_data.csv"
    output_file = "newformat_sft_test_data--swift-sft.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)

View File

@@ -0,0 +1,80 @@
import json
import os
import argparse
import re
def convert_to_alpaca_format(input_file, output_file):
    """Clean Swift "messages" JSONL: drop 'with ID xxxx.xxxx,' and trim the tail.

    Each input line must look like::

        {"messages": [{"role": ..., "content": ...}]}

    The first message's content has any ``with ID <number>,`` fragment
    removed and its final 180 characters cut off; malformed lines are
    reported and skipped.  The record structure is otherwise preserved.

    :param input_file: path to the JSON Lines file to read
    :param output_file: path of the cleaned JSON Lines file to write
    """
    print(f"转换数据: {input_file} -> {output_file}")
    cleaned = []
    # Hoisted out of the loop; matches e.g. "with ID 0704.0145, ".
    id_pattern = re.compile(r'with ID \d+\.?\d*,\s*')
    with open(input_file, "r", encoding="utf-8") as src:
        for line in src:
            try:
                record = json.loads(line.strip())
                messages = record.get("messages")
                if not isinstance(messages, list):
                    print(f"警告: 数据格式不正确缺少messages字段或格式错误")
                    continue
                if not messages or "content" not in messages[0]:
                    print(f"警告: messages为空或缺少content字段")
                    continue
                text = id_pattern.sub('', messages[0]["content"])
                # NOTE(review): [:-180] yields "" for content shorter than 180
                # chars — presumably trims a fixed-length tail; confirm against
                # the actual data.
                cleaned.append({
                    "messages": [
                        {
                            "role": messages[0].get("role", "assistant"),
                            "content": text[:-180],
                        }
                    ]
                })
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as err:
                print(f"处理行时发生错误: {str(err)}")
    # Emit one JSON object per line (JSON Lines).
    with open(output_file, "w", encoding="utf-8") as dst:
        for item in cleaned:
            dst.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(cleaned)} 条数据")
if __name__ == "__main__":
    # Cleans the pre-training JSONL into a sibling file (note trailing "-").
    input_file = "arxiv-metadata-oai-snapshot--swift-pretrain.jsonl"
    output_file = "arxiv-metadata-oai-snapshot--swift-pretrain-.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)

View File

@@ -0,0 +1,97 @@
import json
import os
import argparse
def convert_to_alpaca_format(input_file, output_file):
    """Convert Swift SFT records to pre-training "messages" JSONL.

    Input (one JSON object per line)::

        {"system": ..., "conversation": [{"human": ..., "assistant": ...}]}

    Output, one line per conversation turn::

        {"messages": [{"role": "assistant",
                       "content": "This is a paper titled <title...>"}]}

    The answer ("assistant") field is dropped; ``[19:]`` strips the fixed
    prompt prefix "Based on the title " (19 characters) from the question.
    Malformed lines are reported and skipped.

    :param input_file: path to the Swift SFT JSON Lines file
    :param output_file: path of the JSON Lines file to write
    """
    print(f"转换数据: {input_file} -> {output_file}")
    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line.strip())
                # Both top-level fields are required.
                if "system" not in data or "conversation" not in data:
                    print(f"警告: 数据缺少必要字段: {data}")
                    continue
                # (Removed: an `instruction` local was computed here from
                # data["system"] but never used.)
                for turn in data["conversation"]:
                    if "human" in turn and "assistant" in turn:
                        converted_data.append({
                            "messages": [
                                {
                                    "role": "assistant",
                                    "content": "This is a paper titled " + turn["human"][19:],
                                }
                            ]
                        })
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                print(f"处理行时发生错误: {str(e)}")
    # Emit one JSON object per line (JSON Lines).
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
if __name__ == "__main__":
    # Hard-coded paths; switch to argparse if CLI usage is ever needed.
    input_file = "arxiv-metadata-oai-snapshot--swift-26.json"
    output_file = "arxiv-metadata-oai-snapshot--swift-pretrain-26.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)

View File

@@ -84,4 +84,9 @@ if __name__ == "__main__":
input_file = "arxiv-metadata-oai-snapshot--swift.json"
output_file = "arxiv-metadata-oai-snapshot--xtuner.jsonl" # 输出文件路径
convert_to_alpaca_format(input_file, output_file)
convert_to_alpaca_format(input_file, output_file)

74
05-data-xtuner-swfit.py Normal file
View File

@@ -0,0 +1,74 @@
import json
import os
import argparse
def convert_to_alpaca_format(input_file, output_file):
    """Convert Alpaca-format JSONL to Swift conversation format.

    Input (one JSON object per line)::

        {"instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
         "input": "Based on the title...",
         "output": "D"}

    Intended output::

        {"system": "你是个优秀的论文分类师",
         "conversation": [{"human": ..., "assistant": ...}]}

    NOTE(review): in the code visible here, the parse loop never appends
    anything to ``converted_data``, so the output file will always be
    empty.  Either the conversion step is missing, or it was elided from
    this view of the file — confirm against the full source.

    :param input_file: path to the Alpaca JSON Lines file
    :param output_file: path of the JSON Lines file to write
    """
    print(f"转换数据: {input_file} -> {output_file}")
    converted_data = []  # records to write; never populated in the visible code
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line.strip())  # parsed but otherwise unused here
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                print(f"处理行时发生错误: {str(e)}")
    # Write the collected records as JSON Lines.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
if __name__ == "__main__":
    # Hard-coded paths; switch to argparse if CLI usage is ever needed.
    input_file = "arxiv-metadata-oai-snapshot--swift.json"
    output_file = "arxiv-metadata-oai-snapshot--xtuner.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)

54
06-data-swift-compose.py Normal file
View File

@@ -0,0 +1,54 @@
import json
import os
import argparse
import pandas as pd
import matplotlib.pyplot as plt
def get_Composition_ratio(input_file):
    """Print and plot the per-class composition of a Swift-format JSONL dataset.

    The class label of each record is taken from
    ``conversation[0]["assistant"]``; records without that structure are
    counted under the label "未知".

    :param input_file: path to the JSON Lines file to analyse
    :return: pandas Series of per-class percentages
    """
    with open(input_file, "r", encoding="utf-8") as fh:
        records = [json.loads(row) for row in fh]
    # Extract one label per record, tolerating malformed entries.
    labels = []
    for rec in records:
        conv = rec.get("conversation")
        if isinstance(conv, list) and len(conv) > 0 and "assistant" in conv[0]:
            labels.append(conv[0]["assistant"])
        else:
            labels.append("未知")
    counts = pd.DataFrame({"label": labels})["label"].value_counts()
    # Percentage share of each class.
    ratios = counts / counts.sum() * 100
    print("类别比例和数量:")
    for category, ratio in ratios.items():
        print(f"类别 {category}: {ratio:.2f}% ({counts[category]} 条)")
    # Pie chart of the class distribution.
    plt.figure(figsize=(8, 6))
    plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
    plt.title('数据集类别比例')
    plt.show()
    return ratios
if __name__ == "__main__":
    # input_file = "sftdata.jsonl"
    # input_file = "output-26.jsonl"  # dead assignment removed: it was
    # immediately overwritten by the line below.
    input_file = "arxiv-metadata-oai-snapshot--swift-26.json"
    get_Composition_ratio(input_file)

View File

@@ -1,4 +1,3 @@
import json
import os
import argparse
@@ -22,10 +21,11 @@ def get_Composition_ratio(input_file):
"""
# 读取JSONL文件
with open(input_file, "r") as f:
data = [json.loads(line) for line in f] # 读取每一行并解析为JSON对象
with open(input_file, "r", encoding="utf-8") as f:
data = [json.loads(line) for line in f]
df = pd.DataFrame(data)
# print(df.head(5))
print("实际列名:", df.columns)
print("前几行数据:\n", df.head())
# 计算每个类别的数量
counts = df['output'].value_counts()
# 计算总数
@@ -67,7 +67,7 @@ if __name__ == "__main__":
#input_file = "arxiv-metadata-oai-snapshot--random.json" # 20000条原始数据文件路径
#input_file = "arxiv-metadata-oai-snapshot--swift.json"
input_file = "sftdata.jsonl" # 输出文件路径
input_file = "newformat_sft_test_data--xtuner.jsonl" # 输出文件路径
input_file = "arxiv-metadata-oai-snapshot--swift-26.json" # 输出文件路径
get_Composition_ratio(input_file)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff