Add data-processing scripts: filter the raw data, sample it, and convert it to Alpaca format

This commit is contained in:
glowzz 2025-06-09 14:39:07 +08:00
parent 40c5dee22c
commit 24abc7aab3
8 changed files with 438 additions and 0 deletions

01-pre.py Normal file

@@ -0,0 +1,29 @@
import json

# Category keywords to keep
target_categories = {
    "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",
    "cs.CL", "cs.CV", "cs.LG",
    "gr-qc", "hep-ph", "hep-th", "quant-ph"
}

input_path = "arxiv-metadata-oai-snapshot.json"     # path to the raw data
output_path = "arxiv-metadata-oai-snapshot--.json"  # output path (JSON Lines format)

count = 0
with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
    for line in infile:
        try:
            record = json.loads(line)
            record_cats = record.get("categories", "").split()
            if record_cats:
                # Key the filter on the last space-separated category
                last_cat = record_cats[-1]
                if last_cat in target_categories:
                    outfile.write(json.dumps(record) + '\n')
                    count += 1
        except json.JSONDecodeError:
            continue  # skip malformed lines

print(f"Filtering done: saved {count} records to {output_path}")


@@ -0,0 +1,26 @@
import json

input_path = "arxiv-metadata-oai-snapshot--.json"          # data filtered in the previous step
output_path = "arxiv-metadata-oai-snapshot-date-len.json"  # high-quality output

count = 0
with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
    for line in infile:
        try:
            record = json.loads(line)
            # Fetch the update date and the abstract
            update_date = record.get("update_date", "")
            abstract = record.get("abstract", "")
            # Filter conditions; tune these to your model's limits
            if 300 <= len(abstract) <= 4096:
                if update_date and int(update_date[:4]) >= 2020:
                    outfile.write(json.dumps(record) + '\n')
                    count += 1
        except json.JSONDecodeError:
            continue  # skip malformed lines

print(f"Quality filtering done: kept {count} records in {output_path}")

03-data_select_random.py Normal file

@@ -0,0 +1,22 @@
import json
import random

input_path = "arxiv-metadata-oai-snapshot-date-len.json"
output_path = "arxiv-metadata-oai-snapshot--random.json"
sample_size = 10000  # change to another number if needed

# Load everything into memory first; ~300k records is acceptable
with open(input_path, 'r') as infile:
    data = [json.loads(line) for line in infile]
print(f"Original record count: {len(data)}")

random.seed(42)  # random seed; pick any value you like

# Random sampling
sampled_data = random.sample(data, sample_size)

# Save the result
with open(output_path, 'w') as outfile:
    for record in sampled_data:
        outfile.write(json.dumps(record) + '\n')
print(f"Randomly sampled {sample_size} records into {output_path}")

03-data_select_ratio.py Normal file

@@ -0,0 +1,61 @@
import json
import random

input_path = "arxiv-metadata-oai-snapshot-date-len.json"
output_path = "arxiv-metadata-oai-snapshot--ratio.json"
sample_size = 2000  # change to another number if needed

# Load everything into memory first; ~300k records is acceptable
with open(input_path, 'r') as infile:
    data = [json.loads(line) for line in infile]
print(f"Original record count: {len(data)}")

## Select per category instead of uniformly at random:
## each category is assigned a share of the sample
category_proportions = {
    'astro-ph': 0.1,
    'cond-mat.mes-hall': 0.1,
    'cond-mat.mtrl-sci': 0.1,
    'cs.CL': 0.1,
    'cs.CV': 0.1,
    'cs.LG': 0.1,
    'gr-qc': 0.1,
    'hep-ph': 0.1,
    'hep-th': 0.1,
    'quant-ph': 0.1
}

## Print each category's share and target count
print("Per-category share and target count:")
for category, proportion in category_proportions.items():
    count = int(sample_size * proportion)
    print(f"Category {category}: share {proportion}, count {count}")

# Sample each category up to its target count
filtered_data = []
for category, proportion in category_proportions.items():
    count = int(sample_size * proportion)
    # Records whose (single) category matches the current one
    category_data = [item for item in data if item.get('categories', '').strip() == category]
    # If the category holds fewer records than requested, take them all
    if len(category_data) < count:
        filtered_data.extend(category_data)
    else:
        # Otherwise sample the requested number at random
        sampled_data = random.sample(category_data, count)
        filtered_data.extend(sampled_data)
    print(f"Category {category}: sampled {count}")

# Save the result
with open(output_path, 'w') as outfile:
    for record in filtered_data:
        outfile.write(json.dumps(record) + '\n')
print(f"Sampled {len(filtered_data)} records (target {sample_size}) by ratio into {output_path}")

04-data2swift.py Normal file

@@ -0,0 +1,70 @@
import json
import random

input_file = "arxiv-metadata-oai-snapshot--ratio.json"  # path to the ~20000-record source data file
output_file = "arxiv-metadata-oai-snapshot--swift.json"

# Map each category to its answer option
label_map = {
    "astro-ph": "A",
    "cond-mat.mes-hall": "B",
    "cond-mat.mtrl-sci": "C",
    "cs.CL": "D",
    "cs.CV": "E",
    "cs.LG": "F",
    "gr-qc": "G",
    "hep-ph": "H",
    "hep-th": "I",
    "quant-ph": "J"
}

options_text = (
    "\n\nA. astro-ph\nB. cond-mat.mes-hall\nC. cond-mat.mtrl-sci\nD. cs.CL\n"
    "E. cs.CV\nF. cs.LG\nG. gr-qc\nH. hep-ph\nI. hep-th\nJ. quant-ph"
)

# Read all records
with open(input_file, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f]

# To subsample (e.g. 1000 records), seed the RNG and use random.sample here
# random.seed(42)
sampled = data

with open(output_file, 'w', encoding='utf-8') as f_out:
    count = 0
    for item in sampled:
        # For multi-category records, take the last category (space-separated)
        categories_str = item.get("categories", "").strip()
        if not categories_str:
            continue
        last_category = categories_str.split()[-1]
        if last_category not in label_map:
            continue
        title = item.get("title", "").replace("\n", " ").strip()
        authors = item.get("authors", "").replace("\n", " ").strip()
        abstract = item.get("abstract", "").replace("\n", " ").strip()
        if not title or not authors or not abstract:
            continue
        human_text = (
            f"Based on the title '{title}', authors '{authors}', and abstract '{abstract}', "
            f"please determine the scientific category of this paper.{options_text}"
        )
        finetune_sample = {
            # system prompt; it reads, roughly: "You are an excellent paper classifier"
            "system": "你是个优秀的论文分类师",
            "conversation": [
                {
                    "human": human_text,
                    "assistant": label_map[last_category]
                }
            ]
        }
        f_out.write(json.dumps(finetune_sample, ensure_ascii=False) + "\n")
        count += 1

print(f"Conversion done: generated {count} fine-tuning samples, saved to {output_file}")

05-data-csv-xtuner.py Normal file

@@ -0,0 +1,68 @@
import json
import csv


def convert_to_alpaca_format(input_file, output_file):
    """
    Read a CSV file, pull out the question and answer columns, and convert
    each row to Alpaca format.

    Input CSV columns:
        question,A,B,C,D,E,F,G,H,I,J,answer

    Output format (Alpaca):
        {
            "instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
            "input": "Based on the title...",
            "output": "D"
        }
    (The instruction literal reads, roughly: "Determine the paper's scientific
    category from its title, authors, and abstract.")
    """
    print(f"Converting data: {input_file} -> {output_file}")
    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        csv_reader = csv.DictReader(f)
        for row in csv_reader:
            try:
                # Make sure the required columns are present
                if "question" not in row or "answer" not in row:
                    print(f"Warning: row is missing required columns: {row}")
                    continue
                # Build the Alpaca-format record
                new_data = {
                    "instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
                    "input": row["question"],
                    "output": row["answer"]
                }
                converted_data.append(new_data)
            except Exception as e:
                print(f"Error while processing a row: {str(e)}")
    # Write the output file
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"Conversion done! Converted {len(converted_data)} records")


if __name__ == "__main__":
    # parser = argparse.ArgumentParser(description="Convert data to Alpaca format")
    # parser.add_argument(
    #     "--input",
    #     type=str,
    #     required=True,
    #     help="input file path (swift_formatted_sft_train_data.jsonl)",
    # )
    # parser.add_argument("--output", type=str, required=True, help="output file path")
    # args = parser.parse_args()
    #input_file = "arxiv-metadata-oai-snapshot--random.json"  # ~20000-record source data file
    input_file = "newformat_sft_test_data.csv"
    output_file = "newformat_sft_test_data--xtuner.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)
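For reference, here is how DictReader sees a row with the documented header; the two-line CSV below is hypothetical, made up only to illustrate the expected columns:

import csv
import io

# hypothetical sample matching the question,A..J,answer header
sample = (
    "question,A,B,C,D,E,F,G,H,I,J,answer\n"
    '"Based on the title ...",astro-ph,cond-mat.mes-hall,cond-mat.mtrl-sci,'
    "cs.CL,cs.CV,cs.LG,gr-qc,hep-ph,hep-th,quant-ph,D\n"
)
row = next(csv.DictReader(io.StringIO(sample)))
print(row["question"], "->", row["answer"])  # Based on the title ... -> D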

05-data-swfit-xtuner.py Normal file

@@ -0,0 +1,87 @@
import json
import argparse


def convert_to_alpaca_format(input_file, output_file):
    """
    Convert Swift-format data to Alpaca format.

    Input format:
        {
            "system": "你是个优秀的论文分类师",
            "conversation": [
                {
                    "human": "Based on the title...",
                    "assistant": "D"
                }
            ]
        }

    Output format (Alpaca):
        {
            "instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
            "input": "Based on the title...",
            "output": "D"
        }
    """
    print(f"Converting data: {input_file} -> {output_file}")
    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line.strip())
                # Check the record structure
                if "system" not in data or "conversation" not in data:
                    print(f"Warning: record is missing required fields: {data}")
                    continue
                # Use the system prompt as the instruction
                instruction = data.get("system", "")
                if not instruction:
                    # fallback instruction, roughly: "Determine the paper's
                    # scientific category from its title, authors, and abstract."
                    instruction = "根据论文的标题、作者和摘要,确定该论文的科学类别。"
                # Flatten each conversation turn into one Alpaca record
                for turn in data["conversation"]:
                    if "human" in turn and "assistant" in turn:
                        new_data = {
                            "instruction": instruction,
                            "input": turn["human"],
                            "output": turn["assistant"],
                        }
                        converted_data.append(new_data)
            except json.JSONDecodeError:
                print(f"Warning: cannot parse JSON line: {line}")
            except Exception as e:
                print(f"Error while processing a line: {str(e)}")
    # Write the output file
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"Conversion done! Converted {len(converted_data)} records")


if __name__ == "__main__":
    # parser = argparse.ArgumentParser(description="Convert data to Alpaca format")
    # parser.add_argument(
    #     "--input",
    #     type=str,
    #     required=True,
    #     help="input file path (swift_formatted_sft_train_data.jsonl)",
    # )
    # parser.add_argument("--output", type=str, required=True, help="output file path")
    # args = parser.parse_args()
    #input_file = "arxiv-metadata-oai-snapshot--random.json"  # ~20000-record source data file
    input_file = "arxiv-metadata-oai-snapshot--swift.json"
    output_file = "arxiv-metadata-oai-snapshot--xtuner.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)
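If the hard-coded paths become inconvenient, the commented-out argparse block can be restored; a minimal sketch of that __main__ body might look like:

import argparse

parser = argparse.ArgumentParser(description="Convert Swift-format data to Alpaca format")
parser.add_argument("--input", type=str, required=True, help="input JSONL file path")
parser.add_argument("--output", type=str, required=True, help="output JSONL file path")
args = parser.parse_args()
convert_to_alpaca_format(args.input, args.output)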

06-data-xtuner-compose.py Normal file

@@ -0,0 +1,75 @@
import json
import argparse
import pandas as pd
import matplotlib.pyplot as plt


def get_Composition_ratio(input_file):
    """
    Expected input format (Alpaca):
        {
            "instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
            "input": "Based on the title...",
            "output": "D"
        }

    Compute the dataset's per-category composition and print it.
    :param input_file: path to the input JSONL file
    """
    # Read the JSONL file, one JSON object per line
    with open(input_file, "r") as f:
        data = [json.loads(line) for line in f]
    df = pd.DataFrame(data)
    # print(df.head(5))

    # Count records per category label
    counts = df['output'].value_counts()
    # Total record count
    total = counts.sum()
    # Per-category share (percent)
    ratios = counts / total * 100

    print("Per-category share and count:")
    for category, ratio in ratios.items():
        print(f"Category {category}: {ratio:.2f}% ({counts[category]} records)")

    # Draw a pie chart
    plt.figure(figsize=(8, 6))
    plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
    plt.title('Dataset category composition')
    plt.show()
    return ratios


if __name__ == "__main__":
    # parser = argparse.ArgumentParser(description="Convert data to Alpaca format")
    # parser.add_argument(
    #     "--input",
    #     type=str,
    #     required=True,
    #     help="input file path (swift_formatted_sft_train_data.jsonl)",
    # )
    # parser.add_argument("--output", type=str, required=True, help="output file path")
    # args = parser.parse_args()
    #input_file = "arxiv-metadata-oai-snapshot--random.json"  # ~20000-record source data file
    #input_file = "arxiv-metadata-oai-snapshot--swift.json"
    #input_file = "sftdata.jsonl"
    input_file = "newformat_sft_test_data--xtuner.jsonl"  # input file path
    get_Composition_ratio(input_file)
    #convert_to_alpaca_format(input_file, output_file)
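On a headless machine plt.show() displays nothing; swapping it for plt.savefig writes the chart to disk instead. A sketch of the plotting tail of get_Composition_ratio under that assumption (the file name composition.png is arbitrary):

plt.figure(figsize=(8, 6))
plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
plt.title('Dataset category composition')
# write the chart to disk instead of opening a window
plt.savefig("composition.png", dpi=150, bbox_inches="tight")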