75 lines
2.0 KiB
Python
75 lines
2.0 KiB
Python
|
|
import json
|
|
import os
|
|
import argparse
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
def get_Composition_ratio(input_file):
    """
    Compute the share of each category in a JSONL dataset, print it and plot it.

    Every non-blank line of ``input_file`` is parsed as one JSON object. The
    records are expected to follow the Alpaca layout, for example::

        {
            "instruction": "Determine the scientific category of the paper
                            from its title, authors and abstract.",
            "input": "Based on the title...",
            "output": "D"
        }

    Only the ``output`` field is read; its distinct values are the categories.
    The percentage and record count of each category are printed, and a pie
    chart of the percentages is shown with matplotlib.

    :param input_file: path to the input JSONL file (read as UTF-8)
    :return: pandas Series indexed by category whose values are percentages
        (0-100) of the records carrying that category
    :raises KeyError: if no record in the file has an ``output`` field
        (this includes an empty file)
    """
    # Decode as UTF-8 explicitly: the records may contain non-ASCII text and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(input_file, "r", encoding="utf-8") as f:
        # Skip blank lines so stray empty lines do not break json.loads.
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)
    if "output" not in df.columns:
        raise KeyError(
            f"no 'output' field found in any record of {input_file!r}"
        )

    # Number of records per category.
    counts = df["output"].value_counts()
    # Total number of records that carry a category.
    total = counts.sum()

    # Share of each category, expressed as a percentage.
    ratios = counts / total * 100

    # Report percentage and absolute count for each category.
    print("类别比例和数量:")
    for category, ratio in ratios.items():
        print(f"类别 {category}: {ratio:.2f}% ({counts[category]} 条)")

    # Draw the pie chart of the category shares.
    plt.figure(figsize=(8, 6))
    plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
    plt.title('数据集类别比例')
    plt.show()

    return ratios
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # The dataset path may be given with --input; when it is omitted the
    # script analyses the file it was previously hard-wired to.
    # Other files named in earlier revisions of this script:
    #   arxiv-metadata-oai-snapshot--random.json  (20000 raw records)
    #   arxiv-metadata-oai-snapshot--swift.json
    #   sftdata.jsonl
    parser = argparse.ArgumentParser(
        description="Print and plot the category composition of a JSONL dataset."
    )
    parser.add_argument(
        "--input",
        type=str,
        default="newformat_sft_test_data--xtuner.jsonl",
        help="path to the input JSONL file (default: %(default)s)",
    )
    args = parser.parse_args()

    get_Composition_ratio(args.input)