2025-07-18 18:00:04 +08:00
|
|
|
|
import json
|
|
|
|
|
import os
|
|
|
|
|
import argparse
|
|
|
|
|
import pandas as pd
|
|
|
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
|
|
def _extract_label(item):
    """Return the category label of one parsed record.

    The label is read from ``item["conversation"][0]["assistant"]``. Any record
    that does not have that shape (not a dict, ``conversation`` missing or not a
    list, empty list, first turn not a dict or lacking ``assistant``) yields the
    placeholder label "未知".
    """
    unknown = "未知"
    if not isinstance(item, dict):
        return unknown
    conv = item.get("conversation")
    if not isinstance(conv, list) or not conv:
        return unknown
    first_turn = conv[0]
    # Require a dict here: on a plain string, `"assistant" in first_turn` would
    # be a substring test and the subsequent lookup would raise TypeError.
    if isinstance(first_turn, dict) and "assistant" in first_turn:
        return first_turn["assistant"]
    return unknown


def get_Composition_ratio(input_file):
    """Compute, print and plot the category composition of a JSONL dataset.

    Each non-blank line of the file is parsed as one JSON record, and its
    category label is taken from ``conversation[0]["assistant"]`` (records
    without that field are counted under "未知"). The per-category percentage
    and count are printed, and a pie chart of the percentages is shown.

    :param input_file: path to the input JSONL file (read as UTF-8)
    :return: pandas Series of percentages (0-100), indexed by category label
    :raises OSError: if the file cannot be opened
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    # Read the JSONL file; blank lines carry no record and are skipped so they
    # do not abort the whole run with a JSONDecodeError.
    with open(input_file, "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    # One label per record.
    labels = [_extract_label(item) for item in records]
    df = pd.DataFrame({"label": labels})

    # Number of records per category, and each category's share in percent.
    counts = df["label"].value_counts()
    total = counts.sum()
    ratios = counts / total * 100

    # Print the share and the absolute count of every category.
    print("类别比例和数量:")
    for category, ratio in ratios.items():
        print(f"类别 {category}: {ratio:.2f}% ({counts[category]} 条)")

    # Draw the pie chart.
    plt.figure(figsize=(8, 6))
    plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
    plt.title('数据集类别比例')
    plt.show()
    return ratios
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # The dataset path may be given on the command line; when omitted, the
    # previously hard-coded path is used so a bare invocation behaves as before.
    parser = argparse.ArgumentParser(
        description="Print and plot the category composition of a JSONL dataset."
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        default="G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot-multi-batch1.json",
        help="path to the JSONL file to analyse (default: %(default)s)",
    )
    args = parser.parse_args()
    get_Composition_ratio(args.input_file)
|