Files
data-prepare/06-data-swift-compose.py

55 lines
1.7 KiB
Python
Raw Normal View History

2025-07-18 18:00:04 +08:00
import json
import os
import argparse
import pandas as pd
import matplotlib.pyplot as plt
def _extract_label(item):
    """Return the category label of one parsed record.

    The label is read from ``item["conversation"][0]["assistant"]``. Any record
    that does not have that shape (not a dict, no ``conversation`` list, empty
    list, first entry not a dict or lacking ``assistant``) yields the
    placeholder string "未知" ("unknown").
    """
    if not isinstance(item, dict):
        return "未知"
    conv = item.get("conversation")
    if not isinstance(conv, list) or not conv:
        return "未知"
    first_turn = conv[0]
    # Guard the type: a plain ``in`` test on a str would be a substring check
    # and the subsequent ["assistant"] lookup would raise TypeError.
    if isinstance(first_turn, dict) and "assistant" in first_turn:
        return first_turn["assistant"]
    return "未知"


def get_Composition_ratio(input_file):
    """Print and plot the category composition of a JSONL dataset.

    Each non-blank line of ``input_file`` is parsed as one JSON record and its
    label is taken from ``conversation[0]["assistant"]``; records without that
    field are counted under the placeholder label "未知" ("unknown"). The
    percentage and count of every label are printed, a pie chart is drawn with
    matplotlib, and the percentages are returned.

    :param input_file: path to the input JSONL file (read as UTF-8)
    :return: pandas Series of percentages (0-100) indexed by label, in the
        order produced by ``value_counts`` (most frequent first)
    :raises OSError: if the file cannot be opened
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    # Stream the file and keep only the labels, so the parsed records do not
    # all have to be held in memory at once. Blank lines (such as a trailing
    # newline) are skipped because json.loads rejects empty input.
    with open(input_file, "r", encoding="utf-8") as f:
        labels = [_extract_label(json.loads(line)) for line in f if line.strip()]

    df = pd.DataFrame({"label": labels})

    # Number of records per category, and each category's share in percent.
    counts = df['label'].value_counts()
    total = counts.sum()
    ratios = counts / total * 100

    # Report every category's share and absolute count.
    print("类别比例和数量:")
    for category, ratio in ratios.items():
        print(f"类别 {category}: {ratio:.2f}% ({counts[category]} 条)")

    # Draw the pie chart of the category shares.
    plt.figure(figsize=(8, 6))
    plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
    plt.title('数据集类别比例')
    plt.show()
    return ratios
if __name__ == "__main__":
    # Script entry point. The dataset path may be given as the first
    # command-line argument; when omitted, the previously hard-coded path is
    # used, so running the script with no arguments behaves as before.
    parser = argparse.ArgumentParser(
        description="Print and plot the category composition of a JSONL dataset."
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        default="G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot-multi-batch1.json",
        help="path to the JSONL file to analyse (default: %(default)s)",
    )
    args = parser.parse_args()
    get_Composition_ratio(args.input_file)