Files
data-prepare/06-data-swift-compose.py

55 lines
1.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import argparse
import pandas as pd
import matplotlib.pyplot as plt
def _extract_label(item):
    """Return the category label of one parsed record, or "未知" if it has none.

    The label is read from ``item["conversation"][0]["assistant"]``.
    """
    if not isinstance(item, dict):
        return "未知"
    conv = item.get("conversation")
    if not isinstance(conv, list) or not conv:
        return "未知"
    first_turn = conv[0]
    # Require a dict: on a string, ``"assistant" in first_turn`` would be a
    # substring test and the subsequent indexing would raise TypeError.
    if isinstance(first_turn, dict) and "assistant" in first_turn:
        return first_turn["assistant"]
    return "未知"


def get_Composition_ratio(input_file, *, show_plot=True):
    """Compute the category composition of a JSONL dataset, print it and plot it.

    Each non-blank line of ``input_file`` is parsed as one JSON object and its
    label is taken from ``conversation[0]["assistant"]``; records without that
    field are counted under the label "未知".

    :param input_file: path of the JSONL file (UTF-8, one JSON object per line)
    :param show_plot: when True (the default) draw a pie chart of the ratios
        and call ``plt.show()``; pass False to skip all plotting
    :return: pandas Series indexed by label holding each label's share in
        percent (0-100), ordered as produced by ``value_counts()``
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    # Extract labels while streaming the file so that only the labels, not
    # every parsed record, are kept in memory.
    labels = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            labels.append(_extract_label(json.loads(line)))

    df = pd.DataFrame({"label": labels})
    # Number of records per category.
    counts = df['label'].value_counts()
    total = counts.sum()
    # Share of each category in percent.
    ratios = counts / total * 100

    print("类别比例和数量:")
    # ``ratios`` and ``counts`` share the same index order, so pair them
    # positionally instead of looking counts up by label.
    for (category, ratio), count in zip(ratios.items(), counts):
        print(f"类别 {category}: {ratio:.2f}% ({count} 条)")

    # Nothing to draw for an empty dataset.
    if show_plot and not ratios.empty:
        plt.figure(figsize=(8, 6))
        plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
        plt.title('数据集类别比例')
        plt.show()
    return ratios
if __name__ == "__main__":
    # The dataset path may be given on the command line; with no argument the
    # script falls back to the path that used to be hard-coded here.
    parser = argparse.ArgumentParser(
        description="Print and plot the category composition of a JSONL dataset."
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        default="G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot-multi-batch1.json",
        help="path to the JSONL file to analyse (one JSON object per line)",
    )
    args = parser.parse_args()
    get_Composition_ratio(args.input_file)