import argparse
import json
import os
import warnings

import pandas as pd

try:
    import matplotlib.pyplot as plt
except ImportError:  # plotting is optional; the statistics work without it
    plt = None

# Label recorded for records that carry no usable class label.
_UNKNOWN_LABEL = "未知"

# Input analysed when the script is started without a path argument.
_DEFAULT_INPUT_FILE = "G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot-multi-batch1.json"


def _extract_label(item):
    """Return the class label of one parsed record, or the unknown marker.

    The label is read from ``item["conversation"][0]["assistant"]``. Any
    record that does not have that shape (not a dict, no ``conversation``
    list, empty list, first turn not a dict or lacking ``assistant``) is
    mapped to the unknown marker instead of raising.
    """
    if not isinstance(item, dict):
        return _UNKNOWN_LABEL
    conversation = item.get("conversation")
    if not isinstance(conversation, list) or not conversation:
        return _UNKNOWN_LABEL
    first_turn = conversation[0]
    # Guard against a non-dict first turn: on a str, ``in`` would be a
    # substring test and the subsequent indexing would raise TypeError.
    if isinstance(first_turn, dict) and "assistant" in first_turn:
        return first_turn["assistant"]
    return _UNKNOWN_LABEL


def _read_labels(input_file):
    """Stream a JSONL file and return the label of every non-blank line."""
    labels = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at EOF) are not records.
            if not line.strip():
                continue
            # Keep only the label so the parsed record can be freed at once;
            # the input may be far larger than the labels themselves.
            labels.append(_extract_label(json.loads(line)))
    return labels


def _plot_ratios(ratios):
    """Show a pie chart of the per-class percentages."""
    # NOTE(review): the title and the unknown label are CJK text; they
    # presumably need a matplotlib font with CJK coverage to render - confirm.
    plt.figure(figsize=(8, 6))
    plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
    plt.title('数据集类别比例')
    plt.show()


def get_Composition_ratio(input_file, *, show_plot=True):
    """Compute and print the class composition of a JSONL dataset.

    Each non-blank line of ``input_file`` is parsed as JSON and its label is
    taken from ``conversation[0]["assistant"]``; records without that field
    are counted under the label "未知".

    :param input_file: path to the UTF-8 encoded JSONL file to analyse
    :param show_plot: when true (the default), also display a pie chart of
        the ratios; pass ``False`` for headless or batch use
    :return: ``pandas.Series`` mapping each label to its share of the
        dataset in percent, ordered by descending count
    :raises OSError: if ``input_file`` cannot be opened
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    labels = _read_labels(input_file)
    df = pd.DataFrame({"label": labels})

    # Occurrences per class and their share of the whole dataset in percent.
    counts = df["label"].value_counts()
    total = counts.sum()
    ratios = counts / total * 100

    print("类别比例和数量:")
    for (category, count), ratio in zip(counts.items(), ratios):
        print(f"类别 {category}: {ratio:.2f}% ({count} 条)")

    # Nothing to draw for an empty dataset.
    if show_plot and not ratios.empty:
        if plt is None:
            warnings.warn(
                "matplotlib is not installed; skipping the pie chart",
                RuntimeWarning,
                stacklevel=2,
            )
        else:
            _plot_ratios(ratios)

    return ratios


def _parse_args(argv=None):
    """Parse the command line; the input path defaults to the built-in one."""
    parser = argparse.ArgumentParser(
        description="Print and plot the class composition of a JSONL dataset."
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        default=_DEFAULT_INPUT_FILE,
        help="path to the JSONL file (default: %(default)s)",
    )
    parser.add_argument(
        "--no-plot",
        action="store_true",
        help="only print the statistics, do not open the pie chart",
    )
    return parser.parse_args(argv)


if __name__ == "__main__":
    args = _parse_args()
    get_Composition_ratio(args.input_file, show_plot=not args.no_plot)