添加数据处理脚本，支持从原始数据筛选、抽样到转换为Alpaca格式

2025-06-09 14:39:07 +08:00
parent 40c5dee22c
commit 24abc7aab3
8 changed files with 438 additions and 0 deletions
--- a/06-data-xtuner-compose.py
+++ b/06-data-xtuner-compose.py
@@ -0,0 +1,75 @@
+      
+import json
+import os
+import argparse
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+
+def get_Composition_ratio(input_file):
+    """Report the category distribution of a JSONL dataset.
+
+    Each non-blank line of ``input_file`` is parsed as one JSON object.
+    Records are expected to be in Alpaca format, for example::
+
+        {
+            "instruction": "Determine the scientific category of the paper
+                            from its title, authors and abstract.",
+            "input": "Based on the title...",
+            "output": "D"
+        }
+
+    The ``output`` field is used as the category label. The percentage and
+    count of every label are printed, a pie chart of the percentages is
+    shown, and the percentages are returned.
+
+    :param input_file: path of the input JSONL file
+    :return: pandas Series indexed by category label; values are
+        percentages in the range 0-100
+    :raises KeyError: if no parsed record provides an ``output`` field
+        (this includes an empty input file)
+    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
+    """
+    # Decode explicitly as UTF-8: records may contain non-ASCII text (the
+    # sample instruction is Chinese) and the platform default encoding is
+    # not guaranteed to be UTF-8.
+    with open(input_file, "r", encoding="utf-8") as f:
+        # Skip blank lines (e.g. a trailing newline at end of file), which
+        # json.loads would otherwise reject.
+        data = [json.loads(line) for line in f if line.strip()]
+    df = pd.DataFrame(data)
+
+    # Number of records per category label.
+    counts = df["output"].value_counts()
+    total = counts.sum()
+
+    # Share of each category, expressed as a percentage.
+    ratios = counts / total * 100
+
+    print("类别比例和数量:")
+    for category, ratio in ratios.items():
+        print(f"类别 {category}: {ratio:.2f}% ({counts[category]} 条)")
+
+    # NOTE(review): the chart title is CJK text; confirm that a font with
+    # CJK glyphs is configured for matplotlib, otherwise it may not render.
+    plt.figure(figsize=(8, 6))
+    plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
+    plt.title('数据集类别比例')
+    plt.show()
+    return ratios
+
+        
+
+
+
+
+
+
+
+if __name__ == "__main__":
+    # Input files referenced by earlier revisions of this block:
+    #   arxiv-metadata-oai-snapshot--random.json  (20000 raw records)
+    #   arxiv-metadata-oai-snapshot--swift.json
+    #   sftdata.jsonl
+    #   newformat_sft_test_data--xtuner.jsonl     (kept as the default)
+    parser = argparse.ArgumentParser(
+        description="Print and plot the category distribution of a JSONL dataset."
+    )
+    parser.add_argument(
+        "--input",
+        type=str,
+        # Same file the script used when the path was hard-coded, so running
+        # it without arguments behaves exactly as before.
+        default="newformat_sft_test_data--xtuner.jsonl",
+        help="path of the input JSONL file (default: %(default)s)",
+    )
+    args = parser.parse_args()
+
+    get_Composition_ratio(args.input)