swift
This commit is contained in:
@@ -1,4 +1,3 @@
|
||||
|
||||
import json
|
||||
import os
|
||||
import argparse
|
||||
@@ -22,10 +21,11 @@ def get_Composition_ratio(input_file):
|
||||
"""
|
||||
|
||||
# 读取JSONL文件
|
||||
with open(input_file, "r") as f:
|
||||
data = [json.loads(line) for line in f] # 读取每一行并解析为JSON对象
|
||||
with open(input_file, "r", encoding="utf-8") as f:
|
||||
data = [json.loads(line) for line in f]
|
||||
df = pd.DataFrame(data)
|
||||
# print(df.head(5))
|
||||
print("实际列名:", df.columns)
|
||||
print("前几行数据:\n", df.head())
|
||||
# 计算每个类别的数量
|
||||
counts = df['output'].value_counts()
|
||||
# 计算总数
|
||||
@@ -67,7 +67,7 @@ if __name__ == "__main__":
|
||||
#input_file = "arxiv-metadata-oai-snapshot--random.json" # 20000条原始数据文件路径
|
||||
#input_file = "arxiv-metadata-oai-snapshot--swift.json"
|
||||
input_file = "sftdata.jsonl" # 输出文件路径
|
||||
input_file = "newformat_sft_test_data--xtuner.jsonl" # 输出文件路径
|
||||
input_file = "arxiv-metadata-oai-snapshot--swift-26.json" # 输出文件路径
|
||||
|
||||
get_Composition_ratio(input_file)
|
||||
|
||||
|
Reference in New Issue
Block a user