This commit is contained in:
2025-07-18 18:00:04 +08:00
parent 24abc7aab3
commit 563f16f0c5
15 changed files with 25541 additions and 41 deletions

View File

@@ -1,14 +1,46 @@
import json
# 要保留的类别关键词
# target_categories = {
# "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",
# "cs.CL", "cs.CV", "cs.LG",
# "gr-qc", "hep-ph", "hep-th", "quant-ph"
# }
target_categories = {
"astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",
"cs.CL", "cs.CV", "cs.LG",
"gr-qc", "hep-ph", "hep-th", "quant-ph"
}
'quant-ph',
'physics.chem-ph',
'physics.atom-ph',
'cond-mat.soft',
'cs.RO',
'cs.CL',
'cs.SE',
'cs.IR',
'hep-th',
'hep-ph',
'physics.optics',
'cs.AI',
'cs.CV',
'nucl-th',
'astro-ph',
'math.PR',
'cs.OS',
'eess.SP',
'math.OC',
'math.DS',
'math.DG',
'math.MP',
'cs.MM',
'stat.ME',
'math.CO',
'cs.NE'
}
input_path = "arxiv-metadata-oai-snapshot.json"#原数据路径
output_path = "arxiv-metadata-oai-snapshot--.json" # 使用 JSON Lines 格式输出路径
output_path = "arxiv-metadata-oai-snapshot--26.json" # 使用 JSON Lines 格式输出路径
count = 0

View File

@@ -1,9 +1,9 @@
import json
import random
input_path = "arxiv-metadata-oai-snapshot-date-len.json"
output_path = "arxiv-metadata-oai-snapshot--ratio.json"
sample_size = 2000 # 你可以改成 10000 等其他数字
input_path = "arxiv-metadata-oai-snapshot--26.json"
output_path = "arxiv-metadata-oai-snapshot--26-500.json"
sample_size = 4000 # 你可以改成 10000 等其他数字
@@ -15,18 +15,50 @@ print(f"原始数据量:{len(data)} 条")
## 按类别筛选数据,不是随机
## 每个类别指定抽取的比例
# category_proportions = {
# 'astro-ph': 0.1336,
# 'cond-mat.mes-hall': 0.0486,
# 'cond-mat.mtrl-sci': 0.0587,
# 'cs.CL': 0.085,
# 'cs.CV': 0.0931,
# 'cs.LG': 0.0992,
# 'gr-qc': 0.1174,
# 'hep-ph': 0.1194,
# 'hep-th': 0.085,
# 'quant-ph': 0.1599
# }
category_proportions = {
'astro-ph': 0.1,
'cond-mat.mes-hall': 0.1,
'cond-mat.mtrl-sci': 0.1,
'cs.CL': 0.1,
'cs.CV': 0.1,
'cs.LG': 0.1,
'gr-qc': 0.1,
'hep-ph': 0.1,
'hep-th': 0.1,
'quant-ph': 0.1
}
'quant-ph': 0.1,
'physics.chem-ph': 0.1,
'physics.atom-ph': 0.1,
'cond-mat.soft': 0.1,
'cs.RO': 0.1,
'cs.CL': 0.1,
'cs.SE': 0.1,
'cs.IR': 0.1,
'hep-th': 0.1,
'hep-ph': 0.1,
'physics.optics': 0.1,
'cs.AI': 0.1,
'cs.CV': 0.1,
'nucl-th': 0.1,
'astro-ph': 0.1,
'math.PR': 0.1,
'cs.OS': 0.1,
'eess.SP': 0.1,
'math.OC': 0.1,
'math.DS': 0.1,
'math.DG': 0.1,
'math.MP': 0.1,
'cs.MM': 0.1,
'stat.ME': 0.1,
'math.CO': 0.1,
'cs.NE': 0.1
}
## print 每个类别的筛选比例和数量
print("每个类别的筛选比例和数量:")
for category, proportion in category_proportions.items():

View File

@@ -1,27 +1,48 @@
import json
import random
input_file = "arxiv-metadata-oai-snapshot--ratio.json" # 20000条原始数据文件路径
output_file = "arxiv-metadata-oai-snapshot--swift.json"
input_file = "arxiv-metadata-oai-snapshot--26-500.json" # 20000条原始数据文件路径
output_file = "arxiv-metadata-oai-snapshot--swift-26-500.json"
# 类别对应选项映射
label_map = {
"astro-ph": "A",
"cond-mat.mes-hall": "B",
"cond-mat.mtrl-sci": "C",
"cs.CL": "D",
"cs.CV": "E",
"cs.LG": "F",
"gr-qc": "G",
"hep-ph": "H",
"hep-th": "I",
"quant-ph": "J"
'quant-ph': 'A',
'physics.chem-ph': 'B',
'physics.atom-ph': 'C',
'cond-mat.soft': 'D',
'cs.RO': 'E',
'cs.CL': 'F',
'cs.SE': 'G',
'cs.IR': 'H',
'hep-th': 'I',
'hep-ph': 'J',
'physics.optics': 'K',
'cs.AI': 'L',
'cs.CV': 'M',
'nucl-th': 'N',
'astro-ph': 'O',
'math.PR': 'P',
'cs.OS': 'Q',
'eess.SP': 'R',
'math.OC': 'S',
'math.DS': 'T',
'math.DG': 'U',
'math.MP': 'V',
'cs.MM': 'W',
'stat.ME': 'X',
'math.CO': 'Y',
'cs.NE': 'Z'
}
options_text = (
"\n\nA. astro-ph\nB. cond-mat.mes-hall\nC. cond-mat.mtrl-sci\nD. cs.CL\n"
"E. cs.CV\nF. cs.LG\nG. gr-qc\nH. hep-ph\nI. hep-th\nJ. quant-ph"
)
options = [
"A. quant-ph", "B. physics.chem-ph", "C. physics.atom-ph", "D. cond-mat.soft",
"E. cs.RO", "F. cs.CL", "G. cs.SE", "H. cs.IR", "I. hep-th", "J. hep-ph",
"K. physics.optics", "L. cs.AI", "M. cs.CV", "N. nucl-th", "O. astro-ph",
"P. math.PR", "Q. cs.OS", "R. eess.SP", "S. math.OC", "T. math.DS",
"U. math.DG", "V. math.MP", "W. cs.MM", "X. stat.ME", "Y. math.CO", "Z. cs.NE"
]
options_text = "\n".join(options)
# 读取所有数据
with open(input_file, 'r', encoding='utf-8') as f:

View File

@@ -0,0 +1,81 @@
import json
import csv
def convert_to_alpaca_format(input_file, output_file):
    """Convert a CSV of questions into Swift pre-training "messages" JSONL.

    Reads the ``question`` column of each CSV row and writes one JSON
    object per line of the form::

        {"messages": [{"role": "assistant",
                       "content": "This is a paper titled <title...>"}]}

    Note: despite the historical function name, the output is Swift
    "messages" format, not Alpaca format.  The ``answer`` column is only
    checked for presence and is never emitted.

    :param input_file: path to a CSV with at least ``question`` and
        ``answer`` columns
    :param output_file: path of the JSON Lines file to write
    """
    print(f"转换数据: {input_file} -> {output_file}")
    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            try:
                # Rows lacking the required columns are skipped with a warning.
                if "question" not in row or "answer" not in row:
                    print(f"警告: 数据缺少必要列: {row}")
                    continue
                # [19:] drops the fixed prompt prefix "Based on the title "
                # (19 characters) so only the paper title text remains.
                converted_data.append({
                    "messages": [
                        {
                            "role": "assistant",
                            "content": "This is a paper titled " + row["question"][19:],
                        }
                    ]
                })
            except Exception as e:
                print(f"处理行时发生错误: {str(e)}")
    # Emit one JSON object per line (JSON Lines).
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
if __name__ == "__main__":
    # Hard-coded paths; switch to argparse if CLI usage is ever needed.
    input_file = "newformat_sft_test_data.csv"
    output_file = "newformat_sft_test_data--swift-pretrain.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)

80
05-data-csv-swift-sft.py Normal file
View File

@@ -0,0 +1,80 @@
import json
import csv
def convert_to_alpaca_format(input_file, output_file):
    """Read ``question``/``answer`` columns from a CSV and emit Swift SFT JSONL.

    Each CSV row becomes one JSON line::

        {"system": "你是个优秀的论文分类师",
         "conversation": [{"human": <question>, "assistant": <answer>}]}

    Rows missing either required column are skipped with a warning.

    :param input_file: path to a CSV with ``question`` and ``answer`` columns
    :param output_file: path of the JSON Lines file to write
    """
    print(f"转换数据: {input_file} -> {output_file}")
    records = []
    with open(input_file, "r", encoding="utf-8") as src:
        for row in csv.DictReader(src):
            try:
                if "question" in row and "answer" in row:
                    records.append({
                        "system": "你是个优秀的论文分类师",
                        "conversation": [
                            {"human": row["question"], "assistant": row["answer"]},
                        ],
                    })
                else:
                    print(f"警告: 数据缺少必要列: {row}")
            except Exception as err:
                print(f"处理行时发生错误: {str(err)}")
    # One JSON object per output line (JSON Lines).
    with open(output_file, "w", encoding="utf-8") as dst:
        dst.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)
    print(f"转换完成! 共转换 {len(records)} 条数据")
if __name__ == "__main__":
    # Hard-coded paths; switch to argparse if CLI usage is ever needed.
    input_file = "newformat_sft_test_data.csv"
    output_file = "newformat_sft_test_data--swift-sft.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)

View File

@@ -0,0 +1,80 @@
import json
import os
import argparse
import re
def convert_to_alpaca_format(input_file, output_file):
    """Clean Swift "messages" JSONL: drop 'with ID xxxx.xxxx,' and trim the tail.

    Each input line must look like::

        {"messages": [{"role": ..., "content": ...}]}

    The first message's content has any ``with ID <number>,`` fragment
    removed and its final 180 characters cut off; malformed lines are
    reported and skipped.  The record structure is otherwise preserved.

    :param input_file: path to the JSON Lines file to read
    :param output_file: path of the cleaned JSON Lines file to write
    """
    print(f"转换数据: {input_file} -> {output_file}")
    cleaned = []
    # Hoisted out of the loop; matches e.g. "with ID 0704.0145, ".
    id_pattern = re.compile(r'with ID \d+\.?\d*,\s*')
    with open(input_file, "r", encoding="utf-8") as src:
        for line in src:
            try:
                record = json.loads(line.strip())
                messages = record.get("messages")
                if not isinstance(messages, list):
                    print(f"警告: 数据格式不正确缺少messages字段或格式错误")
                    continue
                if not messages or "content" not in messages[0]:
                    print(f"警告: messages为空或缺少content字段")
                    continue
                text = id_pattern.sub('', messages[0]["content"])
                # NOTE(review): [:-180] yields "" for content shorter than 180
                # chars — presumably trims a fixed-length tail; confirm against
                # the actual data.
                cleaned.append({
                    "messages": [
                        {
                            "role": messages[0].get("role", "assistant"),
                            "content": text[:-180],
                        }
                    ]
                })
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as err:
                print(f"处理行时发生错误: {str(err)}")
    # Emit one JSON object per line (JSON Lines).
    with open(output_file, "w", encoding="utf-8") as dst:
        for item in cleaned:
            dst.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(cleaned)} 条数据")
if __name__ == "__main__":
    # Cleans the pre-training JSONL into a sibling file (note trailing "-").
    input_file = "arxiv-metadata-oai-snapshot--swift-pretrain.jsonl"
    output_file = "arxiv-metadata-oai-snapshot--swift-pretrain-.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)

View File

@@ -0,0 +1,97 @@
import json
import os
import argparse
def convert_to_alpaca_format(input_file, output_file):
    """Convert Swift SFT records to pre-training "messages" JSONL.

    Input (one JSON object per line)::

        {"system": ..., "conversation": [{"human": ..., "assistant": ...}]}

    Output, one line per conversation turn::

        {"messages": [{"role": "assistant",
                       "content": "This is a paper titled <title...>"}]}

    The answer ("assistant") field is dropped; ``[19:]`` strips the fixed
    prompt prefix "Based on the title " (19 characters) from the question.
    Malformed lines are reported and skipped.

    :param input_file: path to the Swift SFT JSON Lines file
    :param output_file: path of the JSON Lines file to write
    """
    print(f"转换数据: {input_file} -> {output_file}")
    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line.strip())
                # Both top-level fields are required.
                if "system" not in data or "conversation" not in data:
                    print(f"警告: 数据缺少必要字段: {data}")
                    continue
                # (Removed: an `instruction` local was computed here from
                # data["system"] but never used.)
                for turn in data["conversation"]:
                    if "human" in turn and "assistant" in turn:
                        converted_data.append({
                            "messages": [
                                {
                                    "role": "assistant",
                                    "content": "This is a paper titled " + turn["human"][19:],
                                }
                            ]
                        })
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                print(f"处理行时发生错误: {str(e)}")
    # Emit one JSON object per line (JSON Lines).
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
if __name__ == "__main__":
    # Hard-coded paths; switch to argparse if CLI usage is ever needed.
    input_file = "arxiv-metadata-oai-snapshot--swift-26.json"
    output_file = "arxiv-metadata-oai-snapshot--swift-pretrain-26.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)

View File

@@ -84,4 +84,9 @@ if __name__ == "__main__":
input_file = "arxiv-metadata-oai-snapshot--swift.json"
output_file = "arxiv-metadata-oai-snapshot--xtuner.jsonl" # 输出文件路径
convert_to_alpaca_format(input_file, output_file)
convert_to_alpaca_format(input_file, output_file)

74
05-data-xtuner-swfit.py Normal file
View File

@@ -0,0 +1,74 @@
import json
import os
import argparse
def convert_to_alpaca_format(input_file, output_file):
    """Convert Alpaca-format JSONL to Swift conversation format.

    Input (one JSON object per line)::

        {"instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
         "input": "Based on the title...",
         "output": "D"}

    Intended output::

        {"system": "你是个优秀的论文分类师",
         "conversation": [{"human": ..., "assistant": ...}]}

    NOTE(review): in the code visible here, the parse loop never appends
    anything to ``converted_data``, so the output file will always be
    empty.  Either the conversion step is missing, or it was elided from
    this view of the file — confirm against the full source.

    :param input_file: path to the Alpaca JSON Lines file
    :param output_file: path of the JSON Lines file to write
    """
    print(f"转换数据: {input_file} -> {output_file}")
    converted_data = []  # records to write; never populated in the visible code
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line.strip())  # parsed but otherwise unused here
            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                print(f"处理行时发生错误: {str(e)}")
    # Write the collected records as JSON Lines.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")
    print(f"转换完成! 共转换 {len(converted_data)} 条数据")
if __name__ == "__main__":
    # Hard-coded paths; switch to argparse if CLI usage is ever needed.
    input_file = "arxiv-metadata-oai-snapshot--swift.json"
    output_file = "arxiv-metadata-oai-snapshot--xtuner.jsonl"  # output file path
    convert_to_alpaca_format(input_file, output_file)

54
06-data-swift-compose.py Normal file
View File

@@ -0,0 +1,54 @@
import json
import os
import argparse
import pandas as pd
import matplotlib.pyplot as plt
def get_Composition_ratio(input_file):
    """Print and plot the per-class composition of a Swift-format JSONL dataset.

    The class label of each record is taken from
    ``conversation[0]["assistant"]``; records without that structure are
    counted under the label "未知".

    :param input_file: path to the JSON Lines file to analyse
    :return: pandas Series of per-class percentages
    """
    with open(input_file, "r", encoding="utf-8") as fh:
        records = [json.loads(row) for row in fh]
    # Extract one label per record, tolerating malformed entries.
    labels = []
    for rec in records:
        conv = rec.get("conversation")
        if isinstance(conv, list) and len(conv) > 0 and "assistant" in conv[0]:
            labels.append(conv[0]["assistant"])
        else:
            labels.append("未知")
    counts = pd.DataFrame({"label": labels})["label"].value_counts()
    # Percentage share of each class.
    ratios = counts / counts.sum() * 100
    print("类别比例和数量:")
    for category, ratio in ratios.items():
        print(f"类别 {category}: {ratio:.2f}% ({counts[category]} 条)")
    # Pie chart of the class distribution.
    plt.figure(figsize=(8, 6))
    plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
    plt.title('数据集类别比例')
    plt.show()
    return ratios
if __name__ == "__main__":
    # input_file = "sftdata.jsonl"
    # input_file = "output-26.jsonl"  # dead assignment removed: it was
    # immediately overwritten by the line below.
    input_file = "arxiv-metadata-oai-snapshot--swift-26.json"
    get_Composition_ratio(input_file)

View File

@@ -1,4 +1,3 @@
import json
import os
import argparse
@@ -22,10 +21,11 @@ def get_Composition_ratio(input_file):
"""
# 读取JSONL文件
with open(input_file, "r") as f:
data = [json.loads(line) for line in f] # 读取每一行并解析为JSON对象
with open(input_file, "r", encoding="utf-8") as f:
data = [json.loads(line) for line in f]
df = pd.DataFrame(data)
# print(df.head(5))
print("实际列名:", df.columns)
print("前几行数据:\n", df.head())
# 计算每个类别的数量
counts = df['output'].value_counts()
# 计算总数
@@ -67,7 +67,7 @@ if __name__ == "__main__":
#input_file = "arxiv-metadata-oai-snapshot--random.json" # 20000条原始数据文件路径
#input_file = "arxiv-metadata-oai-snapshot--swift.json"
input_file = "sftdata.jsonl" # 输出文件路径
input_file = "newformat_sft_test_data--xtuner.jsonl" # 输出文件路径
input_file = "arxiv-metadata-oai-snapshot--swift-26.json" # 输出文件路径
get_Composition_ratio(input_file)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff