添加数据处理脚本，支持从原始数据筛选、抽样到转换为Alpaca格式

2025-06-09 14:39:07 +08:00
parent 40c5dee22c
commit 24abc7aab3
8 changed files with 438 additions and 0 deletions
--- a/04-data2swift.py
+++ b/04-data2swift.py
@@ -0,0 +1,70 @@
+import json
+import random
+
+input_file = "arxiv-metadata-oai-snapshot--ratio.json"   # input path, read as one JSON object per line (original note: 20000 raw records -- TODO confirm)
+output_file = "arxiv-metadata-oai-snapshot--swift.json"
+
+# Maps each target arXiv category to its multiple-choice option letter
+label_map = {
+    "astro-ph": "A",
+    "cond-mat.mes-hall": "B",
+    "cond-mat.mtrl-sci": "C",
+    "cs.CL": "D",
+    "cs.CV": "E",
+    "cs.LG": "F",
+    "gr-qc": "G",
+    "hep-ph": "H",
+    "hep-th": "I",
+    "quant-ph": "J"
+}
+
+options_text = (
+    "\n\nA. astro-ph\nB. cond-mat.mes-hall\nC. cond-mat.mtrl-sci\nD. cs.CL\n"
+    "E. cs.CV\nF. cs.LG\nG. gr-qc\nH. hep-ph\nI. hep-th\nJ. quant-ph"
+)
+
+# Load every record into memory (each input line is parsed as a JSON object)
+with open(input_file, 'r', encoding='utf-8') as f:
+    data = [json.loads(line) for line in f]
+
+# NOTE: no sampling happens here -- every loaded record is used as-is (the seed below is disabled)
+#random.seed(42)
+sampled = data
+
+with open(output_file, 'w', encoding='utf-8') as f_out:
+    count = 0  # number of samples actually written
+    for item in sampled:
+        # When several categories are listed, use the last one (whitespace-separated)
+        categories_str = item.get("categories", "").strip()
+        if not categories_str:
+            continue
+        last_category = categories_str.split()[-1]
+
+        if last_category not in label_map:
+            continue  # skip records whose last category is not one of the mapped labels
+
+        title = item.get("title", "").replace("\n", " ").strip()
+        authors = item.get("authors", "").replace("\n", " ").strip()
+        abstract = item.get("abstract", "").replace("\n", " ").strip()
+        if not title or not authors or not abstract:
+            continue  # skip records missing any of title / authors / abstract
+
+        human_text = (
+            f"Based on the title '{title}', authors '{authors}', and abstract '{abstract}', "
+            f"please determine the scientific category of this paper.{options_text}"
+        )
+
+        finetune_sample = {
+            "system": "你是个优秀的论文分类师",
+            "conversation": [
+                {
+                    "human": human_text,
+                    "assistant": label_map[last_category]
+                }
+            ]
+        }
+
+        f_out.write(json.dumps(finetune_sample, ensure_ascii=False) + "\n")
+        count += 1
+
+print(f"转换完成，共生成{count}条微调数据，保存到 {output_file}")