swift
This commit is contained in:
		
							
								
								
									
										42
									
								
								01-pre.py
									
									
									
									
									
								
							
							
						
						
									
										42
									
								
								01-pre.py
									
									
									
									
									
								
							| @@ -1,14 +1,46 @@ | ||||
| import json | ||||
|  | ||||
| # 要保留的类别关键词 | ||||
| # target_categories = { | ||||
| #     "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci", | ||||
| #     "cs.CL", "cs.CV", "cs.LG", | ||||
| #     "gr-qc", "hep-ph", "hep-th", "quant-ph" | ||||
| # } | ||||
|  | ||||
| target_categories = { | ||||
|     "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci", | ||||
|     "cs.CL", "cs.CV", "cs.LG", | ||||
|     "gr-qc", "hep-ph", "hep-th", "quant-ph" | ||||
| } | ||||
|         'quant-ph', | ||||
|         'physics.chem-ph',  | ||||
|         'physics.atom-ph', | ||||
|         'cond-mat.soft', | ||||
|         'cs.RO', | ||||
|         'cs.CL', | ||||
|         'cs.SE', | ||||
|         'cs.IR', | ||||
|         'hep-th', | ||||
|         'hep-ph', | ||||
|         'physics.optics', | ||||
|         'cs.AI', | ||||
|         'cs.CV', | ||||
|         'nucl-th', | ||||
|         'astro-ph', | ||||
|         'math.PR', | ||||
|         'cs.OS', | ||||
|         'eess.SP', | ||||
|         'math.OC', | ||||
|         'math.DS', | ||||
|         'math.DG', | ||||
|         'math.MP', | ||||
|         'cs.MM', | ||||
|         'stat.ME', | ||||
|         'math.CO', | ||||
|         'cs.NE' | ||||
|     } | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| input_path = "arxiv-metadata-oai-snapshot.json"#原数据路径 | ||||
| output_path = "arxiv-metadata-oai-snapshot--.json"  # 使用 JSON Lines 格式输出路径 | ||||
| output_path = "arxiv-metadata-oai-snapshot--26.json"  # 使用 JSON Lines 格式输出路径 | ||||
|  | ||||
| count = 0 | ||||
|  | ||||
|   | ||||
| @@ -1,9 +1,9 @@ | ||||
| import json | ||||
| import random | ||||
|  | ||||
| input_path = "arxiv-metadata-oai-snapshot-date-len.json" | ||||
| output_path = "arxiv-metadata-oai-snapshot--ratio.json" | ||||
| sample_size = 2000  # 你可以改成 10000 等其他数字 | ||||
| input_path = "arxiv-metadata-oai-snapshot--26.json" | ||||
| output_path = "arxiv-metadata-oai-snapshot--26-500.json" | ||||
| sample_size = 4000  # 你可以改成 10000 等其他数字 | ||||
|  | ||||
|  | ||||
|  | ||||
| @@ -15,18 +15,50 @@ print(f"原始数据量:{len(data)} 条") | ||||
|  | ||||
| ## 按类别筛选数据,不是随机 | ||||
| ## 每个类别指定抽取的比例 | ||||
| # category_proportions = { | ||||
| #     'astro-ph': 0.1336, | ||||
| #     'cond-mat.mes-hall': 0.0486, | ||||
| #     'cond-mat.mtrl-sci': 0.0587, | ||||
| #     'cs.CL': 0.085, | ||||
| #     'cs.CV': 0.0931, | ||||
| #     'cs.LG': 0.0992, | ||||
| #     'gr-qc': 0.1174, | ||||
| #     'hep-ph': 0.1194, | ||||
| #     'hep-th': 0.085, | ||||
| #     'quant-ph': 0.1599 | ||||
| # } | ||||
|  | ||||
| category_proportions = { | ||||
|     'astro-ph': 0.1, | ||||
|     'cond-mat.mes-hall': 0.1, | ||||
|     'cond-mat.mtrl-sci': 0.1, | ||||
|     'cs.CL': 0.1, | ||||
|     'cs.CV': 0.1, | ||||
|     'cs.LG': 0.1, | ||||
|     'gr-qc': 0.1, | ||||
|     'hep-ph': 0.1, | ||||
|     'hep-th': 0.1, | ||||
|     'quant-ph': 0.1 | ||||
| } | ||||
|         'quant-ph': 0.1, | ||||
|         'physics.chem-ph': 0.1,  | ||||
|         'physics.atom-ph': 0.1, | ||||
|         'cond-mat.soft': 0.1, | ||||
|         'cs.RO': 0.1, | ||||
|         'cs.CL': 0.1, | ||||
|         'cs.SE': 0.1, | ||||
|         'cs.IR': 0.1, | ||||
|         'hep-th': 0.1, | ||||
|         'hep-ph': 0.1, | ||||
|         'physics.optics': 0.1, | ||||
|         'cs.AI': 0.1, | ||||
|         'cs.CV': 0.1, | ||||
|         'nucl-th': 0.1, | ||||
|         'astro-ph': 0.1, | ||||
|         'math.PR': 0.1, | ||||
|         'cs.OS': 0.1, | ||||
|         'eess.SP': 0.1, | ||||
|         'math.OC': 0.1, | ||||
|         'math.DS': 0.1, | ||||
|         'math.DG': 0.1, | ||||
|         'math.MP': 0.1, | ||||
|         'cs.MM': 0.1, | ||||
|         'stat.ME': 0.1, | ||||
|         'math.CO': 0.1, | ||||
|         'cs.NE': 0.1 | ||||
|     } | ||||
|  | ||||
|  | ||||
|  | ||||
| ## print 每个类别的筛选比例和数量 | ||||
| print("每个类别的筛选比例和数量:") | ||||
| for category, proportion in category_proportions.items(): | ||||
|   | ||||
| @@ -1,27 +1,48 @@ | ||||
| import json | ||||
| import random | ||||
|  | ||||
| input_file = "arxiv-metadata-oai-snapshot--ratio.json"   # 20000条原始数据文件路径 | ||||
| output_file = "arxiv-metadata-oai-snapshot--swift.json" | ||||
| input_file = "arxiv-metadata-oai-snapshot--26-500.json"   # 20000条原始数据文件路径 | ||||
| output_file = "arxiv-metadata-oai-snapshot--swift-26-500.json" | ||||
|  | ||||
| # 类别对应选项映射 | ||||
| label_map = { | ||||
|     "astro-ph": "A", | ||||
|     "cond-mat.mes-hall": "B", | ||||
|     "cond-mat.mtrl-sci": "C", | ||||
|     "cs.CL": "D", | ||||
|     "cs.CV": "E", | ||||
|     "cs.LG": "F", | ||||
|     "gr-qc": "G", | ||||
|     "hep-ph": "H", | ||||
|     "hep-th": "I", | ||||
|     "quant-ph": "J" | ||||
|         'quant-ph': 'A', | ||||
|         'physics.chem-ph': 'B',  | ||||
|         'physics.atom-ph': 'C', | ||||
|         'cond-mat.soft': 'D', | ||||
|         'cs.RO': 'E', | ||||
|         'cs.CL': 'F', | ||||
|         'cs.SE': 'G', | ||||
|         'cs.IR': 'H', | ||||
|         'hep-th': 'I', | ||||
|         'hep-ph': 'J', | ||||
|         'physics.optics': 'K', | ||||
|         'cs.AI': 'L', | ||||
|         'cs.CV': 'M', | ||||
|         'nucl-th': 'N', | ||||
|         'astro-ph': 'O', | ||||
|         'math.PR': 'P', | ||||
|         'cs.OS': 'Q', | ||||
|         'eess.SP': 'R', | ||||
|         'math.OC': 'S', | ||||
|         'math.DS': 'T', | ||||
|         'math.DG': 'U', | ||||
|         'math.MP': 'V', | ||||
|         'cs.MM': 'W', | ||||
|         'stat.ME': 'X', | ||||
|         'math.CO': 'Y', | ||||
|         'cs.NE': 'Z' | ||||
| } | ||||
|  | ||||
| options_text = ( | ||||
|     "\n\nA. astro-ph\nB. cond-mat.mes-hall\nC. cond-mat.mtrl-sci\nD. cs.CL\n" | ||||
|     "E. cs.CV\nF. cs.LG\nG. gr-qc\nH. hep-ph\nI. hep-th\nJ. quant-ph" | ||||
| ) | ||||
| options = [ | ||||
|     "A. quant-ph", "B. physics.chem-ph", "C. physics.atom-ph", "D. cond-mat.soft", | ||||
|     "E. cs.RO", "F. cs.CL", "G. cs.SE", "H. cs.IR", "I. hep-th", "J. hep-ph", | ||||
|     "K. physics.optics", "L. cs.AI", "M. cs.CV", "N. nucl-th", "O. astro-ph", | ||||
|     "P. math.PR", "Q. cs.OS", "R. eess.SP", "S. math.OC", "T. math.DS", | ||||
|     "U. math.DG", "V. math.MP", "W. cs.MM", "X. stat.ME", "Y. math.CO", "Z. cs.NE" | ||||
| ] | ||||
|  | ||||
| options_text = "\n".join(options) | ||||
|  | ||||
| # 读取所有数据 | ||||
| with open(input_file, 'r', encoding='utf-8') as f: | ||||
|   | ||||
							
								
								
									
										81
									
								
								05-data-csv-swift-pretrain.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										81
									
								
								05-data-csv-swift-pretrain.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,81 @@ | ||||
|        | ||||
| import json | ||||
|  | ||||
| import csv | ||||
|  | ||||
|  | ||||
|  | ||||
def convert_to_alpaca_format(input_file, output_file):
    """
    Convert a question/answer CSV into single-message pretraining records.

    Despite its name, this function does not emit Alpaca records. Every CSV
    row becomes one JSON line holding a single assistant message whose text is
    "This is a paper titled " followed by the question with its first 19
    characters removed. The "answer" column must exist but is not written.

    Input CSV header:
        question,A,B,C,D,E,F,G,H,I,J,answer

    Output line (JSON Lines):
        {
            "messages": [
                {
                    "role": "assistant",
                    "content": "This is a paper titled ..."
                }
            ]
        }

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).
    """
    print(f"转换数据: {input_file} -> {output_file}")

    # Number of leading characters dropped from every question.
    # NOTE(review): 19 == len("Based on the title "), so this presumably strips
    # that prompt prefix - confirm every question really starts with it.
    prompt_prefix_len = 19

    converted_data = []
    # newline="" is required by the csv module so that line breaks embedded in
    # quoted fields (e.g. multi-line abstracts) reach the reader unmodified.
    with open(input_file, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            try:
                # Both columns must be present in the header.
                if "question" not in row or "answer" not in row:
                    print(f"警告: 数据缺少必要列: {row}")
                    continue

                new_data = {
                    "messages": [
                        {
                            "role": "assistant",
                            "content": "This is a paper titled "
                            + row["question"][prompt_prefix_len:],
                        }
                    ]
                }
                converted_data.append(new_data)

            except Exception as e:
                # Best effort: a malformed row (e.g. a short row whose
                # question is None) is reported and skipped.
                print(f"处理行时发生错误: {str(e)}")

    # Write one JSON object per line.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"转换完成! 共转换 {len(converted_data)} 条数据")


if __name__ == "__main__":
    # Previous input: "arxiv-metadata-oai-snapshot--random.json"
    # (the 20000-record raw data file).
    input_file = "newformat_sft_test_data.csv"
    output_file = "newformat_sft_test_data--swift-pretrain.jsonl"  # output path

    convert_to_alpaca_format(input_file, output_file)
							
								
								
									
										80
									
								
								05-data-csv-swift-sft.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								05-data-csv-swift-sft.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | ||||
|        | ||||
| import json | ||||
|  | ||||
| import csv | ||||
|  | ||||
|  | ||||
|  | ||||
def convert_to_alpaca_format(input_file, output_file):
    """
    Convert a question/answer CSV into system/conversation SFT records.

    Despite its name, the records written are not Alpaca records: each CSV row
    becomes one JSON line with a fixed system prompt and a single
    human/assistant turn taken from the "question" and "answer" columns.

    Input CSV header:
        question,A,B,C,D,E,F,G,H,I,J,answer

    Output line (JSON Lines):
        {
            "system": "你是个优秀的论文分类师",
            "conversation": [
                {
                    "human": "Based on the title...",
                    "assistant": "D"
                }
            ]
        }

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).
    """
    print(f"转换数据: {input_file} -> {output_file}")

    converted_data = []
    # newline="" is required by the csv module so that line breaks embedded in
    # quoted fields (e.g. multi-line abstracts) reach the reader unmodified.
    with open(input_file, "r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            try:
                # Both columns must be present in the header.
                if "question" not in row or "answer" not in row:
                    print(f"警告: 数据缺少必要列: {row}")
                    continue

                new_data = {
                    "system": "你是个优秀的论文分类师",
                    "conversation": [
                        {
                            "human": row["question"],
                            "assistant": row["answer"],
                        }
                    ],
                }
                converted_data.append(new_data)

            except Exception as e:
                # Best effort: report the row and keep going.
                print(f"处理行时发生错误: {str(e)}")

    # Write one JSON object per line.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"转换完成! 共转换 {len(converted_data)} 条数据")


if __name__ == "__main__":
    # Previous input: "arxiv-metadata-oai-snapshot--random.json"
    # (the 20000-record raw data file).
    input_file = "newformat_sft_test_data.csv"
    output_file = "newformat_sft_test_data--swift-sft.jsonl"  # output path

    convert_to_alpaca_format(input_file, output_file)
							
								
								
									
										80
									
								
								05-data-swfit-pretrain-revise.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										80
									
								
								05-data-swfit-pretrain-revise.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,80 @@ | ||||
| import json | ||||
| import os | ||||
| import argparse | ||||
| import re | ||||
|  | ||||
|  | ||||
def convert_to_alpaca_format(input_file, output_file):
    """
    Clean the message text of single-message pretraining records.

    Despite its name, no Alpaca records are produced; input and output share
    the same layout:

        {
            "messages": [
                {
                    "role": "assistant",
                    "content": "This is a paper titled ...."
                }
            ]
        }

    For the first message of every record the function:
      1. removes each "with ID 0704.0145," style clause from the content, and
      2. drops the last 180 characters of the content.

    Only the first message is kept; its role is copied and defaults to
    "assistant" when absent. Records without a usable first message are
    reported and skipped.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).
    """
    print(f"转换数据: {input_file} -> {output_file}")

    # Compiled once instead of being looked up again for every line.
    id_clause = re.compile(r'with ID \d+\.?\d*,\s*')
    # NOTE(review): fixed-size tail cut - presumably removes a trailing
    # instruction/options suffix of constant length; confirm. Content of 180
    # characters or fewer becomes an empty string.
    tail_chars = 180

    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines are not records; skip them instead of reporting a
            # JSON parse failure.
            if not line.strip():
                continue
            try:
                data = json.loads(line.strip())

                # Validate the record layout.
                if "messages" not in data or not isinstance(data["messages"], list):
                    print("警告: 数据格式不正确,缺少messages字段或格式错误")
                    continue

                if not data["messages"] or "content" not in data["messages"][0]:
                    print("警告: messages为空或缺少content字段")
                    continue

                first_message = data["messages"][0]
                content = id_clause.sub('', first_message["content"])
                content = content[:-tail_chars]

                converted_data.append(
                    {
                        "messages": [
                            {
                                "role": first_message.get("role", "assistant"),
                                "content": content,
                            }
                        ]
                    }
                )

            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                print(f"处理行时发生错误: {str(e)}")

    # Write one JSON object per line.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"转换完成! 共转换 {len(converted_data)} 条数据")


if __name__ == "__main__":
    input_file = "arxiv-metadata-oai-snapshot--swift-pretrain.jsonl"
    output_file = "arxiv-metadata-oai-snapshot--swift-pretrain-.jsonl"  # output path

    convert_to_alpaca_format(input_file, output_file)
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
							
								
								
									
										97
									
								
								05-data-swfit-sft2pretrain.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								05-data-swfit-sft2pretrain.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,97 @@ | ||||
|        | ||||
| import json | ||||
| import os | ||||
| import argparse | ||||
|  | ||||
|  | ||||
def convert_to_alpaca_format(input_file, output_file):
    """
    Turn system/conversation SFT records into single-message pretraining records.

    Despite its name, no Alpaca records are produced.

    Input line (JSON Lines):
        {
            "system": "你是个优秀的论文分类师",
            "conversation": [
                {
                    "human": "Based on the title...",
                    "assistant": "D"
                }
            ]
        }

    Output line, one per conversation turn that has both "human" and
    "assistant" keys:
        {
            "messages": [
                {
                    "role": "assistant",
                    "content": "This is a paper titled ...."
                }
            ]
        }

    The content is "This is a paper titled " followed by the human text with
    its first 19 characters removed. The system prompt and the assistant
    answer are not carried over, although records lacking "system" or
    "conversation" are still reported and skipped.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).
    """
    print(f"转换数据: {input_file} -> {output_file}")

    # Number of leading characters dropped from every human prompt.
    # NOTE(review): 19 == len("Based on the title "), so this presumably strips
    # that prompt prefix - confirm every prompt really starts with it.
    prompt_prefix_len = 19

    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines are not records; skip them instead of reporting a
            # JSON parse failure.
            if not line.strip():
                continue
            try:
                data = json.loads(line.strip())

                # Validate the record layout.
                if "system" not in data or "conversation" not in data:
                    print(f"警告: 数据缺少必要字段: {data}")
                    continue

                for turn in data["conversation"]:
                    if "human" in turn and "assistant" in turn:
                        converted_data.append(
                            {
                                "messages": [
                                    {
                                        "role": "assistant",
                                        "content": "This is a paper titled "
                                        + turn["human"][prompt_prefix_len:],
                                    }
                                ]
                            }
                        )

            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                print(f"处理行时发生错误: {str(e)}")

    # Write one JSON object per line.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"转换完成! 共转换 {len(converted_data)} 条数据")


if __name__ == "__main__":
    # Previous input: "arxiv-metadata-oai-snapshot--random.json"
    # (the 20000-record raw data file).
    input_file = "arxiv-metadata-oai-snapshot--swift-26.json"
    output_file = "arxiv-metadata-oai-snapshot--swift-pretrain-26.jsonl"  # output path

    convert_to_alpaca_format(input_file, output_file)
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| @@ -84,4 +84,9 @@ if __name__ == "__main__": | ||||
|     input_file = "arxiv-metadata-oai-snapshot--swift.json" | ||||
|     output_file = "arxiv-metadata-oai-snapshot--xtuner.jsonl"  # 输出文件路径 | ||||
|  | ||||
|     convert_to_alpaca_format(input_file, output_file) | ||||
|     convert_to_alpaca_format(input_file, output_file) | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|   | ||||
							
								
								
									
										74
									
								
								05-data-xtuner-swfit.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										74
									
								
								05-data-xtuner-swfit.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,74 @@ | ||||
|        | ||||
| import json | ||||
| import os | ||||
| import argparse | ||||
|  | ||||
|  | ||||
def convert_to_alpaca_format(input_file, output_file):
    """
    Convert Alpaca records into system/conversation (Swift) records.

    Input line (JSON Lines, Alpaca):
        {
            "instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
            "input": "Based on the title...",
            "output": "D"
        }

    Output line (JSON Lines, Swift):
        {
            "system": "你是个优秀的论文分类师",
            "conversation": [
                {
                    "human": "Based on the title...",
                    "assistant": "D"
                }
            ]
        }

    "input" becomes the human turn and "output" the assistant turn. The
    system prompt is the fixed string shown above, as in the output example;
    the incoming "instruction" is not copied.
    NOTE(review): confirm that discarding "instruction" is intended.

    Records lacking "input" or "output" are reported and skipped.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 JSONL file to write (overwritten).
    """
    print(f"转换数据: {input_file} -> {output_file}")

    converted_data = []
    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines are not records; skip them instead of reporting a
            # JSON parse failure.
            if not line.strip():
                continue
            try:
                data = json.loads(line.strip())

                # Validate the record layout.
                if "input" not in data or "output" not in data:
                    print(f"警告: 数据缺少必要字段: {data}")
                    continue

                # Previously the parsed record was dropped here, so the output
                # file was always empty.
                converted_data.append(
                    {
                        "system": "你是个优秀的论文分类师",
                        "conversation": [
                            {
                                "human": data["input"],
                                "assistant": data["output"],
                            }
                        ],
                    }
                )

            except json.JSONDecodeError:
                print(f"警告: 无法解析JSON行: {line}")
            except Exception as e:
                print(f"处理行时发生错误: {str(e)}")

    # Write one JSON object per line.
    with open(output_file, "w", encoding="utf-8") as f:
        for item in converted_data:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")

    print(f"转换完成! 共转换 {len(converted_data)} 条数据")


if __name__ == "__main__":
    # Previous input: "arxiv-metadata-oai-snapshot--random.json"
    # (the 20000-record raw data file).
    # NOTE(review): these paths read a "--swift" file and write an "--xtuner"
    # file, which looks reversed for an Alpaca -> Swift conversion - confirm.
    input_file = "arxiv-metadata-oai-snapshot--swift.json"
    output_file = "arxiv-metadata-oai-snapshot--xtuner.jsonl"  # output path

    convert_to_alpaca_format(input_file, output_file)
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
							
								
								
									
										54
									
								
								06-data-swift-compose.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								06-data-swift-compose.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,54 @@ | ||||
| import json | ||||
| import os | ||||
| import argparse | ||||
| import pandas as pd | ||||
| import matplotlib.pyplot as plt | ||||
|      | ||||
def get_Composition_ratio(input_file):
    """
    Print and plot the share of each label in a system/conversation JSONL file.

    The label of a record is the "assistant" value of its first conversation
    turn. Records without a "conversation" list, with an empty list, or whose
    first turn has no "assistant" key are counted under the label "未知".

    :param input_file: path of the UTF-8 JSONL file to read
    :return: pandas Series mapping each label to its percentage (0-100),
             in the order produced by ``value_counts``
    """
    # Read the JSONL file; blank lines are skipped because json.loads would
    # raise on them and abort the whole run.
    with open(input_file, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Collect one label per record.
    labels = []
    for item in data:
        conv = item["conversation"] if "conversation" in item else None
        if isinstance(conv, list) and len(conv) > 0 and "assistant" in conv[0]:
            labels.append(conv[0]["assistant"])
        else:
            labels.append("未知")

    df = pd.DataFrame({"label": labels})

    # Absolute count per label and the overall total.
    counts = df['label'].value_counts()
    total = counts.sum()

    # Percentage per label.
    ratios = counts / total * 100

    print("类别比例和数量:")
    for category, ratio in ratios.items():
        print(f"类别 {category}: {ratio:.2f}% ({counts[category]} 条)")

    # Pie chart of the percentages; plt.show() is called before returning.
    plt.figure(figsize=(8, 6))
    plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
    plt.title('数据集类别比例')
    plt.show()
    return ratios


if __name__ == "__main__":
    # Other inputs used previously: "sftdata.jsonl", "output-26.jsonl"
    input_file = "arxiv-metadata-oai-snapshot--swift-26.json"
    get_Composition_ratio(input_file)
| @@ -1,4 +1,3 @@ | ||||
|        | ||||
| import json | ||||
| import os | ||||
| import argparse | ||||
| @@ -22,10 +21,11 @@ def get_Composition_ratio(input_file): | ||||
|     """ | ||||
|  | ||||
|     # 读取JSONL文件 | ||||
|     with open(input_file, "r") as f: | ||||
|         data = [json.loads(line) for line in f] # 读取每一行并解析为JSON对象 | ||||
|     with open(input_file, "r", encoding="utf-8") as f: | ||||
|         data = [json.loads(line) for line in f] | ||||
|         df = pd.DataFrame(data) | ||||
|         # print(df.head(5)) | ||||
|         print("实际列名:", df.columns) | ||||
|         print("前几行数据:\n", df.head()) | ||||
|     # 计算每个类别的数量 | ||||
|     counts = df['output'].value_counts() | ||||
|     # 计算总数 | ||||
| @@ -67,7 +67,7 @@ if __name__ == "__main__": | ||||
|     #input_file = "arxiv-metadata-oai-snapshot--random.json"   # 20000条原始数据文件路径 | ||||
|     #input_file = "arxiv-metadata-oai-snapshot--swift.json" | ||||
|     input_file = "sftdata.jsonl"  # 输出文件路径 | ||||
|     input_file = "newformat_sft_test_data--xtuner.jsonl"  # 输出文件路径 | ||||
|     input_file = "arxiv-metadata-oai-snapshot--swift-26.json"  # 输出文件路径 | ||||
|  | ||||
|     get_Composition_ratio(input_file) | ||||
|  | ||||
|   | ||||
							
								
								
									
										9944
									
								
								arxiv-metadata-oai-snapshot--swift-26-500.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9944
									
								
								arxiv-metadata-oai-snapshot--swift-26-500.json
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										5000
									
								
								arxiv-metadata-oai-snapshot--swift-26.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5000
									
								
								arxiv-metadata-oai-snapshot--swift-26.json
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										5000
									
								
								arxiv-metadata-oai-snapshot--swift-26.jsonl.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5000
									
								
								arxiv-metadata-oai-snapshot--swift-26.jsonl.txt
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										5000
									
								
								arxiv-metadata-oai-snapshot--swift-pretrain-26.jsonl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										5000
									
								
								arxiv-metadata-oai-snapshot--swift-pretrain-26.jsonl
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Reference in New Issue
	
	Block a user