diff --git a/01-pre.py b/01-pre.py
new file mode 100644
index 0000000..194cc16
--- /dev/null
+++ b/01-pre.py
@@ -0,0 +1,27 @@
+import json
+
+# Category keywords to keep
+target_categories = {
+    "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",
+    "cs.CL", "cs.CV", "cs.LG",
+    "gr-qc", "hep-ph", "hep-th", "quant-ph"
+}
+
+input_path = "arxiv-metadata-oai-snapshot.json"  # path to the raw metadata dump
+output_path = "arxiv-metadata-oai-snapshot--.json"  # output path (JSON Lines format)
+
+count = 0
+
+with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
+    for line in infile:
+        try:
+            record = json.loads(line)
+            record_cats = record.get("categories", "").split()
+            # keep the record only if its last category is one of the targets
+            if record_cats and record_cats[-1] in target_categories:
+                outfile.write(json.dumps(record) + '\n')
+                count += 1
+        except json.JSONDecodeError:
+            continue  # skip malformed lines
+
+print(f"Filtering done: {count} records saved to {output_path}")
diff --git a/02-data_select_date_len.py b/02-data_select_date_len.py
new file mode 100644
index 0000000..9b90f5f
--- /dev/null
+++ b/02-data_select_date_len.py
@@ -0,0 +1,26 @@
+import json
+
+input_path = "arxiv-metadata-oai-snapshot--.json"  # output of the previous step
+output_path = "arxiv-metadata-oai-snapshot-date-len.json"  # high-quality subset
+
+count = 0
+
+with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
+    for line in infile:
+        try:
+            record = json.loads(line)
+
+            # get the update date and abstract
+            update_date = record.get("update_date", "")
+            abstract = record.get("abstract", "")
+
+            # filter thresholds; adjust these to your model's context size
+            if 300 <= len(abstract) <= 4096:
+                if update_date and int(update_date[:4]) >= 2020:
+                    outfile.write(json.dumps(record) + '\n')
+                    count += 1
+
+        except (json.JSONDecodeError, ValueError):
+            continue  # skip malformed lines and unparsable dates
+
+print(f"Quality filtering done: {count} records kept in {output_path}")
diff --git a/03-data_select_random.py b/03-data_select_random.py
new file mode 100644
index 0000000..2ec449c
--- /dev/null
+++ b/03-data_select_random.py
@@ -0,0 +1,22 @@
+import json
+import random
+
+input_path = "arxiv-metadata-oai-snapshot-date-len.json"
+output_path = "arxiv-metadata-oai-snapshot--random.json"
+sample_size = 10000  # adjust as needed
+
+# load everything into memory first (acceptable for ~300k records)
+with open(input_path, 'r') as infile:
+    data = [json.loads(line) for line in infile]
+
+print(f"Original data size: {len(data)} records")
+random.seed(42)  # random seed; change it as you like
+# random sampling
+sampled_data = random.sample(data, sample_size)
+
+# save the result
+with open(output_path, 'w') as outfile:
+    for record in sampled_data:
+        outfile.write(json.dumps(record) + '\n')
+
+print(f"Randomly sampled {sample_size} records, saved to {output_path}")
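Note: 03-data_select_random.py loads the entire filtered file into memory before sampling, which is fine at the ~300k-record scale its comments mention. For larger dumps, a single-pass reservoir sample (Algorithm R) avoids holding everything in memory. A minimal sketch, not part of this patch; the output file name here is made up for illustration:

import json
import random

random.seed(42)
sample_size = 10000
reservoir = []

with open("arxiv-metadata-oai-snapshot-date-len.json", "r") as infile:
    for i, line in enumerate(infile):
        record = json.loads(line)
        if i < sample_size:
            # fill the reservoir with the first sample_size records
            reservoir.append(record)
        else:
            # then replace a random slot with decreasing probability
            j = random.randint(0, i)
            if j < sample_size:
                reservoir[j] = record

with open("reservoir-sample.json", "w") as outfile:  # hypothetical output path
    for record in reservoir:
        outfile.write(json.dumps(record) + "\n")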
diff --git a/03-data_select_ratio.py b/03-data_select_ratio.py
new file mode 100644
index 0000000..3619653
--- /dev/null
+++ b/03-data_select_ratio.py
@@ -0,0 +1,58 @@
+import json
+import random
+
+input_path = "arxiv-metadata-oai-snapshot-date-len.json"
+output_path = "arxiv-metadata-oai-snapshot--ratio.json"
+sample_size = 2000  # adjust as needed, e.g. 10000
+
+# load everything into memory first (acceptable for ~300k records)
+with open(input_path, 'r') as infile:
+    data = [json.loads(line) for line in infile]
+
+print(f"Original data size: {len(data)} records")
+random.seed(42)  # fixed seed so the split is reproducible
+
+## Sample by category rather than fully at random.
+## Sampling proportion for each category:
+category_proportions = {
+    'astro-ph': 0.1,
+    'cond-mat.mes-hall': 0.1,
+    'cond-mat.mtrl-sci': 0.1,
+    'cs.CL': 0.1,
+    'cs.CV': 0.1,
+    'cs.LG': 0.1,
+    'gr-qc': 0.1,
+    'hep-ph': 0.1,
+    'hep-th': 0.1,
+    'quant-ph': 0.1
+}
+
+## print each category's proportion and target count
+print("Per-category proportion and target count:")
+for category, proportion in category_proportions.items():
+    count = int(sample_size * proportion)
+    print(f"Category {category}: proportion {proportion}, count {count}")
+
+# sample the requested number of records from each category
+filtered_data = []
+for category, proportion in category_proportions.items():
+    count = int(sample_size * proportion)
+    # match on the last category, consistent with 01-pre.py and 04-data2swift.py
+    category_data = [item for item in data
+                     if item.get('categories', '').split()
+                     and item.get('categories', '').split()[-1] == category]
+    if len(category_data) < count:
+        # fewer records than requested: take them all
+        sampled = category_data
+    else:
+        # otherwise randomly sample the requested number
+        sampled = random.sample(category_data, count)
+    filtered_data.extend(sampled)
+    print(f"Category {category}: sampled {len(sampled)}")
+
+# save the result
+with open(output_path, 'w') as outfile:
+    for record in filtered_data:
+        outfile.write(json.dumps(record) + '\n')
+
+print(f"Sampled {len(filtered_data)} records by ratio, saved to {output_path}")
diff --git a/04-data2swift.py b/04-data2swift.py
new file mode 100644
index 0000000..324ac43
--- /dev/null
+++ b/04-data2swift.py
@@ -0,0 +1,68 @@
+import json
+
+input_file = "arxiv-metadata-oai-snapshot--ratio.json"  # output of the previous step
+output_file = "arxiv-metadata-oai-snapshot--swift.json"
+
+# map each category to its answer option
+label_map = {
+    "astro-ph": "A",
+    "cond-mat.mes-hall": "B",
+    "cond-mat.mtrl-sci": "C",
+    "cs.CL": "D",
+    "cs.CV": "E",
+    "cs.LG": "F",
+    "gr-qc": "G",
+    "hep-ph": "H",
+    "hep-th": "I",
+    "quant-ph": "J"
+}
+
+options_text = (
+    "\n\nA. astro-ph\nB. cond-mat.mes-hall\nC. cond-mat.mtrl-sci\nD. cs.CL\n"
+    "E. cs.CV\nF. cs.LG\nG. gr-qc\nH. hep-ph\nI. hep-th\nJ. quant-ph"
+)
+
+# read all records
+with open(input_file, 'r', encoding='utf-8') as f:
+    data = [json.loads(line) for line in f]
+
+# sampling is disabled here: every record is converted
+sampled = data
+
+with open(output_file, 'w', encoding='utf-8') as f_out:
+    count = 0
+    for item in sampled:
+        # with multiple categories, take the last one (space-separated)
+        categories_str = item.get("categories", "").strip()
+        if not categories_str:
+            continue
+        last_category = categories_str.split()[-1]
+
+        if last_category not in label_map:
+            continue
+
+        title = item.get("title", "").replace("\n", " ").strip()
+        authors = item.get("authors", "").replace("\n", " ").strip()
+        abstract = item.get("abstract", "").replace("\n", " ").strip()
+        if not title or not authors or not abstract:
+            continue
+
+        human_text = (
+            f"Based on the title '{title}', authors '{authors}', and abstract '{abstract}', "
+            f"please determine the scientific category of this paper.{options_text}"
+        )
+
+        finetune_sample = {
+            "system": "你是个优秀的论文分类师",  # i.e. "You are an excellent paper classifier"
+            "conversation": [
+                {
+                    "human": human_text,
+                    "assistant": label_map[last_category]
+                }
+            ]
+        }
+
+        f_out.write(json.dumps(finetune_sample, ensure_ascii=False) + "\n")
+        count += 1
+
+print(f"Conversion done: {count} fine-tuning samples written to {output_file}")
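Note: each line that 04-data2swift.py writes is one Swift-style sample with a "system" prompt and a single-turn "conversation". A quick sanity check on the first output line (a hypothetical helper, not part of this patch):

import json

with open("arxiv-metadata-oai-snapshot--swift.json", "r", encoding="utf-8") as f:
    first = json.loads(f.readline())

assert first["system"] == "你是个优秀的论文分类师"
turn = first["conversation"][0]
assert turn["assistant"] in set("ABCDEFGHIJ")  # one of the option letters
print(turn["human"][:200])  # peek at the start of the prompt text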
diff --git a/05-data-csv-xtuner.py b/05-data-csv-xtuner.py
new file mode 100644
index 0000000..03c2262
--- /dev/null
+++ b/05-data-csv-xtuner.py
@@ -0,0 +1,55 @@
+import json
+import csv
+
+
+def convert_to_alpaca_format(input_file, output_file):
+    """
+    Read a CSV file, extract the question and answer columns, and convert
+    them to Alpaca format.
+
+    Input CSV columns:
+    question,A,B,C,D,E,F,G,H,I,J,answer
+
+    Output format (Alpaca):
+    {
+        "instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
+        "input": "Based on the title...",
+        "output": "D"
+    }
+    """
+    print(f"Converting data: {input_file} -> {output_file}")
+
+    converted_data = []
+    with open(input_file, "r", encoding="utf-8") as f:
+        csv_reader = csv.DictReader(f)
+        for row in csv_reader:
+            try:
+                # make sure the required columns are present
+                if "question" not in row or "answer" not in row:
+                    print(f"Warning: row is missing required columns: {row}")
+                    continue
+
+                # build the Alpaca-format record
+                new_data = {
+                    "instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
+                    "input": row["question"],
+                    "output": row["answer"]
+                }
+                converted_data.append(new_data)
+
+            except Exception as e:
+                print(f"Error while processing row: {str(e)}")
+
+    # write the output file
+    with open(output_file, "w", encoding="utf-8") as f:
+        for item in converted_data:
+            f.write(json.dumps(item, ensure_ascii=False) + "\n")
+
+    print(f"Conversion done! {len(converted_data)} records converted")
+
+
+if __name__ == "__main__":
+    input_file = "newformat_sft_test_data.csv"
+    output_file = "newformat_sft_test_data--xtuner.jsonl"  # output path
+
+    convert_to_alpaca_format(input_file, output_file)
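Note: both 05-* conversion scripts hard-code their input and output paths. A minimal argparse front end for either one would look roughly like this (a sketch, not part of this patch):

import argparse

parser = argparse.ArgumentParser(description="Convert data to Alpaca format")
parser.add_argument("--input", type=str, required=True, help="input file path")
parser.add_argument("--output", type=str, required=True, help="output file path")
args = parser.parse_args()

convert_to_alpaca_format(args.input, args.output)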
diff --git a/05-data-swfit-xtuner.py b/05-data-swfit-xtuner.py
new file mode 100644
index 0000000..5a827a8
--- /dev/null
+++ b/05-data-swfit-xtuner.py
@@ -0,0 +1,71 @@
+import json
+
+
+def convert_to_alpaca_format(input_file, output_file):
+    """
+    Convert Swift-format data to Alpaca format.
+
+    Input format:
+    {
+        "system": "你是个优秀的论文分类师",
+        "conversation": [
+            {
+                "human": "Based on the title...",
+                "assistant": "D"
+            }
+        ]
+    }
+
+    Output format (Alpaca):
+    {
+        "instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
+        "input": "Based on the title...",
+        "output": "D"
+    }
+    """
+    print(f"Converting data: {input_file} -> {output_file}")
+
+    converted_data = []
+    with open(input_file, "r", encoding="utf-8") as f:
+        for line in f:
+            try:
+                data = json.loads(line.strip())
+
+                # check the expected structure
+                if "system" not in data or "conversation" not in data:
+                    print(f"Warning: record is missing required fields: {data}")
+                    continue
+
+                # use the system prompt as the instruction
+                instruction = data.get("system", "")
+                if not instruction:
+                    instruction = "根据论文的标题、作者和摘要,确定该论文的科学类别。"
+
+                # flatten each conversation turn into one Alpaca record
+                for turn in data["conversation"]:
+                    if "human" in turn and "assistant" in turn:
+                        new_data = {
+                            "instruction": instruction,
+                            "input": turn["human"],
+                            "output": turn["assistant"],
+                        }
+                        converted_data.append(new_data)
+
+            except json.JSONDecodeError:
+                print(f"Warning: could not parse JSON line: {line}")
+            except Exception as e:
+                print(f"Error while processing line: {str(e)}")
+
+    # write the output file
+    with open(output_file, "w", encoding="utf-8") as f:
+        for item in converted_data:
+            f.write(json.dumps(item, ensure_ascii=False) + "\n")
+
+    print(f"Conversion done! {len(converted_data)} records converted")
+
+
+if __name__ == "__main__":
+    input_file = "arxiv-metadata-oai-snapshot--swift.json"
+    output_file = "arxiv-metadata-oai-snapshot--xtuner.jsonl"  # output path
+
+    convert_to_alpaca_format(input_file, output_file)
diff --git a/06-data-xtuner-compose.py b/06-data-xtuner-compose.py
new file mode 100644
index 0000000..7602922
--- /dev/null
+++ b/06-data-xtuner-compose.py
@@ -0,0 +1,46 @@
+import json
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+def get_Composition_ratio(input_file):
+    """
+    Compute and print the category composition of an Alpaca-format dataset.
+
+    Expected record format:
+    {
+        "instruction": "根据论文的标题、作者和摘要,确定该论文的科学类别。",
+        "input": "Based on the title...",
+        "output": "D"
+    }
+
+    :param input_file: path to the input JSONL file
+    """
+    # read the JSONL file, one JSON object per line
+    with open(input_file, "r") as f:
+        data = [json.loads(line) for line in f]
+    df = pd.DataFrame(data)
+
+    # count records per category
+    counts = df['output'].value_counts()
+    total = counts.sum()
+
+    # per-category percentages
+    ratios = counts / total * 100
+    print("Category ratios and counts:")
+    for category, ratio in ratios.items():
+        print(f"Category {category}: {ratio:.2f}% ({counts[category]} records)")
+
+    # pie chart of the composition
+    plt.figure(figsize=(8, 6))
+    plt.pie(ratios, labels=ratios.index, autopct='%1.1f%%', startangle=140)
+    plt.title('Dataset category composition')
+    plt.show()
+    return ratios
+
+
+if __name__ == "__main__":
+    # input_file = "sftdata.jsonl"  # alternative dataset
+    input_file = "newformat_sft_test_data--xtuner.jsonl"
+
+    get_Composition_ratio(input_file)
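Note: the pie chart drawn by 06-data-xtuner-compose.py labels slices with the answer letters produced by label_map in 04-data2swift.py. To show arXiv category names instead, the map can be inverted; a small sketch, assuming the `ratios` Series returned by get_Composition_ratio:

label_map = {
    "astro-ph": "A", "cond-mat.mes-hall": "B", "cond-mat.mtrl-sci": "C",
    "cs.CL": "D", "cs.CV": "E", "cs.LG": "F",
    "gr-qc": "G", "hep-ph": "H", "hep-th": "I", "quant-ph": "J"
}
letter_to_category = {v: k for k, v in label_map.items()}
readable = ratios.rename(index=letter_to_category)  # relabel the index with category names
print(readable)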