Add multiple category keywords, optimize the data-processing logic, and support extracting and filtering paper data from arXiv
@@ -1,93 +1,190 @@
import json
import random

categorys = [
    'quant-ph', 'physics.chem-ph', 'physics.atom-ph', 'cond-mat.soft',
    'cs.RO', 'cs.CL', 'cs.SE', 'cs.IR', 'hep-th', 'hep-ph',
    'physics.optics', 'cs.AI', 'cs.CV', 'nucl-th', 'astro-ph',
    'math.PR', 'cs.OS', 'eess.SP', 'math.OC', 'math.DS',
    'math.DG', 'math.MP', 'cs.MM', 'stat.ME', 'math.CO', 'cs.NE'
]

input_path = "arxiv-metadata-oai-snapshot--26.json"
output_path = "arxiv-metadata-oai-snapshot--26-500.json"
sample_size = 4000  # adjust as needed, e.g. 10000

def extract_category_mapping():
    """Map each target category to its option letter."""
    category_to_option = {
        'quant-ph': 'A', 'physics.chem-ph': 'B', 'physics.atom-ph': 'C',
        'cond-mat.soft': 'D', 'cs.RO': 'E', 'cs.CL': 'F', 'cs.SE': 'G',
        'cs.IR': 'H', 'hep-th': 'I', 'hep-ph': 'J', 'physics.optics': 'K',
        'cs.AI': 'L', 'cs.CV': 'M', 'nucl-th': 'N', 'astro-ph': 'O',
        'math.PR': 'P', 'cs.OS': 'Q', 'eess.SP': 'R', 'math.OC': 'S',
        'math.DS': 'T', 'math.DG': 'U', 'math.MP': 'V', 'cs.MM': 'W',
        'stat.ME': 'X', 'math.CO': 'Y', 'cs.NE': 'Z'
    }
    return category_to_option
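
# Sanity check added as an illustrative sketch (not part of the original
# script): the 26 entries in `categorys` should line up one-to-one with the
# option letters A-Z defined above.
assert set(categorys) == set(extract_category_mapping().keys())
assert len(set(extract_category_mapping().values())) == len(categorys)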

def get_category_options_text():
    """Build the multiple-choice options text shown to the model."""
    options = [
        "A. quant-ph", "B. physics.chem-ph", "C. physics.atom-ph", "D. cond-mat.soft",
        "E. cs.RO", "F. cs.CL", "G. cs.SE", "H. cs.IR", "I. hep-th", "J. hep-ph",
        "K. physics.optics", "L. cs.AI", "M. cs.CV", "N. nucl-th", "O. astro-ph",
        "P. math.PR", "Q. cs.OS", "R. eess.SP", "S. math.OC", "T. math.DS",
        "U. math.DG", "V. math.MP", "W. cs.MM", "X. stat.ME", "Y. math.CO", "Z. cs.NE"
    ]
    return "\n".join(options)

def process_paper(paper_data, verbose=False):
    """Convert one paper record into a classification training example."""
    category_mapping = extract_category_mapping()

    # Extract the basic fields
    paper_id = paper_data.get('id', '')
    title = paper_data.get('title', '').replace('\n', ' ').strip()
    authors = paper_data.get('authors', '')
    abstract = paper_data.get('abstract', '').replace('\n', ' ').strip()
    categories = paper_data.get('categories', '')

    # A paper may list several categories, separated by spaces
    category_list = categories.split()
    if len(category_list) > 1:
        # With multiple categories, take the first one that is in the target list
        target_category = next((category for category in category_list if category in categorys), None)
    else:
        target_category = category_list[0] if category_list else ''

    # Skip papers whose category is not in the target list; this also guards
    # against a KeyError when no listed category matched above
    if target_category not in category_mapping:
        if verbose:
            print(f"Skipping paper {paper_id} with non-target category: {target_category}")
        return None

    # Look up the option letter for the target category
    correct_option = category_mapping[target_category]

    # Build the human-side question
    options_text = get_category_options_text()
    human_content = f"Based on the title '{title}', authors '{authors}', and abstract '{abstract}', please determine the scientific category of this paper.\n\n{options_text}"

    # Build the JSONL entry
    jsonl_entry = {
        "system": "You are an excellent paper classifier",
        "conversation": [
            {
                "human": human_content,
                "assistant": correct_option
            }
        ]
    }

    if verbose:
        print(f"Processed paper {paper_id}: {target_category} -> {correct_option}")

    return jsonl_entry
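
# Illustrative usage with made-up field values (a sketch, not part of the
# original pipeline): process_paper() turns one metadata record into a
# single-turn conversation whose answer is the option letter.
_example = {
    "id": "0000.00000",           # hypothetical id
    "title": "A hypothetical paper",
    "authors": "A. Author",
    "abstract": "A short abstract.",
    "categories": "cs.CL cs.AI",  # first category in the target list wins
}
assert process_paper(_example)["conversation"][0]["assistant"] == 'F'  # cs.CL -> F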

# input_path = "arxiv-metadata-oai-snapshot-single.json"
# output_path_1 = "arxiv-metadata-oai-snapshot-single-batch1.json"
# output_path_2 = "arxiv-metadata-oai-snapshot-single-batch2.json"
# batch1_size_per_category = 400
# batch2_size_per_category = 600

input_path = "arxiv-metadata-oai-snapshot-multi.json"
output_path_1 = "arxiv-metadata-oai-snapshot-multi-batch1.json"
output_path_2 = "arxiv-metadata-oai-snapshot-multi-batch2.json"

batch1_size_per_category = 400
batch2_size_per_category = 400

# Load everything into memory first (about 300k records, which is acceptable)
with open(input_path, 'r') as infile:
    data = [json.loads(line) for line in infile]

print(f"Raw data: {len(data)} records")
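
# If memory ever became a concern, the snapshot could be streamed instead of
# loaded up front. A minimal sketch assuming the same one-JSON-object-per-line
# layout (not used by the flow below):
def iter_records(path):
    """Yield one parsed record per line without holding the file in memory."""
    with open(path, 'r') as f:
        for line in f:
            yield json.loads(line)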

## Filter data by category, not at random
## Each category has its own sampling proportion
# category_proportions = {
#     'astro-ph': 0.1336,
#     'cond-mat.mes-hall': 0.0486,
#     'cond-mat.mtrl-sci': 0.0587,
#     'cs.CL': 0.085,
#     'cs.CV': 0.0931,
#     'cs.LG': 0.0992,
#     'gr-qc': 0.1174,
#     'hep-ph': 0.1194,
#     'hep-th': 0.085,
#     'quant-ph': 0.1599
# }

category_proportions = {
    'quant-ph': 0.1, 'physics.chem-ph': 0.1, 'physics.atom-ph': 0.1,
    'cond-mat.soft': 0.1, 'cs.RO': 0.1, 'cs.CL': 0.1, 'cs.SE': 0.1,
    'cs.IR': 0.1, 'hep-th': 0.1, 'hep-ph': 0.1, 'physics.optics': 0.1,
    'cs.AI': 0.1, 'cs.CV': 0.1, 'nucl-th': 0.1, 'astro-ph': 0.1,
    'math.PR': 0.1, 'cs.OS': 0.1, 'eess.SP': 0.1, 'math.OC': 0.1,
    'math.DS': 0.1, 'math.DG': 0.1, 'math.MP': 0.1, 'cs.MM': 0.1,
    'stat.ME': 0.1, 'math.CO': 0.1, 'cs.NE': 0.1
}
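
# Added diagnostic (not in the original script): the proportions are applied
# per category against sample_size, and here they sum to 2.6, so the
# proportional pass below can select well over sample_size records in total.
print(f"Proportions sum to {sum(category_proportions.values()):.2f}; "
      f"target total {sum(int(sample_size * p) for p in category_proportions.values())} records")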

# Accumulators for the two batches
batch1_data = []
batch2_data = []

## Print each category's sampling proportion and count
print("Sampling proportion and count per category:")
for category, proportion in category_proportions.items():
    count = int(sample_size * proportion)
    print(f"Category {category}: proportion {proportion}, count {count}")

# Proportional pass: sample each category by its configured count
filtered_data = []
for category, proportion in category_proportions.items():
    count = int(sample_size * proportion)
    # Select records whose category string is exactly this category
    category_data = [item for item in data if item.get('categories', '').strip() == category]
    # If the category has fewer records than requested, take them all
    if len(category_data) < count:
        filtered_data.extend(category_data)
    else:
        # Otherwise draw a random sample of the requested size
        sampled_data = random.sample(category_data, count)
        filtered_data.extend(sampled_data)
    print(f"Category {category}: sampled up to {count}")

# Batch pass: split each category's records into two fine-tuning batches
for category in categorys:
    # Match records that list this category anywhere in their category field
    category_data = [item for item in data if category in item.get('categories', '').strip().split()]
    print(f"Category {category}: {len(category_data)} records in total")

    # Shuffle before splitting
    random.shuffle(category_data)

    # Cap both batch sizes by what is actually available
    total_count = len(category_data)
    batch1_count = min(batch1_size_per_category, total_count)
    batch2_count = min(batch2_size_per_category, total_count - batch1_count)

    # Assign records to the two batches
    batch1_data.extend(category_data[:batch1_count])
    batch2_data.extend(category_data[batch1_count:batch1_count + batch2_count])

    print(f"Category {category}: batch 1 {batch1_count}, batch 2 {batch2_count}")
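
# Illustrative overlap check (a sketch, not part of the original script):
# within one category the two slices are disjoint, but a multi-category paper
# can still land in batch 1 under one category and batch 2 under another,
# so this reports the overlap rather than asserting it away.
overlap = {item.get('id') for item in batch1_data} & {item.get('id') for item in batch2_data}
print(f"Records appearing in both batches: {len(overlap)}")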

# Save the first batch
with open(output_path_1, 'w', encoding='utf-8') as outfile:
    for record in batch1_data:
        swft_js = process_paper(record, verbose=False)
        if swft_js is None:
            continue  # skip records that process_paper rejected
        outfile.write(json.dumps(swft_js, ensure_ascii=False) + '\n')

# Save the second batch
with open(output_path_2, 'w', encoding='utf-8') as outfile:
    for record in batch2_data:
        swft_js = process_paper(record, verbose=False)
        if swft_js is None:
            continue  # skip records that process_paper rejected
        outfile.write(json.dumps(swft_js, ensure_ascii=False) + '\n')
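
# Illustrative readback check (a sketch, not part of the original script):
# confirm the first written line parses back into the expected shape.
with open(output_path_1, 'r', encoding='utf-8') as f:
    first_line = f.readline()
    if first_line:
        first = json.loads(first_line)
        assert "system" in first and "conversation" in first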

# Save the proportional sample
with open(output_path, 'w') as outfile:
    for record in filtered_data:
        outfile.write(json.dumps(record) + '\n')

print(f"Saved {len(filtered_data)} proportionally sampled records to {output_path}")
print(f"Batch 1: {len(batch1_data)} records saved to {output_path_1}")
print(f"Batch 2: {len(batch2_data)} records saved to {output_path_2}")