添加多个类别关键词,优化数据处理逻辑,支持从arXiv提取和筛选论文数据
This commit is contained in:
		
							
								
								
									
										91
									
								
								01-pre-multi.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										91
									
								
								01-pre-multi.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,91 @@
 | 
			
		||||
import json

# Category keywords to keep.
# An earlier 10-category selection, kept for reference only:
#     "astro-ph", "cond-mat.mes-hall", "cond-mat.mtrl-sci",
#     "cs.CL", "cs.CV", "cs.LG",
#     "gr-qc", "hep-ph", "hep-th", "quant-ph"
target_categories = {
    'quant-ph',
    'physics.chem-ph',
    'physics.atom-ph',
    'cond-mat.soft',
    'cs.RO',
    'cs.CL',
    'cs.SE',
    'cs.IR',
    'hep-th',
    'hep-ph',
    'physics.optics',
    'cs.AI',
    'cs.CV',
    'nucl-th',
    'astro-ph',
    'math.PR',
    'cs.OS',
    'eess.SP',
    'math.OC',
    'math.DS',
    'math.DG',
    'math.MP',
    'cs.MM',
    'stat.ME',
    'math.CO',
    'cs.NE',
}

# Categories whose records bypass the abstract-length and update-date filters.
# Built once here instead of once per input line.
no_filter_categories = {'cs.OS'}

# Bounds applied to every record that is not exempt via no_filter_categories.
MIN_ABSTRACT_LEN = 300
MAX_ABSTRACT_LEN = 1024
MIN_UPDATE_YEAR = 2016

input_path = "arxiv-metadata-oai-snapshot.json"  # path to the raw snapshot
output_path = "arxiv-metadata-oai-snapshot-multi.json"  # output path, JSON Lines format


def should_keep(record):
    """Decide whether one parsed snapshot record belongs in the multi-category output.

    A record is kept when it lists more than one category, exactly one of
    those categories is in ``target_categories``, and either one of its
    categories is in ``no_filter_categories`` or its abstract length and
    update year fall inside the configured bounds.

    Args:
        record: The value produced by ``json.loads`` for one input line.

    Returns:
        True if the record should be written to the output, False otherwise.
        Anything that is not a dict, or whose fields are missing, null or
        malformed, yields False instead of raising.
    """
    if not isinstance(record, dict):
        return False

    # `or ""` also covers an explicit JSON null, which .get() would return as None.
    record_cats = (record.get("categories") or "").split()

    # Only multi-category records are handled by this script.
    if len(record_cats) <= 1:
        return False

    # Exactly one of the listed categories may be a target category.
    target_count = sum(1 for cat in record_cats if cat in target_categories)
    if target_count != 1:
        return False

    # Records carrying an exempt category are written without further checks.
    if any(cat in no_filter_categories for cat in record_cats):
        return True

    abstract = record.get("abstract") or ""
    if not MIN_ABSTRACT_LEN <= len(abstract) <= MAX_ABSTRACT_LEN:
        return False

    update_date = record.get("update_date") or ""
    try:
        # The first four characters are read as the year.
        return int(update_date[:4]) >= MIN_UPDATE_YEAR
    except (TypeError, ValueError):
        # An empty or malformed date drops this record rather than aborting the run.
        return False


def filter_snapshot(src_path, dst_path):
    """Copy every record accepted by ``should_keep`` from src_path to dst_path.

    Both files are JSON Lines. Lines that are not valid JSON are skipped.

    Args:
        src_path: Path of the input file.
        dst_path: Path of the output file; it is overwritten.

    Returns:
        The number of records written.
    """
    kept = 0
    # Explicit UTF-8: without it open() falls back to the platform locale
    # encoding, which can fail on non-ASCII text in the input.
    with open(src_path, 'r', encoding='utf-8') as infile, \
            open(dst_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue  # ignore malformed lines
            if should_keep(record):
                outfile.write(json.dumps(record) + '\n')
                kept += 1
    return kept


if __name__ == "__main__":
    count = filter_snapshot(input_path, output_path)
    print(f"筛选完成,共保存了 {count} 条记录到 {output_path}")
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										31
									
								
								01-pre.py
									
									
									
									
									
								
							
							
						
						
									
										31
									
								
								01-pre.py
									
									
									
									
									
								
							@@ -40,7 +40,7 @@ target_categories = {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
input_path = "arxiv-metadata-oai-snapshot.json"#原数据路径
 | 
			
		||||
output_path = "arxiv-metadata-oai-snapshot--26.json"  # 使用 JSON Lines 格式输出路径
 | 
			
		||||
output_path = "arxiv-metadata-oai-snapshot-single.json"  # 使用 JSON Lines 格式输出路径
 | 
			
		||||
 | 
			
		||||
count = 0
 | 
			
		||||
 | 
			
		||||
@@ -49,11 +49,34 @@ with open(input_path, 'r') as infile, open(output_path, 'w') as outfile:
 | 
			
		||||
        try:
 | 
			
		||||
            record = json.loads(line)
 | 
			
		||||
            record_cats = record.get("categories", "").split()
 | 
			
		||||
                        # 获取更新日期和摘要
 | 
			
		||||
            update_date = record.get("update_date", "")
 | 
			
		||||
            abstract = record.get("abstract", "")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
            # 只保留一个类别的记录
 | 
			
		||||
            if len(record_cats) > 1:
 | 
			
		||||
                continue
 | 
			
		||||
            if record_cats:
 | 
			
		||||
                last_cat = record_cats[-1]
 | 
			
		||||
                last_cat = record_cats[0]
 | 
			
		||||
                if last_cat in target_categories:
 | 
			
		||||
                    outfile.write(json.dumps(record) + '\n')
 | 
			
		||||
                    count += 1
 | 
			
		||||
                    # 定义无需过滤条件的类别
 | 
			
		||||
                    no_filter_categories = {'cs.OS', 'cs.MM', 'cs.NE', 'math.MP'}
 | 
			
		||||
                    
 | 
			
		||||
                    # 如果属于无需过滤的类别,直接写入
 | 
			
		||||
                    if last_cat in no_filter_categories:
 | 
			
		||||
                        outfile.write(json.dumps(record) + '\n')
 | 
			
		||||
                        count += 1
 | 
			
		||||
                    else:
 | 
			
		||||
                        # 其他类别需要满足过滤条件
 | 
			
		||||
                        if len(abstract) >= 300 and len(abstract) <= 1024:
 | 
			
		||||
                            if update_date and int(update_date[:4]) >= 2016:
 | 
			
		||||
                                outfile.write(json.dumps(record) + '\n')
 | 
			
		||||
                                count += 1
 | 
			
		||||
 | 
			
		||||
        except json.JSONDecodeError:
 | 
			
		||||
            continue  # 忽略格式错误的行
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -1,93 +1,190 @@
 | 
			
		||||
import json
 | 
			
		||||
import random
 | 
			
		||||
categorys = [
 | 
			
		||||
    'quant-ph',
 | 
			
		||||
    'physics.chem-ph', 
 | 
			
		||||
    'physics.atom-ph',
 | 
			
		||||
    'cond-mat.soft',
 | 
			
		||||
    'cs.RO',
 | 
			
		||||
    'cs.CL',
 | 
			
		||||
    'cs.SE',
 | 
			
		||||
    'cs.IR',
 | 
			
		||||
    'hep-th',
 | 
			
		||||
    'hep-ph',
 | 
			
		||||
    'physics.optics',
 | 
			
		||||
    'cs.AI',
 | 
			
		||||
    'cs.CV',
 | 
			
		||||
    'nucl-th',
 | 
			
		||||
    'astro-ph',
 | 
			
		||||
    'math.PR',
 | 
			
		||||
    'cs.OS' ,
 | 
			
		||||
    'eess.SP',
 | 
			
		||||
    'math.OC',
 | 
			
		||||
    'math.DS',
 | 
			
		||||
    'math.DG',
 | 
			
		||||
    'math.MP',
 | 
			
		||||
    'cs.MM',
 | 
			
		||||
    'stat.ME',
 | 
			
		||||
    'math.CO',
 | 
			
		||||
    'cs.NE'
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
input_path = "arxiv-metadata-oai-snapshot--26.json"
 | 
			
		||||
output_path = "arxiv-metadata-oai-snapshot--26-500.json"
 | 
			
		||||
sample_size = 4000  # 你可以改成 10000 等其他数字
 | 
			
		||||
 | 
			
		||||
def extract_category_mapping():
    """Return a new dict mapping each arXiv category name to its option letter.

    The 26 categories are paired, in order, with the letters 'A' through 'Z'.
    """
    ordered_categories = (
        'quant-ph',
        'physics.chem-ph',
        'physics.atom-ph',
        'cond-mat.soft',
        'cs.RO',
        'cs.CL',
        'cs.SE',
        'cs.IR',
        'hep-th',
        'hep-ph',
        'physics.optics',
        'cs.AI',
        'cs.CV',
        'nucl-th',
        'astro-ph',
        'math.PR',
        'cs.OS',
        'eess.SP',
        'math.OC',
        'math.DS',
        'math.DG',
        'math.MP',
        'cs.MM',
        'stat.ME',
        'math.CO',
        'cs.NE',
    )
    option_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # zip pairs the i-th category with the i-th letter; both have 26 entries.
    return dict(zip(ordered_categories, option_letters))
 | 
			
		||||
 | 
			
		||||
def get_category_options_text():
    """Build the multiple-choice option text, one "<letter>. <category>" per line."""
    labelled_categories = (
        ("A", "quant-ph"),
        ("B", "physics.chem-ph"),
        ("C", "physics.atom-ph"),
        ("D", "cond-mat.soft"),
        ("E", "cs.RO"),
        ("F", "cs.CL"),
        ("G", "cs.SE"),
        ("H", "cs.IR"),
        ("I", "hep-th"),
        ("J", "hep-ph"),
        ("K", "physics.optics"),
        ("L", "cs.AI"),
        ("M", "cs.CV"),
        ("N", "nucl-th"),
        ("O", "astro-ph"),
        ("P", "math.PR"),
        ("Q", "cs.OS"),
        ("R", "eess.SP"),
        ("S", "math.OC"),
        ("T", "math.DS"),
        ("U", "math.DG"),
        ("V", "math.MP"),
        ("W", "cs.MM"),
        ("X", "stat.ME"),
        ("Y", "math.CO"),
        ("Z", "cs.NE"),
    )
    rendered = [f"{letter}. {name}" for letter, name in labelled_categories]
    return "\n".join(rendered)
 | 
			
		||||
 | 
			
		||||
def process_paper(paper_data, verbose=False):
    """Turn one arXiv metadata record into a single-turn classification sample.

    The label is the first category listed in the record's ``categories``
    field that has an option letter in ``extract_category_mapping()``.
    Membership and the letter lookup use that one mapping, so they cannot
    disagree.

    Args:
        paper_data: Dict for one paper; the keys read are 'id', 'title',
            'authors', 'abstract' and 'categories' (space-separated string).
        verbose: When True, print the chosen category and option letter.

    Returns:
        A dict with a "system" prompt and a one-element "conversation" list
        holding the "human" question and the "assistant" option letter.

    Raises:
        KeyError: If none of the record's categories has an option letter.
    """
    category_mapping = extract_category_mapping()

    # Basic fields; newlines in title and abstract are flattened to spaces.
    paper_id = paper_data.get('id', '')
    title = paper_data.get('title', '').replace('\n', ' ').strip()
    authors = paper_data.get('authors', '')
    abstract = paper_data.get('abstract', '').replace('\n', ' ').strip()
    categories = paper_data.get('categories', '')

    # Categories are space-separated. The first one that has an option letter
    # is the label; this covers single-category and multi-category records.
    category_list = categories.split()
    target_category = next(
        (category for category in category_list if category in category_mapping),
        None,
    )

    if target_category is None:
        # Same exception type as a failed mapping lookup, but it names the
        # offending record instead of reporting only the missing key.
        raise KeyError(
            f"paper {paper_id!r} has no target category in {categories!r}"
        )

    # Option letter that is the correct answer for this paper.
    correct_option = category_mapping[target_category]

    # Build the human question.
    options_text = get_category_options_text()
    human_content = f"Based on the title '{title}', authors '{authors}', and abstract '{abstract}', please determine the scientific category of this paper.\n\n{options_text}"

    # Build the JSONL entry.
    jsonl_entry = {
        "system": "你是个优秀的论文分类师",
        "conversation": [
            {
                "human": human_content,
                "assistant": correct_option
            }
        ]
    }

    if verbose:
        print(f"处理论文 {paper_id}: {target_category} -> {correct_option}")

    return jsonl_entry
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# input_path = "arxiv-metadata-oai-snapshot-single.json"
 | 
			
		||||
# output_path_1 = "arxiv-metadata-oai-snapshot-single-batch1.json"
 | 
			
		||||
# output_path_2 = "arxiv-metadata-oai-snapshot-single-batch2.json"
 | 
			
		||||
# batch1_size_per_category = 400
 | 
			
		||||
# batch2_size_per_category = 600
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
input_path = "arxiv-metadata-oai-snapshot-multi.json"
 | 
			
		||||
output_path_1 = "arxiv-metadata-oai-snapshot-multi-batch1.json"
 | 
			
		||||
output_path_2 = "arxiv-metadata-oai-snapshot-multi-batch2.json"
 | 
			
		||||
 | 
			
		||||
batch1_size_per_category = 400
 | 
			
		||||
batch2_size_per_category = 400
 | 
			
		||||
 | 
			
		||||
# 先将所有数据加载到内存中
 | 
			
		||||
with open(input_path, 'r') as infile:
 | 
			
		||||
    data = [json.loads(line) for line in infile]
 | 
			
		||||
 | 
			
		||||
print(f"原始数据量:{len(data)} 条")
 | 
			
		||||
 | 
			
		||||
## 按类别筛选数据,不是随机
 | 
			
		||||
## 每个类别指定抽取的比例
 | 
			
		||||
# category_proportions = {
 | 
			
		||||
#     'astro-ph': 0.1336,
 | 
			
		||||
#     'cond-mat.mes-hall': 0.0486,
 | 
			
		||||
#     'cond-mat.mtrl-sci': 0.0587,
 | 
			
		||||
#     'cs.CL': 0.085,
 | 
			
		||||
#     'cs.CV': 0.0931,
 | 
			
		||||
#     'cs.LG': 0.0992,
 | 
			
		||||
#     'gr-qc': 0.1174,
 | 
			
		||||
#     'hep-ph': 0.1194,
 | 
			
		||||
#     'hep-th': 0.085,
 | 
			
		||||
#     'quant-ph': 0.1599
 | 
			
		||||
# }
 | 
			
		||||
 | 
			
		||||
category_proportions = {
 | 
			
		||||
        'quant-ph': 0.1,
 | 
			
		||||
        'physics.chem-ph': 0.1, 
 | 
			
		||||
        'physics.atom-ph': 0.1,
 | 
			
		||||
        'cond-mat.soft': 0.1,
 | 
			
		||||
        'cs.RO': 0.1,
 | 
			
		||||
        'cs.CL': 0.1,
 | 
			
		||||
        'cs.SE': 0.1,
 | 
			
		||||
        'cs.IR': 0.1,
 | 
			
		||||
        'hep-th': 0.1,
 | 
			
		||||
        'hep-ph': 0.1,
 | 
			
		||||
        'physics.optics': 0.1,
 | 
			
		||||
        'cs.AI': 0.1,
 | 
			
		||||
        'cs.CV': 0.1,
 | 
			
		||||
        'nucl-th': 0.1,
 | 
			
		||||
        'astro-ph': 0.1,
 | 
			
		||||
        'math.PR': 0.1,
 | 
			
		||||
        'cs.OS': 0.1,
 | 
			
		||||
        'eess.SP': 0.1,
 | 
			
		||||
        'math.OC': 0.1,
 | 
			
		||||
        'math.DS': 0.1,
 | 
			
		||||
        'math.DG': 0.1,
 | 
			
		||||
        'math.MP': 0.1,
 | 
			
		||||
        'cs.MM': 0.1,
 | 
			
		||||
        'stat.ME': 0.1,
 | 
			
		||||
        'math.CO': 0.1,
 | 
			
		||||
        'cs.NE': 0.1
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# 存储两个批次的数据
 | 
			
		||||
batch1_data = []
 | 
			
		||||
batch2_data = []
 | 
			
		||||
 | 
			
		||||
## print 每个类别的筛选比例和数量
 | 
			
		||||
print("每个类别的筛选比例和数量:")
 | 
			
		||||
for category, proportion in category_proportions.items():
 | 
			
		||||
    count = sample_size * proportion
 | 
			
		||||
    print(f"类别 {category}: 抽取比例 {proportion}, 数量 {count}")
 | 
			
		||||
# 按每个类别的数量筛选数据
 | 
			
		||||
filtered_data = []
 | 
			
		||||
for category, proportion in category_proportions.items():
 | 
			
		||||
    count = int(sample_size * proportion)
 | 
			
		||||
# 按类别处理数据
 | 
			
		||||
for category in categorys:
 | 
			
		||||
    # 筛选出当前类别的数据
 | 
			
		||||
    category_data = [item for item in data if item.get('categories', '').strip() == category]
 | 
			
		||||
    # 如果当前类别的数据量小于需要抽取的数量,则全部取出
 | 
			
		||||
    if len(category_data) < count:
 | 
			
		||||
        filtered_data.extend(category_data)
 | 
			
		||||
    else:
 | 
			
		||||
        # 随机抽样指定数量的数据
 | 
			
		||||
        sampled_data = random.sample(category_data, count)
 | 
			
		||||
        filtered_data.extend(sampled_data)
 | 
			
		||||
    print(f"类别 {category}: 抽取数量 {count}")
 | 
			
		||||
    category_data = [item for item in data if category in item.get('categories', '').strip().split()]
 | 
			
		||||
    print(f"类别 {category}: 总共 {len(category_data)} 条")
 | 
			
		||||
    
 | 
			
		||||
    # 打乱数据顺序
 | 
			
		||||
    random.shuffle(category_data)
 | 
			
		||||
    
 | 
			
		||||
    # 确定第一批和第二批的数量
 | 
			
		||||
    total_count = len(category_data)
 | 
			
		||||
    batch1_count = min(batch1_size_per_category, total_count)
 | 
			
		||||
    batch2_count = min(batch2_size_per_category, total_count - batch1_count)
 | 
			
		||||
    
 | 
			
		||||
    # 分配数据到两个批次
 | 
			
		||||
    batch1_data.extend(category_data[:batch1_count])
 | 
			
		||||
    batch2_data.extend(category_data[batch1_count:batch1_count + batch2_count])
 | 
			
		||||
    
 | 
			
		||||
    print(f"类别 {category}: 第一批 {batch1_count} 条, 第二批 {batch2_count} 条")
 | 
			
		||||
 | 
			
		||||
# 保存第一批数据
 | 
			
		||||
with open(output_path_1, 'w', encoding='utf-8') as outfile:
 | 
			
		||||
    for record in batch1_data:
 | 
			
		||||
        swft_js = process_paper(record, verbose=False)
 | 
			
		||||
        outfile.write(json.dumps(swft_js, ensure_ascii=False) + '\n')
 | 
			
		||||
 | 
			
		||||
# 保存第二批数据
 | 
			
		||||
with open(output_path_2, 'w', encoding='utf-8') as outfile:
 | 
			
		||||
    for record in batch2_data:
 | 
			
		||||
        swft_js = process_paper(record, verbose=False)
 | 
			
		||||
        outfile.write(json.dumps(swft_js, ensure_ascii=False) + '\n')
 | 
			
		||||
 | 
			
		||||
# 保存结果
 | 
			
		||||
with open(output_path, 'w') as outfile:
 | 
			
		||||
    for record in filtered_data:
 | 
			
		||||
        outfile.write(json.dumps(record) + '\n')
 | 
			
		||||
 | 
			
		||||
print(f"已按比例抽取 {sample_size} 条数据保存到 {output_path}")
 | 
			
		||||
print(f"第一批数据: {len(batch1_data)} 条,已保存到 {output_path_1}")
 | 
			
		||||
print(f"第二批数据: {len(batch2_data)} 条,已保存到 {output_path_2}")
 | 
			
		||||
@@ -124,6 +124,12 @@ QUESTION_TEMPLATES = [
 | 
			
		||||
    "Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}"
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
QUESTION_TEMPLATES = [
 | 
			
		||||
    "Based on the title '{title}', authors '{authors}', and abstract '{abstract}', please determine the scientific category of this paper.\n\n{category_text}"
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def extract_title_author_and_abstract(content_text):
 | 
			
		||||
    """
 | 
			
		||||
    content_text: 格式示例"Based on the title 'The Quantum Primordial Black Holes, Dimensionless Small Parameter,   Inflationary Cosmology and Non-Gaussianity', authors 'Alexander Shalyt-Margolin', and abstract 'In the present work consideration is given to the primordial black holes ({\\bf pbhs}) in the Schwarzschild-de Sitter Metric with small mass (ultralight) in the preinflationary epoch. Within the scope of natural assumptions, it has been shown that the quantum-gravitational corrections ({\\bf qgcs}) to the characteristics of such black holes can contribute to all the cosmological parameters, shifting them compared with the semiclassical consideration. These contributions are determined by a series expansion in terms of a small parameter dependent on the hole mass (radius). For this pattern different cases have been considered (stationary, black hole evaporation...). It has been demonstrated that involvement of ({\\bf qgcs}) leads to a higher probability for the occurrence of such {\\bf pbhs}. Besides, high-energy deformations of Friedmann Equations created on the basis of these corrections have been derived for different patterns. In the last section of this work it is introduced a study into the contributions generated by the above-mentioned {\\bf qgcs} in inflationary cosmological perturbations. Besides, it has been shown that non-Gaussianity of these perturbations is higher as compared to the semi-classical pattern.', please determine the scientific category of this paper. Additional info: 35 pages, Latex , 
 | 
			
		||||
@@ -548,7 +554,7 @@ if __name__ == "__main__":
 | 
			
		||||
    output_file_pre = r"G:\\11\data-prepare\\arxiv_papers-multi_type-pre.json"
 | 
			
		||||
    paper_datas=get_paper_data_from_crawl_jason(input_file)
 | 
			
		||||
    convert_onedata2multi_type_sft(paper_datas, output_file_sft, num_templates=1)
 | 
			
		||||
    convert_onedata2multi_type_pre(paper_datas, output_file_pre, num_templates=1)
 | 
			
		||||
    #convert_onedata2multi_type_pre(paper_datas, output_file_pre, num_templates=1)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -50,5 +50,5 @@ def get_Composition_ratio(input_file):
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
    # input_file = "sftdata.jsonl"
 | 
			
		||||
    input_file = "output-26.jsonl"
 | 
			
		||||
    input_file = "arxiv-metadata-oai-snapshot--swift-26.json"
 | 
			
		||||
    input_file = "G:\\11\\data-prepare\\arxiv-metadata-oai-snapshot-multi-batch1.json"
 | 
			
		||||
    get_Composition_ratio(input_file)
 | 
			
		||||
 
 | 
			
		||||
@@ -11,7 +11,7 @@ from sklearn.metrics import (
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
# 配置参数
 | 
			
		||||
RESULT_FILE = "G:\\11\\data-prepare\\20250720-195839.jsonl"  # 替换为你的结果文件路径
 | 
			
		||||
RESULT_FILE = "G:\\11\\data-prepare\\20250727-084808.jsonl"  # 替换为你的结果文件路径
 | 
			
		||||
OUTPUT_DIR = "G:\\11\\data-prepare\\analysis_results"  # 分析结果输出目录
 | 
			
		||||
EXPORT_CSV = True  # 是否导出CSV格式的详细结果
 | 
			
		||||
PLOT_CONFUSION_MATRIX = True  # 是否绘制混淆矩阵
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user