添加从arXiv批量获取论文数据的功能,并将结果保存为JSONL格式,优化了数据处理流程
This commit is contained in:
		
							
								
								
									
										560
									
								
								05-data-swfit-sft2multi_type-crawl.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										560
									
								
								05-data-swfit-sft2multi_type-crawl.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,560 @@
 | 
				
			|||||||
 | 
					      
 | 
				
			||||||
 | 
					import json
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					import argparse
 | 
				
			||||||
 | 
					import random
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# Multiple-choice option list substituted into the prompt templates as
					# {category_text}. The text begins with one space and ends with a newline;
					# both characters are part of the value.
					CATEGORY_TEXT = (
					    " A. quant-ph\n"
					    "B. physics.chem-ph\n"
					    "C. physics.atom-ph\n"
					    "D. cond-mat.soft\n"
					    "E. cs.RO\n"
					    "F. cs.CL\n"
					    "G. cs.SE\n"
					    "H. cs.IR\n"
					    "I. hep-th\n"
					    "J. hep-ph\n"
					    "K. physics.optics\n"
					    "L. cs.AI\n"
					    "M. cs.CV\n"
					    "N. nucl-th\n"
					    "O. astro-ph\n"
					    "P. math.PR\n"
					    "Q. cs.OS\n"
					    "R. eess.SP\n"
					    "S. math.OC\n"
					    "T. math.DS\n"
					    "U. math.DG\n"
					    "V. math.MP\n"
					    "W. cs.MM\n"
					    "X. stat.ME\n"
					    "Y. math.CO\n"
					    "Z. cs.NE\n"
					)
 | 
				
			||||||
 | 
					# Maps each category name to its single-letter option label (A-Z), in the
					# order the names are listed below.
					CATEGORY_DICT = dict(zip(
					    (
					        "quant-ph",
					        "physics.chem-ph",
					        "physics.atom-ph",
					        "cond-mat.soft",
					        "cs.RO",
					        "cs.CL",
					        "cs.SE",
					        "cs.IR",
					        "hep-th",
					        "hep-ph",
					        "physics.optics",
					        "cs.AI",
					        "cs.CV",
					        "nucl-th",
					        "astro-ph",
					        "math.PR",
					        "cs.OS",
					        "eess.SP",
					        "math.OC",
					        "math.DS",
					        "math.DG",
					        "math.MP",
					        "cs.MM",
					        "stat.ME",
					        "math.CO",
					        "cs.NE",
					    ),
					    "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
					))
 | 
				
			||||||
 | 
					# Prompt templates. Each one is a str.format pattern with the fields
					# {title}, {authors}, {abstract} and {category_text}.
					QUESTION_TEMPLATES = [
					    # Direct question
					    "{category_text}What is the scientific category for a paper titled '{title}', authored by {authors}, with abstract '{abstract}'?",
					    # Imperative
					    "Classify this paper into its scientific category based on title '{title}', authors '{authors}', and abstract '{abstract}'.{category_text}",
					    # Descriptive lead-in
					    "{category_text}Given a research paper with title '{title}', authors {authors}, and abstract '{abstract}', identify the appropriate discipline.",
					    # Formal request
					    "Please assign the scientific category for the paper: title '{title}', authors '{authors}', abstract '{abstract}'.{category_text}",
					    # Abstract first
					    "Using the abstract '{abstract}', title '{title}', and authors '{authors}', determine the paper's category.{category_text}",
					    # Author emphasis
					    "{category_text}From authors '{authors}', title '{title}', and abstract '{abstract}', what category does this paper fall into?",
					    # Question chain
					    "Here's a paper: title '{title}', authors {authors}, abstract '{abstract}'. What is its scientific category?{category_text}",
					    # Concise
					    "Category for: title '{title}', authors '{authors}', abstract '{abstract}'?{category_text}",
					    # Context embedding
					    "Considering the title '{title}', the authors '{authors}', and the abstract content '{abstract}', please specify the paper's field.{category_text}",
					    # Informal / colloquial
					    "Hey, what category is this paper? Title '{title}', by {authors}, abstract '{abstract}'.{category_text}",
					    # Enumerated elements
					    "{category_text}Title: '{title}'. Authors: '{authors}'. Abstract: '{abstract}'. Now, what's the scientific category?",
					    # Hypothetical scenario
					    "If a paper has title '{title}', authors '{authors}', and abstract '{abstract}', which scientific category best fits it?{category_text}",
					    # Emphasis on key information
					    "Based solely on the title '{title}', authors list '{authors}', and abstract text '{abstract}', categorize this paper.{category_text}",
					    # Indirect inquiry
					    "For the paper '{title}' by {authors}, with abstract '{abstract}', could you indicate its scientific discipline?{category_text}",
					    # Full-sentence integration
					    "Determine the category of the research paper entitled '{title}', written by {authors}, and summarized as '{abstract}'.{category_text}",
					    # Abstract-focused question
					    "The abstract '{abstract}' describes a paper titled '{title}' by authors '{authors}'. What category is it?{category_text}",
					    # Title-driven
					    "{category_text}Starting from the title '{title}', and considering authors '{authors}' and abstract '{abstract}', what is the paper's category?",
					    # Multi-part query
					    "Part 1: Title is '{title}'. Part 2: Authors are '{authors}'. Part 3: Abstract is '{abstract}'. Based on this, classify the paper.{category_text}",
					    # Comparative
					    "Given the details: title '{title}', authors '{authors}', abstract '{abstract}', how would you categorize this paper scientifically?{category_text}",
					    # Action-oriented
					    "Using the provided title '{title}', authors '{authors}', and abstract '{abstract}', output the scientific category for this paper.{category_text}",
					]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def extract_title_author_and_abstract(content_text):
					    """
					    Extract the title, authors and abstract from a JSON record or a prompt.

					    Two input shapes are recognised:

					    * A JSON object string that contains a "title" key and an
					      "author_names" or "authors" key. The abstract is read from
					      "summary", falling back to "abstract".
					    * A quoted prompt such as
					      "Based on the title 'T', authors 'A', and abstract 'B', please ...".
					      The text is split on the "'," separator and the first single-quoted
					      span of each of the first three pieces is returned.

					    Args:
					        content_text: JSON string or prompt text to parse.

					    Returns:
					        dict: {"title": ..., "authors": ..., "abstract": ...}. All three
					        values are empty strings when the text cannot be parsed or an
					        unexpected error occurs (the error is printed).
					    """

					    def first_quoted(fragment):
					        # Text between the first and second single quote, or up to the end
					        # of the fragment when there is no closing quote; "" if unquoted.
					        pieces = fragment.split("'")
					        return pieces[1].strip() if len(pieces) >= 2 else ""

					    try:
					        looks_like_json = (
					            content_text.strip().startswith('{')
					            and '"title"' in content_text
					            and ('"author_names"' in content_text or '"authors"' in content_text)
					        )
					        if looks_like_json:
					            try:
					                paper_data = json.loads(content_text)
					            except json.JSONDecodeError:
					                # Not valid JSON after all: fall through to quote parsing.
					                paper_data = None

					            if isinstance(paper_data, dict):
					                raw_authors = paper_data.get("author_names", paper_data.get("authors", []))
					                if isinstance(raw_authors, (list, tuple)):
					                    authors = ", ".join(str(author) for author in raw_authors)
					                elif raw_authors is None:
					                    authors = ""
					                else:
					                    # A plain string must be used as-is; joining it would
					                    # insert ", " between every character.
					                    authors = str(raw_authors)
					                return {
					                    "title": paper_data.get("title", ""),
					                    "authors": authors,
					                    "abstract": paper_data.get("summary", paper_data.get("abstract", "")),
					                }

					        parts = content_text.split("',")
					        if len(parts) < 3:
					            # Fewer than three "',"-separated pieces: not a recognised prompt.
					            return {"title": "", "authors": "", "abstract": ""}

					        return {
					            "title": first_quoted(parts[0]),
					            "authors": first_quoted(parts[1]),
					            "abstract": first_quoted(parts[2]),
					        }
					    except Exception as e:
					        # Best-effort parser: report the problem and return empty fields.
					        print(f"解析内容时出错: {e}")
					        return {"title": "", "authors": "", "abstract": ""}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def parse_new_format_data(data):
					    """
					    Pull the system, user and assistant texts out of a "messages" record.

					    Args:
					        data: Mapping expected to hold a "messages" list of at least three
					            entries, each with "role" and "content" keys.

					    Returns:
					        tuple: (system_instruction, human_content, assistant_content).
					        A role that never appears yields "". When "messages" is missing,
					        is not a list, or has fewer than three entries the result is
					        (None, None, None).
					    """
					    if "messages" in data:
					        messages = data["messages"]
					        if isinstance(messages, list) and len(messages) >= 3:
					            latest = {"system": "", "user": "", "assistant": ""}
					            for message in messages:
					                role = message["role"]
					                # When a role repeats, the last occurrence wins.
					                if role in ("system", "user", "assistant"):
					                    latest[role] = message["content"]
					            return latest["system"], latest["user"], latest["assistant"]

					    return None, None, None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def parse_old_format_data(data):
					    """
					    Pull the system instruction and conversation out of an old-format record.

					    Args:
					        data: Mapping expected to hold "system" and a non-empty "conversation".

					    Returns:
					        tuple: (system_instruction, conversation_data), or (None, None) when
					        either key is missing or the conversation is empty.
					    """
					    has_both_keys = "system" in data and "conversation" in data
					    if has_both_keys and data["conversation"]:
					        # "system" is known to be present here, so the fallback text passed
					        # to .get() is never actually returned.
					        return (
					            data.get("system", "根据论文的标题、作者和摘要,确定该论文的科学类别。"),
					            data["conversation"],
					        )

					    return None, None
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def generate_multi_type_samples(title, authors, abstract, system_instruction, assistant_content, num_templates):
					    """
					    Build "messages"-format samples from randomly chosen question templates.

					    Args:
					        title: Paper title.
					        authors: Author text.
					        abstract: Paper abstract.
					        system_instruction: Content of the system message.
					        assistant_content: Content of the assistant message.
					        num_templates: Number of templates to draw; capped at the number of
					            entries in QUESTION_TEMPLATES.

					    Returns:
					        list: One {"messages": [system, user, assistant]} dict per drawn
					        template.
					    """
					    sample_size = min(num_templates, len(QUESTION_TEMPLATES))
					    fields = {
					        "title": title,
					        "authors": authors,
					        "abstract": abstract,
					        "category_text": CATEGORY_TEXT,
					    }

					    # random.sample draws without replacement, so no template repeats.
					    return [
					        {
					            "messages": [
					                {"role": "system", "content": system_instruction},
					                {"role": "user", "content": chosen.format(**fields)},
					                {"role": "assistant", "content": assistant_content},
					            ]
					        }
					        for chosen in random.sample(QUESTION_TEMPLATES, sample_size)
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def process_new_format_data(data, num_templates):
					    """
					    Expand one "messages"-format record into template-based samples.

					    Args:
					        data: Record in the "messages" format.
					        num_templates: Number of templates to use for the record.

					    Returns:
					        list: Generated samples; empty when the record has no user content.
					    """
					    system_instruction, human_content, assistant_content = parse_new_format_data(data)

					    if human_content:
					        parsed = extract_title_author_and_abstract(human_content)
					        return generate_multi_type_samples(
					            parsed.get("title", ""),
					            parsed.get("authors", ""),
					            parsed.get("abstract", ""),
					            system_instruction,
					            assistant_content,
					            num_templates,
					        )

					    return []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def process_old_format_data(data, num_templates):
					    """
					    Expand one old-format record into template-based samples.

					    Args:
					        data: Record with "system" and "conversation" keys.
					        num_templates: Number of templates to use per conversation turn.

					    Returns:
					        list: {"system": ..., "conversation": [{"human", "assistant"}]}
					        dicts; empty when the record has no usable conversation.
					    """
					    system_instruction, conversation_data = parse_old_format_data(data)

					    samples = []
					    if not conversation_data:
					        return samples

					    for turn in conversation_data:
					        # Turns lacking either side of the exchange are ignored.
					        if not ("human" in turn and "assistant" in turn):
					            continue

					        parsed = extract_title_author_and_abstract(turn["human"])
					        fields = {
					            "title": parsed.get("title", ""),
					            "authors": parsed.get("authors", ""),
					            "abstract": parsed.get("abstract", ""),
					            "category_text": CATEGORY_TEXT,
					        }

					        sample_size = min(num_templates, len(QUESTION_TEMPLATES))
					        samples.extend(
					            {
					                "system": system_instruction,
					                "conversation": [
					                    {
					                        "human": chosen.format(**fields),
					                        "assistant": turn["assistant"],
					                    }
					                ],
					            }
					            for chosen in random.sample(QUESTION_TEMPLATES, sample_size)
					        )

					    return samples
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def get_paper_data_from_crawl_jason(input_path):
					    """
					    Collect paper records from one file or from every ".jsonl" file in a
					    directory.

					    Args:
					        input_path: Path to a single file, or to a directory whose
					            ".jsonl" entries are read (sub-directories are not walked).

					    Returns:
					        list: Paper records from all files that were read successfully.
					        Empty when the path is neither a file nor a directory.
					    """
					    paper_data_list = []

					    if os.path.isdir(input_path):
					        files_found = 0
					        for filename in filter(lambda name: name.endswith('.jsonl'), os.listdir(input_path)):
					            file_path = os.path.join(input_path, filename)
					            try:
					                file_data = _extract_paper_data_from_file(file_path)
					                paper_data_list.extend(file_data)
					                print(f"已从 {filename} 中提取 {len(file_data)} 条数据")
					                # Only files that were processed without error are counted.
					                files_found += 1
					            except Exception as e:
					                print(f"处理文件 {filename} 时出错: {e}")
					        print(f"在目录中找到 {files_found} 个JSON文件")
					    elif os.path.isfile(input_path):
					        paper_data_list += _extract_paper_data_from_file(input_path)
					        print(f"从文件 {input_path} 中提取了 {len(paper_data_list)} 条数据")
					    else:
					        print(f"路径 {input_path} 既不是文件也不是文件夹")

					    print(f"总共提取了 {len(paper_data_list)} 条论文数据")
					    return paper_data_list
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def _extract_paper_data_from_file(file_path):
					    """
					    Read paper records from a JSON Lines file.

					    Blank lines are skipped. Lines that are not valid JSON, or whose JSON
					    value is not an object, are reported and skipped so that one bad line
					    does not discard the rest of the file.

					    Args:
					        file_path: Path to a UTF-8 encoded JSON Lines file.

					    Returns:
					        list: One dict per usable line with the keys "title", "authors",
					        "abstract" and "category". "authors" is a string (list entries are
					        joined with ", "). "category" falls back to the first entry of
					        "categories", then to "Unknown".
					    """
					    paper_data_list = []

					    with open(file_path, "r", encoding="utf-8") as f:
					        for line_num, line in enumerate(f, 1):
					            line = line.strip()
					            if not line:
					                continue

					            try:
					                item = json.loads(line)
					            except json.JSONDecodeError as e:
					                print(f"解析文件 {file_path} 的第 {line_num} 行时出错: {e}")
					                continue

					            if not isinstance(item, dict):
					                # Valid JSON but not an object (e.g. a list or a number):
					                # there are no fields to read from it.
					                print(f"文件 {file_path} 的第 {line_num} 行不是JSON对象,已跳过")
					                continue

					            # Authors may be stored under either key, as a list or a string.
					            authors_list = item.get("author_names", item.get("authors", []))
					            if isinstance(authors_list, list):
					                # str() keeps a non-string entry from raising TypeError.
					                authors = ", ".join(str(author) for author in authors_list)
					            else:
					                authors = str(authors_list)

					            # A missing, null or empty category falls back to "categories".
					            category = item.get("category") or "Unknown"
					            categories = item.get("categories")
					            if category == "Unknown" and isinstance(categories, list) and categories:
					                category = categories[0]

					            paper_data_list.append({
					                "title": item.get("title", ""),
					                "authors": authors,
					                "abstract": item.get("summary", item.get("abstract", "")),
					                "category": category,
					            })

					    return paper_data_list
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def convert_onedata2multi_type_pre(paper_datas, output_file, num_templates):
					    """
					    Render each paper through randomly chosen question templates and save
					    the prompts as JSON Lines.

					    Every output line has the form
					    {"messages": [{"role": "assistant", "content": <formatted prompt>}]}.

					    Args:
					        paper_datas: Iterable of dicts with "title", "authors" and
					            "summary" or "abstract" keys (missing keys become "").
					        output_file: Path of the file to write; it is overwritten.
					        num_templates: Number of templates drawn per paper, capped at the
					            number of entries in QUESTION_TEMPLATES.
					    """
					    print(f"开始转换数据...每条数据生成{num_templates}条变体")
					    # Only names that are parameters of this function are logged, so the
					    # function does not depend on a module-level "input_file" existing.
					    print(f"开始转换数据 -> {output_file}")

					    multi_type_data = []

					    for item in paper_datas:
					        fields = {
					            "title": item.get("title", ""),
					            "authors": item.get("authors", ""),
					            "abstract": item.get("summary", item.get("abstract", "")),
					            "category_text": CATEGORY_TEXT,
					        }

					        n = min(num_templates, len(QUESTION_TEMPLATES))
					        for template in random.sample(QUESTION_TEMPLATES, n):
					            multi_type_data.append({
					                "messages": [
					                    {
					                        "role": "assistant",
					                        "content": template.format(**fields),
					                    }
					                ]
					            })

					    # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
					    with open(output_file, "w", encoding="utf-8") as f:
					        for item in multi_type_data:
					            f.write(json.dumps(item, ensure_ascii=False) + "\n")

					    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def convert_onedata2multi_type_sft(paper_datas, output_file, num_templates):
					    """
					    Render each paper through randomly chosen question templates and save
					    question/answer samples as JSON Lines.

					    Every output line has the form
					    {"system": <instruction>,
					     "conversation": [{"human": <formatted prompt>, "assistant": <answer>}]}.
					    The answer is the option letter that CATEGORY_DICT assigns to the
					    paper's "category", or "Unknown" when the category is not listed.

					    Args:
					        paper_datas: Iterable of dicts with "title", "authors", "category"
					            and "summary" or "abstract" keys (missing keys get defaults).
					        output_file: Path of the file to write; it is overwritten.
					        num_templates: Number of templates drawn per paper, capped at the
					            number of entries in QUESTION_TEMPLATES.
					    """
					    print(f"开始转换数据...每条数据生成{num_templates}条变体")
					    # Only names that are parameters of this function are logged, so the
					    # function does not depend on a module-level "input_file" existing.
					    print(f"开始转换数据 -> {output_file}")

					    # The instruction is identical for every sample, so it is built once.
					    system_instruction = "你是个优秀的论文分类师,根据论文的标题、作者和摘要,确定该论文的科学类别。"
					    multi_type_data = []

					    for item in paper_datas:
					        fields = {
					            "title": item.get("title", ""),
					            "authors": item.get("authors", ""),
					            "abstract": item.get("summary", item.get("abstract", "")),
					            "category_text": CATEGORY_TEXT,
					        }
					        category = item.get("category", "Unknown")
					        # NOTE(review): papers whose category is not in CATEGORY_DICT are
					        # still emitted with the answer "Unknown" - confirm this is wanted.
					        answer = CATEGORY_DICT.get(category, "Unknown")

					        n = min(num_templates, len(QUESTION_TEMPLATES))
					        for template in random.sample(QUESTION_TEMPLATES, n):
					            multi_type_data.append({
					                "system": system_instruction,
					                "conversation": [
					                    {
					                        "human": template.format(**fields),
					                        "assistant": answer,
					                    }
					                ]
					            })

					    # One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
					    with open(output_file, "w", encoding="utf-8") as f:
					        for item in multi_type_data:
					            f.write(json.dumps(item, ensure_ascii=False) + "\n")

					    print(f"转换完成! 共转换 {len(multi_type_data)} 条数据")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == "__main__":
					    # Command-line entry point. Every option defaults to the value that was
					    # previously hard-coded, so running the script without arguments does
					    # exactly what it did before.
					    parser = argparse.ArgumentParser(
					        description="将爬取的arXiv论文数据转换为多模板的SFT和预训练数据"
					    )
					    parser.add_argument(
					        "--input",
					        default=r"G:\\11\data-prepare\\arxiv_papers\\",
					        help="输入的JSONL文件或包含JSONL文件的文件夹",
					    )
					    parser.add_argument(
					        "--output-sft",
					        default=r"G:\\11\data-prepare\\arxiv_papers-multi_type-sft.json",
					        help="SFT数据的输出文件路径",
					    )
					    parser.add_argument(
					        "--output-pre",
					        default=r"G:\\11\data-prepare\\arxiv_papers-multi_type-pre.json",
					        help="预训练数据的输出文件路径",
					    )
					    parser.add_argument(
					        "--num-templates",
					        type=int,
					        default=1,
					        help="每条数据使用的模板数量",
					    )
					    args = parser.parse_args()

					    # These stay module-level names, as they were in the original script.
					    input_file = args.input
					    output_file_sft = args.output_sft
					    output_file_pre = args.output_pre

					    paper_datas = get_paper_data_from_crawl_jason(input_file)
					    convert_onedata2multi_type_sft(paper_datas, output_file_sft, num_templates=args.num_templates)
					    convert_onedata2multi_type_pre(paper_datas, output_file_pre, num_templates=args.num_templates)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -104,6 +104,18 @@ def extract_title_author_and_abstract(content_text):
 | 
				
			|||||||
    
 | 
					    
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    try:
 | 
					    try:
 | 
				
			||||||
 | 
					        # 针对可以直接解析的JSON格式数据进行处理
 | 
				
			||||||
 | 
					        if content_text.strip().startswith('{') and '"title"' in content_text and '"author_names"' in content_text:
 | 
				
			||||||
 | 
					            try:
 | 
				
			||||||
 | 
					                # 尝试解析为JSON对象
 | 
				
			||||||
 | 
					                paper_data = json.loads(content_text)
 | 
				
			||||||
 | 
					                title = paper_data.get("title", "")
 | 
				
			||||||
 | 
					                authors = ", ".join(paper_data.get("author_names", []))
 | 
				
			||||||
 | 
					                abstract = paper_data.get("summary", paper_data.get("abstract", ""))
 | 
				
			||||||
 | 
					                return {"title": title, "authors": authors, "abstract": abstract}
 | 
				
			||||||
 | 
					            except:
 | 
				
			||||||
 | 
					                pass
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
        #content_text.split("',")
 | 
					        #content_text.split("',")
 | 
				
			||||||
        parts = content_text.split("',")
 | 
					        parts = content_text.split("',")
 | 
				
			||||||
        if len(parts) < 3:
 | 
					        if len(parts) < 3:
 | 
				
			||||||
@@ -324,6 +336,42 @@ def convert_onedata2multi_type(input_file, output_file, num_templates):
 | 
				
			|||||||
    
 | 
					    
 | 
				
			||||||
    multi_type_data = []
 | 
					    multi_type_data = []
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 | 
					    # 检查是否为JSON文件格式
 | 
				
			||||||
 | 
					    if input_file.endswith('.json'):
 | 
				
			||||||
 | 
					        # 处理JSON格式文件
 | 
				
			||||||
 | 
					        with open(input_file, "r", encoding="utf-8") as f:
 | 
				
			||||||
 | 
					            json_data = json.load(f)
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					        for item in json_data:
 | 
				
			||||||
 | 
					            title = item.get("title", "")
 | 
				
			||||||
 | 
					            authors = ", ".join(item.get("author_names", item.get("authors", [])))
 | 
				
			||||||
 | 
					            abstract = item.get("summary", item.get("abstract", ""))
 | 
				
			||||||
 | 
					            category = item.get("category", "Unknown")
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            # 生成系统指令
 | 
				
			||||||
 | 
					            system_instruction = "根据论文的标题、作者和摘要,确定该论文的科学类别。"
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            n = min(num_templates, len(QUESTION_TEMPLATES))
 | 
				
			||||||
 | 
					            selected_templates = random.sample(QUESTION_TEMPLATES, n)
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            for template in selected_templates:
 | 
				
			||||||
 | 
					                formatted_question = template.format(
 | 
				
			||||||
 | 
					                    title=title,
 | 
				
			||||||
 | 
					                    authors=authors,
 | 
				
			||||||
 | 
					                    abstract=abstract,
 | 
				
			||||||
 | 
					                    category_text=CATEGORY_TEXT
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					                
 | 
				
			||||||
 | 
					                new_data = {
 | 
				
			||||||
 | 
					                    "messages": [
 | 
				
			||||||
 | 
					                        {"role": "system", "content": system_instruction},
 | 
				
			||||||
 | 
					                        {"role": "user", "content": formatted_question},
 | 
				
			||||||
 | 
					                        {"role": "assistant", "content": category}
 | 
				
			||||||
 | 
					                    ]
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					                multi_type_data.append(new_data)
 | 
				
			||||||
 | 
					    else:
 | 
				
			||||||
 | 
					        # 原有的处理逻辑
 | 
				
			||||||
        with open(input_file, "r", encoding="utf-8") as f:
 | 
					        with open(input_file, "r", encoding="utf-8") as f:
 | 
				
			||||||
            for line_num, line in enumerate(f, 1):
 | 
					            for line_num, line in enumerate(f, 1):
 | 
				
			||||||
                try:
 | 
					                try:
 | 
				
			||||||
 
 | 
				
			|||||||
							
								
								
									
										162
									
								
								crawl-arxiv.py
									
									
									
									
									
								
							
							
						
						
									
										162
									
								
								crawl-arxiv.py
									
									
									
									
									
								
							@@ -1,17 +1,62 @@
 | 
				
			|||||||
import requests
 | 
					import requests
 | 
				
			||||||
from bs4 import BeautifulSoup
 | 
					from bs4 import BeautifulSoup
 | 
				
			||||||
 | 
					import json
 | 
				
			||||||
 | 
					import time
 | 
				
			||||||
 | 
					import os
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def fetch_arxiv_papers(query, max_results=10):
 | 
					CATEGORY_DICT = {
 | 
				
			||||||
 | 
					    "A": "quant-ph",
 | 
				
			||||||
 | 
					    "B": "physics.chem-ph",
 | 
				
			||||||
 | 
					    "C": "physics.atom-ph",
 | 
				
			||||||
 | 
					    "D": "cond-mat.soft",
 | 
				
			||||||
 | 
					    "E": "cs.RO",
 | 
				
			||||||
 | 
					    "F": "cs.CL",
 | 
				
			||||||
 | 
					    "G": "cs.SE",
 | 
				
			||||||
 | 
					    "H": "cs.IR",
 | 
				
			||||||
 | 
					    "I": "hep-th",
 | 
				
			||||||
 | 
					    "J": "hep-ph",
 | 
				
			||||||
 | 
					    "K": "physics.optics",
 | 
				
			||||||
 | 
					    "L": "cs.AI",
 | 
				
			||||||
 | 
					    "M": "cs.CV",
 | 
				
			||||||
 | 
					    "N": "nucl-th",
 | 
				
			||||||
 | 
					    "O": "astro-ph",
 | 
				
			||||||
 | 
					    "P": "math.PR",
 | 
				
			||||||
 | 
					    "Q": "cs.OS",
 | 
				
			||||||
 | 
					    "R": "eess.SP",
 | 
				
			||||||
 | 
					    "S": "math.OC",
 | 
				
			||||||
 | 
					    "T": "math.DS",
 | 
				
			||||||
 | 
					    "U": "math.DG",
 | 
				
			||||||
 | 
					    "V": "math.MP",
 | 
				
			||||||
 | 
					    "W": "cs.MM",
 | 
				
			||||||
 | 
					    "X": "stat.ME",
 | 
				
			||||||
 | 
					    "Y": "math.CO",
 | 
				
			||||||
 | 
					    "Z": "cs.NE"
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def fetch_arxiv_papers_batch(query, start, max_results=100):
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    从arXiv获取一批论文数据
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    Args:
 | 
				
			||||||
 | 
					        query: 搜索查询
 | 
				
			||||||
 | 
					        start: 起始位置
 | 
				
			||||||
 | 
					        max_results: 本次获取结果数(arXiv API最大支持10000)
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
    base_url = "http://export.arxiv.org/api/query"
 | 
					    base_url = "http://export.arxiv.org/api/query"
 | 
				
			||||||
    params = {
 | 
					    params = {
 | 
				
			||||||
        "search_query": query,
 | 
					        "search_query": query,
 | 
				
			||||||
        "start": 0,
 | 
					        "start": start,
 | 
				
			||||||
        "max_results": max_results
 | 
					        "max_results": max_results
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    response = requests.get(base_url, params=params)
 | 
					    
 | 
				
			||||||
 | 
					    try:
 | 
				
			||||||
 | 
					        response = requests.get(base_url, params=params, timeout=30)
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
        if response.status_code == 200:
 | 
					        if response.status_code == 200:
 | 
				
			||||||
            soup = BeautifulSoup(response.content, "xml")
 | 
					            soup = BeautifulSoup(response.content, "xml")
 | 
				
			||||||
            entries = soup.find_all("entry")
 | 
					            entries = soup.find_all("entry")
 | 
				
			||||||
 | 
					            papers = []
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
            for entry in entries:
 | 
					            for entry in entries:
 | 
				
			||||||
                title = entry.title.text.strip()
 | 
					                title = entry.title.text.strip()
 | 
				
			||||||
                summary = entry.summary.text.strip()
 | 
					                summary = entry.summary.text.strip()
 | 
				
			||||||
@@ -24,11 +69,108 @@ def fetch_arxiv_papers(query, max_results=10):
 | 
				
			|||||||
                    if name:
 | 
					                    if name:
 | 
				
			||||||
                        author_names.append(name.text.strip())
 | 
					                        author_names.append(name.text.strip())
 | 
				
			||||||
                
 | 
					                
 | 
				
			||||||
            print(f"标题: {title}")
 | 
					                # 获取分类信息
 | 
				
			||||||
            print(f"作者: {', '.join(author_names)}")
 | 
					                categories = entry.find_all("category")
 | 
				
			||||||
            print(f"摘要: {summary}\n")
 | 
					                category_list = [cat.get("term") for cat in categories]
 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        print("请求失败,状态码:", response.status_code)
 | 
					 | 
				
			||||||
                
 | 
					                
 | 
				
			||||||
# 示例调用
 | 
					                # 获取论文ID和链接
 | 
				
			||||||
fetch_arxiv_papers("cat:math.MP", max_results=5)
 | 
					                paper_id = entry.id.text.strip()
 | 
				
			||||||
 | 
					                published = entry.published.text.strip() if entry.published else ""
 | 
				
			||||||
 | 
					                updated = entry.updated.text.strip() if entry.updated else ""
 | 
				
			||||||
 | 
					                
 | 
				
			||||||
 | 
					                # 构建论文数据结构
 | 
				
			||||||
 | 
					                paper_data = {
 | 
				
			||||||
 | 
					                    "id": paper_id,
 | 
				
			||||||
 | 
					                    "title": title,
 | 
				
			||||||
 | 
					                    "authors": author_names,
 | 
				
			||||||
 | 
					                    "summary": summary,
 | 
				
			||||||
 | 
					                    "categories": category_list,
 | 
				
			||||||
 | 
					                    "published": published,
 | 
				
			||||||
 | 
					                    "updated": updated
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					                
 | 
				
			||||||
 | 
					                papers.append(paper_data)
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            return papers
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            print(f"请求失败,状态码: {response.status_code}")
 | 
				
			||||||
 | 
					            return []
 | 
				
			||||||
 | 
					    except Exception as e:
 | 
				
			||||||
 | 
					        print(f"请求异常: {e}")
 | 
				
			||||||
 | 
					        return []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def save_papers_to_jsonl(papers, category_code, category_name):
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    将论文数据保存为JSONL格式文件
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    Args:
 | 
				
			||||||
 | 
					        papers: 论文数据列表
 | 
				
			||||||
 | 
					        category_code: 类别代码(如"A")
 | 
				
			||||||
 | 
					        category_name: 类别名称(如"quant-ph")
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    # 创建统一的子文件夹
 | 
				
			||||||
 | 
					    folder_name = "arxiv_papers"
 | 
				
			||||||
 | 
					    os.makedirs(folder_name, exist_ok=True)
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    # 文件路径
 | 
				
			||||||
 | 
					    filename = f"arxiv_papers_{category_code}_{category_name.replace('.', '_')}.jsonl"
 | 
				
			||||||
 | 
					    file_path = os.path.join(folder_name, filename)
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    with open(file_path, 'a', encoding='utf-8') as f:
 | 
				
			||||||
 | 
					        for paper in papers:
 | 
				
			||||||
 | 
					            f.write(json.dumps(paper, ensure_ascii=False) + '\n')
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    print(f"已追加保存 {len(papers)} 条数据到 {file_path}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def crawl_category(category_code, category_name, target_count=500):
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    爬取单个类别的论文数据
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    Args:
 | 
				
			||||||
 | 
					        category_code: 类别代码
 | 
				
			||||||
 | 
					        category_name: 类别名称
 | 
				
			||||||
 | 
					        target_count: 目标论文数量
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    query = f"cat:{category_name}"
 | 
				
			||||||
 | 
					    collected_count = 0
 | 
				
			||||||
 | 
					    start = 0
 | 
				
			||||||
 | 
					    batch_size = 100  # 每批获取的论文数量
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    print(f"开始爬取类别 {category_code} ({category_name}) 的论文...")
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    while collected_count < target_count:
 | 
				
			||||||
 | 
					        needed_count = min(batch_size, target_count - collected_count)
 | 
				
			||||||
 | 
					        print(f"正在获取 {collected_count+1} 到 {collected_count+needed_count} 篇论文...")
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        papers = fetch_arxiv_papers_batch(query, start, needed_count)
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        if not papers:
 | 
				
			||||||
 | 
					            print("未获取到更多论文,停止爬取")
 | 
				
			||||||
 | 
					            break
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        # 保存这批论文
 | 
				
			||||||
 | 
					        save_papers_to_jsonl(papers, category_code, category_name)
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        collected_count += len(papers)
 | 
				
			||||||
 | 
					        start += len(papers)
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        print(f"当前已获取 {collected_count} 篇论文")
 | 
				
			||||||
 | 
					        
 | 
				
			||||||
 | 
					        # 避免请求过于频繁
 | 
				
			||||||
 | 
					        time.sleep(3)
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    print(f"完成类别 {category_code} ({category_name}) 的爬取,共获取 {collected_count} 篇论文\n")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def main():
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    主函数:遍历所有类别进行爬取
 | 
				
			||||||
 | 
					    """
 | 
				
			||||||
 | 
					    for category_code, category_name in CATEGORY_DICT.items():
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            crawl_category(category_code, category_name, target_count=500)
 | 
				
			||||||
 | 
					        except Exception as e:
 | 
				
			||||||
 | 
					            print(f"爬取类别 {category_code} ({category_name}) 时出现错误: {e}")
 | 
				
			||||||
 | 
					            continue
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == "__main__":
 | 
				
			||||||
 | 
					    main()
 | 
				
			||||||
		Reference in New Issue
	
	Block a user