data-prepare/crawl-arxiv.py

import requests
from bs4 import BeautifulSoup

def fetch_arxiv_papers(query, max_results=10, timeout=30):
    """Query the arXiv export API and print the papers it returns.

    Sends a single GET request to the arXiv query endpoint, parses the
    response body as XML, and prints the title, author names and summary of
    every ``entry`` element. If the server answers with a status code other
    than 200, the status code is printed instead.

    Args:
        query: Value sent as the ``search_query`` API parameter,
            e.g. ``"cat:math.MP"``.
        max_results: Value sent as the ``max_results`` API parameter.
        timeout: Seconds to wait for the server before ``requests`` gives
            up; passed straight through to ``requests.get``.

    Returns:
        None. All output goes to standard output.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection failure, timeout, ...).
    """
    base_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": query,
        "start": 0,
        "max_results": max_results,
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(base_url, params=params, timeout=timeout)
    if response.status_code != 200:
        print("请求失败，状态码:", response.status_code)
        return

    soup = BeautifulSoup(response.content, "xml")
    for entry in soup.find_all("entry"):
        # Fall back to an empty string instead of crashing with
        # AttributeError when an entry has no <title> or <summary>.
        title_tag = entry.find("title")
        summary_tag = entry.find("summary")
        title = title_tag.text.strip() if title_tag is not None else ""
        summary = summary_tag.text.strip() if summary_tag is not None else ""

        # Collect the <name> text of every <author>; authors that carry no
        # <name> child are skipped.
        author_names = [
            name.text.strip()
            for name in (author.find("name") for author in entry.find_all("author"))
            if name is not None
        ]

        print(f"标题: {title}")
        print(f"作者: {', '.join(author_names)}")
        print(f"摘要: {summary}\n")

# Example call: query "cat:math.MP" with max_results=5.
fetch_arxiv_papers("cat:math.MP", max_results=5)
添加爬取arXiv论文的功能，支持根据查询获取论文标题、作者和摘要 2025-07-25 18:11:11 +08:00
			`import requests`
			`from bs4 import BeautifulSoup`

			`def fetch_arxiv_papers(query, max_results=10):`
			`base_url = "http://export.arxiv.org/api/query"`
			`params = {`
			`"search_query": query,`
			`"start": 0,`
			`"max_results": max_results`
			`}`
			`response = requests.get(base_url, params=params)`
			`if response.status_code == 200:`
			`soup = BeautifulSoup(response.content, "xml")`
			`entries = soup.find_all("entry")`
			`for entry in entries:`
			`title = entry.title.text.strip()`
			`summary = entry.summary.text.strip()`

			`# 获取作者信息`
			`authors = entry.find_all("author")`
			`author_names = []`
			`for author in authors:`
			`name = author.find("name")`
			`if name:`
			`author_names.append(name.text.strip())`

			`print(f"标题: {title}")`
			`print(f"作者: {', '.join(author_names)}")`
			`print(f"摘要: {summary}\n")`
			`else:`
			`print("请求失败，状态码:", response.status_code)`

			`# 示例调用`
			`fetch_arxiv_papers("cat:math.MP", max_results=5)`