Files
data-prepare/crawl-arxiv.py

34 lines
1.1 KiB
Python
Raw Normal View History

import requests
from bs4 import BeautifulSoup
def fetch_arxiv_papers(query, max_results=10):
    """Query the arXiv export API and print each matching paper.

    Sends a GET request to the arXiv query endpoint, parses the response
    body as XML and prints the title, authors and summary of every
    ``<entry>`` element. If the response status is not 200, a failure
    message containing the status code is printed instead.

    Args:
        query: Value sent as the ``search_query`` request parameter,
            e.g. ``"cat:math.MP"``.
        max_results: Value sent as the ``max_results`` request parameter.

    Returns:
        None. All output is written to stdout.

    Raises:
        requests.RequestException: If the request cannot be completed,
            including when the server does not respond within the timeout.
    """
    base_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": query,
        "start": 0,
        "max_results": max_results,
    }
    # Without an explicit timeout, requests waits forever on a stalled server.
    response = requests.get(base_url, params=params, timeout=30)
    if response.status_code != 200:
        print("请求失败,状态码:", response.status_code)
        return

    soup = BeautifulSoup(response.content, "xml")
    for entry in soup.find_all("entry"):
        # An entry missing <title> or <summary> yields None from find();
        # fall back to an empty string instead of raising AttributeError.
        title_tag = entry.find("title")
        summary_tag = entry.find("summary")
        title = title_tag.text.strip() if title_tag is not None else ""
        summary = summary_tag.text.strip() if summary_tag is not None else ""

        # Collect author names. find("name") is required here because the
        # ``.name`` attribute of a Tag is the tag's own name, not a child.
        name_tags = (author.find("name") for author in entry.find_all("author"))
        author_names = [tag.text.strip() for tag in name_tags if tag is not None]

        print(f"标题: {title}")
        print(f"作者: {', '.join(author_names)}")
        print(f"摘要: {summary}\n")
# Example call: request up to five papers matching the query "cat:math.MP".
fetch_arxiv_papers(query="cat:math.MP", max_results=5)