import asyncio
import re
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup
# Asynchronously fetch the HTML content of a single page
async def fetch_html(url, session):
    try:
        async with session.get(url) as response:
            if response.status == 200:
                return await response.text()
            else:
                print(f"Failed to fetch {url}, status code: {response.status}")
                return None
    except Exception as e:
        print(f"Request failed: {e}")
        return None
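
fetch_html takes the session as an argument so that a single aiohttp.ClientSession, with its shared connection pool, can serve many requests. As a minimal sketch (the fetch_many helper and its URL list are illustrative, not part of the original code), several pages can be fetched concurrently with asyncio.gather:

# Illustrative helper (not from the original article): fetch several pages
# concurrently over one shared session.
async def fetch_many(urls):
    async with aiohttp.ClientSession() as session:
        # gather() schedules all the fetch coroutines on the event loop at once
        pages = await asyncio.gather(*(fetch_html(url, session) for url in urls))
        return dict(zip(urls, pages))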
# Extract all links from the HTML
def extract_links(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    # Find every <a> tag and pull out its href attribute
    for link in soup.find_all('a', href=True):
        href = link['href']
        # The original snippet breaks off here; a plausible completion is to
        # keep absolute links as-is and resolve relative ones against base_url
        if href.startswith('http'):
            links.add(href)
        else:
            links.add(urljoin(base_url, href))
    return links
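
Tying the two functions together, a short driver might look like the sketch below. The start URL is a placeholder, not one from the original article, and asyncio.run() (Python 3.7+) starts the event loop:

# A hedged end-to-end sketch; 'https://example.com' is a placeholder start URL
async def main():
    async with aiohttp.ClientSession() as session:
        html = await fetch_html('https://example.com', session)
        if html:
            for link in sorted(extract_links(html, 'https://example.com')):
                print(link)

if __name__ == '__main__':
    asyncio.run(main())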