import requests
from bs4 import BeautifulSoup
# Define the list of pages to crawl
urls = [f"https://www.cnblogs.com/#p{i}" for i in range(1, 6)]
# Fetch the raw HTML of a page
def craw(url):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise an error for non-2xx responses
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return ""
# Parse the HTML, extracting each hyperlink and its text
def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    # Find all hyperlinks that carry an article title
    links = soup.find_all("a", class_="post-item-title")
    return [(link["href"], link.get_text().strip()) for link in links]
if __name__ == "__main__":
    # Walk through each URL, extracting and printing its contents
    for url in urls:
        print(f"Fetching {url}...")
        html = craw(url)
        if html:
            results = parse(html)
            for href, text in results:
                print(f"Title: {text}, Link: {href}")