import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
# Endpoint that receives the POSTed paging payload built in craw_page.
url = 'https://www.cnblogs.com/AggSite/AggSitePostList'

# Headers sent with every request: the body is JSON, and the user-agent
# string is that of a desktop Edge/Chrome browser.
headers = {
    'Content-Type': 'application/json',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
    ),
}
def craw_page(page_index, timeout=10):
    """Fetch one page of the post list and return the raw response text.

    :param page_index: page number sent to the endpoint as ``PageIndex``
    :param timeout: seconds to wait for the server before giving up;
        without it a stalled connection would block the crawl forever
    :return: the response body as text, whatever the status code was
        (the status code is only printed, not checked)
    :raises requests.exceptions.RequestException: on connection failure
        or when ``timeout`` elapses
    """
    data = {
        "CategoryType": "SiteHome",
        "ParentCategoryId": 0,
        "CategoryId": 808,
        "PageIndex": page_index,
        "TotalPostCount": 2000,
        "ItemListActionName": "AggSitePostList",
    }
    response = requests.post(
        url=url,
        data=json.dumps(data),
        headers=headers,
        timeout=timeout,
    )
    print('响应码:', response.status_code)
    return response.text
def _parse_count(anchor):
    """Return the counter shown in *anchor*'s first ``<span>``.

    Yields an int when the text is a plain integer, ``0`` when the span is
    missing or empty, and the stripped text itself otherwise.
    """
    span = anchor.find("span")
    if span is None:
        return 0
    raw = span.get_text().strip()
    if not raw:
        return 0
    try:
        return int(raw)
    except ValueError:
        # Keep unexpected, non-numeric text rather than discarding it.
        return raw


def parse_data(text):
    """Extract one row per post from post-list HTML.

    :param text: HTML markup containing ``<article class="post-item">``
        elements
    :return: list of ``[title, href, author, digg, comment, views]`` rows;
        articles without a title link are skipped, a missing author becomes
        ``''`` and a missing counter stays ``0``
    """
    datas = []
    soup = BeautifulSoup(text, 'html.parser')
    for article in soup.find_all('article', class_='post-item'):
        link = article.find('a', class_='post-item-title')
        if link is None:
            # Without the title link there is nothing useful to record, and
            # dereferencing it would abort the whole crawl.
            continue
        title = link.get_text().strip()
        href = link.get('href', '')

        author_tag = article.find('a', class_='post-item-author')
        author = author_tag.get_text().strip() if author_tag is not None else ''

        # Each counter lives in an <a> whose markup mentions the icon name.
        counts = {"icon_digg": 0, "icon_comment": 0, "icon_views": 0}
        for a in article.find_all("a"):
            markup = str(a)  # serialise once instead of once per icon test
            for icon in counts:
                if icon in markup:
                    counts[icon] = _parse_count(a)

        datas.append([
            title,
            href,
            author,
            counts["icon_digg"],
            counts["icon_comment"],
            counts["icon_views"],
        ])
    return datas
# Crawl 100 pages, parse each one, and export all collected rows to Excel.
all_datas = []
# The loop is 1-based so that the page number printed is the page number
# actually requested (previously page N was logged while page N-1 was sent).
# NOTE(review): assumes the endpoint numbers pages from 1 - confirm.
for page in range(1, 101):
    print('正在爬取页号:', page)
    html = craw_page(page)
    datas = parse_data(html)
    all_datas.extend(datas)

df = pd.DataFrame(
    all_datas,
    columns=['title', 'href', 'author', 'digg', 'comment', 'views'],
)
df.to_excel('博客园100页文章信息.xlsx', index=False)
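# Source: blog post "python 博客园爬虫03" (cnblogs crawler, part 3).
# Page footer carried over from the post: "最新推荐文章于 2025-06-05 15:15:03 发布"
# (latest recommended article published 2025-06-05 15:15:03).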