爬虫抓取自己 CSDN 博客的点赞数

最新推荐文章于 2024-08-03 20:35:45 发布

原创 · 最新推荐文章于 2024-08-03 20:35:45 发布 · 923 阅读

0 ·

CC 4.0 BY-SA版权

爬虫专栏收录该内容

14 篇文章

订阅专栏

本文介绍了一种使用 Python 爬取 CSDN 博客文章列表及其详细信息（包括点赞数和浏览量等关键指标）的方法。通过结合 requests 和 Selenium，先从个人主页抓取所有文章链接，再逐一访问每篇文章获取具体数据。

查看自己文章的点赞数、浏览量等数据。

# Page through the blog's article list and collect every article URL.
import math
import re

import requests

# NOTE: the listing this script was recovered from showed the hosts as
# "me.youkuaiyun.com" / "blog.youkuaiyun.com", a mangled form of the CSDN
# domain; the csdn.net hosts are used here.
HEADERS = {'User-Agent': 'Opera/8.0 (Windows NT 5.1; U; en)'}
TIMEOUT = 10  # seconds; without a timeout requests.get can block forever
PAGE_SIZE = 20  # the page count below assumes 20 articles per list page

url = 'https://me.csdn.net/yellow_python'
r = requests.get(url, headers=HEADERS, timeout=TIMEOUT).text

# The number in the <span> that precedes the "原创" (original articles) link
# is taken as the total article count.
count_match = re.search(
    r'<span>(\d+)</span>\s+<a href="https://blog\.csdn\.net/yellow_python\?t=1"'
    r' target="_blank"><strong>原创</strong></a>',
    r,
)
if count_match is None:
    # Fail with a clear message instead of AttributeError on None.group().
    raise RuntimeError('could not find the article count on %s' % url)
articles = count_match.group(1)
pages = math.ceil(int(articles) / PAGE_SIZE)

# Compiled once because it is applied to every list page.
article_link = re.compile(
    r'<h4 class="">\s+<a href="(https://blog\.csdn\.net/Yellow_python/article/details/\d+)"'
    r' target="_blank">'
)
article_urls = []
for page in range(1, pages + 1):
    page_url = 'https://blog.csdn.net/Yellow_python/article/list/%d' % page
    rp = requests.get(page_url, headers=HEADERS, timeout=TIMEOUT).text
    article_urls.extend(article_link.findall(rp))
print(len(article_urls), article_urls)
# Open every collected article in headless Firefox and print its
# like count, view count, title and URL.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options  # Firefox settings

TITLE_SELECTOR = (
    'html body div#mainBox.container.clearfix main '
    'div.blog-content-box div.article-header-box '
    'div.article-header div.article-title-box h1.title-article'
)

firefox_option = Options()
# Pass the headless flag directly: Options.set_headless() and the
# firefox_options= keyword are deprecated and gone from current Selenium.
firefox_option.add_argument('-headless')
driver = webdriver.Firefox(options=firefox_option)
wait = WebDriverWait(driver, 9)  # explicit wait with a 9-second timeout
try:
    for article_url in article_urls:
        driver.get(article_url)
        # Wait for the title to be present before reading the page, so a slow
        # render does not raise NoSuchElementException immediately.
        title = wait.until(
            lambda d: d.find_element(By.CSS_SELECTOR, TITLE_SELECTOR)
        ).text
        approval = driver.find_element(By.CSS_SELECTOR, '.long-height > p:nth-child(4)').text
        pv = driver.find_element(By.CSS_SELECTOR, '.read-count').text
        print(approval, pv, title, article_url, sep=' | ')
finally:
    # quit() ends the whole browser session, not just the current window,
    # and runs even when a lookup above fails.
    driver.quit()