查看自己文章的点赞数、浏览量等数据。
# Page through the blog's article list and collect every article link.
import math
import re

import requests

# The same User-Agent header is sent with every request in this section.
HEADERS = {'User-Agent': 'Opera/8.0 (Windows NT 5.1; U; en)'}
# Divisor used to turn the article count into a number of list pages.
ARTICLES_PER_PAGE = 20

url = 'https://me.youkuaiyun.com/yellow_python'
r = requests.get(url, headers=HEADERS).text

# The pattern expects the article count inside a <span> that sits right
# before the "原创" (original articles) link of the profile page.
count_match = re.search(
    r'<span>(\d+)</span>\s+<a href="https://blog.youkuaiyun.com/yellow_python\?t=1" target="_blank"><strong>原创</strong></a>',
    r,
)
if count_match is None:
    # Fail with a readable message instead of AttributeError on None.group().
    raise RuntimeError('original-article count not found in %s' % url)
articles = count_match.group(1)
pages = int(math.ceil(int(articles) / ARTICLES_PER_PAGE))

# Compiled once because the same pattern is applied to every list page.
article_link_pattern = re.compile(
    r'<h4 class="">\s+<a href="(https://blog.youkuaiyun.com/Yellow_python/article/details/\d+)" target="_blank">'
)

article_urls = []
for page in range(1, pages + 1):
    page_url = 'https://blog.youkuaiyun.com/Yellow_python/article/list/%d' % page
    rp = requests.get(page_url, headers=HEADERS).text
    article_urls.extend(article_link_pattern.findall(rp))

# Summary: number of links collected, followed by the links themselves.
print(len(article_urls), article_urls)
# Parse each article page and print its like count, view count and title.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.options import Options  # Firefox browser options

firefox_option = Options()
firefox_option.set_headless()  # run Firefox headless (no visible window)
driver = webdriver.Firefox(firefox_options=firefox_option)
try:
    # Explicit-wait helper with a 9-second timeout; the loop below does not
    # use it, it is kept so the name stays available to later code.
    wait = WebDriverWait(driver, 9)
    for article_url in article_urls:
        driver.get(article_url)
        title = driver.find_element_by_css_selector('html body div#mainBox.container.clearfix main div.blog-content-box div.article-header-box div.article-header div.article-title-box h1.title-article').text
        approval = driver.find_element_by_css_selector('.long-height > p:nth-child(4)').text
        pv = driver.find_element_by_css_selector('.read-count').text
        print(approval, pv, title, article_url, sep=' | ')
finally:
    # quit() ends the whole WebDriver session, whereas close() only closes the
    # current window. Running it in `finally` also shuts the browser down when
    # a page fails to load or an element lookup raises.
    driver.quit()
注意:程序运行结束后,需要手动清理未能正常关闭的无头浏览器进程。