After scraping the blog list on Jianshu, I also tried scraping the article list from a CSDN user's homepage. The technical approach is still requests + XPath.
Jianshu blog list scraping: https://blog.youkuaiyun.com/fovever_/article/details/104172715
The article information scraped includes: title, type, link, abstract, publication date, read count, and comment count.
Without further ado, here is the code:
import requests
from lxml import etree
import os


def getResponse(url):
    # Disguise the request as a normal browser visit; a browser User-Agent
    # is enough to get past CSDN's anti-scraping checks.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Connection': 'close'}
    try:
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r
    except:
        return 0


def ResponseParse(r, alist):
    # Parse one article-list page and append each article's fields to alist.
    if r:
        dom = etree.HTML(r.text)
        articles_xpath = './/div[@class="container clearfix pt0"]/main/div[@class="article-list"]'
        articles = dom.xpath(articles_xpath)
        title_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/text()'
        type_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/span/text()'
        href_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/@href'
        abstract_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/text()'
        date_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[1]/span[@class="date"]/text()'
        read_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[3]/span[@class="read-num"]/span[@class="num"]/text()'
        comment_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[5]/span[@class="read-num"]/span[@class="num"]/text()'
        for article in articles:
            title = article.xpath(title_xpath)
            type = article.xpath(type_xpath)
            href = article.xpath(href_xpath)
            abstract = article.xpath(abstract_xpath)
            date = article.xpath(date_xpath)
            read = article.xpath(read_xpath)
            comment = article.xpath(comment_xpath)
            # Every real title is preceded by an empty text node, hence index 2*i + 1.
            for i in range(len(type)):
                alist.append([title[2*i + 1].strip().replace("\n", ""), type[i], href[i],
                              abstract[i].strip().replace("\n", ""),
                              date[i].strip().replace("\n", ""), read[i], comment[i]])
                print("Title: " + title[2*i + 1].strip().replace("\n", ""))
                print("Type: " + type[i])
                print("Link: " + href[i])
                print("Abstract: " + abstract[i].strip().replace("\n", ""))
                print("Date: " + date[i].strip().replace("\n", ""))
                print("Reads: " + read[i])
                print("Comments: " + comment[i])
                print("\n")
        return len(type)
    else:
        print("Scraping failed!")


def Get_article_count(url):
    # Candidate XPaths found while inspecting the profile page:
    # //*[@id="asideProfile"]/div[2]/dl[1]/dd/a/span
    # /html/body/div[2]/div[1]/div[2]/ul/li[1]/a/label/span[2]
    r = getResponse(url)
    print(r.url)
    dom = etree.HTML(r.text)
    count_xpath = './/div[@class="me_chanel_bar clearfix"]/ul/li/a[@class="tab_item tab_item_click"]/label/span[2]/text()'
    article_count = dom.xpath(count_xpath)
    return int(article_count[0].strip().replace("\n", ""))


def Get_author_name(url):
    # /html/body/div[2]/div[1]/div[1]/div[2]/p/text()
    r = getResponse(url)
    dom = etree.HTML(r.text)
    name_xpath = './/div[@class="me_wrap_lt clearfix"]/div[@class="lt_main clearfix"]/p[@class="lt_title"]/text()'
    name = dom.xpath(name_xpath)[2].strip().replace("\n", "")
    print("Author:", str(name))
    return name


def WriteWord(alist, name):
    save_dir = os.path.join(os.getcwd(), 'article_list')
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    save_name = os.path.join(save_dir, name)
    # chr(12288) is the full-width space, used as the padding character.
    out = "Title: {0:{7}<10}\nType: {1:{7}<10}\nLink: {2:{7}<20}\nAbstract: {3:{7}<10}\nDate: {4:{7}<10}\nReads: {5:{7}<10}\nComments: {6:{7}<10}\n"
    with open(save_name, 'w', encoding="utf-8") as f:
        for i in range(len(alist)):
            f.write(out.format(alist[i][0], alist[i][1], alist[i][2], alist[i][3],
                               alist[i][4], alist[i][5], alist[i][6], chr(12288)))
            f.write("\n")
    print("Data written to: " + save_name)


def main():
    try:
        article_list = []
        user_name = "fovever_"  # change to the user to scrape; it is the ID at the end of the blog homepage URL, e.g. https://blog.youkuaiyun.com/fovever_
        url1 = "https://{0}.youkuaiyun.com/{1}"
        url = url1 + '/article/list/{2}'
        article_count = Get_article_count(url1.format("me", user_name))
        save_name = Get_author_name(url1.format("me", user_name)) + '.doc'
        if article_count % 40 == 0:
            spider_num = article_count // 40
        else:
            spider_num = article_count // 40 + 1
        print(article_count)
        spider_article_count = 0
        for i in range(int(spider_num)):
            r = getResponse(url.format("blog", user_name, str(i + 1)))
            spider_article_count += ResponseParse(r, article_list)
        WriteWord(article_list, save_name)
        print("Scraped " + str(spider_article_count) + " articles in total!")
    except:
        print(user_name + " blog scraping failed!")


if __name__ == '__main__':
    main()
Program Walkthrough
Next, let me explain the key functions in the program.
getResponse(url) fetches the response for a given URL. CSDN's anti-scraping measures are relatively simple; it is enough to disguise the request headers as a browser.
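A quick way to check this for yourself is to compare the same request with and without the browser User-Agent (a throwaway sketch; the exact status codes CSDN returns may vary):

import requests

test_url = "https://blog.youkuaiyun.com/fovever_"   # any CSDN page will do
browser_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'}

plain = requests.get(test_url, timeout=30)                             # default python-requests User-Agent
disguised = requests.get(test_url, headers=browser_header, timeout=30)  # disguised as Firefox
print(plain.status_code, disguised.status_code)  # the disguised request should come back 200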
The heart of the program is parsing the fetched response. This is done in ResponseParse(r, alist), where the parameter alist accumulates the article information.
The main work in ResponseParse is locating the tag paths that hold the article-list information.
Take my own blog homepage as an example: https://blog.youkuaiyun.com/fovever_
Press F12 to inspect the page source.
As the figure shows, all article entries live inside the div whose class is "article-list", so we first locate that div with XPath. The XPath I wrote is: .//div[@class="container clearfix pt0"]/main/div[@class="article-list"]
This locates the div tag.
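As a minimal, self-contained sketch of this step (it assumes the list page still uses the class names described above, which was true at the time of writing):

import requests
from lxml import etree

r = requests.get("https://blog.youkuaiyun.com/fovever_",
                 headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0'},
                 timeout=30)
dom = etree.HTML(r.text)
articles = dom.xpath('.//div[@class="container clearfix pt0"]/main/div[@class="article-list"]')
print(len(articles))   # expected: 1 -- the single container div that holds every article entry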
The next step is to find, relative to this tag, the path of each piece of article information. Based on the figure below, the XPaths for all the fields are set as follows:
title_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/text()'
type_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/span/text()'
href_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/@href'
abstract_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/text()'
date_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[1]/span[@class="date"]/text()'
read_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[3]/span[@class="read-num"]/span[@class="num"]/text()'
comment_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[5]/span[@class="read-num"]/span[@class="num"]/text()'
As for writing the XPaths, I think it is largely a matter of taste; everyone can arrive at a correct XPath in their own way.
One thing worth noting: with XPaths written this way, all values of the same field end up together in one list. At first I used the length of title to assemble the per-article information so that all fields of one article form a single list, but some titles came out empty. Inspecting the title list showed that, because of the page structure, every real title is preceded by an empty element: the element before each title is blank and the one after it is the actual title. So the program drives the loop with the length of type instead, and the title is taken at index 2*i + 1. Finally, a list holding one article's information is appended to alist.
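An equivalent way to deal with the blank entries, hinted at by the commented-out line in the final program, is simply to drop whitespace-only items before indexing (a small sketch; the sample data below is made up to mimic what article.xpath(title_xpath) returns):

# A blank text node sits in front of every real title.
title = ['\n        ', '\n        First article title    ',
         '\n        ', '\n        Second article title    ']

cleaned = [t.strip() for t in title if t.strip()]
print(cleaned)   # ['First article title', 'Second article title'] -- cleaned[i] now lines up with type[i]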
With this, the program already works as a simple crawler, but it can only scrape a single page. As with the Jianshu blog-list scraper, paging is handled by varying the URL. Taking https://blog.youkuaiyun.com/ygdxt/article/list/1 as an example, the general URL form is https://blog.youkuaiyun.com/ygdxt/article/list/{?}: just substitute the page number you want into the {} placeholder. A quick look also shows that each page of a CSDN article list contains 40 articles.
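So the number of pages to request is the ceiling of article_count / 40, which the program computes with an if/else; an equivalent compact form (just an illustration, with a made-up count) is:

article_count = 130                       # example value
spider_num = (article_count + 39) // 40   # ceiling division: 130 articles -> 4 pages
print(spider_num)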
So the first thing we need is the author's total article count. Initially I wanted to read it from https://blog.youkuaiyun.com/ygdxt/article/list/1, but that page only shows the number of original articles, while the article list contains articles of every type. I then found that the total article count is available on the user's profile page.
Comparing the profile page https://me.youkuaiyun.com/fovever_ with the blog-list page https://blog.youkuaiyun.com/fovever_/article/list/1, the only difference in the leading part of the URL is that me becomes blog, so the URLs are written as:
url1 = "https://{0}.youkuaiyun.com/{1}"
url = url1 + '/article/list/{2}'
In url1, {0} selects whether we are requesting the blog homepage or the profile page, {1} is the ID of the author to scrape (e.g. fovever_), and {2} in url selects the page number.
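Filling in the placeholders gives the two kinds of URL the program actually requests:

url1 = "https://{0}.youkuaiyun.com/{1}"
url = url1 + '/article/list/{2}'

print(url1.format("me", "fovever_"))        # https://me.youkuaiyun.com/fovever_ (profile page)
print(url.format("blog", "fovever_", 1))    # https://blog.youkuaiyun.com/fovever_/article/list/1 (page 1 of the list)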
Get_article_count(url) retrieves the total number of articles.
As the figure shows, the total count sits in a span tag, so the corresponding XPath is: .//div[@class="me_chanel_bar clearfix"]/ul/li/a[@class="tab_item tab_item_click"]/label/span[2]/text()
With this, the program can loop over every page and scrape all of a user's articles.
To save the scraped information locally, we also need a function that writes the data out. To give each author's document a unique name, the file is named after the nickname shown on the author's profile page, which is what Get_author_name(url) retrieves. As the figure shows,
the nickname sits in the lt_title element of the profile page, so its XPath is: name_xpath = './/div[@class="me_wrap_lt clearfix"]/div[@class="lt_main clearfix"]/p[@class="lt_title"]/text()'
The scraped article information is then written to a Word-style document via **WriteWord(alist, name)**.
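One detail worth noting: the file is written as plain UTF-8 text with a .doc extension (Word will still open it), and chr(12288), the full-width space, is the padding character in the format string so that columns of Chinese text line up. A tiny illustration of that padding:

out = "Title: {0:{1}<10}"
print(repr(out.format("Hello", chr(12288))))   # 'Title: Hello\u3000\u3000\u3000\u3000\u3000' -- left-aligned, padded to width 10 with full-width spaces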
The saved result looks like this:
Batch-Scraping Articles from Multiple Bloggers
To batch-scrape article information from multiple bloggers, we first need a single page that lists many bloggers. I did not search very hard for this, but https://bbs.youkuaiyun.com/total_rank contains information on plenty of bloggers.
By inspecting the page source,
we can find the link to each blogger's personal space, e.g. https://me.youkuaiyun.com/net_lover.
All we need is the blogger's ID, i.e. net_lover, so after collecting the href attributes of the a tags we split each link with link.split('/')[-1] for link in href (see the sketch after the XPath below).
The XPath for href is:
href_xpath = './/div/div[@class="bbs_forums_wrap"]/div[@class="expert_wrap"]/div[@class="expert_box"]/ul[@class="expert_list"]/li/label[2]/a[@class="user_name"]/@href'
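As a small illustration of that split (the sample hrefs are made up but follow the pattern of the links on the ranking page):

href = ["https://me.youkuaiyun.com/net_lover", "https://me.youkuaiyun.com/fovever_"]   # sample values
user_name = [link.split('/')[-1] for link in href]
print(user_name)   # ['net_lover', 'fovever_'] -- exactly the IDs the blog URLs need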
Then a for loop in main() iterates over all bloggers and scrapes each one's articles. Since some bloggers have no blog at all, the loop body is wrapped in **try: except:** to handle the failures.
The batch-scraping result looks like this:
The final program:
import requests
from lxml import etree
import os


def getResponse(url):
    # Disguise the request as a normal browser visit.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Connection': 'close'}
    try:
        r = requests.get(url, headers=header, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r
    except:
        return 0


def ResponseParse(r, alist):
    # Parse one article-list page and append each article's fields to alist.
    if r:
        dom = etree.HTML(r.text)
        articles_xpath = './/div[@class="container clearfix pt0"]/main/div[@class="article-list"]'
        articles = dom.xpath(articles_xpath)
        title_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/text()'
        type_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/span/text()'
        href_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/@href'
        abstract_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/text()'
        date_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[1]/span[@class="date"]/text()'
        read_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[3]/span[@class="read-num"]/span[@class="num"]/text()'
        comment_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[5]/span[@class="read-num"]/span[@class="num"]/text()'
        for article in articles:
            title = article.xpath(title_xpath)
            type = article.xpath(type_xpath)
            href = article.xpath(href_xpath)
            abstract = article.xpath(abstract_xpath)
            date = article.xpath(date_xpath)
            read = article.xpath(read_xpath)
            comment = article.xpath(comment_xpath)
            # title = [i for i in title if i != ' ']
            # print(title)
            # Every real title is preceded by an empty text node, hence index 2*i + 1.
            for i in range(len(type)):
                alist.append([title[2*i + 1].strip().replace("\n", ""), type[i], href[i],
                              abstract[i].strip().replace("\n", ""),
                              date[i].strip().replace("\n", ""), read[i], comment[i]])
                print("Title: " + title[2*i + 1].strip().replace("\n", ""))
                print("Type: " + type[i])
                print("Link: " + href[i])
                print("Abstract: " + abstract[i].strip().replace("\n", ""))
                print("Date: " + date[i].strip().replace("\n", ""))
                print("Reads: " + read[i])
                print("Comments: " + comment[i])
                print("\n")
        return len(type)
    else:
        print("Scraping failed!")


def Get_article_count(url):
    # Candidate XPaths found while inspecting the profile page:
    # //*[@id="asideProfile"]/div[2]/dl[1]/dd/a/span
    # /html/body/div[2]/div[1]/div[2]/ul/li[1]/a/label/span[2]
    r = getResponse(url)
    print(r.url)
    dom = etree.HTML(r.text)
    count_xpath = './/div[@class="me_chanel_bar clearfix"]/ul/li/a[@class="tab_item tab_item_click"]/label/span[2]/text()'
    article_count = dom.xpath(count_xpath)
    return int(article_count[0].strip().replace("\n", ""))


def Get_author_name(url):
    # /html/body/div[2]/div[1]/div[1]/div[2]/p/text()
    r = getResponse(url)
    dom = etree.HTML(r.text)
    name_xpath = './/div[@class="me_wrap_lt clearfix"]/div[@class="lt_main clearfix"]/p[@class="lt_title"]/text()'
    name = dom.xpath(name_xpath)[2].strip().replace("\n", "")
    print("Author:", str(name))
    return name


def WriteWord(alist, name):
    save_dir = os.path.join(os.getcwd(), 'article_list')
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    save_name = os.path.join(save_dir, name)
    # chr(12288) is the full-width space, used as the padding character.
    out = "Title: {0:{7}<10}\nType: {1:{7}<10}\nLink: {2:{7}<20}\nAbstract: {3:{7}<10}\nDate: {4:{7}<10}\nReads: {5:{7}<10}\nComments: {6:{7}<10}\n"
    with open(save_name, 'w', encoding="utf-8") as f:
        for i in range(len(alist)):
            f.write(out.format(alist[i][0], alist[i][1], alist[i][2], alist[i][3],
                               alist[i][4], alist[i][5], alist[i][6], chr(12288)))
            f.write("\n")
    print("Data written to: " + save_name)


def main():
    user_url = 'https://bbs.youkuaiyun.com/total_rank'
    user_list = ["fovever_"]
    Get_User_list(user_url, user_list)
    for user_name in user_list:
        try:
            article_list = []
            url1 = "https://{0}.youkuaiyun.com/{1}"
            url = url1 + '/article/list/{2}'
            article_count = Get_article_count(url1.format("me", user_name))
            save_name = Get_author_name(url1.format("me", user_name)) + '.doc'
            if article_count % 40 == 0:
                spider_num = article_count // 40
            else:
                spider_num = article_count // 40 + 1
            print(article_count)
            spider_article_count = 0
            for i in range(int(spider_num)):
                r = getResponse(url.format("blog", user_name, str(i + 1)))
                spider_article_count += ResponseParse(r, article_list)
            WriteWord(article_list, save_name)
            print("Scraped " + str(spider_article_count) + " articles in total!")
        except:
            print(user_name + " blog scraping failed!")
            continue


def Get_User_list(url, ulist):
    # Collect blogger IDs from the ranking page and append them to ulist.
    # /html/body/div[3]/div[2]/div/div/ul/li[1]/label[2]/a[2]
    r = getResponse(url)
    dom = etree.HTML(r.text)
    href_xpath = './/div/div[@class="bbs_forums_wrap"]/div[@class="expert_wrap"]/div[@class="expert_box"]/ul[@class="expert_list"]/li/label[2]/a[@class="user_name"]/@href'
    href = dom.xpath(href_xpath)
    user_name = [link.split('/')[-1] for link in href]
    for user in user_name:
        ulist.append(user)


if __name__ == '__main__':
    main()
This program ran correctly as of February 5, 2020.
There may still be issues in the program; feedback and corrections are welcome!