#coding=utf-8
import requests
from lxml import etree
from bs4 import BeautifulSoup
# Print a framed banner describing what this scraper does.
_BANNER = ''' *** 趣事百科 ***
爬取文本笑话
使用模块requests,lxml的etree
爬出下一页地址返回循环判断'''
_rule = '#' * 30
print(_rule)
print(_BANNER)
print(_rule)
class Qsbk(object):
    """Scraper for text jokes on qiushibaike.com.

    ``def_name`` downloads one listing page, prints each poster's
    nickname and joke text, and returns the absolute URL of the next
    page so the caller can keep crawling.
    """

    def __init__(self):
        # Site root, used to absolutize the relative next-page href.
        self.homeurl = 'https://www.qiushibaike.com'
        # Relative XPaths, evaluated against each joke item <div>.
        self.xpathname = './div[1]/a[1]/img/@alt'
        self.xpathtext = './a[1]//span/text()'
        # Absolute XPath selecting every joke item on the page.
        self.xpathname_text = '//div[@class="col1"]/div'
        # Absolute XPath for the "next page" link (last pager entry).
        self.xpathpage = '//*[@id="content-left"]/ul/li[last()]/a/@href'

    def def_name(self, url):
        """Scrape one listing page and return the next page's URL.

        Downloads *url*, prints nickname and content for every joke on
        the page, then returns ``self.homeurl`` joined with the pager's
        relative next-page href.

        Raises an IndexError when no next-page link is found, and
        propagates any requests network error.
        """
        response = requests.get(url)
        tree = etree.HTML(response.text)
        # Each matched <div> is one joke item; evaluating the relative
        # XPaths per item keeps nickname and content paired.  zip makes
        # that pairing explicit — the original two back-to-back for-loops
        # only appeared "simultaneous" because each item carries exactly
        # one name node and one text node.
        for item in tree.xpath(self.xpathname_text):
            names = item.xpath(self.xpathname)
            texts = item.xpath(self.xpathtext)
            for name, text in zip(names, texts):
                print('\n昵称:', name)
                # Flatten multi-line joke text onto one line.
                print('内容:', text.replace('\n', '').replace('\r', ''))
        # Absolutize the relative next-page href and hand it back to the
        # caller's crawl loop.
        nextpage = tree.xpath(self.xpathpage)[0]
        return self.homeurl + nextpage
# Main program: crawl page after page until the pager loops back to a
# URL we have already visited.
if __name__ == '__main__':
    scraper = Qsbk()
    url = 'https://www.qiushibaike.com'
    # Every visited page URL; also serves as the termination check —
    # when the next-page link points at a page we already crawled, stop.
    pagelist = []
    while url not in pagelist:
        pagelist.append(url)
        url = scraper.def_name(url)
        print('下一页地址:' + url)
    print('完成,爬取了下列页笑话')
    print(pagelist)
    # NOTE(review): the original ended with `while 1: pass`, an infinite
    # busy loop that pegged a CPU core forever after the crawl finished
    # (presumably to keep a console window open) — removed so the
    # process exits cleanly.