import urllib2
import urllib
from lxml import etree
def tieba_spider(fullurl):
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}
request = urllib2.Request(fullurl,headers = headers)
html = urllib2.urlopen(request).read()
content = etree.HTML(html)
link_list = content.xpath('//div[@class="t_con cleafix"]/div/div/div/a/@href')
for link in link_list:
print link
if __name__ == '__main__':
    # Entry point: crawl the first list page of the "lol" tieba
    # (pn=0 selects page one).
    start_url = "http://tieba.baidu.com/f?kw=lol&pn=0"
    tieba_spider(start_url)
这是一开始的代码。xpath 用 Google 浏览器上的插件 xpath-helper 验证过，没有问题，但返回的 link_list 就是空。一开始我还以为是 Google 的 xpath-helper 插件有问题，查找资料后发现不是，于是我怀疑问题出在返回的 html 中。我先用 xpath 查找页面头部，发现可以返回结果；再查看贴吧那页的源代码，发现帖子那部分内容居然被 HTML 注释（`<!-- -->`）包起来了，因此 lxml 解析时会把它丢弃，导致 xpath 取不到链接。