仅仅为了记录,只抓一页的
1、用正则写的
#coding=utf-8
import urllib
import urllib2
import re
from lxml import etree
page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
headers = { 'User-Agent' : user_agent }
try:
request = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')
pattern = re.compile('<h2>(.*?)</h2.*?content">(.*?)</div>(.*?)<div class="stats.*?number">(.*?)</',re.S)
items = re.findall(pattern,content)
count = 0
print '开始爬取第 %d 页' % page
for item in items:
print '发布者:'+item[0].encode('utf8'),'\n段子:'+item[1].encode('utf8').replace('<br/>','').strip()+'\n',item[3].encode('utf8')+'人点赞'
count+=1
print count
print '-'*40+'华丽的分界线'+'-'*40
print '第 %d 页共爬取 %d 个段子' % (page,count)
except urllib2.URLError, e:
if hasattr(e,"code"):
print e.code
if hasattr(e,"reason"):
print e.reason
2、用xpath写的
#coding=utf-8
import urllib
import urllib2
from lxml import etree
page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
headers = { 'User-Agent' : user_agent }
try:
request = urllib2.Request(url,headers = headers)
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')
parse = etree.HTML(content)
divs = parse.xpath('//div[@class="article block untagged mb15"]')
count = 0
for div in divs:
h2 = div.xpath('./div[@class="author clearfix"]/a[2]/h2|./div[@class="author clearfix"]/span[2]/h2')
duanzi = div.xpath('./div[@class="content"]')[0].text.encode('utf8').strip()
author = h2[0].text.encode('utf8')
count += 1
print '第'+str(count)+'条'
print '发布者:'+h2[0].text.encode('utf8'),'\n内容:'+div.xpath('./div[@class="content"]')[0].text.encode('utf8').strip(),'\n'+div.xpath('.//span[@class="stats-vote"]/i')[0].text+'人赞'
print '-'*40+'-'*40
except urllib2.URLError, e:
if hasattr(e,"code"):
print e.code
if hasattr(e,"reason"):
print e.reason
有时候抓不全。(审查注:很可能是因为 xpath 版本里 element.text 只返回第一个子标签(如 <br/>)之前的文本,多行段子会被截断;改用 itertext() 或 string(.) 可以取到整段文字。)