A simple crawler for 糗事百科 (qiushibaike.com)

Just recording this for myself; it only scrapes a single page. A sketch for looping over multiple pages is at the end of this note.

1. The regex version

#coding=utf-8
import urllib2
import re

page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
# Send a browser User-Agent; the site tends to reject the default urllib2 one
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
headers = { 'User-Agent' : user_agent }
try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    content = response.read().decode('utf-8')
    # Four groups: author, joke body, filler between divs, vote count
    pattern = re.compile('<h2>(.*?)</h2.*?content">(.*?)</div>(.*?)<div class="stats.*?number">(.*?)</',re.S)
    items = re.findall(pattern, content)
    count = 0
    print 'Scraping page %d' % page
    for item in items:
        count += 1
        print 'No. %d' % count
        print 'Author: ' + item[0].encode('utf8')
        # <br/> tags separate lines inside a joke; turn them back into newlines
        print 'Joke: ' + item[1].encode('utf8').replace('<br/>', '\n').strip()
        print item[3].encode('utf8') + ' likes'
        print '-' * 80
    print 'Page %d: %d jokes scraped' % (page, count)
except urllib2.URLError as e:
    if hasattr(e, "code"):
        print e.code
    if hasattr(e, "reason"):
        print e.reason
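
To make the four capture groups concrete, here is a minimal sketch that runs the same pattern against a hand-written snippet. The HTML below is hypothetical and only mirrors the structure the pattern assumes: an h2 for the author, a content div for the joke, and a stats/number span for the vote count.

#coding=utf-8
import re

# Hypothetical snippet mirroring the structure the pattern expects
sample = ('<h2>user123</h2>'
          '<div class="content">first line<br/>second line</div>'
          '<div class="stats"><span class="number">456</span></div>')

pattern = re.compile('<h2>(.*?)</h2.*?content">(.*?)</div>(.*?)<div class="stats.*?number">(.*?)</', re.S)

for author, body, _, votes in re.findall(pattern, sample):
    print 'Author: ' + author                     # user123
    print 'Joke: ' + body.replace('<br/>', '\n')  # first line / second line
    print votes + ' likes'                        # 456 likes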

2. The XPath version

#coding=utf-8
import urllib2
from lxml import etree
page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
headers = { 'User-Agent' : user_agent }
try:
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    content = response.read().decode('utf-8')
    parse = etree.HTML(content)
    divs = parse.xpath('//div[@class="article block untagged mb15"]')
    count = 0
    for div in divs:
        # The author <h2> sits under either an <a> or a <span>, depending on the post
        h2 = div.xpath('./div[@class="author clearfix"]/a[2]/h2|./div[@class="author clearfix"]/span[2]/h2')
        author = h2[0].text.encode('utf8')
        duanzi = div.xpath('./div[@class="content"]')[0].text.encode('utf8').strip()
        votes = div.xpath('.//span[@class="stats-vote"]/i')[0].text
        count += 1
        print 'No. %d' % count
        print 'Author: ' + author
        print 'Content: ' + duanzi
        print votes + ' likes'
        print '-' * 80
    
except urllib2.URLError as e:
    if hasattr(e, "code"):
        print e.code
    if hasattr(e, "reason"):
        print e.reason

Sometimes it doesn't pick up every post, and I'm not sure why. My guess (unverified) is that some posts use slightly different markup, e.g. the joke text nested inside extra tags, which the .text attribute misses; the sketch below shows a more forgiving way to pull the text.
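
To spell out that guess: in lxml, the .text attribute only returns text that appears before an element's first child, so a post whose content is wrapped in a span (or split by inline tags) comes back as None or truncated. The XPath string() function concatenates all descendant text instead. A minimal sketch of the difference, using made-up markup:

#coding=utf-8
from lxml import etree

# Hypothetical markup: the joke text sits inside a <span>, which defeats .text
html = '<div class="article"><div class="content"><span>text inside a span</span></div></div>'
div = etree.HTML(html).xpath('//div[@class="article"]')[0]

print div.xpath('./div[@class="content"]')[0].text          # None: no text before the <span>
print div.xpath('string(./div[@class="content"])').strip()  # text inside a span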

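Finally, since this note only covers a single page: the obvious extension is to loop page over a range and pause between requests. A rough sketch under the same assumptions (same URL scheme and headers as above), with a hypothetical parse_page standing in for either of the two parsers:

#coding=utf-8
import time
import urllib2

user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
headers = {'User-Agent': user_agent}

def parse_page(content):
    # Hypothetical placeholder: drop in the regex or XPath parsing from above
    pass

for page in range(1, 6):  # pages 1 through 5; adjust as needed
    url = 'http://www.qiushibaike.com/hot/page/' + str(page)
    try:
        request = urllib2.Request(url, headers=headers)
        content = urllib2.urlopen(request).read().decode('utf-8')
        parse_page(content)
    except urllib2.URLError as e:
        print 'page %d failed:' % page, getattr(e, 'code', ''), getattr(e, 'reason', '')
    time.sleep(1)  # pause between requests to be polite to the site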