This little project was my first hands-on task after starting the job. What it does: collect novel data from a single novel website, parse that data and store it in a database, then expose an interface for accessing it. Implementing this takes four modules: a crawler, a page parser, database storage, and a data access interface.
First, the crawler. As a rookie programmer I can't write a professional distributed, multi-node crawler, so I implemented a simple single-node one (following "Core Python Programming"). The book pages of the target novel site, 17k, follow a stable URL pattern: http://www.17k.com/book/***.html, where the asterisks stand for 3 to 7 digits. So generating a whole cluster of URLs directly from that numeric pattern, with a list comprehension or an equivalent loop over range, is a good approach, and far more efficient than extracting links out of the web pages themselves; a small sketch of the idea follows, then the full crawler. The main libraries used by this simple crawler are urllib2, urllib, string and time.
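A minimal sketch of generating the URL cluster with a list comprehension (the numeric range is only an illustration; the crawler below builds the same list with a plain for loop, which is equivalent):

# Sketch: build the list of candidate book URLs from the numeric pattern.
prefix = "http://www.17k.com/book/"
urls = [prefix + str(i) + '.html' for i in range(120000, 129992)]
print len(urls), urls[0]    # 9992 http://www.17k.com/book/120000.html

The full crawler code: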
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
File: crawler_for17k.py
Author: civis
Date: 2015/01/21
'''
import sys
import os
import time
import string
import htmllib
import urllib
import urllib2
import urlparse
import formatter
import cStringIO
#download web pages
class Retriever(object):
    """Download a single web page and save it to a local file."""

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        """Derive a local file path from the URL."""
        if '/' not in url[7:]:              # bare domain, add a trailing slash
            url += '/'
        parsedurl = urlparse.urlparse(url, 'http:', 0)
        domain = parsedurl[1]
        self.domain = domain
        path = parsedurl[2]
        filename = domain + path
        ext = os.path.splitext(filename)
        if ext[1] == '':                    # no file extension, use the default page name
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        if not os.path.isdir(domain):       # one directory per domain
            os.makedirs(domain)
        return path

    def download(self):
        """Download the web page at self.url and write it to disk."""
        try:
            resp = urllib2.urlopen(self.url, timeout=2)
            # the site redirects missing books to a 404 page
            if resp.geturl()[-3:] == "404" or resp.getcode() > 299:
                retval = ('*** Error: invalid URL: %s\nreturn url: %s'
                          % (self.url, resp.geturl()))
                return retval
            page = resp.read()
        except Exception, e:
            retval = '*** Error: ' + str(e)
            return retval
        try:
            # flatten the path into one file name, e.g. www.17k.com:book:123456.html
            filename = (self.domain + self.file).replace('/', ':')
            self.localfile = self.domain + '/' + filename
            fl = open(self.localfile, 'w')
            fl.write(page)
            fl.close()
            retval = 'Success!\n'
            #retval = urllib.urlretrieve(self.url, self.domain+'/'+self.file)
        except IOError, e:
            retval = '*** Error: file fault! ' + str(e)
        except Exception, e:
            retval = '*** Error: ' + str(e)
        return retval

    def parse_and_get_links(self):
        """Parse the saved HTML and return its list of anchors."""
        self.parser = htmllib.HTMLParser(formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
        # read back the same file that download() just wrote
        self.parser.feed(open(self.localfile).read())
        self.parser.close()
        return self.parser.anchorlist
class Crawler(object):
    """Manage the entire crawling process."""
    count = 0                               # static counter of downloaded pages

    def __init__(self, url):
        self.queue = [url]                  # queue of links waiting to be downloaded
        self.seen = set()                   # links already downloaded
        self.domain = urlparse.urlparse(url)[1]   # domain of this url

    def init_queue(self):
        """Fill the queue with book URLs built from the site's numeric pattern."""
        prefix = "http://www.17k.com/book/"
        for i in range(120000, 129992):
            self.queue.append(prefix + str(i) + '.html')

    def get_page(self, url):
        """Call Retriever to download one page."""
        r = Retriever(url)
        retval = r.download()
        # error situation, do not parse
        if retval[0] == '*':
            print retval, '...skipping parse\n'
            return
        Crawler.count += 1
        print '(', Crawler.count, ')'
        print 'URL: ', url
        print retval
        self.seen.add(url)
        """
        links = r.parse_and_get_links()
        for link in links:
            if link[:4] != 'http' and string.find(link, '://') == -1:
                link = urlparse.urljoin(url, link)
            print '* ', link
            if string.find(string.lower(link), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if link not in self.seen:
                if string.find(link, self.domain) == -1:
                    print '... discarded, not in domain!'
                else:
                    if link not in self.queue:
                        self.queue.append(link)
                        print '... new, added to Queue'
                    else:
                        print '... discarded, already processed!'
            else:
                print '... discarded, already processed!'
        """

    def go(self):
        """Process links in the queue."""
        self.init_queue()
        while self.queue:
            url = self.queue.pop()
            self.get_page(url)
            time.sleep(0.2)                 # throttle between requests
def main():
    """main function"""
    url = "http://www.17k.com/book/"
    parse = urlparse.urlparse(url)
    domain = parse[1]
    if not os.path.isdir(domain):           # create the archive directory
        os.mkdir(domain)
    if not url:                             # fallback: ask for a starting url
        try:
            url = raw_input("Enter starting url: ")
        except (KeyboardInterrupt, EOFError):
            url = "http://www.baidu.com"
    if not url:
        return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
In the simple crawler above, the crawl load is kept below roughly 3 QPS: urlopen is called with timeout=2, i.e. each request is given a two-second timeout, and time.sleep(0.2) pauses 0.2 seconds between requests. Each downloaded page is saved as a local HTML file. (Since file access later becomes a performance bottleneck, a future improvement is to pack multiple pages into one file and cut down the number of file operations; a rough sketch of one way to do that follows.)
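This is not implemented in the project; the helpers below (append_page / iter_pages) are hypothetical and only illustrate one possible archive format, a header line recording the URL and page length followed by the raw page:

# Sketch: append downloaded pages into a single archive file so that later
# processing opens one file instead of thousands of small ones.
def append_page(archive_path, url, page):
    """Append one page; the header records the URL and the page length."""
    fl = open(archive_path, 'ab')
    fl.write('==== %s %d\n' % (url, len(page)))
    fl.write(page)
    fl.write('\n')
    fl.close()

def iter_pages(archive_path):
    """Yield (url, page) pairs back out of the archive file."""
    fl = open(archive_path, 'rb')
    line = fl.readline()
    while line:
        _, url, length = line.split()
        page = fl.read(int(length))
        fl.read(1)                          # skip the trailing newline
        yield url, page
        line = fl.readline()
    fl.close()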
Now for the page parsing module. The parser needs to pull out each novel's basic information: title, author, introduction, category, image_url, word_count, url, score and tags. The natural tool for parsing HTML is BeautifulSoup, which makes it easy to find the needed fields in the tag tree; for simple pages a regular expression via the re module would also do (a short regex sketch follows after the code). Handling Chinese pages inevitably runs into character-set problems, so after asking around I use the third-party chardet library to detect each page's encoding and then decode/encode to utf-8. Since the parser touches local files heavily, multithreading can speed it up; after browsing a number of blogs I adopted a thread pool someone shared and adapted it to my needs (thanks to the author: http://www.cnblogs.com/coser/archive/2012/03/10/2389264.html). The code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
File: parsePage_for17K.py
Author: civis
Date: 2015/01/22 17:03:22
'''
import os
import re
import sys
from bs4 import BeautifulSoup as bsoup
import chardet
reload(sys)
sys.setdefaultencoding('utf-8')
class Parse_Page(object):
    """Parse a page that was downloaded by the crawler."""

    def __init__(self, page_name):
        self.page_name = page_name
        self.domain = "www.17k.com"

    def parse(self):
        """Parse one page and write the extracted fields to an .info file."""
        try:
            fl = open(self.domain + "/" + self.page_name)
            page = fl.read()
            fl.close()
            if page == "":
                print "Empty page, return!"
                return
        except IOError, e:
            print "Error, failed to read file: ", e
            return
        # handle character set: convert everything to utf-8 before parsing
        charset = chardet.detect(page)['encoding']
        if charset not in ('utf-8', 'UTF-8'):
            page = page.decode(charset, 'ignore')
            page = page.encode('utf-8', 'ignore')
        soup = bsoup(page)
        # title information
        title_tag = soup.h1
        if title_tag is None:
            print "skip file: ", self.page_name
            return
        title = title_tag.string
        # introduction information
        intro_tag = soup.find('font', itemprop='description')
        intro = "None information"
        if intro_tag is not None:
            intro = intro_tag.text.replace("<br>", ' ')
        # author information
        author_tag = soup.find('span', itemprop='author')
        author = "None information"
        if author_tag is not None:
            author_tag = author_tag.span.font
            if author_tag is not None:
                author = author_tag.string
        # href information: the "start reading" link
        href_tag = soup.find('div', "book_btn_new")
        href = "None information"
        if href_tag is not None:
            href_tag = href_tag.findAll('a')[1]
            if href_tag is not None:
                href = 'http://' + self.domain + href_tag['href']
        # image information
        img_tag = soup.find('img', itemprop="image")
        img_href = "None information"
        if img_tag is not None:
            img_href = img_tag['src']
        # category information
        cat_tag = soup.find('font', itemprop='genre')
        category = "None information"
        if cat_tag is not None:
            category = cat_tag.text
        # score information
        score_tag = soup.find('big', id='howmuchreadBook')
        score = "None information"
        if score_tag is not None and score_tag.text != '':
            score = score_tag.text
        # word count information
        wc_tag = soup.find('em', itemprop='wordCount')
        wordcount = "None information"
        if wc_tag is not None:
            wordcount = wc_tag.text
        # tag information
        tag_tag = soup.find('li', 'tab')
        tag_text = "None information"
        if tag_tag is not None:
            a_tags = tag_tag.findAll('a')
            tag_text = ' '.join([a.text for a in a_tags])
        try:
            # file name is the book id, e.g. www.17k.com:book:123456.html -> 123456.info
            fl = open('novel/17k/23/' + self.page_name.split(":")[2].split('.')[0] + '.info', 'w')
            fl.write("title: " + title + '\n')
            fl.write("author: " + author + '\n')
            fl.write("introduction:\n" + intro + '\n')
            fl.write("category: " + category + '\n')
            fl.write("image: " + img_href + '\n')
            fl.write("wordcount: " + wordcount + '\n')
            fl.write("links: " + href + '\n')
            fl.write("score: " + score + '\n')
            fl.write("tags: " + tag_text + '\n')
            fl.close()
            print "Success Parse!"
        except IOError, e:
            print "Error, file write fault: ", e
class Parser(object):
    """Page parser driver: walk the downloaded files and parse each one."""

    def __init__(self, domain="www.17k.com"):
        self.domain = domain

    def go(self):
        """Start parsing (only the first 20 downloaded files here)."""
        files = os.listdir(self.domain)
        for fn in files[0:20]:
            print 'File: ', fn
            pa = Parse_Page(fn)
            pa.parse()

if __name__ == '__main__':
    parser = Parser()
    parser.go()
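As mentioned above, when the page structure is simple a regular expression can stand in for BeautifulSoup. A small sketch (the pattern is illustrative and would need adjusting to the site's actual markup):

import re

# Sketch: pull the book title out of raw HTML with a regex instead of
# building a full tag tree; DOTALL lets the title span line breaks.
title_pattern = re.compile(r'<h1[^>]*>(.*?)</h1>', re.DOTALL)

def extract_title(page):
    match = title_pattern.search(page)
    return match.group(1).strip() if match else None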
Since the thread-pool code is adapted from someone else's work, I won't post my version here. What should be said is that the adapted program's performance is not great, for two main reasons: file operations are too frequent, and parsing traverses the whole document too many times. Improving these two points is the main future work. For reference, a minimal sketch of the thread-pool pattern is included below.
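This is only a bare-bones sketch of the idea, built on the standard threading and Queue modules, not the adapted code itself; the class name, worker count and the usage lines are illustrative:

import threading
import Queue

class ThreadPool(object):
    """Minimal thread pool: worker threads pull (func, args) jobs off a queue."""

    def __init__(self, num_workers=4):
        self.jobs = Queue.Queue()
        for _ in range(num_workers):
            t = threading.Thread(target=self._worker)
            t.setDaemon(True)               # do not block interpreter exit
            t.start()

    def _worker(self):
        while True:
            func, args = self.jobs.get()
            try:
                func(*args)
            finally:
                self.jobs.task_done()

    def add_job(self, func, *args):
        self.jobs.put((func, args))

    def wait(self):
        self.jobs.join()                    # block until every job is done

# Example: parse the downloaded pages with four worker threads.
# pool = ThreadPool(4)
# for fn in os.listdir("www.17k.com"):
#     pool.add_job(Parse_Page(fn).parse)
# pool.wait()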
With all the code pasted in, this post is getting a bit long, so the rest will be shared in a follow-up.