This little project was my first hands-on task after starting the job. What it does: collect novel data from a single novel website, parse that data and store it in a database, then expose an interface for accessing it. Implementing this takes four modules: a crawler, a page parser, database storage, and a data access interface.
First, the crawler. As a rookie programmer I can't write a professional distributed, multi-node crawler, so I implemented a simple single-node one (following "Core Python Programming"). The book pages of the target novel site, 17k, follow a stable URL pattern: http://www.17k.com/book/***.html, where the asterisks stand for 3 to 7 digits. So generating a whole cluster of URLs directly from that numeric pattern, with a list comprehension or an equivalent loop over range, is a good approach, and far more efficient than extracting links out of the web pages themselves; a small sketch of the idea follows, then the full crawler. The main libraries used by this simple crawler are urllib2, urllib, string and time.
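A minimal sketch of generating the URL cluster with a list comprehension (the numeric range is only an illustration; the crawler below builds the same list with a plain for loop, which is equivalent):

# Sketch: build the list of candidate book URLs from the numeric pattern.
prefix = "http://www.17k.com/book/"
urls = [prefix + str(i) + '.html' for i in range(120000, 129992)]
print len(urls), urls[0]    # 9992 http://www.17k.com/book/120000.html

The full crawler code: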
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
File: crawler_for17k.py
Author: civis
Date: 2015/01/21
'''
import sys
import os
import time
import string
import htmllib
import urllib
import urllib2
import urlparse
import formatter
import cStringIO
#download web pages
class Retriever(object):
    """Download a single web page and save it to a local file."""

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        """Derive a local file path from the URL."""
        if '/' not in url[7:]:              # bare domain, add a trailing slash
            url += '/'
        parsedurl = urlparse.urlparse(url, 'http:', 0)
        domain = parsedurl[1]
        self.domain = domain
        path = parsedurl[2]
        filename = domain + path
        ext = os.path.splitext(filename)
        if ext[1] == '':                    # no file extension, use the default page name
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        if not os.path.isdir(domain):       # one directory per domain
            os.makedirs(domain)
        return path

    def download(self):
        """Download the web page at self.url and write it to disk."""
        try:
            resp = urllib2.urlopen(self.url, timeout=2)
            # the site redirects missing books to a 404 page
            if resp.geturl()[-3:] == "404" or resp.getcode() > 299:
                retval = ('*** Error: invalid URL: %s\nreturn url: %s'
                          % (self.url, resp.geturl()))
                return retval
            page = resp.read()
        except Exception, e:
            retval = '*** Error: ' + str(e)
            return retval
        try:
            # flatten the path into one file name, e.g. www.17k.com:book:123456.html
            filename = (self.domain + self.file).replace('/', ':')
            self.localfile = self.domain + '/' + filename
            fl = open(self.localfile, 'w')
            fl.write(page)
            fl.close()
            retval = 'Success!\n'
            #retval = urllib.urlretrieve(self.url, self.domain+'/'+self.file)
        except IOError, e:
            retval = '*** Error: file fault! ' + str(e)
        except Exception, e:
            retval = '*** Error: ' + str(e)
        return retval

    def parse_and_get_links(self):
        """Parse the saved HTML and return its list of anchors."""
        self.parser = htmllib.HTMLParser(formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
        # read back the same file that download() just wrote
        self.parser.feed(open(self.localfile).read())
        self.parser.close()
        return self.parser.anchorlist
class Crawler(object):
    """Manage the entire crawling process."""
    count = 0                               # static counter of downloaded pages

    def __init__(self, url):
        self.queue = [url]                  # queue of links waiting to be downloaded
        self.seen = set()                   # links already downloaded
        self.domain = urlparse.urlparse(url)[1]   # domain of this url

    def init_queue(self):
        """Fill the queue with book URLs built from the site's numeric pattern."""
        prefix = "http://www.17k.com/book/"
        for i in range(120000, 129992):
            self.queue.append(prefix + str(i) + '.html')

    def get_page(self, url):
        """Call Retriever to download one page."""
        r = Retriever(url)
        retval = r.download()
        # error situation, do not parse
        if retval[0] == '*':
            print retval, '...skipping parse\n'
            return
        Crawler.count += 1
        print '(', Crawler.count, ')'
        print 'URL: ', url
        print retval
        self.seen.add(url)
        """
        links = r.parse_and_get_links()
        for link in links:
            if link[:4] != 'http' and string.find(link, '://') == -1:
                link = urlparse.urljoin(url, link)
            print '* ', link
            if string.find(string.lower(link), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue
            if link not in self.seen:
                if string.find(link, self.domain) == -1:
                    print '... discarded, not in domain!'
                else:
                    if link not in self.queue:
                        self.queue.append(link)
                        print '... new, added to Queue'
                    else:
                        print '... discarded, already processed!'
            else:
                print '... discarded, already processed!'
        """

    def go(self):
        """Process links in the queue."""
        self.init_queue()
        while self.queue:
            url = self.queue.pop()
            self.get_page(url)
            time.sleep(0.2)                 # throttle between requests
def main():
    """main function"""
    url = "http://www.17k.com/book/"
    parse = urlparse.urlparse(url)
    domain = parse[1]
    if not os.path.isdir(domain):           # create the archive directory
        os.mkdir(domain)
    if not url:                             # fallback: ask for a starting url
        try:
            url = raw_input("Enter starting url: ")
        except (KeyboardInterrupt, EOFError):
            url = "http://www.baidu.com"
    if not url:
        return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
In the simple crawler above, the crawl load is kept below roughly 3 QPS: urlopen is called with timeout=2, i.e. each request is given a two-second timeout, and time.sleep(0.2) pauses 0.2 seconds between requests. Each downloaded page is saved as a local HTML file. (Since file access later becomes a performance bottleneck, a future improvement is to pack multiple pages into one file and cut down the number of file operations; a rough sketch of one way to do that follows.)
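This is not implemented in the project; the helpers below (append_page / iter_pages) are hypothetical and only illustrate one possible archive format, a header line recording the URL and page length followed by the raw page:

# Sketch: append downloaded pages into a single archive file so that later
# processing opens one file instead of thousands of small ones.
def append_page(archive_path, url, page):
    """Append one page; the header records the URL and the page length."""
    fl = open(archive_path, 'ab')
    fl.write('==== %s %d\n' % (url, len(page)))
    fl.write(page)
    fl.write('\n')
    fl.close()

def iter_pages(archive_path):
    """Yield (url, page) pairs back out of the archive file."""
    fl = open(archive_path, 'rb')
    line = fl.readline()
    while line:
        _, url, length = line.split()
        page = fl.read(int(length))
        fl.read(1)                          # skip the trailing newline
        yield url, page
        line = fl.readline()
    fl.close()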
Now for the page parsing module. The parser needs to pull out each novel's basic information: title, author, introduction, category, image_url, word_count, url, score and tags. The natural tool for parsing HTML is BeautifulSoup, which makes it easy to find the needed fields in the tag tree; for simple pages a regular expression via the re module would also do (a short regex sketch follows after the code). Handling Chinese pages inevitably runs into character-set problems, so after asking around I use the third-party chardet library to detect each page's encoding and then decode/encode to utf-8. Since the parser touches local files heavily, multithreading can speed it up; after browsing a number of blogs I adopted a thread pool someone shared and adapted it to my needs (thanks to the author: http://www.cnblogs.com/coser/archive/2012/03/10/2389264.html). The code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
File: parsePage_for17K.py
Author: civis
Date: 2015/01/22 17:03:22
'''
import os
import re
import sys
from bs4 import BeautifulSoup as bsoup
import chardet
reload(sys)
sys.setdefaultencoding('utf-8')
class Parse_Page(object):
    """Parse a page that was downloaded by the crawler."""

    def __init__(self, page_name):
        self.page_name = page_name
        self.domain = "www.17k.com"

    def parse(self):
        """Parse one page and write the extracted fields to an .info file."""
        try:
            fl = open(self.domain + "/" + self.page_name)
            page = fl.read()
            fl.close()
            if page == "":
                print "Empty page, return!"
                return
        except IOError, e:
            print "Error, failed to read file: ", e
            return
        # handle character set: convert everything to utf-8 before parsing
        charset = chardet.detect(page)['encoding']
        if charset not in ('utf-8', 'UTF-8'):
            page = page.decode(charset, 'ignore')
            page = page.encode('utf-8', 'ignore')
        soup = bsoup(page)
        # title information
        title_tag = soup.h1
        if title_tag is None:
            print "skip file: ", self.page_name
            return
        title = title_tag.string
        # introduction information
        intro_tag = soup.find('font', itemprop='description')
        intro = "None information"
        if intro_tag is not None:
            intro = intro_tag.text.replace("<br>", ' ')
        # author information
        author_tag = soup.find('span', itemprop='author')
        author = "None information"
        if author_tag is not None:
            author_tag = author_tag.span.font
            if author_tag is not None:
                author = author_tag.string
        # href information: the "start reading" link
        href_tag = soup.find('div', "book_btn_new")
        href = "None information"
        if href_tag is not None:
            href_tag = href_tag.findAll('a')[1]
            if href_tag is not None:
                href = 'http://' + self.domain + href_tag['href']
        # image information
        img_tag = soup.find('img', itemprop="image")
        img_href = "None information"
        if img_tag is not None:
            img_href = img_tag['src']
        # category information
        cat_tag = soup.find('font', itemprop='genre')
        category = "None information"
        if cat_tag is not None:
            category = cat_tag.text
        # score information
        score_tag = soup.find('big', id='howmuchreadBook')
        score = "None information"
        if score_tag is not None and score_tag.text != '':
            score = score_tag.text
        # word count information
        wc_tag = soup.find('em', itemprop='wordCount')
        wordcount = "None information"
        if wc_tag is not None:
            wordcount = wc_tag.text
        # tag information
        tag_tag = soup.find('li', 'tab')
        tag_text = "None information"
        if tag_tag is not None:
            a_tags = tag_tag.findAll('a')
            tag_text = ' '.join([a.text for a in a_tags])
        try:
            # file name is the book id, e.g. www.17k.com:book:123456.html -> 123456.info
            fl = open('novel/17k/23/' + self.page_name.split(":")[2].split('.')[0] + '.info', 'w')
            fl.write("title: " + title + '\n')
            fl.write("author: " + author + '\n')
            fl.write("introduction:\n" + intro + '\n')
            fl.write("category: " + category + '\n')
            fl.write("image: " + img_href + '\n')
            fl.write("wordcount: " + wordcount + '\n')
            fl.write("links: " + href + '\n')
            fl.write("score: " + score + '\n')
            fl.write("tags: " + tag_text + '\n')
            fl.close()
            print "Success Parse!"
        except IOError, e:
            print "Error, file write fault: ", e
class Parser(object):
    """Page parser driver: walk the downloaded files and parse each one."""

    def __init__(self, domain="www.17k.com"):
        self.domain = domain

    def go(self):
        """Start parsing (only the first 20 downloaded files here)."""
        files = os.listdir(self.domain)
        for fn in files[0:20]:
            print 'File: ', fn
            pa = Parse_Page(fn)
            pa.parse()

if __name__ == '__main__':
    parser = Parser()
    parser.go()
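As mentioned above, when the page structure is simple a regular expression can stand in for BeautifulSoup. A small sketch (the pattern is illustrative and would need adjusting to the site's actual markup):

import re

# Sketch: pull the book title out of raw HTML with a regex instead of
# building a full tag tree; DOTALL lets the title span line breaks.
title_pattern = re.compile(r'<h1[^>]*>(.*?)</h1>', re.DOTALL)

def extract_title(page):
    match = title_pattern.search(page)
    return match.group(1).strip() if match else None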
Since the thread-pool code is adapted from someone else's work, I won't post my version here. What should be said is that the adapted program's performance is not great, for two main reasons: file operations are too frequent, and parsing traverses the whole document too many times. Improving these two points is the main future work. For reference, a minimal sketch of the thread-pool pattern is included below.
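This is only a bare-bones sketch of the idea, built on the standard threading and Queue modules, not the adapted code itself; the class name, worker count and the usage lines are illustrative:

import threading
import Queue

class ThreadPool(object):
    """Minimal thread pool: worker threads pull (func, args) jobs off a queue."""

    def __init__(self, num_workers=4):
        self.jobs = Queue.Queue()
        for _ in range(num_workers):
            t = threading.Thread(target=self._worker)
            t.setDaemon(True)               # do not block interpreter exit
            t.start()

    def _worker(self):
        while True:
            func, args = self.jobs.get()
            try:
                func(*args)
            finally:
                self.jobs.task_done()

    def add_job(self, func, *args):
        self.jobs.put((func, args))

    def wait(self):
        self.jobs.join()                    # block until every job is done

# Example: parse the downloaded pages with four worker threads.
# pool = ThreadPool(4)
# for fn in os.listdir("www.17k.com"):
#     pool.add_job(Parse_Page(fn).parse)
# pool.wait()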
With all the code pasted in, this post is getting a bit long, so the rest will be shared in a follow-up.