Python抓取小说
前言
此脚本是为了在Mac上抓取小说而写的,用几句Python代码就可以了。
代码
# coding=utf-8
import re
import urllib2
import chardet
import sys
from bs4 import BeautifulSoup
import codecs
class Spider(object):
    """Download a novel from www.44pq.com and append it to a text file.

    The book's index page is scanned for chapter links; each chapter page
    is then fetched, reduced to plain text and appended to
    ``<bookname>.txt`` one chapter at a time, so only a single chapter is
    held in memory at once.

    The pages are treated as GB18030-encoded; the output file is UTF-8.
    Written for Python 2 (relies on the file-level ``urllib2`` import).
    """

    def __init__(self):
        # Chapter link on the index page: group 1 is the chapter URL,
        # group 2 is the link text (the chapter title). The dots are
        # escaped so that they only match a literal ".".
        self.aTag = re.compile(
            r'<a href="(http://www\.44pq\.com/read/[0-9]+?_[0-9]+?\.html)"'
            r'[^>]*?>(.+?)</a>')
        # Chapter body container; re.S lets "." span line breaks.
        self.contentTag = re.compile(
            r'<div class="readerContent" id="content">(.+?)</div>',
            re.I | re.S)

    def getHtml(self, url):
        """Fetch *url* and return the raw, still undecoded response body."""
        headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        # The Python 2 response object is not a context manager, so close it
        # explicitly even when read() fails.
        try:
            return response.read()
        finally:
            response.close()

    def Run(self, bookurl="http://www.44pq.com/read/13567.html",
            bookname="地球上唯一的魔法师"):
        """Scrape every chapter linked from *bookurl* into ``<bookname>.txt``.

        Args:
            bookurl: URL of the book's chapter index page.
            bookname: Output file name without the ``.txt`` extension.

        Progress is printed as ``current/total`` after each chapter.
        """
        # findall() yields one (chapter_url, raw_title) tuple per link.
        chapters = self.aTag.findall(self.getHtml(bookurl))
        total = len(chapters)
        print("total {0}".format(total))
        for index, (chapter_url, raw_title) in enumerate(chapters, 1):
            title = raw_title.decode("gb18030")
            body = self.getContent(chapter_url)
            # Write once per chapter to keep memory usage flat. The trailing
            # blank line keeps this chapter's text from running straight
            # into the next chapter's title.
            self.writeFile(bookname, u"\n\n".join([title, body]) + u"\n\n")
            print("{0}/{1}".format(index, total))
        print("done!")

    def writeFile(self, filename, text):
        """Append *text* to ``<filename>.txt``.

        Unicode text is encoded as UTF-8 explicitly, so the result does not
        depend on the interpreter's default encoding; byte strings are
        written unchanged.
        """
        if not isinstance(text, bytes):
            text = text.encode("utf-8")
        with open(filename + ".txt", "ab") as f:
            f.write(text)

    def getContent(self, url):
        """Return the plain text of the chapter page at *url* as unicode.

        Raises:
            ValueError: if the page has no ``readerContent`` block.
        """
        found = self.contentTag.search(self.getHtml(url))
        if found is None:
            raise ValueError(
                "no readerContent block found at {0}".format(url))
        # Decode before stripping so the clean-up below works on characters
        # rather than on raw multi-byte GB18030 data.
        body = found.group(1).decode("gb18030")
        body = re.sub("<[^>]+?>", "", body)
        # Crude entity removal: drops "&nbsp;" and any remaining "&".
        return body.replace("nbsp;", "").replace("&", "")
if __name__ == '__main__':
    # Python 2 only: setdefaultencoding is normally removed from sys at
    # interpreter start-up; reloading the module makes it callable again.
    reload(sys)
    # Make implicit str <-> unicode conversions use UTF-8 instead of ASCII.
    sys.setdefaultencoding('utf-8')
    # Script entry point: scrape the book configured in Spider.Run().
    Spider().Run()
声明一下,实在搞不定CSDN编辑器的格式问题了,上述代码中:
self.writeFile(bookname,"\n\n".join(text))
del text[:]
需要声明一点,此代码每抓取一章,就写入文件一次,以防内存占用过大。
self.writeFile(bookname,"\n\n".join(text))del text[:]
如果需要,也可以抓取N章写入文件一次,只需加入一个简单的逻辑判断就OK了。占用多少内存和写多少次文件,每个人有自己不同的衡量标准。