Qidian currently runs two domains built on different page technologies:
cmfu is the old ASP site. For public chapters, Qidian has JavaScript pull in a .txt file; compared with rendering the text straight into the page, that is genuinely a smart choice (and a convenient one for scrapers :p).
qidian uses the newer .NET stack, and there you have to fall back on regular expressions to match things out.
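Before the community code, here is a minimal sketch of the whole trick. The URL pattern and the 15-character document.write( prefix are assumptions inferred from the scripts below, not anything Qidian documents:

#!/usr/bin/python
# minimal sketch of the scrape: chapter page -> .txt URL -> plain text
import re
import urllib

def fetch_chapter(chapter_url):
    page = urllib.urlopen(chapter_url).read()
    # the .NET pages reference the chapter body as a .txt loaded by JS;
    # this URL pattern is guessed from the scripts below
    m = re.search(r'http://files\.qidian\.com/author\d+/\d+/\d+\.txt', page)
    if m is None:
        return None
    txt = urllib.urlopen(m.group(0)).read()
    # strip the "document.write(" wrapper (15 characters) to get the text
    return txt[15:]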
Building on the small program from my previous post, here is all the related code from the community.
First, mine:
#!/usr/bin/python
#filename:simpleCMFU
import re
import urllib

def alaynsis_id(url_given):
    # use the last 11 characters of the .txt URL (e.g. "4451850.txt") as the file name
    name=url_given[-11:-1]+url_given[-1]
    return name

def read(url_given):
    # fetch the chapter page and pull out the URL of the chapter's .txt file
    html=urllib.urlopen(url_given)
    page=html.read()
    html.close()
    #rex = r'http:\/\/files\.qidian\.com\/[a-zA-Z]{6}[0-9]\/\d{6}\/\d{6}\.txt'
    rex = r'http://files.qidian.com/author[0-9]/\d{6}/\d{7}.txt'
    #http://files.qidian.com/author3/172602/4451850.txt
    url_down=test(page,rex)
    url=url_down[0]
    #print rex
    return url

def read2(url,name):
    # download the .txt and save it; the first 15 characters are the
    # "document.write(" JS wrapper, so skip them
    html=urllib.urlopen(url)
    page=html.read()
    html.close()
    page=page[15:len(page)]
    fl=file(name,'w')
    fl.write(page)
    fl.close()
    return 'ok'

def test(html,rex):
    #r = re.compile(rex)
    matchs = re.findall(rex,html,re.DOTALL)
    return matchs

def run():
    url=raw_input('please send address you wanted:')
    url_tmp=read(url)
    name=alaynsis_id(url_tmp)
    read2(url_tmp,name)

if __name__ == '__main__':
    print 'this program is just for download text from qidian.com by duducai@msn.com/please visit http://duducai.javaeye.com \n'
    run()
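To grab more than one chapter, the same functions can be reused in a loop; the chapter URL below is a hypothetical placeholder, not a real format:

for chapter in ['http://www.qidian.com/BookReader/1234,5678.aspx']:  # hypothetical URLs
    url_txt = read(chapter)
    read2(url_txt, alaynsis_id(url_txt))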
Next is Ben Luo's fine work from the community, which goes through the Sina books channel:
#####################
#html2txt.py
#####################
from formatter import AbstractFormatter, NullWriter
from htmllib import HTMLParser

def _(str, in_encoder="gbk", out_encoder="utf8"):
    return unicode(str, in_encoder).encode(out_encoder)

class myWriter(NullWriter):
    def __init__(self):
        NullWriter.__init__(self)
        self._bodyText = []
    def send_flowing_data(self, str):
        # collect every run of body text the formatter hands us
        self._bodyText.append(str)
    def _get_bodyText(self):
        return '\n'.join(self._bodyText)
    bodyText = property(_get_bodyText, None, None, 'plain text from body')

class myHTMLParser(HTMLParser):
    def do_meta(self, attrs):
        self.metas = attrs

def convertFile(filename):
    mywriter = myWriter()
    absformatter = AbstractFormatter(mywriter)
    parser = myHTMLParser(absformatter)
    parser.feed(open(filename).read())
    return ( _(parser.title), parser.formatter.writer.bodyText )

import os
import os.path

OUTPUTDIR = "./txt"
INPUTDIR = "."

if __name__ == "__main__":
    if not os.path.exists(OUTPUTDIR):
        os.mkdir(OUTPUTDIR)
    for file in os.listdir(INPUTDIR):
        if file[-4:] == '.htm' or file[-5:] == '.html':
            print "Converting", file,
            outfilename = os.path.splitext(file)[0]
            a, text = convertFile(file)   # a is the (unused) page title
            outfilename = outfilename + '.txt'
            outfullname = os.path.join(OUTPUTDIR, outfilename)
            open(outfullname, "wt").write(text)
            print "Done!"
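Usage is directory-based: save the chapter pages from the Sina book channel as .htm/.html files, run html2txt.py in that directory, and the plain-text versions land in ./txt. Note that htmllib and formatter are Python 2 standard-library modules (both gone from modern Python 3), and the GBK-to-UTF-8 conversion in _() is only applied to the title here.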
# -*- coding: utf-8 -*-
################################
#pickupcontent.py
################################
import sys
import glob
import os
import re

# expand shell wildcards ourselves, for platforms that don't (e.g. Windows)
sys.argv[1:] = [item for arg in sys.argv[1:] for item in glob.glob(arg)]

startstr = u"^八十".encode("gb2312") # article title line
endstr = u"^\[返回".encode("gb2312") # "[返回" navigation line that ends the body
tmp_start = re.compile(startstr)
tmp_end = re.compile(endstr)

for infile in sys.argv[1:]:
    # print infile
    f = open(infile,'r')
    #print f
    lines = f.readlines()
    fout = ''
    for index, line in enumerate(lines):
        if tmp_start.match(line):
            kstart = index
        if tmp_end.match(line):
            kend = index
            break
    f.close()
    # keep only the lines between the title and the navigation footer
    fout = fout.join(lines[kstart:kend])
    tmp = open('tmp','w')
    tmp.write(fout)
    tmp.close()
    os.remove(infile)
    os.rename('tmp',infile)
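Run it as python pickupcontent.py *.txt: each file is trimmed in place to the lines between the first match of the start pattern (here a line beginning with 八十, i.e. the chapter titles of that particular book) and the [返回 navigation line, then written back over the original. Both markers are hard-coded gb2312 patterns, so they need adjusting per book.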
And finally, BIGZHU's:
#@+leo-ver=4-thin-encoding=gb2312,.
#@+node:BIGZHU.20070731160918:@thin d:/bigzhu/python/python_project/get_cmfu.py
#@+at
#@nonl
# Qidian novel crawler
#@-at
#@@c
#@@language python
#@+others
#@+node:BIGZHU.20070731161308:import
import httplib,urllib2,urllib,cookielib,re,threading
import os
#@nonl
#@-node:BIGZHU.20070731161308:import
#@+node:BIGZHU.20070731160928:getCookie
def getCookie():
    cj = cookielib.CookieJar()  # create a CookieJar instance
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))  # build an opener tied to the cookie jar
    return opener
#@-node:BIGZHU.20070731160928:getCookie
#@-others
#@<>
#@+node:BIGZHU.20070731160918.1:<>
def getBookIdList(urlList):
    BookIdList = []
    for i in urlList:
        url=i
        #print url
        request = urllib2.urlopen(url)
        cmfu = request.read()
        #cmfuURL = re.findall("<>
#@nl
#@nonl
#@-node:BIGZHU.20070731160918:@thin d:/bigzhu/python/python_project/get_cmfu.py
#@-leo
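The #@... lines are sentinels from the Leo outlining editor, in which BIGZHU wrote the file; they are plain comments and don't affect execution. Note the listing is truncated as published: the section names inside the << >> markers and the regex inside re.findall("...") were swallowed by the blog's HTML rendering, so getBookIdList as shown breaks off before it ever returns its BookIdList.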