这个网站的防盗链不够强大,添加一个伪造的http referer header就可以了。
但是图片的URL不是特别规范(有好多种),为了省事就采用顺藤摸瓜的办法了——先访问目录页得到章节URL,然后再依次访问各图片页面(本来只需要访问第一张图片页就可以推出后面页码的,但是我发现图片URL命名风格很多种,可能是手动添加的原因吧)。
基本上都用的是标准模块,win xp sp2,py 2.5通过(家里的电脑只有windows……)
使用方法:
python source comicurl comicname
(保存路径为固定的D:\comic,因为我自己写的config和log都在寝室电脑,现在也不想去重复劳动)
贴代码:
#!/usr/bin/env python
# coding=cp936
import urllib,httplib,os,re,sys
from sgmllib import SGMLParser
from urlparse import urlparse
'''
chuan's comic robot
'''
# Base directory all comics are saved under (hard-coded, per the post).
comicpath = 'D:\\comic'
# NOTE(review): SaveLog (resume-download support) was stubbed out by the
# author; kept as comments rather than a dead bare-string expression.
# class SaveLog(object):
#     "SaveLog for resuming interrupted downloads"
#     def __init__(self, path):
#         self.f = open(os.path.join(path, 'save.log'), 'r')
class IndexLister(SGMLParser):
    "parse the comic index page, collecting [url, title] per chapter"
    def reset(self):
        SGMLParser.reset(self)
        # Each entry is [chapter_url, chapter_title].
        self.cpturl = []
    def start_a(self, attrs):
        # sgmllib delivers attrs as a list of (name, value) tuples in source
        # order.  Chapter links carry a title attribute (3rd attr) and point
        # at the first page of the chapter ('.../1.html').
        if len(attrs) > 2 and attrs[2][0] == 'title' and attrs[0][1].find('/1.html') > 0:
            self.cpturl.append([attrs[0][1], attrs[2][1]])
class ChapterLister(SGMLParser):
    "parse a chapter page; pagenum ends up as the highest <option value=N>"
    def reset(self):
        SGMLParser.reset(self)
        self.pagenum = 0
    def start_option(self, attrs):
        # The page selector is a <select> whose <option value="N"> holds the
        # page number; keep the maximum numeric value seen.  Guard against
        # attribute-less <option> tags (original code would IndexError).
        if attrs and attrs[0][0] == 'value' and attrs[0][1].isdigit():
            self.pagenum = max(self.pagenum, int(attrs[0][1]))
class PicParser(object):
    "extract the picture URL from a chapter page with a regular expression"
    def __init__(self):
        # Matches: <td><a href="..."><img src="..." alt="..." border="0" /></a></td>
        # Group 2 (the img src) is the actual picture location.
        self.re = re.compile(r'<td><a href="(.+)"><img src="(.+)" alt="(.+)" border="0" /></a></td>')
    def parser(self, attrs):
        "Return the img src found in the page HTML, or None when absent."
        self.result = self.re.findall(attrs)
        if self.result:
            return self.result[0][1]
        return None
class Comic(object):
    "A comic: fetches its chapter list and drives the per-chapter downloads."
    def __init__(self, url, name='未命名'):
        self.url = url
        self.chaptercnt = 0
        self.idxLister = IndexLister()
        self.chapters = []
        self.currentchapter = None
        self.path = os.path.join(comicpath, name)
        self.makedir()
    def save(self, path):
        # Not implemented (SaveLog / resume support was never finished).
        pass
    def getchapter(self):
        "Download the index page and build one Chapter per discovered link."
        self.idxLister.feed(urllib.urlopen(self.url).read())
        for cpturl, cptname in self.idxLister.cpturl:
            self.chapters.append(Chapter(cpturl, self.path, cptname))
    def go(self):
        # Chapter.__init__ already performs the full download, so there is
        # nothing left to drive here.
        pass
    def makedir(self):
        # Best effort: ignore failures (e.g. directory created concurrently).
        try:
            if not os.path.isdir(self.path):
                os.makedirs(self.path)
        except Exception:
            pass
class Chapter(object):
" parser,get,save picture sort by chapter "
def __init__ (self,url = '' ,path = '' ,name = ' 未知章节 ' ):
self.url = url
self.cptLister = ChapterLister()
self.pic = PicParser()
self.name = name
self.pagenum = 0
self.content = None
self.picurllist = []
self.chapterurllist = []
self.path = os.path.join(path,name)
self.makedir()
self.getchapternum()
self.initchapterurl()
self.getpicurl()
self.savepic()
def makedir(self):
try :
if not os.path.isdir(self.path):
os.makedirs(self.path)
except Exception, e:
pass
def initchapterurl(self):
if self.pagenum:
self.chapterurllist.append(self.url)
urlhead = self.url[0: - 6 ]
urltail = self.url[ - 5 :]
for i in range( 2 ,self.pagenum):
self.chapterurllist.append(urlhead + str(i) + urltail)
def getpicurl(self):
if self.pagenum:
for item in self.chapterurllist:
self.picurllist.append(self.pic.parser(urllib.urlopen(item).read()))
def savepic(self):
for i in self.picurllist:
if os.path.isfile(os.path.join(self.path,i[i.rfind( ' / ' ) + 1 :])):
print i, ' exist abort. '
continue
url = urlparse(i)
con = httplib.HTTPConnection(url.hostname)
con.putrequest( ' GET ' ,url.path)
con.putheader( ' Referer ' ,i)
con.endheaders()
r = con.getresponse()
f = open(os.path.join(self.path,i[i.rfind( ' / ' ) + 1 :]), ' wb ' )
content = r.read()
f.write(content)
f.close()
print ' save ' ,i, ' done. '
def getchapternum(self):
self.content = urllib.urlopen(self.url).read()
if self.content:
self.cptLister.feed(self.content)
self.pagenum = self.cptLister.pagenum
# print debug info
print self.cptLister.pagenum, ' 页 ' ,self.name
else :
print ' get %s pageinfo fail. ' % self.name
def picparser(self,picurl):
pass
if __name__ == ' __main__ ' :
if len(sys.argv) != 3 :
print ' usage: [python] [this] [url] [name] '
else :
try :
# mycomic = Comic("http://mh.jumpcn.com/comic-book/708/","史上最强弟子兼一")
mycomic = Comic(sys.argv[ 1 ],sys.argv[ 2 ])
mycomic.getchapter()
mycomic.go()
except Exception, e:
print e
# coding=cp936
import urllib,httplib,os,re,sys
from sgmllib import SGMLParser
from urlparse import urlparse
'''
chuan's comic robot
'''
# Base directory all comics are saved under (hard-coded, per the post).
comicpath = 'D:\\comic'
# NOTE(review): SaveLog (resume-download support) was stubbed out by the
# author; kept as comments rather than a dead bare-string expression.
# class SaveLog(object):
#     "SaveLog for resuming interrupted downloads"
#     def __init__(self, path):
#         self.f = open(os.path.join(path, 'save.log'), 'r')
class IndexLister(SGMLParser):
    "parse the comic index page, collecting [url, title] per chapter"
    def reset(self):
        SGMLParser.reset(self)
        # Each entry is [chapter_url, chapter_title].
        self.cpturl = []
    def start_a(self, attrs):
        # sgmllib delivers attrs as a list of (name, value) tuples in source
        # order.  Chapter links carry a title attribute (3rd attr) and point
        # at the first page of the chapter ('.../1.html').
        if len(attrs) > 2 and attrs[2][0] == 'title' and attrs[0][1].find('/1.html') > 0:
            self.cpturl.append([attrs[0][1], attrs[2][1]])
class ChapterLister(SGMLParser):
    "parse a chapter page; pagenum ends up as the highest <option value=N>"
    def reset(self):
        SGMLParser.reset(self)
        self.pagenum = 0
    def start_option(self, attrs):
        # The page selector is a <select> whose <option value="N"> holds the
        # page number; keep the maximum numeric value seen.  Guard against
        # attribute-less <option> tags (original code would IndexError).
        if attrs and attrs[0][0] == 'value' and attrs[0][1].isdigit():
            self.pagenum = max(self.pagenum, int(attrs[0][1]))
class PicParser(object):
    "extract the picture URL from a chapter page with a regular expression"
    def __init__(self):
        # Matches: <td><a href="..."><img src="..." alt="..." border="0" /></a></td>
        # Group 2 (the img src) is the actual picture location.
        self.re = re.compile(r'<td><a href="(.+)"><img src="(.+)" alt="(.+)" border="0" /></a></td>')
    def parser(self, attrs):
        "Return the img src found in the page HTML, or None when absent."
        self.result = self.re.findall(attrs)
        if self.result:
            return self.result[0][1]
        return None
class Comic(object):
    "A comic: fetches its chapter list and drives the per-chapter downloads."
    def __init__(self, url, name='未命名'):
        self.url = url
        self.chaptercnt = 0
        self.idxLister = IndexLister()
        self.chapters = []
        self.currentchapter = None
        self.path = os.path.join(comicpath, name)
        self.makedir()
    def save(self, path):
        # Not implemented (SaveLog / resume support was never finished).
        pass
    def getchapter(self):
        "Download the index page and build one Chapter per discovered link."
        self.idxLister.feed(urllib.urlopen(self.url).read())
        for cpturl, cptname in self.idxLister.cpturl:
            self.chapters.append(Chapter(cpturl, self.path, cptname))
    def go(self):
        # Chapter.__init__ already performs the full download, so there is
        # nothing left to drive here.
        pass
    def makedir(self):
        # Best effort: ignore failures (e.g. directory created concurrently).
        try:
            if not os.path.isdir(self.path):
                os.makedirs(self.path)
        except Exception:
            pass
class Chapter(object):
" parser,get,save picture sort by chapter "
def __init__ (self,url = '' ,path = '' ,name = ' 未知章节 ' ):
self.url = url
self.cptLister = ChapterLister()
self.pic = PicParser()
self.name = name
self.pagenum = 0
self.content = None
self.picurllist = []
self.chapterurllist = []
self.path = os.path.join(path,name)
self.makedir()
self.getchapternum()
self.initchapterurl()
self.getpicurl()
self.savepic()
def makedir(self):
try :
if not os.path.isdir(self.path):
os.makedirs(self.path)
except Exception, e:
pass
def initchapterurl(self):
if self.pagenum:
self.chapterurllist.append(self.url)
urlhead = self.url[0: - 6 ]
urltail = self.url[ - 5 :]
for i in range( 2 ,self.pagenum):
self.chapterurllist.append(urlhead + str(i) + urltail)
def getpicurl(self):
if self.pagenum:
for item in self.chapterurllist:
self.picurllist.append(self.pic.parser(urllib.urlopen(item).read()))
def savepic(self):
for i in self.picurllist:
if os.path.isfile(os.path.join(self.path,i[i.rfind( ' / ' ) + 1 :])):
print i, ' exist abort. '
continue
url = urlparse(i)
con = httplib.HTTPConnection(url.hostname)
con.putrequest( ' GET ' ,url.path)
con.putheader( ' Referer ' ,i)
con.endheaders()
r = con.getresponse()
f = open(os.path.join(self.path,i[i.rfind( ' / ' ) + 1 :]), ' wb ' )
content = r.read()
f.write(content)
f.close()
print ' save ' ,i, ' done. '
def getchapternum(self):
self.content = urllib.urlopen(self.url).read()
if self.content:
self.cptLister.feed(self.content)
self.pagenum = self.cptLister.pagenum
# print debug info
print self.cptLister.pagenum, ' 页 ' ,self.name
else :
print ' get %s pageinfo fail. ' % self.name
def picparser(self,picurl):
pass
if __name__ == ' __main__ ' :
if len(sys.argv) != 3 :
print ' usage: [python] [this] [url] [name] '
else :
try :
# mycomic = Comic("http://mh.jumpcn.com/comic-book/708/","史上最强弟子兼一")
mycomic = Comic(sys.argv[ 1 ],sys.argv[ 2 ])
mycomic.getchapter()
mycomic.go()
except Exception, e:
print e
下载(有严重错误,停止下载)
2008年2月1日更新:发现一个重大BUG,111行应为
for i in range(2,self.pagenum+1):
否则每一章会少一页。
2008年2月6日更新:另一个BUG,最后一页的正则抓取与前面页数不一样,修改见第二版。