懒人的漫画下载工具[基本能用版]-CSDN博客

本文介绍了一个用于从特定网站抓取漫画资源的Python爬虫程序。该程序能够解析不同格式的漫画链接并下载图片，适用于不规范的URL结构。文中详细展示了爬虫的设计思路及实现细节。

从 http://mh.jumpcn.com/上下载漫画。
这个网站的防盗链不够强大，添加一个伪造的http referer header就可以了。
但是图片的URL不是特别规范（有好多种），为了省事就采用顺藤摸瓜的办法了——先访问目录页得到章节URL，然后再依次访问各图片页面（本来只需要访问第一张图片页就可以推出后面页码的，但是我发现图片URL命名风格很多种，可能是手动添加的原因吧）。
基本上都用的是标准模块，win xp sp2，py 2.5通过（家里的电脑只有windows……）
使用方法：
python source comicurl comicname
（保存路径固定为 D:\comic，因为我自己写的 config 和 log 都在寝室电脑上，现在也不想去重复劳动）

贴代码：

#!/usr/bin/env python
# coding=cp936
import urllib,httplib,os,re,sys
from sgmllib import SGMLParser
from urlparse import urlparse
'''
    chuan's comic robot
     '''
# Root directory under which every comic gets its own sub-directory.
# (The path is fixed; there is no config file support yet.)
comicpath = 'D:\\comic'

# TODO: resumable downloads. The sketch below was never enabled; it is kept
# as a comment (it used to live inside a bare string literal) for reference.
#
# class SaveLog(object):
#     "SaveLog for 'The break point continues the biography ' "
#     def __init__(self,path):
#         self.f = open(os.path.join(path,'save.log'),'r')

class IndexLister(SGMLParser):
    """Collect chapter links from a comic's index page.

    Feed the index page HTML to the parser; afterwards ``cpturl`` holds one
    ``[href, title]`` pair for every ``<a>`` tag accepted by ``start_a``.
    """

    def reset(self):
        """Reset the parser state and clear the collected chapter list."""
        SGMLParser.reset(self)
        self.cpturl = []

    def start_a(self, attrs):
        """Record an ``<a>`` tag that points at a chapter's first page.

        ``attrs`` is the list of ``(name, value)`` pairs handed over by
        SGMLParser.  A tag is accepted when it has more than two attributes,
        the third one is ``title`` and the first attribute's value contains
        ``/1.html`` (not at position 0).
        NOTE(review): the first attribute is presumed to be ``href`` --
        confirm against the site's markup.
        """
        if len(attrs) <= 2:
            return
        href = attrs[0][1]
        attr_name, title = attrs[2]
        if attr_name == 'title' and href.find('/1.html') > 0:
            self.cpturl.append([href, title])

class ChapterLister(SGMLParser):
    """Work out how many pages a chapter has from its ``<option>`` tags.

    After feeding a chapter page, ``pagenum`` is the largest numeric
    ``value`` seen as the first attribute of any ``<option>`` tag, or 0 when
    none was found.
    """

    def reset(self):
        """Reset the parser state and forget any previously seen page count."""
        SGMLParser.reset(self)
        self.pagenum = 0

    def start_option(self, attrs):
        """Track the highest numeric ``value`` among ``<option>`` tags.

        ``attrs`` is the list of ``(name, value)`` pairs handed over by
        SGMLParser; only the first attribute is inspected.
        """
        if not attrs:
            # An <option> without attributes carries no page number.
            return
        attr_name, value = attrs[0]
        if attr_name == 'value' and value.isdigit():
            self.pagenum = max(self.pagenum, int(value))

class PicParser(object):
    """Extract the picture URL from the HTML of a single comic page."""

    def __init__(self):
        # Matches the table cell that wraps the page image in a link.
        # Groups: (1) link href, (2) image src, (3) image alt text.
        # [^"]+ keeps each group inside its own attribute value, so a match
        # cannot run across several attributes or several cells on one line.
        self.re = re.compile(
            r'<td><a href="([^"]+)"><img src="([^"]+)" alt="([^"]+)"'
            r' border="0" /></a></td>'
        )

    def parser(self, attrs):
        """Return the image ``src`` of the first match in ``attrs``.

        ``attrs`` is the page HTML as a string.  All matches are kept in
        ``self.result``; ``None`` is returned when the page contains no
        matching cell.
        """
        self.result = self.re.findall(attrs)
        if self.result:
            return self.result[0][1]
        return None

class Comic(object):
    """A comic: its index page URL, its local folder and its chapters."""

    def __init__(self, url, name='未命名'):
        """Remember the index ``url`` and create the comic's folder.

        The folder is ``comicpath/name``; it is created immediately.
        """
        self.url = url
        self.chaptercnt = 0
        self.idxLister = IndexLister()
        self.chapters = []
        self.currentchapter = None
        self.path = os.path.join(comicpath, name)
        self.makedir()

    def save(self, path):
        """Placeholder; not implemented."""
        pass

    def getchapter(self):
        """Fetch the index page and build a Chapter for every link found.

        Each Chapter is constructed with the link's href, this comic's folder
        and the link's title.
        """
        page = urllib.urlopen(self.url)
        try:
            self.idxLister.feed(page.read())
        finally:
            # Release the HTTP handle even if reading or parsing fails.
            page.close()
        for href, title in self.idxLister.cpturl:
            self.chapters.append(Chapter(href, self.path, title))

    def go(self):
        """Currently a no-op.

        The disabled loop would call ``savepic()`` on each chapter, but
        ``Chapter.__init__`` already does that.
        """
        pass
        # for item in self.chapters:
        #     item.savepic()

    def makedir(self):
        """Create ``self.path`` (with parents) unless it already exists.

        Failures are reported on stderr instead of being raised, so a folder
        problem does not abort the run on its own.
        """
        if os.path.isdir(self.path):
            return
        try:
            os.makedirs(self.path)
        except OSError:
            # sys.exc_info() avoids the version-specific "except X, e" syntax.
            sys.stderr.write('cannot create %s: %s\n'
                             % (self.path, sys.exc_info()[1]))

class Chapter(object):
    """One chapter: finds its pages, resolves picture URLs and saves them."""

    def __init__(self, url='', path='', name='未知章节'):
        """Download the whole chapter as part of construction.

        ``url`` is the chapter's first page, ``path`` the comic's folder and
        ``name`` the chapter title (also used as the sub-folder name).
        Construction performs every step: create the folder, read the page
        count, build the page URLs, resolve the picture URLs and save them.
        """
        self.url = url
        self.cptLister = ChapterLister()
        self.pic = PicParser()
        self.name = name
        self.pagenum = 0
        self.content = None
        self.picurllist = []
        self.chapterurllist = []
        self.path = os.path.join(path, name)

        self.makedir()
        self.getchapternum()
        self.initchapterurl()
        self.getpicurl()
        self.savepic()

    def makedir(self):
        """Create ``self.path`` (with parents) unless it already exists.

        Failures are reported on stderr instead of being raised.
        """
        if os.path.isdir(self.path):
            return
        try:
            os.makedirs(self.path)
        except OSError:
            # sys.exc_info() avoids the version-specific "except X, e" syntax.
            sys.stderr.write('cannot create %s: %s\n'
                             % (self.path, sys.exc_info()[1]))

    def initchapterurl(self):
        """Fill ``chapterurllist`` with the URL of every page of the chapter.

        Page URLs are derived from the first page's URL by swapping the page
        number, so ``self.url`` is expected to end in ``1.html``.
        """
        if not self.pagenum:
            return
        self.chapterurllist.append(self.url)
        urlhead = self.url[:-6]   # everything before "1.html"
        urltail = self.url[-5:]   # ".html"
        # Inclusive upper bound: stopping at pagenum lost the last page.
        for page in range(2, self.pagenum + 1):
            self.chapterurllist.append(urlhead + str(page) + urltail)

    def getpicurl(self):
        """Fetch every page in ``chapterurllist`` and collect picture URLs.

        Pages on which no picture URL is found are reported and skipped, so
        ``picurllist`` only ever holds usable URLs.
        """
        if not self.pagenum:
            return
        for pageurl in self.chapterurllist:
            page = urllib.urlopen(pageurl)
            try:
                picurl = self.pic.parser(page.read())
            finally:
                page.close()
            if picurl:
                self.picurllist.append(picurl)
            else:
                print('no picture found in %s' % pageurl)

    def savepic(self):
        """Download each picture in ``picurllist`` into the chapter folder.

        Files that already exist are skipped.  Each request carries a
        ``Referer`` header (set to the picture URL itself) to get past the
        site's hot-link protection.
        """
        for picurl in self.picurllist:
            if not picurl:
                continue
            target = os.path.join(self.path, picurl[picurl.rfind('/') + 1:])
            if os.path.isfile(target):
                print('%s exist abort.' % picurl)
                continue
            url = urlparse(picurl)
            con = httplib.HTTPConnection(url.hostname)
            try:
                con.putrequest('GET', url.path)
                con.putheader('Referer', picurl)
                con.endheaders()
                response = con.getresponse()
                status = response.status
                content = response.read()
            finally:
                con.close()
            if status != 200:
                # Do not store an error page under an image file name.
                print('get %s fail (HTTP %s).' % (picurl, status))
                continue
            f = open(target, 'wb')
            try:
                f.write(content)
            finally:
                f.close()
            print('save %s done.' % picurl)

    def getchapternum(self):
        """Fetch the chapter's first page and read the page count from it."""
        page = urllib.urlopen(self.url)
        try:
            self.content = page.read()
        finally:
            page.close()
        if self.content:
            self.cptLister.feed(self.content)
            self.pagenum = self.cptLister.pagenum
            # print debug info
            print('%s 页 %s' % (self.cptLister.pagenum, self.name))
        else:
            print('get %s pageinfo fail.' % self.name)

    def picparser(self, picurl):
        """Placeholder; not implemented."""
        pass

# Script entry point: python <this file> <comic index url> <comic name>
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('usage: [python] [this] [url] [name]')
    else:
        try:
            # Example:
            # mycomic = Comic("http://mh.jumpcn.com/comic-book/708/","史上最强弟子兼一")
            mycomic = Comic(sys.argv[1], sys.argv[2])
            mycomic.getchapter()
            mycomic.go()
        except Exception:
            # Top-level boundary: report the error instead of a traceback.
            # sys.exc_info() avoids the version-specific "except X, e" syntax.
            print(sys.exc_info()[1])

下载（有严重错误，停止下载）
~~ccr.py~~

2008年2月1日更新：发现一个重大BUG，原文第111行（即 Chapter.initchapterurl 中的 for 循环）应为
for i in range(2,self.pagenum+1):
否则每一章会少一页。
2008年2月6日更新：另一个BUG，最后一页的正则抓取方式与前面各页不一样，修改见第二版。

转载于:https://www.cnblogs.com/melorain/archive/2008/01/31/1059540.html