#!/usr/bin/python
# coding:utf-8
import httplib2
import urllib
import os
from lxml import etree
##################################
# Note: this script mirrors (downloads) a set of pretty HTTPS page templates
##################################
# Root url of the site being mirrored; all sub-links are resolved against it
URL = 'https://xxxx.com/theme/dexter/full-width-light/'
# Map of relative url -> True while still pending, False once downloaded;
# '' stands for the site root (saved as index.html)
sub_urls = {'':True};
# Local directory the mirrored files are written under
save_file_root = 'D:/tmp/full-width-light/'
# Shared HTTP client; certificate validation is deliberately disabled
http = httplib2.Http(disable_ssl_certificate_validation=True)
# Parse links (stylesheets, scripts, anchors, images, ...) out of an HTML page
def get_urls(content, xpath=u'//link/@href'):
    """Extract links matched by *xpath* from *content* and queue new ones.

    Every link that passes the filters below is recorded in the module-level
    ``sub_urls`` dict with value ``True`` (meaning "not downloaded yet").

    :param content: raw HTML bytes/text of one page
    :param xpath: XPath selecting the attribute values to harvest
    """
    tree = etree.HTML(content)
    links = tree.xpath(xpath)
    for link in links:
        # BUGFIX: the original put '#' comments after line-continuation
        # backslashes, which is a SyntaxError; parentheses allow inline
        # comments on a multi-line condition.
        if (link.find('#') < 0                 # filter fragment links
                and link.find('javascript:') < 0   # filter javascript: pseudo-links
                and link not in sub_urls           # filter urls already seen
                and link.find('https') < 0):       # filter absolute urls
            sub_urls[link] = True
#保存文件
def save_file(link):
if link == '' and sub_urls[link]:
urllib.urlretrieve(URL + link,save_file_root + 'index.html')
if sub_urls[link] and link != URL and link !='':
print '--->', link
if link.find('/') > -1 :
tmp_path = link[0:link.rindex('/') + 1]
if not os.path.isdir(save_file_root + tmp_path) :
os.makedirs(save_file_root + tmp_path)
urllib.urlretrieve(URL + link,save_file_root + link)
sub_urls[link] = False
def download_content(url):
print '===>' , url
d,content = http.request(URL + url)
if url.find(".html") > -1 or url == '' :
get_urls(content,u'//link/@href')
get_urls(content,u'//script/@src')
get_urls(content,u'//a/@href')
# save to file
save_file(url)
#递归
for link in sub_urls.keys() :
if sub_urls[link] :
download_content(link)
# Entry point: start crawling from the site root (empty relative url).
if __name__ == "__main__" :
    download_content('')