#!/usr/bin/python
# coding:utf-8
import httplib2
import urllib
import os
from lxml import etree
##################################
# Note: this script mirrors (downloads) a set of pretty HTTPS page templates
##################################
# Root url of the site being mirrored; all sub-links are resolved against it
URL = 'https://xxxx.com/theme/dexter/full-width-light/'
# Map of relative url -> True while still pending, False once downloaded;
# '' stands for the site root (saved as index.html)
sub_urls = {'':True};
# Local directory the mirrored files are written under
save_file_root = 'D:/tmp/full-width-light/'
# Shared HTTP client; certificate validation is deliberately disabled
http = httplib2.Http(disable_ssl_certificate_validation=True)
# Parse links (stylesheets, scripts, anchors, images, ...) out of an HTML page
def get_urls(content, xpath=u'//link/@href'):
    """Extract links matched by *xpath* from *content* and queue new ones.

    Every link that passes the filters below is recorded in the module-level
    ``sub_urls`` dict with value ``True`` (meaning "not downloaded yet").

    :param content: raw HTML bytes/text of one page
    :param xpath: XPath selecting the attribute values to harvest
    """
    tree = etree.HTML(content)
    links = tree.xpath(xpath)
    for link in links:
        # BUGFIX: the original put '#' comments after line-continuation
        # backslashes, which is a SyntaxError; parentheses allow inline
        # comments on a multi-line condition.
        if (link.find('#') < 0                 # filter fragment links
                and link.find('javascript:') < 0   # filter javascript: pseudo-links
                and link not in sub_urls           # filter urls already seen
                and link.find('https') < 0):       # filter absolute urls
            sub_urls[link] = True
#保存文件
def save_file(link):
if link == '' and sub_urls[link]:
urllib.urlretrieve(URL + link,save_file_root + 'index.html')
if sub_urls[link] and link != URL and link !='':
print '--->', link
if link.find('/') > -1 :
tmp_path = link[0:link.rindex('/') + 1]
if not os.path.isdir(save_file_root + tmp_path) :
os.makedirs(save_file_root + tmp_path)
urllib.urlretrieve(URL + link,save_file_root + link)
sub_urls[link] = False
def download_content(url):
print '===>' , url
d,content = http.request(URL + url)
if url.find(".html") > -1 or url == '' :
get_urls(content,u'//link/@href')
get_urls(content,u'//script/@src')
get_urls(content,u'//a/@href')
# save to file
save_file(url)
#递归
for link in sub_urls.keys() :
if sub_urls[link] :
download_content(link)
# Entry point: start crawling from the site root (empty relative url).
if __name__ == "__main__" :
    download_content('')