def download_css(self, html):
    """Download every stylesheet referenced in *html* below ``self.spath``.

    Each double-quoted string in *html* is treated as a candidate link.
    Candidates that end in ``.css`` or contain ``css?`` are resolved against
    ``self.site_url``, fetched with ``requests`` and written to a directory
    under ``self.spath`` that mirrors the directory part of the URL.

    Args:
        html: Page source to scan for stylesheet links.

    Returns:
        None. Progress is reported with ``print``.
    """
    quoted_strings = re.findall('(".*?")', html)
    if not quoted_strings:
        return

    print('>>>>>>>>>>>>>>>>>>>>>>>>>正在下载css文件>>>>>>>>>>>>>>>>>>>>>>>>>')
    for quoted in quoted_strings:
        if not (quoted.endswith('.css"') or 'css?' in quoted):
            continue

        # urljoin() returns an absolute URL unchanged, so no scheme test is
        # needed here (the previous "'http:' not in x or 'https:' not in x"
        # condition was always true anyway).
        css_link = urljoin(self.site_url, quoted.replace('"', ''))
        if not css_link:
            continue

        # Drop everything up to the last '//', then use the text between the
        # first and the last '/' as the directory mirroring the URL.
        without_scheme = re.sub('(.*//)', '', css_link)
        first_slash = without_scheme.find('/')
        last_slash = without_scheme.rfind('/')
        dir_path = self.spath + without_scheme[first_slash:last_slash]
        dir_path = dir_path.replace('/$', '/')

        # File name is whatever follows the last '/' of the full link.
        css_name = css_link.rsplit('/', 1)[-1]
        if css_name.startswith('$'):
            css_name = css_name.replace('$', '')
        if not css_name:
            # Link ends in '/': there is no file name to write to.
            continue

        # Create the directory without going through a shell: the path is
        # derived from page content and must not be interpreted as a command.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

        print('正在下载css文件:', css_link)
        # A timeout keeps one unresponsive host from stalling the whole run.
        css_file = requests.get(css_link, timeout=30).content
        with open('./%s' % (dir_path + '/' + css_name), 'wb') as f:
            f.write(css_file)
    print('===========================css文件下载完毕===========================')
def download_html(self, html):
    """Save the page source *html* as ``<spath>/htmls/<spath>.html``.

    The ``htmls`` directory under ``self.spath`` is created when it does not
    exist yet. Nothing is written when *html* is empty.

    Args:
        html: Page source to store; written as UTF-8 text.

    Returns:
        None. Progress is reported with ``print``.
    """
    print('>>>>>>>>>>>>>>>>>>>>>>>>>正在下载网页源码>>>>>>>>>>>>>>>>>>>>>>>>>')
    html_dir = self.spath + '/htmls'
    if not os.path.exists(html_dir):
        print(self.base_path+'/htmls', '已经将html文件存在对应目录')
        # os.makedirs instead of a shell "mkdir -p": works for paths with
        # spaces or shell metacharacters and raises on failure.
        os.makedirs(html_dir, exist_ok=True)
    else:
        print(self.base_path+'/htmls', '已经存在此htmls目录')
    if html:
        with open('./%s' % (html_dir + '/' + self.spath + '.html'), 'w', encoding='utf-8') as f:
            f.write(html)
    print('===========================下载网页源码完毕===========================')
def gs_runner(self, html):
    """Run every download step on the page source *html*.

    Delegates to ``download_html``, ``download_imgs``, ``download_css`` and
    ``download_js`` in that order, passing *html* to each.
    """
    # Names are resolved one at a time so each step is looked up only when
    # it is about to run.
    for step_name in ('download_html', 'download_imgs', 'download_css', 'download_js'):
        getattr(self, step_name)(html)
# Script entry point: build a GovSpider and start it through start_page().
if __name__ == '__main__':
    gs = GovSpider()
    gs.start_page()
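# NOTE: the two lines below are leftover page text from the article this code
# was copied from, not Python; kept as comments so the file still parses.
# p1-2-2
# 最新推荐文章于 2024-07-30 03:34:14 发布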