# Download pictures into one folder per category; this is relatively slow.
import urllib.request
import os
import re
# Root folder that receives one sub-folder per picture category.
base_path = 'PicDownload'
# Create the root folder up front; an already existing directory is left as is.
os.makedirs(base_path, exist_ok=True)
# Extract sub-page links and sub-page titles.
def filterHTML(html):
    """Collect ``(absolute_link, title)`` pairs for the thumbnail anchors in *html*.

    An anchor is recognised by the literal markup
    ``class="il_img"><a href="..." title="..."``. Every captured href is
    prefixed with the site root by plain string concatenation.

    Returns a list of 2-tuples in the order the anchors appear; the list is
    empty when nothing matches.
    """
    site_root = 'http://www.ivsky.com'
    anchor_pattern = re.compile(r'class="il_img"><a href="(.*?)" title="(.*?)"')
    return [
        (site_root + href, caption)
        for href, caption in anchor_pattern.findall(html)
    ]
# Extract the picture address from a picture page.
def search_pic(html):
    """Return ``(pic_url, pic_name)`` for the picture described in *html*.

    The page must contain ``var imgURL='...'`` followed, on the same line,
    by ``aid='...'``. The first such occurrence is used: the image path is
    prefixed with the image host and the aid value gets a ``.jpg`` suffix.
    Both values are also printed.

    Raises:
        IndexError: if the pattern does not occur in *html*.
    """
    image_host = 'http://img.ivsky.com'
    matches = re.compile(r"var imgURL='(.*?)'.*aid='(.*?)'").findall(html)
    relative_url, picture_id = matches[0]

    pic_url = image_host + relative_url
    print("pic_url:", pic_url)

    file_name = picture_id + '.jpg'
    print("pic_name:", file_name)

    return (pic_url, file_name)
# Open a web page.
def openURL(web):
    """Fetch *web* and return its body decoded as UTF-8 text.

    The request carries a desktop-browser User-Agent header. The response
    object is closed as soon as the body has been read, so the underlying
    connection is not leaked.

    Args:
        web: URL to open.

    Returns:
        The complete response body as a ``str``.

    Raises:
        urllib.error.URLError: if the request fails.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36 QIHU 360SE'}
    req = urllib.request.Request(url=web, headers=headers)
    # The context manager guarantees the response is closed even when
    # reading or decoding raises.
    with urllib.request.urlopen(req) as data:
        return data.read().decode('UTF-8')
# Crawl starting from the index page, fetching one page per picture.
def main1():
    """Download pictures category by category, visiting every picture page.

    For each category linked from the index page a folder named after the
    category title is created under ``base_path``. The category page is then
    opened, every picture page linked from it is opened in turn, the picture
    address is extracted with ``search_pic`` and the picture is saved into
    the category folder.

    A failed download is reported and skipped. Errors while opening or
    parsing a page are not caught here and propagate to the caller.
    """
    web = 'http://www.ivsky.com/tupian/'
    html = openURL(web)
    for link, title in filterHTML(html):
        # Create the folder for this category.
        print(title)
        path = os.path.join(base_path, title)
        os.makedirs(path, exist_ok=True)

        # Enter the category page to find the individual picture pages.
        print('子页面:', link)
        sub_html = openURL(link)
        # Distinct names keep the outer loop's link/title/html intact.
        for pic_page_link, _pic_page_title in filterHTML(sub_html):
            # Enter the picture page to find the picture address.
            print('二次子页面:', pic_page_link)
            pic_html = openURL(pic_page_link)
            pic_url, pic_name = search_pic(pic_html)

            # Save the picture; one failed download must not stop the crawl.
            try:
                urllib.request.urlretrieve(pic_url, os.path.join(path, pic_name))
            except Exception as err:
                # Exception (not a bare except) lets Ctrl-C and SystemExit
                # through; the reason is printed instead of being discarded.
                print('保存失败', err)
def searchHTML(html):
    """List ``(name, link)`` for every picture of the album described in *html*.

    The page must contain, on one line, ``arctitle=`` with a ``(<count>张``
    fragment, then ``imgURL='...'`` and ``aid='...'``. The first entry is the
    aid paired with the image-host URL of ``imgURL``. Each following entry
    ``i`` (1 .. count-1) uses the name ``aid + i`` and the same URL with
    ``-00i`` (zero padded to three digits) inserted before the extension.

    Raises:
        IndexError: if the pattern does not occur in *html*.
        ValueError: if the count is not an integer, or the aid is not an
            integer while more than one picture is announced.
    """
    image_host = 'http://img.ivsky.com'
    pattern = re.compile(r'arctitle=.*\((.*?)张.*?imgURL=\'(.*?)\'.*?aid=\'(.*?)\'')
    count_text, first_url, first_id = pattern.findall(html)[0]
    total = int(count_text)

    first_link = image_host + first_url
    # The host prefix always contains a dot, so the separator is never empty.
    stem, dot, extension = first_link.rpartition('.')

    entries = [(first_id, first_link)]
    entries.extend(
        (str(int(first_id) + offset), stem + ('-%03d' % offset) + dot + extension)
        for offset in range(1, total)
    )
    return entries
def main2():
    """Download pictures category by category, opening one picture page each.

    For each category linked from the index page a folder named after the
    category title is created under ``base_path``. Only the first picture
    page linked from the category page is opened; ``searchHTML`` derives the
    names and addresses of all pictures from that single page, and each one
    is saved into the category folder as ``<name>.jpg``.

    A failed download is reported and skipped. Errors while opening or
    parsing a page (including a category page without picture links, which
    raises IndexError) are not caught here and propagate to the caller.
    """
    web = 'http://www.ivsky.com/tupian/'
    index_html = openURL(web)
    for link, title in filterHTML(index_html):
        # Create the folder for this category.
        print(title)
        path = os.path.join(base_path, title)
        os.makedirs(path, exist_ok=True)

        # Enter the category page and follow its first picture link.
        print('子页面:', link)
        sub_html = openURL(link)
        first_pic_page = filterHTML(sub_html)[0][0]
        pic_html = openURL(first_pic_page)

        # Read the picture count and first picture address from that page,
        # then download every derived picture.
        for pic_name, pic_url in searchHTML(pic_html):
            pic_name = pic_name + '.jpg'
            print('path=%s pic_name=%s pic_url=%s' % (path, pic_name, pic_url))
            try:
                urllib.request.urlretrieve(pic_url, os.path.join(path, pic_name))
            except Exception as err:
                # Exception (not a bare except) lets Ctrl-C and SystemExit
                # through; the reason is printed instead of being discarded.
                print('保存失败', err)
# Start the crawl with the main2 strategy as soon as this module is loaded
# (there is no __main__ guard, so this also runs on import).
main2()