import requests,re,time,math
import os
from io import BytesIO
from PIL import Image
# Accumulators for the step-1 listing HTML (detail-page links).
# NOTE(review): these three lists are referenced only by the disabled
# local-file variant kept as a string literal further down; the live
# code path never touches them.
all_bigimg_html = []
real_url_html = []
# After entering a detail page: collected big-image addresses.
new_big_img = []
# 得到页面url
def get_pageurl(root, start, counts):
    """Build the list of listing-page URLs to crawl.

    Args:
        root: URL template containing a '%d' placeholder for the page
              number, e.g. 'http://host/dir/index_%d.html'.
        start: first page number to fetch; values <= 1 are clamped to 1.
        counts: total number of pages to collect.

    Returns:
        A list of ``counts`` fully-formed page URLs.  On this site page 1
        has no '_%d' suffix, so it is special-cased to 'index.html'.
    """
    if start <= 1:
        # Clamp so the range arithmetic below stays valid.
        start = 1
        # Page 1 is plain 'index.html': rewrite the last path segment.
        fragment_root = root.split('/')
        fragment_root[-1] = 'index.html'
        all_page = ['/'.join(fragment_root)]
        # The remaining pages follow the numbered template.
        all_page.extend(root.replace('%d', str(i))
                        for i in range(start + 1, start + counts))
    else:
        all_page = [root.replace('%d', str(i))
                    for i in range(start, start + counts)]
    return all_page
# 通过得到页面 得到大图的链接
def get_html(all_page):
    """Fetch each listing page and extract the relative detail-page links.

    Args:
        all_page: list of listing-page URLs (from ``get_pageurl``).

    Returns:
        List of absolute detail-page URLs (``linkpre`` + '/tupian/<id>.html').

    Relies on the module-level globals ``headers`` and ``linkpre``.
    """
    part_url = []
    # Compile once, outside the loop.  The dot is escaped so only a literal
    # '.html' suffix matches (the original pattern's bare '.' matched any char).
    gethtml = re.compile(r"^/tupian/\d*\.html$")
    for index, pageurl in enumerate(all_page, start=1):
        # Be polite: pause one second between requests.
        time.sleep(1)
        # Progress indicator.
        print('小图html完成进度' + str(math.floor(index / len(all_page) * 100)) + '%')
        try:
            res = requests.get(pageurl, headers=headers, timeout=5)
            res.encoding = 'gbk'
            # Raise on HTTP error status.
            res.raise_for_status()
        except Exception:
            # Skip this page instead of dereferencing an undefined/stale
            # response (the original fell through and crashed or reused it).
            print('http error')
            continue
        # Split the HTML on '"' so attribute values become list items,
        # then keep the ones that look like detail-page paths.
        for fragment in re.split("\"", res.text):
            found = re.findall(gethtml, fragment)
            if found:
                part_url.append(found[0])
    # Prefix every relative path with the site root (built once, after the loop).
    return [linkpre + i for i in part_url]
# NOTE(review): the following is dead code kept as a bare module-level string
# literal — it is parsed but never executed.  It is a local-file read/write
# variant of get_html the author kept "for reference"; consider deleting it.
'''
本地文件读写 根据需要取舍
# 获得的文件写入到本地文件夹中 a+追加写入的方式一次性写完
with open("C:/Users/30818/Desktop/pagefile.txt",'a+',encoding='ISO-8859-1') as pagefile:
pagefile.write(r.text)
# 打开文件做正则匹配并在屏幕输出
with open("C:/Users/30818/Desktop/pagefile.txt",'r',encoding="gbk") as readfile:
# 文件内容读取到内存中
str = readfile.read();
# " 号分隔文件
mid = re.split("\"", str);
#正则匹配
for i in mid:
readhtml = re.findall("^/tupian/\d*.html$", i);
# 匹配到了就放入数组中
if len(readhtml) != 0:
# readhtml是一个匹配到的数组 匹配到几次就会有几次 所以将第1个放入
all_bigimg_html.append(readhtml[0]);
real_url_html = [ linkpre + all_bigimg_html[i] for i in range(len(all_bigimg_html)) ]
# 清空文件
with open("C:/Users/30818/Desktop/realurl.txt",'w',encoding='ISO-8859-1') as realurl:
realurl.write('');
# 将真实地址存放在文件夹中
with open("C:/Users/30818/Desktop/realurl.txt",'a+',encoding='ISO-8859-1') as realulr:
for i in real_url_html:
realulr.write(i);
realulr.write('\r');
'''
#得到大图的真实链接地址
def big_imgget(real_url):
    """Fetch each detail page and extract the full-size image path.

    Args:
        real_url: list of detail-page URLs (from ``get_html``).

    Returns:
        List of absolute image URLs (``linkpre`` + '/uploads/allimg/…jpg').

    Relies on the module-level globals ``headers`` and ``linkpre``.
    """
    part_big_url = []
    # Compile once outside the loop; non-greedy '.*?' stops at the first 'jpg'.
    re_downurl = re.compile(r'/uploads/allimg/.*?jpg')
    for index, html in enumerate(real_url, start=1):
        # Progress indicator.
        print('大图html当前进度' + str(math.floor(index / len(real_url) * 100)) + '%')
        # Throttle between requests.
        time.sleep(0.4)
        try:
            r = requests.get(html, headers=headers, timeout=5)
            r.encoding = 'gbk'
            r.raise_for_status()
        except Exception:
            # Skip on failure instead of parsing an undefined/stale response
            # (the original fell through and crashed or reused it).
            print('http error')
            continue
        # search() returns the first match or None.
        first = re.search(re_downurl, r.text)
        if first is not None:
            part_big_url.append(first[0])
    # Build absolute download URLs once, after the loop.
    return [linkpre + i for i in part_big_url]
def downloadIMG(url, path):
    '''
    Download a single image and save it to disk.

    url  -- direct image URL to fetch
    path -- destination file path for the saved image

    On any failure (network or decode) prints "save error" and returns.
    Relies on the module-level global ``headers``.
    '''
    try:
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()
        # Decode the payload with Pillow, then re-encode it to `path`.
        picture = Image.open(BytesIO(response.content))
        picture.save(path)
        print('save successful')
    except Exception:
        print("save error")
#当前窗口是主函数窗口
if __name__ == "__main__":
    ############################ configuration
    # Request headers, copied from chrome://version.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'}
    # Listing-page URL template; '%d' is replaced with the page number.
    root = 'http://pic.netbian.com/4kdongman/index_%d.html'
    # Prefix for relative detail-page and image links.
    linkpre = 'http://pic.netbian.com'
    # Directory where downloaded images are saved.
    path = 'C:\\Users\\30818\\Desktop\\img\\'
    ############################ crawl
    # Step 1: listing-page URLs (start page, page count).
    all_page = get_pageurl(root, 2, 5)
    # Step 2: detail-page URLs scraped from the listings.
    two_step = get_html(all_page)
    # Step 3: direct image download URLs.
    three_step = big_imgget(two_step)
    # Step 4: download every image.
    for index, img_url in enumerate(three_step, start=1):
        # Progress indicator.
        print('图片下载进度' + str(math.floor(index / len(three_step) * 100)) + '%')
        # Build a unique filename from the last two URL segments,
        # e.g. '.../200922/230835-xyz.jpg' -> '200922-230835-xyz.jpg'.
        realpath = path + '-'.join(img_url.split('/')[-2:])
        downloadIMG(img_url, realpath)