Python image downloader
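A small crawler built on the standard library plus BeautifulSoup: it scrapes free HTTP proxies from the xici listing page, walks the jandan.net/ooxx comment pages, collects the image links on each page, and saves the pictures to save_path (D:\pic\girl by default). Every request is retried up to max_error_times, switching to a random proxy (or back to the local connection) after each failure.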


import urllib.request
import urllib.error
import os
import sys
import http.client
import time
import re
import random
import math
from bs4 import BeautifulSoup

data = None
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}
enctype = 'utf-8'
proxies = []
max_error_times = 5         # give up on a download after 5 failed attempts
save_path = r'D:\pic\girl'

def get_result(req_or_url, is_retrieve=False, filename=None):       # is_retrieve: save to filename via urlretrieve instead of returning a response
    error_time = 0
    while True:
        try:
            if error_time == max_error_times:
                print('Failed %d times... giving up' % max_error_times)
                return None
            error_time += 1
            if is_retrieve:
                return urllib.request.urlretrieve(req_or_url,filename)
            else:
                return urllib.request.urlopen(req_or_url)
        except urllib.error.URLError as e:
            if hasattr(e,'code'):
                print(e.code,e.reason)
                change_proxy()
                continue
            elif hasattr(e,'reason'):
                print(e)
                change_proxy()
                continue
        except (ConnectionResetError,http.client.BadStatusLine) as e:
            print(e)
            change_proxy()
            continue
        except TimeoutError as e:
            print(e)
            print('The server did not respond for a long time, switching proxy...')
            change_proxy()
            continue


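# Scrape free HTTP proxies ("ip:port") from the xici listing page into the global proxies list.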
def get_proxy():
    global data,headers,proxies
    req = urllib.request.Request('http://www.xici.net.co',None,headers)
    response = get_result(req)
    if response is None:
        print('Failed to fetch the proxy list page')
        return
    html = response.read().decode('utf-8')
    # print(html)
    soup = BeautifulSoup(html, "html.parser")

    tables = soup.findAll('table')
    tab = tables[0]
    for tr in tab.findAll('tr'):
        try:
            tdlist = tr.findAll('td')
            if len(tdlist) > 5 and tdlist[5].getText() == 'HTTP':
                proxies.append(tdlist[1].getText() + ':' + tdlist[2].getText())
        except Exception as e:
            print(e)
        # for td in tr.findAll('td')[1:]:
        #     if td.getText() == 'HTTP':
        #         proxies.append(td[0]+":"+td[1])
    # print(proxies)




def change_proxy():
    # pick a random proxy; None means "use the local connection directly"
    proxy = random.choice(proxies) if proxies else None
    if proxy is None:
        proxy_support = urllib.request.ProxyHandler({})
    else:
        proxy_support = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent', headers['User-Agent'])]
    urllib.request.install_opener(opener)
    print('Switched proxy to: %s' % ('local connection' if proxy is None else proxy))


def create_localhost():
    # pad the proxy list with None entries (about 61.8% of its length) so that
    # change_proxy() sometimes falls back to the local connection
    number = int((math.sqrt(5) - 1) / 2 * len(proxies))
    print(number)
    for x in range(number):
        proxies.append(None)



def get_page():         # fetch the current maximum page number
    home = 'http://jandan.net/ooxx'
    global data,headers,enctype
    req = urllib.request.Request(home,data,headers)
    response = get_result(req)
    if response is None:
        print('Failed to fetch the page...')
        sys.exit()
    html = response.read().decode(enctype)
    # the page source contains current-comment-page">[N]; skip the 3 chars '">[' and read up to ']'
    find_string = 'current-comment-page'
    find_start = html.index(find_string) + len(find_string) + 3
    find_end = html.index(']',find_start+1)
    return int(html[find_start:find_end])



def get_pic_list(url):     # return a list of image URLs found on the page at url
    global data,headers,enctype
    piclist = []
    req = urllib.request.Request(url, data, headers)
    response = get_result(req)
    if response is None:
        print('Failed to fetch the page...')
        sys.exit()
    html = response.read().decode(enctype)
    soup = BeautifulSoup(html, "html.parser")
    for image in soup.findAll('img'):
        src = image.get('src')
        if src:                              # skip <img> tags without a src attribute
            piclist.append("http:" + src)    # prepend the scheme to the protocol-relative src
    return piclist




def get_pic(page):     # generator: yield image URLs, walking from `page` down to page 1
    global data,headers,enctype
    while True:
        url = 'http://jandan.net/ooxx/page-%d' % page
        print('Current page: %d' % page)
        req = urllib.request.Request(url,data,headers)
        response = get_result(req)
        if response is None:
            print('Failed to fetch the page...')
            sys.exit()
        html = response.read().decode(enctype)
        pic_pattern = re.compile(r'<img\s+src="(http://.+?\.(?:jpg|jpeg|gif))"')
        for match in pic_pattern.finditer(html):
            print(match.group(1))
            yield match.group(1)
        time.sleep(5)      # be polite: pause between pages
        page -= 1
        if page < 1:
            break


def get_pageurl_list(url):
    # generator: starting from url, follow the 'previous-comment-page' link and yield the next 10 page URLs
    global data, headers, enctype
    for i in range(10):
        req = urllib.request.Request(url,data,headers)
        response = get_result(req)
        if response is None:
            print('Failed to fetch the page...')
            sys.exit()
        html = response.read().decode(enctype)
        soup = BeautifulSoup(html, "html.parser")
        tag = soup.find('a', attrs={'class': {'previous-comment-page'}})
        if tag is None:        # no older page to follow
            break
        url = 'http:' + tag['href']
        yield url


def download():
    count = 1
    global data,headers
    for pic_url in get_pic(get_page()):         # replace get_page() with a page number such as 1000 to start from page 1000
        file_name = os.path.split(pic_url)[1]
        if not os.path.isdir(save_path):        # create the directory if it does not exist
            os.makedirs(save_path)
        get_result(pic_url, True, os.path.join(save_path, file_name))
        print('Downloaded image #%d: %s' % (count, pic_url))
        count += 1



def download2():
    page = 1
    count = 1
    global data, headers
    for page_url in get_pageurl_list('http://jandan.net/ooxx/'):
        print(page_url)
        for pic_url in get_pic_list(page_url):
            file_name = os.path.split(pic_url)[1]
            if not os.path.isdir(save_path):  # create the directory if it does not exist
                os.makedirs(save_path)
            get_result(pic_url, True, os.path.join(save_path, file_name))
            print('Downloaded page %d, image #%d: %s' % (page, count, pic_url))
            count += 1
        page += 1


if __name__ == '__main__':
    download2()
    # download()
    # get_pic_list()
    # get_proxy()
    # get_pic(10)
    # create_localhost()
    # download()
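As written, __main__ calls download2() straight away, so the proxies list stays empty and every retry falls back to the local connection. A minimal sketch of how the proxy helpers above could be wired in first (same functions as in the script, nothing new assumed); it would replace the __main__ block:

if __name__ == '__main__':
    get_proxy()           # fill proxies from the xici listing page
    create_localhost()    # mix in some "no proxy" slots
    change_proxy()        # install an initial opener
    download2()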


 
