Python 图片

最新推荐文章于 2022-08-11 15:52:50 发布
原创最新推荐文章于 2022-08-11 15:52:50 发布 · 307 阅读
0 ·
CC 4.0 BY-SA版权
Python 专栏收录该内容
13 篇文章
订阅专栏

import urllib.request
import urllib.error
import os
import sys
import http.server
import http.client
import time
import re
import random
import math
from typing import List, Any
from bs4 import BeautifulSoup

data = None
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}
enctype = 'utf-8'
proxies = []
max_error_times = 5         # 最多允许失败5次，否则放弃该图片下载
save_path = r'D:\pic\girl'

def get_result(req_or_url,is_retrieve=False,filename = None):       #flag是否使用retrieve
    error_time = 0
    while True:
        try:
            if error_time == max_error_times:
                print('失败次数达%d次......放弃操作' % max_error_times)
                return None
            error_time += 1
            if is_retrieve:
                return urllib.request.urlretrieve(req_or_url,filename)
            else:
                return urllib.request.urlopen(req_or_url)
        except urllib.error.URLError as e:
            if hasattr(e,'code'):
                print(e.code,e.reason)
                change_proxy()
                continue
            elif hasattr(e,'reason'):
                print(e)
                change_proxy()
                continue
        except (ConnectionResetError,http.client.BadStatusLine) as e:
            print(e)
            change_proxy()
            continue
        except TimeoutError as e:
            print(e)
            print('服务器长时间无响应，自动切换代理.....')
            change_proxy()
            continue


def get_proxy():
    global data,headers,proxies
    req = urllib.request.Request('http://www.xici.net.co',None,headers)
    response = get_result(req)
    html = response.read().decode('utf-8')
    # print(html)
    soup = BeautifulSoup(html, "html.parser")

    tables = soup.findAll('table')
    tab = tables[0]
    for tr in tab.findAll('tr'):
        try:
            tdlist = tr.findAll('td')
            if(len(tdlist) > 0):
                if(tdlist[5].getText()=='HTTP'):
                    proxies.append(tdlist[1].getText()+':'+tdlist[2].getText())
        except Exception as e:
            print(e)
        # for td in tr.findAll('td')[1:]:
        #     if td.getText() == 'HTTP':
        #         proxies.append(td[0]+":"+td[1])
    # print(proxies)




def change_proxy():
    proxy = random.choice(proxies)
    if proxy == None:
        proxy_support = proxy_support = urllib.request.ProxyHandler({})
    else:
        proxy_support = urllib.request.ProxyHandler({'http':proxy})
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders = [('User-Agent',headers['User-Agent'])]
    urllib.request.install_opener(opener)
    print('智能切换代理：%s' % ('本机' if proxy==None else proxy))


def create_localhost():
    number = int((math.sqrt(5)-1)/2) * len(proxies)
    print(number)
    for x in range(number):
        proxies.append(None)



def get_page():         #获取最大页数
    home = 'http://jandan.net/ooxx'
    global data,headers,enctype
    req = urllib.request.Request(home,data,headers)
    response = get_result(req)
    html = response.read().decode(enctype)
    find_string = 'current-comment-page'
    find_start = html.index(find_string) + len(find_string) + 3
    find_end = html.index(']',find_start+1)
    return int(html[find_start:find_end])



def get_pic_list(url):     #生成器，返回一个图片链接
    global data,headers,enctype
    piclist = []
    req = urllib.request.Request(url, data, headers)
    response = get_result(req)
    if response == None:
        print('获取页面失败.....')
        sys.exit()
    html = response.read().decode(enctype)
    soup = BeautifulSoup(html, "html.parser")
    for image in soup.findAll('img'):
        piclist.append("http:"+image['src'])
        # print("http:"+image['src'])
    return piclist




def get_pic(page):     #生成器，返回一个图片链接
    global data,headers,enctype
    while True:
        url = 'http://jandan.net/ooxx/page-%d' % page
        print('当前页面：%d' % page)
        req = urllib.request.Request(url,data,headers)
        response = get_result(req)
        if response == None:
            print('获取页面失败.....')
            sys.exit()
        html = response.read().decode(enctype)
        pic = re.compile(r'<img\s+src="(http://.+?\.(?:jpg|jpeg|gif))"')
        for pic in pic.finditer(html):
            print(pic.group(1))
            # yield pic.group(1)
        time.sleep(5)
        page -= 1
        if page<1:
            break


def get_pageurl_list(url):
    global data, headers, enctype
    for i in range(0,10):
        # url = 'http://jandan.net/ooxx/'
        req = urllib.request.Request(url,data,headers)
        response = get_result(req)
        if response == None:
            print('获取页面失败......')
            sys.exit()
        html = response.read().decode(enctype)
        soup = BeautifulSoup(html, "html.parser")
        tag = soup.find('a',attrs={'class': {'previous-comment-page'}})
        # print(tag['href'])
        url = 'http:' + tag['href']
        yield url


def download():
    count = 1
    global data,headers
    for pic_url in get_pic_list():         #get_page()改为页数如1000可从1000页开始下载
        file_name = os.path.split(pic_url)[1]
        if not os.path.isdir(save_path):        #目录不存在就创建
            os.makedirs(save_path)
        get_result(pic_url,True,save_path+'\\'+file_name)
        print('本次成功下载第%d个图片! %s' % (count, pic_url))
        count += 1



def download2():
    page = 1
    count = 1
    global data, headers
    for page_url in get_pageurl_list('http://jandan.net/ooxx/'):  # get_page()改为页数如1000可从1000页开始下载
        print(page_url)
        for pic_url in get_pic_list(page_url):
            file_name = os.path.split(pic_url)[1]
            if not os.path.isdir(save_path):  # 目录不存在就创建
                os.makedirs(save_path)
            get_result(pic_url, True, save_path + '\\' + file_name)
            print('本次成功下载 第%d页 第%d个图片! %s' % (page,count, pic_url))
            count += 1
        page +=1


if __name__ == '__main__':
    download2()
    # download()
    # get_pic_list()
    # get_proxy()
    # get_pic(10)
    # create_localhost()
    # download()