An Image-Scraping Example

Target site: https://pic.netbian.com/

 

"""
爬取彼岸图网

思路:
1.按用户输入的爬取对象编号构建首页url
2.爬取url,获取页数信息(re)
3.线程池爬取所有页面,获取每个页面上所有图片块的url(xpath)
4.多任务异步协程爬取图片块信息,下载图片
5.所有图片下载完毕后再次询问用户是否继续其他爬取

----------------------------------------------------------------
"""


# Required imports
import os
import re
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor


# Globals prepared up front

# Local directory where the images are saved (created if missing)
path_picture = "C:\\Users\\kangheng\\Desktop\\MyPicture\\"
os.makedirs(path_picture, exist_ok=True)

# Spoof the User-Agent and route requests through a proxy IP
# (the free proxy below is from the original post; swap in a live one if requests keep failing)
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0", 'Connection': 'close'}
proxies = {"http": "http://120.83.108.45:9999"}
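
# Note (added): verify=False below makes urllib3 print an InsecureRequestWarning
# for every request; this standard call silences it if the noise bothers you.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)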

# Category menu: number -> [display name, URL slug]
# (renamed from `dict`, which shadows the Python builtin)
categories = {
    1: ["4K Scenery", "fengjing"],
    2: ["4K Beauties", "meinv"],
    3: ["4K Games", "youxi"],
    4: ["4K Anime", "dongman"],
    5: ["4K Movies & TV", "yingshi"],
    6: ["4K Celebrities", "mingxing"],
    7: ["4K Cars", "qiche"],
    8: ["4K Animals", "dongwu"],
    9: ["4K People", "renwu"],
    10: ["4K Food", "meishi"],
    11: ["4K Religion", "zongjiao"],
    12: ["4K Backgrounds", "beijing"]
}


# Show the category menu and return the URL slug for the user's choice
def Get_home_url():
    for num, (label, _) in categories.items():
        print("{0}    {1}".format(num, label))
    print("------------------------")
    n = int(input("Enter the number of the category to scrape: "))
    name = categories[n][1]
    return name


# Fetch a URL; returns None on any request error or non-2xx status
def Get_response(url):
    try:
        response = requests.get(url, headers=headers, proxies=proxies, verify=False, timeout=5)
        response.raise_for_status()
        return response
    except Exception as error:
        print(error)
        return None
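
# A small retry wrapper (added sketch, not in the original post): free proxies
# drop requests often, so trying a few times before giving up saves reruns.
def Get_response_retry(url, attempts=3):
    for _ in range(attempts):
        response = Get_response(url)
        if response is not None:
            return response
    return None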
    

# Extract the total number of listing pages from the category home page
def Get_page(response, name):
    response.encoding = "gbk"
    pat = f'<a href="/4k{name}/index_(.*?).html"'
    rst = re.compile(pat).findall(response.text)
    page = max(int(i) for i in rst) if rst else 1
    print(f"Total pages: {page}")
    return page



# Resolve the full-size image URL on a detail page and save the image
def Download_picture(url):
    response = Get_response(url)
    if response is None:
        return
    response.encoding = "gbk"
    tree = etree.HTML(response.text)
    img = tree.xpath('//*[@id="img"]/img/@src')[0]
    title = tree.xpath('//*[@id="img"]/img/@alt')[0]
    url_picture = f"https://pic.netbian.com{img}"
    response_picture = Get_response(url_picture)
    if response_picture is None:
        return
    with open(path_picture + title + ".jpg", "wb+") as fp:
        fp.write(response_picture.content)
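
# A hypothetical helper (added, not in the original post): alt-text titles can
# contain characters that Windows forbids in filenames ( \ / : * ? " < > | ),
# which makes open() raise OSError; sanitizing the title first avoids that.
def Safe_title(title):
    return re.sub(r'[\\/:*?"<>|]', "_", title)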

        
# Collect every thumbnail's detail-page link on one listing page and download each image
def Get_all_part(url):
    response = Get_response(url)
    if response is None:
        return
    response.encoding = "gbk"
    tree = etree.HTML(response.text)
    lis = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    for li in lis:
        part_url = li.xpath('./a/@href')[0]
        Download_picture(f"https://pic.netbian.com{part_url}")
    print("Finished one page!")
    
    
    
# Request every listing page with a thread pool
def Get_all_response(url, page):
    with ThreadPoolExecutor(30) as t:
        for i in range(1, page + 1):
            # On this site the first page is index.html; index_1.html does not exist,
            # only page 2 onward use the index_{i}.html pattern
            page_url = url + ("index.html" if i == 1 else f"index_{i}.html")
            t.submit(Get_all_part, page_url)

            
def main():
    name = Get_home_url()
    url = f"https://pic.netbian.com/4k{name}/"
    response = Get_response(url)
    if response is None:
        print("Could not reach the category page, please try again.")
        return
    page = Get_page(response, name)
    Get_all_response(url, page)
    

    
if __name__ == "__main__" :
    while True:
        main()
        choice = input("爬取全部完成!\n是否继续爬取?(请输入yes或no)")
        if choice == "yes":
            pass
        else:
            print("程序结束!")
            break
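
Step 4 of the plan mentions multi-task async coroutines, but the script above downloads the images on each listing page one at a time. Below is a minimal sketch of what that step could look like with asyncio + aiohttp. It assumes aiohttp is installed, reuses headers, path_picture, and etree from the script above, and takes detail_urls to be a list of detail-page URLs collected by the xpath step; treat it as a starting point, not a drop-in replacement.

import asyncio
import aiohttp

async def download_one(session, url):
    # Fetch the detail page, resolve the full-size image URL, then save the bytes
    async with session.get(url) as resp:
        text = await resp.text(encoding="gbk")
    tree = etree.HTML(text)
    img = tree.xpath('//*[@id="img"]/img/@src')[0]
    title = tree.xpath('//*[@id="img"]/img/@alt')[0]
    async with session.get(f"https://pic.netbian.com{img}") as resp:
        data = await resp.read()
    with open(path_picture + title + ".jpg", "wb") as fp:
        fp.write(data)

async def download_all(detail_urls):
    # One shared session, one coroutine per picture
    ua = {"User-Agent": headers["User-Agent"]}
    async with aiohttp.ClientSession(headers=ua) as session:
        await asyncio.gather(*(download_one(session, u) for u in detail_urls))

# Usage: asyncio.run(download_all(detail_urls))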

If this helped, a like from passers-by would satisfy my small vanity. Suggestions for improvement and new ideas from the pros are very welcome, and thanks in advance!
