"""
爬取彼岸图网
思路:
1.按用户输入的爬取对象编号构建首页url
2.爬取url,获取页数信息(re)
3.线程池爬取所有页面,获取每个页面上所有图片块的url(xpath)
4.多任务异步协程爬取图片块信息,下载图片
5.所有图片下载完毕后再次询问用户是否继续其他爬取
----------------------------------------------------------------
"""
# Required modules
import os
import re
import requests
import urllib3
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
# Silence the InsecureRequestWarning caused by verify=False below
urllib3.disable_warnings()
# Globals prepared up front
# Directory the images are saved to
path_picture = "C:\\Users\\kangheng\\Desktop\\MyPicture\\"
# Custom User-Agent plus a proxy IP (free proxies like this one die quickly -- swap in a live one or drop proxies)
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:44.0) Gecko/20100101 Firefox/44.0", "Connection": "close"}
proxies = {"http": "http://120.83.108.45:9999"}
# Category number -> [display name, URL slug]
categories = {
    1: ["4K风景", "fengjing"],
    2: ["4K美女", "meinv"],
    3: ["4K游戏", "youxi"],
    4: ["4K动漫", "dongman"],
    5: ["4K影视", "yingshi"],
    6: ["4K明星", "mingxing"],
    7: ["4K汽车", "qiche"],
    8: ["4K动物", "dongwu"],
    9: ["4K人物", "renwu"],
    10: ["4K美食", "meishi"],
    11: ["4K宗教", "zongjiao"],
    12: ["4K背景", "beijing"]
}
# Show the menu, read the user's choice and return the matching URL slug
def Get_home_url():
    for num, (label, _) in categories.items():
        print("{0} {1}".format(num, label))
    print("------------------------")
    n = int(input("Enter the number of the category to scrape: "))
    name = categories[n][1]
    # print("URL to scrape:", f"https://pic.netbian.com/4k{name}/")
    return name
# Send a GET request and return the response, or None on failure
def Get_response(url):
    try:
        response = requests.get(url, headers=headers, proxies=proxies, verify=False, timeout=5)
        return response
    except Exception as error:
        print(error)
        return None
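# A small retry wrapper (my addition, not part of the original script): free
# proxies drop connections often, so retrying a few times before giving up
# helps. Hedged sketch -- adjust `tries` to taste; callers still handle None.
def Get_response_retry(url, tries=3):
    for _ in range(tries):
        response = Get_response(url)
        if response is not None and response.status_code == 200:
            return response
    return None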
# Extract the total page count from the category home page
def Get_page(response, name):
    response.encoding = "gbk"
    pat = rf'<a href="/4k{name}/index_(\d+)\.html"'
    rst = re.compile(pat).findall(response.text)
    page = max((int(i) for i in rst), default=1)  # fall back to 1 if no pagination links
    print(f"Total pages: {page}")
    return page
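# Illustration with hypothetical values: for name = "fengjing" the pattern above
# matches pagination anchors such as
#     <a href="/4kfengjing/index_2.html"> ... <a href="/4kfengjing/index_156.html">
# so findall returns ['2', ..., '156'] and max() yields the last page number.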
# Fetch one picture's detail page, then download the full-size image
def Download_picture(url):
    response = Get_response(url)
    if response is None:
        return
    response.encoding = "gbk"
    tree = etree.HTML(response.text)
    img = tree.xpath('//*[@id="img"]/img/@src')[0]
    title = tree.xpath('//*[@id="img"]/img/@alt')[0]
    url_picture = f"https://pic.netbian.com{img}"
    response_picture = Get_response(url_picture)
    if response_picture is None:
        return
    # Drop characters Windows forbids in file names
    title = re.sub(r'[\\/:*?"<>|]', "", title)
    with open(path_picture + title + ".jpg", "wb") as fp:
        fp.write(response_picture.content)
# Collect the detail-page link of every picture block on one page
def Get_all_part(url):
    response = Get_response(url)
    if response is None:
        return
    response.encoding = "gbk"
    tree = etree.HTML(response.text)
    lis = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    for li in lis:
        part_url = li.xpath('./a/@href')[0]
        Download_picture(f"https://pic.netbian.com{part_url}")
    print("Finished one page!")
# Request every page through a thread pool
def Get_all_response(url, page):
    with ThreadPoolExecutor(30) as t:
        # Page 1 is the bare category URL; only pages 2+ use index_{i}.html
        t.submit(Get_all_part, url)
        for i in range(2, page + 1):
            t.submit(Get_all_part, url + f"index_{i}.html")
def main():
    name = Get_home_url()
    url = f"https://pic.netbian.com/4k{name}/"
    os.makedirs(path_picture, exist_ok=True)  # make sure the save directory exists
    response = Get_response(url)
    if response is None:
        print("Could not reach the category home page, aborting this run!")
        return
    page = Get_page(response, name)
    Get_all_response(url, page)
if __name__ == "__main__":
    while True:
        main()
        choice = input("All done!\nScrape another category? (yes/no) ")
        if choice != "yes":
            print("Program finished!")
            break
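Step 4 of the plan above mentions multi-task async coroutines, but the script ends up downloading inside the page threads instead. For anyone curious, here is a minimal sketch of the coroutine variant. It assumes aiohttp is installed (pip install aiohttp) and reuses headers, path_picture, re and etree from above; treat it as an illustration, not a tested drop-in replacement.

# Hedged sketch: download picture blocks with asyncio + aiohttp instead of threads
import asyncio
import aiohttp

async def fetch_bytes(session, url):
    # GET a URL and return the raw response body
    async with session.get(url) as resp:
        return await resp.read()

async def download_one(session, part_url):
    # Parse one detail page, then fetch and save its image
    html = (await fetch_bytes(session, f"https://pic.netbian.com{part_url}")).decode("gbk", "ignore")
    tree = etree.HTML(html)
    img = tree.xpath('//*[@id="img"]/img/@src')[0]
    title = re.sub(r'[\\/:*?"<>|]', "", tree.xpath('//*[@id="img"]/img/@alt')[0])
    data = await fetch_bytes(session, f"https://pic.netbian.com{img}")
    with open(path_picture + title + ".jpg", "wb") as fp:
        fp.write(data)

async def download_all(part_urls):
    # One coroutine per picture block, all driven by a single session
    async with aiohttp.ClientSession(headers=headers) as session:
        await asyncio.gather(*(download_one(session, u) for u in part_urls))

# Usage: feed it the part_url values collected in Get_all_part, e.g.
#   asyncio.run(download_all(["/tupian/26783.html"]))  # hypothetical path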
If you found this useful as you passed by, a like would satisfy my little vanity; and I warmly welcome the experts to suggest improvements or new approaches. Many thanks!!!