import requests
from lxml import etree
import time
import os
from concurrent.futures import ThreadPoolExecutor
# Map a human-readable category name to its pinyin slug in the site's URLs
date_dict = {
    '动漫': 'dongman',
    '游戏': 'youxi',
    '美女': 'meinv',
    '风景': 'fengjing',
    '影视': 'yingshi',
    '汽车': 'qiche',
    '人物': 'renwu',
    '动物': 'dongwu',
    '宗教': 'zongjiao',
    '背景': 'beijing'
}
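# For example, picking '动漫' yields the slug 'dongman', which get_page()
# below expands to https://pic.netbian.com/4kdongman/ (and its
# index_{n}.html pagination) for that category.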
def get_value_by_key(key):
    # Check whether the key exists in the dict
    if key in date_dict:
        # Key exists: return the matching slug
        value = date_dict[key]
    else:
        # Key missing: report the error and return None
        print("That key is not in the dict, please try again.")
        value = None
    return value
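# e.g. get_value_by_key('动漫') -> 'dongman'
#      get_value_by_key('足球') -> None   (any key not in date_dict)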
# Build the URL for a given listing page; the site serves page 1 as
# index.html and later pages as index_{n}.html
def page_url(page_num, value):
    if page_num == 1:
        return f'https://pic.netbian.com/4k{value}/'
    return f'https://pic.netbian.com/4k{value}/index_{page_num}.html'
# Fetch one listing page and return the full URLs of the images on it
def get_page(page_num, value):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'
    }
    response = requests.get(page_url(page_num, value), headers=header)
    html = response.text
    et = etree.HTML(html)
    result = et.xpath("//img/@src")
    urls = ["https://pic.netbian.com" + url for url in result]
    return urls
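# Usage sketch: get_page(2, 'dongman') requests
# https://pic.netbian.com/4kdongman/index_2.html and returns the src of
# every <img> on that page, prefixed with the site's domain.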
# Fetch the same listing page and return the image names (alt text)
def get_name(html_url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'
    }
    response = requests.get(html_url, headers=header)
    response.encoding = response.apparent_encoding  # let requests guess the real encoding so Chinese titles decode correctly
    html = response.text
    et = etree.HTML(html)
    names = et.xpath("//img/@alt")
    return names
# Sanitize a filename by dropping characters that are illegal on Windows
def clean_filename(name):
    return ''.join(e for e in name if e.isalnum() or e in ' ._-')
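# e.g. clean_filename('梦幻 风景 4K?') -> '梦幻 风景 4K'; the '?' is dropped,
# while str.isalnum() keeps CJK characters, so Chinese titles survive intact.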
# Download one image and write it to disk
def save_image(url, name, base_path):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0'
    }
    name = clean_filename(name)  # sanitize the filename
    file_name = os.path.join(base_path, name + '.jpg')  # assumes every image is a .jpg
    response = requests.get(url, headers=header)
    time.sleep(1)  # crude rate limit so the threads don't hammer the site
    if response.status_code == 200:
        with open(file_name, 'wb') as f:
            f.write(response.content)
        print(f'Saved {file_name}!')
    else:
        print(f'Request for {url} failed, status code: {response.status_code}')
# Main entry point
def main():
    base_path = 'D:\\python爬图片文件'  # base folder for the downloaded images
    try:
        os.makedirs(base_path, exist_ok=True)  # create the folder if it doesn't exist yet
    except OSError as e:
        print(f"Error while creating the folder: {e}")
        return
    key = input("Enter a category key (e.g. 动漫): ")
    value = get_value_by_key(key)
    if value is not None:
        num = int(input('How many pages do you want to fetch? '))
        with ThreadPoolExecutor(max_workers=8) as executor:  # at most 8 threads at once
            for i in range(1, num + 1):  # pages 1 through num
                urls = get_page(i, value)
                names = get_name(page_url(i, value))
                for url, name in zip(urls, names):
                    executor.submit(save_image, url, name, base_path)
        print("Done crawling!")
if __name__ == '__main__':
    main()
This script is an upgraded version of the pic.netbian.com (彼岸图网) crawler: you can pick the category and the number of pages, and it downloads with multiple threads, so it is fast!
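For reference, the multithreading above boils down to the standard ThreadPoolExecutor fan-out pattern. Here is a minimal, self-contained sketch of just that pattern, where fetch_one is a stand-in for save_image and the example.com URLs are dummies:

import time
from concurrent.futures import ThreadPoolExecutor

def fetch_one(url):
    # stand-in for save_image: pretend to download, then report
    time.sleep(0.1)
    print(f'fetched {url}')

with ThreadPoolExecutor(max_workers=8) as executor:  # same pool size as the crawler
    for url in [f'https://example.com/{i}.jpg' for i in range(20)]:
        executor.submit(fetch_one, url)  # each call runs on a worker thread
# leaving the with-block waits for all submitted tasks to finish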
Note: the downloaded images end up in the D:\\python爬图片文件 folder on your machine; go have a look!
Below is a packaged .exe I built, so you can run it even without a Python environment:
Link: https://pan.baidu.com/s/1IrKjKKgY1wkx8D_WnPr3GA?pwd=sn2r
Extraction code: sn2r