【无标题】

仅仅展示如何爬取某网站的图片

思路:利用谷歌 chromedriver 打开目标网页,逐屏滚动页面使图片全部加载,然后获取所有 img 节点,并逐个下载其中的图片。
使用前请先在代码中的 base_url 处填写需要爬取的网站网址!!!

仅供娱乐,如有侵权,请联系删除

在这里插入图片描述

import os
import re
import time
from urllib.parse import urlsplit

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By

# Initialize WebDriver. No driver path is passed here, so chromedriver has to
# be discoverable by Selenium (e.g. on PATH); otherwise pass its location
# explicitly when constructing the driver.
driver = webdriver.Chrome()

# Address of the page to scrape -- fill this in before running the script.
base_url = ''

# Pages to visit. By default only base_url itself is scraped; an empty
# base_url yields an empty list so the script exits cleanly instead of
# failing on an undefined name.
# To crawl a run of sequentially numbered pages instead, build the list like:
#   urls = [f"{base_url}{i}.html" for i in range(13163, 13163 + 100)]
urls = [base_url] if base_url else []
if not urls:
    print("No URL configured: set base_url before running this script.")

# Image downloads further down are made with verify=False, which makes
# urllib3 emit an InsecureRequestWarning per request; silence those warnings
# (use cautiously -- certificate problems will no longer be reported).
requests.packages.urllib3.disable_warnings()

# Incremental scrolling function to load content
def incremental_scroll(scroll_pause_time=1):
    """Scroll the page loaded in the global ``driver`` down one screen at a time.

    Scrolling in viewport-sized steps gives lazily loaded images a chance to
    be requested before the page is inspected.

    Args:
        scroll_pause_time: Seconds to wait after each scroll step so newly
            revealed content can load. Adjust based on load speed.

    The loop ends when the bottom of the page is reached, or when a scroll
    step no longer moves the page (so it cannot spin forever on pages whose
    scrolling is blocked or whose reported height never matches).
    """
    screen_height = driver.execute_script("return window.innerHeight")
    scroll_position = 0
    previous_y = None  # scroll offset observed after the previous step

    while True:
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(scroll_pause_time)
        scroll_position += screen_height

        current_y, viewport_height, page_height = driver.execute_script(
            "return [window.scrollY, window.innerHeight, document.body.scrollHeight];"
        )

        # Stop when the bottom is reached. scrollY may be fractional while
        # scrollHeight is an integer, so allow one pixel of slack.
        if current_y + viewport_height >= page_height - 1:
            break

        # A larger scroll target that did not move the page means there is
        # nothing further to scroll to; bail out instead of looping forever.
        if previous_y is not None and current_y <= previous_y:
            break
        previous_y = current_y

# Seconds to wait on an image server before giving up on that image, so one
# unresponsive host cannot stall the whole run.
DOWNLOAD_TIMEOUT = 30


def _safe_name(text, fallback):
    """Return *text* as a usable file/folder name, or *fallback* if it is empty.

    Whitespace and characters that are path separators or are rejected by
    common file systems are each replaced with an underscore.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\s]', '_', text.strip())
    return cleaned or fallback


def _download_images(folder):
    """Save every http(s) ``<img>`` of the page loaded in ``driver`` into *folder*.

    A failure on one image is reported and does not stop the remaining ones.
    """
    seen_urls = set()   # the same image is often referenced more than once
    used_names = set()  # file names already written for this page

    images = driver.find_elements(By.TAG_NAME, 'img')
    for index, img in enumerate(images, start=1):
        img_url = img.get_attribute('src')
        # Skip missing sources and non-HTTP ones such as inline data: URIs.
        if not img_url or not img_url.startswith('http'):
            continue
        if img_url in seen_urls:
            continue
        seen_urls.add(img_url)

        try:
            response = requests.get(
                img_url,
                headers={'User-Agent': 'Mozilla/5.0'},
                verify=False,
                timeout=DOWNLOAD_TIMEOUT,
            )
            # Do not store an HTML error page under an image file name.
            response.raise_for_status()

            # Name the file after the URL path only, so the query string
            # does not leak into the file name.
            base_name = os.path.basename(urlsplit(img_url).path)
            file_name = _safe_name(base_name, f"image_{index}")
            if file_name in used_names:
                # Different URLs can share a basename; keep both files.
                file_name = f"{index}_{file_name}"
            used_names.add(file_name)

            img_name = os.path.join(folder, file_name)
            with open(img_name, 'wb') as handler:
                handler.write(response.content)
            print(f'Saved {img_name}')
        except Exception as e:
            print(f"Failed to save {img_url}: {e}")


# Loop through each URL
try:
    for url in urls:
        try:
            driver.get(url)
            time.sleep(3)  # Wait for the page to load

            # Scroll incrementally to ensure all images are loaded
            incremental_scroll()

            # Use the page title as the folder name; fall back to a fixed
            # name when the page has no title.
            page_title = _safe_name(driver.title, 'untitled_page')
            print(f"Page Title: {page_title}")

            os.makedirs(page_title, exist_ok=True)
            _download_images(page_title)

        except Exception as e:
            # Best effort: report the failed page and continue with the next.
            print(f"Failed to process {url}: {e}")
finally:
    # Close WebDriver even if the run is interrupted (e.g. Ctrl+C), so no
    # browser process is left behind.
    driver.quit()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值