秀人网图片爬取
仅仅展示如何爬取某网站的图片
思路:利用 Selenium 驱动谷歌 ChromeDriver 打开目标网页并逐屏滚动,使页面中的图片全部加载;然后获取所有 img 节点,根据其 src 地址用 requests 下载并保存图片。
使用前请先在代码中的 base_url 处填入要爬取的网站网址!
仅供娱乐,如有侵权,请联系删除
在这里插入图片描述
import os
import time
from urllib.parse import urlsplit

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
# Page URL prefix; set this to the address you want to crawl before running.
base_url = ''
# Pages are addressed as "<base_url><id>.html"; crawl page_count consecutive
# ids starting at first_page_id.
first_page_id = 13163
page_count = 100

if not base_url:
    # Fail before launching Chrome so no browser process is left behind.
    raise SystemExit("base_url is empty: set it to the site address before running.")

# Generate the page URLs sequentially (the crawl loop iterates over `urls`).
urls = [f"{base_url}{i}.html" for i in range(first_page_id, first_page_id + page_count)]

# Disable SSL warnings (use cautiously): the image downloads in this script
# are made with verify=False, which would otherwise warn on every request.
requests.packages.urllib3.disable_warnings()

# Initialize WebDriver. NOTE(review): the original comment said "Replace with
# your path" - supply the driver location here if Selenium cannot find it.
driver = webdriver.Chrome()
# Incremental scrolling function to load content
def incremental_scroll(scroll_pause_time=1, max_scrolls=500, browser=None):
    """Scroll the current page down one viewport at a time.

    Scrolling step by step gives the page a chance to load content that only
    appears once it is scrolled into view.

    Args:
        scroll_pause_time: Seconds to wait after each scroll step; adjust
            based on load speed.
        max_scrolls: Upper bound on the number of scroll steps, so a page
            that keeps growing cannot keep this function running forever.
        browser: Object exposing ``execute_script``; defaults to the
            module-level ``driver``.

    Returns:
        None.
    """
    if browser is None:
        browser = driver  # module-level WebDriver instance

    screen_height = browser.execute_script("return window.innerHeight")
    scroll_position = 0
    previous_bottom = None

    for _ in range(max_scrolls):
        browser.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(scroll_pause_time)
        scroll_position += screen_height

        viewport_bottom = browser.execute_script(
            "return window.scrollY + window.innerHeight"
        )
        page_height = browser.execute_script("return document.body.scrollHeight")

        # Stop when the bottom is reached.
        if viewport_bottom >= page_height:
            break
        # Stop when a further scroll request no longer moves the viewport:
        # the page cannot scroll any lower even though the reported height
        # was not reached (e.g. a fractional scroll offset).
        if viewport_bottom == previous_bottom:
            break
        previous_bottom = viewport_bottom
# Characters replaced with "_" when a page title becomes a folder name:
# space and "/" plus the characters Windows does not allow in file names.
_FOLDER_NAME_TABLE = str.maketrans({char: "_" for char in ' /\\:*?"<>|'})


def _folder_name_from_title(title):
    """Return a folder name derived from the page title."""
    name = title.strip().translate(_FOLDER_NAME_TABLE)
    # An empty title would make os.makedirs("") fail; use a placeholder.
    return name or "untitled"


def _image_file_name(img_url, index, used_names):
    """Return a file name for img_url that is unique within used_names.

    The name is the last path segment of the URL with any query string or
    fragment dropped. When the URL has no usable segment, or the name was
    already taken by a different URL, the image's position is used to build
    a distinct name. The chosen name is added to used_names.
    """
    name = os.path.basename(urlsplit(img_url).path)
    if not name:
        name = f"image_{index}"
    elif name in used_names:
        name = f"{index}_{name}"
    used_names.add(name)
    return name


def _download_image(img_url, destination):
    """Download img_url and write it to destination.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-success HTTP status.
        OSError: if the file cannot be written.
    """
    # SECURITY: verify=False skips TLS certificate validation. It is kept
    # because the script deliberately silences the matching warnings.
    response = requests.get(
        img_url,
        headers={'User-Agent': 'Mozilla/5.0'},
        verify=False,
        timeout=30,  # without a timeout a stalled server blocks the crawl
    )
    # Do not save an HTML error page under an image file name.
    response.raise_for_status()
    with open(destination, 'wb') as handler:
        handler.write(response.content)


def _scrape_page(url):
    """Open url, load its images by scrolling, and save them to a folder."""
    driver.get(url)
    time.sleep(3)  # Wait for the page to load
    # Scroll incrementally to ensure all images are loaded
    incremental_scroll()

    # The page title names the output folder.
    page_title = _folder_name_from_title(driver.title)
    print(f"Page Title: {page_title}")
    os.makedirs(page_title, exist_ok=True)

    # Read every src before downloading so the slow downloads do not run
    # between element lookups.
    image_urls = [
        img.get_attribute('src')
        for img in driver.find_elements(By.TAG_NAME, 'img')
    ]

    seen_urls = set()
    used_names = set()
    for index, img_url in enumerate(image_urls, start=1):
        if not img_url or not img_url.startswith('http'):
            continue
        if img_url in seen_urls:
            continue  # same image referenced more than once on the page
        seen_urls.add(img_url)

        img_name = os.path.join(
            page_title, _image_file_name(img_url, index, used_names)
        )
        try:
            _download_image(img_url, img_name)
        except (requests.RequestException, OSError) as e:
            print(f"Failed to save {img_url}: {e}")
        else:
            print(f'Saved {img_name}')


# Loop through each URL; one failing page must not stop the whole crawl.
try:
    for url in urls:
        try:
            _scrape_page(url)
        except Exception as e:  # top-level boundary: report and move on
            print(f"Failed to process {url}: {e}")
finally:
    # Close WebDriver even when the crawl is interrupted (e.g. Ctrl+C).
    driver.quit()