Batch-download high-resolution glamour photos in 10 seconds!

Pick any relevant site as the target, for example Xiuren (秀人网); the images themselves come from the web.

First, collect all the subpage links under that page.

import requests
from bs4 import BeautifulSoup

# Target URL
url = 'https://www.xiurenwang.cc/mianfei'

# User-Agent to mimic browser behavior
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}

# Request the webpage
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Extract subpage links from elements with class "img"
subpage_links = []
for img_div in soup.find_all('a', class_='img', href=True):
    full_url = f"https://www.xiurenwang.cc/{img_div['href']}"
    subpage_links.append(full_url)
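
One caveat: hard-coding the site prefix only works if every href is a bare relative path. A more robust alternative is urllib.parse.urljoin, which also handles absolute and root-relative hrefs; this is just a sketch that assumes the same soup object as above and would replace the loop just shown:

from urllib.parse import urljoin

base_url = 'https://www.xiurenwang.cc/'
subpage_links = []
for link in soup.find_all('a', class_='img', href=True):
    # urljoin builds a correct absolute URL whether the href is
    # absolute, relative, or starts with a leading slash
    subpage_links.append(urljoin(base_url, link['href']))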

Quick test to confirm the links are extracted correctly:

# Print the extracted links
print("Extracted Subpage Links:")
for subpage in subpage_links:
    print(subpage)

The returned result is a list of subpage URLs, one per line.

Set up incremental window scrolling (note: this affects the overall crawl speed).

# Function to scroll the page incrementally
def incremental_scroll():
    screen_height = driver.execute_script("return window.innerHeight")
    scroll_position = 0
    scroll_pause_time = 2

    while True:
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(scroll_pause_time)
        scroll_position += screen_height

        new_scroll_position = driver.execute_script("return window.scrollY + window.innerHeight")
        if new_scroll_position >= driver.execute_script("return document.body.scrollHeight"):
            break
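
The function relies on a global driver object and the time module, so it only runs as part of the full script further down. If you want to try the scrolling on its own first, a minimal harness could look like the sketch below (the driver setup and the 3-second wait are my own assumptions, not part of the original):

import time
from selenium import webdriver

driver = webdriver.Chrome()   # needs a matching chromedriver available on PATH
driver.get('https://www.xiurenwang.cc/mianfei')
time.sleep(3)                 # give the page a moment to render

incremental_scroll()          # scroll one viewport at a time until the bottom is reached

driver.quit()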

Full code, including downloading and saving the images:

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import os
from bs4 import BeautifulSoup
# Target URL
url = 'https://www.xiurenwang.cc/mianfei'
# User-Agent to mimic browser behavior
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# Request the webpage
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Extract subpage links from elements with class "img"
subpage_links = []
for img_div in soup.find_all('a', class_='img', href=True):
    full_url = f"https://www.xiurenwang.cc/{img_div['href']}"
    subpage_links.append(full_url)

# Print the extracted links
#print("Extracted Subpage Links:")
#for subpage in subpage_links:
#    print(subpage)
# Initialize WebDriver
driver = webdriver.Chrome()  # Pass your chromedriver path here if it is not on PATH


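# Suppress the InsecureRequestWarning triggered by the verify=False downloads below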
requests.packages.urllib3.disable_warnings()


# Function to scroll the page incrementally
def incremental_scroll():
    screen_height = driver.execute_script("return window.innerHeight")
    scroll_position = 0
    scroll_pause_time = 2

    while True:
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(scroll_pause_time)
        scroll_position += screen_height

        new_scroll_position = driver.execute_script("return window.scrollY + window.innerHeight")
        if new_scroll_position >= driver.execute_script("return document.body.scrollHeight"):
            break

for url in subpage_links:
    try:
        driver.get(url)
        time.sleep(3)  # Wait for the page to load

        incremental_scroll()  # Ensure all content is loaded

        # Use the page title as the folder name
        page_title = driver.title.strip().replace(" ", "_").replace("/", "_")
        os.makedirs(page_title, exist_ok=True)

        # Find and download images
        images = driver.find_elements(By.TAG_NAME, 'img')
        for img in images:
            img_url = img.get_attribute('src')
            if img_url and img_url.startswith('http'):
                try:
                    img_data = requests.get(img_url, headers={'User-Agent': 'Mozilla/5.0'}, verify=False).content
                    img_name = os.path.join(page_title, img_url.split('/')[-1])
                    with open(img_name, 'wb') as handler:
                        handler.write(img_data)
                    print(f'Saved {img_name}')
                except Exception as e:
                    print(f"Failed to save {img_url}: {e}")

    except Exception as e:
        print(f"Failed to process {url}: {e}")

# Close WebDriver
driver.quit()
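
One caveat with the saving step: img_url.split('/')[-1] keeps any query string (for example photo.jpg?v=2), which is not a valid filename on Windows. A small helper that strips the query string and other illegal characters might look like this sketch; safe_image_name is my own name, not part of the script above:

import os
import re
from urllib.parse import urlparse

def safe_image_name(img_url, folder):
    """Build a filesystem-safe path for an image URL (hypothetical helper)."""
    # Keep only the path component, dropping query string and fragment
    basename = os.path.basename(urlparse(img_url).path) or 'image.jpg'
    # Replace characters that are illegal in Windows filenames
    basename = re.sub(r'[\\/*?:"<>|]', '_', basename)
    return os.path.join(folder, basename)

# Usage inside the download loop:
# img_name = safe_image_name(img_url, page_title)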

Questions and discussion are welcome!
