Pick any relevant site, for example Xiurenwang (秀人网).
The first step is to collect all of the subpage links under that listing page:
import os
import requests
from bs4 import BeautifulSoup

# Target URL
url = 'https://www.xiurenwang.cc/mianfei'

# User-Agent to mimic browser behavior
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}

# Request the listing page and parse it
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Extract subpage links from <a> elements with class "img"
subpage_links = []
for img_div in soup.find_all('a', class_='img', href=True):
    full_url = f"https://www.xiurenwang.cc/{img_div['href']}"
    subpage_links.append(full_url)
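If the href values on the listing page are already absolute, or start with a leading slash, naively prefixing the domain can produce malformed URLs. A minimal alternative sketch using urljoin from the standard library (reusing the same soup object as above) handles both cases:

from urllib.parse import urljoin

base_url = 'https://www.xiurenwang.cc/'
subpage_links = []
for img_div in soup.find_all('a', class_='img', href=True):
    # urljoin resolves relative hrefs against the base and leaves absolute URLs untouched
    subpage_links.append(urljoin(base_url, img_div['href']))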
Quick check that the links are extracted correctly:
# Print the extracted links
print("Extracted Subpage Links:")
for subpage in subpage_links:
    print(subpage)
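If the listing page repeats the same thumbnail link (a common pattern), it can be worth deduplicating before crawling. A small optional check:

# Deduplicate while preserving order, then report how many subpages were found
subpage_links = list(dict.fromkeys(subpage_links))
print(f"Found {len(subpage_links)} unique subpages")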
Set up incremental page scrolling so that all content on each subpage gets loaded (note that this slows down the overall crawl):
# Function to scroll the page incrementally
def incremental_scroll():
    screen_height = driver.execute_script("return window.innerHeight")
    scroll_position = 0
    scroll_pause_time = 2
    while True:
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(scroll_pause_time)
        scroll_position += screen_height
        new_scroll_position = driver.execute_script("return window.scrollY + window.innerHeight")
        # Stop once the viewport has reached the bottom of the page
        if new_scroll_position >= driver.execute_script("return document.body.scrollHeight"):
            break
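As a quick usage sketch (assuming a Chrome WebDriver and the subpage_links list extracted earlier), the function is called after a page is opened so that everything below the fold is pulled in before parsing:

from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get(subpage_links[0])   # any of the extracted subpage URLs works here
time.sleep(3)                  # give the initial page load a moment
incremental_scroll()           # scroll down step by step so all content is loaded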
Full code, including saving the images:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import os
from bs4 import BeautifulSoup

# Target URL
url = 'https://www.xiurenwang.cc/mianfei'

# User-Agent to mimic browser behavior
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}

# Request the listing page and parse it
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, 'html.parser')

# Extract subpage links from <a> elements with class "img"
subpage_links = []
for img_div in soup.find_all('a', class_='img', href=True):
    full_url = f"https://www.xiurenwang.cc/{img_div['href']}"
    subpage_links.append(full_url)

# Print the extracted links
# print("Extracted Subpage Links:")
# for subpage in subpage_links:
#     print(subpage)

# Initialize WebDriver (uses chromedriver from PATH; pass the driver path explicitly if needed)
driver = webdriver.Chrome()
requests.packages.urllib3.disable_warnings()

# Function to scroll the page incrementally
def incremental_scroll():
    screen_height = driver.execute_script("return window.innerHeight")
    scroll_position = 0
    scroll_pause_time = 2
    while True:
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(scroll_pause_time)
        scroll_position += screen_height
        new_scroll_position = driver.execute_script("return window.scrollY + window.innerHeight")
        # Stop once the viewport has reached the bottom of the page
        if new_scroll_position >= driver.execute_script("return document.body.scrollHeight"):
            break

for subpage_url in subpage_links:
    try:
        driver.get(subpage_url)
        time.sleep(3)          # Wait for the page to load
        incremental_scroll()   # Ensure all content is loaded

        # Use the page title as the folder name
        page_title = driver.title.strip().replace(" ", "_").replace("/", "_")
        os.makedirs(page_title, exist_ok=True)

        # Find and download images
        images = driver.find_elements(By.TAG_NAME, 'img')
        for img in images:
            img_url = img.get_attribute('src')
            if img_url and img_url.startswith('http'):
                try:
                    img_data = requests.get(img_url, headers={'User-Agent': 'Mozilla/5.0'}, verify=False).content
                    img_name = os.path.join(page_title, img_url.split('/')[-1])
                    with open(img_name, 'wb') as handler:
                        handler.write(img_data)
                    print(f'Saved {img_name}')
                except Exception as e:
                    print(f"Failed to save {img_url}: {e}")
    except Exception as e:
        print(f"Failed to process {subpage_url}: {e}")

# Close WebDriver
driver.quit()
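Many gallery pages lazy-load their images, so even after scrolling an img element's src can still point at a placeholder while the real file sits in a data-src (or similar) attribute. Whether that applies to this site depends on its markup, but a hedged variant of the inner download loop (reusing images, page_title, requests, and os from the script above) that prefers data-src when present, and also strips any query string from the saved filename, could look like this:

for img in images:
    # data-src is an assumed lazy-load attribute name; fall back to src when it is absent
    img_url = img.get_attribute('data-src') or img.get_attribute('src')
    if img_url and img_url.startswith('http'):
        img_data = requests.get(img_url, headers={'User-Agent': 'Mozilla/5.0'}, verify=False).content
        # Drop any query string so the saved filename stays valid on disk
        img_name = os.path.join(page_title, img_url.split('/')[-1].split('?')[0])
        with open(img_name, 'wb') as handler:
            handler.write(img_data)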
Questions and discussion are welcome!