import csv
import datetime
import os
import re
import sys
import time

import pymysql
import requests
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def download_image(url, save_path, filename):
    """Download one image and write it to ``save_path/filename``.

    The download is best-effort: every failure is reported with ``print``
    and nothing is raised to the caller.

    Args:
        url: Image address. Empty values and ``data:image`` / ``javascript:``
            URLs are rejected. A protocol-relative URL (``//host/...``) or a
            URL without a scheme is completed with ``https``.
        save_path: Target directory; created when missing. An empty string
            means the current working directory.
        filename: Target file name. Characters in ``\\ / * ? : " < > |`` are
            replaced with ``_``.

    Returns:
        None in every case.
    """
    if not url or url.startswith(('data:image', 'javascript:')):
        print(f"无效的图片URL: {url}")
        return

    # Complete the URL: protocol-relative and scheme-less addresses are both
    # fetched over HTTPS.
    if url.startswith('//'):
        url = 'https:' + url
    elif not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    # Replace characters that are not allowed in file names.
    safe_filename = re.sub(r'[\\/*?:"<>|]', "_", filename)
    save_to = os.path.join(save_path, safe_filename)

    # Request configuration: browser-like User-Agent plus a site Referer.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'http://www.dangdang.com/'
    }

    max_retries = 3
    retry_delay = 2  # seconds to wait between two attempts
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                # exist_ok avoids the check-then-create race; the guard keeps
                # an empty save_path (current directory) from raising.
                if save_path:
                    os.makedirs(save_path, exist_ok=True)
                with open(save_to, 'wb') as f:
                    f.write(response.content)
                print(f'图片已保存: {save_to}')
                return
            print(f"HTTP错误: {response.status_code} (URL: {url})")
        except Exception as e:
            # Deliberately broad: one bad image must never abort the caller's
            # download loop, so network and file errors are only reported.
            print(f"第 {attempt + 1} 次尝试失败: {str(e)}")
        # Back off before the next attempt, whether the failure was an HTTP
        # status or an exception, but do not sleep after the final attempt.
        if attempt < max_retries - 1:
            time.sleep(retry_delay)
    print(f"下载失败(超过最大重试次数): {url}")
# Header row of the CSV export (runtime text, kept as in the original script).
_CSV_HEADER = ['图书编号', '图书名称', '图书作者', '图书出版社', '图书图片url', '图书价格', '图书简介']

_CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS dangdang(
id INT AUTO_INCREMENT PRIMARY KEY,
title VARCHAR(255) CHARACTER SET utf8mb4,
author VARCHAR(100) CHARACTER SET utf8mb4,
press VARCHAR(100) CHARACTER SET utf8mb4,
src VARCHAR(255),
price DECIMAL(10,2),
introduction TEXT CHARACTER SET utf8mb4
);
"""

_INSERT_SQL = """
INSERT INTO dangdang (title, author, press, src, price, introduction)
VALUES (%s, %s, %s, %s, %s, %s)
"""


def _price_to_float(text):
    """Return the first decimal number found in *text*, or 0.0 when none."""
    match = re.search(r'[0-9]*\.?[0-9]+', text or "")
    return float(match.group()) if match else 0.0


def _find_text(item, selector, default):
    """Return the text of the first *selector* match under *item*, else *default*."""
    try:
        return item.find_element(By.CSS_SELECTOR, selector).text
    except WebDriverException:
        return default


def _scrape_item(item, book_id):
    """Build one row ``[id, title, author, press, image url, price, intro]``.

    Every field is read independently; a field that cannot be read falls back
    to its placeholder instead of discarding the whole row.
    """
    # Title: prefer the link's title attribute, fall back to the link text.
    try:
        title = item.find_element(By.CSS_SELECTOR, "a[dd_name='单品标题']").get_attribute("title")
        if not title:
            title = item.find_element(By.CSS_SELECTOR, "a:first-child").text.strip()
    except WebDriverException:
        title = "未知标题"

    author = _find_text(item, ".search_book_author a", "未知作者")

    # Publisher: the link inside the third child span of the author line.
    try:
        press_span = item.find_element(By.CSS_SELECTOR, ".search_book_author span:nth-child(3)")
        press = press_span.find_element(By.TAG_NAME, "a").text
    except WebDriverException:
        press = "未知出版社"

    # Image URL: when src still points at the placeholder image, the
    # data-original attribute is used instead.
    try:
        img = item.find_element(By.CSS_SELECTOR, "a img")
        src = img.get_attribute("src") or ""
        if "url_none.png" in src:
            src = img.get_attribute("data-original") or ""
    except WebDriverException:
        src = ""

    # Price: strip both the fullwidth (U+FFE5) and the plain (U+00A5) yen sign.
    price = _find_text(item, ".search_now_price", None)
    if price is None:
        price = "0.00"
    else:
        price = price.replace("\uffe5", "").replace("\u00a5", "").strip()

    intro = _find_text(item, ".detail", "无简介")
    return [book_id, title, author, press, src, price, intro]


def _scrape_books(driver, max_pages=3):
    """Search the site for a term read from stdin and scrape the result rows.

    At most *max_pages* result pages are visited. Returns a list of rows as
    produced by ``_scrape_item`` (no header row).
    """
    driver.get('http://www.dangdang.com/')
    # A large window may keep other elements from covering the search button.
    driver.set_window_size(1920, 1080)

    search_term = input("请输入要搜索的图书类型:")
    search_box = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "key_S"))
    )
    search_box.clear()
    search_box.send_keys(search_term)

    search_btn = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, ".search .button"))
    )

    # Best effort: give a floating element that may cover the button a few
    # seconds to disappear; a timeout here is not an error.
    try:
        WebDriverWait(driver, 5).until(
            EC.invisibility_of_element_located((By.CSS_SELECTOR, "a.robot"))
        )
    except WebDriverException:
        pass

    try:
        search_btn.click()
    except ElementClickInterceptedException:
        print("常规点击被拦截,尝试使用JavaScript点击")
        driver.execute_script("arguments[0].click();", search_btn)

    rows = []
    for page in range(max_pages):  # bounded so paging can never loop forever
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#search_nature_rg"))
        )
        for item in driver.find_elements(By.CSS_SELECTOR, "#search_nature_rg li"):
            rows.append(_scrape_item(item, len(rows) + 1))

        if page == max_pages - 1:
            break  # last page to scrape: no need to navigate any further
        try:
            next_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".next"))
            )
            next_btn.click()
            time.sleep(2)
        except WebDriverException:
            print("已到达最后一页或翻页失败")
            break
    return rows


def _save_csv(rows):
    """Write the header and *rows* to a timestamped CSV file; return its name."""
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f'dangdang_books_{timestamp}.csv'
    with open(csv_filename, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(_CSV_HEADER)
        writer.writerows(rows)
    return csv_filename


def _save_to_mysql(rows):
    """Insert *rows* into table ``dangdang`` of database ``xyw``.

    The database and the table are created when missing. A row that fails to
    insert is reported and skipped. Exits the process with status 1 when the
    database cannot be created.
    """
    # NOTE(review): credentials are hard-coded as in the original script;
    # consider loading them from configuration.
    conn = pymysql.connect(
        user="root",
        password="123456",
        host="localhost",
        port=3306,
        charset='utf8mb4'
    )
    try:
        cursor = conn.cursor()
        try:
            try:
                cursor.execute("CREATE DATABASE IF NOT EXISTS xyw CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")
            except pymysql.MySQLError as e:
                print(f"创建数据库失败: {str(e)}")
                sys.exit(1)
            conn.select_db("xyw")
            cursor.execute(_CREATE_TABLE_SQL)

            inserted_count = 0
            for row in rows:
                try:
                    cursor.execute(
                        _INSERT_SQL,
                        (row[1], row[2], row[3], row[4], _price_to_float(row[5]), row[6]),
                    )
                    inserted_count += 1
                except pymysql.MySQLError as e:
                    print(f"插入数据失败: {str(e)}")
            conn.commit()
            print(f"成功插入 {inserted_count} 条数据到数据库")
        finally:
            cursor.close()
    finally:
        conn.close()


def _download_covers(rows, save_directory='download'):
    """Download the image of every row that has a URL into *save_directory*.

    Files are named after the 1-based row position, zero-padded to four
    digits (``0001.jpg``, ``0002.jpg``, ...).
    """
    os.makedirs(save_directory, exist_ok=True)
    for i, row in enumerate(rows, 1):
        image_url = row[4]
        if image_url:
            download_image(image_url, save_directory, f'{i:04d}.jpg')


def main():
    """Scrape book search results, then export to CSV, MySQL and image files."""
    # Initialise a headless Chrome; the chromedriver path is machine-specific.
    driver_path = r"D:\chromedriver-win32\chromedriver.exe"
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(driver_path), options=options)
    try:
        rows = _scrape_books(driver)
    finally:
        # The browser is only needed for scraping, so release it right away
        # and on every exit path.
        driver.quit()

    _save_csv(rows)
    _save_to_mysql(rows)
    _download_covers(rows)


if __name__ == "__main__":
    main()

# NOTE: the original paste ended with non-code page text fused onto the last
# statement: a request to rewrite this script with the Scrapy framework with
# unchanged functionality, followed by a "latest release" page label.