Python:Scrapy中重写ImagePipeline组件的file_path函数,自定义图片的路径和名称

本文介绍如何在使用ImagePipeline组件下载图片时,将图片保存为原始文件名,而非默认的SHA1值。通过重写file_path函数可以实现这一目标。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

默认情况下,使用 ImagePipeline 组件下载图片的时候, 图片名称是以图片URL的SHA1值进行保存的
如:
图片URL: http://www.example.com/image.jpg
SHA1结果: 3afec3b4765f8f0a07b78f98c07b83f013567a0a
则图片名称: 3afec3b4765f8f0a07b78f98c07b83f013567a0a.jpg
但是,我想要以原来的图片名称进行保存,比如上面例子中的图片保存到本地的话,图片名称就应该是: image.jpg
Stack Overflow 上的回答指出,这可以通过重写 file_path 函数实现,其签名为:def file_path(self, request, response=None, info=None):
只需在 file_path 函数中 return 想要的图片文件名即可。
# Dangdang book-search scraper: searches dangdang.com with Selenium, scrapes up
# to 3 result pages, saves the rows to CSV and MySQL, then downloads the cover
# images with requests.
#
# NOTE(review): the original posting asked for this to be rewritten with the
# Scrapy framework, behaviour unchanged. This version keeps the Selenium
# implementation but fixes concrete defects:
#   * `id` shadowed the builtin -> renamed to `book_id`
#   * bare `except:` clauses swallowed KeyboardInterrupt/SystemExit ->
#     narrowed to `except Exception:`
#   * exists()/makedirs() check-then-act race -> `os.makedirs(exist_ok=True)`
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException
import csv
import pymysql
import os
import requests
import time
import datetime
import re


def download_image(url, save_path, filename):
    """Download one image to ``save_path/filename`` with up to 3 retries.

    Skips data-URI / javascript pseudo-URLs, upgrades protocol-relative
    (``//``) and scheme-less URLs to HTTPS, and sanitises the file name so
    it is legal on Windows. Failures are printed, never raised.
    """
    if not url or url.startswith(('data:image', 'javascript:')):
        print(f"无效的图片URL: {url}")
        return

    # Normalise the URL (Dangdang image hosts support HTTPS).
    if url.startswith('//'):
        url = 'https:' + url
    elif not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    def sanitize_filename(name):
        # Replace characters that are illegal in Windows file names.
        return re.sub(r'[\\/*?:"<>|]', "_", name)

    save_to = os.path.join(save_path, sanitize_filename(filename))

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'http://www.dangdang.com/'
    }

    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                os.makedirs(save_path, exist_ok=True)  # race-free creation
                with open(save_to, 'wb') as f:
                    f.write(response.content)
                print(f'图片已保存: {save_to}')
                break
            else:
                print(f"HTTP错误: {response.status_code} (URL: {url})")
        except Exception as e:
            print(f"第 {attempt + 1} 次尝试失败: {str(e)}")
            time.sleep(2)  # back off before retrying
    else:
        # for/else: the loop ran out of retries without a successful break.
        print(f"下载失败(超过最大重试次数): {url}")


# --- Browser setup -----------------------------------------------------------
driver_path = r"D:\chromedriver-win32\chromedriver.exe"
options = webdriver.ChromeOptions()
options.add_argument('--headless')
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=options)

try:
    driver.get('http://www.dangdang.com/')
    # A fixed large window avoids some element-obscured click failures.
    driver.set_window_size(1920, 1080)

    search_term = input("请输入要搜索的图书类型:")
    search_box = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "key_S"))
    )
    search_box.clear()
    search_box.send_keys(search_term)

    search_btn = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, ".search .button"))
    )

    # Best-effort wait for the floating "robot" banner to go away so it
    # does not intercept the click below.
    try:
        WebDriverWait(driver, 5).until(
            EC.invisibility_of_element_located((By.CSS_SELECTOR, "a.robot"))
        )
    except Exception:
        pass

    # Click search; fall back to a JavaScript click if something overlays it.
    try:
        search_btn.click()
    except ElementClickInterceptedException:
        print("常规点击被拦截,尝试使用JavaScript点击")
        driver.execute_script("arguments[0].click();", search_btn)

    book_id = 0  # fixed: `id` shadowed the builtin in the original
    data = [['图书编号', '图书名称', '图书作者', '图书出版社',
             '图书图片url', '图书价格', '图书简介']]

    # Crawl at most 3 result pages to avoid an unbounded loop.
    for _ in range(3):
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#search_nature_rg"))
        )
        book_items = driver.find_elements(By.CSS_SELECTOR, "#search_nature_rg li")

        for item in book_items:
            book_id += 1
            books = [book_id]

            # Title: prefer the title attribute, fall back to the link text.
            try:
                title = item.find_element(
                    By.CSS_SELECTOR, "a[dd_name='单品标题']"
                ).get_attribute("title")
                if not title:
                    title = item.find_element(
                        By.CSS_SELECTOR, "a:first-child"
                    ).text.strip()
                books.append(title)
            except Exception:
                books.append("未知标题")

            # Author: first link in the author line.
            try:
                books.append(
                    item.find_element(By.CSS_SELECTOR, ".search_book_author a").text
                )
            except Exception:
                books.append("未知作者")

            # Publisher: third span in the author line.
            try:
                press_span = item.find_element(
                    By.CSS_SELECTOR, ".search_book_author span:nth-child(3)"
                )
                books.append(press_span.find_element(By.TAG_NAME, "a").text)
            except Exception:
                books.append("未知出版社")

            # Cover image URL. Lazily-loaded images keep the real URL in
            # data-original while src points at a placeholder (url_none.png).
            try:
                img = item.find_element(By.CSS_SELECTOR, "a img")
                src = img.get_attribute("src")
                if "url_none.png" in src:
                    src = img.get_attribute("data-original") or ""
                books.append(src)
            except Exception:
                books.append("")

            # Price, stripped of the currency sign.
            try:
                books.append(
                    item.find_element(By.CSS_SELECTOR, ".search_now_price")
                    .text.replace("¥", "")
                )
            except Exception:
                books.append("0.00")

            # Short description.
            try:
                books.append(item.find_element(By.CSS_SELECTOR, ".detail").text)
            except Exception:
                books.append("无简介")

            data.append(books)

        # Advance to the next page; stop when the button is unavailable.
        try:
            next_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, ".next"))
            )
            next_btn.click()
            time.sleep(2)  # give the next page time to start loading
        except Exception:
            print("已到达最后一页或翻页失败")
            break

    # --- Persist to CSV ------------------------------------------------------
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    csv_filename = f'dangdang_books_{timestamp}.csv'
    # utf-8-sig writes a BOM so Excel detects the encoding correctly.
    with open(csv_filename, 'w', newline='', encoding='utf-8-sig') as f:
        csv.writer(f).writerows(data)

    # --- Persist to MySQL ----------------------------------------------------
    # NOTE(review): credentials are hard-coded; move to config/env variables.
    conn = pymysql.connect(
        user="root",
        password="123456",
        host="localhost",
        port=3306,
        charset='utf8mb4'
    )
    cursor = conn.cursor()
    try:
        cursor.execute(
            "CREATE DATABASE IF NOT EXISTS xyw "
            "CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"
        )
    except Exception as e:
        print(f"创建数据库失败: {str(e)}")
        exit()  # SystemExit still unwinds through the finally: below

    conn.select_db("xyw")

    create_table_sql = """
    CREATE TABLE IF NOT EXISTS dangdang(
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255) CHARACTER SET utf8mb4,
        author VARCHAR(100) CHARACTER SET utf8mb4,
        press VARCHAR(100) CHARACTER SET utf8mb4,
        src VARCHAR(255),
        price DECIMAL(10,2),
        introduction TEXT CHARACTER SET utf8mb4
    );
    """
    cursor.execute(create_table_sql)

    insert_sql = """
    INSERT INTO dangdang (title, author, press, src, price, introduction)
    VALUES (%s, %s, %s, %s, %s, %s)
    """

    inserted_count = 0  # rows actually inserted (failed rows are skipped)
    for row in data[1:]:
        try:
            # Coerce the scraped price to float; non-numeric text -> 0.0.
            price_value = (
                float(row[5]) if row[5].replace('.', '', 1).isdigit() else 0.0
            )
            cursor.execute(
                insert_sql,
                (row[1], row[2], row[3], row[4], price_value, row[6])
            )
            inserted_count += 1
        except Exception as e:
            print(f"插入数据失败: {str(e)}")
    conn.commit()
    print(f"成功插入 {inserted_count} 条数据到数据库")

    # --- Download cover images -----------------------------------------------
    save_directory = 'download'
    os.makedirs(save_directory, exist_ok=True)
    for i, row in enumerate(data[1:], 1):
        image_url = row[4]
        if image_url:
            download_image(image_url, save_directory, f'{str(i).zfill(4)}.jpg')
finally:
    # Release DB and browser resources even on error or early exit().
    if 'conn' in locals() and conn.open:
        cursor.close()
        conn.close()
    driver.quit()
最新发布
06-13
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值