玩转 python selenium---抓取某知名电商商品页的图片

wxhjk

已于 2022-04-29 17:59:43 修改

阅读量804

点赞数

文章标签： python selenium 爬图片

于 2022-04-29 15:38:33 首次发布

本文链接：https://blog.youkuaiyun.com/wxhjk/article/details/124497298

版权

练练手：用 Selenium 抓取某电商网站商品详情页上的图片，实测可以抓到。代码如下：

4.28

# Download every image found in the detail section of one product page
# into the local "imagess" folder.
import os
import time
import urllib
from urllib import request
from urllib.parse import urljoin, urlsplit

from selenium import webdriver
from selenium.webdriver.common.by import By

option = webdriver.ChromeOptions()
option.binary_location = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
# Presumably meant to make the automated browser harder for the site to detect.
option.add_argument("--disable-blink-features=AutomationControlled")
# Uncomment to run Chrome without a visible window:
# option.add_argument('headless')

# NOTE(review): `executable_path` is the driver-location keyword of older
# Selenium releases; confirm it matches the installed Selenium version.
driver = webdriver.Chrome(executable_path=r'C:\Program Files\Google\Chrome\Application\chromedriver.exe', options=option)
try:
    url  = 'https://detail.1688.com/offer/669667221688.html?spm=a26352.13672862.offerlist.227.34b51e62ZfC32f'
    driver.get(url)
    # Crude fixed pause to give the page time to render.
    time.sleep(1)

    parent = driver.find_element(By.CLASS_NAME, 'content-detail')
    images = parent.find_elements(By.TAG_NAME, 'img')

    # urlretrieve does not create missing folders, so make sure it exists.
    os.makedirs('imagess', exist_ok=True)

    # Base used to resolve relative or protocol-relative image URLs.
    page_url = driver.current_url

    for image in images:
        # The real image address is kept in the lazy-load attribute.
        img_url = image.get_attribute("data-lazyload-src")
        if not img_url:
            # <img> without a lazy-load address: nothing to download.
            continue
        img_url = urljoin(page_url, img_url)

        # File name = last path segment, ignoring any query string.
        name = os.path.basename(urlsplit(img_url).path)
        if not name:
            continue

        # Save the image into the imagess folder.
        request.urlretrieve(img_url,f'imagess/{name}')
finally:
    # Always close the browser, even when scraping fails midway.
    driver.quit()

完善了一下：抓取的图片会自动保存到新建的产品目录下（一个产品一个目录），其中详情图存放在 details 子目录，主图存放在 headimgs 子目录：

4.29

from selenium import webdriver
import time

def mkdir(path):
    """Create directory *path* (with any missing parents) if it is absent.

    Leading/trailing whitespace and trailing backslashes are removed from
    *path* before it is used. A status message is printed in both cases.

    Args:
        path: Directory path to create.

    Returns:
        True if this call created the directory, False if the path
        already existed.

    Raises:
        OSError: If creation fails and the path still does not exist
            (for example an empty path or a permission problem).
    """
    import os

    # Drop surrounding whitespace, then any trailing "\" characters.
    path = path.strip().rstrip("\\")

    # Try to create first instead of checking beforehand: a separate
    # exists-check leaves a window in which another process can create the
    # directory and make os.makedirs raise.
    try:
        os.makedirs(path)
    except OSError:
        if not os.path.exists(path):
            # A genuine failure, not "already there": let the caller see it.
            raise
        print(path+' 目录已存在')
        return False

    print(path+' 创建成功')
    return True


# Download one product's detail images and gallery (head) images into
# imagess/<last 10 characters of the title>/details and .../headimgs.
import os
import re
import urllib
from urllib import request
from urllib.parse import urljoin, urlsplit

from selenium.webdriver.common.by import By


def _download_images(elements, url_attribute, target_dir, base_url):
    """Save the image referenced by each element into *target_dir*.

    Args:
        elements: Selenium elements to read image addresses from.
        url_attribute: Name of the attribute that holds the image address.
        target_dir: Existing folder the files are written to.
        base_url: Page URL used to resolve relative image addresses.
    """
    for element in elements:
        img_url = element.get_attribute(url_attribute)
        if not img_url:
            # Element carries no address in this attribute: skip it.
            continue
        img_url = urljoin(base_url, img_url)
        print(img_url)

        # File name = last path segment, ignoring any query string.
        name = os.path.basename(urlsplit(img_url).path)
        if not name:
            continue
        print(name)

        request.urlretrieve(img_url,f'{target_dir}/{name}')


option = webdriver.ChromeOptions()
option.binary_location = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
# Presumably meant to make the automated browser harder for the site to detect.
option.add_argument("--disable-blink-features=AutomationControlled")
# Uncomment to run Chrome without a visible window:
# option.add_argument('headless')

# NOTE(review): `executable_path` is the driver-location keyword of older
# Selenium releases; confirm it matches the installed Selenium version.
driver = webdriver.Chrome(executable_path=r'C:\Program Files\Google\Chrome\Application\chromedriver.exe', options=option)
try:
    url  = 'https://detail.1688.com/offer/669667221688.html?spm=a26352.13672862.offerlist.227.34b51e62ZfC32f'
    driver.get(url)
    # Crude fixed pause to give the page time to render.
    time.sleep(1)

    parent = driver.find_element(By.CLASS_NAME, 'content-detail')
    images = parent.find_elements(By.TAG_NAME, 'img')

    title = driver.find_element(By.CLASS_NAME, 'title-first-column')
    title_text = title.find_element(By.CLASS_NAME, "title-text")

    # Build a folder name from the title: replace characters that are not
    # allowed in Windows file names, trim spaces/dots, keep the last 10 chars.
    raw_title = title_text.get_attribute('innerText') or ''
    product_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', raw_title).strip(' .')[-10:]
    product_name = product_name.strip(' .') or 'untitled'

    detail_dir = 'imagess/'+product_name+'/details'
    mkdir(detail_dir)
    headimg_dir = 'imagess/'+product_name+'/headimgs'
    mkdir(headimg_dir)

    # Base used to resolve relative or protocol-relative image URLs.
    page_url = driver.current_url

    # Detail images keep their real address in the lazy-load attribute.
    _download_images(images, "data-lazyload-src", detail_dir, page_url)

    # Gallery (head) images expose their address through the plain src.
    detail_gallery_imgs = driver.find_elements(By.CLASS_NAME, 'detail-gallery-img')
    _download_images(detail_gallery_imgs, "src", headimg_dir, page_url)
finally:
    # Always close the browser, even when scraping fails midway.
    driver.quit()