[Python] Hands-on: scraping product information from an overseas shopping site

1. Project layout

Project
└── venv
    ├── main.py
    ├── brickseek.py
    └── database.py

2. main.py

The entry point simply loops over a hard-coded list of SKUs and hands each one to brickseek.setZip():

```python
import brickseek

SKU = ["675353130", "543873356", "113247244", "259271016", "618763356"]

if __name__ == '__main__':
    for sku in SKU:
        brickseek.setZip(sku)
```

3. brickseek.py

This module does the actual scraping: it opens the product page for a SKU in a Selenium-driven Chrome, submits random ZIP codes pulled from the database, parses the store/price/quantity table, and saves each row to MySQL.

First, the imports:

```python
from pymysql.converters import escape_string
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from database import Database
import time
```


exlog() appends each failed SKU/ZIP pair to a plain-text log so the failures can be retried later:

```python
def exlog(sku, zip):
    try:
        # Append the failed pair to a local log file.
        with open("errorLog.txt", 'a', encoding='utf-8') as f:
            f.write("scrape failed sku=" + str(sku) + "  zip=" + str(zip) + "\n")
    except Exception as e:
        print("failed to write error log:", e)
```

saveData() upserts one store row: if the store id already exists it updates the record, otherwise it inserts a new one (a parameterized variant is sketched right after this function):

```python
def saveData(id, store, price, quantity, sku):
    print("saving data to MySQL")
    try:
        d = Database().queryOne("select * from brickseek where id=%s" % id)
        date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        if d:
            print("update")
            sql = "UPDATE brickseek SET sku = '%s',store='%s',quantity='%s',price='%s',time=str_to_date('%s','%%Y-%%m-%%d %%H:%%i:%%s') WHERE id = '%s'" % \
                  (sku, escape_string(store), quantity, escape_string(price), date, id)
            Database().save(sql)
        else:
            print("insert")
            sql = "insert into brickseek(id,sku,store,quantity,price,time) " \
                  "values('%s', '%s', '%s', '%s', '%s', str_to_date('%s','%%Y-%%m-%%d %%H:%%i:%%s'))" % (
                      id, sku, escape_string(store), quantity, escape_string(price), date)
            Database().save(sql)
        print("data saved")
    except Exception as e:
        print("failed to save data, Error:", e)
```

getPage() fills in a ZIP code, submits the inventory-checker form, waits for the results to render, then walks the result table row by row and hands each row to saveData():

```python
def getPage(d, zip, sku):
    zipInput = d.find_element(By.ID, 'inventory-checker-form-zip')
    zipInput.send_keys(zip)
    print("entering zip...")
    but = d.find_element(By.ID, 'main').find_element(By.CLASS_NAME, 'bs-button')
    but.submit()
    print("requesting page data for zip...")
    # The ad container only appears once the results have rendered,
    # so it doubles as a readiness signal.
    wait = WebDriverWait(d, 30)
    wait.until(EC.presence_of_element_located((By.ID, "BrickseekVideoAdContainer")))
    print("parsing data...")
    rows = d.find_element(By.ID, 'main') \
            .find_element(By.CLASS_NAME, 'table__body') \
            .find_elements(By.CLASS_NAME, 'table__row')
    print("row count =", len(rows))
    for row in rows:
        # The store id follows the '#' in text like "Store #1234".
        id = row.find_element(By.CLASS_NAME, 'address-location-name').text.split("#")[1]
        cells = row.find_elements(By.CLASS_NAME, 'table__cell-content')
        store = cells[0].text
        price = cells[2].text
        quantity = cells[1].find_element(By.CLASS_NAME, 'availability-status-indicator__text').text
        if "In Stock" == quantity:
            # Keep the last character of the quantity cell; "6+" renders as "+".
            quantity = cells[1].find_element(By.CLASS_NAME, 'table__cell-quantity').text[-1]
            if quantity == "+": quantity = "6+"
        elif "Out of Stock" == quantity:
            quantity = "0"
        else:
            # "Limited Stock" and similar states are recorded as 2.
            quantity = "2"
        saveData(id, store, price, quantity, sku)
```



setZip() opens the product page once, patches navigator.webdriver so the site cannot trivially detect Selenium, then checks 50 random ZIP codes against it:

```python
def setZip(sku):
    url = "https://xxx.com/?sku=" + sku

    d = webdriver.Chrome()
    d.implicitly_wait(30)  # implicit wait: up to 30s per lookup, set once
    # d.maximize_window()  # full-screen browser
    d.set_window_size(480, 800)
    # Anti-detection: hide the navigator.webdriver flag that automated
    # browsers normally expose, before any page script runs.
    d.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
               get: () => undefined
             })
           """
    })
    print("opening page...")
    d.get(url)
    print("fetching page data...")
    for i in range(50):
        # Pick a random ZIP code from the database for each lookup.
        zip = Database().queryOne("select * from zip_code order by rand() limit 1")[0]
        try:
            print("url=", url, "zip=", zip)
            getPage(d, zip, sku)
            print("scrape succeeded sku=", sku, "zip=", zip)
        except Exception as e:
            print("scrape failed sku=", sku, "zip=", zip, "Error:", e)
            exlog(sku, zip)
            continue
    d.quit()  # close the browser once all ZIP codes have been tried
```
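If you don't need to watch the browser, the driver setup can run headless. Here is a minimal sketch, assuming Selenium 4 and a recent Chrome; the stealth script is the same one used in setZip():

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def makeDriver():
    options = Options()
    options.add_argument("--headless=new")         # new headless mode (Chrome 109+)
    options.add_argument("--window-size=480,800")  # same viewport as setZip()
    d = webdriver.Chrome(options=options)
    d.implicitly_wait(30)
    # Hide navigator.webdriver before any page script runs.
    d.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": "Object.defineProperty(navigator, 'webdriver', "
                  "{get: () => undefined})"
    })
    return d
```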



4. database.py

A thin wrapper around MySQLdb that shares a single connection across all instances:


```python
import MySQLdb


class Database(object):
    # One connection shared by every Database() instance (class attribute).
    connection = None

    def __init__(self):
        if not Database.connection:
            Database.connection = MySQLdb.connect(
                host="xxx", user="xxx", passwd="xxx",
                database="reptile", charset='utf8')
            print("get connection")

    def queryOne(self, sql):
        cursor = Database.connection.cursor()
        cursor.execute(sql)
        row = cursor.fetchone()
        print(row)
        return row

    def queryAll(self, sql):
        cursor = Database.connection.cursor()
        cursor.execute(sql)
        rows = cursor.fetchall()
        print(rows)
        return rows

    def save(self, sql):
        cursor = Database.connection.cursor()
        cursor.execute(sql)
        Database.connection.commit()

    def closeDB(self):
        # Actually close the connection before dropping the reference,
        # otherwise the socket leaks.
        if Database.connection:
            Database.connection.close()
        Database.connection = None
        print("close db ...")
```

5. Appendix: the requests + BeautifulSoup approach

For simpler sites that don't require a real browser, Python's BeautifulSoup (BS4) library combined with requests can scrape shopping-site data directly. The basic steps:

1. **Install the dependencies.** Install `beautifulsoup4` and `requests` with pip:

```bash
pip install beautifulsoup4 requests
```

2. **Send a request and fetch the page.** Use `requests.get()` to send an HTTP GET request to the target site and grab the HTML source:

```python
import requests

url = 'http://example.com/shop'  # replace with the shopping site you want to scrape
response = requests.get(url)
html_content = response.text
```

3. **Parse the HTML.** Hand the HTML to BeautifulSoup to build a parser object:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_content, 'html.parser')
```

4. **Locate elements.** Use methods such as `.find_all()` and `.select()` to find the HTML elements that hold the data you want. Product names, prices, and links usually sit inside tags like `<div>`, `<p>`, and `<a>`.

5. **Extract the data.** Read element attributes (`text` for the text content, `attrs` for attributes) and collect the results into a list or dict:

```python
products = []
for product in soup.find_all('div', class_='product'):
    name = product.find('h2').text
    price = product.find('span', class_='price').text
    link = product.find('a')['href']
    products.append({'name': name, 'price': price, 'link': link})
```

6. **Process and save the data.** Finally, store the data in a local file or database, or feed it into further analysis; a minimal sketch follows below.
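For step 6, here is a minimal sketch that writes the `products` list from step 5 to a CSV file using only the standard library (the field names are the hypothetical ones from step 5):

```python
import csv

# Assumes the `products` list of dicts built in step 5.
with open('products.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['name', 'price', 'link'])
    writer.writeheader()        # header row: name,price,link
    writer.writerows(products)  # one CSV row per product dict
```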