[Python] Hands-on: scraping product information from an overseas shopping site

1. Project layout

Project
└── venv
    ├── main.py
    ├── brickseek.py
    └── database.py

2. main.py

The entry point simply loops over a hard-coded list of SKUs and hands each one to brickseek.setZip():

```python
import brickseek

SKU = ["675353130", "543873356", "113247244", "259271016", "618763356"]

if __name__ == '__main__':
    for sku in SKU:
        brickseek.setZip(sku)
```

3. brickseek.py

This module does the actual scraping: it opens the product page for a SKU in a Selenium-driven Chrome, submits random ZIP codes pulled from the database, parses the store/price/quantity table, and saves each row to MySQL.

First, the imports:

```python
from pymysql.converters import escape_string
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from database import Database
import time
```


exlog() appends each failed SKU/ZIP pair to a plain-text log so the failures can be retried later:

```python
def exlog(sku, zip):
    try:
        # Append the failed pair to a local log file.
        with open("errorLog.txt", 'a', encoding='utf-8') as f:
            f.write("scrape failed sku=" + str(sku) + "  zip=" + str(zip) + "\n")
    except Exception as e:
        print("failed to write error log:", e)
```

saveData() upserts one store row: if the store id already exists it updates the record, otherwise it inserts a new one (a parameterized variant is sketched right after this function):

```python
def saveData(id, store, price, quantity, sku):
    print("saving data to MySQL")
    try:
        d = Database().queryOne("select * from brickseek where id=%s" % id)
        date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        if d:
            print("update")
            sql = "UPDATE brickseek SET sku = '%s',store='%s',quantity='%s',price='%s',time=str_to_date('%s','%%Y-%%m-%%d %%H:%%i:%%s') WHERE id = '%s'" % \
                  (sku, escape_string(store), quantity, escape_string(price), date, id)
            Database().save(sql)
        else:
            print("insert")
            sql = "insert into brickseek(id,sku,store,quantity,price,time) " \
                  "values('%s', '%s', '%s', '%s', '%s', str_to_date('%s','%%Y-%%m-%%d %%H:%%i:%%s'))" % (
                      id, sku, escape_string(store), quantity, escape_string(price), date)
            Database().save(sql)
        print("data saved")
    except Exception as e:
        print("failed to save data, Error:", e)
```

getPage() fills in a ZIP code, submits the inventory-checker form, waits for the results to render, then walks the result table row by row and hands each row to saveData():

```python
def getPage(d, zip, sku):
    zipInput = d.find_element(By.ID, 'inventory-checker-form-zip')
    zipInput.send_keys(zip)
    print("entering zip...")
    but = d.find_element(By.ID, 'main').find_element(By.CLASS_NAME, 'bs-button')
    but.submit()
    print("requesting page data for zip...")
    # The ad container only appears once the results have rendered,
    # so it doubles as a readiness signal.
    wait = WebDriverWait(d, 30)
    wait.until(EC.presence_of_element_located((By.ID, "BrickseekVideoAdContainer")))
    print("parsing data...")
    rows = d.find_element(By.ID, 'main') \
            .find_element(By.CLASS_NAME, 'table__body') \
            .find_elements(By.CLASS_NAME, 'table__row')
    print("row count =", len(rows))
    for row in rows:
        # The store id follows the '#' in text like "Store #1234".
        id = row.find_element(By.CLASS_NAME, 'address-location-name').text.split("#")[1]
        cells = row.find_elements(By.CLASS_NAME, 'table__cell-content')
        store = cells[0].text
        price = cells[2].text
        quantity = cells[1].find_element(By.CLASS_NAME, 'availability-status-indicator__text').text
        if "In Stock" == quantity:
            # Keep the last character of the quantity cell; "6+" renders as "+".
            quantity = cells[1].find_element(By.CLASS_NAME, 'table__cell-quantity').text[-1]
            if quantity == "+": quantity = "6+"
        elif "Out of Stock" == quantity:
            quantity = "0"
        else:
            # "Limited Stock" and similar states are recorded as 2.
            quantity = "2"
        saveData(id, store, price, quantity, sku)
```



setZip() opens the product page once, patches navigator.webdriver so the site cannot trivially detect Selenium, then checks 50 random ZIP codes against it:

```python
def setZip(sku):
    url = "https://xxx.com/?sku=" + sku

    d = webdriver.Chrome()
    d.implicitly_wait(30)  # implicit wait: up to 30s per lookup, set once
    # d.maximize_window()  # full-screen browser
    d.set_window_size(480, 800)
    # Anti-detection: hide the navigator.webdriver flag that automated
    # browsers normally expose, before any page script runs.
    d.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', {
               get: () => undefined
             })
           """
    })
    print("opening page...")
    d.get(url)
    print("fetching page data...")
    for i in range(50):
        # Pick a random ZIP code from the database for each lookup.
        zip = Database().queryOne("select * from zip_code order by rand() limit 1")[0]
        try:
            print("url=", url, "zip=", zip)
            getPage(d, zip, sku)
            print("scrape succeeded sku=", sku, "zip=", zip)
        except Exception as e:
            print("scrape failed sku=", sku, "zip=", zip, "Error:", e)
            exlog(sku, zip)
            continue
    d.quit()  # close the browser once all ZIP codes have been tried
```
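If you don't need to watch the browser, the driver setup can run headless. Here is a minimal sketch, assuming Selenium 4 and a recent Chrome; the stealth script is the same one used in setZip():

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def makeDriver():
    options = Options()
    options.add_argument("--headless=new")         # new headless mode (Chrome 109+)
    options.add_argument("--window-size=480,800")  # same viewport as setZip()
    d = webdriver.Chrome(options=options)
    d.implicitly_wait(30)
    # Hide navigator.webdriver before any page script runs.
    d.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": "Object.defineProperty(navigator, 'webdriver', "
                  "{get: () => undefined})"
    })
    return d
```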



4. database.py

A thin wrapper around MySQLdb that shares a single connection across all instances:


```python
import MySQLdb


class Database(object):
    # One connection shared by every Database() instance (class attribute).
    connection = None

    def __init__(self):
        if not Database.connection:
            Database.connection = MySQLdb.connect(
                host="xxx", user="xxx", passwd="xxx",
                database="reptile", charset='utf8')
            print("get connection")

    def queryOne(self, sql):
        cursor = Database.connection.cursor()
        cursor.execute(sql)
        row = cursor.fetchone()
        print(row)
        return row

    def queryAll(self, sql):
        cursor = Database.connection.cursor()
        cursor.execute(sql)
        rows = cursor.fetchall()
        print(rows)
        return rows

    def save(self, sql):
        cursor = Database.connection.cursor()
        cursor.execute(sql)
        Database.connection.commit()

    def closeDB(self):
        # Actually close the connection before dropping the reference,
        # otherwise the socket leaks.
        if Database.connection:
            Database.connection.close()
        Database.connection = None
        print("close db ...")
```

5. Appendix: the requests + BeautifulSoup approach

For simpler sites that don't require a real browser, Python's BeautifulSoup (BS4) library combined with requests can scrape shopping-site data directly. The basic steps:

1. **Install the dependencies.** Install `beautifulsoup4` and `requests` with pip:

```bash
pip install beautifulsoup4 requests
```

2. **Send a request and fetch the page.** Use `requests.get()` to send an HTTP GET request to the target site and grab the HTML source:

```python
import requests

url = 'http://example.com/shop'  # replace with the shopping site you want to scrape
response = requests.get(url)
html_content = response.text
```

3. **Parse the HTML.** Hand the HTML to BeautifulSoup to build a parser object:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_content, 'html.parser')
```

4. **Locate elements.** Use methods such as `.find_all()` and `.select()` to find the HTML elements that hold the data you want. Product names, prices, and links usually sit inside tags like `<div>`, `<p>`, and `<a>`.

5. **Extract the data.** Read element attributes (`text` for the text content, `attrs` for attributes) and collect the results into a list or dict:

```python
products = []
for product in soup.find_all('div', class_='product'):
    name = product.find('h2').text
    price = product.find('span', class_='price').text
    link = product.find('a')['href']
    products.append({'name': name, 'price': price, 'link': link})
```

6. **Process and save the data.** Finally, store the data in a local file or database, or feed it into further analysis; a minimal sketch follows below.
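For step 6, here is a minimal sketch that writes the `products` list from step 5 to a CSV file using only the standard library (the field names are the hypothetical ones from step 5):

```python
import csv

# Assumes the `products` list of dicts built in step 5.
with open('products.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['name', 'price', 'link'])
    writer.writeheader()        # header row: name,price,link
    writer.writerows(products)  # one CSV row per product dict
```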