E-commerce Site Crawler Case Study (3)

This post walks through a crawler for an e-commerce site: an American wig retailer with a rich set of categories and products. Implemented in Python, the crawler first collects all category information, then scrapes the product list under each category, and finally fetches per-item price data. The site is relatively crawler-friendly and needs no proxy pool, though it must be reached from outside the Great Firewall.


Today's target e-commerce site:

https://www.asteriahair.com/

An American wig e-commerce site.

The homepage is shown below:

[homepage screenshot]

The homepage carries a large number of first- and second-level menus, which shows that the site's products and categories are unusually rich; few comparable sites are this well stocked.

There are 8 first-level categories and several dozen second-level categories, some of which branch into a third level as well.

Our goal is to collect all of the site's category information and product information.

The site is fairly friendly to crawlers, so a proxy isn't necessary, but you do need to get past the Great Firewall first (i.e., access it through a VPN).
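If you route traffic through a local VPN or proxy client, requests can be pointed at it with its proxies parameter. A minimal sketch; the local port 7890 is an assumption, so substitute whatever your client actually exposes:

import requests

# hypothetical local proxy endpoint exposed by a VPN/proxy client
proxies = {
    'http': 'http://127.0.0.1:7890',
    'https': 'http://127.0.0.1:7890',
}
response = requests.get('https://www.asteriahair.com/', proxies=proxies, timeout=15)
print(response.status_code)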

The overall approach:

Step 1: fetch the categories

# -*- coding: utf-8 -*-
# author:lihaizhen
# date:
# description:done 

import requests
import time
from lxml import etree
from utils import connections

conn = connections.mysql_conn()      # MySQL connection from the project's utils package
poor = connections.local_redis(0)    # redis connection pool (db 0)
cur = conn.cursor()

class Asteriahair_menu(object):
    def __init__(self):
        self.proxies = None
        self.headers = {
                'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
                'Accept-Encoding':'gzip, deflate, br',
                'Accept-Language':'zh-CN,zh;q=0.9,en;q=0.8',
                'Cache-Control':'max-age=0',
                'Connection':'keep-alive',
                'Cookie':'frontend=lvsi0j2e9rkauul5rob5rnpj61; frontend_cid=ROvQmNAXWMfHk37b; _uetsid=423ac070475111ecb586cbef7b845b49; _uetvid=423ae1e0475111ec992aa99b253ce4cd',
                'Host':'www.asteriahair.com',
                'sec-ch-ua':'"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
                'sec-ch-ua-mobile':'?0',
                'sec-ch-ua-platform':'"Windows"',
                'Sec-Fetch-Dest':'document',
                'Sec-Fetch-Mode':'navigate',
                'Sec-Fetch-Site':'none',
                'Sec-Fetch-User':'?1',
                'Upgrade-Insecure-Requests':'1',
                'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
            }
        self.host_url = 'https://www.asteriahair.com/'

    def get_web_id(self):
        # look up this site's id in the `web` table
        select_sql = """select id from web WHERE url='{}'""".format(self.host_url)
        cur.execute(select_sql)
        row = cur.fetchone()
        if row:
            return row[0]
        else:
            return False

    def req_asteriahair(self,create_time):
        web_id = self.get_web_id()
        response = requests.get(url=self.host_url, headers=self.headers)
        html = etree.HTML(response.text)
        # locate the top-level nav items (the first and last <li> are not product categories)
        element_list = html.xpath('//*[@id="bs-example-navbar-collapse-1"]/ul/li[position()>1 and position()<8]')
        data_list = []
        for i,element in enumerate(element_list):
            menu_1 = element.xpath('./a/text()')[0].replace("'","''").strip().replace("\n","")
            menu_1_url = self.host_url + element.xpath('./a/@href')[0]
            print('Ⅰ  '+menu_1)
            menu_2_list = element.xpath('./div/div/dl')
            if not menu_2_list:
                data_list.append("('{}','{}','{}','{}','{}',{})".format(menu_1_url, menu_1, '', '', create_time,web_id))
                continue
            for m2 in menu_2_list:
                try:
                    menu_2 = m2.xpath('./dt/a/text()')[0].replace('\n','').replace('\r', '').replace('\t', '').replace("'","''").strip()
                    menu_2_url = m2.xpath('./dt/a/@href')[0]
                    print('Ⅱ ' + menu_2)
                    menu_3_list = m2.xpath('./dd')
                    if not menu_3_list:
                        data_list.append("('{}','{}','{}','{}','{}',{})".format(menu_2_url, menu_1, menu_2, '', create_time,web_id))
                        continue
                    for m3 in menu_3_list:
                        menu_3 = m3.xpath('./a/text()')[0].replace("'","''").replace("\n","").replace("  ","")
                        menu_3_url = m3.xpath('./a/@href')[0]
                        print('Ⅲ '+menu_3)
                        data_list.append("('{}','{}','{}','{}','{}',{})".format(menu_3_url, menu_1, menu_2, menu_3, create_time,web_id))
                except Exception:
                    # skip menu blocks that don't match the expected structure
                    continue
        return data_list

    def save_data(self,data_list):
        sql = """insert into menu (url,first_menu,second_menu,third_menu,create_time,web_id) VALUES {}""".format(','.join(data_list))
        print(sql)
        cur.execute(sql)
        conn.commit()

    def run(self):
        create_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        web_id = self.get_web_id()
        # scrape the full menu tree, then append the New Arrival page,
        # which sits outside the regular category navigation
        data = self.req_asteriahair(create_time)
        new_arrival = 'https://www.asteriahair.com/new-arrival.html'
        data.append("('{}','{}','{}','{}','{}',{})".format(new_arrival, 'new_arrival', '', '', create_time, web_id))
        self.save_data(data)

if __name__ == '__main__':
    h = Asteriahair_menu()
    h.run()

As hoped, we now have all of the site's category data.
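The script imports connections from a utils package that isn't shown in this post. A minimal sketch of what it might look like, assuming pymysql and redis-py underneath; the connection parameters are placeholders:

# utils/connections.py -- hypothetical helper module (not from the original post)
import pymysql
import redis

def mysql_conn():
    # connection parameters are placeholders; substitute your own
    return pymysql.connect(host='127.0.0.1', user='root', password='***',
                           database='spider', charset='utf8mb4')

def local_redis(db):
    # one connection pool per redis db index
    return redis.ConnectionPool(host='127.0.0.1', port=6379, db=db)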

Step 2: fetch the products under each category

# author:lihaizhen
# date:
# description:done

import time
import redis
import requests
from lxml import etree
from utils import connections,save_data,get_web_id


class Asteriahair_Spu(object):
    def __init__(self):
        self.conn = connections.mysql_conn()
        self.poor = connections.local_redis(0)
        self.cur = self.conn.cursor()
        self.session = requests.Session()
        self.host = 'https://www.asteriahair.com/'
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
            'cookie': '_y=a8931b19-e949-4ba8-9582-0aa4a94ea7f2; secure_customer_sig=; _shopify_y=a8931b19-e949-4ba8-9582-0aa4a94ea7f2; _shopify_fs=2020-09-22T03%3A09%3A25.905Z; _ga=GA1.2.52149075.1600744166; _hjid=6b8a213c-e597-49fb-ac6c-db4277787d35; _gcl_au=1.1.331237812.1609924414; _fbp=fb.1.1609924439842.734923350; _pin_unauth=dWlkPU1qVmxPR001WWpNdE5qWmlNQzAwWVdRMUxXSmxObVl0WmpCaU5UZ3paVEUxWkdZeg; _orig_referrer=; _landing_page=%2F%2Fcollections%2Fwigs-at-the-original-price; lkvw_20=www.hairvivi.com//collections/wigs-at-the-original-price; lkvw_02=v5; _hjTLDTest=1; _hjAbsoluteSessionInProgress=1; _sp_ses.8c34=*; KL_FORMS_MODAL={%22disabledForms%22:{%22TTXVBG%22:{%22lastCloseTime%22:1615513536%2C%22successActionTypes%22:[]}}%2C%22viewedForms%22:{%22TTXVBG%22:2199169}}; _gid=GA1.2.1243998432.1615513536; _ps_session=T1xCWrJH9J4M-DtMy8YLe; _g1597052385=VVNE; _s=83981dac-59fe-43ff-ac2e-7980bef490e3; _shopify_s=83981dac-59fe-43ff-ac2e-7980bef490e3; _shopify_sa_p=; epb_previous_pathname=//collections/wig-with-bangs; __kla_id=eyIkcmVmZXJyZXIiOnsidHMiOjE2MDA3NDQxNjksInZhbHVlIjoiIiwiZmlyc3RfcGFnZSI6Imh0dHBzOi8vd3d3LmhhaXJ2aXZpLmNvbS8ifSwiJGxhc3RfcmVmZXJyZXIiOnsidHMiOjE2MTU1MTgxMzEsInZhbHVlIjoiIiwiZmlyc3RfcGFnZSI6Imh0dHBzOi8vd3d3LmhhaXJ2aXZpLmNvbS8vY29sbGVjdGlvbnMvd2lnLXdpdGgtYmFuZ3M/cD0wIn19; _shopify_sa_t=2021-03-12T03%3A02%3A10.574Z; _sp_id.8c34=0bfda82db3ce3969.1600744168.12.1615518141.1614235782',
            'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
        }

    def get_spu_per_page(self,menu_id,menu_url,create_time):
        # walk the category's paginated listing until a page yields nothing new
        for i in range(1,100):
            page_url = menu_url + '?___store=en&p={}'.format(i)
            print('{}-{}'.format(i,page_url))
            response = requests.get(url=page_url,headers=self.headers).text
            html = etree.HTML(response)
            items_list = html.xpath('//div[@class="category-products"]/ul/li')
            print(len(items_list))
            saved_new = False
            for el in items_list:
                try:
                    name = el.xpath('./div[1]/h2/a/text()')[0].replace('  ','').replace('\r','').replace('\n','')
                    detail_url = el.xpath('./div[1]/h2/a/@href')[0]
                except:
                    name = el.xpath('./div[2]/h2/a/text()')[0].replace('  ','').replace('\r','').replace('\n','')
                    detail_url = el.xpath('./div[2]/h2/a/@href')[0]
                if 'www' not in detail_url:
                    detail_url = self.host + detail_url
                md5 = get_web_id.get_md5(detail_url)
                # save_spu is truthy when the row was newly inserted (inferred
                # from how this flag gates pagination below)
                if save_data.save_spu(menu_id, detail_url, name, create_time, md5, self.conn, self.cur):
                    saved_new = True
            # stop at the first page that adds nothing new, or that holds
            # fewer than a full page of 12 items (i.e. the last page)
            if not saved_new or len(items_list) < 12:
                break
            self.headers['referer'] = page_url
        print('\n')

    def run(self,key):
        create_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        r = redis.Redis(connection_pool=self.poor)
        keyword = "{}_spu_url".format(key)
        while r.scard(keyword) > 0:
            message = r.spop(keyword)
            # redis-py returns bytes unless the pool sets decode_responses=True
            msg = message.decode().split('|')
            menu_id = msg[0]
            menu_url = msg[1]
            try:
                self.get_spu_per_page(menu_id,menu_url,create_time)
            except Exception:
                # push the task back into the set so it can be retried
                print('rollback')
                r.sadd(keyword,message)

if __name__ == '__main__':
    key = 'asteriahair'
    h = Asteriahair_Spu()
    h.run(key)

This gives us the product list under every category. Note that run() consumes "menu_id|menu_url" tasks from a Redis set named {key}_spu_url; it's assumed that set was seeded from the menu table built in step 1.
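Step 2 also leans on utils helpers the post doesn't include. A plausible sketch of get_web_id.get_md5 and save_data.save_spu, assuming the md5-based dedupe the calling code implies; the spu table and column names are guesses:

# hypothetical sketches of utils/get_web_id.py and utils/save_data.py
import hashlib

def get_md5(src):
    # md5 of the detail URL, used as the dedupe key
    return hashlib.md5(src.encode()).hexdigest()

def save_spu(menu_id, detail_url, name, create_time, md5, conn, cur):
    # return False when this URL was already stored, True after a new insert
    cur.execute("select 1 from spu where md5=%s", (md5,))
    if cur.fetchone():
        return False
    cur.execute(
        "insert into spu (menu_id, url, name, create_time, md5) "
        "values (%s,%s,%s,%s,%s)",
        (menu_id, detail_url, name, create_time, md5))
    conn.commit()
    return True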

Step 3: fetch single-item price information from each product link

# -*- coding: utf-8 -*-
# author:lihaizhen
# date:
# description:doing
import collections
import hashlib
import re
import sys
import time
from decimal import Decimal
from itertools import product

import redis
import requests
from lxml import etree

sys.path.append("..")
from utils import connections, save_data

class Asteriahair_Sku(object):
    def __init__(self):
        self.proxies = None
        self.conn = connections.mysql_conn()
        self.cur = self.conn.cursor()
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'en,zh-CN;q=0.9,zh;q=0.8',
            'cache-control': 'max-age=0',
            # 'cookie': '_ga=GA1.2.1243901573.1600390060; _fbp=fb.1.1600413991177.1486909458; _pin_unauth=dWlkPU1qVmxPR001WWpNdE5qWmlNQzAwWVdRMUxXSmxObVl0WmpCaU5UZ3paVEUxWkdZeg; __cfduid=d79191ff3130791135d2a9388c77df9971610592284; PHPSESSID=sh8iij8s3m334t690r5h34mn40; _gid=GA1.2.2030092908.1611649273; cc80df2044f9acef895f69c126d69935=6ISXnXrwlGg%3D7j%2BqiPYoGnU%3DJqUwmfVtkx0%3DuQCklErX%2Fy0%3DKWUMRmCFrOk%3DT%2FmtcKAqrGQ%3DkWTgn%2Fz7x3Y%3Dj0r%2BtEVWJpI%3DBDrvzaV1NaY%3DDycV9b4fHdI%3DygzLQ4aA4%2FY%3DXjz8%2BrL7ktQ%3DGxR7A7PSD0E%3DkMkGB7TMXtU%3DbbTu01xcU0E%3DAc%2BUNJjqGA8%3D; __atuvc=2%7C1%2C5%7C2%2C0%7C3%2C3%7C4; __atuvs=600fd0f887a4d9e4002',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
        }
        self.pool = connections.local_redis(1)
        self.r = redis.Redis(connection_pool=self.pool)

    def get_md5(self,src):
        m2 = hashlib.md5()
        m2.update(src.encode())
        return m2.hexdigest()

    def get_sku_temp(self,url):
        response = requests.get(url=url,headers=self.headers)
        html = etree.HTML(response.text)
        sku_list = html.xpath('//*[@id="product-options-wrapper"]/dl/dd')   # option/attribute blocks
        goods_name = html.xpath('//*[@id="product_addtocart_form"]/div[4]/ul/li[1]/text()')[0].replace('\n','').replace('  ','')
        try:
            # discounted-price layout first, plain-price layout as fallback
            base_price = html.xpath('//dd[@class="pricebox"]/div/p[2]/span[2]/text()')[0].replace('$','').replace('\n','').replace('  ','')
            if not base_price:
                base_price = html.xpath('//*[@id="productGeneral"]/form/div/div[3]/div[1]/div[1]/span/text()')[0].replace('$','').replace('\n','').replace('  ','')
        except:
            base_price = html.xpath('//*[@id="productGeneral"]/form/div/div[3]/div[1]/div[1]/span/text()')[0].replace('$','').replace('\n','').replace('  ','')
        li = []           # per-attribute lists of option names
        temp_list = []    # per-attribute OrderedDicts of option name -> surcharge
        for s in sku_list:
            attribute_value_list = s.xpath("./div/select/option[@value!='']")   # non-empty <option> entries in this attribute's dropdown
            if not attribute_value_list:
                attribute_value_list = [s]
            li1 = []  # option names collected for this attribute
            temp1 = collections.OrderedDict()
            for attribute in attribute_value_list:
                temp2 = collections.OrderedDict()
                name = attribute.xpath('./text()')[0].replace('\n','').replace('  ','')
                if 'select' in name.lower():
                    continue
                if '$' not in name:
                    price = '0'
                else:
                    # extract the "$xx.xx" surcharge from the option label; the
                    # original pattern r'(\$.*)[\s\S]' silently dropped the last
                    # character of the amount, so match the number explicitly
                    price = re.search(r'\$[\d.,]+', name).group(0)
                temp2[name] = price
                if name:
                    li1.append(name)
                    temp1.update(temp2)
            if temp1:
                temp_list.append(temp1)
            if li1:
                li.append(li1)
        # Cartesian product over all attributes = every possible SKU combination
        lis = list(product(*li))
        return lis,temp_list,goods_name,base_price

    def parse_zuhe_list(self,lis,temp_list,base_price,goods_name,url,create_time):
        # each entry of `lis` is one option combination; its price is the
        # base price plus the surcharge of every chosen option
        data_list = []
        for combo in lis:
            price = Decimal(base_price)
            for j, attribute_title in enumerate(combo):
                surcharge = temp_list[j][attribute_title]
                price += Decimal(surcharge.replace(' ', '').replace('$', ''))
            sku_name = ' / '.join(combo).replace("'","''")
            data_list.append({
                'price': price,
                'sku_name': sku_name,
                'url': url,
                'goods_name': goods_name.replace("'","''"),
                'create_time': create_time,
                'md5': self.get_md5(sku_name + goods_name),
            })
        return data_list

    def run(self,create_time,r_k):
        # the parser is still marked "doing", so run against a fixed sample of
        # detail URLs rather than the full spu table
        detail_url_list = [
            'https://www.asteriahair.com/24-40-long-straight-hair-wigs-for-women-human-hair-lace-front-wig.html',
            'https://www.asteriahair.com/straight-lace-front-wigs-130-250-density-for-women-natural-black.html',
            'https://www.asteriahair.com/13x6-lace-front-wigs-human-hair-straight-wig-for-women.html',
            'https://www.asteriahair.com/360-lace-frontal-wigs-virgin-straight-hair-cheap-real-hair-wigs.html',
            'https://www.asteriahair.com/straight-natural-black-full-lace-front-wigs-150-180-density.html',
            'https://www.asteriahair.com/1pc-ear-to-ear-lace-frontal-with-3-bundles-straight-virgin-brazilian-hair-weft.html',
            'https://www.asteriahair.com/brazilian-straight-virgin-hair-weaves-3-pcs-with-4-4-best-lace-frontal-closure.html',
            'https://www.asteriahair.com/cheap-malaysian-straight-human-hair-virgin-hair-bundles-3pcs-natural-black.html'
        ]

        for i,url in enumerate(detail_url_list):
            print('spu:{}-{}-{}'.format(len(detail_url_list), i+1, url))
            lis, temp_list, goods_name, base_price = self.get_sku_temp(url)
            data_list = self.parse_zuhe_list(lis,temp_list,base_price,goods_name,url,create_time)
            save_data.save_sku(data_list,self.conn,self.cur,self.r,r_k)
            time.sleep(5)

if __name__ == '__main__':
    ws = Asteriahair_Sku()
    create_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    r_k = 'Asteriahair_Sku'
    ws.run(create_time,r_k)
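To see the combination pricing in isolation, here is a tiny standalone demo of the itertools.product plus Decimal pattern the script relies on; the option names and surcharges below are made up:

from decimal import Decimal
from itertools import product

base_price = Decimal('99.00')
# per-attribute maps of option name -> surcharge, analogous to temp_list
options = [
    {'13x4 Lace': '$0', '13x6 Lace +$20.00': '$20.00'},
    {'130% Density': '$0', '180% Density +$15.00': '$15.00'},
]
# every SKU is one pick from each attribute, exactly what product(*li) yields
for combo in product(*[list(o) for o in options]):
    price = base_price + sum(Decimal(options[j][name].replace('$', ''))
                             for j, name in enumerate(combo))
    print(' / '.join(combo), '->', price)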

 
