目标:华为商城下的商品信息
- 按主页的左边手机,笔记本&平板,智能穿戴……分类
- 每一个分类下的小分类
- 规格参数
- 写入excel
- 设置好excel数据表,分析数据
代码如下(scrapy):
import os
import re
import urllib.request
from copy import deepcopy
import scrapy
import xlrd
import xlwt
from ..items import HuaweiItem
# Spider that crawls product/category information from Huawei's Vmall store.
class HuaWei(scrapy.Spider):
# Spider name used on the scrapy command line: `scrapy crawl huawei`.
name = 'huawei'
# vmallres.com is included — presumably static/product resources are served
# from it; verify against the actual crawled pages (TODO confirm).
allowed_domains = ['vmall.com', 'vmallres.com']
# Entry point: the Vmall home page (plain http; the site redirects to https).
start_urls = ['http://vmall.com/']
def parse(self, response):
    """Parse the Vmall home page: walk the left-hand category menu and
    schedule one listing-page request per sub-category.

    For each top-level category (classify_A) and each of its sub-categories
    (classify_B), yields a scrapy.Request to the sub-category listing page,
    carrying a snapshot of the partially-filled item in ``meta``.
    """
    # Create/reset the output workbook before any rows are written.
    self.new_xls()
    print("分割线-----------------------主页------------------------分割线")
    classify_list_A = response.xpath('//div[@id="category-block"]/div/ol/li')
    print("大分类长度:", len(classify_list_A))
    for category in classify_list_A:
        item = HuaweiItem()
        # Top-level category label (phones, laptops & tablets, wearables, ...).
        item['classify_A'] = category.xpath('.//input[2]/@value').extract_first()
        # Sub-category entries, excluding the "expand" button list item.
        classify_list = category.xpath('.//div[2]//li[not(@class="subcate-btn")]')
        # NOTE: loop variable renamed from `i` — the original shadowed the
        # outer loop's variable, which hurt readability.
        for sub in classify_list:
            item['classify_B'] = sub.xpath('.//input[1]/@value').extract_first()
            relative = sub.xpath('.//a/@href').extract_first()
            if relative is None:
                # BUGFIX: the original wrapped a possibly-None href in str(),
                # yielding a bogus ".../None-1-3-0" URL. Skip link-less entries.
                continue
            href = "https://www.vmall.com" + relative + '-1-3-0'
            yield scrapy.Request(
                href,
                callback=self.parse_A,
                # deepcopy: `item` is mutated on every inner iteration, so each
                # scheduled request must carry its own immutable snapshot.
                meta={"item": deepcopy(item)}
            )
    # NOTE(review): this count is read immediately after *scheduling* the
    # requests — the listing pages have not been crawled yet, so it reflects
    # a previous run's contents. Confirm this is the intended behavior.
    rb = xlrd.open_workbook('华为商城.xls')
    rs = rb.sheet_by_index(0)
    print("已爬取的商品数量:", rs.nrows - 1)
def parse_A(self, response):
print("分割线-----------------------中间页------------------------分割线")
li_list = response.xpath('//div[@class="layout"]/div[@class="channel-list"]/div[@class="pro-list clearfix"]/ul/li')
if li_list:
print("正在爬取页面链接:", response.request.url)
print("此页面商品数量:", len(li_list))
for i in li_list:
item = response.meta["item"]
rb = xlrd.open_workbook('华为商城.xls')
rs = rb.sheet_by_index(0)
cods = rs.col_values(0, start_rowx=0, end_rowx=None)
item[</