# -*- coding: utf-8 -*-
# ------------------taobao.py---------------------
from urllib.parse import quote

import scrapy
from scrapy import Request

from ..items import ScrapyseleniumtestItem


class TaobaoSpider(scrapy.Spider):
    """Spider that requests Taobao search pages for each configured keyword.

    Reads two project settings:

    * ``KEYWORDS`` -- the search keywords to crawl.
    * ``MAX_PAGE`` -- the number of result pages to request per keyword.
    """

    name = 'tao_bao'
    # NOTE(review): base_url lives on s.taobao.com, which this list does not
    # cover. The requests below are sent with dont_filter=True, which per the
    # Scrapy docs lets them past the offsite filter -- confirm this is intended
    # rather than widening allowed_domains.
    allowed_domains = ['www.taobao.com']
    base_url = 'https://s.taobao.com/search?q='

    def start_requests(self):
        """Yield one search request per (keyword, page) combination.

        Every page of a keyword is requested with the same URL; the page
        number travels only in ``meta['page']``.
        NOTE(review): presumably a downloader middleware uses that value to
        navigate to the right page -- confirm against the project middlewares.

        Yields:
            Request: a request for the keyword's search URL, with
            ``dont_filter=True`` so the repeated URL is not dropped by the
            duplicate filter.
        """
        # getlist()/getint() tolerate a missing setting and string values
        # (e.g. passed with ``-s`` on the command line), where a plain get()
        # would iterate a string character by character or fail on ``+ 1``.
        keywords = self.settings.getlist('KEYWORDS')
        max_page = self.settings.getint('MAX_PAGE')

        for keyword in keywords:
            # The URL does not depend on the page, so build it once per keyword.
            url = self.base_url + quote(keyword)
            for page in range(1, max_page + 1):
                # meta={'key': value} carries data the later stages need.
                yield Request(url=url, callback=self.parse,
                              meta={'page': page}, dont_filter=True)

    def parse(self, response):
        """Extract product fields from a search result page.

        NOTE(review): this method is truncated in the available source; only
        the price and title extraction are visible, and nothing is yielded
        yet. Restore the remainder from the original file.
        """
        # When a class attribute holds several space-separated names
        # (e.g. class="J_ItemPic img"), match it with contains(@class, 'img').
        products = response.xpath(
            '//div[@id="mainsrp-itemlist"]//div[@class="items"]//div[contains(@class, "item")]')
        for product in products:
            item = ScrapyseleniumtestItem()
            # Join all text nodes under the matching element and trim the ends.
            item['price'] = ''.join(product.xpath(
                './/div[contains(@class, "price")]//text()').extract()).strip()
            item['title'] = ''.join(product.xpath(
                './/div[contains(@class, "title")]//text()').extract()).strip()
            # NOTE(review): the source is cut off here. The last visible
            # fragment is kept verbatim below so it is not lost:
            # item['shop'] = ''.join(product.xpath(
scrapy+selenium爬取淘宝
最新推荐文章于 2025-06-13 15:32:18 发布