spider文件如下:
# -*- coding: utf-8 -*-
import scrapy
import re
class GoodsclassnameUrlSpider(scrapy.Spider):
# Identifier string for this spider.
name = 'GoodsClassName_Url'
# Domains associated with this spider (Amazon only).
# NOTE(review): presumably consumed by Scrapy's off-site filtering — confirm
# against the Scrapy version in use.
allowed_domains = ['amazon.com']
# Seed URL list: start_requests iterates it to build the initial requests and
# also passes the whole list along in each request's meta ('end_or_not').
start_urls = ['https://www.amazon.com/gp/new-releases/ref=zg_bsnr_unv_0_amazon-devices_1']
def start_requests(self):
    """Yield the initial request for every URL in ``self.start_urls``.

    Each request is created with ``dont_filter=True`` and carries a meta
    dict with two entries:

    * ``'father'``: the literal string ``'start'`` (``parse`` reads this
      key back from ``response.meta``).
    * ``'end_or_not'``: the ``self.start_urls`` list itself.

    No explicit callback is passed to ``scrapy.Request``.
    """
    for seed_url in self.start_urls:
        # Build a fresh meta dict per request so requests never share one.
        request_meta = {'father': 'start', 'end_or_not': self.start_urls}
        yield scrapy.Request(seed_url, dont_filter=True, meta=request_meta)
def parse(self, response):
str_father_name_url=response.meta['father']
if re.findall("<li><a href='https://www.amazon.com/gp/new-releases", response.text, re.S):
child_name_list = re.findall("<li><a href='https://www.amazon.com/gp/new-releases.*?'>(.*?)</a></li>",
response.text, re.S)
part_child_url_list = re.findall("<li><a href='https://www.amazon.com/gp/new-releases(.*?)'>",