在用 Scrapy 编写爬虫时,如果一个 item 的各个属性不在同一级页面上,可以使用 yield scrapy.Request(url=videolink, meta={'item':item}, callback=self.parseReal2)
通过 meta 把 item 传递给下一级页面的回调函数;回调函数从 response.meta['item'] 中取出该 item,补全其余属性后,再 yield 出最终的 item。
def parseReal(self, response):
    """Start one item per entry of the JSON listing and request its page.

    The response body is parsed as JSON and its ``'data'`` sequence is
    walked. For every entry a new ``YoukuItem`` receives the video id, the
    video URL and the video title, and a request for the video's own page
    is yielded with the item attached in ``meta`` so that ``parseReal2``
    can fill in the remaining fields.

    The title is stored as decoded text, like ``url``, rather than as
    UTF-8 bytes; encoding is left to whatever finally writes the item out.

    Args:
        response: Response whose ``body`` is a JSON document containing a
            ``'data'`` sequence; each entry must provide ``'videoLink'``,
            ``'videoId'`` and ``'title'``.

    Yields:
        scrapy.Request: One request per entry, with
        ``callback=self.parseReal2`` and ``meta={'item': item}``.

    Raises:
        ValueError: If the body is not valid JSON.
        KeyError: If ``'data'`` or one of the per-entry keys is missing.
    """
    entries = json.loads(response.body)['data']
    # An empty 'data' list simply yields nothing.
    for entry in entries:
        item = YoukuItem()
        # The scheme is prepended here — presumably 'videoLink' is a
        # protocol-relative link ("//host/path"); confirm against the API.
        videolink = "https:" + entry['videoLink']
        # Video id
        item['videoid'] = entry['videoId']
        # Video URL
        item['url'] = videolink
        # Video title
        item['videoname'] = entry['title']
        # Hand the half-filled item to the detail-page callback.
        yield scrapy.Request(url=videolink, meta={'item': item}, callback=self.parseReal2)
def parseReal2(self, response):
    """Fill in the detail-page fields of the item carried in ``response.meta``.

    Each field is read with one XPath query against ``response``. A query
    that matches nothing leaves the field as the empty string.

    All values are kept as text. The previous mix of ``encode('utf-8')``,
    ``str(bytes)`` and ``'' + bytes`` yields ``"b'...'"`` strings or raises
    ``TypeError`` on Python 3, and left ``starring`` in a different string
    type from the other fields.

    Args:
        response: Page response; ``response.meta['item']`` is the item to
            complete and ``response.xpath(query).extract()`` must return a
            list of strings.

    Yields:
        The same item object, with ``videobigtype``, ``videolittletype``,
        ``starring``, ``director``, ``score`` and ``area`` set.
    """
    item = response.meta['item']

    def first_text(query):
        """Return the first string matched by *query*, or '' if none."""
        matches = response.xpath(query).extract()
        return matches[0] if matches else ''

    def slash_terminated(query):
        """Return every match of *query*, each followed by '/' ('' if none)."""
        # The trailing '/' after the last entry is kept on purpose so the
        # stored format stays "a/b/", as before.
        return ''.join(text + '/' for text in response.xpath(query).extract())

    # Major category
    item['videobigtype'] = first_text("//li[@class='p-row p-title']/a/text()")
    # Sub-categories
    item['videolittletype'] = slash_terminated("//div[@class='p-base']/ul/li[10]/a/text()")
    # Starring
    item['starring'] = first_text("//div[@class='p-base']/ul/li[7]/@title")
    # Directors
    item['director'] = slash_terminated("//div[@class='p-base']/ul/li[8]/a/text()")
    # Score / rating
    item['score'] = first_text("//div[@class='p-base']/ul/li[6]/span[2]/text()")
    # Region
    item['area'] = first_text("//div[@class='p-base']/ul/li[9]/a/text()")
    yield item