# Build an absolute URL: the site root plus the first href of an <a> child
# of `li` (presumably a Scrapy selector for one list entry -- confirm).
# Indexing [0] raises IndexError when no such href was matched.
first_url = 'http://jbk.39.net' + li.xpath('a/@href').extract()[0]
2.如何获取到整个页面的标签
# Selecting elements (not text nodes) and calling extract() gives back HTML markup
cause_link = response.xpath("//*[@class='intro']").extract()
# # Gives back HTML markup (same pattern with other class names)
# cause_link = response.xpath("//*[@class='intro']").extract()
# cause_link2 = response.xpath("//*[@class='info']").extract()
# cause_link3 = response.xpath("//*[@class='art-box']").extract()
# Selecting text() nodes instead gives back only the text inside the matched <p> elements
cause_link2 = response.xpath('/html/body/section/div[3]/div[1]/div/div[2]/p/text()').extract()
3.如何将列表转换为字符串
# extract() yields a list of text fragments, one per matched text node
item['cause'] = response.xpath('/html/body/section/div[3]/div[1]/div/div[2]/p/text()').extract()
# Concatenate the fragments into a single string (no separator)
str1 = ''.join(item['cause'])
# Double every single quote.
# NOTE(review): this looks like manual escaping for a string-built SQL
# statement -- confirm, and prefer parameterized queries if so.
item['cause'] = str1.replace("'", "''")
4.Request
def parse(self, response):
    """Yield one follow-up Request per entry found on the listing page.

    For every ``div`` under the ``#mainBox`` listing a fresh ``XymcsdnItem``
    is created, its ``title`` is filled in when available, and the item is
    handed to ``self.parse_detail`` through the request's ``meta`` dict.

    Entries without an ``h4/a`` link are skipped instead of raising.
    """
    entries = response.xpath('//*[@id="mainBox"]/main/div[2]/div')
    for xq in entries:
        item = XymcsdnItem()
        title_parts = xq.xpath('h4/a/text()').extract()
        # The title is taken from the second text node, so two nodes must
        # exist; a "> 0" check would let a one-element list raise IndexError.
        if len(title_parts) > 1:
            item['title'] = title_parts[1].strip()
        hrefs = xq.xpath('h4/a/@href').extract()
        if not hrefs:
            # Nothing to follow for this entry.
            continue
        yield Request(hrefs[0], meta={'item': item}, callback=self.parse_detail)
item = response.meta['item'] # get the item from before (Request)
# item = Jb39Item()# use function to create a new item