qczj.py
import scrapy

from scrapy2.items import qczj2Item


class QczjSpider(scrapy.Spider):
    name = "qczj"
    # allowed_domains = ["qczj.com"]
    start_urls = ["https://car.autohome.com.cn/price/list-0-0-0-0-0-0-1-0-0-0-0-0-0-0-0-1.html"]
    # Alternative: generate all listing-page URLs up front instead of chaining requests.
    # for page in range(1, 5):
    #     url = f'https://car.autohome.com.cn/price/list-0-0-0-0-0-0-1-0-0-0-0-0-0-0-0-{page}.html'
    #     start_urls.append(url)
    page_num = 1

    def parse(self, response):
        divs = response.xpath('//div[@class="list-cont"]')
        print(len(divs))
        for div in divs:
            # xpath() returns a SelectorList; each element in it is a Selector object.
            # get() returns a single string; extract_first() is its older alias.
            title = div.xpath('.//a[@class="font-bold"]/text()').get()
            pingfen = div.xpath('.//span[@class="score-number"]/text()').get()
            # getall() returns a list of strings.
            info = div.xpath('.//ul[@class="lever-ul"]/li//text()').getall()
            info = ''.join(info)
            price = div.xpath('.//span[@class="font-arial"]/text()').get()
            items = qczj2Item()
            items['title'] = title
            items['pingfen'] = pingfen
            items['info'] = info
            items['price'] = price
            items['page'] = self.page_num
            detail_url = 'https://car.autohome.com.cn' + div.xpath('.//a[@class="font-bold"]/@href').get()
            # print(title, pingfen, info, price, detail_url)
            # Build the detail-page request by hand, carrying the half-filled item in meta.
            yield scrapy.Request(url=detail_url, callback=self.detail_parse, meta={'i': items})
        self.page_num += 1
        if self.page_num > 3:
            return
        next_url = f'https://car.autohome.com.cn/price/list-0-0-0-0-0-0-1-0-0-0-0-0-0-0-0-{self.page_num}.html'
        # Request the next listing page; its response comes back to parse() for the same treatment.
        yield scrapy.Request(url=next_url, callback=self.parse)

    def detail_parse(self, response):
        # Car models listed on the detail page.
        cx = response.xpath('//div[@id="divSeries"]//li//p//a/text()').getall()
        cx = ','.join(cx)
        items = response.meta['i']
        items['cx'] = cx
        yield items
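Passing the half-filled item through meta works, but meta is also used by Scrapy itself and its middlewares. On Scrapy 1.7+ the dedicated cb_kwargs argument is the cleaner channel for callback data; a minimal sketch of the same hand-off, reusing the XPaths above:

# Sketch: cb_kwargs (Scrapy >= 1.7) instead of meta for the item hand-off.
yield scrapy.Request(url=detail_url, callback=self.detail_parse, cb_kwargs={'items': items})

def detail_parse(self, response, items):
    # cb_kwargs entries arrive as extra keyword arguments on the callback.
    items['cx'] = ','.join(response.xpath('//div[@id="divSeries"]//li//p//a/text()').getall())
    yield items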
items.py
import scrapy


class qczj2Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # title, pingfen, info, price, cx
    title = scrapy.Field()
    pingfen = scrapy.Field()
    info = scrapy.Field()
    price = scrapy.Field()
    cx = scrapy.Field()
    page = scrapy.Field()
pipelines.py
from openpyxl import Workbook


class ExcelPipeline:
    def open_spider(self, spider):
        self.wb = Workbook()
        # self.sh = self.wb.active
        # self.sh.title = "汽车之家"
        # self.sh.append(['名称', '评分', '功能', '价格', '车型'])

    def process_item(self, item, spider):
        # One worksheet per listing page; create it with a header row the
        # first time an item from that page arrives.
        sheet_name = "第%s页" % item['page']
        if sheet_name in self.wb.sheetnames:
            self.sh = self.wb[sheet_name]
        else:
            self.sh = self.wb.create_sheet(sheet_name)
            self.sh.append(['名称', '评分', '功能', '价格', '车型'])
        self.sh.append([item['title'], item['pingfen'], item['info'], item['price'], item['cx']])
        return item

    # Runs once after the spider finishes.
    def close_spider(self, spider):
        # Drop the empty default sheet that Workbook() creates.
        self.wb.remove(self.wb.active)
        self.wb.save('汽车之家.xlsx')
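The output path is hardcoded above. If you would rather configure it in settings.py, the pipeline can read it from the spider's settings; a small sketch, where EXCEL_OUTPUT is a made-up key rather than anything Scrapy defines:

# Sketch: configurable output path (EXCEL_OUTPUT is a hypothetical custom key).
def close_spider(self, spider):
    self.wb.remove(self.wb.active)
    self.wb.save(spider.settings.get('EXCEL_OUTPUT', '汽车之家.xlsx'))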
settings.py
ITEM_PIPELINES = {
"scrapy2.pipelines.ExcelPipeline": 300,
}
For the other settings, refer to the previous post.
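As a rough, hedged sketch of what such a settings file commonly needs for a crawl like this (these values are assumptions, not copied from the previous post):

# Assumed typical settings, not taken from the previous post:
ROBOTSTXT_OBEY = False  # the default True can stop the crawl outright
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"  # many sites reject Scrapy's default UA
LOG_LEVEL = "WARNING"  # quieter console output

Then run the spider from the project root with: scrapy crawl qczj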