Learn along with the course: https://edu.youkuaiyun.com/course/play/24797/282223?utm_source=blogtoedu
Generate a spider:
scrapy genspider job_position 'zhipin.com'
job_position: the name of the spider (class) to be created
zhipin.com: the domain the spider is allowed to crawl
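For reference, running the genspider command above drops a spider skeleton into the project's spiders/ directory that looks roughly like the following (the exact template varies slightly between Scrapy versions); the notes below then use a hand-written spider named test_scrapy instead of job_position:

import scrapy


class JobPositionSpider(scrapy.Spider):
    name = 'job_position'
    allowed_domains = ['zhipin.com']
    start_urls = ['http://zhipin.com/']

    def parse(self, response):
        # fill in the parsing logic here
        pass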
# -*- coding: utf-8 -*-
import scrapy
from ZhipinSpider.items import ZhipinspiderItem

# view-source:https://www.zhipin.com/job_detail/?query=&city=101120200&industry=&position=110101
class TestScrapySpider(scrapy.Spider):
    # name of the spider
    name = 'test_scrapy'
    # restrict crawling to these domains only
    allowed_domains = ['zhipin.com']
    # the page to start crawling from
    start_urls = ['https://www.zhipin.com/job_detail/?query=&city=101120200&industry=&position=110101']
    # response is the target response fetched by the Scrapy downloader
    def parse(self, response):
        # each job-primary element contains one job posting
        for job_primary in response.xpath('//div[@class="job-primary"]'):
            item = ZhipinspiderItem()
            # the DIV that holds the job information
            info_primary = job_primary.xpath('./div[@class="info-primary"]')
            work_primary = info_primary.xpath('./div[@class="primary-wrapper"]/div[@class="primary-box"]')
            # job title
            item['title'] = work_primary.xpath(
                './div[@class="job-title"]/span[@class="job-name"]/a/text()').extract_first()
            # salary
            item['salary'] = work_primary.xpath(
                './div[@class="job-limit clearfix"]/span[@class="red"]/text()').extract_first()
            # link to the job detail page
            item['url'] = work_primary.xpath(
                './div[@class="job-title"]/span[@class="job-name"]/a/@href').extract_first()
            # work location
            item['work_addr'] = work_primary.xpath(
                './div[@class="job-title"]/span[@class="job-area-wrapper"]/span[@class="job-area"]/text()').extract_first()
            # recruiting company
            company_primary = job_primary.xpath('./div[@class="info-company"]/div[@class="company-text"]')
            item['company'] = company_primary.xpath('./h3/a/text()').extract_first()
            # the <p> under company-text holds several text nodes: industry, financing stage, company size
            company_info = company_primary.xpath('./p/text()').extract()
            if company_info and len(company_info) > 0:
                # industry
                item['industry'] = company_info[0]
            if company_info and len(company_info) > 2:
                # company size
                item['company_size'] = company_info[2]
            # alternative: on some page layouts the industry sits in an <a> inside the <p>
            # item['industry'] = company_primary.xpath('./p/a/text()').extract_first()
            # recruiter (field declared in items.py but not extracted here)
            # item['recruiter'] = scrapy.Field()
            # hand the populated item over to the item pipeline
            yield item
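The spider imports ZhipinspiderItem from ZhipinSpider/items.py, which is not shown in these notes. A minimal sketch, assuming it simply declares the fields the spider fills in (plus the recruiter field referenced in the commented-out line), could look like this:

import scrapy


class ZhipinspiderItem(scrapy.Item):
    # fields populated by TestScrapySpider
    title = scrapy.Field()         # job title
    salary = scrapy.Field()        # salary range
    url = scrapy.Field()           # link to the job detail page
    work_addr = scrapy.Field()     # work location
    company = scrapy.Field()       # recruiting company
    industry = scrapy.Field()      # industry of the company
    company_size = scrapy.Field()  # company size
    recruiter = scrapy.Field()     # recruiter (not extracted by this spider yet)

With the item defined, the spider can be run from the project root with something like scrapy crawl test_scrapy -o jobs.json, which writes every yielded item to a JSON file (the crawl name follows the name = 'test_scrapy' attribute above).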