***This article is a personal learning note.***
1. Parsing the page data
The rules tell the CrawlSpider which pages to follow; matching job-detail pages are handed off via the callback to parse_item, where the data is extracted field by field with an ItemLoader.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from MySpider.utils.common import get_md5  # md5 hash of the URL string
from MySpider.items import liepinJobItemLoader, liepinJobItem  # custom ItemLoader and Item


class LiepinSpider(CrawlSpider):
    name = 'liepin'
    allowed_domains = ['www.liepin.com']
    start_urls = ['https://www.liepin.com/']

    rules = (
        # Follow category / listing pages without parsing them
        Rule(LinkExtractor(allow=("career/.*",)), follow=True),
        Rule(LinkExtractor(allow=("zhaopin/.*",)), follow=True),
        Rule(LinkExtractor(allow=("lptjob/.*",)), follow=True),
        # Job detail pages are handed to parse_item
        Rule(LinkExtractor(allow=r'job/\d+\.shtml'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item_loader = liepinJobItemLoader(item=liepinJobItem(), response=response)
        item_loader.add_xpath("title", "/html/body/section[3]/div[1]/div[1]/span[1]/text()")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_xpath("salary", "/html/body/section[3]/div[1]/div[1]/span[2]/text()")
        item_loader.add_xpath("job_city", '/html/body/section[3]/div[1]/div[2]/span[1]/text()')
        item_loader.add_xpath('work_years', '/html/body/section[3]/div[1]/div[2]/span[3]/text()')
        item_loader.add_xpath('degree_need', '/html/body/section[3]/div[1]/div[2]/span[5]/text()')
        crawl_tags = response.xpath('/html/body/main/content/section[2]/dl/div/ul/li/text()').extract()
        item_loader.add_value('tags', ",".join(crawl_tags))
        job_advantage = response.xpath('/html/body/section[4]/div/div[1]/span/text()').extract()
        item_loader.add_value('job_advantage', ",".join(job_advantage))
        item_loader.add_xpath('job_desc', '/html/body/main/content/section[2]/dl[1]/dd/text()')
        # The address block can appear in two different positions, so try both XPaths
        item_loader.add_xpath('job_addr', [
            '/html/body/main/aside/div[2]/div[2]/div[4]/span[2]/text()',
            '/html/body/main/aside/div[3]/div[2]/div[3]/span[2]/text()',
        ])
        item_loader.add_xpath('company_name', '/html/body/main/aside/div[2]/div[1]/div[1]/div[1]/text()')

        job_item = item_loader.load_item()
        return job_item
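The get_md5 helper imported from MySpider.utils.common is not shown in this note. A minimal sketch, assuming it simply hashes the URL into a fixed-length string for url_object_id:

# MySpider/utils/common.py -- assumed implementation, the real helper may differ
import hashlib

def get_md5(url):
    # md5 works on bytes, so encode str input first
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()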
2. Writing items.py
Write a get_insert_sql() method on the item that builds the SQL statement and returns the insert statement together with its parameters (insert_sql, params) to the pipeline, which then writes the data into MySQL asynchronously.
import datetime
import scrapy

# SQL_DATETIME_FORMAT is assumed to be defined in settings.py (see the settings snippet below)
from MySpider.settings import SQL_DATETIME_FORMAT


class liepinJobItem(scrapy.Item):
    # Liepin job posting fields
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field()
    work_years = scrapy.Field()
    degree_need = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_addr = scrapy.Field()
    company_name = scrapy.Field()
    tags = scrapy.Field()
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        insert_sql = """
            insert into liepin(title, url, url_object_id, salary, job_city, work_years,
            degree_need, job_advantage, job_desc, job_addr, company_name, tags, crawl_time)
            values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        crawl_time = datetime.datetime.now().strftime(SQL_DATETIME_FORMAT)
        params = (
            self['title'], self['url'], self['url_object_id'], self['salary'],
            self['job_city'], self['work_years'], self['degree_need'], self['job_advantage'],
            self['job_desc'], self['job_addr'], self['company_name'], self['tags'], crawl_time
        )
        return insert_sql, params
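The spider also imports liepinJobItemLoader from items.py, which is not shown above. A minimal sketch, assuming it is an ordinary ItemLoader whose only job is to collapse each extracted list into a single value with TakeFirst():

from itemloaders.processors import TakeFirst  # exposed as scrapy.loader.processors.TakeFirst on older Scrapy
from scrapy.loader import ItemLoader

class liepinJobItemLoader(ItemLoader):
    # Return the first extracted value for every field instead of a list
    default_output_processor = TakeFirst()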
# settings.py -- MySQL connection parameters read by the pipeline
MYSQL_HOST = "localhost"
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = "root"
MYSQL_PASSWORD = "123456"
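get_insert_sql() also reads a SQL_DATETIME_FORMAT constant; assuming it sits in settings.py next to the values above, a plain timestamp format is enough:

# Assumed datetime format used when formatting crawl_time
SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"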
3. Writing pipelines.py
In the do_insert function, the pipeline unpacks the two values returned by get_insert_sql() on the liepinJobItem, then executes the SQL statement to save the data.
# Write items into MySQL asynchronously via Twisted
from MySQLdb.cursors import DictCursor
from twisted.enterprise import adbapi


class MysqlTwistedPipline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # Connection parameters (host, user, password, etc.) live in settings.py
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=DictCursor,
            use_unicode=True,
        )
        # Twisted connection pool on top of the MySQLdb driver
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Run the insert in a pool thread and handle failures asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        print(failure)

    def do_insert(self, cursor, item):
        # Each item builds its own SQL, so this pipeline works for any item type
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
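Finally, the pipeline only runs if it is registered in settings.py. A minimal sketch, assuming the project module is named MySpider as in the imports above:

# Enable the async MySQL pipeline (the priority value 300 is arbitrary)
ITEM_PIPELINES = {
    'MySpider.pipelines.MysqlTwistedPipline': 300,
}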