### Implementing a Python Crawler That Scrapes Lagou Job Postings into MySQL
#### Building the Crawler with the Scrapy Framework
To complete this task efficiently, Scrapy, a mature web-crawling framework, is a good fit. Create a new Scrapy project to handle data collection from Lagou.
Install the dependencies:
```bash
pip install scrapy pymysql
```
Initialize a new Scrapy project:
```bash
scrapy startproject lagou_spider
cd lagou_spider
```
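This produces a scaffold along the following lines (exact contents can vary slightly between Scrapy versions); the `items.py`, `pipelines.py`, and `settings.py` files edited below all live in the inner `lagou_spider/` package:
```text
lagou_spider/
├── scrapy.cfg
└── lagou_spider/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── __init__.py
```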
Define an Item class describing the fields of each record by adding the following code to `items.py`:
```python
import scrapy


class LagouJobItem(scrapy.Item):
    title = scrapy.Field()          # job title
    salary_min = scrapy.Field()     # minimum salary
    salary_max = scrapy.Field()     # maximum salary
    city = scrapy.Field()           # city
    work_years = scrapy.Field()     # experience requirement
    education = scrapy.Field()      # education requirement
    job_advantage = scrapy.Field()  # perks / selling points
    job_desc = scrapy.Field()       # job description
    company_name = scrapy.Field()   # company name
    url = scrapy.Field()            # page URL
    url_object_id = scrapy.Field()  # MD5 hash of the URL
```
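The spider module used in the next step can be written by hand under `spiders/`, or scaffolded with Scrapy's built-in generator and then filled in:
```bash
scrapy genspider lagou_jobs www.lagou.com
```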
Write the spider that parses the page content; assume the file is named `lagou_jobs.py`:
```python
import hashlib
import json
from urllib.parse import urlencode

import scrapy

from ..items import LagouJobItem


# Custom utility that MD5-hashes a URL for use as a stable unique key[^2]
def get_md5(url):
    return hashlib.md5(url.encode('utf-8')).hexdigest()


class LagouJobsSpider(scrapy.Spider):
    name = 'lagou_jobs'
    allowed_domains = ['www.lagou.com']

    def start_requests(self):
        base_url = 'https://www.lagou.com/jobs/positionAjax.json?'
        params = {'needAddtionalResult': 'false', 'isSchoolJob': '0'}
        for page_num in range(1, 3):  # controls how many result pages are fetched
            form_data = {
                'first': 'true' if page_num == 1 else 'false',
                'pn': str(page_num),
                'kd': 'python',  # search keyword, matching the Referer below
            }
            yield scrapy.FormRequest(
                url=base_url + urlencode(params),
                formdata=form_data,  # POSTed as form-encoded data, as the endpoint expects
                headers={'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='},
                callback=self.parse_job_listings)

    def parse_job_listings(self, response):
        results = json.loads(response.text)['content']['positionResult']['result']
        for result in results:
            item = LagouJobItem()
            try:
                item['title'] = result.get('positionName')
                salary_range = (result.get('salary') or '').split('-')
                if len(salary_range) == 2:
                    item['salary_min'], item['salary_max'] = (
                        s.strip('kK') for s in salary_range)
                else:
                    item['salary_min'], item['salary_max'] = None, None
                item['city'] = result.get('city')
                item['work_years'] = result.get('workYear')
                item['education'] = result.get('education')
                item['company_name'] = result.get('companyShortName')
                item['url'] = f"https://www.lagou.com/jobs/{result.get('positionId')}.html"
                item['url_object_id'] = get_md5(item['url'])
            except Exception as e:
                self.logger.error('Parsing error: %s', e)
                continue
            yield scrapy.Request(
                item['url'],
                meta={'item': item},
                callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        item = response.meta['item']
        try:
            item['job_advantage'] = ''.join(
                i.strip() for i in
                response.xpath('//dd[@class="job-advantage"]/p/text()').getall())
            item['job_desc'] = ''.join(
                i.strip().replace('\n', ' ') for i in
                response.xpath('//div[contains(@class,"job-detail")]//text()').getall())
        except Exception as e:
            self.logger.error('Detail page parsing error: %s', e)
            item.setdefault('job_advantage', '')
            item.setdefault('job_desc', '')
        yield item
```
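As a quick sanity check of the salary-parsing branch above, assuming Lagou returns salary strings in a `15k-25k` style format (an assumption about the live API), the logic can be exercised outside Scrapy:
```python
def parse_salary(salary):
    """Split a '15k-25k' style string into (min, max); anything else yields (None, None)."""
    parts = (salary or '').split('-')
    if len(parts) == 2:
        return tuple(s.strip('kK') for s in parts)
    return None, None

assert parse_salary('15k-25k') == ('15', '25')
assert parse_salary('面议') == (None, None)  # "negotiable": no numeric range
assert parse_salary(None) == (None, None)
```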
Configure a pipeline so data can be written to MySQL: activate the pipeline component in `settings.py` and set the database connection parameters, then add the following logic to `pipelines.py` to perform the MySQL operations[^3]:
```python
import pymysql.cursors
from twisted.enterprise import adbapi


class MySQLTwistedPipeline(object):
    """Writes items to MySQL asynchronously through Twisted's adbapi connection pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        db_params = dict(
            host=settings['MYSQL_HOST'],
            port=settings['MYSQL_PORT'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            database=settings['MYSQL_DBNAME'],
            charset='utf8mb4',  # utf8mb4 also covers 4-byte characters in job descriptions
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        return cls(dbpool=adbapi.ConnectionPool('pymysql', **db_params))

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # error handling
        return item

    def do_insert(self, cursor, item):
        insert_sql = """
            INSERT INTO jobs(title, salary_min, salary_max, city, work_years,
                             education, job_advantage, job_desc, company_name,
                             url, url_object_id)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        params = (item['title'], item['salary_min'], item['salary_max'],
                  item['city'], item['work_years'], item['education'],
                  item['job_advantage'], item['job_desc'],
                  item['company_name'], item['url'], item['url_object_id'])
        cursor.execute(insert_sql, params)

    def handle_error(self, failure, item, spider):
        spider.logger.error(str(failure))
```
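The pipeline assumes a `jobs` table already exists. A minimal creation sketch is shown below; the column types are assumptions based on the item fields, and the connection values are placeholders that should match whatever you put in `settings.py`:
```python
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS jobs (
    url_object_id VARCHAR(32) NOT NULL PRIMARY KEY,  -- MD5 of the job URL
    title         VARCHAR(255),
    salary_min    VARCHAR(16),
    salary_max    VARCHAR(16),
    city          VARCHAR(64),
    work_years    VARCHAR(64),
    education     VARCHAR(64),
    job_advantage TEXT,
    job_desc      TEXT,
    company_name  VARCHAR(255),
    url           VARCHAR(512)
) CHARACTER SET utf8mb4
"""

# Placeholder credentials: use the same values configured in settings.py
conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='', database='lagou')
try:
    with conn.cursor() as cursor:
        cursor.execute(DDL)
    conn.commit()
finally:
    conn.close()
```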
Finally, update the project's `settings.py` to enable the custom pipeline and fill in the details of your MySQL server:
```python
ITEM_PIPELINES = {
    'lagou_spider.pipelines.MySQLTwistedPipeline': 300,
}

# MySQL connection details
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = ''
MYSQL_PASSWORD = ''
MYSQL_DBNAME = ''

DOWNLOAD_DELAY = 1       # throttle downloads to reduce the risk of an IP ban
COOKIES_ENABLED = False  # the cookies middleware can interfere with spoofed request headers
USER_AGENT = ''          # optional: set a browser-like User-Agent string
ROBOTSTXT_OBEY = False   # ignore robots.txt so more pages can be reached
```
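With everything in place, launch the crawl by the spider's `name` and, once it finishes, spot-check the results from the MySQL client (here `lagou` stands in for whatever `MYSQL_DBNAME` you configured):
```bash
scrapy crawl lagou_jobs
mysql -u root -p -e "SELECT COUNT(*) FROM lagou.jobs;"
```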
With the steps above, you have a complete Scrapy-based crawler that extracts job listings and their detail pages from Lagou and stores them reliably in a table in a locally running MySQL instance.