头歌: Scrapy Crawler for Lagou Recruitment Data Analysis


Level 1: Store Scrapy-crawled data in MongoDB

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from AjaxProject import settings
class AjaxprojectPipeline(object):
    #********** Begin **********#
    def __init__(self):
        # connect to MongoDB using the host/port configured in settings.py
        self.client = pymongo.MongoClient(
            host=settings.MONGODB_HOST,
            port=settings.MONGODB_PORT
        )
        self.db = self.client['lagou']
        self.collection = self.db['zhaopin']

    def process_item(self, item, spider):
        # store every scraped item as a document in the lagou.zhaopin collection
        self.collection.insert_one(dict(item))
        return item
    #********** End **********#
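
Once the pipeline is enabled in settings.py (below), every scraped item ends up as a document in the lagou.zhaopin collection. A minimal sketch for verifying the inserts from a separate Python shell, assuming MongoDB is running locally on the default port:

import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)  # assumed local instance
collection = client['lagou']['zhaopin']
print(collection.count_documents({}))  # number of stored positions
print(collection.find_one())           # peek at one sample document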

Enable the pipeline in settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for AjaxProject project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'AjaxProject'

SPIDER_MODULES = ['AjaxProject.spiders']
NEWSPIDER_MODULE = 'AjaxProject.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'AjaxProject.middlewares.AjaxprojectSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
MY_USER_AGENT = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
]
DOWNLOADER_MIDDLEWARES = {
    # disable the built-in UserAgentMiddleware, otherwise it overrides our custom one
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'AjaxProject.middlewares.MyUserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
}
# MongoDB connection used by AjaxprojectPipeline
# (a local default instance is assumed; adjust host/port to your environment)
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017

ITEM_PIPELINES = {
    'AjaxProject.pipelines.AjaxprojectPipeline': 300,
}
DOWNLOAD_DELAY = 1.5

COOKIE = {
    "user_trace_token": "20171109093921-c87e4dd6-6116-4060-a976-38df4f6dfc1c",
    "_ga": "GA1.2.1308939382.1510191563",
    "LGUID": "20171109093922-cff5ddb7-c4ee-11e7-985f-5254005c3644",
    "JSESSIONID": "ABAAABAAAGGABCBAE8E3FCEFC061F7CF2860681B1BF3D98",
    "X_HTTP_TOKEN": "0f2396abe975f6a09df1c0b8a0a3a258",
    "showExpriedIndex": "1",
    "showExpriedCompanyHome": "1",
    "showExpriedMyPublish": "1",
    "hasDeliver": "12",
    "index_location_city": "%E6%B7%B1%E5%9C%B3",
    "TG-TRACK-CODE": "index_user",
    "login": "false",
    "unick": "",
    "_putrc": "",
    "LG_LOGIN_USER_ID": "",
    "_gat": "1",
    "LGSID": "20180720141029-99fde5eb-8be3-11e8-9e4d-5254005c3644",
    "PRE_UTM": "",
    "PRE_HOST": "",
    "PRE_SITE": "",
    "PRE_LAND": "https%3A%2F%2Fwww.lagou.com%2Fzhaopin%2F",
    "SEARCH_ID": "5ddd3d52f1d94b45820534b397ef21e6",
    "LGRID": "20180720141849-c455d124-8be4-11e8-9e4d-5254005c3644",
}
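
DOWNLOADER_MIDDLEWARES above points at AjaxProject.middlewares.MyUserAgentMiddleware, which is not shown in this post. A minimal sketch of such a middleware, assuming it simply picks a random entry from MY_USER_AGENT for each request (the exercise's own middlewares.py may differ):

# AjaxProject/middlewares.py (sketch)
import random

class MyUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # read the MY_USER_AGENT list defined in settings.py
        return cls(user_agents=crawler.settings.get('MY_USER_AGENT'))

    def process_request(self, request, spider):
        # attach a randomly chosen User-Agent to every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)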



Level 2: Import the data in Jupyter Notebook

#********** Begin **********#
#1. Import the base packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

#2. Read the data
path = 'step2/'
filePath = path + 'positions.csv'
df = pd.read_csv(filePath, encoding='gbk')  # read the CSV file with GBK encoding

#3. Reorder the columns
display_columns = ["salarylow", "salaryhigh", "company", "positionname", "index_id", "position"]
df = df.reindex(columns=display_columns)

#********** End **********#
d5 = df.head()
print(d5)  # print the first 5 rows
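
The positions.csv used here (and in the later levels) is the dataset scraped in level 1. If you ever need to regenerate it from MongoDB yourself, a rough sketch along these lines would work; the exact column set and the GBK encoding are assumptions based on the code in this post:

import pandas as pd
import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)    # assumed local instance
docs = list(client['lagou']['zhaopin'].find({}, {'_id': 0}))  # drop the MongoDB _id field
pd.DataFrame(docs).to_csv('step2/positions.csv', index=False, encoding='gbk')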

Level 3: Pie chart of education requirements

 
#********** Begin **********#
#1. Import the base packages
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import re
 
# prevent garbled Chinese labels in the plots
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'
 
#2. Load the data file
path = r'step3/'
filePath = path + r'positions.csv'
df = pd.read_csv(filePath,encoding = 'gbk')
#3. Analyze the data
edu_count = df['edu'].value_counts()
edu_labels = edu_count.index.tolist()
edu_values = edu_count.tolist()
#4. Plot
fig, ax = plt.subplots()
ax.pie(edu_values, autopct='%1.1f%%', labels=edu_labels)
fig.set_size_inches(7, 7)
#********** End **********#
plt.savefig(path+r'/yourimg/'+r'pie.png')  # save the figure

Level 4: Bar chart of minimum salaries

#********** Begin **********#
#1. Import the base packages
import numpy as np
import pandas as pd
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import re

#2. Load the data file
path = r'step4/'
filePath = path+r'positions.csv'
df = pd.read_csv(filePath,encoding='gbk')
#3. Analyze the data
b1 = pd.DataFrame(df['salarylow'].value_counts())
b1.sort_index(inplace=True)
X = b1.index.tolist()
Y = list(b1.salarylow)
#4. Plot
x = np.arange(len(X))+1
width = 0.5
fig,ax = plt.subplots()
ax.bar(x,Y,width)
for p in ax.patches:
    ax.annotate(str(p.get_height()), xy=(p.get_x(), p.get_height()))
#********** End **********#
plt.savefig(path+r'/yourimg/'+r'bar.png')  # save the figure
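
Because x is just the positions 1..N, the ticks on the saved bar chart do not show the actual minimum salary values. If you want the bars labeled with the values in X, an optional addition placed before the plt.savefig call would do it:

ax.set_xticks(x)                    # put a tick at every bar
ax.set_xticklabels(X, rotation=45)  # label ticks with the actual minimum salaries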


Level 5: Salary line chart

#********** Begin **********#
#1. Import the base packages
import numpy as np
import pandas as pd
import matplotlib
# force matplotlib not to use any X Window backend
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import re
# prevent garbled Chinese labels in the plots
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'

#2. Load the data file
path = r'step5/'
filePath = path + r'positions.csv'
df = pd.read_csv(filePath,encoding = 'gbk')
#3. Analyze the data
c = pd.DataFrame(df['salarylow'].value_counts())
c.sort_index(inplace=True)
X1 = c.index.tolist()
Y1 = list(c.salarylow)

d = pd.DataFrame(df['salaryhigh'].value_counts())
d.sort_index(inplace=True)
X2 = d.index.tolist()
Y2 = list(d.salaryhigh)
x1 = np.arange(len(X1))+1
x2 = np.arange(len(X2))+1
#4. Plot
plt.title('薪资走势图')  # set the chart title
plt.plot(x1, Y1, color='g', label='salarylow')
plt.plot(x2, Y2, color='red', label='salaryhigh')
plt.legend()  # show the legend
plt.xlabel('薪资')   # x-axis: salary
plt.ylabel('职位数')  # y-axis: number of positions
#********** End **********#
plt.savefig(path+r'/yourimg/'+r'plot.png')  # save the figure

