# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from AjaxProject import settings
class AjaxprojectPipeline(object):
    #********** Begin **********#
    def __init__(self):
        self.client = pymongo.MongoClient(
            host=settings.MONGODB_HOST,
            port=settings.MONGODB_PORT
        )
        self.db = self.client['lagou']
        self.collection = self.db['zhaopin']

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))
        return item
    #********** End **********#
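The pipeline opens a MongoClient but never closes it. Scrapy calls an optional close_spider hook when a spider finishes, so a small hedged addition (not part of the original answer) could release the connection there:

    # Inside AjaxprojectPipeline -- optional cleanup hook, not in the original code
    def close_spider(self, spider):
        self.client.close()  # release the MongoDB connection when the crawl ends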
Enable the pipeline in settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for AjaxProject project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'AjaxProject'

SPIDER_MODULES = ['AjaxProject.spiders']
NEWSPIDER_MODULE = 'AjaxProject.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = True

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'AjaxProject.middlewares.AjaxprojectSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
MY_USER_AGENT = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
]
DOWNLOADER_MIDDLEWARES = {
    # Disable the built-in UserAgentMiddleware here, otherwise it would override our custom one
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'AjaxProject.middlewares.MyUserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
}
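The custom AjaxProject.middlewares.MyUserAgentMiddleware registered above is not shown in this section. A typical implementation draws a random entry from MY_USER_AGENT for each request; the following is a minimal sketch assuming that design, not the project's actual middleware:

# middlewares.py (sketch)
import random

class MyUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the MY_USER_AGENT list out of settings.py
        return cls(user_agents=crawler.settings.get('MY_USER_AGENT'))

    def process_request(self, request, spider):
        # Attach a randomly chosen User-Agent to every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)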
ITEM_PIPELINES = {
    'AjaxProject.pipelines.AjaxprojectPipeline': 300,
}
DOWNLOAD_DELAY = 1.5
COOKIE ={"user_trace_token":"20171109093921 - c87e4dd6 - 6116 - 4060 - a976 - 38df4f6dfc1c","_ga = GA1":".2.1308939382.1510191563","LGUID":"20171109093922 - cff5ddb7 - c4ee - 11e7 - 985f - 5254005c3644","JSESSIONID":"ABAAABAAAGGABCBAE8E3FCEFC061F7CF2860681B1BF3D98","X_HTTP_TOKEN":"0f2396abe975f6a09df1c0b8a0a3a258","showExpriedIndex":"1","showExpriedCompanyHome":"1","showExpriedMyPublish":"1","hasDeliver":"12","index_location_city":"% E6 % B7 % B1 % E5 % 9C % B3","TG - TRACK - CODE":"index_user","login":"false","unick":"","_putrc":"","LG_LOGIN_USER_ID":"","_gat":"1","LGSID":"20180720141029 - 99fde5eb - 8be3 - 11e8 - 9e4d - 5254005c3644","PRE_UTM":"","PRE_HOST":"","PRE_SITE":"","PRE_LAND":"https % 3A % 2F % 2Fwww.lagou.com % 2Fzhaopin % 2F","SEARCH_ID":"5ddd3d52f1d94b45820534b397ef21e6","LGRID":"20180720141849 - c455d124 - 8be4 - 11e8 - 9e4d - 5254005c3644",}
Level 2: Importing data in Jupyter Notebook
#********** Begin **********#
# 1. Import the base packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
# 2. Read the data
path = 'step2/'
filePath = path + 'positions.csv'
df = pd.read_csv(filePath, encoding='gbk')  # the CSV file is GBK-encoded

# 3. Reorder the columns
display_columns = ["salarylow", "salaryhigh", "company", "positionname", "index_id", "position"]
df = df.reindex(columns=display_columns)
#********** End **********#
d5 = df.head()
print(d5)  # show the first 5 rows
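Worth noting: reindex(columns=...) both reorders and filters, so any CSV column not listed in display_columns is dropped, and a listed column missing from the CSV would be created and filled with NaN. A quick illustration on toy data (column names here are made up for the example):

import pandas as pd

toy = pd.DataFrame({'b': [1], 'a': [2], 'c': [3]})
print(toy.reindex(columns=['a', 'b']))
#    a  b
# 0  2  1    <- 'c' is dropped; order follows the list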
Level 3: Pie chart of education requirements in job postings
#********** Begin **********#
# 1. Import the base packages
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import re
# Prevent garbled Chinese characters in the plots
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'

# 2. Load the data file
path = r'step3/'
filePath = path + r'positions.csv'
df = pd.read_csv(filePath, encoding='gbk')

# 3. Analyze the data
edu_count = df['edu'].value_counts()
edu_labels = edu_count.index.tolist()
edu_values = edu_count.tolist()

# 4. Plot
fig, ax = plt.subplots()
ax.pie(edu_values, autopct='%1.1f%%', labels=edu_labels)
fig.set_size_inches(7, 7)
#********** End **********#
plt.savefig(path + r'yourimg/' + r'pie.png')  # save the figure
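plt.savefig raises an error if the yourimg directory does not already exist. On the grading platform it presumably does; if you reproduce this locally, create it first (a hedged addition, not in the original answer):

import os
os.makedirs(path + 'yourimg', exist_ok=True)  # ensure the output directory exists before savefig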
Level 4: Bar chart of minimum salaries
#********** Begin **********#
# 1. Import the base packages
import numpy as np
import pandas as pd
import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import re
# 2. Load the data file
path = r'step4/'
filePath = path + r'positions.csv'
df = pd.read_csv(filePath, encoding='gbk')

# 3. Analyze the data
b1 = pd.DataFrame(df['salarylow'].value_counts())
b1.sort_index(inplace=True)
X = b1.index.tolist()
Y = list(b1.salarylow)

# 4. Plot
x = np.arange(len(X)) + 1
width = 0.5
fig, ax = plt.subplots()
ax.bar(x, Y, width)
for p in ax.patches:
    # Label each bar with its height (the number of postings)
    ax.annotate(str(p.get_height()), xy=(p.get_x(), p.get_height()))
#********** End **********#
plt.savefig(path + r'yourimg/' + r'bar.png')  # save the figure
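On matplotlib 3.4 or newer, the annotate loop can be replaced by the built-in bar_label helper, which centers each label above its bar; a minimal sketch of that variant:

# Variant: replaces "ax.bar(x, Y, width)" plus the annotate loop above
bars = ax.bar(x, Y, width)  # ax.bar returns a BarContainer
ax.bar_label(bars)          # writes each bar's height above it (matplotlib >= 3.4)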
Level 5: Salary line chart
#********** Begin **********#
# 1. Import the base packages
import numpy as np
import pandas as pd
import matplotlib
# Force matplotlib not to use any Xwindows backend
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import re
# Prevent garbled Chinese characters in the plots
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family'] = 'sans-serif'

# 2. Load the data file
path = r'step5/'
filePath = path + r'positions.csv'
df = pd.read_csv(filePath, encoding='gbk')

# 3. Analyze the data
c = pd.DataFrame(df['salarylow'].value_counts())
c.sort_index(inplace=True)
X1 = c.index.tolist()
Y1 = list(c.salarylow)
d = pd.DataFrame(df['salaryhigh'].value_counts())
d.sort_index(inplace=True)
X2 = d.index.tolist()
Y2 = list(d.salaryhigh)
x1 = np.arange(len(X1)) + 1
x2 = np.arange(len(X2)) + 1

# 4. Plot
plt.title('薪资走势图')  # set the chart title ("salary trend")
plt.plot(x1, Y1, color='g', label='salarylow')
plt.plot(x2, Y2, color='red', label='salaryhigh')
plt.legend()  # show the legend
plt.xlabel('薪资')   # x label: salary
plt.ylabel('职位数')  # y label: number of postings
#********** End **********#
plt.savefig(path + r'yourimg/' + r'plot.png')  # save the figure
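One caveat: both curves are plotted against the positional indices x1/x2 rather than the salary values themselves, so the x axis shows rank order, not actual salary. To label the ticks with real salaries you could map them back; a sketch, assuming X1 holds the sorted salary values:

plt.xticks(x1, X1, rotation=45)  # replace positional ticks with the actual salary values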