1. Goal
- Scrape basic job-posting information from Lagou (lagou.com) and save it to a MongoDB database.
2. Requirements
- requests
- lxml
- pymongo
- bs4
3. Approach
3.1 The full-site crawl has two parts:
1. Scrape the job-category URLs from the Lagou homepage and save them to the database;
2. Scrape every job posting under each category URL.
3.2 Detailed analysis
3.2.1 Homepage analysis
Each job-category link on the homepage sits under a div.menu_sub.dn > dl > dd > a node: the a tag's href is the category URL and its text is the category name. This is exactly the CSS selector used by channel.py in section 4.1.
3.2.2 Listing-page analysis
For example, open the Java category: https://www.lagou.com/zhaopin/Java
Clicking "next page" changes the URL to: https://www.lagou.com/zhaopin/Java/2/?filterOption=2
So page n follows the pattern: https://www.lagou.com/zhaopin/Java/n/?filterOption=n
url = 'https://www.lagou.com/zhaopin/Java/{}/?filterOption={}'.format(n, n)
Lagou, however, caps each category at 30 pages,
with 15 job postings per page.
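Combining the pattern with the 30-page cap, a short sketch (using the Java category for illustration) that builds every listing-page URL for one category:

base = 'https://www.lagou.com/zhaopin/Java/'
page_urls = ['{}{}/?filterOption={}'.format(base, n, n) for n in range(1, 31)]
# page_urls[0]  -> https://www.lagou.com/zhaopin/Java/1/?filterOption=1
# page_urls[-1] -> https://www.lagou.com/zhaopin/Java/30/?filterOption=30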
4. Implementation
4.1 Scraping the category URLs from the homepage
Create a file channel.py:
from bs4 import BeautifulSoup
import requests
import pymongo

# -------- Connect to MongoDB --------
client = pymongo.MongoClient(host="localhost")
lagou = client['lagou']
channel = lagou['channel']

# -------- Scrape the category URLs from the homepage --------
start_url = "https://www.lagou.com/"
host = "www.lagou.com"
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    "Cookie": " ",  # fill in the Cookie obtained after registering and logging in
}

def parse_index(url):
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    all_positions = soup.select('div.menu_sub.dn > dl > dd > a')
    joburls = [i['href'] for i in all_positions]
    jobnames = [i.get_text() for i in all_positions]
    for joburl, jobname in zip(joburls, jobnames):
        data = {
            "url": joburl,
            "jobname": jobname,
        }
        channel.insert_one(data)

if __name__ == '__main__':
    parse_index(start_url)
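After python channel.py has run once, every document in the channel collection should look roughly like the one below (a quick check from the Python shell; the category shown is illustrative):

>>> import pymongo
>>> channel = pymongo.MongoClient(host="localhost")['lagou']['channel']
>>> channel.find_one({}, {'_id': 0})
{'url': 'https://www.lagou.com/zhaopin/Java/', 'jobname': 'Java'}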
4.2 Scraping the postings in each category
Create a file lagou.py:
import requests
import hashlib
import random
from lxml import etree
import pymongo

# -------- Database --------
client = pymongo.MongoClient(host="localhost")
lagou = client['lagou']
channel = lagou['channel']
lgjob = lagou["lgjob"]

# -------- Request headers --------
# Fill these in after logging in; headers from several different browsers are recommended.
header_list = [{'User-Agent': "",
                "Cookie": ""},
               {'User-Agent': "",
                "Cookie": ""}]

# ======== Scrape the listing pages ========
# Hash a URL to its MD5 hex digest, used as the deduplication key.
def get_md5(url):
    if isinstance(url, str):
        url = url.encode("utf-8")
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()

def get_info(channel, page):
    url = '{}{}/?filterOption={}'.format(channel, page, page)
    response = requests.get(url, headers=random.choice(header_list)).text
    if "i_error" in response:  # marker of Lagou's error page
        print("The requested page does not exist")
        return
    html = etree.HTML(response)
    infos = html.xpath("//ul[@class='item_con_list']/li")
    for info in infos:
        link = info.xpath("div/div/div/a/@href")[0]
        link_id = get_md5(link)
        position = info.xpath("div/div/div/a/h3/text()")[0]
        addr = info.xpath("div/div/div/a/span[@class='add']/em/text()")[0]
        salary = info.xpath("div/div/div/div/span[@class='money']/text()")[0]
        # The text node looks like "3-5年 / 本科": experience before the
        # slash, required degree after it.
        work_years = info.xpath("div/div/div/div/text()")[2].strip().split("/")[0]
        degree_need = info.xpath("div/div/div/div/text()")[2].strip().split("/")[1]
        try:
            tag = "/".join(info.xpath("div[2]/div[1]/span/text()"))
        except Exception:
            tag = None
        company = info.xpath("div[1]/div[3]/a/img/@alt")[0]
        job_advantage = info.xpath("div[2]/div[2]/text()")[0]
        # Likewise "移动互联网 / 上市公司": industry before the slash,
        # funding stage after it.
        company_field = info.xpath("div[1]/div[2]/div[2]/text()")[0].strip().split("/")[0]
        company_stage = info.xpath("div[1]/div[2]/div[2]/text()")[0].strip().split("/")[1]
        data = {
            "link": link,
            "link_id": link_id,
            "position": position,
            "addr": addr,
            "salary": salary,
            "work_years": work_years,
            "degree_need": degree_need,
            "tag": tag,
            "company": company,
            "job_advantage": job_advantage,
            "company_field": company_field,
            "company_stage": company_stage,
        }
        save_to_mongodb(data)

def save_to_mongodb(data):
    try:
        # upsert=True: insert when link_id is new, update in place otherwise,
        # so re-crawling a posting never creates a duplicate document.
        if lgjob.update_one({'link_id': data['link_id']}, {'$set': data}, upsert=True):
            print('Saved to MongoDB', data)
    except Exception:
        print('Failed to save to MongoDB', data)
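Because link_id is the MD5 of the posting's URL, the upsert in save_to_mongodb doubles as deduplication: crawling the same page twice refreshes the existing documents instead of inserting new ones. A quick way to confirm this from the shell (the category URL is illustrative, and the requests still need valid entries in header_list):

>>> from lagou import get_info, lgjob
>>> get_info('https://www.lagou.com/zhaopin/Java/', 1)
>>> n = lgjob.count_documents({})
>>> get_info('https://www.lagou.com/zhaopin/Java/', 1)   # same page again
>>> lgjob.count_documents({}) == n                        # no duplicates added
True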
4.3 Create a process pool
Create a file main.py:
from multiprocessing import Pool
import pymongo
from lagou import get_info  # import the function defined in the previous step
import random
import time

# -------- Connect to the database --------
client = pymongo.MongoClient(host="localhost")
lagou = client['lagou']
channel = lagou['channel']
lgjob = lagou["lgjob"]

# -------- Pull the category URLs out of the database --------
channel_list = [item['url'] for item in channel.find()]

def get_all_links_from(channel):
    for page in range(1, 31):  # Lagou caps each category at 30 pages
        get_info(channel, page)
        time.sleep(random.randint(0, 8))  # random delay between requests

# -------- Crawl the listing pages --------
if __name__ == '__main__':
    t_start = time.time()
    pool = Pool(processes=4)
    pool.map(get_all_links_from, channel_list)
    print(time.time() - t_start)
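Note that pool.map blocks until every category has been crawled, with four categories in flight at any moment. For a first smoke test it may be worth slicing the input so the run stays short (an illustrative tweak, not part of the original script):

pool.map(get_all_links_from, channel_list[:4])  # crawl only the first 4 categories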
4.4 Monitor the number of scraped records
Create a file count.py:
import time
from lagou import lgjob

while True:
    # Print the number of stored documents every 5 seconds.
    # (find().count() from the original PyMongo API is deprecated;
    # count_documents({}) is the current equivalent.)
    print(lgjob.count_documents({}))
    time.sleep(5)
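If the collection grows large, an exact count on every poll can get slow; PyMongo's estimated_document_count(), which reads collection metadata instead of scanning documents, is a cheaper alternative for a progress monitor:

print(lgjob.estimated_document_count())  # metadata-based, faster on large collections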
4.5 How to run
Put all of the files in one folder and open a command prompt (cmd). Remember to run python channel.py once beforehand, so the channel collection that main.py reads from is populated.
# 1. Run count.py to watch the count; it stays at 0 until main.py starts
python count.py
# 2. Open a second cmd window and run the main file, main.py, to start crawling
python main.py
Finally, the full code is available at: https://github.com/Damaomaomao/Lagou_mutilprocessing
Questions and suggestions are welcome!