#coding:utf-8
import requests
import re
import urllib2
from pymongo import MongoClient
from bs4 import BeautifulSoup
#抓取51job相关职位信息
def get_url():
#连接mongo数据库
cn=MongoClient(host='127.0.0.1',port=27017)
db=cn.job
table=db.autoTable
#初始化数据
rel=True
line=1
url_name=urllib2.quote(name.encode('utf-8'))
header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
while rel:
url='http://search.51job.com/list/020000,000000,0000,00,9,99,{},2,{}.html'.format(url_name.replace('%','%25'),line)
rq=requests.get(url,headers=header)
bs=BeautifulSoup(rq.content,'html.parser')
page=bs.find('span',class_="td").string
page_num=re.search('\d{1,}',page).group()
if line<=int(page_num):
print u'正在抓取%s页面信息'%line
# print bs.prettify(encoding='gbk')
div=bs.find_all('div',class_="el")
for data in div:
if data.find_all('p', class_="t1 "):
jobdic={}
#正则获取需要的信息
jobdic['job_name']=data.p.span.a.attrs['title']
jobdic['job_request_href']=data.p.span.a.attrs['href']
jobdic['job_company']=data.find('span',class_="t2").a.attrs['title']
jobdic['job_place']=data.find('span',class_="t3").string
jobdic['job_money']=data.find('span',class_="t4").string
jobdic['job_pushtime']=data.find('span',class_="t5").string
#存储数据
table.save(jobdic)
line+=1
else:
rel=False
if __name__=='__main__':
    # Search keyword ("automation test engineer"); read as a module-level
    # global by get_url(), so it must be assigned before the call.
    name=u'自动化测试工程师'
    get_url()