python爬取招聘信息_python 爬取boss直聘招聘信息实现

原标题:python 爬取boss直聘招聘信息实现

1、一些公共方法的准备

获取数据库链接:

import pymysql


# Obtain a database connection object
def getConnect(database):
    """Return an open pymysql connection to the given MySQL schema.

    :param database: name of the schema to connect to.
    :return: a live ``pymysql`` Connection; the caller is responsible
        for committing and closing it.
    """
    # Fixed credentials for the local development MySQL instance.
    DATABASE = {
        'host': 'localhost',
        'database': database,
        'user': 'root',
        'password': '123456'
    }
    return pymysql.connect(**DATABASE)

获取页面soup对象:

import requests

from bs4 import BeautifulSoup


# Convert an HTML string into a soup object
def to_soup(html):
    """Parse *html* into a BeautifulSoup document using the lxml parser.

    Renamed the parameter from ``str`` (which shadowed the builtin) to ``html``.
    """
    return BeautifulSoup(html, 'lxml')


# Fetch a page by url/header and return its parsed soup object
def get_soup(url, header):
    """GET *url* with the supplied request headers and return the parsed soup.

    :param url: absolute URL to fetch.
    :param header: dict of HTTP request headers (user-agent, cookie, ...).
    """
    response = requests.get(url, headers=header)
    return to_soup(response.text)

2、爬取BOSS直聘python相关岗位的实现

定义工作信息对象:

class WorkInfo:
    """Value object holding one scraped job posting from BOSS直聘."""

    def __init__(self, title, salary, site, experience, education,
                 job_url, company, release_date, get_date):
        """Store the scraped fields verbatim.

        :param title: job title text.
        :param salary: salary range text (e.g. "10k-15k").
        :param site: work location.
        :param experience: required experience text.
        :param education: required education text.
        :param job_url: link to the job detail page.
        :param company: company name.
        :param release_date: normalized posting date, "YYYY-MM-DD".
        :param get_date: timestamp when the record was scraped.
        """
        self.title = title
        self.salary = salary
        self.site = site
        self.experience = experience
        self.education = education
        self.job_url = job_url
        self.company = company
        self.release_date = release_date
        self.get_date = get_date

获取工作信息到定义的对象的集合:

# 获取工作信息集合

# Collect the list of job postings on one search-result page
def getWorkInfos(url, header):
    """Scrape one BOSS直聘 result page into a list of WorkInfo objects.

    Fixes over the original: ``get_text`` was referenced but never called
    (it returned the bound method, not the text), and ``time.time`` /
    ``time.localtime`` were likewise missing their call parentheses.

    :param url: full search-result page URL.
    :param header: HTTP request headers (must carry a valid session cookie).
    :return: list of WorkInfo; empty when the page has no postings.
    """
    # Parse the page into a soup object (rep is the page-fetching helper module)
    htmlSoup = rep.get_soup(url, header)
    workInfos = []
    # Each posting lives in a div.job-primary container
    job_infos = htmlSoup.find_all('div', class_='job-primary')
    if len(job_infos) == 0:
        print('已到空白页!!!')
        return workInfos
    print('开始爬取页面数据!')
    for job_info_soup in job_infos:
        # Job title
        title = job_info_soup.find('div', class_='job-title').get_text()
        # Salary range
        salary = job_info_soup.find('span', class_='red').get_text()
        # The <p> tag holds "site / experience / education" separated by tags;
        # tool.toContent strips the tags and returns the text fragments.
        infos = str(job_info_soup.find('p'))
        infosList = tool.toContent(infos)
        # Work location
        site = infosList[0]
        # Required experience
        experience = infosList[1]
        # Required education
        education = infosList[2]
        # Link to the job detail page
        job_url = job_info_soup.find('a').get('href')
        # Company name
        company = job_info_soup.find('div', class_='company-text').find('a').get_text()
        # Raw publish text looks like "发布于MM月DD日" / "发布于昨天" / "发布于HH:MM";
        # drop the 3-char "发布于" prefix.
        release_date = job_info_soup.find('div', class_='info-publis').find('p').get_text()[3:]
        # Normalize to a DB-friendly "YYYY-MM-DD" string
        if '昨' in release_date:
            # "yesterday": subtract one day (86400 s) from now
            release_date = time.strftime("%Y-%m-%d", time.localtime(time.time() - 86400))
        elif ':' in release_date:
            # a bare time of day means "today"
            release_date = time.strftime("%Y-%m-%d")
        else:
            # "MM月DD日" -> "YYYY-MM-DD": prepend current year, swap 月/日 for '-',
            # and drop the trailing '-' left by the 日 replacement
            release_date = str(time.localtime().tm_year) + '-' + re.sub(r'[月,日]', '-', release_date)[:-1]
        # Timestamp of this scrape
        get_date = time.strftime("%Y-%m-%d %H:%M:%S")
        workInfo = WorkInfo(title, salary, site, experience, education, job_url,
                            company, release_date, get_date)
        workInfos.append(workInfo)
    print('爬取页面数据完毕!')
    return workInfos

把获取到的工作信息集合存入数据库:

# 存入数据库

# Persist the scraped postings
def toDatabase(workInfos):
    """Insert every WorkInfo in *workInfos* into the `work_info` table.

    Fixes over the original: ``db.cursor`` / ``cursor.close`` / ``db.commit`` /
    ``db.close`` were attribute accesses, never actual calls, so nothing was
    ever executed or committed; and the SQL was built with ``%`` string
    interpolation (broken across statements, and SQL-injectable from scraped
    text). Values are now bound as parameters via ``cursor.execute``.

    :param workInfos: iterable of WorkInfo objects to store.
    """
    print('开始存入数据库')
    db = database.getConnect('reptile')
    cursor = db.cursor()
    sql = ("INSERT INTO `work_info` "
           "(`title`, `salary`, `site`, `experience`, `education`, "
           "`job_url`, `company`, `release_date`, `get_date`) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    try:
        for workInfo in workInfos:
            # Parameterized execute: pymysql escapes each value safely.
            cursor.execute(sql, (workInfo.title, workInfo.salary, workInfo.site,
                                 workInfo.experience, workInfo.education,
                                 workInfo.job_url, workInfo.company,
                                 workInfo.release_date, workInfo.get_date))
        db.commit()
    finally:
        cursor.close()
        db.close()
    print('存入数据库完毕!')

爬取工作实现:

# Base search URL for Chengdu (city code 101270100); query params are appended per page.
url = 'https://www.zhipin.com/c101270100/?'

# Request headers for zhipin.com. The original pasted cookie was a captured
# browser session that had been mangled across lines (a Python syntax error)
# and is long expired anyway.
# NOTE(review): supply a fresh cookie from your own logged-in browser session
# before running — the site rejects requests without a valid one.
header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'referer': 'https://www.zhipin.com/c101270100/?query=python&page=2&ka=page-2',
    'cookie': 'lastCity=101270100; JSESSIONID=A0FC9E1FD0F10E42EAB681A51AC459C7;'
}

# Search keyword and starting page for the crawl
query = 'python'
page = 1

# Walk result pages until an empty page signals the end of the listing.
while True:
    print("开始第:{} 页".format(page))
    # BOSS直聘 paging uses both &page=N and &ka=page-N
    purl = url + 'query=' + query + '&page=' + str(page) + '&ka=page-' + str(page)
    workInfos = getWorkInfos(purl, header)
    if len(workInfos) == 0:
        # An empty result list means we ran past the last page.
        print('结束爬取!')
        break
    toDatabase(workInfos)
    page = page + 1

3、涉及的小知识

自制去除html标签的方法,把标签之间夹杂的文字内容存入list中:

# 通过正则表达式去掉HTML标签,获取标签内的文字内容列表

# Strip HTML tags with a regex and collect the text fragments between them
def toContent(html):
    """Return the non-empty text fragments of *html* with all tags removed.

    The original regex was garbled to ``']*>'`` by the blog's HTML escaping;
    the intended tag-matching pattern is ``<[^>]*>``. The parameter is also
    renamed from ``str`` (which shadowed the builtin) to ``html``.

    :param html: an HTML snippet, e.g. ``str(soup.find('p'))``.
    :return: list of the text pieces that sat between/around the tags.
    """
    # Split on every tag, then drop the empty strings the split leaves behind.
    infos = re.split(r'<[^>]*>', html)
    return list(filter(None, infos))

时间的相关操作

用‘-’替换‘月’‘日’:

re.sub(r'[月,日]', '-', release_date)

获取前一天的日期:

release_date=time.strftime("%Y-%m-%d",time.localtime(time.time()-86400))
返回搜狐,查看更多

责任编辑:

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值