import requests
from lxml import etree
import pandas as pd
class spider:
    """Scrape job postings for a search keyword from zhipin.com.

    For every result page in ``[begin_page, end_page]`` the search listing is
    fetched, each job's detail page is visited, and one row per job is
    appended to ``self.df``.  The accumulated table is written to
    ``self.output_path`` with ``DataFrame.to_excel``.

    Args:
        word: Search keyword appended to the query URL.
        begin_page: First result page to fetch (inclusive).
        end_page: Last result page to fetch (inclusive).
        output_path: Destination spreadsheet path.  Defaults to
            ``"<word>.xls"`` as before.  NOTE: recent pandas releases (2.0+)
            ship no ``.xls`` writer, so pass an ``.xlsx`` path there.
    """

    # Ordered column labels.  This must be a list, not a set: rows are built
    # positionally in ``writepage`` and a set has no defined order (pandas 2.x
    # also refuses a set for ``columns``).
    COLUMNS = ['职位', '薪资', '公司', '公司类型', '职位信息']

    # Seconds to wait for each HTTP request before giving up, so that a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, word, begin_page, end_page, output_path=None):
        self.word = word
        self.begin_page = begin_page
        self.end_page = end_page
        self.header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1 Trident/5.0;"}
        self.url = "https://www.zhipin.com/job_detail/?query="
        # Starts at 1 and is bumped once per job link that has been handled.
        self.number = 1
        if output_path is None:
            output_path = "%s.xls" % (self.word)
        self.output_path = output_path
        self.df = pd.DataFrame(columns=self.COLUMNS)

    @staticmethod
    def _strip_spaces(value):
        """Return ``value`` without space characters; ``None`` becomes ''."""
        return (value or '').replace(' ', '')

    def b_spider(self):
        """Crawl every configured result page and save the collected rows."""
        print('---b_spider')
        for page in range(self.begin_page, self.end_page + 1):
            url = self.url + self.word + '&page=' + str(page)
            print(url)
            response = requests.get(url, headers=self.header,
                                    timeout=self.REQUEST_TIMEOUT)
            html = etree.HTML(response.text)
            company = html.xpath("//div[@class='company-text']/h3/a")
            company_type = html.xpath("//div[@class='company-text']/p/text()[1]")
            links = html.xpath("//div[@class='job-list']//a/@data-url")
            if not (len(links) == len(company) == len(company_type)):
                # The three lists are matched by position; unequal lengths
                # mean the page layout is not what the XPaths expect.
                print('warning: listing fields differ in length on ' + url)
            # zip() stops at the shortest list instead of raising IndexError.
            for link, company_node, type_text in zip(links, company, company_type):
                # Presumably trims a fixed-length prefix/suffix of the
                # data-url value to isolate the job id -- TODO confirm
                # against the current page markup.
                link = 'https://www.zhipin.com/job_detail/' + str(link)[30:-23]
                print(link)
                self.loadpage(link, company_node.text, type_text)
                self.number += 1
            # Save after each page so earlier pages survive a later failure.
            self.df.to_excel(self.output_path, index=False)
            print(self.number)

    def loadpage(self, link, company, company_type):
        """Fetch one job detail page and record its fields.

        Args:
            link: Absolute URL of the job detail page.
            company: Company name taken from the listing page.
            company_type: Company type text taken from the listing page.

        Pages without the expected title/salary nodes are reported and
        skipped rather than aborting the whole crawl.
        """
        print('---loadpage')
        response = requests.get(link, headers=self.header,
                                timeout=self.REQUEST_TIMEOUT)
        html = etree.HTML(response.text)
        title_nodes = html.xpath("//div[@class='info-primary']//div[@class='name']/h1")
        salary_nodes = html.xpath("//div[@class='info-primary']//div[@class='name']/span")
        if not title_nodes or not salary_nodes:
            print('skipped, title/salary not found: ' + link)
            return
        job_des = html.xpath("//div[@class='text']/text()")
        self.writepage(title_nodes[0].text, salary_nodes[0].text,
                       company, company_type, job_des)

    def writepage(self, title, salary, company, company_type, job_des):
        """Append one job posting as a new row of ``self.df``.

        Args:
            title: Job title.
            salary: Salary text.
            company: Company name.
            company_type: Company type text.
            job_des: Iterable of description text fragments; each fragment is
                stripped and terminated with a newline.

        Spaces are removed from the four scalar fields; ``None`` is treated
        as an empty string.
        """
        print('---writepage')
        print(title)
        title = self._strip_spaces(title)
        salary = self._strip_spaces(salary)
        company = self._strip_spaces(company)
        company_type = self._strip_spaces(company_type)
        print(salary)
        description = ''.join(part.strip() + '\n' for part in job_des)
        # Value order here must match COLUMNS.
        row = pd.DataFrame(
            [[title, salary, company, company_type, description]],
            columns=self.COLUMNS,
        )
        if self.df.empty:
            self.df = row
        else:
            self.df = pd.concat([self.df, row], ignore_index=True)
if __name__ == '__main__':
    # Script entry point: crawl result pages 1 through 3 for the keyword
    # below and let the spider save what it collects.
    s = spider(word='图像处理', begin_page=1, end_page=3)
    s.b_spider()
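# Scrapes target job-posting information from BOSS Zhipin (zhipin.com).
# (Source page note: latest recommended article published 2025-01-19 10:52:50.)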
