这里把抓取数据和分析数据分别写在了两个 py 文件中。
先写抓取数据的脚本，把抓取到的结果写入一个 txt 文件：
import re
import time
from selenium import webdriver
# Start Chrome through the local chromedriver binary. The implicit wait makes
# each element lookup keep retrying for up to 10 seconds before it fails.
driver = webdriver.Chrome(r'D:\chromedriver.exe')
driver.implicitly_wait(10)
driver.get('https://www.51job.com/')

# Type the search keyword into the keyword box.
keyword_input = driver.find_element_by_id('kwdselectid')
keyword_input.send_keys('自动化测试')

# Open the work-location chooser (element id `work_position_click`).
location_picker = driver.find_element_by_id('work_position_click')
location_picker.click()
# Deselect every region that is currently marked as selected (class "on")
# in the location dialog.
eles = driver.find_elements_by_css_selector('#work_position_click_center_right_list_000000 [class="on"]')
for ele in eles:
    # Pause before each click; presumably this gives the dialog time to
    # update between toggles.
    time.sleep(2)
    ele.click()
# Select Beijing.
driver.find_element_by_id('work_position_click_center_right_list_category_000000_010000').click()
# Click the confirm button to save the location choice.
driver.find_element_by_id('work_position_click_bottom_save').click()
# Click the search button. The selector contains both double and single
# quotes, so the Python string literal is wrapped in triple quotes.
# NOTE: the attribute name must be plain ASCII "onclick". A look-alike
# Greek omicron in place of the "o" makes the selector match nothing.
driver.find_element_by_css_selector('''[onclick="kwdGoSearch($('#kwdselectid').val());"]''').click()
# Work out how many result pages there are: read the text of the element
# that directly follows #hidTotalPage and use the first run of digits in it.
pager_label = driver.find_element_by_css_selector('#hidTotalPage+[class="td"]')
pageMsg = pager_label.text
digit_runs = re.findall('[0-9]+', pageMsg)
totalPage = int(digit_runs[0])
# Walk through every result page and collect one salary string per job row.
salaries = []
try:
    for page in range(totalPage):
        # Wait before reading the rows of the current page.
        time.sleep(2)
        jobs = driver.find_elements_by_css_selector('#resultList>[class="el"]')
        # job.text.split('\n') is the list of text lines in a row; the
        # element at index 3 is what this script treats as the salary.
        salaries.extend(job.text.split('\n')[3] for job in jobs)
        if page < totalPage - 1:
            # Not the last page yet: click the final <li> of the pager
            # (presumably the "next page" control), then wait again.
            driver.find_element_by_css_selector('.p_in li:nth-last-child(1)').click()
            time.sleep(2)
finally:
    # Always shut the browser down, even if a lookup above raised.
    driver.quit()
file = open('salary_data.txt', 'w+')
file.write('\n'