import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from selenium import webdriver
import builtwith
import re
# Detect which technologies the target site is built with.
# The timeout keeps the script from blocking forever if the server never answers.
html = requests.get('https://search.51job.com', timeout=10).content
# 51job pages are GBK-encoded, so the bytes are decoded explicitly before being
# handed to builtwith (per the original author's note, the module assumes UTF-8).
builtwith.parse('https://search.51job.com', html=html.decode('gbk'))
# Data scraping from 51job starts here.
# NOTE(review): presumably the scraping loop below appends its rows to this list — confirm.
occupation_infos = []
for page in range(1,262):
# Search-result URL for the keyword "数据挖掘" (data mining; double URL-encoded
# in the path). The "{}" slot is filled with the current page number.
# Keep "&degreefrom=" spelled out in full: if "&deg" is ever HTML-entity-decoded
# it turns into a degree sign and fuses with the preceding "cotype" parameter.
# The literal is assembled by implicit concatenation rather than backslash
# continuations, so indentation can never leak into the URL.
url = (
    'https://search.51job.com/list/000000,000000,0000,00,9,99,'
    '%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598,2,{}.html?'
    'lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
    '&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&'
    'confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
).format(page)
# Download the result page (with a timeout so a single stalled request cannot
# hang the whole crawl), decode the GBK bytes, and parse the markup with lxml.
res = requests.get(url, timeout=10).content.decode('gbk')
soup = BeautifulSoup(res, 'lxml')
# Job titles: trimmed text of every <p class="t1"> element on the page.
title_nodes = soup.find_all('p', attrs={'class': 't1'})
occupations = [node.get_text().strip() for node in title_nodes]
# Company names: trimmed text of every <span class="t2"> element except the first.
# NOTE(review): the skipped first match is presumably the column header — confirm.
company_nodes = soup.find_all('span', attrs={'class': 't2'})[1:]
companys = [node.get_text().strip() for node in company_nodes]
regions = [i.get_text().strip()