import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
# Request headers so the site treats the request as coming from a regular browser
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36'
}
def get_job_data(keyword, location, pages=1):
    """
    Scrape job postings from Zhilian Zhaopin (zhaopin.com).

    Args:
        keyword: search keyword, e.g. "金融科技"
        location: city, e.g. "北京"
        pages: number of result pages to crawl

    Returns:
        DataFrame: job postings with title, company, salary and location
    """
    base_url = 'https://sou.zhaopin.com/jobs/searchresult.ashx'
    job_data = []
    for page in range(1, pages + 1):
        params = {
            'jl': location,
            'kw': keyword,
            'p': page,
        }
        response = requests.get(base_url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        job_list = soup.find_all('table', class_='newlist')
        for job in job_list[1:]:  # Skip the first table, which holds the search filters
            title_cell = job.find('td', class_='zwmc')
            if title_cell is None or title_cell.a is None:
                continue  # Skip rows that are not actual job entries
            job_title = title_cell.a.text.strip()
            company_name = job.find('td', class_='gsmc').a.text.strip()
            salary = job.find('td', class_='zwyx').text.strip()
            job_location = job.find('td', class_='gzdd').text.strip()  # Do not shadow the `location` parameter
            job_data.append([job_title, company_name, salary, job_location])
        time.sleep(1)  # Be polite to the server between pages
    df = pd.DataFrame(job_data, columns=['职位', '公司', '薪资', '地点'])
    return df
def save_to_excel(df, filename="job_data.xlsx"):
    """Save the data to an Excel file."""
    df.to_excel(filename, index=False)
def visualize_data(df):
    """Visualize the salary distribution."""
    # Make sure the salary column is string-typed before parsing
    df['薪资'] = df['薪资'].astype(str)
    # Initialize the min/max salary columns, then extract them with a regex
    df['最低薪资'] = None
    df['最高薪资'] = None
    # Walk the rows and parse each salary range
    for i, row in df.iterrows():
        salary = row['薪资']
        # Match patterns like "8k-12k"; allow decimals and an upper-case K
        match = re.match(r'(\d+(?:\.\d+)?)\s*[kK]?\s*-\s*(\d+(?:\.\d+)?)\s*[kK]?', salary)
        if match:
            min_salary = float(match.group(1)) * 1000
            max_salary = float(match.group(2)) * 1000
            df.at[i, '最低薪资'] = min_salary
            df.at[i, '最高薪资'] = max_salary
    # Drop rows whose salary could not be parsed into numbers
    df = df.dropna(subset=['最低薪资', '最高薪资'])
    # Configure fonts before plotting so the Chinese labels render correctly
    plt.rcParams['font.sans-serif'] = ['SimHei']  # Use SimHei for Chinese characters
    plt.rcParams['axes.unicode_minus'] = False    # Render the minus sign correctly
    plt.figure(figsize=(10, 5))
    plt.hist(df['最低薪资'].astype(float), bins=10, edgecolor='black')
    plt.xlabel('最低薪资 (元)')
    plt.ylabel('职位数量')
    plt.title('金融科技岗位薪资分布')
    plt.show()
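# A vectorized alternative (sketch) to the row-by-row loop in visualize_data,
# assuming salary strings follow an "8k-12k" style pattern. The helper name
# `extract_salary_columns` is hypothetical; values that do not match the pattern
# become NaN via pd.to_numeric and are dropped afterwards.
def extract_salary_columns(df):
    pattern = r'(\d+(?:\.\d+)?)\s*[kK]?\s*-\s*(\d+(?:\.\d+)?)\s*[kK]?'
    extracted = df['薪资'].astype(str).str.extract(pattern)
    df['最低薪资'] = pd.to_numeric(extracted[0], errors='coerce') * 1000
    df['最高薪资'] = pd.to_numeric(extracted[1], errors='coerce') * 1000
    return df.dropna(subset=['最低薪资', '最高薪资'])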
if __name__ == '__main__':
    keyword = "金融科技"
    location = "北京"
    pages = 3  # Number of result pages to crawl
    df = get_job_data(keyword, location, pages)
    save_to_excel(df)
    visualize_data(df)