Sample data.csv (columns: employee ID, department, years of experience, education, communication, teamwork, problem solving, learning ability, innovation, overall capability score):
员工ID,部门,工作年限,学历,沟通能力,团队协作,问题解决,学习能力,创新能力,能力得分
1,技术部,1,本科,70,80,75,85,60,75
2,技术部,2,本科,72,82,77,87,62,77
3,技术部,3,硕士,75,85,80,90,65,80
4,市场部,1,大专,60,70,65,75,50,65
5,市场部,2,本科,62,72,67,77,52,67
6,市场部,3,本科,65,75,70,80,55,70
7,财务部,1,大专,65,70,75,80,60,70
8,财务部,2,本科,67,72,77,82,62,72
9,财务部,3,本科,70,75,80,85,65,75
10,人力资源部,1,本科,70,75,80,85,70,78
11,人力资源部,2,本科,72,77,82,87,72,80
12,人力资源部,3,硕士,75,80,85,90,75,83
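Before wiring up Spark, it is worth sanity-checking the file. A minimal sketch using only the standard library, assuming data.csv sits in the working directory:

import csv

with open("data.csv", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))
print(len(rows))                            # expected: 12
print(rows[0]["部门"], rows[0]["能力得分"])  # expected: 技术部 75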
Create the result tables in MySQL (one per analysis):
CREATE TABLE department_avg_score (
department VARCHAR(255),
avg_score FLOAT
);
CREATE TABLE score_by_work_years (
work_years INT,
avg_score FLOAT
);
CREATE TABLE education_ratio (
education VARCHAR(255),
ratio FLOAT
);
CREATE TABLE ability_distribution (
ability VARCHAR(255),
avg_score FLOAT
);
CREATE TABLE experience_score_relation (
work_years INT,
score FLOAT
);
CREATE TABLE department_score_distribution (
department VARCHAR(255),
minimum FLOAT,
q1 FLOAT,
median FLOAT,
q3 FLOAT,
maximum FLOAT
);
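The JDBC URL in the Spark job below points at a talent_capability_analysis database, which the DDL above does not create. A sketch for bootstrapping it, assuming the mysql-connector-python package is installed and the same root credentials:

import mysql.connector

# assumes a local MySQL server reachable with the credentials the Spark job uses
conn = mysql.connector.connect(host="localhost", user="root", password="123456")
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS talent_capability_analysis "
            "DEFAULT CHARACTER SET utf8mb4")
cur.close()
conn.close()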
Spark code:
'''
Author: Reisen7
Date: 2025-03-11 18:00:09
LastEditTime: 2025-03-11 18:27:20
'''
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import numpy as np
# resolve the bundled Hadoop distribution relative to this script
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
os.environ["HADOOP_HOME"] = os.path.join(BASE_DIR, "hadoop-3.3.5")
os.environ["hadoop.home.dir"] = os.path.join(BASE_DIR, "hadoop-3.3.5")
# pin the worker Python interpreter to the conda environment
os.environ['PYSPARK_PYTHON'] = r'F:/usr/anaconda3/envs/hadoop_env/python.exe'
# the MySQL JDBC driver jar must be visible to both the driver and the
# executors; the worker interpreter is already pinned via PYSPARK_PYTHON
# above (spark.python.executable is not a recognized Spark setting)
spark = SparkSession.builder \
    .appName("TalentAnalysis") \
    .config("spark.driver.extraClassPath", "mysql-connector-java-8.0.32.jar") \
    .config("spark.executor.extraClassPath", "mysql-connector-java-8.0.32.jar") \
    .getOrCreate()
# read the sample data with a header row and schema inference
data = spark.read.csv("data.csv", header=True, inferSchema=True)
jdbc_url = "jdbc:mysql://localhost:3306/talent_capability_analysis?useUnicode=true&characterEncoding=utf-8&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=GMT%2B8"
connection_properties = {
    "user": "root",
    "password": "123456",
    "driver": "com.mysql.cj.jdbc.Driver"
}
# 1. average capability score per department; rename the Chinese grouping
# column so it matches the department_avg_score DDL (mode="overwrite"
# recreates the table from the DataFrame schema)
department_avg_score = data.groupBy("部门") \
    .agg(F.avg("能力得分").alias("avg_score")) \
    .withColumnRenamed("部门", "department")
department_avg_score.write.jdbc(url=jdbc_url, table="department_avg_score", mode="overwrite", properties=connection_properties)
# 2. average capability score by years of experience
score_by_work_years = data.groupBy("工作年限") \
    .agg(F.avg("能力得分").alias("avg_score")) \
    .withColumnRenamed("工作年限", "work_years")
score_by_work_years.write.jdbc(url=jdbc_url, table="score_by_work_years", mode="overwrite", properties=connection_properties)
# 3. share of each education level among all employees
total_count = data.count()
education_ratio = data.groupBy("学历") \
    .agg(F.count("*").alias("count")) \
    .withColumn("ratio", F.col("count") / total_count) \
    .select(F.col("学历").alias("education"), "ratio")
education_ratio.write.jdbc(url=jdbc_url, table="education_ratio", mode="overwrite", properties=connection_properties)
# 4. company-wide average for each of the five ability dimensions
ability_columns = ["沟通能力", "团队协作", "问题解决", "学习能力", "创新能力"]
ability_distribution = []
for ability in ability_columns:
    avg_score = data.agg(F.avg(ability)).collect()[0][0]
    ability_distribution.append((ability, avg_score))
ability_distribution_df = spark.createDataFrame(ability_distribution, ["ability", "avg_score"])
ability_distribution_df.write.jdbc(url=jdbc_url, table="ability_distribution", mode="overwrite", properties=connection_properties)
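# (optional) the loop above runs one Spark job per ability column; a
# single-pass alternative would aggregate all five averages at once,
# e.g. (sketch, same result):
#   row = data.agg(*[F.avg(c).alias(c) for c in ability_columns]).collect()[0]
#   ability_distribution = [(c, row[c]) for c in ability_columns]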
# 5. raw (experience, score) pairs for a scatter plot; rename the columns
# to match the experience_score_relation DDL
experience_score_relation = data.select(
    F.col("工作年限").alias("work_years"),
    F.col("能力得分").alias("score"))
experience_score_relation.write.jdbc(url=jdbc_url, table="experience_score_relation", mode="overwrite", properties=connection_properties)
# 6. five-number summary (boxplot statistics) of scores per department
def calculate_boxplot_stats(score_list):
    score_array = np.array(score_list)
    return (
        float(np.min(score_array)),
        float(np.percentile(score_array, 25)),
        float(np.percentile(score_array, 50)),
        float(np.percentile(score_array, 75)),
        float(np.max(score_array))
    )

# the return type is a DDL string, so the struct fields can be selected by name below
udf_calculate_boxplot_stats = F.udf(calculate_boxplot_stats, "struct<minimum:float,q1:float,median:float,q3:float,maximum:float>")
department_score_distribution = data.groupBy("部门") \
    .agg(F.collect_list("能力得分").alias("score_list")) \
    .withColumn("boxplot_stats", udf_calculate_boxplot_stats(F.col("score_list"))) \
    .select(F.col("部门").alias("department"),
            F.col("boxplot_stats.minimum").alias("minimum"),
            F.col("boxplot_stats.q1").alias("q1"),
            F.col("boxplot_stats.median").alias("median"),
            F.col("boxplot_stats.q3").alias("q3"),
            F.col("boxplot_stats.maximum").alias("maximum"))
department_score_distribution.write.jdbc(url=jdbc_url, table="department_score_distribution", mode="overwrite", properties=connection_properties)
spark.stop()
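The numpy UDF works, but it collects every department's scores into a Python list. On Spark 3.1+ a built-in alternative keeps the computation in the JVM; a sketch using percentile_approx (note it returns actual data points rather than interpolated values, so results can differ slightly from np.percentile):

department_score_distribution = data.groupBy("部门") \
    .agg(F.min("能力得分").alias("minimum"),
         F.percentile_approx("能力得分", [0.25, 0.5, 0.75]).alias("q"),
         F.max("能力得分").alias("maximum")) \
    .select(F.col("部门").alias("department"),
            "minimum",
            F.col("q")[0].alias("q1"),
            F.col("q")[1].alias("median"),
            F.col("q")[2].alias("q3"),
            "maximum")

Either way, the writes can be verified by reading a table back before stopping the session, e.g. spark.read.jdbc(url=jdbc_url, table="department_avg_score", properties=connection_properties).show().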