# 重要:数据和特征决定了机器学习的上限,而模型和算法只是不断逼近这个上限。(因此更应重视数据的质量和数量)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# Load the HR dataset; the first CSV row supplies the column names.
df = pd.read_csv("./HR.csv", header=0)
# Summary statistics (DataFrame.describe) of the raw data, taken before any
# cleaning step below modifies df.
summary = df.describe()
## Null detection
# Keep the boolean null mask in a variable: a bare `df.isnull()` expression
# is evaluated and immediately discarded when this file runs as a script.
null_mask = df.isnull()
### Null handling
# Drop every row that is missing either of these two columns.
df = df.dropna(subset=["satisfaction_level", "last_evaluation"], how="any")
## Remove outliers with the interquartile-range (IQR) rule
# Fence multiplier. The original code used `k` without defining it; 1.5 is
# the conventional value for IQR fences.
k = 1.5
le_s = df['satisfaction_level']
q_low = le_s.quantile(q=0.25)
q_high = le_s.quantile(q=0.75)
q_interval = q_high - q_low
# Keep rows strictly inside (Q1 - k*IQR, Q3 + k*IQR). Both conditions are
# combined into a single mask so that it is applied to the same frame it was
# computed from, instead of chaining a second mask onto an already-filtered
# frame. The lower fence is Q1 - k*IQR (the original had the terms reversed).
inside_fences = (le_s > q_low - k * q_interval) & (le_s < q_high + k * q_interval)
df = df[inside_fences]
# Drop rows whose salary is the literal string 'nme'
# (presumably an invalid placeholder value in the data -- TODO confirm).
df = df[df['salary'] != 'nme']
### Feature preprocessing
## Normalization (min-max) and standardization (z-score)
# The scalers take a 2-D (n_samples, n_features) array, hence reshape(-1, 1);
# ravel() flattens the single-column result back to 1-D for assignment.
df["satisfaction_level"] = MinMaxScaler().fit_transform(
    df["satisfaction_level"].values.reshape(-1, 1)
).ravel()
# NOTE: this second step overwrites the min-max result in the same column,
# so the column ends up holding the standardized values.
df["satisfaction_level"] = StandardScaler().fit_transform(
    df["satisfaction_level"].values.reshape(-1, 1)
).ravel()
### Label encoding (categorical values -> integer codes)
column_lst = ["salary", "department"]
# The original indexed column_lst with an undefined `i`; encode each listed
# column in turn, fitting a fresh LabelEncoder per column.
for column in column_lst:
    df[column] = LabelEncoder().fit_transform(df[column])
### Dimensionality reduction
# Project all columns of df onto two principal components.
# Assumes every column is numeric by this point -- TODO confirm.
# The result is bound to a plain variable: assigning an array to a new
# DataFrame attribute (`df.features = ...`) does not create a column and
# makes pandas emit a UserWarning.
features = PCA(n_components=2).fit_transform(df.values)
...
# 待续……