数据的特征预处理实战

最新推荐文章于 2024-11-20 15:55:42 发布

一曲无痕奈何

最新推荐文章于 2024-11-20 15:55:42 发布

阅读量460

点赞数 1

CC 4.0 BY-SA版权

分类专栏：数据建模与数据挖掘 文章标签：数据特征值的处理

本文链接：https://blog.youkuaiyun.com/qq_41479464/article/details/99329181

数据建模与数据挖掘专栏收录该内容

2 篇文章

订阅专栏

简单的数据预处理：

import pandas as pd
import  numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
import pydotplus

#sl:satisfaction_level---False:MinMaxScaler;True:StandardScaler
#le:last_evaluation---False:MinMaxScaler;True:StandardScaler
#npr:number_project---False:MinMaxScaler;True:StandardScaler
#amh:average_monthly_hours--False:MinMaxScaler;True:StandardScaler
#tsc:time_spend_company--False:MinMaxScaler;True:StandardScaler
#wa:Work_accident--False:MinMaxScaler;True:StandardScaler
#pl5:promotion_last_5years--False:MinMaxScaler;True:StandardScaler
#dp:department--False:LabelEncoding;True:OneHotEncoding
#slr:salary--False:LabelEncoding;True:OneHotEncoding

#定义函数读入数据
def hr_preprocessing(sl=False, le=False, npr=False, amh=False, tsc=False, wa=False,
                     pl5=False, dp=False, slr=False, lower_d=False, ld_n=1,
                     data_path="./HR.csv"):
    """Load the HR CSV file, clean it and return scaled/encoded features plus the label.

    Steps performed, in order:

    1. Read ``data_path`` with ``pd.read_csv``.
    2. Drop rows with a missing ``satisfaction_level`` or ``last_evaluation``,
       then keep only rows where ``satisfaction_level <= 1`` and
       ``salary != "nme"``.
    3. Split the ``left`` column off as the label.
    4. Rescale the seven numeric columns and encode ``salary`` and
       ``department`` according to the flags below.
    5. Optionally project the feature matrix with PCA.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: Scaler choice for
            ``satisfaction_level``, ``last_evaluation``, ``number_project``,
            ``average_monthly_hours``, ``time_spend_company``,
            ``Work_accident`` and ``promotion_last_5years`` respectively.
            Falsy -> ``MinMaxScaler``; truthy -> ``StandardScaler``.
        dp: Encoding for ``department``. Falsy -> ``LabelEncoder`` codes
            followed by ``MinMaxScaler``; truthy -> one-hot columns via
            ``pd.get_dummies``.
        slr: Encoding for ``salary``. Falsy -> ``map_salary`` codes followed
            by ``MinMaxScaler``; truthy -> one-hot columns via
            ``pd.get_dummies``.
        lower_d: When truthy, return the PCA projection of the features
            instead of the DataFrame.
        ld_n: ``n_components`` passed to ``PCA`` when ``lower_d`` is truthy.
        data_path: Location of the CSV file to read; defaults to the
            original hard-coded ``"./HR.csv"``.

    Returns:
        A ``(features, label)`` tuple. ``features`` is the processed
        DataFrame, or the array returned by ``PCA.fit_transform`` when
        ``lower_d`` is truthy. ``label`` is the ``left`` column of the
        cleaned data.
    """
    df = pd.read_csv(data_path)

    # 1. Clean the data.
    df = df.dropna(subset=["satisfaction_level", "last_evaluation"])
    # Build both conditions on the same frame and combine them into a single
    # mask. Chaining df[mask_a][mask_b] applies a mask whose index no longer
    # matches the already-filtered frame, which makes pandas reindex the mask
    # and emit a warning.
    keep = (df["satisfaction_level"] <= 1) & (df["salary"] != "nme")
    df = df[keep]

    # 2. Extract the label.
    label = df["left"]
    df = df.drop("left", axis=1)

    # 3. Feature selection (nothing is dropped here).
    # 4. Feature processing: numeric columns first.
    numeric_columns = (
        ("satisfaction_level", sl),
        ("last_evaluation", le),
        ("number_project", npr),
        ("average_monthly_hours", amh),
        ("time_spend_company", tsc),
        ("Work_accident", wa),
        ("promotion_last_5years", pl5),
    )
    for column, standardize in numeric_columns:
        scaler = StandardScaler() if standardize else MinMaxScaler()
        # The scalers expect a 2-D (n_samples, 1) array; flatten the result
        # back to 1-D before storing it in the column.
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    # Categorical columns: salary is handled before department, as before.
    for column, one_hot in (("salary", slr), ("department", dp)):
        if one_hot:
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == "salary":
            # Salary goes through the explicit map_salary mapping rather than
            # LabelEncoder.
            df[column] = [map_salary(s) for s in df["salary"].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # Bring the integer codes into the same [0, 1] range as the
        # min-max-scaled numeric columns.
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    if lower_d:
        # PCA is fitted on the features only; it does not use the label.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label


# Integer code assigned to each known salary level.
d = {"low": 0, "medium": 1, "high": 2}


def map_salary(s):
    """Return the integer code for salary level *s*; values not in ``d`` map to 0."""
    if s in d:
        return d[s]
    return 0
def main():
    """Run hr_preprocessing with its default arguments and print what it returns."""
    result = hr_preprocessing()
    print(result)


if __name__ == '__main__':
    main()

部分数据：

satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0.38,0.53,2,157,3,0,1,0,sales,low
0.8,0.86,5,262,6,0,1,0,sales,medium
0.11,0.88,7,272,4,0,1,0,sales,medium
0.72,0.87,5,223,5,0,1,0,sales,low
0.37,0.52,2,159,3,0,1,0,sales,low
0.41,0.5,2,153,3,0,1,0,sales,low
0.1,0.77,6,247,4,0,1,0,sales,low
0.92,0.85,5,259,5,0,1,0,sales,low
0.89,1,5,224,5,0,1,0,sales,low
0.42,0.53,2,142,3,0,1,0,sales,low
0.45,0.54,2,135,3,0,1,0,sales,low
0.11,0.81,6,305,4,0,1,0,sales,low
0.84,0.92,4,234,5,0,1,0,sales,low
0.41,0.55,2,148,3,0,1,0,sales,low
0.36,0.56,2,137,3,0,1,0,sales,low
0.38,0.54,2,143,3,0,1,0,sales,low
0.45,0.47,2,160,3,0,1,0,sales,low
0.78,0.99,4,255,6,0,1,0,sales,low
0.45,0.51,2,160,3,1,1,1,sales,low
0.76,0.89,5,262,5,0,1,0,sales,low
0.11,0.83,6,282,4,0,1,0,sales,low
0.38,0.55,2,147,3,0,1,0,sales,low
0.09,0.95,6,304,4,0,1,0,sales,low
0.46,0.57,2,139,3,0,1,0,sales,low
0.4,0.53,2,158,3,0,1,0,sales,low
0.89,0.92,5,242,5,0,1,0,sales,low
0.82,0.87,4,239,5,0,1,0,sales,low
0.4,0.49,2,135,3,0,1,0,sales,low
0.41,0.46,2,128,3,0,1,0,accounting,low
0.38,0.5,2,132,3,0,1,0,accounting,low
0.09,0.62,6,294,4,0,1,0,accounting,low
0.45,0.57,2,134,3,0,1,0,hr,low
0.4,0.51,2,145,3,0,1,0,hr,low
0.45,0.55,2,140,3,0,1,0,hr,low
0.84,0.87,4,246,6,0,1,0,hr,low
0.1,0.94,6,255,4,0,1,0,technical,low
0.38,0.46,2,137,3,0,1,0,technical,low
0.45,0.5,2,126,3,0,1,0,technical,low
0.11,0.89,6,306,4,0,1,0,technical,low
0.41,0.54,2,152,3,0,1,0,technical,low
0.87,0.88,5,269,5,0,1,0,technical,low
0.45,0.48,2,158,3,0,1,0,technical,low
0.4,0.46,2,127,3,0,1,0,technical,low
0.1,0.8,7,281,4,0,1,0,technical,low
0.09,0.89,6,276,4,0,1,0,technical,low
0.84,0.74,3,182,4,0,1,0,technical,low
0.4,0.55,2,147,3,0,1,0,support,low
0.57,0.7,3,273,6,0,1,0,support,low
0.4,0.54,2,148,3,0,1,0,support,low
0.43,0.47,2,147,3,0,1,0,support,low
0.13,0.78,6,152,2,0,1,0,support,low
0.44,0.55,2,135,3,0,1,0,support,low
0.38,0.55,2,134,3,0,1,0,support,low
0.39,0.54,2,132,3,0,1,0,support,low
0.1,0.92,7,307,4,0,1,0,support,low
0.37,0.46,2,140,3,0,1,0,support,low
0.11,0.94,7,255,4,0,1,0,support,low
0.1,0.81,6,309,4,0,1,0,technical,low
0.38,0.54,2,128,3,0,1,0,technical,low
0.85,1,4,225,5,0,1,0,technical,low
0.85,0.91,5,226,5,0,1,0,management,medium
0.11,0.93,7,308,4,0,1,0,IT,medium
0.1,0.95,6,244,5,0,1,0,IT,medium
0.36,0.56,2,132,3,0,1,0,IT,medium
0.11,0.94,6,286,4,0,1,0,IT,medium
0.81,0.7,6,161,4,0,1,0,IT,medium
0.43,0.54,2,153,3,0,1,0,product_mng,medium
0.9,0.98,4,264,6,0,1,0,product_mng,medium
0.76,0.86,5,223,5,1,1,0,product_mng,medium
0.43,0.5,2,135,3,0,1,0,product_mng,medium
0.74,0.99,2,277,3,0,1,0,IT,medium
0.09,0.77,5,275,4,0,1,0,product_mng,medium
0.45,0.49,2,149,3,0,1,0,product_mng,high
0.09,0.87,7,295,4,0,1,0,product_mng,low
0.11,0.97,6,277,4,0,1,0,product_mng,medium
0.11,0.79,7,306,4,0,1,0,product_mng,medium
0.1,0.83,6,295,4,0,1,0,product_mng,medium
0.4,0.54,2,137,3,0,1,0,marketing,medium
0.43,0.56,2,157,3,0,1,0,sales,low
0.39,0.56,2,142,3,0,1,0,accounting,low
0.45,0.54,2,140,3,0,1,0,support,low
0.38,0.49,2,151,3,0,1,0,technical,low
0.79,0.59,4,139,3,0,1,1,management,low
0.84,0.85,4,249,6,0,1,0,marketing,low
0.11,0.77,6,291,4,0,1,0,marketing,low
0.11,0.87,6,305,4,0,1,0,marketing,low
0.17,0.84,5,232,3,0,1,0,sales,low
0.44,0.45,2,132,3,0,1,0,sales,low
0.37,0.57,2,130,3,0,1,0,sales,low
0.1,0.79,6,291,4,0,1,0,sales,low
0.4,0.5,2,130,3,0,1,0,sales,low
0.89,1,5,246,5,0,1,0,sales,low
0.42,0.48,2,143,3,0,1,0,sales,low
0.46,0.55,2,129,3,0,1,0,sales,low
0.09,0.83,6,255,4,0,1,0,sales,low
0.37,0.51,2,155,3,0,1,0,sales,low
0.1,0.77,6,265,4,0,1,0,sales,low
0.1,0.84,6,279,4,0,1,0,sales,low
0.11,0.97,6,284,4,0,1,0,sales,low