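# Building on the previous level, apply mean-variance (z-score) standardization
# to the numeric variables (x1-x6) of the feature matrix X.
# Note: the nominal variables x7-x15 do not need to be standardized.
# Return the result as X1.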
def return_values():
    """Build the preprocessed feature matrix X1 from the loan-approval workbook.

    Steps, in order:
      1. Read '银行贷款审批数据.xlsx' with pandas.
      2. Take columns 0-5 as the numeric variables (x1-x6) and columns 6-14
         as the nominal variables (x7-x15).
      3. Fill missing numeric values with the column mean and missing
         nominal values with the most frequent value of the column.
      4. Standardize only the numeric columns with ``StandardScaler``;
         the nominal columns are left unscaled.
      5. Horizontally stack the scaled numeric block and the imputed
         nominal block.

    Returns:
        X1: the stacked array, numeric columns first, nominal columns after.
    """
    import pandas as pd
    import numpy as np
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler

    frame = pd.read_excel('银行贷款审批数据.xlsx')

    # Positional split: the first six columns are x1..x6, the next nine x7..x15.
    numeric_part = frame.iloc[:, 0:6]
    nominal_part = frame.iloc[:, 6:15]

    # Mean imputation for the numeric variables.
    numeric_filled = SimpleImputer(strategy='mean').fit_transform(numeric_part)

    # Most-frequent-value imputation for the nominal variables.
    nominal_filled = SimpleImputer(strategy='most_frequent').fit_transform(nominal_part)

    # Scale after imputation, and only on the numeric block.
    numeric_scaled = StandardScaler().fit_transform(numeric_filled)

    X1 = np.hstack((numeric_scaled, nominal_filled))
    return X1