# 读取“银行贷款审批数据.xlsx”表,自变量为x1~x15,决策变量为y(1-同意贷款,0-不同意贷款)
# 其中x1~x6为数值变量,x7~x15为名义变量
# 请对x1~x6中存在的缺失值用均值策略填充,x7~x15用最频繁值策略填充
# 最后返回填充处理后的X(即x1~x15),以及决策变量Y(即y)
def return_values():
    """Load the bank-loan approval sheet and impute missing feature values.

    The workbook '银行贷款审批数据.xlsx' is read from the current working
    directory. The code addresses columns by position: the first six
    columns are treated as the numeric features x1..x6, the next nine as
    the nominal features x7..x15, and the last column as the decision
    variable y (per the task statement: 1 = approve, 0 = reject).

    Returns:
        tuple: ``(X, Y)`` where ``X`` is a NumPy array holding the 15
        imputed feature columns (x1..x6 mean-filled, x7..x15 filled with
        the most frequent value) and ``Y`` is the last column of the sheet
        as a pandas Series.
    """
    import pandas as pd
    import numpy as np
    from sklearn.impute import SimpleImputer

    data = pd.read_excel('银行贷款审批数据.xlsx')

    numerical_vars = data.iloc[:, 0:6]  # x1 .. x6
    # Stop at column 15: an open-ended slice (6:) would also pull in the
    # label column y and leak the target into the feature matrix.
    categorical_vars = data.iloc[:, 6:15]  # x7 .. x15

    # Numeric features: replace missing values with the column mean.
    numeric_imputer = SimpleImputer(strategy='mean')
    numeric_filled = numeric_imputer.fit_transform(numerical_vars)

    # Nominal features: replace missing values with the column mode.
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    categorical_filled = categorical_imputer.fit_transform(categorical_vars)

    X = np.hstack((numeric_filled, categorical_filled))
    Y = data.iloc[:, -1]
    return (X, Y)