在 py 文件中,一次性处理 data 数据中所有的连续变量和离散变量:
1. 读取data数据
2. 对离散变量进行one-hot编码
3. 将独热编码后的变量转化为 int 类型
4. 对所有缺失值进行填充
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import pandas as pd
# Load the dataset and print its first rows and its shape as a quick check.
data = pd.read_csv(r'C:\Users\盘陇\Desktop\项目尝试\项目一信贷风险预测\data.csv')
print(data.head())
print(data.shape)

# Split into label and features: 'Credit Default' is the label column and
# every other column is kept as a feature.
y = data['Credit Default']
x = data.drop(columns=['Credit Default'])
print(y)
# Fill missing values in the numeric (float64/int64) columns with each
# column's median. The filled Series is assigned back to the frame rather than
# calling fillna(..., inplace=True) on x[column]: that chained in-place form
# acts on an intermediate Series, is deprecated in pandas 2.x, and does not
# update the frame once copy-on-write is enabled.
for column in x.select_dtypes(include=['float64', 'int64']).columns:
    x[column] = x[column].fillna(x[column].median())

# Report how many missing values remain in every column.
# NOTE(review): object-typed columns are not filled above, so any gaps they
# contain will still show up in these counts -- confirm whether they should
# be filled as well.
missing_values_count = x.isnull().sum()
print("每一列的缺失值个数:")
print(missing_values_count)
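# Alternative (disabled): fill with the mode instead of the median.
# Series.mode() returns a Series, so its first entry is used as the fill value.
# for column in x.columns:
#     if x[column].dtype in ['float64', 'int64']:
#         mode = x[column].mode()[0]
#         x[column] = x[column].fillna(mode)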
# Partition the feature columns by dtype: object columns are treated as
# discrete features, float64/int64 columns as continuous features.
discrete_features = x.select_dtypes(include=['object']).columns
continuous_features = x.select_dtypes(include=['float64', 'int64']).columns

# Standardize the continuous columns with StandardScaler.
scaler = StandardScaler()
x_continuous = scaler.fit_transform(x.loc[:, continuous_features])
# One-hot encode the unordered discrete features. 'Years in current job' is
# left out here because it is encoded separately as an ordered feature.
nominal_features = discrete_features.drop('Years in current job')

# dtype=int makes the encoder emit integer 0/1 indicators directly. The
# earlier post-hoc cast iterated over an empty list, so it never ran, and it
# indexed the encoder's sparse output by row rather than by feature.
onehot_encoder = OneHotEncoder(dtype=int)
x_discrete = onehot_encoder.fit_transform(x[nominal_features])
# Encode the ordered discrete feature as integer codes, overwriting the
# original column in x.
# NOTE(review): LabelEncoder numbers classes in sorted label order, which may
# not match the intended ordering of the categories -- confirm against the
# actual values in this column.
ordinal_column = 'Years in current job'
label_encoder = LabelEncoder()
x[ordinal_column] = label_encoder.fit_transform(x[ordinal_column])
929

被折叠的 条评论
为什么被折叠?



