数据的预处理
结合前几天对数据预处理的相关知识,对data.csv数据进行完整的处理:
import pandas as pd
# Load the dataset from data.csv (path is relative to the working directory).
data = pd.read_csv('data.csv')
# Show the DataFrame summary. info() must be called: it prints its report
# itself and returns None, so wrapping the bare attribute in print() would
# only show the bound method's repr instead of the summary.
data.info()
# Show the first five rows.
print(data.head(5))
# Show the column labels.
print(data.columns)
# Count how often each distinct value occurs in these two columns.
print(data['Years in current job'].value_counts())
print(data['Home Ownership'].value_counts())
#标签编码
mapping = {
'Years in current job':{
'10+ years':10,
'2 years':2,
'3 years':3,
'< 1 year':0,
'5 years':5,
'1 year':1,
'4 years':4,
'6 years':6,
'7 years':7,
'8 years':8,
'9 years':9,
},
'Home Ownership':{
'Home Mortgage':0,
'Rent':1,
'Own Home':2,
'Have Mortgage':3,
}
}
data['Years in current job'] = data['Years in current job'].map(mapping['Years in current job'])
data['Home Ownership'] = data['Home Ownership'].map(mapping['Home Ownership'])
print(data['Home Ownership'].head(5))
print(data['Years in current job'].head(5))
热力图的绘制
# Columns whose dtype is 'object' are treated as discrete features.
discrete_features = [
    column for column in data.columns if data[column].dtype == 'object'
]
print(discrete_features)
# Every remaining column is treated as a continuous feature.
continuous_features = [
    column for column in data.columns if column not in discrete_features
]
print(continuous_features)
# NOTE(review): `plt` and `sns` are not imported in the visible part of this
# file; presumably `import matplotlib.pyplot as plt` and
# `import seaborn as sns` are required - confirm.
# Compute the correlation matrix of the continuous features.
correlation_matrix = data[continuous_features].corr()
# Raise the figure resolution.
plt.rcParams['figure.dpi'] = 300
# Draw the annotated heatmap with the colour scale pinned to [-1, 1].
plt.figure(figsize=(6, 4))
heatmap_options = dict(annot=True, cmap='coolwarm', vmin=-1, vmax=1)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap of Continuous Features')
plt.show()
子图的绘制
# Drawing subplots
# NOTE(review): `plt` is not imported in the visible part of this file;
# presumably `import matplotlib.pyplot as plt` is required - confirm.
# Features to plot, one boxplot per feature.
features = ['Annual Income', 'Years in current job', 'Tax Liens', 'Number of Open Accounts']
# Raise the figure resolution.
plt.rcParams['figure.dpi'] = 300
# Create a grid of subplots with 2 rows and 2 columns.
fig, axes = plt.subplots(2, 2, figsize=(6, 4))
for i, feature in enumerate(features):
    # divmod(i, 2) returns (i // 2, i % 2): the floor division selects the
    # grid row and the remainder selects the grid column.
    # i = 0, 1, 2, 3 -> (0, 0), (0, 1), (1, 0), (1, 1)
    row, col = divmod(i, 2)
    # Draw the boxplot; missing values are dropped before plotting.
    axes[row, col].boxplot(data[feature].dropna())
    axes[row, col].set_title(f'Boxplot of {feature}')
    axes[row, col].set_ylabel(feature)
# Adjust the spacing between subplots.
plt.tight_layout()
# Display the figure.
plt.show()
351

被折叠的 条评论
为什么被折叠?



