import pandas as pd
data = pd.read_csv("heart.csv")
print(data.head(10))
# age:年龄
# sex:性别
# cp:胸痛类型
# trestbps:静息血压
# chol:胆固醇
# fbs:空腹血糖
# restecg:静息心电图
# thalach:最大心率
# exang:运动诱发心绞痛
# oldpeak:旧峰值(医学上常指运动后 ST 段压低)
# slope:斜率
# ca:冠状动脉(钙化情况)
# thal:地中海贫血(或心肌灌注等医学相关指标)
# target:目标
print(data.isnull().sum())
# 设定唯一值数量的阈值
threshold = 10
# 创建一个空列表,用于存储离散变量的列名
discrete_vars = []
# 遍历 data 中的所有列
for col in data.columns:
# 计算当前列的唯一值数量
unique_values = data[col].nunique()
# 判断唯一值数量是否小于等于阈值
if unique_values <= threshold:
# 如果满足条件,将该列的列名添加到离散变量列表中
discrete_vars.append(col)
# 打印出识别出的离散变量的列名
print("离散变量有:", discrete_vars)
# 识别连续变量,从数值型变量中排除离散变量
all_numeric_vars = data.select_dtypes(include=['number']).columns
continuous_vars = [var for var in all_numeric_vars if var not in discrete_vars]
continuous_vars = pd.Index(continuous_vars) # 转换回 pandas.Index 类型
# 打印整理出的连续变量
print("连续变量有:", list(continuous_vars))
# 定义映射字典
mapping = {
'cp': {0: 0, 1: 1, 2: 2, 3: 3},
'restecg': {0: 0, 1: 1, 2: 2},
'slope': {0: 0, 1: 1, 2: 2},
'ca': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
'thal': {0: 0, 1: 1, 2: 2, 3: 3}
}
for feature, mapping in mapping.items():
data[feature] = data[feature].map(mapping)
# 借助sklearn库进行归一化处理
from sklearn.preprocessing import StandardScaler, MinMaxScaler
continuous_vars = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
# 归一化操作
scaler_minmax = MinMaxScaler()
data[continuous_vars] = scaler_minmax.fit_transform(data[continuous_vars])
# # 标准化操作
# scaler_standard = StandardScaler()
# data[continuous_vars] = scaler_standard.fit_transform(data[continuous_vars])
# 对离散变量进行独热编码
data = pd.get_dummies(data, columns=discrete_vars, drop_first=True)
print(data.head(10).T)
结果