数据集
数据集:威斯康星州乳腺癌数据
代码
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import json
import sklearn
%matplotlib inline
#sklearn library
# 1.model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import StratifiedKFold
# 2.preprocessing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
# 3.metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# 4.model
import lightgbm as lgb
from bayes_opt import BayesianOptimization
# 5.plot
import scikitplot as skplt
import warnings
warnings.filterwarnings('ignore')
font = {
'family':'Helvetica, Ariel',
'weight':'normal',
'size':12}
plt.rc('font', **font)
sns.set_context('notebook')
sns.set_style("ticks")
FIG_FONT = dict(family="Helvetica, Ariel", weight="bold", color="#7f7f7f")
sns.set_palette("Spectral")
df = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
df
df.diagnosis=[1 if i =='M' else 0 for i in df.diagnosis.values]
df.drop(['Unnamed: 32','id'],axis=1,inplace=True)
fig,ax = plt.subplots(figsize=(18, 18))
axes = sns.heatmap(df.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)
#Simply select the features with a correlation>0.5 with diagnosis
features = list(df.corr()[df.corr()['diagnosis']>0.5].index)
X=df[features]
fig, axes = plt.subplots(4,4,figsize=(20,20))
cnt = 0
for i in range(4):
for j in range(4):
feature_element = features[cnt]
cnt+=1
plt.sca(axes[i,j])
sns.kdeplot(df[df.diagnosis==1][feature_element],shade=True,alpha=0.2,legend=True,label='diagnosis:M')
sns.kdeplot(df[df.diagnosis==0][feature_element],shade=True,alpha=0.2,legend=True,label='diagnosis:B'