# Pre-process dataset

# 1. Fill missing values
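The rest of this section relies on the imports and the cvKFold splitter sketched below; they may already be defined in an earlier cell, and the 10-fold stratified setup is an assumption rather than something stated in this section.

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Assumption: cvKFold is used throughout but not defined in this section;
# 10-fold stratified cross-validation is a common choice for this kind of task.
cvKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)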
X = X.replace("?", np.nan)
si = SimpleImputer(missing_values=np.nan, strategy='mean')
si.fit(X)
X = si.transform(X)

# 2. Normalization
mms = MinMaxScaler()
X = mms.fit_transform(X)

# 3. Changing the class values
le = LabelEncoder()
y = le.fit_transform(y)

# Print first ten rows of the pre-processed dataset to 4 decimal places, as per the assignment spec.
# A function is provided to assist.
def print_data(X, y, n_rows=10):
    """Takes a numpy data array and target and prints the first n_rows rows.

    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples,)
        n_rows: number of rows to print
    """
    for example_num in range(n_rows):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")
        if example_num == len(X) - 1:
            print(y[example_num], end="")
        else:
            print(y[example_num])
print_data(X, y)

# Logistic Regression
def logregClassifier(X, y):
    scores = cross_val_score(LogisticRegression(random_state=0), X, y, cv=cvKFold)
    return scores.mean()

# Naïve Bayes
def nbClassifier(X, y):
    scores = cross_val_score(GaussianNB(), X, y, cv=cvKFold)
    return scores.mean()

# Decision Tree
def dtClassifier(X, y):
    scores = cross_val_score(DecisionTreeClassifier(random_state=0, criterion="entropy"), X, y, cv=cvKFold)
    return scores.mean()

# Ensembles: Bagging, AdaBoost and Gradient Boosting
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    scores = cross_val_score(
        BaggingClassifier(
            DecisionTreeClassifier(random_state=0, criterion="entropy", max_depth=max_depth),
            max_samples=max_samples,
            n_estimators=n_estimators,
            random_state=0),
        X, y, cv=cvKFold)
    return scores.mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    scores = cross_val_score(
        AdaBoostClassifier(
            DecisionTreeClassifier(random_state=0, criterion="entropy", max_depth=max_depth),
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            random_state=0),
        X, y, cv=cvKFold)
    return scores.mean()

def gbClassifier(X, y, n_estimators, learning_rate):
    scores = cross_val_score(
        GradientBoostingClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            random_state=0),
        X, y, cv=cvKFold)
    return scores.mean()
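A possible way to call these helpers and report the averaged cross-validation accuracies is sketched below; the ensemble hyperparameter values are placeholders, not settings taken from the assignment.

# Example calls (hyperparameter values are placeholders, not the assignment's settings)
print("LogR average cross-validation accuracy: {:.4f}".format(logregClassifier(X, y)))
print("NB average cross-validation accuracy: {:.4f}".format(nbClassifier(X, y)))
print("DT average cross-validation accuracy: {:.4f}".format(dtClassifier(X, y)))
print("Bagging average cross-validation accuracy: {:.4f}".format(
    bagDTClassifier(X, y, n_estimators=50, max_samples=100, max_depth=5)))
print("AdaBoost average cross-validation accuracy: {:.4f}".format(
    adaDTClassifier(X, y, n_estimators=50, learning_rate=0.5, max_depth=5)))
print("GB average cross-validation accuracy: {:.4f}".format(
    gbClassifier(X, y, n_estimators=50, learning_rate=0.5)))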
Part 2: Cross-validation with parameter tuning
# KNN
k = [1, 3, 5, 7, 9]
p = [1, 2]

def bestKNNClassifier(X, y):
    gs = GridSearchCV(KNeighborsClassifier(), {"n_neighbors": k, "p": p}, cv=cvKFold)
    gs.fit(X, y)
    return gs
print(KNeighborsClassifier().get_params().keys())

# SVM
# You should use SVC from sklearn.svm with kernel set to 'rbf'.
C = [0.01, 0.1, 1, 5, 15]
gamma = [0.01, 0.1, 1, 10, 50]

def bestSVMClassifier(X, y):
    gs = GridSearchCV(SVC(kernel='rbf'), {"C": C, "gamma": gamma}, cv=cvKFold)
    gs.fit(X, y)
    return gs
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to 'sqrt'.
n_estimators = [10, 30, 60, 100, 150]
max_leaf_nodes = [6, 12, 18]

def bestRFClassifier(X, y):
    gs = GridSearchCV(
        RandomForestClassifier(criterion="entropy", max_features='sqrt', random_state=0),
        {"n_estimators": n_estimators, "max_leaf_nodes": max_leaf_nodes},
        cv=cvKFold)
    gs.fit(X, y)
    return gs
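Each of the three functions above returns a fitted GridSearchCV object, so the tuned settings can be read from its best_params_ and best_score_ attributes; the reporting below is only an illustrative sketch, not the assignment's required output format.

# Example reporting of the grid-search results (output format is illustrative)
knn_gs = bestKNNClassifier(X, y)
print("KNN best k:", knn_gs.best_params_["n_neighbors"])
print("KNN best p:", knn_gs.best_params_["p"])
print("KNN cross-validation accuracy: {:.4f}".format(knn_gs.best_score_))

svm_gs = bestSVMClassifier(X, y)
print("SVM best C:", svm_gs.best_params_["C"])
print("SVM best gamma:", svm_gs.best_params_["gamma"])
print("SVM cross-validation accuracy: {:.4f}".format(svm_gs.best_score_))

rf_gs = bestRFClassifier(X, y)
print("RF best n_estimators:", rf_gs.best_params_["n_estimators"])
print("RF best max_leaf_nodes:", rf_gs.best_params_["max_leaf_nodes"])
print("RF cross-validation accuracy: {:.4f}".format(rf_gs.best_score_))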