# Pre-process dataset

# 1. Fill missing values
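The rest of this section relies on the imports and the cvKFold splitter sketched below; they may already be defined in an earlier cell, and the 10-fold stratified setup is an assumption rather than something stated in this section.

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Assumption: cvKFold is used throughout but not defined in this section;
# 10-fold stratified cross-validation is a common choice for this kind of task.
cvKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)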
X = X.replace("?", np.nan)
si = SimpleImputer(missing_values=np.nan, strategy='mean')
si.fit(X)
X = si.transform(X)

# 2. Normalization
mms = MinMaxScaler()
X = mms.fit_transform(X)

# 3. Changing the class values
le = LabelEncoder()
y = le.fit_transform(y)

# Print first ten rows of the pre-processed dataset to 4 decimal places, as per the assignment spec.
# A function is provided to assist.
def print_data(X, y, n_rows=10):
    """Takes a numpy data array and target and prints the first n_rows rows.

    Arguments:
        X: numpy array of shape (n_examples, n_features)
        y: numpy array of shape (n_examples,)
        n_rows: number of rows to print
    """
    for example_num in range(n_rows):
        for feature in X[example_num]:
            print("{:.4f}".format(feature), end=",")
        if example_num == len(X) - 1:
            print(y[example_num], end="")
        else:
            print(y[example_num])
print_data(X, y)

# Logistic Regression
def logregClassifier(X, y):
    scores = cross_val_score(LogisticRegression(random_state=0), X, y, cv=cvKFold)
    return scores.mean()

# Naïve Bayes
def nbClassifier(X, y):
    scores = cross_val_score(GaussianNB(), X, y, cv=cvKFold)
    return scores.mean()

# Decision Tree
def dtClassifier(X, y):
    scores = cross_val_score(DecisionTreeClassifier(random_state=0, criterion="entropy"), X, y, cv=cvKFold)
    return scores.mean()

# Ensembles: Bagging, AdaBoost and Gradient Boosting
def bagDTClassifier(X, y, n_estimators, max_samples, max_depth):
    scores = cross_val_score(
        BaggingClassifier(
            DecisionTreeClassifier(random_state=0, criterion="entropy", max_depth=max_depth),
            max_samples=max_samples,
            n_estimators=n_estimators,
            random_state=0),
        X, y, cv=cvKFold)
    return scores.mean()

def adaDTClassifier(X, y, n_estimators, learning_rate, max_depth):
    scores = cross_val_score(
        AdaBoostClassifier(
            DecisionTreeClassifier(random_state=0, criterion="entropy", max_depth=max_depth),
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            random_state=0),
        X, y, cv=cvKFold)
    return scores.mean()

def gbClassifier(X, y, n_estimators, learning_rate):
    scores = cross_val_score(
        GradientBoostingClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            random_state=0),
        X, y, cv=cvKFold)
    return scores.mean()
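A possible way to call these helpers and report the averaged cross-validation accuracies is sketched below; the ensemble hyperparameter values are placeholders, not settings taken from the assignment.

# Example calls (hyperparameter values are placeholders, not the assignment's settings)
print("LogR average cross-validation accuracy: {:.4f}".format(logregClassifier(X, y)))
print("NB average cross-validation accuracy: {:.4f}".format(nbClassifier(X, y)))
print("DT average cross-validation accuracy: {:.4f}".format(dtClassifier(X, y)))
print("Bagging average cross-validation accuracy: {:.4f}".format(
    bagDTClassifier(X, y, n_estimators=50, max_samples=100, max_depth=5)))
print("AdaBoost average cross-validation accuracy: {:.4f}".format(
    adaDTClassifier(X, y, n_estimators=50, learning_rate=0.5, max_depth=5)))
print("GB average cross-validation accuracy: {:.4f}".format(
    gbClassifier(X, y, n_estimators=50, learning_rate=0.5)))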
Part 2: Cross-validation with parameter tuning
# KNN
k = [1, 3, 5, 7, 9]
p = [1, 2]

def bestKNNClassifier(X, y):
    gs = GridSearchCV(KNeighborsClassifier(), {"n_neighbors": k, "p": p}, cv=cvKFold)
    gs.fit(X, y)
    return gs
print(KNeighborsClassifier().get_params().keys())

# SVM
# You should use SVC from sklearn.svm with kernel set to 'rbf'.
C = [0.01, 0.1, 1, 5, 15]
gamma = [0.01, 0.1, 1, 10, 50]

def bestSVMClassifier(X, y):
    gs = GridSearchCV(SVC(kernel='rbf'), {"C": C, "gamma": gamma}, cv=cvKFold)
    gs.fit(X, y)
    return gs
# Random Forest
# You should use RandomForestClassifier from sklearn.ensemble with information gain and max_features set to 'sqrt'.
n_estimators = [10, 30, 60, 100, 150]
max_leaf_nodes = [6, 12, 18]

def bestRFClassifier(X, y):
    gs = GridSearchCV(
        RandomForestClassifier(criterion="entropy", max_features='sqrt', random_state=0),
        {"n_estimators": n_estimators, "max_leaf_nodes": max_leaf_nodes},
        cv=cvKFold)
    gs.fit(X, y)
    return gs
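Each of the three functions above returns a fitted GridSearchCV object, so the tuned settings can be read from its best_params_ and best_score_ attributes; the reporting below is only an illustrative sketch, not the assignment's required output format.

# Example reporting of the grid-search results (output format is illustrative)
knn_gs = bestKNNClassifier(X, y)
print("KNN best k:", knn_gs.best_params_["n_neighbors"])
print("KNN best p:", knn_gs.best_params_["p"])
print("KNN cross-validation accuracy: {:.4f}".format(knn_gs.best_score_))

svm_gs = bestSVMClassifier(X, y)
print("SVM best C:", svm_gs.best_params_["C"])
print("SVM best gamma:", svm_gs.best_params_["gamma"])
print("SVM cross-validation accuracy: {:.4f}".format(svm_gs.best_score_))

rf_gs = bestRFClassifier(X, y)
print("RF best n_estimators:", rf_gs.best_params_["n_estimators"])
print("RF best max_leaf_nodes:", rf_gs.best_params_["max_leaf_nodes"])
print("RF cross-validation accuracy: {:.4f}".format(rf_gs.best_score_))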