ZhankunLuo_TakeHomeExamProblem2

This report examines feature selection for diabetes prediction: the Relief algorithm identifies four key features, and three classifiers (KNN, SSE, and LDA) are then compared on the reduced data, showing how their predictions differ in accuracy.

Take Home Exam Problem2

Zhankun Luo

PUID: 0031195279

Email: luo333@pnw.edu

Fall-2018-ECE-59500-009

Instructor: Toma Hentea

Function

plot_point

function plot_point(X,y)
%this function can handle up to 6 different classes
[l,N]=size(X); %N=no. of data vectors, l=dimensionality
if(l~=2)
    fprintf('NO PLOT CAN BE GENERATED\n')
    return
else
    pale=['ro';'g+';'b.';'y.';'m.';'c.']; 
    %Plot of the data vectors
    hold on
    for i=1:N
       plot(X(1,i),X(2,i),pale(y(i),:))
    end    
    hold off
end
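A minimal usage sketch (mirroring the plotting step of the main script below, which plots the first two of the selected features):

Y1_plot = Y1; Y1_plot(Y1_plot == 0) = 2;   % plot_point expects class labels starting at 1
plot_point(X1R([1, 2], :), Y1_plot);       % scatter plot of the first two selected features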

Step 1: find the best combination of 4 features.

With Relief( X, y ), get the weight of each feature X(i, :) with respect to the outcome y.

Relief

function weight = Relief( X, y )
%weight = Relief( X, y )
% for 2 classes
%OUTPUT: weight of the different features (dimensions) with respect to y
[l, N] = size(X);
max_X = max(X, [], 2) * ones(1, N); % return max of each row/dimension
min_X = min(X, [], 2) * ones(1, N);
X_process = (X - min_X)./ (max_X - min_X); % 0 <= X_process <= 1
X_process_class1 = X_process(:, find(y == 1)); N1 = size(X_process_class1, 2);
X_process_class2 = X_process(:, find(y == 0)); N2 = size(X_process_class2, 2);
weight = zeros(l, 1);
for i = 1:N1
    dist_1 = X_process_class1 - X_process_class1(:, i) * ones(1, N1);
    [Dist_1, Index_1] = sort(sum(dist_1.^2));
    diff_1 = Dist_1(2); index_1 = Index_1(2); % nearest hit (skip the point itself)
    dist_2 = X_process_class2 - X_process_class1(:, i) * ones(1, N2);
    [Dist_2, Index_2] = sort(sum(dist_2.^2));
    diff_2 = Dist_2(1); index_2 = Index_2(1); % nearest miss (closest point of the other class)
    weight = weight - abs(dist_1(:, index_1)) + abs(dist_2(:, index_2));
end
for i = 1:N2
    dist_2 = X_process_class2 - X_process_class2(:, i) * ones(1, N2);
    [Dist_2, Index_2] = sort(sum(dist_2.^2));
    diff_2 = Dist_2(2); index_2 = Index_2(2); % nearest hit (skip the point itself)
    dist_1 = X_process_class1 - X_process_class2(:, i) * ones(1, N1);
    [Dist_1, Index_1] = sort(sum(dist_1.^2));
    diff_1 = Dist_1(1); index_1 = Index_1(1); % nearest miss (closest point of the other class)
    weight = weight - abs(dist_2(:, index_2)) + abs(dist_1(:, index_1));
end
weight = weight / N;
end
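A minimal usage sketch (this mirrors the feature-selection step in the main script below):

weight = Relief(X1, Y1);                                % one weight per feature (row of X1)
[weight_sort, weight_index] = sort(weight, 'descend');  % rank features by weight
X1R = X1(weight_index(1:4), :);                         % keep the 4 highest-weighted features
X2R = X2(weight_index(1:4), :);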

Step 2: Design a classifier using the training set (X1R, Y1), and use it to predict the labels Y2 for the test set X2R.

(0) normalize data

normalize

function [X_norm, Xtest_norm] = normalize( X, Xtest )
%[X_norm, Xtest_norm] = normalize( X, Xtest )
[l, N] = size(X);
Ntest = size(Xtest, 2);
max_X = max(X, [], 2) * ones(1, N); % max of each row/feature over the training set
min_X = min(X, [], 2) * ones(1, N);
X_norm = (X - min_X)./ (max_X - min_X); % 0 <= X_norm <= 1
max_Xtest = max(X, [], 2) * ones(1, Ntest); % training max replicated for the test set
min_Xtest = min(X, [], 2) * ones(1, Ntest); % training min replicated for the test set
Xtest_norm = (Xtest - min_Xtest)./ (max_Xtest - min_Xtest); % scaled with training min/max, may fall slightly outside [0, 1]
end
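Note that the test set is scaled with the training minimum and maximum, so Xtest_norm may fall slightly outside [0, 1]. A minimal usage sketch, as in the main script:

[X1R_norm, X2R_norm] = normalize(X1R, X2R);   % test data scaled with training min/max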

(1) KNN algorithm

k_nn_classifier

function z = k_nn_classifier(Z, v, k, X)
[l, N1] = size(Z); %in Z we have the training data
[l, N] = size(X); %in X we have the points to be classified
c = max(v); %The number of classes
%in v we have the classes to which the vectors in Z belong

%Computation of the (squared) Euclidean distance of a point in X from each
%reference vector
for i = 1:N
    dist = sum((X(:, i) * ones(1, N1) - Z).^2);
    %sorting the above distances in ascending order
    [sorted, nearest]=sort(dist); % MODE='ASCEND' (ascending order)
    %counting the class occurrences among the k-closest reference vectors
    %Z(:,i)
    refe = zeros(1, c); %Counting the reference vectors per class
    for q = 1:k
        class = v(nearest(q));
        refe(class) = refe(class) + 1;
    end
    [val,z(i)] = max(refe); % maximizes the occurrences among the k-closest reference vectors
end
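Because the classifier assumes integer labels 1..c, the 0/1 labels are remapped before the call. A minimal sketch, mirroring the main script:

Y1_knn = Y1; Y1_knn(Y1_knn == 0) = 2;                      % relabel class 0 as class 2
Y2_knn = k_nn_classifier(X1R_norm, Y1_knn, 9, X2R_norm);   % k = 9 nearest neighbours
Y2_knn(Y2_knn == 2) = 0;                                   % map predictions back to 0/1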

Adjusted version: divide the vote counts by the number of training vectors in each class, so that classes with more training samples do not dominate the k-nearest-neighbour vote. See the small illustration after the listing.

k_nn_classifier_adjust

function z = k_nn_classifier_adjust(Z, v, k, X)
[l, N1] = size(Z); %in Z we have the training data
[l, N] = size(X); %in X we have the points to be classified
c = max(v); %The number of classes
%in v we have the classes to which the vectors in Z belong
N_class = zeros(1, c);
for i = 1:c
    N_class(i) = length(find(v == i));
end
%Computation of the (squared) Euclidean distance of a point in X from each
%reference vector
for i = 1:N
    dist = sum((X(:, i) * ones(1, N1) - Z).^2);
    %sorting the above distances in ascending order
    [sorted, nearest]=sort(dist); % MODE='ASCEND' (ascending order)
    %counting the class occurrences among the k-closest reference vectors
    %Z(:,i)
    refe = zeros(1, c); %Counting the reference vectors per class
    for q = 1:k
        class = v(nearest(q));
        refe(class) = refe(class) + 1;
    end
    [val,z(i)] = max(refe./ N_class); % maximizes the occurrences among the k-closest reference vectors
end
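Dividing the vote counts by N_class weights each class by the inverse of its training-set size, which can flip the decision for imbalanced data. A small numeric illustration (the class sizes here are hypothetical, not taken from the dataset):

refe    = [5 4];                           % raw votes for class 1 and class 2 among k = 9 neighbours
N_class = [300 84];                        % hypothetical training-set class sizes
[~, z_plain]    = max(refe);               % plain vote         -> class 1
[~, z_adjusted] = max(refe ./ N_class);    % size-adjusted vote -> class 2 (the minority class)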

(2) SSE classifier

SSE

function [w, cost_func, mis_clas] = SSE(X, y)
% FUNCTION
%  [w, cost_func, mis_clas] = SSE(X, y)
% INPUT ARGUMENTS:
%  X:       lxN matrix whose columns are the data vectors to
%           be classified.
%  y:       N-dimensional vector whose i-th  component contains the
%           label of the class where the i-th data vector belongs (+1 or
%           -1).
% OUTPUT ARGUMENTS:
%  w:       the final estimate of the parameter vector.
%  cost_func: value of the cost function 0.5 * sum((y - w'*X).^2)
%  mis_clas: number of misclassified data vectors.
w = (X*X') \ (X*y');
[l,N] = size(X);
cost_func = 0.5 * (y - w'*X) * (y - w'*X)';  % calculate cost function
mis_clas = 0;  % calculate number of misclassified vectors
for i = 1:N
    if((X(:,i)' * w) * y(i) < 0)
        mis_clas = mis_clas + 1;
    end
end
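A minimal usage sketch (mirroring the main script): a bias row is appended and the labels are mapped to +/-1, so the sign of w'*x gives the predicted class:

X1_SSE = [X1R_norm; ones(1, size(X1R_norm, 2))];   % append a bias row
Y1_SSE = Y1; Y1_SSE(Y1_SSE == 0) = -1;             % labels in {+1, -1}
[w, cost_func, mis_clas] = SSE(X1_SSE, Y1_SSE);
Y2_SSE = sign(w' * [X2R_norm; ones(1, size(X2R_norm, 2))]);
Y2_SSE(Y2_SSE == -1) = 0;                          % back to 0/1 labels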

(3) LDA classifier

FDR

function [Lambda, FDR, w ] = FDR( X, y , D_y)
%function [ FDR, w ] = FDR( X, y , D_y)
% Fisher's Discriminant Ratio  
% INPUT: 
%   X: points
%   y: y==i ==> belong to Class i
%   D_y: dimension of w, how many features Z_i = w_i'* X need to classify
% OUTPUT:
%   FDR: trace((w'* S_w * w) \ (w'* S_b * w))
%   w: projection matrix; the projection Z = w'* X maximizes tr(S_w \ S_b)
%      of the projected data, i.e. maximizes the FDR of X
[ S_w, S_b, S_m ] = Calc_SwSbSm( X, y );
[ Vector, Diag ] = eig( S_w \ S_b );
[Lambda, Index]= sort(diag(Diag), 'descend'); % make highest eig show first
w = Vector(:, Index(1:D_y)); % select D_y vectors corresponding to D_y highest eig values
FDR = trace((w'* S_w * w) \ (w'* S_b * w));
end
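A minimal usage sketch (mirroring the LDA step of the main script): for two classes only one discriminant direction is kept, and the training data are projected onto it before a threshold is searched:

[Lambda, FDR1, w] = FDR(X1R_norm, Y1_knn, 1);    % Y1_knn holds labels 1 and 2
LDA_X1 = w' * X1R_norm;                          % 1-D projection used for the threshold search
LDA_X2 = w' * X2R_norm;                          % projection of the test set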

Calc_SwSbSm

function [ S_w, S_b, S_m ] = Calc_SwSbSm( X, y )
% [ S_w, S_b, S_m ] = Calc_SwSbSm( X, y )
%   Calculate S_w, S_b, S_m
% OUTPUT:
%   S_w: the within-class scatter matrix
%   S_b: the between-class scatter matrix
%   S_m: the mixture scatter matrix, S_m = S_w + S_b
c = max(y); % number of classes
[l, N] = size(X); % N: number of vectors, l: dimensions
mu = zeros(l, c);
S_w = zeros(l, l); S_b = zeros(l, l); mu_0 = zeros(l, 1);
P = zeros(1, c);
for i = 1:c
    index_class_i = find(y == i);
    Mu = sum(X(:, index_class_i), 2) / length(index_class_i);
    mu(:, i) = Mu; mu_0 = mu_0 + sum(X(:, index_class_i), 2) / N;
    P(i) = length(index_class_i) / N;
    X_relative = X(:, index_class_i) - repmat(Mu, 1, length(index_class_i));
    S_wi = zeros(l, l);
    for j = 1:length(index_class_i)
        S_wi = S_wi + X_relative(:, j) * X_relative(:, j)';
    end
    S_w = S_w + S_wi / N;    
end
for i = 1:c
    S_b = S_b + P(i) * (mu(:, i) - mu_0) * (mu(:, i) - mu_0)';
end
S_m = S_w + S_b;
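As a sanity check (a sketch, not part of the assignment), the mixture scatter returned here should match the scatter of all samples around the global mean:

[S_w, S_b, S_m] = Calc_SwSbSm(X1R_norm, Y1_knn);
mu_0 = mean(X1R_norm, 2);
Xc   = X1R_norm - mu_0 * ones(1, size(X1R_norm, 2));   % center all samples on the global mean
norm(S_m - (Xc * Xc') / size(X1R_norm, 2))             % should be close to zero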

Problem2_TakeHomeExam

%% Take Home Problem 2
%  @Author:     Zhankun Luo
%  @Date:       10/21/2018
%  @Instructor: Toma Hentea
clear all; close all; clc;
load diabet2.mat
fprintf('weight of 8 features:\n');
[l, N] = size(X1);
weight = Relief(X1, Y1)  % get weight of 8 features
[weight_sort, weight_index] = sort(weight,'descend') 
X1R = X1(weight_index(1:4), :); % keep the 4 features with the highest weights
X2R = X2(weight_index(1:4), :);
[X1R_norm, X2R_norm] = normalize(X1R, X2R);
%% KNN (k = 9)
Y1_knn = Y1; Y1_knn(Y1_knn == 0) = 2; % k_nn_classifier expects labels 1..c, so relabel class 0 as class 2
Y2_knn = k_nn_classifier(X1R_norm, Y1_knn, 9, X2R_norm);
Y2_knn(Y2_knn == 2) = 0; % map predictions back to the original 0/1 labels
%% SSE
fprintf('SSE:\n');
X1_SSE = [X1R_norm; ones(1, N)]; % append a bias row to the training data
X2_SSE = [X2R_norm; ones(1, size(X2R_norm, 2))]; % append a bias row to the test data
Y1_SSE = Y1; Y1_SSE(Y1_SSE == 0) = -1;
[w, cost_func, mis_clas] = SSE(X1_SSE, Y1_SSE)
err_rate = mis_clas / N
Y2_SSE = sign(w'* X2_SSE); Y2_SSE(Y2_SSE == -1) = 0;
%% LDA
fprintf('LDA:\n');
[Lambda, FDR1, w] = FDR(X1R_norm, Y1_knn, 1)
LDA_X1 = w'* X1R_norm;
[LDA_X1_sort, index_LDA_X1] = sort(LDA_X1);
err1_best = N; LDA_threshold = LDA_X1_sort(1); % search for the projection threshold with the fewest training errors
for i = 1:N-1
    y1_pred = [ones(1, i), zeros(1, N-i)]; % the i smallest projections are predicted diabetic (1)
    err1_pred = sum(xor(y1_pred, Y1(index_LDA_X1)));
    if (err1_best > err1_pred)
        err1_best = err1_pred;
        LDA_threshold = LDA_X1_sort(i);
    end
end
err1_best, LDA_threshold
LDA_X2 = w'* X2R_norm;
Y2_LDA = heaviside(LDA_threshold - LDA_X2); % predict diabetic (1) when the projection falls below the threshold
%% plot figures
Y1_plot = Y1; Y1_plot(Y1_plot == 0) = 2;
plot_point(X1R([1,2], :), Y1_plot);
figure 
plot(LDA_X1, Y1, 'g+')
hold on; plot(LDA_threshold * ones(1,100), linspace(0,1,100), 'r-');
ylabel('HAVE diabetes or NOT'); xlabel('w * normalized X1R');
text(-0.78, 0.5, 'Threshold');
%% save predicted Y2
save('Y2', 'Y2_knn', 'Y2_SSE', 'Y2_LDA');

Result

Selected features: weight of each of the 8 features with respect to Y

weight of 8 features:

weight =
    0.0149
    0.0120
    0.0082
    0.0034
    0.0004
    0.0133
    0.0109
    0.0024
weight_sort =
    0.0149
    0.0133
    0.0120
    0.0109
    0.0082
    0.0034
    0.0024
    0.0004
weight_index =  % feature indices sorted by descending weight
     1
     6
     2
     7
     3
     4
     8
     5

So, choose Features 1, 6, 2, and 7:

Number of times pregnant
Body mass index (weight in kg/(height in m)^2)
Plasma glucose concentration
Diabetes pedigree function

as the 4 selected features.

(1) KNN (k = 9)

sum(Y2_knn)
ans =   102

Predicted counts for X2R:

Diabetic (1): 102

Healthy (0): 282

Y2_knn =
     0     0     0     0     0     0     0     1     0     0     1     1     0     0     0
     1     0     0     0     0     1     0     0     0     1     1     0     0     1     0
     0     1     0     1     0     0     1     0     0     0     1     1     0     1     0
     0     0     0     0     0     0     0     1     0     0     0     1     0     0     1
     0     1     0     0     0     0     0     0     0     0     0     1     0     0     1
     0     0     0     0     0     1     0     0     0     1     1     0     0     0     0
     0     0     0     1     1     0     0     0     0     0     1     1     0     1     0
     1     0     0     0     1     0     1     0     0     1     1     0     0     1     0
     0     0     1     0     0     1     0     0     0     0     0     0     1     0     0
     0     0     0     1     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     1     1     0     0     0     0     0     1     1     0     1
     1     0     0     1     0     0     1     0     1     0     0     0     1     0     0
     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
     1     0     0     0     0     0     0     0     0     1     0     1     0     0     1
     1     1     0     0     1     0     0     0     0     1     1     1     1     0     0
     0     0     1     1     0     0     0     1     0     1     0     0     1     1     0
     0     0     0     0     0     0     1     0     0     0     1     0     0     0     1
     0     0     0     0     0     0     0     0     1     0     0     0     0     0     0
     0     0     0     1     0     0     1     1     1     0     0     0     0     0     0
     1     1     0     0     1     0     1     1     0     0     0     0     1     0     0
     0     1     0     0     0     0     0     1     1     0     0     0     0     0     0
     1     0     0     1     0     0     0     0     0     1     0     0     0     0     0
     0     1     1     0     0     0     0     0     1     0     0     0     0     0     0
     0     0     1     1     0     0     0     0     0     0     0     1     0     0     1
     1     0     1     0     1     0     1     0     0     1     1     1     1     0     0
     1     0     1     0     0     0     0     0     0

(2) SSE

SSE:   % trained weight vector w of the decision hyperplane

w =
    0.7887
    1.4265
    1.7585
    0.8521
   -1.9029
cost_func =  126.4948
mis_clas =    95  % number of misclassified training vectors (X1)
err_rate =    0.2474  % error rate on the training set X1

When w' * X > 0, predict Y = 1 (diabetic); otherwise predict Y = 0 (healthy).

sum(Y2_SSE)
ans =   98

Predicted counts for X2R:

Diabetic (1): 98

Healthy (0): 286

Y2_SSE =

     0     0     0     0     1     0     0     1     0     0     1     0     0     0     0
     1     0     0     0     0     1     0     0     0     1     1     0     0     1     0
     0     1     0     1     0     0     0     0     0     0     1     1     0     1     0
     0     0     0     0     0     0     0     1     0     0     0     1     0     0     0
     0     1     0     0     0     0     0     0     0     0     0     1     0     0     1
     0     0     0     0     0     0     0     0     0     0     1     1     0     0     0
     0     0     0     0     0     0     1     0     0     0     1     0     1     1     0
     1     0     0     0     0     0     1     0     0     1     1     0     0     1     0
     0     0     1     0     0     0     0     0     0     0     0     1     1     1     0
     0     0     0     0     1     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     1     0     0     0     0     0     1     1     0     1
     1     0     0     0     0     0     0     0     0     1     0     0     1     0     
     0     0     0     0     1     0     0     0     0     0     0     0     0     0     0
     1     1     0     0     0     0     0     1     0     1     0     1     0     0     0
     0     1     0     0     1     0     0     0     0     1     1     0     1     0     1
     0     0     1     1     0     1     0     0     0     1     0     0     0     1     0
     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     1     0     1     1     0     0     0     0     0
     0     1     0     1     1     0     1     1     1     1     0     0     0     0     0
     1     1     0     0     1     0     1     1     0     0     0     0     1     0     0
     0     0     0     0     0     1     0     1     0     1     0     0     1     0     0
     1     0     0     1     0     0     0     0     0     1     0     1     0     1     0
     0     1     1     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     1     0     0     0     0     0     0     0     1     0     0     1
     1     0     1     0     1     0     1     0     0     1     1     1     1     0     0
     1     0     1     0     0     0     0     0     0

(3) LDA

Linear Discriminant Analysis:

LDA:

Lambda =
   0.4269 + 0.0000i
   0.0000 + 0.0000i
   0.0000 - 0.0000i
   0.0000 + 0.0000i
FDR1 =    0.4269
w =
   -0.3099
   -0.5606
   -0.6911
   -0.3348
err1_best =    90  % number of misclassified training vectors (X1)
LDA_threshold =   -0.8031

Decision rule using LDA_threshold:

When w' * (normalized X) > LDA_threshold, predict Y = 0 (healthy).

When w' * (normalized X) < LDA_threshold, predict Y = 1 (diabetic).

sum(Y2_LDA)
ans =   73

Predicted counts for X2R:

Diabetic (1): 73

Healthy (0): 311

Y2_LDA =
     0     0     0     0     0     0     0     1     0     0     1     0     0     0     0
     1     0     0     0     0     1     0     0     0     1     1     0     0     1     0
     0     1     0     1     0     0     0     0     0     0     1     1     0     1     0
     0     0     0     0     0     0     0     1     0     0     0     1     0     0     0
     0     1     0     0     0     0     0     0     0     0     0     1     0     0     1
     0     0     0     0     0     0     0     0     0     0     1     1     0     0     0
     0     0     0     0     0     0     1     0     0     0     1     0     0     1     0
     1     0     0     0     0     0     0     0     0     1     1     0     0     0     0
     0     0     1     0     0     0     0     0     0     0     0     0     1     0     0
     0     0     0     0     1     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     1     1     0     0
     1     0     0     0     0     0     0     0     0     0     0     0     1     0     0
     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
     1     0     0     0     0     0     0     0     0     1     0     1     0     0     0
     0     1     0     0     1     0     0     0     0     1     1     0     1     0     0
     0     0     1     1     0     1     0     0     0     0     0     0     0     1     0
     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     1     0     0     0     0     0     0
     0     0     0     0     1     0     1     1     1     1     0     0     0     0     0
     1     1     0     0     1     0     1     0     0     0     0     0     1     0     0
     0     0     0     0     0     1     0     1     0     0     0     0     0     0     0
     1     0     0     1     0     0     0     0     0     1     0     0     0     1     0
     0     1     1     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     1     0     0     0     0     0     0     0     1     0     0     1
     1     0     1     0     1     0     0     0     0     1     1     0     0     0     0
     1     0     1     0     0     0     0     0     0

Compare the 3 predicted Y2 label vectors

sum(xor(Y2_knn,Y2_SSE)) % Compare Y2_knn & Y2_SSE
ans =	50
sum(xor(Y2_knn,Y2_LDA)) % Compare Y2_knn & Y2_LDA
ans =    45
sum(xor(Y2_LDA,Y2_SSE)) % Compare Y2_LDA & Y2_SSE
ans =    25

The test-sample indices at which these predictions differ:

find(xor(Y2_knn,Y2_SSE) == 1) % Y2_knn & Y2_SSE
ans =
     5    12    37    60    81    85    87    94    95    97   102   103   110   126   132
   134   139   140   155   169   172   174   175   185   197   203   210   211   222   225
   231   233   238   247   251   255   262   265   272   275   280   302   306   309   310
   313   327   329   339   348

find(xor(Y2_knn,Y2_LDA) == 1) % Y2_knn & Y2_LDA
ans =
    12    37    60    81    85    87    94    95    97   102   110   112   119   126   139
   140   155   156   165   169   172   174   210   211   222   231   233   235   238   247
   251   255   274   275   280   293   302   306   309   329   339   348   367   372   373
   
find(xor(Y2_SSE,Y2_LDA) == 1) % Y2_SSE & Y2_LDA
ans =
     5   103   112   119   132   134   156   165   175   185   197   203   225   235   262
   265   272   274   293   310   313   327   367   372   373