R语言 Fisher Discrimination 函数
前段时间急着用Fisher Discrimination,但是R语言里面各种包都不给力(该返回的结果不返回,尤其是Actually Error Rate这种判别分析中直观评价分类器好坏最核心的东西),绕来绕去,自己按照书上的原理亲自编写了一个。Example和代码见下文:
# Example:
# x1 and x2 for train
# x for test
# x1=matrix(NA,50,4)
# x2=matrix(NA,50,4)
# x=matrix(NA,4,4)
# for (i in 1:4){
# x1[,i]=iris[1:50,i]
# x2[,i]=iris[51:100,i]
# }
# x[1:4,1:4]<-x2[1:4,1:4]
# result = fisher_Discrimination_PCA_Dahan<-function(x1,x2,x)
# result is a list consisting four sub-result:
# "category" is a n × 2 matrix, ith item in column 1 corresponds to its y(x) for further classification, in column 2 corresponds to its predicted category.
# "pvalue" is "actual misclassifying rate",the lower pvalue is, the better model is.
# "vector" Due to sparse matrix problem brought by multivector, we use PCA to reduce dimension , "vector" is number of factor we choose for further classification
# "split_point" is the split point between two categories.
fisher_Discrimination_PCA_Dahan<-function(x1,x2,x){
# x1 and x2 是两个矩阵
# 对于x1和x2,我们还要对0过多的向量进行处理
train_data=rbind(x1,x2);
w=NULL; # store positions of qualified
for (ii in 1:dim(train_data)[2]){
c = length(which(train_data[,ii] > 0)); # return number positions of 0
if (c>0.03*dim(train_data)[1]){
w=c(w,ii)
}
}
x1<-x1[,w]
x2<-x2[,w]
x<-x[,w]
# PCA分析开始
new_train_data=rbind(x1,x2);
COV=cov(new_train_data)
Eigs=eigen(COV)
EigS_values=Eigs$values # 提取特征值
EigS_vectors=Eigs$vectors # 提取特征向量
for (eig_i in 1:length(EigS_values)){
if (sum(EigS_values[1:eig_i])>0.95*sum(EigS_values)){
break
}
}
# 生成95%预测矩阵
PCA_matrix_95=new_train_data%*%EigS_vectors[,c(1:eig_i)];
x1=x1%*%EigS_vectors[,c(1:eig_i)];
x2=x2%*%EigS_vectors[,c(1:eig_i)];
x=x%*%EigS_vectors[,c(1:eig_i)];
ave1 = apply(x1, 2,mean)
ave2= apply(x2, 2,mean)
n1=dim(x1)[1];
n2=dim(x2)[1];
n=n1+n2;
S1=cov(x1);
S2=cov(x2);
Spool= ((n1-1)/(n-2))*S1+((n2-1)/(n-2))*S2
# ni_Spool
b=rep(1,dim(Spool)[1]);
ni_Spool=solve(Spool);
# punish weight of misclassifying 2 to 1
c12=1;
# punish weight of misclassifying 1 to 2
c21=1;
p2=n2/n;
p1=n1/n;
split_point=log((c12*p2)/(c21*p1))
T=matrix(NA,dim(x)[1],2)
for (i in 1:dim(x)[1]){
T[i,1]=t(ave1-ave2)%*%ni_Spool%*%x[i,]-0.5*t(ave1-ave2)%*%ni_Spool%*%(ave1+ave2)
if (T[i,1]>=split_point){
T[i,2]=1;
}else{
T[i,2]=2;
}
}
p_input=-0.5*(((-2)*(0.5*t(ave1-ave2)%*%ni_Spool%*%(ave1+ave2)-0.5*t(ave1-ave2)%*%ni_Spool%*%(ave1)))^0.5)
pvalue=pnorm(p_input, mean = 0, sd = 1)
result=list(category=T,pvalue=pvalue,vector=eig_i,split_point=split_point)
return(result)
}