[Rcode]聚类分析

最新推荐文章于 2025-04-19 10:30:18 发布

原创 · 最新推荐文章于 2025-04-19 10:30:18 发布 · 1.6w 阅读

1 ·

CC 4.0 BY-SA版权

Rcode 专栏收录该内容

11 篇文章

订阅专栏

本文介绍了一种使用R语言进行聚类分析的方法，包括计算距离、层次聚类分析、选择聚类个数及最终聚类方案的过程，并展示了如何利用划分聚类分析进行数据处理。

# General workflow for cluster analysis (see "R in Action", p. 343).
# Step 1: compute the distance matrix, e.g. d <- dist(x, method = ...)

# Hierarchical clustering (average linkage used as the example).
# Install flexclust only when it is missing, then attach it with library():
# unlike require(), library() raises an error if the package still cannot be
# loaded, so a failed installation stops the script here rather than at data().
if (!requireNamespace("flexclust", quietly = TRUE)) {
  install.packages("flexclust")
}
library(flexclust)

data(nutrient, package = "flexclust")
row.names(nutrient) <- tolower(row.names(nutrient))  # lower-case the row names
nutrient.scale <- scale(nutrient)                    # standardise the variables
d <- dist(nutrient.scale)                            # dist() defaults to Euclidean
fit.average <- hclust(d, method = "average")
# hang = -1 aligns all leaf labels at the bottom of the dendrogram.
plot(fit.average, hang = -1, cex = 0.8, main = "Average Linkage Clustering")
 ## Choose the number of clusters
 # Install NbClust only when it is not already available, instead of
 # re-downloading it on every run of the script.
 if (!requireNamespace("NbClust", quietly = TRUE)) {
   install.packages("NbClust")
 }
 library(NbClust)
 # Ask for confirmation before each new plot page; TRUE is spelled out because
 # T is an ordinary variable that can be reassigned.
 devAskNewPage(ask = TRUE)
 nc <- NbClust(nutrient.scale, distance = "euclidean",
               min.nc = 2, max.nc = 15, method = "average")
 # Best.nc is referenced by its full name rather than relying on partial
 # matching of `$`. Its first row is tabulated to count how many criteria
 # favour each candidate number of clusters.
 table(nc$Best.nc[1, ])
 barplot(table(nc$Best.nc[1, ]),
         xlab = "Number of Clusters", ylab = "Number of Criteria",
         main = "Number of Clusters Chosen by 26 Criteria")
 ## Settle on the final solution: cut the dendrogram into five groups
 clusters <- cutree(fit.average, k = 5)
 table(clusters)
 # Describe the clusters by their per-variable medians, first on the raw
 # data and then on the standardised data.
 aggregate(nutrient, by = list(cluster = clusters), FUN = median)
 aggregate(as.data.frame(nutrient.scale),
           by = list(cluster = clusters),
           FUN = median)
 plot(fit.average,
      hang = -1,
      cex = .8,
      main = "Average Linkage Clustering\n5 Cluster Solution")
 # Overlay rectangles marking the five-cluster solution on the dendrogram.
 rect.hclust(fit.average, k = 5)

# Partitioning cluster analysis
 ## Partitioning clustering (sensitive to outliers) -- no code given here
 
 ## Partitioning around medoids (more robust)
 library(cluster)
 set.seed(1234)
 # stand = TRUE standardises the variables before clustering; TRUE is spelled
 # out because T is an ordinary variable that can be reassigned.
 # NOTE(review): nutrient[-1] drops the first column of the data. This looks
 # like a carry-over from an example whose first column was a class label;
 # confirm that excluding it is intended for this data set.
 fit.pam <- pam(nutrient[-1], k = 3, stand = TRUE)
 fit.pam$medoids                                   # print the medoids
 clusplot(fit.pam, main = "Cluster Plot")          # plot the cluster solution
 
 ## Rand index: measure the agreement between two partitions
 library(flexclust)
 # randIndex() cannot be called without arguments. Here it is given the
 # cross-tabulation of the hierarchical assignments (`clusters`) against the
 # PAM assignments (`fit.pam$clustering`), both created earlier in this
 # script, so the result quantifies how closely the two solutions agree.
 randIndex(table(clusters, fit.pam$clustering))
 
 ## Guard against finding clusters that do not exist
 # Install fMultivar only when it is not already available, instead of
 # re-downloading it on every run of the script.
 if (!requireNamespace("fMultivar", quietly = TRUE)) {
   install.packages("fMultivar")
 }
 library(fMultivar)
 set.seed(1234)
 # Simulate 1000 draws from a single bivariate normal distribution, i.e. data
 # that contain no real cluster structure.
 df <- rnorm2d(1000, rho = .5)
 df <- as.data.frame(df)
 plot(df, main = "Bivariate Normal Distribution with rho=0.5")
 nc <- NbClust(df, min.nc = 2, max.nc = 15, method = "kmeans")
 
 # Select the CCC column by name rather than by position so the plot does not
 # silently show a different index if the column order ever differs.
 plot(nc$All.index[, "CCC"], type = "o",
      ylab = "CCC", xlab = "Number of clusters", col = "blue")
 # CCC values that are negative and decreasing for two or more clusters are
 # typical of a unimodal distribution.