趋势聚类分析-Kmeans-优快云博客

写在前面，Kmeans聚类分析是一种无监督的数据聚类方式

直接上代码！！！

我用的蛋白数据，其实只要输入一个矩阵数据就可以，矩阵数据，行数基因（蛋白），列是样本

library(ClusterGVis)
library(reshape2)
library(argparser)
library(dplyr)
library(tidyverse)

#读分组信息:第一列sample， 第二列是group
groupf <- read.table("group.txt", header = T, sep = '\t',check.names = F)

## Kmeans
##proteo
protein_quant <- read.csv("quant_merge.csv", row.names = 1, header = T, check.names = F, na.strings = c("", "NA"))
protein_info <- protein_quant[, c(groupf$sample)] ##提取样本列

gs <- unique(groupf$group)

# 取组的均值
avg_list <- list() 
for (g in gs) {
  sam = groupf[which(groupf$group==g),1]
  tmp <- protein_info[, sam]
  gtpmavg <- rowMeans(tmp)
  avg_list[[g]] <- gtpmavg
}
avg_info <- do.call(cbind, avg_list)
colnames(avg_info) <- gs

avg_info <- na.omit(avg_info)
avg_info <- as.data.frame(avg_info[, c("C", "D", "M")])

#确定cluster数
pdf('cluster.pdf')
getClusters(obj = avg_info) ## 看这个图逐渐趋于平缓的那个点就是cluster数目
dev.off()

#kmeans聚类
ck <- clusterData(obj = avg_info,
                  cluster.method = "kmeans",
                  cluster.num = 6)
ckinfo <- ck[["wide.res"]][,c("gene", gs, "cluster")]

##
write.table(ckinfo, 'Kmeans/kmeans_cluster.xls', sep = '\t',row.names = F, col.names = T,quote = F)

# 作聚类热图，并保存
plot_cluster <- function(filepath) {
  if (grepl("pdf", filepath)) {
    pdf(filepath,  width=13, height=8)
  } else if (grepl("png", filepath)){
    png(filepath)
  }
  p <- visCluster(object = ck,
                  plot.type = "heatmap",
                  column_names_rot = 45,
                  ctAnno.col = ggsci::pal_npg()(6))
  dev.off()
}
plot_cluster("Kmeans/cluster_heatmap.pdf")
plot_cluster("Kmeans/cluster_heatmap.png")

聚类热图：

library(ggplot2)
library(reshape2)
library(patchwork)

datain <- read.table('Kmeans/kmeans_cluster.xls',sep = '\t',header = T,check.names = F)
rownames(datain) <- datain[,1]


clusterc <- c("#FF0000","#FFA500","#EECB0F","#BEEF58","#90ED90","#6495ED","#7F72D4","#04FFFF","#F18A8A")
pdf('Kmeans/kmeans_cluster.pdf',width=13, height=8)
#par(mfrow = c(3,3))
a <- list()
for (i in unique(datain$cluster)) {
	datap <- datain[datain$cluster == i,]
	dataavg <- as.data.frame(colMeans(datap[,2:4]))
	dataavg$sample <- rownames(dataavg)
	dataavg$cluster <- rep(i,nrow(dataavg))
	colnames(dataavg) <- c('avg','sample','cluster')
	data_melt<-melt(datap[,1:(ncol(datap)-1)], id.vars="gene")
	p <- ggplot() +
		geom_line(data=data_melt,aes(x=variable,y=value,group = gene),color=clusterc[i],size=0.7)+
		#stat_summary(aes(group=gene),fun.y=mean, geom="line", size=0.8, color="black")+
		geom_line(data=dataavg, aes(x=sample,y=avg,group = cluster),size=1.2,color="black")+
		theme_classic()+
		labs(title = paste0("cluster ",i,", total: ",nrow(datap)), x = '', y = 'Standardised value') +
		theme(legend.position="none",
		      plot.title = element_text(hjust = 0.5),
		      axis.text.x = element_text(vjust=1, hjust = 1)
		      )
	a[[i]] <- p
}
a[[1]] + a[[2]] + a[[3]] + a[[4]] + a[[5]]+ a[[6]] + plot_layout(ncol = 3)
dev.off()

聚类折线图：

执行过程中遇到的问题：

输入文件必须是data.frame 或者matrix，不然会出现如下错误

 getClusters(obj = avg_info)
Error in if (cls == "cell_data_set") { : the condition has length > 1