写在前面,Kmeans聚类分析是一种无监督的数据聚类方式
直接上代码!!!
我用的蛋白数据,其实只要输入一个矩阵数据就可以,矩阵数据,行数基因(蛋白),列是样本
library(ClusterGVis)
library(reshape2)
library(argparser)
library(dplyr)
library(tidyverse)
#读分组信息:第一列sample, 第二列是group
groupf <- read.table("group.txt", header = T, sep = '\t',check.names = F)
## Kmeans
##proteo
protein_quant <- read.csv("quant_merge.csv", row.names = 1, header = T, check.names = F, na.strings = c("", "NA"))
protein_info <- protein_quant[, c(groupf$sample)] ##提取样本列
gs <- unique(groupf$group)
# 取组的均值
avg_list <- list()
for (g in gs) {
sam = groupf[which(groupf$group==g),1]
tmp <- protein_info[, sam]
gtpmavg <- rowMeans(tmp)
avg_list[[g]] <- gtpmavg
}
avg_info <- do.call(cbind, avg_list)
colnames(avg_info) <- gs
avg_info <- na.omit(avg_info)
avg_info <- as.data.frame(avg_info[, c("C", "D", "M")])
#确定cluster数
pdf('cluster.pdf')
getClusters(obj = avg_info) ## 看这个图逐渐趋于平缓的那个点就是cluster数目
dev.off()
#kmeans聚类
ck <- clusterData(obj = avg_info,
cluster.method = "kmeans",
cluster.num = 6)
ckinfo <- ck[["wide.res"]][,c("gene", gs, "cluster")]
##
write.table(ckinfo, 'Kmeans/kmeans_cluster.xls', sep = '\t',row.names = F, col.names = T,quote = F)
# 作聚类热图,并保存
plot_cluster <- function(filepath) {
if (grepl("pdf", filepath)) {
pdf(filepath, width=13, height=8)
} else if (grepl("png", filepath)){
png(filepath)
}
p <- visCluster(object = ck,
plot.type = "heatmap",
column_names_rot = 45,
ctAnno.col = ggsci::pal_npg()(6))
dev.off()
}
plot_cluster("Kmeans/cluster_heatmap.pdf")
plot_cluster("Kmeans/cluster_heatmap.png")
聚类热图:
library(ggplot2)
library(reshape2)
library(patchwork)
datain <- read.table('Kmeans/kmeans_cluster.xls',sep = '\t',header = T,check.names = F)
rownames(datain) <- datain[,1]
clusterc <- c("#FF0000","#FFA500","#EECB0F","#BEEF58","#90ED90","#6495ED","#7F72D4","#04FFFF","#F18A8A")
pdf('Kmeans/kmeans_cluster.pdf',width=13, height=8)
#par(mfrow = c(3,3))
a <- list()
for (i in unique(datain$cluster)) {
datap <- datain[datain$cluster == i,]
dataavg <- as.data.frame(colMeans(datap[,2:4]))
dataavg$sample <- rownames(dataavg)
dataavg$cluster <- rep(i,nrow(dataavg))
colnames(dataavg) <- c('avg','sample','cluster')
data_melt<-melt(datap[,1:(ncol(datap)-1)], id.vars="gene")
p <- ggplot() +
geom_line(data=data_melt,aes(x=variable,y=value,group = gene),color=clusterc[i],size=0.7)+
#stat_summary(aes(group=gene),fun.y=mean, geom="line", size=0.8, color="black")+
geom_line(data=dataavg, aes(x=sample,y=avg,group = cluster),size=1.2,color="black")+
theme_classic()+
labs(title = paste0("cluster ",i,", total: ",nrow(datap)), x = '', y = 'Standardised value') +
theme(legend.position="none",
plot.title = element_text(hjust = 0.5),
axis.text.x = element_text(vjust=1, hjust = 1)
)
a[[i]] <- p
}
a[[1]] + a[[2]] + a[[3]] + a[[4]] + a[[5]]+ a[[6]] + plot_layout(ncol = 3)
dev.off()
聚类折线图:
执行过程中遇到的问题:
输入文件必须是data.frame 或者matrix,不然会出现如下错误
getClusters(obj = avg_info) Error in if (cls == "cell_data_set") { : the condition has length > 1