随机挑选几行数据插入进csv文件中

该篇文章展示了如何用Python的pandas库从data.csv读取数据,随机选取指定行数,再将结果保存至random_selection.csv。

从一个csv文件中随机挑选几行数据,将其存储进一个新的csv文件中。
代码如下:

import pandas as pd
import random

# 读取原始 CSV 文件
input_file = 'data.csv' 

data = pd.read_csv(input_file)

# 指定要随机挑选的行数
num_rows_to_copy = 5  # 例如,选择随机挑选 5 行

# 随机挑选行索引
random_rows_indices = random.sample(range(len(data)), num_rows_to_copy)

# 创建新的 DataFrame,包含随机挑选的行
random_data = data.iloc[random_rows_indices]

# 保存随机挑选的行到新的 CSV 文件
output_file = 'random_selection.csv' 
random_data.to_csv(output_file, index=False)

rm(list = ls()) options(stringsAsFactors = F) Sys.setenv(LANGUAGE = "en") # 如果尚未安装 BiocManager,先安装它 if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") # 使用 BiocManager 来安装 impute 包 if (!requireNamespace("impute", quietly = TRUE)) BiocManager::install("impute") library(WGCNA) library(FactoMineR) library(factoextra) library(tidyverse) library(data.table) library(openxlsx) ### 启用WGCNA多核计算 enableWGCNAThreads(nThreads = 0.75*parallel::detectCores()) ################################# 0.输入数据准备 ################################ ### 读取表达矩阵 cat("正在读取表达矩阵...\n") if (T) { fpkm00 <- fread("FPKM.txt", data.table = F) # 检查数据结构和内容 cat("表达矩阵维度:", dim(fpkm00), "\n") cat("前几行基因名:", head(fpkm00$gene), "\n") ### 合并具有相同基因名的行 table(duplicated(fpkm00$gene)) gene <- fpkm00$gene fpkm00 <- fpkm00[,-1] # 确保数据是矩阵而不是列表 fpkm00 <- as.matrix(fpkm00) # 检查并处理非数值数据 if(!is.numeric(fpkm00)) { cat("检测到非数值数据,正在转换...\n") fpkm00 <- matrix(as.numeric(fpkm00), nrow = nrow(fpkm00), ncol = ncol(fpkm00)) } # 处理NA和无穷大 fpkm00[is.na(fpkm00)] <- 0 fpkm00[is.infinite(fpkm00)] <- 0 # 使用矩阵行聚合 fpkm0 <- aggregate(fpkm00, by=list(gene), FUN=sum) fpkm <- as.matrix(fpkm0[,-1]) rownames(fpkm) <- fpkm0[,1] } # 确保fpkm是数值矩阵 if(!is.matrix(fpkm) || !is.numeric(fpkm)) { fpkm <- as.matrix(fpkm) fpkm <- matrix(as.numeric(fpkm), nrow = nrow(fpkm), ncol = ncol(fpkm)) } # 处理NA和无穷大 - 确保是对矩阵操作 fpkm[is.na(fpkm)] <- 0 fpkm[is.infinite(fpkm)] <- 0 data <- log2(fpkm+1) # 再次检查数据质量 - 确保data是矩阵 if(any(is.na(data)) | any(is.infinite(data))) { data[is.na(data)] <- 0 data[is.infinite(data)] <- 0 } ### 筛选MAD前5000的基因 cat("筛选MAD前8000的基因...\n") # 确保data是矩阵 if(!is.matrix(data)) { data <- as.matrix(data) } keep_data <- data[order(apply(data,1,mad, na.rm = TRUE), decreasing = T)[1:min(8000, nrow(data))],] ### 创建datTraits,包含分组、表型等信息 cat("读取分组和表型数据...\n") # 读取分组信息 samplegroup <- read.table("samplegroup.txt", header = TRUE, sep = "\t") # 读取表型数据 trait_data <- read.table("Trait.txt", header = TRUE, sep = "\t") # 检查表型数据结构 cat("表型数据维度:", dim(trait_data), "\n") cat("表型数据列名:", colnames(trait_data), "\n") # 仔细处理表型数据中的非数值 trait_data_clean <- trait_data for(col in colnames(trait_data)[-1]) { # 跳过Sample列 if(!is.numeric(trait_data[[col]])) { cat("列", col, "不是数值型,正在转换...\n") # 转换为字符再转换为数值,处理可能的因子或字符类型 trait_data_clean[[col]] <- as.numeric(as.character(trait_data[[col]])) # 检查转换后的NA值 na_count <- sum(is.na(trait_data_clean[[col]])) if(na_count > 0) { cat("警告: 列", col, "有", na_count, "个NA值,用均值替换\n") trait_data_clean[[col]][is.na(trait_data_clean[[col]])] <- mean(trait_data_clean[[col]], na.rm = TRUE) } } } # 合并分组和表型数据 datTraits <- merge(samplegroup, trait_data_clean, by.x = "sample", by.y = "Sample") rownames(datTraits) <- datTraits$sample datTraits <- datTraits[,-1] # 移除sample列 ### 给分组加上编号(用于后续分析) grouptype <- data.frame(group=sort(unique(datTraits$group)), groupNo=1:length(unique(datTraits$group))) datTraits$groupNo <- NA for(i in 1:nrow(grouptype)){ datTraits[which(datTraits$group == grouptype$group[i]),'groupNo'] <- grouptype$groupNo[i] } ### 确保表达矩阵和表型数据的样本顺序一致 common_samples <- intersect(colnames(keep_data), rownames(datTraits)) cat("共同样本数量:", length(common_samples), "\n") keep_data <- keep_data[, common_samples] datTraits <- datTraits[common_samples, ] ### 转置并确保数据类型正确 cat("转置表达矩阵...\n") # 确保keep_data是矩阵 if(!is.matrix(keep_data)) { keep_data <- as.matrix(keep_data) } datExpr0 <- as.data.frame(t(keep_data)) # 确保datExpr0的所有列都是数值型 for(col in names(datExpr0)) { if(!is.numeric(datExpr0[[col]])) { datExpr0[[col]] <- as.numeric(as.character(datExpr0[[col]])) } # 处理任何NA值 if(any(is.na(datExpr0[[col]]))) { datExpr0[[col]][is.na(datExpr0[[col]])] <- 0 } } ############################## 1.判断数据质量 ################################ ### 判断数据质量--缺失值 cat("检查数据质量...\n") gsg <- goodSamplesGenes(datExpr0,verbose = 3) gsg$allOK if (!gsg$allOK){ cat("数据质量问题:\n") if (sum(!gsg$goodGenes)>0) printFlush(paste("移除基因:", paste(names(datExpr0)[!gsg$goodGenes], collapse = ", "))); if (sum(!gsg$goodSamples)>0) printFlush(paste("移除样本:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", "))); datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes] datTraits = datTraits[gsg$goodSamples, ] } gsg <- goodSamplesGenes(datExpr0,verbose = 3) cat("数据质量检查通过:", gsg$allOK, "\n") ### 绘制样品的系统聚类树 if(T){ sampleTree <- hclust(dist(datExpr0), method = "average") par(mar = c(0,5,2,0)) plot(sampleTree, main = "Sample clustering", sub="", xlab="", cex.lab = 2, cex.axis = 1, cex.main = 1,cex.lab=1) sample_colors <- numbers2colors(as.numeric(factor(datTraits$group)), colors = rainbow(length(table(datTraits$group))), signed = FALSE) par(mar = c(1,4,3,1),cex=0.8) pdf("step1_Sample dendrogram and trait.pdf",width = 8,height = 6) plotDendroAndColors(sampleTree, sample_colors, groupLabels = "trait", cex.dendroLabels = 0.8, marAll = c(1, 4, 3, 1), cex.rowText = 0.01, main = "Sample dendrogram and trait" ) dev.off() } ### 判断数据质量 : PCA行分组查看 group_list <- datTraits$group # 确保datExpr0是数值矩阵 datExpr0_numeric <- datExpr0 for(i in 1:ncol(datExpr0)) { if(!is.numeric(datExpr0[[i]])) { datExpr0_numeric[[i]] <- as.numeric(as.character(datExpr0[[i]])) } } dat.pca <- PCA(datExpr0_numeric, graph = F) pca <- fviz_pca_ind(dat.pca, title = "Principal Component Analysis", legend.title = "Groups", geom.ind = c("point","text"), pointsize = 2, labelsize = 4, repel = TRUE, col.ind = group_list, axes.linetype=NA, mean.point=F ) + theme(legend.position = "none") + coord_fixed(ratio = 1) pca ggsave(pca,filename= "step1_Sample PCA analysis.pdf", width = 8, height = 8) ##保存数据 datExpr <- datExpr0 nGenes <- ncol(datExpr) nSamples <- nrow(datExpr) save(nGenes,nSamples,datExpr,datTraits,file="step1_input.Rdata") ############################### 2.挑选最佳阈值power ################################### rm(list = ls()) load("step1_input.Rdata") R.sq_cutoff = 0.8 # 检查数据是否全是数值 cat("检查datExpr数据类型:\n") print(summary(sapply(datExpr, class))) if(T){ powers <- c(seq(1,20,by = 1), seq(22,30,by = 2)) # 确保数据是数值型 datExpr_numeric <- datExpr for(i in 1:ncol(datExpr)) { if(!is.numeric(datExpr[[i]])) { datExpr_numeric[[i]] <- as.numeric(as.character(datExpr[[i]])) datExpr_numeric[[i]][is.na(datExpr_numeric[[i]])] <- 0 } } sft <- pickSoftThreshold(datExpr_numeric, networkType = "unsigned", powerVector = powers, RsquaredCut = R.sq_cutoff, verbose = 5) pdf("step2_power-value.pdf",width = 16,height = 12) par(mfrow = c(1,2)); cex1 = 0.9; plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n") text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=cex1,col="red") abline(h=R.sq_cutoff ,col="red") plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n") text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red") abline(h=100,col="red") dev.off() } sft$powerEstimate power = sft$powerEstimate if(is.na(power)){ type = "unsigned" nSamples=nrow(datExpr) power = ifelse(nSamples<20, ifelse(type == "unsigned", 9, 18), ifelse(nSamples<30, ifelse(type == "unsigned", 8, 16), ifelse(nSamples<40, ifelse(type == "unsigned", 7, 14), ifelse(type == "unsigned", 6, 12)) ) ) cat("使用经验power值:", power, "\n") } save(sft, power, file = "step2_power_value.Rdata") ##################### 3.一步法构建加权共表达网络,识别基因模块 #################### load(file = "step1_input.Rdata") load(file = "step2_power_value.Rdata") # 确保数据是数值型 datExpr_numeric <- datExpr for(i in 1:ncol(datExpr)) { if(!is.numeric(datExpr[[i]])) { datExpr_numeric[[i]] <- as.numeric(as.character(datExpr[[i]])) datExpr_numeric[[i]][is.na(datExpr_numeric[[i]])] <- 0 } } if(T){ cat("开始构建加权共表达网络...\n") net <- blockwiseModules( datExpr_numeric, power = power, maxBlockSize = ncol(datExpr_numeric), corType = "pearson", networkType = "unsigned", TOMType = "unsigned", minModuleSize = 50, mergeCutHeight = 0.25, pamRespectsDendro = FALSE, # 允许更灵活的模块检测 numericLabels = TRUE, saveTOMs = F, verbose = 3 ) table(net$colors) } ## 模块可视化,层级聚类树展示各个模块 if(T){ moduleColors <- labels2colors(net$colors) table(moduleColors) pdf("step3_genes-modules_ClusterDendrogram.pdf",width = 16,height = 12) plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]], "Module colors", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) dev.off() } ##################### 新增:导出每个模块的基因数量和基因列表 ##################### cat("导出模块基因信息...\n") # 获取基因名 gene_names <- colnames(datExpr_numeric) # 创建模块-基因对应表 module_genes <- data.frame( Gene = gene_names, Module = moduleColors, ModuleNumber = net$colors, stringsAsFactors = FALSE ) # 统计每个模块的基因数量 module_summary <- module_genes %>% group_by(Module, ModuleNumber) %>% summarise(GeneCount = n(), .groups = 'drop') %>% arrange(desc(GeneCount)) # 保存模块统计信息 write.csv(module_summary, "step3_module_gene_count.csv", row.names = FALSE) cat("模块基因数量统计已保存到: step3_module_gene_count.csv\n") # 保存完整的模块-基因对应表 write.csv(module_genes, "step3_all_genes_modules.csv", row.names = FALSE) cat("完整模块-基因对应表已保存到: step3_all_genes_modules.csv\n") # 为每个模块创建单独的基因列表文件 unique_modules <- unique(moduleColors) module_gene_lists <- list() for(module in unique_modules) { module_genes_subset <- module_genes[module_genes$Module == module, "Gene"] filename <- paste0("step3_module_", module, "_genes.txt") write.table(module_genes_subset, filename, row.names = FALSE, col.names = FALSE, quote = FALSE) module_gene_lists[[module]] <- module_genes_subset cat("模块", module, "的基因列表已保存到:", filename, "\n") } # 创建Excel工作簿,每个模块一个工作表 if(require(openxlsx)) { wb <- createWorkbook() # 添加模块统计工作表 addWorksheet(wb, "Module Summary") writeData(wb, "Module Summary", module_summary) # 添加完整基因列表工作表 addWorksheet(wb, "All Genes") writeData(wb, "All Genes", module_genes) # 为每个模块添加单独的工作表 for(module in unique_modules) { module_data <- module_genes[module_genes$Module == module, c("Gene", "ModuleNumber")] addWorksheet(wb, paste0("Module_", module)) writeData(wb, paste0("Module_", module), module_data) } saveWorkbook(wb, "step3_module_genes_detailed.xlsx", overwrite = TRUE) cat("详细模块基因信息已保存到: step3_module_genes_detailed.xlsx\n") } # 创建模块基因数量条形图 module_plot_data <- module_summary[module_summary$Module != "grey", ] # 排除未分类基因 if(nrow(module_plot_data) > 0) { p <- ggplot(module_plot_data, aes(x = reorder(Module, -GeneCount), y = GeneCount, fill = Module)) + geom_bar(stat = "identity") + scale_fill_manual(values = module_plot_data$Module) + labs(title = "各模块基因数量分布", x = "模块", y = "基因数量") + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none") ggsave("step3_module_gene_count_plot.pdf", p, width = 10, height = 6) cat("模块基因数量分布图已保存到: step3_module_gene_count_plot.pdf\n") } # 显示模块统计信息 cat("\n=== 模块统计信息 ===\n") print(module_summary) cat("总基因数量:", nrow(module_genes), "\n") cat("模块数量:", length(unique_modules), "\n") cat("灰色模块(未分类)基因数量:", sum(moduleColors == "grey"), "\n") save(net, moduleColors, file = "step3_genes_modules.Rdata") ####################### 4.关联基因模块与表型 ##################################### rm(list = ls()) load(file = "step1_input.Rdata") load(file = "step2_power_value.Rdata") load(file = "step3_genes_modules.Rdata") # 确保数据是数值型 datExpr_numeric <- datExpr for(i in 1:ncol(datExpr)) { if(!is.numeric(datExpr[[i]])) { datExpr_numeric[[i]] <- as.numeric(as.character(datExpr[[i]])) datExpr_numeric[[i]][is.na(datExpr_numeric[[i]])] <- 0 } } ## 模块与表型的相关性热图 if(T){ datTraits$group <- as.factor(datTraits$group) design <- model.matrix(~0+datTraits$group) colnames(design) <- levels(datTraits$group) MES0 <- moduleEigengenes(datExpr_numeric, moduleColors)$eigengenes MEs <- orderMEs(MES0) traits_for_cor <- datTraits[, !colnames(datTraits) %in% "groupNo"] # 确保表型数据是数值型 traits_for_cor_numeric <- traits_for_cor for(col in colnames(traits_for_cor)) { if(!is.numeric(traits_for_cor[[col]])) { traits_for_cor_numeric[[col]] <- as.numeric(as.character(traits_for_cor[[col]])) traits_for_cor_numeric[[col]][is.na(traits_for_cor_numeric[[col]])] <- 0 } } moduleTraitCor <- cor(MEs, traits_for_cor_numeric, use = "p") moduleTraitPvalue <- corPvalueStudent(moduleTraitCor, nSamples) textMatrix <- paste0(signif(moduleTraitCor,2),"\n(", signif(moduleTraitPvalue,1),")") dim(textMatrix) <- dim(moduleTraitCor) pdf("step4_Module-trait-relationship_heatmap.pdf", width = 0.6*ncol(traits_for_cor_numeric) + 2, height = 0.6*nrow(moduleTraitCor) + 2) par(mar=c(8, 10, 3, 3)) labeledHeatmap(Matrix = moduleTraitCor, xLabels = colnames(traits_for_cor_numeric), yLabels = names(MEs), ySymbols = names(MEs), colorLabels = F, colors = blueWhiteRed(50), textMatrix = textMatrix, setStdMargins = F, cex.text = 0.5, zlim = c(-1,1), main = "Module-trait relationships") dev.off() save(design, file = "step4_design.Rdata") } ### 模块与表型的相关性boxplot图(针对分组) if(T){ # 将MEs转换为数据框并添加行名作为列 MEs_with_rownames <- MEs MEs_with_rownames$Sample <- rownames(MEs) # 将datTraits转换为数据框并添加行名作为列 datTraits_with_rownames <- as.data.frame(datTraits) datTraits_with_rownames$Sample <- rownames(datTraits) # 按Sample列合并 mes_group <- merge(MEs_with_rownames, datTraits_with_rownames, by = "Sample") library(ggpubr) library(gridExtra) draw_ggboxplot <- function(data, Module="Module", group="group"){ ggboxplot(data, x=group, y=Module, ylab = paste0(Module), xlab = group, fill = group, palette = "jco", legend = "") + stat_compare_means() } colorNames <- names(MEs)[!names(MEs) %in% "Sample"] pdf("step4_Module-trait-relationship_boxplot.pdf", width = 7.5, height = 1.6*length(colorNames)) p <- lapply(colorNames, function(x) { draw_ggboxplot(mes_group, Module = x, group = "group") }) do.call(grid.arrange, c(p, ncol=2)) dev.off() } ### 基因与模块、表型的相关性散点图 if(T){ modNames <- substring(names(MEs)[!names(MEs) %in% "Sample"], 3) geneModuleMembership <- as.data.frame(cor(datExpr_numeric, MEs[,!names(MEs) %in% "Sample"], use = "p")) MMPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples)) names(geneModuleMembership) <- paste0("MM", modNames) names(MMPvalue) <- paste0("p.MM", modNames) trait_names <- colnames(datTraits)[!colnames(datTraits) %in% c("group", "groupNo")] for(trait_name in trait_names){ trait <- as.data.frame(datTraits[,trait_name]) names(trait) <- trait_name geneTraitSignificance <- as.data.frame(cor(datExpr_numeric, trait, use = "p")) GSPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples)) names(geneTraitSignificance) <- paste0("GS.", trait_name) names(GSPvalue) <- paste0("p.GS.", trait_name) selectModule <- modNames pdf(paste0("step4_gene-Module-trait-significance_", trait_name, ".pdf"), width=7, height=1.5*length(selectModule)) par(mfrow=c(ceiling(length(selectModule)/2),2)) for(module in selectModule){ column <- match(module, selectModule) moduleGenes <- moduleColors == module if(sum(moduleGenes) > 1){ verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]), abs(geneTraitSignificance[moduleGenes, 1]), xlab = paste("Module Membership in", module, "module"), ylab = paste("Gene significance for", trait_name), main = paste("Module membership vs. gene significance\n"), cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = labels2colors(as.numeric(module))) } } dev.off() } } ######################### 5. WGCNA可视化 ################################## rm(list = ls()) load(file = 'step1_input.Rdata') load(file = "step2_power_value.Rdata") load(file = "step3_genes_modules.Rdata") load(file = "step4_design.Rdata") # 确保数据是数值型 datExpr_numeric <- datExpr for(i in 1:ncol(datExpr)) { if(!is.numeric(datExpr[[i]])) { datExpr_numeric[[i]] <- as.numeric(as.character(datExpr[[i]])) datExpr_numeric[[i]][is.na(datExpr_numeric[[i]])] <- 0 } } if(T){ TOM <- TOMsimilarityFromExpr(datExpr_numeric, power=power) dissTOM <- 1-TOM if(T){ geneTree = net$dendrograms[[1]] plotTOM = dissTOM^7 diag(plotTOM)=NA png("step5_TOMplot_Network-heatmap.png",width = 800, height=600) TOMplot(plotTOM,geneTree,moduleColors, col=gplots::colorpanel(250,'red',"orange",'lemonchiffon'), main="Network heapmap plot") dev.off() } } ### 模块相关性展示 if(T){ MEs = moduleEigengenes(datExpr_numeric, moduleColors)$eigengenes MET = orderMEs(MEs) if(T){ TF = as.data.frame(datTraits$TF) names(TF) = "TF" MET = orderMEs(cbind(MEs, TF)) } pdf("step5_module_cor_Eigengene-dendrogram.pdf",width = 8,height = 10) plotEigengeneNetworks(MET, setLabels="", marDendro = c(0,4,1,4), marHeatmap = c(5,5,1,2), cex.lab = 0.8, xLabelsAngle = 90) dev.off() } cat("WGCNA分析完成!\n")。该代码结果模块与表型TF、TP只有强负相关的,没有强正相关,如何解决
11-03
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值