[R] Bind element of List of matrix or data.frame or list

本文探讨了如何使用 R 语言将矩阵或数据框列表中的元素行进行拼接的方法,包括直接使用 do.call(rbind,)、plyr 库的 ldply 函数、data.table 库的 rbindlist 函数等,并对比了它们的效率。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

1. Alternative solutions for list of matrix or data.frame

If I have a list of matrix or data.frame, we can use the following ways to bind the rows of all elements.

Firstly, I generate toy data

myList1 <- list(matrix(rnorm(2*3), ncol=2),
                matrix(rnorm(2*3), ncol=2),
                matrix(rnorm(2*3), ncol=2),
                matrix(rnorm(2*3), ncol=2))
myList2 <- list(as.data.frame(matrix(rnorm(2*3), ncol=2)),
                as.data.frame(matrix(rnorm(2*3), ncol=2)),
                as.data.frame(matrix(rnorm(2*3), ncol=2)),
                as.data.frame(matrix(rnorm(2*3), ncol=2)))

Now I list the alternative solutions

# solution 1
result1.1 <- do.call(rbind, myList1)
head(result1.1)
result1.2 <- do.call(rbind, myList2)
head(result1.2)

# solution 2
## plyr: the split-apply-combine paradigm for R
library(plyr)
result1.2.1 <- ldply(myList1, rbind)
head(result1.2.1)
###Error: All inputs to rbind.fill must be data.frames
result1.2.2 <- rbind.fill(myList1)
head(result1.2.2)
result2.2.1 <- ldply(myList2, rbind)
head(result2.2.1)
result2.2.2 <- rbind.fill(myList2)
head(result2.2.2)

# solution 3
## data.table: Enhanced data.frame
library(data.table)
###Error in rbindlist(myList1) : Item 1 of list input is not a data.frame, data.table or list
result1.3 <- rbindlist(myList1)
head(result1.3)
result2.3 <- rbindlist(myList2)
head(result2.3)

From the codes, we can see that

  • for matrix, only do.call(rbind, ), ldply can work.
  • for data.frame, all do.call(rbind, ), ldply, rbind.fill, rbindlist can work.

And now benchmark for all solutions

# benchmark
## benchmark: a simple wrapper around system.time
library(rbenchmark)
benchmark(do.call(rbind, myList2), ldply(myList2, rbind), rbind.fill(myList2), rbindlist(myList2))

2. Alternative solutions for list of list

#############list of list#############################

# generate list of list
myList3 <- vector("list", 4)
for(i in 1:4){
  myList3[[i]] <- vector("list", 2)
  for(j in 1:2){
    myList3[[i]][[j]] <- rnorm(3)
  }
}

# bind each element of outer list
tempList <- lapply(myList3, function(z)do.call(rbind,z))
## selection operator "["
bind.ith.rows <- function(i) do.call(rbind, lapply(tempList, "[", i, TRUE))
nr <- nrow(tempList[[1]])
lapply(1:nr, bind.ith.rows)
``` install.packages("pacman") pacman::p_load(randomForest,caret,pROC) install.packages("randomForest") library(randomForest) install.packages("caret") library(caret) install.packages("pROC") library(pROC) install.packages("lava") library(lava) #lasso回归筛选数据集随机森林 completed_copd <- read.csv("C:\\Users\\29930\\Desktop\\COPD2.csv") completed_copd$COPD <- as.factor(completed_copd$COPD) library(caret) set.seed(40705) trainlist <- createDataPartition(completed_copd$COPD,p=0.7,list = FALSE) trainset <- completed_copd[trainlist,] testset <- completed_copd[-trainlist,] library(randomForest) set.seed(40705) rf.train <- randomForest(as.factor(COPD) ~.,data = trainset,importance = TRUE) rf.train # 10折交叉验证 library(pROC) library(MLmetrics) cv <- trainControl(method = "cv", number = 10, classProbs = TRUE, summaryFunction = twoClassSummary) results <- train(COPD ~ ., data = trainset, method = "rf", trControl = cv) # 输出交叉验证结果 results plot(rf.train, main = "图1 lasso筛选变量数据集的随机森林与误差关系图") predictions <- predict(rf.train,testset,type = "class") predictions confMatrix <- table(testset$COPD, predictions) acc <- sum(predictions ==testset$COPD)/nrow(testset) print(paste("Accuracy",acc)) set.seed(40705) rf.test <- predict(rf.train, newdata = testset, type = "class") rf.cf <- caret::confusionMatrix(as.factor(rf.test),as.factor(testset$COPD)) rf.test2 <- predict(rf.train, newdata = testset, type = "prob") head(rf.test2) library(pROC) ROC.rf <- multiclass.roc(testset$COPD,rf.test2,plot = TRUE, print.auc = TRUE, legacy.axes = TRUE) head(ROC.rf) #计算权值 varImpPlot(rf.train) importance <- importance(rf.train) imp_df <- data.frame(feature=row.names(importance), importance=importance[,1]) imp_df$weight <- imp_df$importance/sum(imp_df$importance) imp_df$score <- imp_df$weight*100 print(imp_df)```增加一个评分系统以预测COPD,并增加可视化和输出公式
03-30
# 加载必要包 library(poLCA) library(dplyr) library(tidyr) library(ggplot2) # 设置路径与变量 file_path <- "D:/SHUJU/car_and_ebike.csv" vars <- c("Driver.gender", "Driver.identity", "Passenger.car.state", "Weekend", "Road.condition.classification", "Crash.type", "Weather", "Visibility", "Lighting.condition", "Road.functional.class", "Rider.age", "Physical.separation.of.the.road", "Rider.gender", "Rider.hurt.part") # 读取并预处理数据 data <- read.csv(file_path) data <- data[vars] # 转换为从1开始的分类变量 data[] <- lapply(data, function(x) { x <- as.factor(x) x <- as.numeric(as.factor(x)) return(x) }) # 构建 LCA 公式 f <- as.formula(paste("cbind(", paste(vars, collapse = ","), ") ~ 1")) # 初始化存储指标 fit_stats <- data.frame() models <- list() N <- nrow(data) # 样本量,用于 CAIC # 拟合 1~10 类的模型 for (k in 1:10) { cat("拟合 LCA 模型,类别数 =", k, "\n") set.seed(123) lca_model <- poLCA(f, data, nclass = k, na.rm = FALSE, verbose = FALSE) models[[k]] <- lca_model # 熵R²计算 posterior <- lca_model$posterior entropy <- -rowSums(posterior * log(posterior + 1e-10)) max_entropy <- log(ncol(posterior)) entropy_r2 <- 1 - mean(entropy) / max_entropy # CAIC 计算 ll <- lca_model$llik num_params <- lca_model$npar caic <- -2 * ll + num_params * (log(N) + 1) # 存储结果 fit_stats <- rbind(fit_stats, data.frame( K = k, BIC = lca_model$bic, AIC = lca_model$aic, CAIC = caic, EntropyR2 = entropy_r2 )) } # ---------- 图 1:AIC、BIC、CAIC ---------- # 手动设置 y 轴范围,使图更“平缓” y_min <- min(fit_plot$Value) * 0.98 # 稍微留点空间 y_max <- max(fit_plot$Value) * 1.02 p1 <- ggplot(fit_plot, aes(x = K, y = Value, color = Metric)) + geom_line(size = 1.2) + geom_point(size = 2.5) + scale_x_continuous(breaks = 1:10) + labs(x = "Number of clusters", y = "Information criterion") + coord_cartesian(ylim = c(y_min, y_max)) + # 控制纵轴显示范围 theme_minimal(base_size = 14) + theme( plot.title = element_blank(), legend.position = c(0.82, 0.85), legend.background = element_rect(fill = alpha("white", 0.6), color = NA), legend.title = element_blank() ) # ---------- 图 2:熵 R² ---------- fit_stats_r2 <- fit_stats %>% filter(K > 1) p2 <- ggplot(fit_stats_r2, aes(x = K, y = EntropyR2)) + geom_line(color = "#1f77b4", size = 1.2) + geom_point(color = "#1f77b4", size = 2.5) + scale_x_continuous(breaks = 2:10) + ylim(0, 1) + labs(x = "Number of clusters", y = "Entropy R²") + theme_minimal(base_size = 14) + theme( plot.title = element_blank() ) # 调整图形窗口大小(RStudio 中有效) options(repr.plot.width=10, repr.plot.height=5) # 显示图形 print(p1) print(p2) 这是我的代码,已经选出了最佳的k值,想要继续下一步的聚类,你可以帮我继续完善代码吗
06-13
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值