1.决策树(Decision Tree)
我觉得这句话写的很清楚:“决策树算法主要是指决策树进行创建中进行树分裂(划分数据集)的时候选取最优特征的算法,他的主要目的就是要选取一个特征能够将分开的数据集尽量的规整,也就是尽可能的纯. 最大的原则就是: 将无序的数据变得更加有序”
2.K-近邻算法(KNN)
3.主成分分析(PCA)
4.因子分析(Factor Analysis)
5.聚类分析(Cluster Analysis)
6.支持向量机
桓峰基因:癌症诊断和分子分型方法之支持向量机(SVM)
基础R语言操作SVM
所用函数较多,可follow
拓展的其他用法
7.分类树(Classification Trees)
8.回归树(Regression Trees)
9.随机森林(Random Forest)
randomForest函数详解
比较详细,函数基础用法都介绍了
很系统的流程
简单明了,很有用
copy了一段可能有用的代码,先存着,这是自己更改之后的
# Load the input data (an .Rdata workspace, not a csv as the original comment
# said). It should contain only predictors and the response; drop any
# index/identifier columns before modelling.
load(file = "./3.data_union_forRandomForest_diffFeature.Rdata")
# Random seed. The original template iterated m over 1..max_seed (see the
# commented-out for-loop below), kept the best-scoring seed, and re-ran it;
# this version runs a single fixed seed instead.
m <- 1234
# Number of cross-validation folds
k <- 5
# for (m in 1:max_seed){
print(paste('set seed = ', m))
# Fix the RNG so the fold assignment is reproducible
set.seed(m)
# Drop identifier columns and remove rows containing missing values
data <- data_union %>%
  select(-c(model_id, PUBCHEM, SMILE, smiles)) %>%
  na.omit()
# Coerce to a plain data.frame (select() may return a tibble)
data <- as.data.frame(data)
# Number of observations
len <- nrow(data)
# Split the row indices 1..len into k disjoint cross-validation folds.
#
# Args:
#   k   -- number of folds.
#   len -- total number of rows to split.
# Returns a list with two data frames, one column per fold:
#   one -- training indices (len - len %/% k rows per column)
#   two -- test indices (len %/% k rows per column)
# Test folds are sampled without replacement from a shrinking pool, so they
# are pairwise disjoint. If len is not divisible by k, the len %% k leftover
# indices never appear in any test fold (they do remain in every training
# fold, since each training fold is the full complement of its test fold).
mysplit <- function(k, len) {
  # Pool of indices not yet assigned to any test fold
  pool <- seq_len(len)
  # Size of each test fold (integer division; the remainder is discarded)
  seg <- len %/% k
  # Preallocate the index containers: one column per fold
  train <- as.data.frame(matrix(nrow = len - seg))
  test <- as.data.frame(matrix(nrow = seg))
  for (i in seq_len(k)) {
    # Draw this fold's test indices from the remaining pool
    ctest <- sample(pool, seg, replace = FALSE)
    # Training fold = every index except the current test fold
    train[i] <- setdiff(seq_len(len), ctest)
    test[i] <- ctest
    # Sampling without replacement across folds: shrink the pool
    pool <- setdiff(pool, ctest)
  }
  # Return both index data frames
  list(one = train, two = test)
}
# Build the folds and pull out the train/test index data frames
split1 <- mysplit(k, len)
train <- split1$one
test <- split1$two
# Running sums of per-fold R2 scores; averaged after the loop
total_train <- 0
total_test <- 0
# FIX: total_actual / total_predict were used in rbind() inside the loop
# without ever being initialised, which errors on the first fold.
total_actual <- NULL
total_predict <- NULL
# Load the modelling package once, not on every iteration
library(randomForest)

# Helper: coefficient of determination (R2) of predictions vs. observations
r2_score <- function(y_actual, y_predict) {
  ss_total <- sum((y_actual - mean(y_actual))^2)
  ss_residuals <- sum((y_actual - y_predict)^2)
  1 - ss_residuals / ss_total
}

# Main k-fold cross-validation loop
for (i in 1:k) {
  # Materialise this fold's training and test sets from the index columns
  train_actual <- data[unlist(train[i]), ]
  test_actual <- data[unlist(test[i]), ]
  # Fit the random forest (response: ln_IC50); tune parameters as needed
  rf <- randomForest(ln_IC50 ~ ., data = train_actual,
                     proximity = TRUE, importance = TRUE)
  # FIX: save() requires file = ; the original passed the path as a second
  # object to be saved, which fails at runtime.
  save(rf, file = paste0("./Model", i, ".Rdata"))
  # Predictions on the training and test sets
  train_predict <- predict(rf, train_actual)
  test_predict <- predict(rf, test_actual)
  # Per-fold output paths (FIX: test_actual path was ".test_actual" — missing
  # the "/" after the dot)
  path_test_actual <- paste0("./test_actual", i, ".csv")
  path_test_predict <- paste0("./test_predict", i, ".csv")
  path_train_actual <- paste0("./train_actual", i, ".csv")
  path_train_predict <- paste0("./train_predict", i, ".csv")
  # Write everything to csv for later inspection
  write.csv(train_actual, file = path_train_actual)
  write.csv(train_predict, file = path_train_predict)
  write.csv(test_actual, file = path_test_actual)
  write.csv(test_predict, file = path_test_predict)
  # Accumulate the test-set actual/predicted values across folds
  total_actual <- rbind(total_actual, read.csv(path_test_actual))
  total_predict <- rbind(total_predict, read.csv(path_test_predict))
  # ---- Test-set accuracy ----
  # FIX: the original had a stray "+" after the data.frame() call (breaking
  # the expression) and indexed with an unquoted, undefined name ln_IC50;
  # the column name must be a string.
  r_test <- r2_score(test_actual[["ln_IC50"]], test_predict)
  # print(paste0("fold ", i, " test r2: ", r_test))
  # ---- Training-set accuracy ----
  r_train <- r2_score(train_actual[["ln_IC50"]], train_predict)
  # print(paste0("fold ", i, " train r2: ", r_train))
  # Update the running sums
  total_train <- total_train + r_train
  total_test <- total_test + r_test
}
# Combine the collected test-set actual and predicted values and write them.
# NOTE(review): merge() joins on all shared column names here — presumably the
# csv row-index column X; verify this pairs rows as intended.
total <- merge(total_actual, total_predict)
write.csv(total, './total.csv')
# Mean R2 over the k folds
total_train <- total_train / k
total_test <- total_test / k
# Report the final scores
print(paste('最终测试集r2:', total_test))
print(paste('最终训练集r2:', total_train))
print('done')
这是未经更改的,循环了不同的seed,耗时较长
已经很傻瓜操作了,需要修改的代码有:1)交叉验证折数 k 的值; 2)原始csv数据的路径; 3)因变量的参数名;4)随机种子最大值 max_seed 。模型的调试请自行尝试。
# Load the input data (an .Rdata workspace, not a csv as the original comment
# said). It should contain only predictors and the response; drop any
# index/identifier columns before modelling.
load(file = "./3.data_union_forRandomForest_diffFeature.Rdata")
# Random seed. The original template iterated m over 1..max_seed (see the
# commented-out for-loop below), kept the best-scoring seed, and re-ran it;
# this version runs a single fixed seed instead.
m <- 1234
# Number of cross-validation folds
k <- 5
# for (m in 1:max_seed){
print(paste('set seed = ', m))
# Fix the RNG so the fold assignment is reproducible
set.seed(m)
# Drop identifier columns and remove rows containing missing values
data <- data_union %>%
  select(-c(model_id, PUBCHEM, SMILE, smiles)) %>%
  na.omit()
# Coerce to a plain data.frame (select() may return a tibble)
data <- as.data.frame(data)
# Number of observations
len <- nrow(data)
# Split the row indices 1..len into k disjoint cross-validation folds.
#
# Args:
#   k   -- number of folds.
#   len -- total number of rows to split.
# Returns a list with two data frames, one column per fold:
#   one -- training indices (len - len %/% k rows per column)
#   two -- test indices (len %/% k rows per column)
# Test folds are sampled without replacement from a shrinking pool, so they
# are pairwise disjoint. If len is not divisible by k, the len %% k leftover
# indices never appear in any test fold (they do remain in every training
# fold, since each training fold is the full complement of its test fold).
mysplit <- function(k, len) {
  # Pool of indices not yet assigned to any test fold
  pool <- seq_len(len)
  # Size of each test fold (integer division; the remainder is discarded)
  seg <- len %/% k
  # Preallocate the index containers: one column per fold
  train <- as.data.frame(matrix(nrow = len - seg))
  test <- as.data.frame(matrix(nrow = seg))
  for (i in seq_len(k)) {
    # Draw this fold's test indices from the remaining pool
    ctest <- sample(pool, seg, replace = FALSE)
    # Training fold = every index except the current test fold
    train[i] <- setdiff(seq_len(len), ctest)
    test[i] <- ctest
    # Sampling without replacement across folds: shrink the pool
    pool <- setdiff(pool, ctest)
  }
  # Return both index data frames
  list(one = train, two = test)
}
# Build the folds and pull out the train/test index data frames
split1 <- mysplit(k, len)
train <- split1$one
test <- split1$two
# Running sums of per-fold R2 scores; averaged after the loop
total_train <- 0
total_test <- 0
# FIX: total_actual / total_predict were used in rbind() inside the loop
# without ever being initialised, which errors on the first fold.
total_actual <- NULL
total_predict <- NULL
# Load the modelling package once, not on every iteration
library(randomForest)

# Helper: coefficient of determination (R2) of predictions vs. observations
r2_score <- function(y_actual, y_predict) {
  ss_total <- sum((y_actual - mean(y_actual))^2)
  ss_residuals <- sum((y_actual - y_predict)^2)
  1 - ss_residuals / ss_total
}

# Main k-fold cross-validation loop
for (i in 1:k) {
  # Materialise this fold's training and test sets from the index columns
  train_actual <- data[unlist(train[i]), ]
  test_actual <- data[unlist(test[i]), ]
  # Fit the random forest (response: ln_IC50); tune parameters as needed
  rf <- randomForest(ln_IC50 ~ ., data = train_actual,
                     proximity = TRUE, importance = TRUE)
  # FIX: save() requires file = ; the original passed the path as a second
  # object to be saved, which fails at runtime.
  save(rf, file = paste0("./Model", i, ".Rdata"))
  # Predictions on the training and test sets
  train_predict <- predict(rf, train_actual)
  test_predict <- predict(rf, test_actual)
  # Per-fold output paths (FIX: test_actual path was ".test_actual" — missing
  # the "/" after the dot)
  path_test_actual <- paste0("./test_actual", i, ".csv")
  path_test_predict <- paste0("./test_predict", i, ".csv")
  path_train_actual <- paste0("./train_actual", i, ".csv")
  path_train_predict <- paste0("./train_predict", i, ".csv")
  # Write everything to csv for later inspection
  write.csv(train_actual, file = path_train_actual)
  write.csv(train_predict, file = path_train_predict)
  write.csv(test_actual, file = path_test_actual)
  write.csv(test_predict, file = path_test_predict)
  # Accumulate the test-set actual/predicted values across folds
  total_actual <- rbind(total_actual, read.csv(path_test_actual))
  total_predict <- rbind(total_predict, read.csv(path_test_predict))
  # ---- Test-set accuracy ----
  # FIX: the original had a stray "+" after the data.frame() call (breaking
  # the expression) and indexed with an unquoted, undefined name ln_IC50;
  # the column name must be a string.
  r_test <- r2_score(test_actual[["ln_IC50"]], test_predict)
  # print(paste0("fold ", i, " test r2: ", r_test))
  # ---- Training-set accuracy ----
  r_train <- r2_score(train_actual[["ln_IC50"]], train_predict)
  # print(paste0("fold ", i, " train r2: ", r_train))
  # Update the running sums
  total_train <- total_train + r_train
  total_test <- total_test + r_test
}
# Combine the collected test-set actual and predicted values and write them.
# NOTE(review): merge() joins on all shared column names here — presumably the
# csv row-index column X; verify this pairs rows as intended.
total <- merge(total_actual, total_predict)
write.csv(total, './total.csv')
# Mean R2 over the k folds
total_train <- total_train / k
total_test <- total_test / k
# Report the final scores
print(paste('最终测试集r2:', total_test))
print(paste('最终训练集r2:', total_train))
print('done')