机器学习+算法

R语言手把手-机器学习
指南,先看看

1.决策树(Decision Tree)

我觉得这句话写的很清楚:“决策树算法主要是指决策树进行创建中进行树分裂(划分数据集)的时候选取最优特征的算法,他的主要目的就是要选取一个特征能够将分开的数据集尽量的规整,也就是尽可能的纯. 最大的原则就是: 将无序的数据变得更加有序”

2.K-邻近算法(KNN)

3.主成分分析(PCA)

4.因子分析(Factor Analysis)

5.聚类分析(Cluster Analysis)

6.支持向量机

桓峰基因:癌症诊断和分子分型方法之支持向量机(SVM)
基础R语言操作SVM
所用函数较多,可follow
拓展的其他用法

7.分类树(Classification Trees)

8.回归树(Regression Trees)

9.随机森林(Random Forest)

randomForest函数详解
比较详细,函数基础用法都介绍了
很系统的流程
简单明了,很有用

copy了一段可能有用的代码,先存着,这是自己更改之后的

# Load the input data. Keep only predictors and the response; identifier
# columns are dropped below.
load(file = "./3.data_union_forRandomForest_diffFeature.Rdata")

# Random seed. The original workflow iterated seeds 1..max_seed and kept the
# most accurate one; that loop is commented out below and a single fixed
# seed is used instead.
m <- 1234
# Number of cross-validation folds.
k <- 5
# for (m in 1:max_seed){
  print(paste('set seed = ', m))
  # Fix the RNG so the fold assignment is reproducible.
  set.seed(m)
  # Drop identifier columns and rows with missing values.
  # NOTE(review): requires dplyr (%>%, select) to be attached beforehand.
  data <- data_union %>%
    select(-c(model_id, PUBCHEM, SMILE, smiles)) %>%
    na.omit(.)
  # Ensure a plain data.frame (select() may return a tibble).
  data <- as.data.frame(data)
  # Total number of rows available for splitting.
  len <- nrow(data)
  
  # mysplit: randomly partition the row indices 1..len into k disjoint folds.
  #
  # Args:
  #   k:   number of folds.
  #   len: total number of rows to split.
  #
  # Returns a list of two data frames, one column per fold:
  #   one: training indices for each fold (all rows except that fold's test rows),
  #   two: test indices for each fold.
  # If len is not divisible by k, the leftover rows never enter a test fold
  # (they remain in every training fold).
  mysplit = function(k, len) {
    # Pool of indices still available for test-fold sampling.
    pool <- c(1:len)
    # Rows per test fold (remainder discarded).
    seg <- as.integer(len / k)
    # Preallocate one column per fold.
    train <- as.data.frame(matrix(nrow = (len - seg)))
    test <- as.data.frame(matrix(nrow = seg))
    for (fold in 1:k) {
      # Draw this fold's test indices without replacement.
      ctest <- sample(pool, seg, replace = FALSE)
      train[fold] <- setdiff(c(1:len), ctest)
      test[fold] <- ctest
      # Remove the drawn indices so later folds cannot reuse them.
      pool <- setdiff(pool, ctest)
    }
    list(one = train, two = test)
  }
  # Build the fold index tables and unpack the train/test data frames.
  split1 <- mysplit(k, len)
  train <- split1$one
  test <- split1$two
  
  # Running sums of per-fold R2; averaged after the loop.
  total_train <- 0
  total_test <- 0

  # Accumulators for the pooled test-set actuals/predictions across folds.
  # (Original bug: these were used in rbind() without ever being initialized,
  # which errors on the first fold.)
  total_actual <- NULL
  total_predict <- NULL

  # Coefficient of determination: R2 = 1 - SS_res / SS_tot.
  r_squared <- function(actual, predicted) {
    ss_total <- sum((actual - mean(actual))^2)
    ss_residuals <- sum((actual - predicted)^2)
    1 - ss_residuals / ss_total
  }

  # Attach randomForest once, not on every iteration.
  library(randomForest)

  # k-fold cross-validation loop.
  for (i in 1:k) {
    # Split rows into this fold's training and test sets.
    train_actual <- data[unlist(train[i]), ]
    test_actual <- data[unlist(test[i]), ]

    # Fit the random forest (response: ln_IC50; tune parameters as needed).
    rf <- randomForest(ln_IC50 ~ ., data = train_actual,
                       proximity = TRUE, importance = TRUE)
    # Persist the fold model. (Original bug: `save(rf, path)` without
    # `file =` treats the path expression as another object to save and fails.)
    save(rf, file = paste0("./Model", i, ".Rdata"))

    # Predictions on both splits.
    train_predict <- predict(rf, train_actual)
    test_predict <- predict(rf, test_actual)

    # Per-fold CSV output paths. (Original bug: ".test_actual" was missing
    # the "/" of "./".)
    path_test_actual <- paste0("./test_actual", i, ".csv")
    path_test_predict <- paste0("./test_predict", i, ".csv")
    path_train_actual <- paste0("./train_actual", i, ".csv")
    path_train_predict <- paste0("./train_predict", i, ".csv")

    # Write the splits and predictions out for inspection.
    write.csv(train_actual, file = path_train_actual)
    write.csv(train_predict, file = path_train_predict)
    write.csv(test_actual, file = path_test_actual)
    write.csv(test_predict, file = path_test_predict)

    # Pool the test-set actuals/predictions across folds.
    total_actual <- rbind(total_actual, read.csv(path_test_actual))
    total_predict <- rbind(total_predict, read.csv(path_test_predict))

    # Test-set R2. (Original bugs: `[[ln_IC50]]` referenced an undefined
    # symbol instead of the column name string "ln_IC50", and a stray
    # trailing `+` broke the statement.)
    r_test <- r_squared(test_actual[["ln_IC50"]], test_predict)
    # print(paste("fold ", i, " test r2: ", r_test, sep = ""))

    # Training-set R2.
    r_train <- r_squared(train_actual[["ln_IC50"]], train_predict)
    # print(paste("fold ", i, " train r2: ", r_train, sep = ""))

    # Update the running sums.
    total_train <- total_train + r_train
    total_test <- total_test + r_test
  }
  # Join the pooled actual values and predictions (merged on the shared "X"
  # row-identifier column produced by the write.csv/read.csv round trip)
  # and save the combined table.
  total <- merge(total_actual, total_predict)
  write.csv(total, './total.csv')

  # Average the per-fold R2 sums over the k folds.
  total_train <- total_train / k
  total_test <- total_test / k
  # Report the cross-validated scores.
  print(paste('最终测试集r2:', total_test))
  print(paste('最终训练集r2:', total_train))
  print('done')

这是未经更改的版本,原本会循环不同的 seed(耗时较长);注意下方贴出的代码中 seed 循环已被注释掉,只运行固定的 m。
已经很傻瓜操作了,需要修改的代码有:1)交叉验证折数 k 的值; 2)原始csv数据的路径; 3)因变量的参数名;4)随机种子最大值 max_seed 。模型的调试请自行尝试。

# Load the input data. Keep only predictors and the response; identifier
# columns are dropped below.
load(file = "./3.data_union_forRandomForest_diffFeature.Rdata")

# Random seed. The original workflow iterated seeds 1..max_seed and kept the
# most accurate one; that loop is commented out below and a single fixed
# seed is used instead.
m <- 1234
# Number of cross-validation folds.
k <- 5
# for (m in 1:max_seed){
  print(paste('set seed = ', m))
  # Fix the RNG so the fold assignment is reproducible.
  set.seed(m)
  # Drop identifier columns and rows with missing values.
  # NOTE(review): requires dplyr (%>%, select) to be attached beforehand.
  data <- data_union %>%
    select(-c(model_id, PUBCHEM, SMILE, smiles)) %>%
    na.omit(.)
  # Ensure a plain data.frame (select() may return a tibble).
  data <- as.data.frame(data)
  # Total number of rows available for splitting.
  len <- nrow(data)
  
  # mysplit: randomly partition the row indices 1..len into k disjoint folds.
  #
  # Args:
  #   k:   number of folds.
  #   len: total number of rows to split.
  #
  # Returns a list of two data frames, one column per fold:
  #   one: training indices for each fold (all rows except that fold's test rows),
  #   two: test indices for each fold.
  # If len is not divisible by k, the leftover rows never enter a test fold
  # (they remain in every training fold).
  mysplit = function(k, len) {
    # Pool of indices still available for test-fold sampling.
    pool <- c(1:len)
    # Rows per test fold (remainder discarded).
    seg <- as.integer(len / k)
    # Preallocate one column per fold.
    train <- as.data.frame(matrix(nrow = (len - seg)))
    test <- as.data.frame(matrix(nrow = seg))
    for (fold in 1:k) {
      # Draw this fold's test indices without replacement.
      ctest <- sample(pool, seg, replace = FALSE)
      train[fold] <- setdiff(c(1:len), ctest)
      test[fold] <- ctest
      # Remove the drawn indices so later folds cannot reuse them.
      pool <- setdiff(pool, ctest)
    }
    list(one = train, two = test)
  }
  # Build the fold index tables and unpack the train/test data frames.
  split1 <- mysplit(k, len)
  train <- split1$one
  test <- split1$two
  
  # Running sums of per-fold R2; averaged after the loop.
  total_train <- 0
  total_test <- 0

  # Accumulators for the pooled test-set actuals/predictions across folds.
  # (Original bug: these were used in rbind() without ever being initialized,
  # which errors on the first fold.)
  total_actual <- NULL
  total_predict <- NULL

  # Coefficient of determination: R2 = 1 - SS_res / SS_tot.
  r_squared <- function(actual, predicted) {
    ss_total <- sum((actual - mean(actual))^2)
    ss_residuals <- sum((actual - predicted)^2)
    1 - ss_residuals / ss_total
  }

  # Attach randomForest once, not on every iteration.
  library(randomForest)

  # k-fold cross-validation loop.
  for (i in 1:k) {
    # Split rows into this fold's training and test sets.
    train_actual <- data[unlist(train[i]), ]
    test_actual <- data[unlist(test[i]), ]

    # Fit the random forest (response: ln_IC50; tune parameters as needed).
    rf <- randomForest(ln_IC50 ~ ., data = train_actual,
                       proximity = TRUE, importance = TRUE)
    # Persist the fold model. (Original bug: `save(rf, path)` without
    # `file =` treats the path expression as another object to save and fails.)
    save(rf, file = paste0("./Model", i, ".Rdata"))

    # Predictions on both splits.
    train_predict <- predict(rf, train_actual)
    test_predict <- predict(rf, test_actual)

    # Per-fold CSV output paths. (Original bug: ".test_actual" was missing
    # the "/" of "./".)
    path_test_actual <- paste0("./test_actual", i, ".csv")
    path_test_predict <- paste0("./test_predict", i, ".csv")
    path_train_actual <- paste0("./train_actual", i, ".csv")
    path_train_predict <- paste0("./train_predict", i, ".csv")

    # Write the splits and predictions out for inspection.
    write.csv(train_actual, file = path_train_actual)
    write.csv(train_predict, file = path_train_predict)
    write.csv(test_actual, file = path_test_actual)
    write.csv(test_predict, file = path_test_predict)

    # Pool the test-set actuals/predictions across folds.
    total_actual <- rbind(total_actual, read.csv(path_test_actual))
    total_predict <- rbind(total_predict, read.csv(path_test_predict))

    # Test-set R2. (Original bugs: `[[ln_IC50]]` referenced an undefined
    # symbol instead of the column name string "ln_IC50", and a stray
    # trailing `+` broke the statement.)
    r_test <- r_squared(test_actual[["ln_IC50"]], test_predict)
    # print(paste("fold ", i, " test r2: ", r_test, sep = ""))

    # Training-set R2.
    r_train <- r_squared(train_actual[["ln_IC50"]], train_predict)
    # print(paste("fold ", i, " train r2: ", r_train, sep = ""))

    # Update the running sums.
    total_train <- total_train + r_train
    total_test <- total_test + r_test
  }
  # Join the pooled actual values and predictions (merged on the shared "X"
  # row-identifier column produced by the write.csv/read.csv round trip)
  # and save the combined table.
  total <- merge(total_actual, total_predict)
  write.csv(total, './total.csv')

  # Average the per-fold R2 sums over the k folds.
  total_train <- total_train / k
  total_test <- total_test / k
  # Report the cross-validated scores.
  print(paste('最终测试集r2:', total_test))
  print(paste('最终训练集r2:', total_train))
  print('done')

10.梯度提升算法(Gradient Boosting)

11.神经网络(Neural Network)

图神经网络从入门到入门

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值