CatBoost 的 R 实现（包含样本权重与指标权重设置、SHAP 值、不平衡数据集处理）

原创

已于 2022-03-16 10:27:02 修改 · 2.5k 阅读

18 ·

CC 4.0 BY-SA版权

文章标签：

#r语言 #boost

于 2021-12-16 14:41:48 首次发布

该博客主要介绍了使用 R 语言对肿瘤数据进行预处理，包括数据读取、列类型转换、降维处理和缺失值填充；然后通过 CatBoost 算法训练模型，涉及样本权重调整、特征重要性评估等步骤，最终得到模型并进行预测，同时计算预测准确率。

读入数据

# Name lookup table ("L-name contrast"); this file is read without a header row.
XMDZ <- read.csv("C:/Users/13771/Desktop/XMDZ.csv", header = FALSE, sep = ",")

# Data union: load the four tumour data sets with identical settings.
# The first CSV column supplies the row names.
read_raw_csv <- function(path) {
  read.csv(file = path, header = TRUE, sep = ",", row.names = 1)
}
jzx <- read_raw_csv("C:/Users/13771/Desktop/2甲状腺肿瘤.csv")    # thyroid tumour
rx <- read_raw_csv("C:/Users/13771/Desktop/3乳腺恶性肿瘤.csv")   # malignant breast tumour
yx <- read_raw_csv("C:/Users/13771/Desktop/胰腺肿瘤.csv")        # pancreatic tumour
sg <- read_raw_csv("C:/Users/13771/Desktop/食管癌.csv")          # oesophageal cancer

# Stack the four sets row-wise into a single data frame.
data <- rbind(jzx, rx, yx, sg)

列类型转换

# Identify column types.
# Build `cdv`, a vector with one class name per column of `data`, which is
# later passed to read.csv(colClasses = ...) when the files are re-read.
library(do)
# vapply() always returns a named character vector (one entry per column);
# sapply() would silently return a list for a data frame with no columns.
cdv1 <- vapply(data, typeof, character(1))
table(cdv1)
# Each pattern is a "from:to" pair for do::Replace():
# double and integer columns become numeric, character and logical become factor.
cdv <- Replace(cdv1, pattern = c(
  "double:numeric", "character:factor",
  "logical:factor", "integer:numeric"
))
table(cdv)

以规定的列类型重新读入

# Read the data again, this time forcing the column classes held in `cdv`
# and treating the string '0' as a missing value (NA).
read_typed_csv <- function(path) {
  read.csv(
    file = path,
    header = TRUE,
    sep = ",",
    row.names = 1,
    na.strings = "0",
    colClasses = cdv
  )
}
JZX <- read_typed_csv("C:/Users/13771/Desktop/2甲状腺肿瘤.csv")    # thyroid tumour
RX <- read_typed_csv("C:/Users/13771/Desktop/3乳腺恶性肿瘤.csv")   # malignant breast tumour
YX <- read_typed_csv("C:/Users/13771/Desktop/胰腺肿瘤.csv")        # pancreatic tumour
SG <- read_typed_csv("C:/Users/13771/Desktop/食管癌.csv")          # oesophageal cancer

# Stack the typed sets row-wise; this replaces the earlier untyped `data`.
data <- rbind(JZX, RX, YX, SG)

# Optional column removal (left disabled):
#data<-subset(data,select=-c(DIAGNOSTIC,CHECKEROPINION,SAMPLETYPE))

定义自变量列和因变量列

# Define the label: 1 when TYPE contains "胰腺" (pancreas), 0 otherwise.
# gsub() collapses any string containing the keyword to the string "1".
data$TYPE <- gsub(".*胰腺.*", "1", data$TYPE)
# Every other non-missing value becomes "0"; missing values stay NA.
is_other <- !is.na(data$TYPE) & data$TYPE != "1"
data$TYPE[is_other] <- "0"
table(data$TYPE)

# Transform the label type: show the element type before and after conversion.
table(vapply(data$TYPE, typeof, character(1), USE.NAMES = FALSE))
# as.integer() is already vectorised. The previous sapply(data$TYPE, as.integer)
# attached the strings "1"/"0" as element names and would have returned a
# list for a zero-row data frame.
data$TYPE <- as.integer(data$TYPE)
table(vapply(data$TYPE, typeof, character(1), USE.NAMES = FALSE))

降维&填补缺失值（其实在catboost包里面有自带的缺失值补充参数，这一步可以