读入数据
# Lookup table (original note: "L-name contrast"); read without a header row.
XMDZ <- read.csv(
  file = 'C:/Users/13771/Desktop/XMDZ.csv',
  header = FALSE,
  sep = ","
)

# First-pass load of the four per-disease CSV files. Every file is read with
# the same options: the first line is the header and the first column
# supplies the row names.
raw_paths <- c(
  jzx = 'C:/Users/13771/Desktop/2甲状腺肿瘤.csv',
  rx = 'C:/Users/13771/Desktop/3乳腺恶性肿瘤.csv',
  yx = 'C:/Users/13771/Desktop/胰腺肿瘤.csv',
  sg = 'C:/Users/13771/Desktop/食管癌.csv'
)
raw_tables <- lapply(
  raw_paths,
  read.csv,
  row.names = 1,
  header = TRUE,
  sep = ","
)
jzx <- raw_tables[["jzx"]]
rx <- raw_tables[["rx"]]
yx <- raw_tables[["yx"]]
sg <- raw_tables[["sg"]]

# Stack the four tables row-wise. The list is unnamed first so that rbind()
# does not prefix the row names with the list element names.
data <- do.call(rbind, unname(raw_tables))
列类型转换
# Identify column types
library(do)

# Storage type of every column of `data`. vapply() is used instead of
# sapply() so the result is always a character vector (sapply() would
# return an empty list if `data` had no columns).
cdv1 <- vapply(data, typeof, character(1))
table(cdv1)

# Translate storage types into the class names that are later passed to
# read.csv(colClasses = ...). Each pattern is written as "from:to":
# double/integer become numeric, character/logical become factor.
cdv <- Replace(
  cdv1,
  pattern = c(
    'double:numeric',
    'character:factor',
    'logical:factor',
    'integer:numeric'
  )
)
table(cdv)
以规定的列类型重新读入
# Second-pass load: read the same four CSV files again, this time forcing the
# column classes computed in `cdv` and treating the string '0' as missing (NA).
typed_paths <- c(
  JZX = 'C:/Users/13771/Desktop/2甲状腺肿瘤.csv',
  RX = 'C:/Users/13771/Desktop/3乳腺恶性肿瘤.csv',
  YX = 'C:/Users/13771/Desktop/胰腺肿瘤.csv',
  SG = 'C:/Users/13771/Desktop/食管癌.csv'
)
typed_tables <- lapply(
  typed_paths,
  read.csv,
  row.names = 1,
  na.strings = '0',
  header = TRUE,
  colClasses = cdv,
  sep = ","
)
JZX <- typed_tables[["JZX"]]
RX <- typed_tables[["RX"]]
YX <- typed_tables[["YX"]]
SG <- typed_tables[["SG"]]

# Combine the typed tables row-wise. The list is unnamed first so that rbind()
# does not prefix the row names with the list element names.
data <- do.call(rbind, unname(typed_tables))

# Optional column removal, left disabled as in the original script:
#data<-subset(data,select=-c(DIAGNOSTIC,CHECKEROPINION,SAMPLETYPE))
定义自变量列和因变量列
# Define the binary label: rows whose TYPE mentions "胰腺" become 1, every
# other non-missing row becomes 0.
type_label <- gsub(".*胰腺.*", "1", data$TYPE)
# Explicit NA-safe mask instead of which(): missing TYPE values stay NA.
type_label[!is.na(type_label) & type_label != "1"] <- "0"
data$TYPE <- type_label
table(data$TYPE)

# Convert the label to integer.
# A vector has a single storage type, so typeof() on the whole column is
# enough; there is no need to inspect it element by element.
typeof(data$TYPE)
# Vectorized as.integer() replaces sapply(data$TYPE, as.integer): it avoids
# the element names sapply() attaches when iterating over a character vector
# and always returns an integer vector, even for zero rows.
data$TYPE <- as.integer(data$TYPE)
typeof(data$TYPE)
降维 & 填补缺失值(其实在 catboost 包里面有自带的缺失值补充参数,这一步可以省略)

该博客主要介绍了使用 R 语言对肿瘤数据进行预处理,包括数据读取、列类型转换、降维处理和缺失值填充;然后通过 CatBoost 算法训练模型,涉及样本权重调整、特征重要性评估等步骤,最终得到模型并进行预测,同时计算了预测准确率。
最低0.47元/天 解锁文章





