第4R-CSDN博客

该博客展示了如何使用R语言进行数据预处理，包括读取医学数据、替换表头、调整变量名、数据类型转换等。接着，博主探讨了不平衡数据的处理，通过SMOTE算法进行过采样和欠采样，以平衡类别分布。此外，还介绍了使用sample函数进行随机抽样的方法，并展示了如何确保抽样后数据的类别比例。最后，博主分享了如何根据目标变量的分布来调整抽样策略，以获得代表性样本。

# Data preparation: hypothyroid data set (UCI Machine Learning Repository) ----
# Reads the raw data, attaches variable names, recodes the class column to
# 0/1, and rebalances the classes with SMOTE.

# SMOTE() is provided by the DMwR package; it must be attached before the
# call near the end of this section. Install it first if necessary:
# install.packages("DMwR")
library(DMwR)

# Neither file has a header row.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 no longer converts
# strings to factors by default.
# NOTE(review): DMwR::SMOTE appears to need factor (not character)
# predictors -- confirm against the installed version.
hyper <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.data",
  header = FALSE,
  stringsAsFactors = TRUE
)
names <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.names",
  header = FALSE,
  sep = "\t"
)[[1]]
hyper
names

# Strip the ":" characters from the variable names.
names <- gsub(pattern = ":", replacement = "", x = names)

# Use the cleaned names as the column header.
colnames(hyper) <- names

# Show the header.
str(colnames(hyper))

# Rename the first column to "target".
colnames(hyper)[1] <- "target"
colnames(hyper)
head(hyper)

# Recode the class: "negative" becomes 0, everything else becomes 1.
hyper$target <- ifelse(hyper$target == "negative", 0, 1)
head(hyper)

# Check the 0/1 counts.
table(hyper$target)
# Relative frequency of each class.
prop.table(table(hyper$target))

# Convert target to a factor (SMOTE is called on a classification target).
str(hyper)
hyper$target <- as.factor(hyper$target)
str(hyper)

# Rebalance the classes with SMOTE; perc.over drives how many synthetic
# minority cases are generated, perc.under how many majority cases are kept
# (see the DMwR documentation for the exact definitions).
hyper_new <- SMOTE(target ~ ., hyper, perc.over = 100, perc.under = 400)
table(hyper_new$target)

# SMOTE on the competition training data ----
# Reads the training file, converts `label` to a factor and rebalances it.
# SMOTE() comes from the DMwR package, which must already be attached.
user <- read.csv("E://dasan//数据挖掘//精准医疗大赛-初赛(复赛)数据//f_train_20180204.csv")
# Add a header (disabled):
# colnames(user) <- names

# Class counts before rebalancing.
table(user$label)
user$label <- as.factor(user$label)
hyper_new <- SMOTE(label ~ ., user, perc.over = 100, perc.under = 200)
# Class counts after rebalancing.
table(hyper_new$label)

# Install caret only when it is missing, instead of reinstalling on every run.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library("caret")

# Scratch: sampling, then inspecting the data used for class proportions ----
x <- seq(1, 10)
a <- sample(x, 8, replace = FALSE)
# The argument is `replace` (the original `xreplace` is not an argument of
# sample()). Size 15 exceeds length(x), so the draw must be with replacement.
(c <- sample(x, 15, replace = TRUE))
user <- read.csv("data1.csv", header = TRUE)
# c() evaluates to NULL, so this assigns NULL as the column names.
# NOTE(review): looks like a placeholder for a real name vector -- confirm.
colnames(user) <- c()
nrow(user)

# A small sample() example ----

set.seed(1234)

# Create x, holding the integers 1 to 10.
x <- seq(1, 10)
x
typeof(x)
str(x)

# Sample from x without replacement.
a <- sample(x, 8, replace = FALSE)
a

# Sample from x with replacement.
b <- sample(x, 8, replace = TRUE)
b

# When size is larger than length(x):
# without replacement this is an error; try() reports it and lets the script
# continue instead of aborting here.
try((c <- sample(x, 15, replace = FALSE)))
# with replacement it works.
(c <- sample(x, 15, replace = TRUE))

# Sampling the active-user data with sample() ----
# Draws a simple random sample and a stratified sample of the same size, then
# compares the 0/1 split of the column 是否付费 against the full data.

# Import the data (the file has no header row).
# user <- read.csv("活跃用户是否付费数据.csv", TRUE)
user <- read.csv("data1.csv", header = FALSE)
user_col_names <- c(
  "用户id", "是否付费", "注册至今距离天数", "最后一周登陆天数",
  "最后一周登陆次数", "最后一周0-8点登陆次数", "最后一周8-18点登陆次数",
  "最后一周18-24点登陆次数"
)
colnames(user) <- user_col_names

# Number of rows in user.
nrow(user)

# Simple random sample without replacement.
# set.seed(1234)
sample_size <- 10000

# Draw the row indices.
index <- sample(nrow(user), sample_size, replace = FALSE)
head(index)

# Store the sampled rows in user_sample.
user_sample <- user[index, ]

# Number of rows in user_sample.
nrow(user_sample)

# Compare the 0/1 counts and proportions of 是否付费 in user and user_sample.
table(user$是否付费)
table(user_sample$是否付费)
round(prop.table(table(user$是否付费)), 3)
round(prop.table(table(user_sample$是否付费)), 3)

# Stratified sample: keep the population share of 0s and 1s.
rate <- sum(user$是否付费 == 0) / nrow(user)
d <- seq_len(nrow(user))
user[d[user$是否付费 == 0], ]  # selection by row index
user[user$是否付費 == 0, ]     # selection by logical mask

# Round the stratum sizes explicitly so no fractional size reaches sample()
# and the two strata always add up to exactly sample_size.
n_zero <- round(sample_size * rate)
n_one <- sample_size - n_zero
# which() drops NA comparisons, so no NA row indices can be drawn.
index1 <- sample(d[which(user$是否付费 == 0)], n_zero)
index2 <- sample(d[which(user$是否付费 == 1)], n_one)
user_samplel <- user[c(index1, index2), ]
tail(user_samplel$是否付费)

# Sampling the competition training data by SNP1 ----
# Draws a simple random sample, a proportional stratified sample and an
# equal-size-per-class sample, printing the SNP1 distribution of each.
user <- read.csv(
  "E://dasan//数据挖掘//精准医疗大赛-初赛(复赛)数据//f_train_20180204.csv",
  header = TRUE
)
sample_size <- 500

# Simple random sample without replacement.
index <- sample(nrow(user), sample_size, replace = FALSE)
head(index)
user_sample <- user[index, ]
nrow(user_sample)

# SNP1 counts and proportions: full data versus simple random sample.
table(user$SNP1)
table(user_sample$SNP1)
round(prop.table(table(user$SNP1)), 2)
round(prop.table(table(user_sample$SNP1)), 2)

# Stratum proportions are taken from the full data, matching how `rate` is
# computed for the 是否付费 sampling elsewhere in this script; they are left
# unrounded so the stratum sizes are not distorted.
rate <- prop.table(table(user$SNP1))
# rate <- sum(user$SNP1 == 1) / nrow(user)
d <- seq_len(nrow(user))
user[user$SNP1 == 3, 2]

# Proportional stratified sample. Proportions are looked up by level name and
# sizes are rounded, so floating-point products cannot fall just below a
# whole number. The rounded sizes may add up to sample_size +/- 1.
# which() drops NA comparisons, so no NA row indices can be drawn.
index1 <- sample(d[which(user$SNP1 == 1)], round(sample_size * rate[["1"]]))
index2 <- sample(d[which(user$SNP1 == 2)], round(sample_size * rate[["2"]]))
index3 <- sample(d[which(user$SNP1 == 3)], round(sample_size * rate[["3"]]))
user_samplel <- user[c(index1, index2, index3), ]
head(user_samplel$SNP1)
round(prop.table(table(user_samplel$SNP1)), 2)

# Equal-size sample: the same number of rows from every SNP1 class.
# NOTE(review): 226 is presumably the size of the smallest class -- confirm.
# d <- 1:nrow(user)
# index1 <- d[user$SNP1 == 1]
per_class_n <- 226
index1 <- sample(d[which(user$SNP1 == 1)], per_class_n)
index2 <- sample(d[which(user$SNP1 == 2)], per_class_n)
index3 <- sample(d[which(user$SNP1 == 3)], per_class_n)
length(index3)
user_samplel <- user[c(index1, index2, index3), ]
round(prop.table(table(user_samplel$SNP1)), 2)

round(prop.table(table(user_samplel$SNP1)), 5)
table(user_samplel$SNP1)

# Partition example on the built-in iris data; createDataPartition() comes
# from the caret package.
splitindex2 <- createDataPartition(iris$Species, times = 2, p = 0.1, list = TRUE)

# NOTE(review): `user` now holds the competition data, which may not contain
# the column 是否付费; the conversion is applied only when the column exists.
if ("是否付费" %in% colnames(user)) {
  user$是否付费 <- as.factor(user$是否付费)
}