# Data preparation: load the UCI hypothyroid data set, attach column names,
# and recode the first column as a binary factor named "target".
hyper <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.data",
  header = FALSE
)
# The .names file is read as a one-column table; [[1]] extracts that column.
names <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.names",
  header = FALSE, sep = "\t"
)[[1]]
hyper
names
# Strip the ":" characters from the raw names
names <- gsub(pattern = ":", replacement = "", x = names)
# Attach the cleaned names as column headers
colnames(hyper) <- names
# Show the headers
str(colnames(hyper))
# Rename the first column
colnames(hyper)[1] <- "target"
colnames(hyper)
head(hyper)
# Recode: "negative" becomes 0, every other value becomes 1
hyper$target <- ifelse(hyper$target == "negative", 0, 1)
head(hyper)
# Check the 0/1 counts
table(hyper$target)
# Relative frequency of each class
prop.table(table(hyper$target))
# Convert target to a factor
str(hyper)
hyper$target <- as.factor(hyper$target)
str(hyper)
# Class rebalancing with SMOTE().
# SMOTE() is not defined anywhere in this script, so the package that provides
# it has to be attached before the calls below. Install it once if missing:
# install.packages("DMwR")
library(DMwR)

# Rebalance the hypothyroid data on the "target" factor.
hyper_new <- SMOTE(target ~ ., hyper, perc.over = 100, perc.under = 400)
table(hyper_new$target)

# Second data set: rebalance on its "label" column.
user <- read.csv("E://dasan//数据挖掘//精准医疗大赛-初赛(复赛)数据//f_train_20180204.csv")
# Add headers (left disabled)
# colnames(user) <- names
table(user$label)
# The response is converted to a factor before it is passed to SMOTE()
user$label <- as.factor(user$label)
hyper_new <- SMOTE(label ~ ., user, perc.over = 100, perc.under = 200)
table(hyper_new$label)
# Install caret only when it is not already available, then attach it.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library("caret")
# Data sampling: first look at sample() and at the user data
x <- seq(1, 10)
a <- sample(x, 8, replace = FALSE)
# The argument is `replace`; the original `xreplace` is not a sample() argument.
(c <- sample(x, 15, replace = TRUE))
user <- read.csv("data1.csv", header = TRUE)
# Assigning an empty vector clears the column names; descriptive names are
# assigned further down in the script after the file is read again.
colnames(user) <- c()
nrow(user)
# Small worked example of sample()
set.seed(1234)
# Create the vector x holding the integers 1 to 10
x <- seq(1, 10)
x
typeof(x)
str(x)
# Sample from x without replacement
a <- sample(x, 8, replace = FALSE)
a
# Sample from x with replacement
b <- sample(x, 8, replace = TRUE)
b
# When size is larger than length(x):
# without replacement sample() raises an error, so the call is wrapped to
# report the message instead of aborting the rest of the script.
tryCatch(
  print(c <- sample(x, 15, replace = FALSE)),
  error = function(e) message("sample() failed: ", conditionMessage(e))
)
# With replacement the same request succeeds.
(c <- sample(x, 15, replace = TRUE))
# Sampling the active-user data with sample()
# Load the data
# user <- read.csv("活跃用户是否付费数据.csv", TRUE)
user <- read.csv("data1.csv", header = FALSE)
user_col_names <- c(
  "用户id", "是否付费", "注册至今距离天数", "最后一周登陆天数",
  "最后一周登陆次数", "最后一周0-8点登陆次数",
  "最后一周8-18点登陆次数", "最后一周18-24点登陆次数"
)
colnames(user) <- user_col_names
# Number of rows in user
nrow(user)
# Simple random sample of user without replacement
# set.seed(1234)
# Draw the row indices
index <- sample(nrow(user), 10000, replace = FALSE)
head(index)
# Store the sampled rows in user_sample
user_sample <- user[index, ]
# Number of rows in user_sample
nrow(user_sample)
# Compare the share of 0 and 1 in column "是否付费" for user and user_sample
table(user$是否付费)
table(user_sample$是否付费)
round(prop.table(table(user$是否付费)), 3)
round(prop.table(table(user_sample$是否付费)), 3)

# Stratified sample that keeps the population share of 0s.
rate <- sum(user$是否付费 == 0) / nrow(user)
d <- seq_len(nrow(user))
user[d[user$是否付费 == 0], ] # subsetting by row index
user[user$是否付費 == 0, ]    # subsetting by logical mask
# Round the first stratum size and give the remainder to the second, so the
# two strata always add up to exactly 10000 rows (sample() would otherwise
# truncate each fractional size separately).
n_zero <- round(10000 * rate)
n_one <- 10000 - n_zero
index1 <- sample(d[user$是否付费 == 0], n_zero)
index2 <- sample(d[user$是否付费 == 1], n_one)
user_samplel <- user[c(index1, index2), ]
tail(user_samplel$是否付费)
# Sampling the second data set by the values of its SNP1 column.
user <- read.csv(
  "E://dasan//数据挖掘//精准医疗大赛-初赛(复赛)数据//f_train_20180204.csv",
  header = TRUE
)
# Simple random sample of 500 rows without replacement
index <- sample(nrow(user), 500, replace = FALSE)
head(index)
user_sample <- user[index, ]
nrow(user_sample)
# Compare the SNP1 distribution in the full data and in the sample
table(user$SNP1)
table(user_sample$SNP1)
round(prop.table(table(user$SNP1)), 2)
rate <- round(prop.table(table(user_sample$SNP1)), 2)
# rate <- sum(user$SNP1 == 1) / nrow(user)
d <- seq_len(nrow(user))
user[user$SNP1 == 3, 2]

# Proportional stratified sample: each SNP1 value contributes 500 * its share.
# which() is used so that missing SNP1 values cannot produce NA row indices.
index1 <- sample(d[which(user$SNP1 == 1)], 500 * rate[[1]])
index2 <- sample(d[which(user$SNP1 == 2)], 500 * rate[[2]])
index3 <- sample(d[which(user$SNP1 == 3)], 500 * rate[[3]])
user_samplel <- user[c(index1, index2, index3), ]
head(user_samplel$SNP1)
round(prop.table(table(user_samplel$SNP1)), 2)

# Equal-size stratified sample: 226 rows from each SNP1 value.
# NOTE(review): 226 is hard-coded; presumably the size of the smallest
# stratum - confirm against table(user$SNP1).
# d <- 1:nrow(user)
# index1 <- d[user$SNP1 == 1]
index1 <- sample(d[which(user$SNP1 == 1)], 226)
index2 <- sample(d[which(user$SNP1 == 2)], 226)
index3 <- sample(d[which(user$SNP1 == 3)], 226)
length(index3)
user_samplel <- user[c(index1, index2, index3), ]
round(prop.table(table(user_samplel$SNP1)), 2)
round(prop.table(table(user_samplel$SNP1)), 5)
table(user_samplel$SNP1)
# Stratified partitioning with caret::createDataPartition()
library(caret)
# Two 10% partitions of iris, stratified by Species, returned as a list
splitindex2 <- createDataPartition(iris$Species, times = 2, p = 0.1, list = TRUE)

# `user` was last assigned the SNP data set, which is not given a "是否付费"
# column anywhere in this script, so the active-user data is loaded again
# here before partitioning on that column.
user <- read.csv("data1.csv", header = FALSE)
colnames(user) <- c(
  "用户id", "是否付费", "注册至今距离天数", "最后一周登陆天数",
  "最后一周登陆次数", "最后一周0-8点登陆次数",
  "最后一周8-18点登陆次数", "最后一周18-24点登陆次数"
)
user$是否付费 <- as.factor(user$是否付费)
# Draw the row indices (about 10000 rows)
ind <- createDataPartition(user$是否付费, p = 10000 / nrow(user),
                           times = 1, list = FALSE)
# Share of 0 and 1 in the subset
prop.table(table(user[ind, "是否付费"]))
# 该博客展示了如何使用R语言进行数据预处理,包括读取医学数据、替换表头、调整变量名、数据类型转换等。接着,博主探讨了不平衡数据的处理,通过SMOTE算法进行过采样和欠采样,以平衡类别分布。此外,还介绍了使用sample函数进行随机抽样的方法,并展示了如何确保抽样后数据的类别比例。最后,博主分享了如何根据目标变量的分布来调整抽样策略,以获得代表性样本。

# 被折叠的 条评论
# 为什么被折叠?



