# Data preparation: read the UCI hypothyroid data set and the file that
# holds its column names.
hyper <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.data",
  header = FALSE
)
# Only the first column of the tab-separated names file is kept.
names <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/thyroid-disease/hypothyroid.names",
  header = FALSE, sep = "\t"
)[[1]]
hyper
names
# Remove every ":" from the column names
names <- gsub(pattern = ":", replacement = "", x = names)
# Use the cleaned names as the header of hyper
colnames(hyper) <- names
# Show the header
str(colnames(hyper))
# Rename the first column to "target"
colnames(hyper)[1] <- "target"
colnames(hyper)
head(hyper)
# Recode the target: "negative" becomes 0, every other value becomes 1
hyper$target <- ifelse(hyper$target == "negative", 0, 1)
head(hyper)
# Check the 0/1 counts
table(hyper$target)
# Relative frequencies of 0 and 1
prop.table(table(hyper$target))
# Convert target to a factor; str() is called before and after so the
# change of column type is visible
str(hyper)
hyper$target <- as.factor(hyper$target)
str(hyper)
# NOTE(review): SMOTE is presumably provided by the DMwR package named in
# the commented-out lines below; it must already be attached — confirm.
#if(!require (DMWR))
#install.packages("DMwR")
#library(DMwR)
# Run SMOTE on hyper with `target` as the response and all remaining
# columns as predictors, then tabulate the classes of the result.
hyper_new <- SMOTE(
  target ~ .,
  hyper,
  perc.over = 100,
  perc.under = 400
)
table(hyper_new[["target"]])
# Read the training data from a local CSV file.
user <- read.csv("E://dasan//数据挖掘//精准医疗大赛-初赛(复赛)数据//f_train_20180204.csv")
# Add header (disabled)
#colnames(user)<-names
# Class counts of `label` before resampling
table(user$label)
# The response is converted to a factor before SMOTE is called
user$label <- as.factor(user$label)
# Run SMOTE with `label` as the response and all other columns as
# predictors. NOTE(review): this overwrites any earlier `hyper_new`.
hyper_new <- SMOTE(label ~ ., user, perc.over = 100, perc.under = 200)
# Class counts of `label` after resampling
table(hyper_new$label)
# Install caret only when it is missing, so re-running the script does not
# reinstall the package every time; then attach it.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library("caret")
# Data sampling: inspect the class proportions
x <- seq(1, 10)
# Eight draws from x without replacement
a <- sample(x, 8, replace = FALSE)
# Fifteen draws from x with replacement; the argument is `replace`
# (the misspelt `xreplace` is rejected by sample() as an unused argument).
# The outer parentheses print the result.
(c <- sample(x, 15, replace = TRUE))
# Read data1.csv, treating its first line as the header.
user <- read.csv("data1.csv", header = TRUE)
# NOTE(review): c() is NULL, so this assigns NULL as the column names; it
# looks like an unfinished placeholder — confirm the intended names.
colnames(user) <- c()
nrow(user)
# A small sample() example
set.seed(1234)
# Create x, made up of the integers 1 to 10
x <- seq(1, 10)
x
typeof(x)
str(x)
# Draw from x without replacement
a <- sample(x, 8, replace = FALSE)
a
# Draw from x with replacement
b <- sample(x, 8, replace = TRUE)
b
# When size is larger than length(x):
# without replacement sample() raises an error; try() reports it and lets
# the rest of the script keep running.
try((c <- sample(x, 15, replace = FALSE)))
# with replacement the draw succeeds
(c <- sample(x, 15, replace = TRUE))
# Use sample() to draw from the active-user data
# Load the data (the file has no header row)
#user <- read.csv("活跃用户是否付费数据.csv",T)
user <- read.csv("data1.csv", header = FALSE)
user_col_names <- c(
  "用户id", "是否付费", "注册至今距离天数", "最后一周登陆天数",
  "最后一周登陆次数", "最后一周0-8点登陆次数",
  "最后一周8-18点登陆次数", "最后一周18-24点登陆次数"
)
colnames(user) <- user_col_names
# Number of rows in user
nrow(user)
# Simple random sample of user without replacement
#set.seed(1234)
# Draw the row-index set
index <- sample(nrow(user), 10000, replace = FALSE)
head(index)
# Store the sampled rows in user_sample
user_sample <- user[index, ]
# Number of rows in user_sample
nrow(user_sample)
# Compare the shares of 0 and 1 in the column "是否付费" between user and
# user_sample.
# Absolute counts
table(user[["是否付费"]])
table(user_sample[["是否付费"]])
# Relative frequencies, rounded to three decimals
round(prop.table(table(user[["是否付费"]])), 3)
round(prop.table(table(user_sample[["是否付费"]])), 3)
# Stratified sample of 10000 rows that keeps the share of rows with
# "是否付费" == 0 found in user.
rate <- sum(user[["是否付费"]] == 0) / nrow(user)
d <- seq_len(nrow(user))
user[d[user[["是否付费"]] == 0], ]  # selection by row index
user[user[["是否付费"]] == 0, ]     # selection by logical mask
# sample() truncates a fractional size, so the stratum sizes are rounded
# once and made to add up to exactly 10000.
n_zero <- round(10000 * rate)
index1 <- sample(d[user[["是否付费"]] == 0], n_zero)
index2 <- sample(d[user[["是否付费"]] == 1], 10000 - n_zero)
user_samplel <- user[c(index1, index2), ]
tail(user_samplel[["是否付费"]])
# Reload `user` from the local training CSV (first line is the header).
user <- read.csv(
  "E://dasan//数据挖掘//精准医疗大赛-初赛(复赛)数据//f_train_20180204.csv",
  header = TRUE
)
# Simple random sample of 500 rows without replacement
index <- sample(nrow(user), 500, replace = FALSE)
head(index)
user_sample <- user[index, ]
nrow(user_sample)
# Counts of each SNP1 value in the full data and in the sample
table(user$SNP1)
table(user_sample$SNP1)
# Share of each SNP1 value in the full data, rounded to two decimals
round(prop.table(table(user$SNP1)), 2)
# Share of each SNP1 value in the sample, rounded to two decimals
rate <- round(prop.table(table(user_sample$SNP1)), 2)
# Stratified sample by SNP1, with stratum sizes proportional to `rate`.
#rate <- sum(user$SNP1==1) /nrow(user)
d <- seq_len(nrow(user))
# Second column of the rows where SNP1 equals 3
user[user$SNP1 == 3, 2]
# NOTE(review): rate[[k]] assumes the k-th entry of `rate` belongs to
# SNP1 == k — confirm against how `rate` was built.
# round() avoids losing a row when 500 * rate[[k]] lands just below an
# integer, because sample() truncates a fractional size.
index1 <- sample(d[user$SNP1 == 1], round(500 * rate[[1]]))
index2 <- sample(d[user$SNP1 == 2], round(500 * rate[[2]]))
index3 <- sample(d[user$SNP1 == 3], round(500 * rate[[3]]))
user_samplel <- user[c(index1, index2, index3), ]
head(user_samplel$SNP1)
round(prop.table(table(user_samplel$SNP1)), 2)
# Equal-allocation sample: 226 rows from each SNP1 value (1, 2, 3).
# NOTE(review): `d` is not assigned in this part of the script (the
# assignment below is commented out); it must already hold the row indices
# of `user` — confirm.
#d<-1:nrow(user)
#index1 <-d[user$SNP1==1]
index1 <- sample(d[user$SNP1 == 1], 226)
index2 <- sample(d[user$SNP1 == 2], 226)
index3 <- sample(d[user$SNP1 == 3], 226)
length(index3)
user_samplel <- user[c(index1, index2, index3), ]
# Shares of each SNP1 value in the sample, at two and at five decimals
round(prop.table(table(user_samplel$SNP1)), 2)
round(prop.table(table(user_samplel$SNP1)), 5)
# Absolute counts
table(user_samplel$SNP1)
# Two partitions of iris by Species, each with p = 0.1, returned as a list
splitindex2 <- createDataPartition(
  iris$Species,
  times = 2, p = 0.1, list = TRUE
)
# NOTE(review): this needs `user` to contain the column "是否付费";
# confirm which data set `user` holds at this point in the script.
user[["是否付费"]] <- as.factor(user[["是否付费"]])
# Draw the row-index set
ind <- createDataPartition(
  user[["是否付费"]],
  p = 10000 / nrow(user),
  times = 1, list = FALSE
)
# Check the shares of 0 and 1 in the subset
prop.table(table(user[ind, "是否付费"]))