数据挖掘R实战 02 R使用简介_fancyrpartplot需要安装-CSDN博客

本文链接：https://blog.csdn.net/kxiaozhuk/article/details/83751488
本文全面介绍了R语言的基础知识，包括环境搭建、数据结构、数据读取、数据处理和自定义函数等内容，并深入探讨了数据挖掘流程，从数据预处理到模型构建与评估，涵盖决策树模型的构建与应用。
摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >
#######################################R简介部分##################################

# Set the working directory first (change this path to your own directory).
setwd("D:/R/r_training")

# Install and attach add-on packages.
# install.packages("rpart")  # run once to install rpart
library("rpart")   # attach rpart
search()           # list what is currently attached
update.packages()  # update installed packages

# Getting help in R
?mean           # same as help(mean)
help(mean)      # help page of a function, including the meaning of its arguments
help("mean")    # quoted topic; by default only attached packages are searched
help("bs", try.all.packages = TRUE)  # also looks in packages that are not attached
apropos("spline")    # objects on the search path whose names match the string
help.search("mean")  # keyword search
# findFn() comes from the sos package, so sos is attached before the help page
# for findFn is requested (the lookup used to run before library(sos)).
library(sos)
help(findFn)
findFn("spline", maxPages = 2)  # search packages and functions; results are shown as HTML

# How R works: create an object, then derive new objects from it.

x <- matrix(1:20, nrow = 4, ncol = 5)          # build a data object by hand
y <- apply(X = x, MARGIN = 2, FUN = mean) + 2  # column means of x, shifted by 2
x
y

# Inspect the structure of objects
str(1:4)
str(iris)
str(lm)         # argument list of the linear-regression function
str(options())  # current session options

# Creating vectors
c(1, c(2, c(3, 4)))  # nested c() calls are flattened
c(1, 2, 3, 4)
n1 <- 10 + 5
n1                   # arithmetic on numeric literals gives a double
n2 <- 1:10
n2
typeof(n2)           # integer
name <- "Carmen"
name
typeof(name)         # character
n3 <- 3 + rnorm(10)
n3
typeof(n3)           # double
n4 <- n2 > 5
n4
typeof(n4)           # logical

str(name)     # shows the type together with the data
typeof(name)  # shows only the type

# dim() reshapes a vector into a multi-dimensional array
x <- 1:10
x
dim(x) <- c(2, 5)
x
# `c` must exist as a vector before its dimensions can be assigned. It used to
# be created only after these lines, so dim(c) <- ... was applied to the base
# function c() and failed.
# NOTE: a variable named `c` does not break calls such as c(3, 2), because R
# skips non-function objects when it looks up a function name.
c <- 1:6
c
dim(c) <- c(3, 2)
c
dim(c) <- c(2, 3)
c
b <- dim(c)  # b holds the row and column counts of c, i.e. 2 and 3
b

# Creating matrices
a <- matrix(1:6, ncol = 3, nrow = 2)
a
cbind(a, b)  # append b as an extra column
rbind(a, c)  # stack the rows of c under a (both have 3 columns)

rbind()      # combines objects by row
cbind()      # combines objects by column
length(x)    # number of elements in x
names(x)     # names attribute of x; NULL here because x is an unnamed matrix

# Creating data frames

df1 <- data.frame(x = 1:3, y = c("a", "b", "c"))  # combine vectors column-wise
df1
str(df1)
data.frame(cbind(a = 1:2, b = c("a", "b")))  # cbind() first builds a character matrix
data.frame(a = 1:2, b = c("a", "b"))         # each column keeps its own type
df2 <- data.frame(a = 1:2, b = c("a", "b"))
df2
vec <- 3:4
df3 <- data.frame(df2, vec)
df3
data(iris)           # built-in data set, stored as a data frame
str(iris)            # structure of the object
typeof(iris)         # storage type
class(iris)          # class (data frame, matrix, array, ...)
attributes(iris)     # attributes
is.data.frame(iris)  # class test
names(iris)
colnames(iris)       # column names
rownames(iris)       # row names
nrow(iris)           # number of rows
ncol(iris)           # number of columns
dim(iris)            # dimensions


# Creating lists, factors and time series
Lst <- list(name = "Fred", no.children = 3, child.ages = c(4, 7, 9))
Lst
sex <- c("M", "F", "M", "M", "F")
sexf <- factor(sex)
sex
sexf
ts(1:10, frequency = 4, start = c(1959, 2))  # 4 observations per period, starting at period 2 of 1959

# Reading external files (the results are printed, not stored)
read.table(file = "weather.txt", header = TRUE)
read.csv(file = "weather.csv", header = TRUE)

# Reading data from a database through ODBC
library(RODBC)
odbcDataSources()               # list the ODBC data sources known to this machine
conn <- odbcConnect("tdadmin")  # open a connection to the DSN "tdadmin"
result <- sqlQuery(conn, ' select * from TEST.DATA_MINING_EXAMPLE_TMP ')
str(result)
odbcClose(conn)                 # release the connection, which used to be left open

# Writing to external files
library(rattle)
data(weather)  # load the sample data before its first use below
write.table(weather, file = "weather2.txt", row.names = FALSE)
write.csv(weather, file = "weather2.csv", row.names = FALSE)

# Dump the printed form of the data frame to a text file.
options(width = 200)    # wider lines so printed rows wrap less
sink("D:/weather.txt")  # redirect console output to the file
# print() is explicit so the file is also filled when this script is run
# through source(), where bare expressions are not auto-printed.
print(weather)
sink()                  # restore console output

# Subsetting vectors
x <- 1:20
x[c(1, 5, 12, 7, 5)]  # positive indices select (repeats allowed)
x[-c(5, 15)]          # negative indices drop elements
x[c(TRUE, FALSE)]     # logical index, recycled along x
x[x > 5]              # logical condition
names(x) <- letters[1:20]
x[c("a", "e", "g", "i", "a")]  # select by element name

# Subsetting matrices
y <- matrix(1:20, nrow = 4, ncol = 5)
y
y[2, 3]                # element in row 2, column 3
y[1:2, ]               # rows 1 and 2
y[, 3:5]               # columns 3 to 5
y[1:2, 4:5]            # rows 1-2 of columns 4-5
y[-c(1, 3), -c(2, 5)]  # drop rows 1 and 3 and columns 2 and 5

# Subsetting data frames; `$` is the usual accessor for a single column
weather[1]         # first column, kept inside a data frame
weather[[1]]       # first column extracted on its own
weather$Date       # column selected by name
weather[, 1]       # first column through row/column indexing
weather[, "Date"]  # same, selected by name
weather[1:10, c("Date", "Rainfall", "WindGustDir")]  # first 10 rows of three columns


# Operators and functions
x <- 1:10
x
y <- 2:11
y
z <- c(3, 5, 10, 6, 9, 4, 7, 1, 2, 8)
z
x + y
x - y
x / y
x < y
x > 5 & x < 8  # element-wise AND
x < 2 | x > 8  # element-wise OR
log(x)
z[order(z)]         # z in ascending order
z[sample(1:10, 5)]  # five elements of z picked at random


# The apply family
data(iris)
var <- names(iris)[vapply(iris, is.numeric, logical(1))]  # names of the numeric columns
apply(iris[var], MARGIN = 2, FUN = mean)  # mean of every column
sapply(iris[var], FUN = mean)             # mean of every column, column by column

# Using the plyr package

library(rattle)
# Install plyr only when it is missing instead of reinstalling on every run.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
library(plyr)
ds <- weather
names(ds) <- tolower(names(ds))
# Mean rainfall per location. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
ddply(ds, "location", summarize, mean.rainfall = mean(rainfall, na.rm = TRUE))
 
# Control flow
p <- 0.03
if (p <= 0.05) {
  print("p <= 0.05!")
} else {
  print("p > 0.05!")
}

for (i in seq_len(10)) {
  print(i)
}

i <- 1
repeat {
  if (!(i < 10)) break
  print(i)
  i <- i + 1
}


# User-defined functions

#' Summarise a vector by a typical value.
#'
#' @param x A vector. Numeric input is averaged; any other input is tabulated.
#' @return For numeric `x`, its mean with missing values removed. Otherwise the
#'   name of the most frequent value as a character string (the first one in
#'   table order when several values tie).
my_func <- function(x) {
  if (is.numeric(x)) {
    # TRUE rather than T: T is an ordinary variable that can be reassigned.
    output <- mean(x, na.rm = TRUE)
  } else {
    output <- names(which.max(table(x)))
  }
  return(output)
}
# The column names of `weather` are still capitalised at this point (they are
# lower-cased only later in the script), so the lower-case spellings used
# before selected NULL instead of the intended columns.
my_func(weather$Temp3pm)     # numeric column: mean
my_func(weather$WindDir3pm)  # categorical column: most frequent value


# Base graphics
# `weather` still has capitalised column names here, so MinTemp / MaxTemp are
# used; the lower-case spellings returned NULL and plot() failed.
plot(weather$MinTemp)
plot(weather$MinTemp, type = "l")
plot(weather$MinTemp, type = "l", col = "red")
plot(weather$MinTemp, type = "l", col = "red", ylab = "mintemp", main = "plot of mintemp")
plot(weather$MinTemp, weather$MaxTemp)
abline(10, 1)  # reference line with intercept 10 and slope 1


demo(graphics)
example(barplot)
example(matplot)

# Plotting with ggplot2
library(ggplot2)
p <- ggplot(data = mpg, mapping = aes(x = cty, y = hwy))
p + geom_point()
# Map the model year to the colour aesthetic.
p <- ggplot(data = mpg, mapping = aes(x = cty, y = hwy, colour = factor(year)))
p + geom_point()
p + geom_point() + stat_smooth()  # add a smoothed curve

# Change the colour values through a manual scale.
p +
  geom_point(mapping = aes(colour = factor(year))) +
  stat_smooth() +
  scale_color_manual(values = c("blue", "red"))

# Additionally map engine displacement to point size.
p +
  geom_point(mapping = aes(colour = factor(year), size = displ)) +
  stat_smooth() +
  scale_color_manual(values = c("blue2", "red4"))


# Running an R script stored in a file
source(file = "regression.r")

#########################################R数据挖掘部分#############################
# Load the sample data
library(rattle)
data(weather)
# Get to know the data
str(weather)      # structure
dim(weather)      # dimensions
names(weather)    # variable names
head(weather)     # first few records
tail(weather)     # last few records
weather[sample(nrow(weather), size = 6), ]  # six randomly chosen records
summary(weather)  # descriptive statistics per variable

# Variable roles
names(weather) <- tolower(names(weather))  # lower-case all names for convenience
id <- c("date", "location", "risk_mm")     # identifier variables kept out of modelling
input <- setdiff(names(weather), id)       # variables that take part in modelling
target <- "raintomorrow"                   # target variable
numerics <- names(Filter(is.numeric, weather[input]))   # numeric variables
categorics <- names(Filter(is.factor, weather[input]))  # factor variables

# Data quality checks: drop variables that add little to a model
uniques <- names(which(sapply(   # constant variables
  weather[input],
  function(col) length(unique(col)) == 1
)))
missings <- names(which(sapply(  # at least 70% of the rows missing
  weather[input],
  function(col) sum(is.na(col)) >= 0.7 * nrow(weather)
)))
centors <- names(which(sapply(   # a single value covers more than 95% of the rows
  weather[input],
  function(col) max(table(col)) / length(col) > 0.95
)))
manylvls <- names(which(sapply(  # factors with more than 20 levels
  weather[categorics],
  function(col) length(levels(col)) > 20
)))
ignores <- c(id, uniques, centors, missings, manylvls)
input <- setdiff(input, ignores)

# Univariate exploration examples
hist(  # histogram
  weather$mintemp,
  breaks = 20, freq = FALSE, col = "red",
  xlab = "mintemp", ylab = "probability", main = "Histogram of mintemp"
)
boxplot(weather$mintemp, main = "boxplot of mintemp")                       # box plot
barplot(table(weather$windgustdir), main = "barplot of windgustdir")        # bar chart
pie(table(weather$raintomorrow), radius = 1, main = "pie of raintomorrow")  # pie chart

# Multivariate exploration examples
plot(weather$rainfall, weather$evaporation)  # scatter plot of two variables
pairs(weather[numerics[1:4]])                # scatter-plot matrix
plot(weather[numerics[1:4]])                 # scatter-plot matrix via plot() on a data frame

# Numeric variable against the target: box plots per class
boxplot(mintemp ~ raintomorrow, data = weather,
        xlab = "raintomorrow", ylab = "mintemp", main = "boxplot")
# Categorical variable against the target
mosaicplot(table(weather$windgustdir, weather$raintomorrow), color = 1:2)
# Discretise a continuous variable first, then draw the mosaic plot
bins <- binning(weather$mintemp, bins = 8, method = "quantile")
mosaicplot(table(bins, weather$raintomorrow), color = 1:2)

# Variable selection

library(RWeka)
# Rank the variables by information gain ratio
GainRatioAttributeEval(formula = raintomorrow ~ ., data = weather[input])
# Rank the variables by information gain
InfoGainAttributeEval(formula = raintomorrow ~ ., data = weather[input])

# Variable selection by information value (IV)

#' Information value of one predictor against a two-class target.
#'
#' @param input Predictor vector. Numeric input is first cut into 10 quantile
#'   bins with binning(); any other input is tabulated as it is.
#' @param target Class labels, same length as `input`. Only the first two
#'   columns of table(input, target) are used.
#' @return A single number: the sum over bins of (p1 - p2) * WOE, where p1 and
#'   p2 are the within-class bin proportions and WOE = 100 * log(p1 / p2).
#'   Bins whose WOE is not finite are skipped. Returns 0 when the table has
#'   fewer than two bins or fewer than two classes.
var.iv <- function(input, target) {
  if (is.numeric(input)) {
    input <- binning(input, bins = 10, method = "quantile")
  }
  counts <- table(input, target)
  # The bin count comes from the table itself. length(unique(input)) also
  # counts NA and misses unused factor levels, so it could disagree with the
  # table and scramble the reshaped counts.
  if (nrow(counts) < 2 || ncol(counts) < 2) {
    return(0)
  }
  class_totals <- colSums(counts)
  p1 <- counts[, 1] / class_totals[1]
  p2 <- counts[, 2] / class_totals[2]
  woe <- log(p1 / p2) * 100
  # is.finite() also rejects the NaN produced by empty bins (0 / 0), which
  # is.infinite() let through.
  usable <- is.finite(woe)
  sum((p1 - p2)[usable] * woe[usable])
}

# IV of every candidate variable against the target
iv.value <- sapply(weather[input], var.iv, target = weather$raintomorrow)
# Keep the variables whose IV exceeds 30
input <- names(iv.value)[which(iv.value > 30)]

# Data preprocessing examples
# Equal-frequency discretisation of mintemp
quan_mintemp <- binning(weather$mintemp, 10, method = "quantile", ordered = FALSE)
plot(quan_mintemp)
# Equal-width discretisation of mintemp. This used to cut maxtemp although the
# result is named as the mintemp counterpart of quan_mintemp.
equal_mintemp <- cut(weather$mintemp, 10)
plot(equal_mintemp)
# Standardisation: centre and scale (TRUE spelled out; T can be reassigned)
mintemp.scale <- scale(weather$mintemp, scale = TRUE, center = TRUE)
# Min-max normalisation
mintemp.std <- (weather$mintemp - min(weather$mintemp)) /
  (max(weather$mintemp) - min(weather$mintemp))
# Mathematical transformation
# NOTE(review): log() gives -Inf for zero rainfall; log1p() may be intended — confirm.
rainfall.log <- log(weather$rainfall)

# Handling missing values
del <- weather[input]
dim(del)
# Drop the rows whose fifth column is missing.
# NOTE(review): only column 5 is checked; complete.cases(del) would cover
# every column — confirm which is intended.
del.imputed <- del[!is.na(del[5]), ]
dim(del.imputed)

library(randomForest)
ds <- weather[input]
sum(is.na(ds))
ds.imputed <- na.roughfix(ds)  # simple replacement of missing values
sum(is.na(ds.imputed))

dss <- weather[c(input, target)]
sum(is.na(dss))
dss.imputed <- rfImpute(raintomorrow ~ ., dss)  # imputation through randomForest's rfImpute()
sum(is.na(dss.imputed))


# Split the sample into training / validation / test sets
set.seed(42)
nobs <- nrow(weather)
n_train <- sample(nobs, size = 0.7 * nobs)  # row indices for training
n_validate <- sample(setdiff(seq_len(nobs), n_train), size = 0.15 * nobs)  # drawn from the remaining rows
n_test <- setdiff(seq_len(nobs), c(n_train, n_validate))  # everything left over
vars <- union(input, target)
train <- weather[n_train, vars]        # training set
validate <- weather[n_validate, vars]  # validation set
test <- weather[n_test, vars]          # test set

# Oversampling example
table(train$raintomorrow)  # class counts, to see the share of positives
# Append the "Yes" rows twice so that positives appear three times in total.
train_new <- rbind(
  train[train$raintomorrow == "Yes", vars],
  train[train$raintomorrow == "Yes", ],
  train
)
dim(train)
dim(train_new)

# Build a decision-tree model
library(rpart)
model <- rpart(  # fit the tree
  formula = raintomorrow ~ .,
  data = train,
  method = "class",
  parms = list(split = "information")
)
printcp(model)  # show the fitted result
asRules(model)  # list the tree as rules
fancyRpartPlot(model, main = "Decision Tree for raintomorrow")  # draw the tree

# Score the test set with the model and save the result
pr <- predict(model, newdata = test, type = "class")
# NOTE(review): test[target] is a one-column data frame, so its column appears
# to keep its own name rather than the argument name given here — confirm.
write.csv(
  data.frame(
    date = weather[n_test, "date"],
    raintommorow = test[target],
    pre_raintommorow = pr
  ),
  file = "weather_test_score.csv",
  row.names = FALSE
)

# Model evaluation

library(ROCR)
pre <- predict(model, newdata = test, type = "prob")[, 2]  # second column of the predicted probabilities
pred <- prediction(predictions = pre, labels = test[target])

# ROC curve
plot(performance(pred, measure = "tpr", x.measure = "fpr"), col = "red", main = "ROC curves")
# Lift chart
plot(performance(pred, measure = "lift", x.measure = "rpp"), col = "red", main = "Lift curves")
# Precision / recall graph
plot(performance(pred, measure = "prec", x.measure = "rec"), col = "red", main = "Precision/recall graphs")
# Sensitivity / specificity plot
plot(performance(pred, measure = "sens", x.measure = "spec"), col = "red")


# Model deployment
source("tree.r")
# To call the script from the command line, set up the R environment variable
# and run the line below in a system shell. It is a shell command, not R code,
# so it is kept as a comment; left uncommented it stops this file from parsing.
#   R < d:\r\r_training\tree.r --vanilla   (replace with your own working directory)