package Bayes
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
/**
 * A simple implementation of the naive Bayes example from the book
 * "Statistical Learning Methods" (统计学习方法), comparing maximum
 * likelihood estimation with (Laplace-smoothed) Bayesian estimation
 * on a tiny two-feature training set.
 */
object Naive {

  /**
   * Entry point: builds the textbook training set, then classifies the
   * test instance (2, "S") with both estimators and prints the results.
   */
  def main(args: Array[String]): Unit = {
    // FIX: the original read `newSparkConf()` / `newSparkContext(conf)`
    // (missing space after `new`), which does not compile.
    val conf = new SparkConf().setMaster("local").setAppName("ML")
    val sc = new SparkContext(conf)

    // Training data: feature columns X1, X2 and the class labels.
    val X1 = Array(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3)
    val X2 = Array("S", "M", "M", "S", "S", "S", "M", "M", "L", "L", "L", "M", "M", "L", "L")
    val label = Array(-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1)

    val mEresult = maxLikelihoodEstimate(X1, X2, label, sc)

    // lamda = 1 gives Laplace smoothing; K = number of distinct labels;
    // s1/s2 = number of distinct values each feature can take.
    val lamda = 1
    val K = sc.parallelize(label).map(x => (x, 1)).reduceByKey(_ + _).count().toInt
    val s1 = sc.parallelize(X1).map(x => (x, 1)).reduceByKey(_ + _).count().toInt
    val s2 = sc.parallelize(X2).map(x => (x, 1)).reduceByKey(_ + _).count().toInt
    val sj = Array(s1, s2)

    val bEresult = bayesEstimate(X1, X2, label, sc, lamda, K, sj)

    println("极大似然估计:" + mEresult(0))
    println("贝叶斯估计:" + bEresult(0))
  }

  /**
   * Maximum likelihood estimation of the naive Bayes classifier.
   *
   * Estimates prior probabilities P(Y) and conditional probabilities
   * P(Xi | Y) by raw frequency counts, then scores the hard-coded test
   * instance (2, "S").
   *
   * @param X1    first feature column
   * @param X2    second feature column
   * @param label class label column (-1 or 1)
   * @param sc    Spark context used for the counting jobs
   * @return a single-element array of (posterior score, predicted label)
   */
  def maxLikelihoodEstimate(X1: Array[Int], X2: Array[String], label: Array[Int],
                            sc: SparkContext): Array[(Double, Double)] = {
    // Count occurrences of each label value.
    val labelCount = sc.parallelize(label).map(x => (x, 1)).reduceByKey(_ + _).collect()

    // Pair each feature value with its label so joint counts can be taken.
    val x1Label = ArrayBuffer[(Int, Int)]()
    val x2Label = ArrayBuffer[(String, Int)]()
    for (i <- 0 until label.length) {
      x1Label.append((X1(i), label(i)))
      x2Label.append((X2(i), label(i)))
    }
    val x1LabelCount = sc.parallelize(x1Label).map(x => (x, 1)).reduceByKey(_ + _).collect()
    val x2LabelCount = sc.parallelize(x2Label).map(x => (x, 1)).reduceByKey(_ + _).collect()

    // Total number of training records.
    val totalRecords = label.length

    // Prior probabilities P(Y = y) = count(y) / N, stored as
    // LabeledPoint(label = y, features = [probability]).
    val proProb = ArrayBuffer[LabeledPoint]()
    for (lb <- labelCount)
      proProb.append(LabeledPoint(lb._1, Vectors.dense(lb._2 / totalRecords.toDouble)))

    // Conditional probabilities P(Xi = x | Y = y), rounded to 3 decimals.
    // x1conProb: LabeledPoint(label = probability, features = [x value, y]).
    // x2conProb: (probability, (x value, y)).
    val x1conProb = ArrayBuffer[LabeledPoint]()
    val x2conProb = ArrayBuffer[(Double, (String, Int))]()
    for (lab <- labelCount) {
      // Conditional probabilities of feature X1 for this label.
      for (t1 <- x1LabelCount) {
        if (lab._1 == t1._1._2)
          x1conProb.append(LabeledPoint(
            (t1._2 / lab._2.toDouble).formatted("%.3f").toDouble,
            Vectors.dense(t1._1._1, t1._1._2)))
      }
      // Conditional probabilities of feature X2 for this label.
      for (t2 <- x2LabelCount) {
        if (lab._1 == t2._1._2)
          x2conProb.append((
            (t2._2 / lab._2.toDouble).formatted("%.3f").toDouble,
            (t2._1._1, t2._1._2)))
      }
    }

    // Score the test instance: for each label, the product of its prior
    // and the matching conditional probabilities of both features.
    val testData = Array(2, "S")
    val testProb = ArrayBuffer[LabeledPoint]()
    for (lab <- proProb) {
      val tx1 = testData(0).toString().toDouble
      val tx2 = testData(1).toString()
      var resultx1con = 0.0
      var resultx2con = 0.0
      for (xlcon <- x1conProb) {
        if (lab.label == xlcon.features(1) && xlcon.features(0) == tx1)
          resultx1con = xlcon.label
      }
      for (x2con <- x2conProb) {
        if (lab.label == x2con._2._2.toDouble && x2con._2._1.equals(tx2.toString()))
          resultx2con = x2con._1
      }
      testProb.append(LabeledPoint(lab.label,
        Vectors.dense(lab.features(0) * resultx1con * resultx2con)))
    }

    // Take the label with the largest score as the prediction.
    val resultLabel = sc.parallelize(testProb)
      .map(x => (x.label, x.features(0)))
      .map { case (k, v) => (v, k) }
      .sortByKey(false)
      .collect()
      .take(1)
    resultLabel
  }

  /**
   * Bayesian (smoothed) estimation of the naive Bayes classifier.
   *
   * Identical to [[maxLikelihoodEstimate]] except that priors and
   * conditionals add lamda to each count (and K*lamda / Sj(i)*lamda to the
   * denominators), avoiding the zero-probability problem of maximum
   * likelihood estimation.
   *
   * @param X1    first feature column
   * @param X2    second feature column
   * @param label class label column (-1 or 1)
   * @param sc    Spark context used for the counting jobs
   * @param lamda smoothing parameter (0 = maximum likelihood, 1 = Laplace)
   * @param K     number of distinct label values
   * @param Sj    Sj(i) = number of distinct values of the i-th feature
   * @return a single-element array of (posterior score, predicted label)
   */
  def bayesEstimate(X1: Array[Int], X2: Array[String], label: Array[Int],
                    sc: SparkContext, lamda: Int, K: Int,
                    Sj: Array[Int]): Array[(Double, Double)] = {
    // Count occurrences of each label value.
    val labelCount = sc.parallelize(label).map(x => (x, 1)).reduceByKey(_ + _).collect()

    // Pair each feature value with its label so joint counts can be taken.
    val x1Label = ArrayBuffer[(Int, Int)]()
    val x2Label = ArrayBuffer[(String, Int)]()
    for (i <- 0 until label.length) {
      x1Label.append((X1(i), label(i)))
      x2Label.append((X2(i), label(i)))
    }
    val x1LabelCount = sc.parallelize(x1Label).map(x => (x, 1)).reduceByKey(_ + _).collect()
    val x2LabelCount = sc.parallelize(x2Label).map(x => (x, 1)).reduceByKey(_ + _).collect()

    // Total number of training records.
    val totalRecords = label.length

    // Smoothed priors: P(Y = y) = (count(y) + lamda) / (N + K*lamda).
    val proProb = ArrayBuffer[LabeledPoint]()
    for (lb <- labelCount)
      proProb.append(LabeledPoint(lb._1,
        Vectors.dense((lb._2 + lamda) / (totalRecords.toDouble + K * lamda))))

    // Smoothed conditionals:
    // P(Xi = x | Y = y) = (count(x, y) + lamda) / (count(y) + Sj(i)*lamda).
    val x1conProb = ArrayBuffer[LabeledPoint]()
    val x2conProb = ArrayBuffer[(Double, (String, Int))]()
    for (lab <- labelCount) {
      // Conditional probabilities of feature X1 for this label.
      for (t1 <- x1LabelCount) {
        if (lab._1 == t1._1._2)
          x1conProb.append(LabeledPoint(
            ((t1._2 + lamda) / (lab._2.toDouble + lamda * Sj(0))).formatted("%.3f").toDouble,
            Vectors.dense(t1._1._1, t1._1._2)))
      }
      // Conditional probabilities of feature X2 for this label.
      for (t2 <- x2LabelCount) {
        if (lab._1 == t2._1._2)
          x2conProb.append((
            ((t2._2 + lamda) / (lab._2.toDouble + lamda * Sj(1))).formatted("%.3f").toDouble,
            (t2._1._1, t2._1._2)))
      }
    }

    // Score the test instance: for each label, the product of its prior
    // and the matching conditional probabilities of both features.
    val testData = Array(2, "S")
    val testProb = ArrayBuffer[LabeledPoint]()
    for (lab <- proProb) {
      val tx1 = testData(0).toString().toDouble
      val tx2 = testData(1).toString()
      var resultx1con = 0.0
      var resultx2con = 0.0
      for (xlcon <- x1conProb) {
        if (lab.label == xlcon.features(1) && xlcon.features(0) == tx1)
          resultx1con = xlcon.label
      }
      for (x2con <- x2conProb) {
        if (lab.label == x2con._2._2.toDouble && x2con._2._1.equals(tx2.toString()))
          resultx2con = x2con._1
      }
      testProb.append(LabeledPoint(lab.label,
        Vectors.dense(lab.features(0) * resultx1con * resultx2con)))
    }

    // Take the label with the largest score as the prediction.
    val resultLabel = sc.parallelize(testProb)
      .map(x => (x.label, x.features(0)))
      .map { case (k, v) => (v, k) }
      .sortByKey(false)
      .collect()
      .take(1)
    resultLabel
  }
}
// Final results (recorded program output):
// 极大似然估计 (maximum likelihood estimate): (0.0666, -1.0)
// 贝叶斯估计 (Bayesian estimate): (0.06088023529411765, -1.0)