package mlia.bayes
import breeze.linalg._
/** Data-preparation helpers that turn tokenised documents into Breeze count vectors. */
object Prep {

  /**
   * Returns a small hard-coded sample data set.
   *
   * @return a pair of (tokenised posts, one integer class label per post, in the same order)
   */
  def loadDataSet: (Array[Array[String]], Vector[Int]) = {
    val postingList = Array(
      Array("my", "dog", "has", "flea", "problems", "help", "please"),
      Array("maybe", "not", "take", "him", "to", "dog", "park", "stupid"),
      Array("my", "dalmation", "is", "so", "cute", "I", "love", "him"),
      Array("stop", "posting", "stupid", "worthless", "garbage"),
      Array("mr", "licks", "ate", "my", "steak", "how", "to", "stop", "him"),
      Array("quit", "buying", "worthless", "dog", "food", "stupid"))
    val classVec = DenseVector(0, 1, 0, 1, 0, 1)
    (postingList, classVec)
  }

  /**
   * Builds the vocabulary: every distinct word of `dataSet`, in order of first appearance.
   *
   * @param dataSet tokenised documents
   * @return the distinct words across all documents
   */
  def createVocabList(dataSet: Array[Array[String]]): Array[String] = dataSet.flatten.distinct

  /**
   * Set-of-words encoding: position `i` of the result is 1 if `vocabList(i)` occurs in
   * `inputSet` and 0 otherwise. Words that are not in the vocabulary are reported on
   * standard output and otherwise ignored.
   *
   * @param vocabList vocabulary defining the vector positions
   * @param inputSet  tokenised document to encode
   * @return a vector of length `vocabList.length` holding 0/1 presence flags
   */
  def setOfWords2Vec(vocabList: Array[String], inputSet: Array[String]): DenseVector[Int] =
    encode(vocabList, inputSet) { (vec, idx) => vec(idx) = 1 }

  /**
   * Bag-of-words encoding: position `i` of the result is the number of times
   * `vocabList(i)` occurs in `inputSet`. Words that are not in the vocabulary are
   * reported on standard output and otherwise ignored.
   *
   * @param vocabList vocabulary defining the vector positions
   * @param inputSet  tokenised document to encode
   * @return a vector of length `vocabList.length` holding occurrence counts
   */
  def bagOfWords2VecMN(vocabList: Array[String], inputSet: Array[String]): DenseVector[Int] =
    encode(vocabList, inputSet) { (vec, idx) => vec(idx) = vec(idx) + 1 }

  /** Maps every vocabulary word to the index of its first occurrence (what `indexOf` would return). */
  private def firstIndexByWord(vocabList: Array[String]): Map[String, Int] =
    // Later pairs overwrite earlier ones in `toMap`, so reversing makes the first index win.
    vocabList.zipWithIndex.reverse.toMap

  /**
   * Shared driver for both encodings: looks each word up once in a prebuilt index and
   * applies `mark` to the result vector at the word's position.
   */
  private def encode(vocabList: Array[String], inputSet: Array[String])(
      mark: (DenseVector[Int], Int) => Unit): DenseVector[Int] = {
    val positions = firstIndexByWord(vocabList)
    val encoded: DenseVector[Int] = DenseVector.zeros[Int](vocabList.length)
    inputSet.foreach { word =>
      positions.get(word) match {
        case Some(idx) => mark(encoded, idx)
        case None      => println(s"the word: $word is not in my Vocabulary!")
      }
    }
    encoded
  }
}
package mlia.bayes
import breeze.linalg._
import breeze.numerics._
/** Naive Bayes training and classification over integer word-count vectors. */
object NaiveBayes {

  /**
   * Per-word counts for one class together with the normalising denominator.
   *
   * @param num   per-word counts
   * @param denom value every count is divided by
   */
  case class Prob(num: Vector[Int], denom: Double) {

    /** Element-wise `num / denom`. */
    def probability: Vector[Double] = num.mapValues(_.toDouble) :/ denom

    /** Element-wise natural logarithm of [[probability]]. */
    def logProbability: Vector[Double] = log(probability)
  }

  object Prob {

    /**
     * Initial accumulator for `size` words: every count starts at 1 and the denominator
     * at 2.0, so no word ends up with a zero probability (and hence no `log(0)`).
     */
    def apply(size: Int): Prob = Prob(DenseVector.ones(size), 2.0d)
  }

  /**
   * Accumulates per-class word counts from a document/word matrix.
   *
   * Rows labelled 1 contribute to class 1; rows with any other label contribute to
   * class 0. The class-1 prior is the fraction of rows labelled 1, so it stays
   * consistent with that assignment even when the labels are not strictly 0/1.
   * With zero training rows the prior is `NaN` (0 / 0.0).
   *
   * @param trainMatrix   one row per document, one column per vocabulary word
   * @param trainCategory class label of each row; must have exactly one entry per row
   * @return (class-0 counts, class-1 counts, prior probability of class 1)
   * @throws IllegalArgumentException if `trainCategory` does not have one entry per row
   */
  def trainNB0(trainMatrix: DenseMatrix[Int], trainCategory: Vector[Int]): (Prob, Prob, Double) = {
    val numTrainDocs = trainMatrix.rows
    val numWords = trainMatrix.cols
    require(
      trainCategory.length == numTrainDocs,
      s"trainCategory has ${trainCategory.length} labels but trainMatrix has $numTrainDocs rows")

    val untrained = (Prob(numWords), Prob(numWords))
    // The accumulator is ordered (class 1, class 0); the result is reordered below.
    val (class1, class0) = (0 until numTrainDocs).foldLeft(untrained) { case ((p1, p0), i) =>
      val v: Vector[Int] = trainMatrix(i, ::).toDenseVector
      if (trainCategory(i) == 1) (Prob(p1.num + v, p1.denom + v.sum), p0)
      else (p1, Prob(p0.num + v, p0.denom + v.sum))
    }

    val class1Docs = (0 until numTrainDocs).count(i => trainCategory(i) == 1)
    (class0, class1, class1Docs / numTrainDocs.toDouble)
  }

  /**
   * Classifies a count vector by comparing the two class scores.
   *
   * Each score is the dot product of the counts with the class vector plus the log of
   * the class prior, so `p0Vec` and `p1Vec` are expected to hold log-probabilities
   * (for example `Prob.logProbability`).
   *
   * @param vec2Classify word counts of the document to classify
   * @param p0Vec        per-word log-probabilities for class 0
   * @param p1Vec        per-word log-probabilities for class 1
   * @param pClass1      prior probability of class 1
   * @return 1 if the class-1 score is strictly greater than the class-0 score, otherwise 0
   */
  def classifyNB(vec2Classify: Vector[Int], p0Vec: Vector[Double], p1Vec: Vector[Double], pClass1: Double): Int = {
    // Convert once; both class scores use the same feature vector.
    val features: Vector[Double] = vec2Classify.mapValues(_.toDouble)

    def score(logProbs: Vector[Double], prior: Double): Double =
      (features :* logProbs: Vector[Double]).sum + log(prior)

    val p1 = score(p1Vec, pClass1)
    val p0 = score(p0Vec, 1.0 - pClass1)
    if (p1 > p0) 1 else 0
  }
}