Decision tree algorithm example in Scala

The training data below is the classic contact-lenses dataset: each row lists the patient's age group, prescription type, whether they are astigmatic, their tear production rate, and the recommended lens type.

young   myope   no  reduced no lenses
young   myope   no  normal  soft
young   myope   yes reduced no lenses
young   myope   yes normal  hard
young   hyper   no  reduced no lenses
young   hyper   no  normal  soft
young   hyper   yes reduced no lenses
young   hyper   yes normal  hard
pre myope   no  reduced no lenses
pre myope   no  normal  soft
pre myope   yes reduced no lenses
pre myope   yes normal  hard
pre hyper   no  reduced no lenses
pre hyper   no  normal  soft
pre hyper   yes reduced no lenses
pre hyper   yes normal  no lenses
presbyopic  myope   no  reduced no lenses
presbyopic  myope   no  normal  no lenses
presbyopic  myope   yes reduced no lenses
presbyopic  myope   yes normal  hard
presbyopic  hyper   no  reduced no lenses
presbyopic  hyper   no  normal  soft
presbyopic  hyper   yes reduced no lenses
presbyopic  hyper   yes normal  no lenses
package mlia.trees

import breeze.numerics._
import scala.annotation.tailrec

case class Tree(nodes: Array[Node] = Array.empty) {

  override def toString = s"Tree[${nodes.map(_.toString).mkString(",")}]"

  def <<-(node: Node): Tree = Tree(nodes :+ node)

  // Classify a feature vector by walking the tree: at each level pick the node whose stored
  // feature value matches the corresponding entry of testVec (a leaf yields the class label).
  def classify(testVec: Vector[Int], featLabels: Array[String], cur: Array[Node] = nodes): String = search(testVec, featLabels, cur)

  @tailrec
  private def search(testVec: Vector[Int], featLabels: Array[String], cur: Array[Node]): String = {
    cur.find { node =>
      node.isLeaf || testVec(featLabels.indexOf(node.key)).toString == node.value.toString
    } match {
      case None => "Fail to classify."
      case Some(node) if node.isLeaf => node.value.toString
      case Some(node) => search(testVec, featLabels, node.children)
    }
  }
}

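// For an internal node, `key` is a feature name and `value` is the feature value that leads to
// `children`; for a leaf (no children), `key` holds the incoming feature value and `value` the class label.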
case class Node(key: String, value: Any, children: Array[Node] = Array.empty) {

  val isLeaf = children.isEmpty

  override def toString =
    if (children.isEmpty) s" -> $value[Leaf]" else s"{$key : $value ${children.map(_.toString).mkString(",")}}"
}

object Tree {

  case class Row(data: Array[Int], label: String)

  case class InformationGain(featureIdx: Int, infoGain: Double)

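  // Shannon entropy of the label distribution: H(D) = -sum_k p_k * log2(p_k),
  // where p_k is the fraction of rows carrying label k.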
  def calcShannonEnt(dataSet: Array[Row]) = {

    val labelCounts = dataSet.foldLeft(Map.empty[String, Int]) { (map, row) =>
      map + (row.label -> (map.getOrElse(row.label, 0) + 1))
    }
    val numEntries = dataSet.size
    labelCounts.foldLeft(0.0) { case (ent, (_, count)) =>
      val prob = count.toDouble / numEntries
      ent - prob * (log(prob) / log(2)) // accumulate -p * log2(p)
    }
  }

  // Keep the rows whose feature `axis` equals `value`, and drop that column so the remaining
  // columns stay aligned with the label array reduced by `remove` below.
  def splitDataSet(dataSet: Array[Row], axis: Int, value: Int): Array[Row] =
    dataSet.filter(_.data(axis) == value).map(row => row.copy(data = row.data.patch(axis, Nil, 1)))

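  // Choose the split feature with the largest information gain:
  // gain(D, f) = H(D) - sum_v (|D_v| / |D|) * H(D_v), where D_v is the subset of rows with f = v.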
  def chooseBestFeatureToSplit(dataSet: Array[Row]) = {

    val numEntries = dataSet.size
    val numFeatures = dataSet.head.data.size
    val baseEntropy = calcShannonEnt(dataSet)

    (0 until numFeatures).foldLeft(InformationGain(-1, 0.0)) { (curBest, cur) =>
      val uniqueVals = dataSet.map(_.data(cur)).distinct
      val newEntropy = uniqueVals.foldLeft(0.0) { (ent, value) =>
        val subDataSet = splitDataSet(dataSet, cur, value)
        val prob = subDataSet.size / numEntries.toDouble
        ent + prob * calcShannonEnt(subDataSet)
      }
      val infoGain = baseEntropy - newEntropy
      if (infoGain > curBest.infoGain) InformationGain(cur, infoGain) else curBest
    }
  }

  def majorityCnt(classList: Array[String]): String =
    classList.foldLeft(Map.empty[String, Int]) { (state, x) =>
      state + (x -> (state.getOrElse(x, 0) + 1))
    }.maxBy(_._2)._1

  // Drop the label at index `num`, keeping the labels aligned with the feature columns
  // removed by splitDataSet.
  private def remove(num: Int, list: Array[String]) = list.patch(num, Nil, 1)

  def apply(dataSet: Array[Row], labels: Array[String]): Tree = createTree(dataSet, labels)

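  // ID3-style recursion: stop when every row has the same label or no features remain
  // (majority vote); otherwise split on the best feature and recurse once per observed value.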
  private def createTree(dataSet: Array[Row], labels: Array[String], cur: Tree = Tree(), value: Int = -1): Tree = {
    val classList = dataSet.map(_.label)
    if (classList.distinct.size == 1) cur <<- Node(value.toString, classList(0)) // all labels are equal
    else if (dataSet.head.data.isEmpty) cur <<- Node(value.toString, majorityCnt(classList)) // no features left: fall back to the majority label
    else {
      val bestFeat = chooseBestFeatureToSplit(dataSet).featureIdx
      val subLabels = remove(bestFeat, labels)
      val uniqueFeatValues = dataSet.map(_.data(bestFeat)).distinct
      uniqueFeatValues.foldLeft(cur) { (state, value) =>
        val subTree = createTree(splitDataSet(dataSet, bestFeat, value), subLabels, cur, value)
        state <<- Node(labels(bestFeat), value.toString, subTree.nodes)
      }
    }
  }
}
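
A minimal usage sketch, assuming the 24 rows above are saved as lenses.txt: it encodes the categorical values to Ints column by column, builds the tree and classifies one feature vector. The object name LensesExample, the feature label strings and the per-column integer encodings are illustrative choices only.

import mlia.trees.Tree
import mlia.trees.Tree.Row

object LensesExample extends App {

  // The same label array must be used for building and for classification,
  // because classify resolves feature columns via featLabels.indexOf.
  val featLabels = Array("age", "prescript", "astigmatic", "tearRate")

  // Per-column encodings of the categorical values (arbitrary but consistent).
  val encodings: Array[Map[String, Int]] = Array(
    Map("young" -> 0, "pre" -> 1, "presbyopic" -> 2),
    Map("myope" -> 0, "hyper" -> 1),
    Map("no" -> 0, "yes" -> 1),
    Map("reduced" -> 0, "normal" -> 1))

  // Each line: age prescript astigmatic tearRate class; the class label may contain a space ("no lenses").
  val dataSet: Array[Row] = scala.io.Source.fromFile("lenses.txt").getLines().map { line =>
    val cols = line.trim.split("\\s+")
    Row(cols.take(4).zipWithIndex.map { case (v, i) => encodings(i)(v) }, cols.drop(4).mkString(" "))
  }.toArray

  val tree = Tree(dataSet, featLabels)
  println(tree)

  // The second data row above (young, myope, no astigmatism, normal tear rate) is labelled "soft".
  println(tree.classify(Vector(0, 0, 0, 1), featLabels))
}

Any other consistent encoding works equally well, since the tree only compares encoded values for equality.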

