Decision Trees, Random Forests, and Boosted Trees

Welcome to my homepage: https://heeheeaii.github.io/

package com.treevalue.beself.other

import kotlin.math.*
import kotlin.random.Random

data class DataNode(val features: DoubleArray, val value: Double) {
    override fun equals(other: Any?): Boolean {
        if (this === other) return true
        if (javaClass != other?.javaClass) return false
        other as DataNode
        if (!features.contentEquals(other.features)) return false
        if (value != other.value) return false
        return true
    }

    override fun hashCode(): Int {
        var result = features.contentHashCode()
        result = 31 * result + value.hashCode()
        return result
    }
}

sealed class TreeNode {
    data class Leaf(val value: Double) : TreeNode()
    data class MidNode(
        val splitIdx: Int,
        val threshold: Double,
        val left: TreeNode,
        val right: TreeNode,
    ) : TreeNode()
}

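/**
 * A CART-style regression tree: each split minimizes the weighted MSE of the
 * two children, and each leaf predicts the mean target of its samples.
 */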
class DecisionTree(
    private val maxDepth: Int = 10,
    private val minSplitNum: Int = 2,
    private val minSamplesLeaf: Int = 1,
) {
    private var root: TreeNode? = null

    fun train(data: List<DataNode>) {
        root = buildTree(data, depth = 0)
    }

    private fun buildTree(data: List<DataNode>, depth: Int): TreeNode {
        if (depth >= maxDepth || // reached max depth
            data.size < minSplitNum || // too few samples to split
            data.map { it.value }.distinct().size == 1 // all target values identical
        ) {
            val prediction = data.map { it.value }.average()
            return TreeNode.Leaf(prediction)
        }

        val bestSplit = findBestSplit(data)

        if (bestSplit == null) {
            val prediction = data.map { it.value }.average()
            return TreeNode.Leaf(prediction)
        }

        val (leftData, rightData) = splitData(data, bestSplit.first, bestSplit.second)

        if (leftData.size < minSamplesLeaf || rightData.size < minSamplesLeaf) {
            val prediction = data.map { it.value }.average()
            return TreeNode.Leaf(prediction)
        }

        val leftTree = buildTree(leftData, depth + 1)
        val rightTree = buildTree(rightData, depth + 1)

        return TreeNode.MidNode(bestSplit.first, bestSplit.second, leftTree, rightTree)
    }

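    // Candidate thresholds are midpoints between consecutive distinct values of
    // each feature; the (feature, threshold) pair with the lowest weighted MSE wins.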
    private fun findBestSplit(data: List<DataNode>): Pair<Int, Double>? {
        if (data.isEmpty()) return null

        val featureSize = data[0].features.size
        var bestMse = Double.MAX_VALUE // 均方误差
        var bestSplit: Pair<Int, Double>? = null // featureIdx, threshold

        for (featureIdx in 0 until featureSize) { // 遍历特征找最小的均方误差
            val featureValues = data.map { it.features[featureIdx] }.distinct().sorted()

            for (jdx in 0 until featureValues.size - 1) {
                val threshold = (featureValues[jdx] + featureValues[jdx + 1]) / 2
                val mse = calculateSplitMse(data, featureIdx, threshold)

                if (mse < bestMse) {
                    bestMse = mse
                    bestSplit = Pair(featureIdx, threshold)
                }
            }
        }

        return bestSplit
    }

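    // Size-weighted average of the child MSEs; degenerate splits that leave one
    // side empty are rejected by returning Double.MAX_VALUE.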
    private fun calculateSplitMse(data: List<DataNode>, featureIndex: Int, threshold: Double): Double {
        val (leftData, rightData) = splitData(data, featureIndex, threshold)

        if (leftData.isEmpty() || rightData.isEmpty()) {
            return Double.MAX_VALUE
        }

        val totalSize = data.size.toDouble()
        val leftWeight = leftData.size / totalSize
        val rightWeight = rightData.size / totalSize

        val leftMse = calculateMse(leftData)
        val rightMse = calculateMse(rightData)

        return leftWeight * leftMse + rightWeight * rightMse
    }

    private fun calculateMse(data: List<DataNode>): Double {
        if (data.isEmpty()) return 0.0

        val mean = data.map { it.value }.average()
        return data.map { (it.value - mean).pow(2) }.average()
    }

    private fun splitData(
        data: List<DataNode>,
        featureIndex: Int,
        threshold: Double,
    ): Pair<List<DataNode>, List<DataNode>> {
        val leftData = data.filter { it.features[featureIndex] <= threshold }
        val rightData = data.filter { it.features[featureIndex] > threshold }
        return Pair(leftData, rightData)
    }

    fun predict(features: DoubleArray): Double {
        return root?.let { predictRecursive(it, features) } ?: 0.0 // untrained tree falls back to 0.0
    }

    private fun predictRecursive(inputNode: TreeNode, features: DoubleArray): Double {
        return when (inputNode) {
            is TreeNode.Leaf -> inputNode.value
            is TreeNode.MidNode -> {
                if (features[inputNode.splitIdx] <= inputNode.threshold) {
                    predictRecursive(inputNode.left, features)
                } else {
                    predictRecursive(inputNode.right, features)
                }
            }
        }
    }

    fun predict(dataPoints: List<DoubleArray>): List<Double> {
        return dataPoints.map { predict(it) }
    }
}

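/**
 * Bagging ensemble: every tree trains on a bootstrap sample of the data and a
 * random subset of the features; predictions are averaged across trees.
 */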
class RandomForest(
    private val maxTreeNum: Int = 100,
    private val maxTreeDepth: Int = 10,
    private val minSplitNum: Int = 2,
    private val minLeafNodeNum: Int = 1,
    private val maxFeatureNum: Int? = null,
    private val sampleRatio: Double = 1.0,
    private val random: Random = Random.Default,
) {
    private val trees = mutableListOf<DecisionTree>()
    private val useFeatures = mutableListOf<IntArray>()

    fun train(data: List<DataNode>) {
        val featureNum = data[0].features.size
        val actualMaxFeatures = maxFeatureNum ?: sqrt(featureNum.toDouble()).toInt()

        repeat(maxTreeNum) { _ ->
            val sampleData = bootstrapSample(data, sampleRatio)
            val sampleFeature = selectRandomFeatures(featureNum, actualMaxFeatures)
            useFeatures.add(sampleFeature)

            val subsetData = createFeatureSubsetData(sampleData, sampleFeature) // project onto the feature subset

            val tree = DecisionTree(maxTreeDepth, minSplitNum, minLeafNodeNum)
            tree.train(subsetData)
            trees.add(tree)
        }
    }

    private fun bootstrapSample(data: List<DataNode>, ratio: Double): List<DataNode> {
        // sample with replacement (bootstrap)
        val sampleSize = (data.size * ratio).toInt()
        return List(sampleSize) { data[random.nextInt(data.size)] }
    }

    private fun selectRandomFeatures(totalFeatures: Int, maxFeatures: Int): IntArray {
        val features = (0 until totalFeatures).toMutableList()
        features.shuffle(random)
        return features.take(maxFeatures).toIntArray()
    }

    private fun createFeatureSubsetData(data: List<DataNode>, featureSubset: IntArray): List<DataNode> {
        return data.map { point ->
            val subsetFeatures = featureSubset.map { point.features[it] }.toDoubleArray()
            DataNode(subsetFeatures, point.value)
        }
    }

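    // Average the per-tree predictions, projecting the input onto each tree's feature subset first.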
    private fun predict(features: DoubleArray): Double {
        val predictions = trees.mapIndexed { idx, tree ->
            val featureSubset = useFeatures[idx]
            val subsetFeatures = featureSubset.map { features[it] }.toDoubleArray()
            tree.predict(subsetFeatures)
        }

        return predictions.average()
    }

    fun predict(dataPoints: List<DoubleArray>): List<Double> {
        return dataPoints.map { predict(it) }
    }

    // Returns how often each feature index was selected across trees, normalized
    // to sum to 1 (a crude feature-importance proxy). Pass the total number of
    // features in the original data.
    fun getWeightStatistic(totalFeatureNum: Int): DoubleArray {
        val weights = DoubleArray(totalFeatureNum)

        useFeatures.forEach { useFt ->
            useFt.forEach { idx ->
                weights[idx] += 1.0
            }
        }

        val total = weights.sum()
        if (total > 0) { // guard against division by zero
            for (idx in weights.indices) {
                weights[idx] /= total
            }
        }

        return weights
    }
}

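/**
 * Gradient boosting for squared loss: start from the mean target, then fit each
 * tree to the current residuals and add its learning-rate-scaled prediction.
 */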
class GradientBoostingRegressor(
    private val learnerNum: Int = 100,
    private val learningRate: Double = 0.1,
    private val maxTreeDepth: Int = 3,
    private val minSplitNum: Int = 2,
    private val minLeafNum: Int = 1,
    private val sampleRate: Double = 1.0,
    private val random: Random = Random.Default,
) {
    private val trees = mutableListOf<DecisionTree>()
    private var initPrediction: Double = 0.0

    fun train(data: List<DataNode>) {
        initPrediction = data.map { it.value }.average()

        var residuals: List<Double> = data.map { it.value - initPrediction }

        repeat(learnerNum) { idx ->
            val residualData = data.mapIndexed { index, point ->
                DataNode(point.features, residuals[index])
            }

            val trainData = if (sampleRate < 1.0) {
                val sampleSize = (residualData.size * sampleRate).toInt()
                residualData.shuffled(random).take(sampleSize)
            } else {
                residualData
            }

            val tree = DecisionTree(
                maxDepth = maxTreeDepth,
                minSplitNum = minSplitNum,
                minSamplesLeaf = minLeafNum
            )
            tree.train(trainData)
            trees.add(tree)

            // For squared loss the negative gradient is the residual, so subtract
            // each tree's shrunken prediction from the current residuals.
            residuals = residuals.mapIndexed { sampleIdx, residual ->
                residual - learningRate * tree.predict(data[sampleIdx].features)
            }

            if ((idx + 1) % 20 == 0) {
                val predictionList = data.map { predict(it.features) }
                val mse = data.zip(predictionList) { origin, pred ->
                    (origin.value - pred).pow(2)
                }.average()
                println("第 ${idx + 1} 轮后的MSE: $mse")
            }
        }

        println("梯度提升模型训练完成!")
    }

    private fun predict(features: DoubleArray): Double {
        var prediction = initPrediction

        trees.forEach { tree ->
            prediction += learningRate * tree.predict(features)
        }

        return prediction
    }

    fun predict(dataPoints: List<DoubleArray>): List<Double> {
        return dataPoints.map { predict(it) }
    }
}

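/** Regression metrics; each function assumes both lists have the same length. */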
object ModelEvaluator {
    fun calculateMseInSameLen(actual: List<Double>, predicted: List<Double>): Double {
        return actual.zip(predicted) { a, p -> (a - p).pow(2) }.average()
    }

    fun calculateRmseInSameLen(actual: List<Double>, predicted: List<Double>): Double {
        return sqrt(calculateMseInSameLen(actual, predicted))
    }

    fun calculateR2InSameLen(actual: List<Double>, predicted: List<Double>): Double {
        val actualMean = actual.average()
        val totalSumSquares = actual.sumOf { (it - actualMean).pow(2) }
        val residualSumSquares = actual.zip(predicted) { a, p -> (a - p).pow(2) }.sum()
        return 1.0 - (residualSumSquares / totalSumSquares)
    }
}

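/** Synthetic regression data: target = sin(x1) + cos(x2) + x3² plus uniform noise in [0, noise). */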
object DataGenerator {
    fun generateNonlinearData(
        samples: Int = 1000,
        noise: Double = 0.2,
        random: Random = Random.Default,
    ): List<DataNode> {
        return (1..samples).map {
            val x1 = random.nextDouble(-PI, PI)
            val x2 = random.nextDouble(-PI, PI)
            val x3 = random.nextDouble(-2.0, 2.0)

            val target = sin(x1) + cos(x2) + x3.pow(2) + random.nextDouble() * noise
            DataNode(doubleArrayOf(x1, x2, x3), target)
        }
    }
}

fun main() {
    println("=== 机器学习算法演示 ===\n")

    println("生成数据集...")
    val random = Random(42)
    val trainData = DataGenerator.generateNonlinearData(800, 0.2, random)
    val testData = DataGenerator.generateNonlinearData(200, 0.2, random)

    val testFeatures = testData.map { it.features }
    val testLabels = testData.map { it.value }

    println("训练集大小: ${trainData.size}")
    println("测试集大小: ${testData.size}\n")

    println("=== 1. 决策树 ===")
    val decisionTree = DecisionTree(maxDepth = 8, minSplitNum = 5, minSamplesLeaf = 2)
    decisionTree.train(trainData)

    val dtPredictions = decisionTree.predict(testFeatures)
    val dtMse = ModelEvaluator.calculateMseInSameLen(testLabels, dtPredictions)
    val dtRmse = ModelEvaluator.calculateRmseInSameLen(testLabels, dtPredictions)
    val dtR2 = ModelEvaluator.calculateR2InSameLen(testLabels, dtPredictions)

    println("决策树结果:")
    println("  MSE: $dtMse")
    println("  RMSE: $dtRmse")
    println("  R²: $dtR2\n")

    println("=== 2. 随机森林 ===")
    val randomForest = RandomForest(
        maxTreeNum = 50,
        maxTreeDepth = 8,
        minSplitNum = 5,
        minLeafNodeNum = 2,
        maxFeatureNum = 2,
        sampleRatio = 0.8,
        random = random
    )
    randomForest.train(trainData)

    val rfPredictions = randomForest.predict(testFeatures)
    val rfMse = ModelEvaluator.calculateMseInSameLen(testLabels, rfPredictions)
    val rfRmse = ModelEvaluator.calculateRmseInSameLen(testLabels, rfPredictions)
    val rfR2 = ModelEvaluator.calculateR2InSameLen(testLabels, rfPredictions)

    println("随机森林结果:")
    println("  MSE: $rfMse")
    println("  RMSE: $rfRmse")
    println("  R²: $rfR2")

    val featureImportances = randomForest.getWeightStatistic(trainData[0].features.size)
    println("  Feature importances: ${featureImportances.contentToString()}\n")

    println("=== 3. 梯度提升 ===")
    val gradientBoosting = GradientBoostingRegressor(
        learnerNum = 100,
        learningRate = 0.1,
        maxTreeDepth = 4,
        minSplitNum = 5,
        sampleRate = 0.8,
        random = random
    )
    gradientBoosting.train(trainData)

    val gbPredictions = gradientBoosting.predict(testFeatures)
    val gbMse = ModelEvaluator.calculateMseInSameLen(testLabels, gbPredictions)
    val gbRmse = ModelEvaluator.calculateRmseInSameLen(testLabels, gbPredictions)
    val gbR2 = ModelEvaluator.calculateR2InSameLen(testLabels, gbPredictions)

    println("梯度提升结果:")
    println("  MSE: $gbMse")
    println("  RMSE: $gbRmse")
    println("  R²: $gbR2\n")

    println("=== 算法对比 ===")
    println("算法           MSE        RMSE       R²")
    println("决策树      %.6f   %.6f   %.6f".format(dtMse, dtRmse, dtR2))
    println("随机森林    %.6f   %.6f   %.6f".format(rfMse, rfRmse, rfR2))
    println("梯度提升    %.6f   %.6f   %.6f".format(gbMse, gbRmse, gbR2))

    println("\n=== 演示完成 ===")
}
<think>我们正在讨论机器学习中的树模型。用户要求获取CatBoost、XGBoost决策树随机森林的模型图。由于模型图通常指的是算法的结构示意图,而树模型的结构图通常以树形图表示,因此我们需要为每个模型绘制一个代表性的结构图。 注意:由于文本环境限制,我们无法直接绘制图像,但可以使用字符画或文本描述来示意模型结构。同时,我们也可以描述模型图的特点。 决策树:单个树结构,含根节点、内部节点和叶节点。 随机森林:多个决策树的集合,每个树独立训练,然后通过投票或平均进行预测。 XGBoost:多个决策树以加法方式组合,每棵树学习前序树的残差。 CatBoost:与XGBoost类似,但特别处理类别特征,并且采用对称树结构(oblivious trees)。 由于无法输出真实图片,我们可以尝试用字符画表示简单的树结构,但复杂的模型(如随机森林、XGBoost、CatBoost)可能难以用字符画完整表示。因此,我们将为每个模型提供一个简单的树结构示意图(以决策树为例),并描述其特点。对于集成模型,我们可以用一个简单的示例(如两棵树)来表示。 考虑到字符画的局限性,我们主要描述模型图的特点,并提供简单的示意图。 决策树示意图(以字符画表示): 根节点 / \ 内部节点 叶节点 / \ 叶节点 叶节点 随机森林:多个决策树的集合,因此可以画出多个决策树,每个树的结构可能不同。 XGBoost:多个决策树串联,每棵树都基于前一棵树的残差进行构建。示意图可以表示为: 树1 -> 树2 -> 树3 -> ... -> 树n 但实际上每棵树的结构是树形,因此我们可以画出一棵具体的树,然后说明这种加法模型。 CatBoost:通常使用对称树(oblivious trees),即每一层使用相同的分裂特征和分裂点。因此,它的树结构是平衡的。例如: 根节点(特征A>0.5) / \ 内部节点 内部节点 (特征B>1) (特征B>1) / \ / \ 叶节点 叶节点 叶节点 叶节点 但是,由于我们无法生成真实的图,建议用户使用专业工具(如Graphviz)来生成这些模型的图。在代码中,我们可以使用相应的库(如sklearn的plot_tree)来绘制。 因此,我们可以提供生成这些模型图的代码示例(使用Python和相应库),这样用户可以在本地运行生成图像。 以下为每个模型提供代码示例: 1. 决策树:使用sklearn的决策树和plot_tree函数。 2. 随机森林随机森林由多个决策树组成,我们可以绘制其中一棵树。 3. XGBoost:使用XGBoost库的plot_tree函数。 4. CatBoost:使用CatBoost库的plot_tree函数。 注意:这些代码需要用户安装相应的库,并且运行环境支持图形显示。 由于我们当前是文本环境,无法显示图像,因此只提供代码示例。 另外,我们也可以提供绘制对称树(CatBoost)的示例,但CatBoost的对称树在绘制时和普通决策树类似,只是每层都是相同的分裂条件(因为对称树每层只用一个特征分裂)。 下面我们将分别给出代码示例。 </think>以下是为您整理的CatBoost、XGBoost决策树随机森林的模型结构示意图及核心特点: ### 1. 决策树模型图 ``` [根节点:特征X₁≤0.5] / \ [左子节点:特征X₂>1.3] [右子节点:类别A] / \ | [叶节点:类别B] [叶节点:类别C] [叶节点:类别A] ``` - **特点**:单树结构,基于特征阈值递归分裂 - **优势**:直观易解释,支持分类和回归 - **局限**:容易过拟合[^1] ### 2. 随机森林模型图 ``` 决策树1 决策树2 决策树n / | \ / | \ / | \ [A] [B] [C] [B] [A] [C] [C] [A] [B] \ | / \ | / \ | / \ | / \ | / \ | / [集成输出] [类别投票] [最终预测] ``` - **特点**:并行训练的树集合(bagging) - **核心**:每棵树使用随机样本和随机特征子集 - **优势**:降低方差,抗过拟合能力强[^1] ### 3. XGBoost模型图 ``` 初始模型F₀ → 残差树T₁ → 更新模型F₁ → 残差树T₂ → ... → 最终模型Fₖ │ ▲ │ ▲ │ └─预测误差─┘ └─预测误差─┘ └─预测 ``` - **特点**:顺序构建的梯度提升树(boosting) - **核心**:每棵树学习前序模型的残差 - **创新**:正则化项控制复杂度,加权分位数策略[^2] ### 4. CatBoost模型图 ``` 对称树结构(所有节点同层同特征分裂): [特征A≤0.5] / \ [特征B≤1.0] [特征B≤1.0] ← 同层相同分裂特征 / \ / \ [叶节点] [叶节点] [叶节点] [叶节点] ``` - **特点**:对称树(oblivious trees)结构 - **核心创新**: - 类别特征的目标编码(Target Encoding) - 有序提升(Ordered Boosting)防过拟合[^2] - 默认对称树加速预测 > 注:实际模型图可通过工具生成: > - Python代码示例(决策树): > ```python > from sklearn.tree import plot_tree > import matplotlib.pyplot as plt > plt.figure(figsize=(12,8)) > plot_tree(model, feature_names=X.columns, filled=True) > plt.show() > ``` ### 模型对比总结 | 特性 | 决策树 | 随机森林 | XGBoost | CatBoost | |-------------|--------|----------|---------|-------------| | **集成方式** | 单树 | Bagging | Boosting| Boosting | | **树结构** | 不规则 | 不规则 | 不规则 | 对称树 | | **类别特征** | 需编码 | 需编码 | 需编码 | 原生支持 | | **过拟合** | 易 | 不易 | 中等 | 强抵抗 |