🤔 Starting with a Doctor's Diagnosis
Imagine visiting a doctor, who asks a series of questions:
Doctor: Do you have a fever?
Patient: Yes, 38.5°C
Doctor: Any cough?
Patient: A dry cough, no phlegm
Doctor: Any difficulty breathing?
Patient: Some chest tightness
Doctor: Any recent contact with someone who has a cold?
Patient: No...
Diagnosis: Most likely the flu. Get plenty of rest and drink lots of water.
Through a chain of binary (yes/no) questions, the doctor progressively narrows down the possibilities and arrives at a conclusion.
A decision tree is an AI algorithm that mimics exactly this way of thinking!
🌳 What Is a Decision Tree?
A decision tree works like a smart question-and-answer game:
- Start at the root node (the first question)
- Follow a different branch depending on the answer
- Repeat until you reach a leaf node (the final conclusion)
- Each leaf node yields a prediction
🎯 A Decision Tree from Everyday Life
Loan approval decision tree:
                Annual income > ¥300k?
                  /                \
                yes                 no
                /                     \
    Credit score > 700?        Credit score > 700?
        /        \                 /         \
      yes         no             yes          no
       |           |              |            |
    approve     reject     needs guarantor   reject
See that? A computer can learn to make decisions just like a bank's loan officer!
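Before any machine learning, notice that this tree is nothing more than nested if/else logic. Here is a minimal hand-written sketch of the loan tree above (the thresholds are the illustrative ones from the diagram, not real lending criteria):
def approve_loan(annual_income: float, credit_score: int) -> str:
    """Hand-written decision tree mirroring the diagram above."""
    if annual_income > 300_000:       # root node question
        if credit_score > 700:
            return "approve"
        return "reject"
    else:
        if credit_score > 700:
            return "needs guarantor"
        return "reject"

print(approve_loan(400_000, 750))  # -> approve
print(approve_loan(200_000, 720))  # -> needs guarantor
Training a decision tree simply means learning these questions and thresholds from data instead of writing them by hand.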
🛠️ Implementing a Simple Decision Tree in Python
Method 1: Hand-Rolling an ID3-Style Algorithm (to understand the principles)
import numpy as np
import pandas as pd
from collections import Counter

class SimpleDecisionTree:
    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        self.tree = None

    def entropy(self, y):
        """Compute information entropy: a measure of impurity."""
        if len(y) == 0:
            return 0
        counts = np.bincount(y)
        probabilities = counts / len(y)
        # Entropy formula: H = -Σ(p_i × log₂(p_i))
        entropy = 0
        for p in probabilities:
            if p > 0:
                entropy -= p * np.log2(p)
        return entropy

    def information_gain(self, X_column, y, threshold):
        """Compute information gain: the reduction in entropy after a split."""
        # Split the data at the threshold
        left_indices = X_column <= threshold
        right_indices = X_column > threshold
        if np.sum(left_indices) == 0 or np.sum(right_indices) == 0:
            return 0
        # Entropy of the parent node
        parent_entropy = self.entropy(y)
        # Weighted average entropy of the child nodes
        n = len(y)
        n_left, n_right = np.sum(left_indices), np.sum(right_indices)
        child_entropy = (n_left / n) * self.entropy(y[left_indices]) + \
                        (n_right / n) * self.entropy(y[right_indices])
        # Information gain = parent entropy - child entropy
        return parent_entropy - child_entropy

    def find_best_split(self, X, y):
        """Find the best feature and threshold to split on."""
        best_gain = -1
        best_feature = None
        best_threshold = None
        n_features = X.shape[1]
        for feature_idx in range(n_features):
            X_column = X[:, feature_idx]
            thresholds = np.unique(X_column)
            for threshold in thresholds:
                gain = self.information_gain(X_column, y, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_idx
                    best_threshold = threshold
        return best_feature, best_threshold, best_gain

    def fit(self, X, y):
        """Train the tree on the dataset."""
        self.tree = self._build_tree(X, y, depth=0)
        return self

    def _build_tree(self, X, y, depth=0):
        """Recursively build the decision tree."""
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))
        # Stopping conditions
        if depth >= self.max_depth or n_classes == 1 or n_samples < 2:
            return {'leaf': True, 'value': self._most_common_label(y)}
        # Find the best split
        best_feature, best_threshold, best_gain = self.find_best_split(X, y)
        if best_gain <= 0:  # no useful split remains
            return {'leaf': True, 'value': self._most_common_label(y)}
        # Recursively build the subtrees
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold
        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)
        return {
            'leaf': False,
            'feature': best_feature,
            'threshold': best_threshold,
            'left': left_subtree,
            'right': right_subtree
        }

    def _most_common_label(self, y):
        """Return the most frequent label."""
        counter = Counter(y)
        return counter.most_common(1)[0][0]

    def predict_sample(self, x, tree):
        """Predict a single sample by walking the tree."""
        if tree['leaf']:
            return tree['value']
        if x[tree['feature']] <= tree['threshold']:
            return self.predict_sample(x, tree['left'])
        else:
            return self.predict_sample(x, tree['right'])

    def predict(self, X):
        """Predict multiple samples."""
        return np.array([self.predict_sample(x, self.tree) for x in X])
# Build a small classification dataset: predict park visits from the weather
np.random.seed(42)

# Features: [temperature (°C), humidity (%), wind force (level)]
# Labels: 0 = stay home, 1 = go to the park
X_weather = np.array([
    [25, 60, 2], [30, 45, 1], [35, 30, 3], [20, 80, 2],  # first 4 samples
    [28, 55, 1], [32, 40, 2], [18, 85, 3], [33, 35, 4],  # next 4 samples
    [22, 70, 2], [29, 50, 1], [15, 90, 4], [36, 25, 2]   # more samples
])
y_park = np.array([1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1])  # corresponding labels

print("Weather data:")
print("Temp(°C)\tHumidity(%)\tWind\tGo to park?")
for i in range(len(X_weather)):
    weather = X_weather[i]
    decision = "yes" if y_park[i] == 1 else "no"
    print(f"{weather[0]}\t\t{weather[1]}\t\t{weather[2]}\t{decision}")

# Train the decision tree
tree = SimpleDecisionTree(max_depth=3)
tree.fit(X_weather, y_park)
print("\nDecision tree trained!")
# Visualize the tree (simplified text rendering)
def print_tree(node, depth=0, feature_names=None):
    indent = "  " * depth
    if node['leaf']:
        print(f"{indent}Predict: {'go to the park' if node['value'] == 1 else 'stay home'}")
    else:
        feature_name = f"feature {node['feature']}" if feature_names is None else feature_names[node['feature']]
        print(f"{indent}{feature_name} <= {node['threshold']:.1f}?")
        print(f"{indent}├─ yes:")
        print_tree(node['left'], depth + 1, feature_names)
        print(f"{indent}└─ no:")
        print_tree(node['right'], depth + 1, feature_names)

print("\nTree structure:")
feature_names = ['temperature', 'humidity', 'wind']
print_tree(tree.tree, 0, feature_names)
# Test predictions
test_weather = np.array([
    [27, 65, 2],  # mild weather
    [37, 20, 1],  # hot, dry, light breeze
    [19, 85, 4]   # cold, humid, strong wind
])
predictions = tree.predict(test_weather)
print("\nTest predictions:")
for weather, pred in zip(test_weather, predictions):
    decision = "go to the park" if pred == 1 else "stay home"
    print(f"Weather {weather} -> {decision}")
Running this prints a simple set of decision rules based on temperature and humidity!
Method 2: Using sklearn (for real-world use)
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

# The classic Iris dataset
iris = load_iris()
X_iris = iris.data
y_iris = iris.target
feature_names = iris.feature_names
class_names = iris.target_names

print("Iris dataset:")
print(f"Features: {list(feature_names)}")
print(f"Classes: {list(class_names)}")
print(f"Data shape: {X_iris.shape}")

# Train a sklearn decision tree
dt_classifier = DecisionTreeClassifier(
    max_depth=3,      # limit tree depth to curb overfitting
    random_state=42
)
dt_classifier.fit(X_iris, y_iris)

# Predict
predictions = dt_classifier.predict(X_iris)
accuracy = np.mean(predictions == y_iris)
print("\nsklearn decision tree results:")
print(f"Training accuracy: {accuracy:.3f}")

# Visualize the tree
plt.figure(figsize=(15, 10))
plot_tree(dt_classifier,
          feature_names=feature_names,
          class_names=class_names,
          filled=True,    # color-fill the nodes
          rounded=True)   # rounded boxes
plt.title('Iris classification decision tree')
plt.show()

# Feature importances
feature_importance = dt_classifier.feature_importances_
print("\nFeature importances:")
for name, importance in zip(feature_names, feature_importance):
    print(f"{name}: {importance:.3f}")

# Explain a single prediction
sample_idx = 0
sample_features = X_iris[sample_idx]
sample_true = y_iris[sample_idx]
sample_pred = predictions[sample_idx]
print("\nSingle-sample prediction:")
print(f"True class: {class_names[sample_true]} (index {sample_true})")
print(f"Predicted class: {class_names[sample_pred]} (index {sample_pred})")
print("Feature values:")
for name, value in zip(feature_names, sample_features):
    print(f"  {name}: {value:.1f}")

# Use predict_proba to inspect the confidence
probabilities = dt_classifier.predict_proba(X_iris[sample_idx].reshape(1, -1))
print(f"Predicted probabilities: {dict(zip(class_names, probabilities[0]))}")
🔍 Digging Deeper: Information Theory Basics
📊 Information Entropy: Measuring Uncertainty
Higher entropy = more disorder = harder to predict
def visualize_entropy():
    """Visualize the entropy of different binary distributions."""
    def binary_entropy(p):
        return -p * np.log2(p) - (1 - p) * np.log2(1 - p)

    # Entropy across a range of positive-class probabilities
    p_list = np.linspace(0.01, 0.99, 100)
    entropy_list = [binary_entropy(p) for p in p_list]

    plt.figure(figsize=(10, 6))
    plt.plot(p_list, entropy_list, 'b-', linewidth=2)
    plt.axvline(x=0.5, color='red', linestyle='--', alpha=0.7, label='p=0.5 (maximum entropy)')
    plt.axvline(x=0.1, color='green', linestyle='--', alpha=0.7, label='p=0.1')
    plt.axvline(x=0.9, color='green', linestyle='--', alpha=0.7, label='p=0.9')
    plt.xlabel('Probability of the positive class (p)')
    plt.ylabel('Entropy (bits)')
    plt.title('Binary entropy: uncertainty peaks at p=0.5')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    print("Interpreting the values:")
    print(f"p=0.5: entropy={max(entropy_list):.3f}: a coin flip, hardest to predict")
    print(f"p=0.1: entropy={binary_entropy(0.1):.3f}: skewed toward the negative class")
    print(f"p=0.9: entropy={binary_entropy(0.9):.3f}: skewed toward the positive class")

visualize_entropy()
🎯 Information Gain: Choosing the Best Question
def demonstrate_information_gain():
    """Walk through the information-gain computation."""
    # A tiny weather dataset
    weather_data = {
        'temperature': [25, 30, 35, 20, 28, 32, 18, 33],
        'humidity': [60, 45, 30, 80, 55, 40, 85, 35],
        'go_to_park': [1, 1, 1, 0, 1, 1, 0, 0]  # 1 = go, 0 = stay home
    }
    df = pd.DataFrame(weather_data)
    y = df['go_to_park'].values
    X_temp = df['temperature'].values
    X_humidity = df['humidity'].values
    tree = SimpleDecisionTree()

    print("Information gain walkthrough:")
    print("=" * 50)

    # Entropy before any split
    initial_entropy = tree.entropy(y)
    print(f"Initial entropy (whole dataset): {initial_entropy:.3f}")
    print(f"Label distribution: {Counter(y)}")

    # Information gain of temperature splits
    temp_thresholds = [22, 25, 28, 30, 32]
    print("\nInformation gain of temperature splits:")
    for threshold in temp_thresholds:
        gain = tree.information_gain(X_temp, y, threshold)
        print(f"  temperature <= {threshold}°C: gain = {gain:.3f}")

    # Information gain of humidity splits
    humidity_thresholds = [35, 40, 50, 60, 70, 80]
    print("\nInformation gain of humidity splits:")
    for threshold in humidity_thresholds:
        gain = tree.information_gain(X_humidity, y, threshold)
        print(f"  humidity <= {threshold}%: gain = {gain:.3f}")

demonstrate_information_gain()
🎨 Visualizing and Interpreting Decision Trees
📈 Visualizing the Decision Boundary
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate 2-D classification data
X_cls, y_cls = make_classification(
    n_samples=200, n_features=2, n_redundant=0, n_informative=2,
    n_clusters_per_class=1, random_state=42
)

# Train a decision tree
dt_viz = DecisionTreeClassifier(max_depth=4, random_state=42)
dt_viz.fit(X_cls, y_cls)

# Plot the decision boundary
plt.figure(figsize=(12, 5))

# Raw data
plt.subplot(1, 2, 1)
scatter = plt.scatter(X_cls[:, 0], X_cls[:, 1], c=y_cls, cmap=plt.cm.RdYlBu,
                      edgecolors='black', s=30)
plt.title('Raw data distribution')
plt.colorbar(scatter)

# Decision boundary
plt.subplot(1, 2, 2)
# Build a grid over the feature space
x_min, x_max = X_cls[:, 0].min() - 1, X_cls[:, 0].max() + 1
y_min, y_max = X_cls[:, 1].min() - 1, X_cls[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))
# Predict over the grid
Z = dt_viz.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.6, cmap=plt.cm.RdYlBu)
contour = plt.contour(xx, yy, Z, colors='black', linewidths=0.8)
plt.clabel(contour, inline=True, fontsize=8)
plt.scatter(X_cls[:, 0], X_cls[:, 1], c=y_cls, cmap=plt.cm.RdYlBu,
            edgecolors='black', s=20, alpha=0.7)
plt.title('Decision tree boundary\n(note the axis-aligned rectangular regions)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.tight_layout()
plt.show()

print("What to look for:")
print("1. The boundaries are axis-aligned rectangles: a hallmark of decision trees")
print("2. The boundaries are 'hard', with no gradual transition, unlike logistic regression")
print("3. Deeper trees carve out more intricate boundary shapes")
🧠 Pruning Decision Trees: Keeping Growth in Check
🌿 The Overfitting Problem
Decision trees overfit easily, as this demo shows:
def demonstrate_overfitting():
    """Demonstrate decision-tree overfitting."""
    from sklearn.datasets import make_moons
    from sklearn.metrics import accuracy_score

    # Generate data that is not linearly separable
    X_moons, y_moons = make_moons(n_samples=200, noise=0.3, random_state=42)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X_moons, y_moons, test_size=0.3, random_state=42)

    # Train trees of increasing depth
    depths = [1, 2, 3, 5, 10, None]  # None = unlimited depth
    train_scores = []
    test_scores = []
    for depth in depths:
        tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
        tree.fit(X_train, y_train)
        train_acc = accuracy_score(y_train, tree.predict(X_train))
        test_acc = accuracy_score(y_test, tree.predict(X_test))
        train_scores.append(train_acc)
        test_scores.append(test_acc)
        print(f"depth {depth if depth is not None else 'unlimited'}: "
              f"train accuracy={train_acc:.3f}, test accuracy={test_acc:.3f}")

    # Plot the overfitting curve
    # (plot against index positions so the 'unlimited' depth still shows up)
    positions = range(len(depths))
    labels = [str(d) if d is not None else 'none' for d in depths]
    plt.figure(figsize=(10, 6))
    plt.plot(positions, train_scores, 'bo-', label='train accuracy', linewidth=2, markersize=8)
    plt.plot(positions, test_scores, 'ro-', label='test accuracy', linewidth=2, markersize=8)
    plt.xticks(positions, labels)
    plt.xlabel('tree depth (max_depth)')
    plt.ylabel('accuracy')
    plt.title('Decision tree overfitting: train accuracy keeps climbing while test accuracy falls off')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

demonstrate_overfitting()
✂️ Pre-pruning and Post-pruning
def pruning_strategies():
    """Compare pre-pruning strategies."""
    from sklearn.datasets import load_breast_cancer

    data = load_breast_cancer()
    X, y = data.data, data.target

    # Different pre-pruning parameters
    strategies = {
        'no limits': {},
        'min samples per leaf': {'min_samples_leaf': 10},
        'min samples to split': {'min_samples_split': 20},
        'max depth': {'max_depth': 4},
        'combined limits': {'max_depth': 4, 'min_samples_leaf': 5}
    }

    print("Pruning strategy comparison:")
    print("=" * 60)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    for strategy_name, params in strategies.items():
        tree = DecisionTreeClassifier(random_state=42, **params)
        tree.fit(X_train, y_train)
        train_acc = tree.score(X_train, y_train)
        test_acc = tree.score(X_test, y_test)
        n_nodes = tree.tree_.node_count
        max_depth = tree.tree_.max_depth
        print(f"{strategy_name:22}: "
              f"train={train_acc:.3f}, test={test_acc:.3f}, "
              f"nodes={n_nodes:3d}, depth={max_depth:2d}")

pruning_strategies()
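All the parameters above are pre-pruning: they stop the tree from growing too far in the first place. sklearn also supports post-pruning through cost-complexity pruning (its ccp_alpha parameter). A minimal sketch on the same breast-cancer data (stepping through every 5th alpha is arbitrary, just to keep the output short):
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.3, random_state=42
)

# Grow a full tree, then ask sklearn for the effective alphas along its pruning path
full_tree = DecisionTreeClassifier(random_state=42)
path = full_tree.cost_complexity_pruning_path(X_train, y_train)

# Larger ccp_alpha = more aggressive pruning; try a few values from the path
for alpha in path.ccp_alphas[::5]:
    pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=alpha)
    pruned.fit(X_train, y_train)
    print(f"ccp_alpha={alpha:.4f}: nodes={pruned.tree_.node_count:3d}, "
          f"test accuracy={pruned.score(X_test, y_test):.3f}")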
🤖 Decision Trees in Practice
🏦 Credit Card Fraud Detection
# Simulate credit card transaction data
from sklearn.metrics import accuracy_score

np.random.seed(42)

# Features: [transaction amount, hour of day, merchant type, user history score]
n_samples = 1000
transaction_amounts = np.random.exponential(scale=100, size=n_samples)  # transaction amount
transaction_hours = np.random.uniform(0, 24, size=n_samples)            # hour of day
merchant_types = np.random.choice([0, 1, 2], size=n_samples)            # merchant type: 0=online, 1=in-store, 2=ATM
user_scores = np.random.normal(0.7, 0.2, size=n_samples)                # user history score (0-1)

# Generate labels: fraudulent (1) vs. legitimate (0)
# Fraud signatures: unusual amounts, late-night transactions, ATMs, low-score users
fraud_labels = (
    ((transaction_amounts > 300) & (transaction_hours < 6)) |   # large late-night transactions
    ((merchant_types == 2) & (user_scores < 0.3)) |             # ATM + low score
    ((transaction_amounts > 500) & (user_scores < 0.4))         # very large amount + low score
).astype(int)

# Add some label noise
fraud_noise = np.random.random(n_samples) < 0.1  # flip 10% of the labels
fraud_labels[fraud_noise] = 1 - fraud_labels[fraud_noise]

X_transactions = np.column_stack([transaction_amounts, transaction_hours, merchant_types, user_scores])

print("Transaction data overview:")
print(f"Total transactions: {n_samples}")
print(f"Fraudulent: {np.sum(fraud_labels)} ({np.sum(fraud_labels)/n_samples*100:.1f}%)")
print(f"Legitimate: {n_samples - np.sum(fraud_labels)} ({(n_samples-np.sum(fraud_labels))/n_samples*100:.1f}%)")

# Train the fraud detector
fraud_detector = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=10,
    random_state=42
)
fraud_detector.fit(X_transactions, fraud_labels)

# Evaluate the model
train_pred = fraud_detector.predict(X_transactions)
train_accuracy = accuracy_score(fraud_labels, train_pred)
print("\nFraud detection model:")
print(f"Training accuracy: {train_accuracy:.3f}")

# Feature importances
feature_names = ['amount', 'hour', 'merchant type', 'user score']
importance = fraud_detector.feature_importances_
print("\nFeature importances (descending):")
for name, imp in sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True):
    print(f"{name}: {imp:.3f}")

# Score a few suspicious transactions
suspicious_transactions = np.array([
    [600, 3, 2, 0.2],   # large, late night, ATM, low score      -> likely fraud
    [50, 14, 1, 0.8],   # small, daytime, in-store, high score   -> likely legitimate
    [400, 2, 0, 0.3],   # large, late night, online, low score   -> suspicious
    [80, 10, 1, 0.6]    # moderate, daytime, in-store, mid score -> probably legitimate
])
predictions = fraud_detector.predict(suspicious_transactions)
probabilities = fraud_detector.predict_proba(suspicious_transactions)

print("\nSuspicious transaction screening:")
for i, (transaction, pred, prob) in enumerate(zip(suspicious_transactions, predictions, probabilities)):
    risk_level = "high risk" if prob[1] > 0.7 else "medium risk" if prob[1] > 0.3 else "low risk"
    status = "fraud" if pred == 1 else "legitimate"
    confidence = max(prob) * 100
    merchant = ['online', 'in-store', 'ATM'][int(transaction[2])]
    print(f"Transaction {i+1}: amount={transaction[0]:.0f}, hour={transaction[1]:.0f}, "
          f"merchant={merchant}, score={transaction[3]:.1f}")
    print(f"  prediction: {status} ({risk_level}), confidence: {confidence:.1f}%")
    print()
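One caveat on the numbers above: they are training accuracy, and with imbalanced classes plain accuracy flatters the model (always predicting "legitimate" already scores well). A sounder evaluation sketch, reusing X_transactions and fraud_labels from above with a held-out test set and per-class metrics:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Hold out a test set so we measure generalization, not memorization
X_tr, X_te, y_tr, y_te = train_test_split(
    X_transactions, fraud_labels, test_size=0.3, random_state=42, stratify=fraud_labels
)
detector = DecisionTreeClassifier(max_depth=5, min_samples_leaf=10, random_state=42)
detector.fit(X_tr, y_tr)

# Per-class precision/recall matters far more than raw accuracy here
print(classification_report(y_te, detector.predict(X_te), target_names=['legitimate', 'fraud']))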
⚠️ Strengths and Weaknesses of Decision Trees
✅ Strengths
def decision_tree_advantages():
    """Showcase the strengths of decision trees."""
    advantages = {
        "Highly interpretable": "as easy to read as a flowchart",
        "No feature scaling needed": "insensitive to the scale of the data",
        "Handles mixed data types": "numeric and categorical features both work",
        "Automatic feature selection": "unimportant features are simply ignored",
        "Nonlinear relationships": "captures complex nonlinear patterns"
    }
    print("Strengths of decision trees:")
    print("=" * 40)
    for advantage, explanation in advantages.items():
        print(f"✅ {advantage:28}: {explanation}")

    # Demonstrate that feature scaling is unnecessary
    from sklearn.preprocessing import StandardScaler

    # Features on wildly different scales
    X_unscaled = np.array([[1000, 2.5, 1], [2000, 3.1, 0], [1500, 2.8, 1]])
    y_dummy = np.array([1, 0, 1])

    print("\nFeature scaling demo:")
    print("Raw features (different scales):")
    print(f"  feature 1: {X_unscaled[:, 0]} (range: {X_unscaled[:, 0].min()}-{X_unscaled[:, 0].max()})")
    print(f"  feature 2: {X_unscaled[:, 1]} (range: {X_unscaled[:, 1].min():.1f}-{X_unscaled[:, 1].max():.1f})")
    print(f"  feature 3: {X_unscaled[:, 2]} (range: {X_unscaled[:, 2].min()}-{X_unscaled[:, 2].max()})")

    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_unscaled)
    print("After standardization:")
    print(f"  feature 1: {X_scaled[:, 0].round(2)} (range: {X_scaled[:, 0].min():.2f}-{X_scaled[:, 0].max():.2f})")
    print(f"  feature 2: {X_scaled[:, 1].round(2)} (range: {X_scaled[:, 1].min():.2f}-{X_scaled[:, 1].max():.2f})")
    print(f"  feature 3: {X_scaled[:, 2].round(2)} (range: {X_scaled[:, 2].min():.2f}-{X_scaled[:, 2].max():.2f})")

    # The decision tree treats both versions the same
    tree1 = DecisionTreeClassifier(random_state=42)
    tree2 = DecisionTreeClassifier(random_state=42)
    tree1.fit(X_unscaled, y_dummy)
    tree2.fit(X_scaled, y_dummy)
    print("\nFeature importance comparison:")
    print(f"Unscaled:     {tree1.feature_importances_.round(3)}")
    print(f"Standardized: {tree2.feature_importances_.round(3)}")
    print("(Note: the importances are nearly identical! Trees are unaffected by feature scale)")

decision_tree_advantages()
❌ Weaknesses
def decision_tree_disadvantages():
    """Showcase the weaknesses of decision trees."""
    disadvantages = {
        "Prone to overfitting": "a deep tree memorizes the training data",
        "Unstable": "tiny data changes can yield a completely different tree",
        "Biased splits": "tends to favor features with many distinct values",
        "Greedy, not optimal": "the greedy algorithm may miss the globally best tree",
        "Poor extrapolation": "unreliable outside the range of the training data"
    }
    print("Weaknesses of decision trees:")
    print("=" * 40)
    for disadvantage, explanation in disadvantages.items():
        print(f"❌ {disadvantage:22}: {explanation}")

    # Demonstrate instability
    from sklearn.tree import export_text

    # Perturb the data slightly and watch the tree change
    X_base = np.array([[1, 2], [2, 3], [3, 1], [4, 2], [5, 3]])
    y_base = np.array([0, 0, 1, 1, 1])

    # Tree trained on the original data
    tree1 = DecisionTreeClassifier(random_state=42, max_depth=3)
    tree1.fit(X_base, y_base)
    rules1 = export_text(tree1, feature_names=['feature A', 'feature B'])

    # Tree trained on slightly perturbed data
    X_noisy = X_base + np.random.normal(0, 0.1, X_base.shape)
    tree2 = DecisionTreeClassifier(random_state=42, max_depth=3)
    tree2.fit(X_noisy, y_base)
    rules2 = export_text(tree2, feature_names=['feature A', 'feature B'])

    print("\nInstability demo:")
    print("Rules from the original data:")
    print(rules1)
    print("\nRules from the perturbed data:")
    print(rules2)
    print("(Note: a tiny perturbation of the data can produce noticeably different decision rules!)")

decision_tree_disadvantages()
📝 Key Takeaways
- Decision trees mimic human decision-making: a sequence of binary choices narrows down to a conclusion
- Information theory is the foundation: entropy measures uncertainty, and information gain guides the best split
- Extremely interpretable: the decision rules read like a flowchart
- Little preprocessing required: insensitive to feature scale, handles mixed data types
- Prone to overfitting: pruning is needed to control complexity
- Unstable: small changes in the data can yield a completely different tree
- Widely applied in practice: from medical diagnosis to financial risk control
🎯 Exercises
- Build by hand: given a small dataset, sketch the decision tree structure on paper
- Business application: pick a domain you know (e-commerce, education, healthcare) and design a set of decision-tree rules
- Parameter tuning: experiment with different pruning parameters and observe the effect on model performance
- Feature engineering: craft new feature combinations for the tree and see whether classification improves
- Model ensembles: try combining decision trees with other techniques, such as random forests (a starter sketch follows below)
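As a starting point for the last exercise, here is a minimal sketch comparing a single tree against sklearn's RandomForestClassifier on the Iris data; averaging many randomized trees directly addresses the instability weakness discussed earlier (the parameter choices are illustrative):
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()

# Compare a single tree against a forest of 100 randomized trees
single_tree = DecisionTreeClassifier(max_depth=3, random_state=42)
forest = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)

for name, model in [('single tree', single_tree), ('random forest', forest)]:
    scores = cross_val_score(model, iris.data, iris.target, cv=5)
    print(f"{name}: mean CV accuracy={scores.mean():.3f} (+/- {scores.std():.3f})")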
🔮 Coming Up Next
Part 7: Neural Networks, Computers That Mimic the Brain
Decision trees are highly interpretable, but they fall short on complex tasks such as image recognition and speech understanding.
The human brain has roughly 86 billion neurons wired into an intricate network. Neural networks are algorithms inspired by that design:
- They can recognize cats and dogs in photos
- They can understand human language
- They can beat world champions at Go
- They can create works of art
We'll study the core technology behind AI's "intelligence". Ready to explore the marvelous world of artificial neural networks? 🧠✨