机器学习自学笔记(4)——KNN算法详解与NumPy实现

1. KNN算法基本概念

1.1 什么是KNN算法

**K-Nearest Neighbors(K近邻算法)**是一种基于实例的学习方法,也称为懒惰学习(Lazy Learning)。它的核心思想是:

“近朱者赤,近墨者黑” - 一个样本的类别由其最近的K个邻居决定

1.2 算法原理

  1. 存储所有训练数据(不需要显式的训练过程)
  2. 计算待预测点与所有训练点的距离
  3. 选择距离最近的K个点
  4. 根据这K个点的标签进行预测
    • 分类:多数投票
    • 回归:平均值

1.3 为什么需要特征标准化

import numpy as np
import matplotlib.pyplot as plt

# 演示特征标准化的重要性
def demonstrate_scaling_importance():
    """
    Show how feature scale affects the distances KNN relies on.

    Draws 100 synthetic (age, income) samples with a fixed seed, measures
    the Euclidean distance from every sample to the query point
    (age 25, income 45000) on the raw values and again after z-score
    standardisation, then prints the five smallest distances of each kind.
    Returns nothing.
    """
    np.random.seed(42)

    # Raw features living on very different numeric scales.
    age = np.random.normal(30, 10, 100)           # mean 30, std 10
    income = np.random.normal(50000, 15000, 100)  # mean 50000, std 15000

    # Two reference points; only the first one is used as the query below.
    point1 = np.array([25, 45000])
    point2 = np.array([35, 55000])

    # Distances on the raw scale.
    distances_raw = [
        np.sqrt((a - point1[0])**2 + (s - point1[1])**2)
        for a, s in zip(age, income)
    ]

    # Z-score statistics, computed once and reused for data and query.
    age_mean, age_sigma = np.mean(age), np.std(age)
    income_mean, income_sigma = np.mean(income), np.std(income)

    age_std = (age - age_mean) / age_sigma
    income_std = (income - income_mean) / income_sigma
    query_age = (25 - age_mean) / age_sigma
    query_income = (45000 - income_mean) / income_sigma

    # Distances after both features share the same scale.
    distances_std = [
        np.sqrt((a - query_age)**2 + (s - query_income)**2)
        for a, s in zip(age_std, income_std)
    ]

    print("=== 特征标准化的重要性 ===")
    print(f"未标准化距离的前5个: {sorted(distances_raw)[:5]}")
    print(f"标准化后距离的前5个: {sorted(distances_std)[:5]}")
    print("\n可以看到,收入的大数值主导了距离计算,年龄的影响被忽略了")

demonstrate_scaling_importance()

2. KNN算法实现

2.1 基础KNN分类器

class KNNClassifier:
    """
    Brute-force K-nearest-neighbours classifier implemented with NumPy.

    Training only stores the data. Prediction measures the Euclidean
    distance from the query to every stored sample one by one, keeps the
    ``k`` closest samples and returns the most frequent label among them.
    """

    def __init__(self, k=3):
        """
        Create an unfitted classifier.

        Parameters:
        k: number of neighbours that vote on each prediction (>= 1)

        Raises:
        ValueError: if k is smaller than 1
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """
        Store the training data (KNN has no real training step).

        Parameters:
        X: training feature matrix (n_samples, n_features); array or nested list
        y: training labels (n_samples,); array or list

        Raises:
        ValueError: if X and y do not contain the same number of samples
        """
        # np.array always copies, so later changes by the caller cannot
        # alter the stored training set.
        X = np.array(X)
        y = np.array(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y have different lengths: {len(X)} != {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def _euclidean_distance(self, point1, point2):
        """
        Return the Euclidean distance between two points.
        """
        return np.sqrt(np.sum((point1 - point2) ** 2))

    def _get_neighbors(self, test_point):
        """
        Find the k training samples closest to a query point.

        Parameters:
        test_point: the point to classify

        Returns:
        neighbors: list of the k training indices, nearest first

        Raises:
        RuntimeError: if fit() has not been called yet
        ValueError: if k exceeds the number of stored training samples
        """
        if self.X_train is None:
            raise RuntimeError("fit() must be called before predicting")
        n_train = len(self.X_train)
        if self.k > n_train:
            raise ValueError(
                f"k={self.k} is larger than the number of training "
                f"samples ({n_train})"
            )

        # Distance from the query to every training sample.
        distances = [
            (self._euclidean_distance(test_point, train_point), i)
            for i, train_point in enumerate(self.X_train)
        ]

        # list.sort is stable, so equal distances keep training order.
        distances.sort(key=lambda pair: pair[0])
        return [index for _, index in distances[:self.k]]

    def predict_single(self, test_point):
        """
        Predict the label of one point by majority vote.

        Ties are resolved in favour of the smallest label, because
        np.unique returns labels sorted and np.argmax picks the first maximum.
        """
        neighbors = self._get_neighbors(np.asarray(test_point))
        neighbor_labels = [self.y_train[i] for i in neighbors]

        unique_labels, counts = np.unique(neighbor_labels, return_counts=True)
        return unique_labels[np.argmax(counts)]

    def predict(self, X_test):
        """
        Predict the labels of several points.

        Parameters:
        X_test: query matrix (n_test, n_features)

        Returns:
        array with one predicted label per query row
        """
        return np.array([self.predict_single(point) for point in X_test])

    def score(self, X_test, y_test):
        """
        Return the accuracy (fraction of correct predictions) on a test set.
        """
        predictions = self.predict(X_test)
        return np.mean(predictions == np.asarray(y_test))

# 使用示例
def knn_classification_example():
    """
    Run KNNClassifier on two well-separated synthetic 2-D clusters.

    Generates 50 points around (2, 2) labelled 0 and 50 points around
    (6, 6) labelled 1, shuffles them into an 80/20 train/test split, fits a
    3-nearest-neighbour classifier and prints the test accuracy together
    with the predicted and true labels.
    """
    np.random.seed(42)

    # Draw order matters for reproducibility: class 0 x/y, then class 1 x/y.
    class0_x = np.random.normal(2, 0.5, 50)
    class0_y = np.random.normal(2, 0.5, 50)
    class1_x = np.random.normal(6, 0.5, 50)
    class1_y = np.random.normal(6, 0.5, 50)

    # Stack the two clusters on top of each other: rows 0-49 are class 0.
    X = np.vstack([
        np.column_stack([class0_x, class0_y]),
        np.column_stack([class1_x, class1_y]),
    ])
    y = np.repeat([0.0, 1.0], 50)

    # Shuffled 80/20 split.
    train_size = int(0.8 * len(X))
    shuffled = np.random.permutation(len(X))
    train_indices, test_indices = np.split(shuffled, [train_size])

    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    knn = KNNClassifier(k=3)
    knn.fit(X_train, y_train)

    predictions = knn.predict(X_test)
    accuracy = knn.score(X_test, y_test)

    print("=== KNN分类结果 ===")
    print(f"测试集准确率: {accuracy:.4f}")
    print(f"预测结果: {predictions}")
    print(f"真实标签: {y_test}")

knn_classification_example()

2.2 优化版KNN(向量化计算)

class OptimizedKNNClassifier:
    """
    KNN classifier that computes all query/train distances in one
    vectorized NumPy expression instead of a Python loop per pair.

    Supported values for ``distance_metric`` are 'euclidean' and 'manhattan'.
    If ``k`` exceeds the number of training samples, all training samples
    are used as neighbours.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        """
        Parameters:
        k: number of neighbours that vote on each prediction (>= 1)
        distance_metric: 'euclidean' or 'manhattan'

        Raises:
        ValueError: if k is smaller than 1
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """
        Store a copy of the training data.

        Parameters:
        X: training feature matrix (n_samples, n_features)
        y: training labels (n_samples,)

        Raises:
        ValueError: if X and y do not contain the same number of samples
        """
        X = np.array(X)
        y = np.array(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y have different lengths: {len(X)} != {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def _calculate_distances(self, X_test):
        """
        Return the (n_test, n_train) matrix of query-to-train distances.

        Raises:
        ValueError: if distance_metric is not a supported name
        """
        X_test = np.asarray(X_test)

        if self.distance_metric == 'euclidean':
            # Expand ||a - b||^2 = ||a||^2 + ||b||^2 - 2*a.b so the whole
            # matrix comes from one matrix product plus broadcasting.
            test_sq = np.sum(X_test**2, axis=1, keepdims=True)  # (n_test, 1)
            train_sq = np.sum(self.X_train**2, axis=1)          # (n_train,)
            cross_term = 2 * np.dot(X_test, self.X_train.T)     # (n_test, n_train)

            distances = test_sq + train_sq - cross_term
            # Rounding can push tiny squared distances below zero.
            return np.sqrt(np.maximum(distances, 0))

        if self.distance_metric == 'manhattan':
            # Broadcast to (n_test, n_train, n_features), then sum features.
            return np.sum(np.abs(X_test[:, np.newaxis] - self.X_train), axis=2)

        raise ValueError(
            f"unsupported distance_metric {self.distance_metric!r}; "
            "expected 'euclidean' or 'manhattan'"
        )

    def _k_nearest_labels(self, X_test):
        """
        Return the labels of the nearest neighbours of every query row,
        shape (n_test, min(k, n_train)), nearest first.
        """
        distances = self._calculate_distances(X_test)
        # A stable sort makes ties resolve deterministically in favour of
        # the sample that comes first in the training set.
        nearest = np.argsort(distances, axis=1, kind="stable")[:, :self.k]
        return self.y_train[nearest]

    def predict(self, X_test):
        """
        Predict one label per query row by majority vote.

        Vote ties go to the smallest label (np.unique sorts the labels and
        np.argmax returns the first maximum).
        """
        predictions = []
        for labels in self._k_nearest_labels(X_test):
            unique_labels, counts = np.unique(labels, return_counts=True)
            predictions.append(unique_labels[np.argmax(counts)])

        return np.array(predictions)

    def predict_proba(self, X_test):
        """
        Estimate class probabilities as neighbour vote fractions.

        Returns:
        array of shape (n_test, n_classes); column j corresponds to the
        j-th entry of np.unique(y_train), and every row sums to 1
        """
        classes = np.unique(self.y_train)
        neighbor_labels = self._k_nearest_labels(X_test)

        # Compare every neighbour label against every class. This works for
        # arbitrary label values, not only the integers 0..n_classes-1.
        votes = (neighbor_labels[:, :, np.newaxis] == classes).sum(axis=1)

        # Divide by the number of neighbours actually found, which is
        # smaller than k when the training set has fewer than k samples.
        return votes / neighbor_labels.shape[1]

# 性能对比
def compare_knn_performance():
    """
    Time the loop-based and the vectorized classifier on the same data.

    Fits KNNClassifier and OptimizedKNNClassifier (both k=5) on 1000 random
    10-dimensional samples with three classes, predicts 100 random query
    points with each, and prints both wall-clock times, the speed-up factor
    and whether the two prediction arrays are identical.
    """
    import time

    np.random.seed(42)

    # A data set large enough for the timing difference to be visible.
    n_samples = 1000
    n_features = 10
    X = np.random.randn(n_samples, n_features)
    y = np.random.randint(0, 3, n_samples)
    X_test = np.random.randn(100, n_features)

    def timed_predict(model):
        # perf_counter is the high-resolution clock intended for measuring
        # short durations; time.time() can report 0 elapsed on coarse clocks.
        start = time.perf_counter()
        predictions = model.predict(X_test)
        return predictions, time.perf_counter() - start

    # Loop-based version
    knn_basic = KNNClassifier(k=5)
    knn_basic.fit(X, y)
    pred_basic, time_basic = timed_predict(knn_basic)

    # Vectorized version
    knn_opt = OptimizedKNNClassifier(k=5)
    knn_opt.fit(X, y)
    pred_opt, time_opt = timed_predict(knn_opt)

    # Guard the ratio so a zero measurement cannot raise ZeroDivisionError.
    speedup = time_basic / time_opt if time_opt > 0 else float("inf")

    print("=== 性能对比 ===")
    print(f"基础版本用时: {time_basic:.4f}秒")
    print(f"优化版本用时: {time_opt:.4f}秒")
    print(f"加速比: {speedup:.2f}x")
    print(f"预测结果一致性: {np.array_equal(pred_basic, pred_opt)}")

compare_knn_performance()

2.3 KNN回归器

class KNNRegressor:
    """
    K-nearest-neighbours regressor using Euclidean distance.

    A prediction is the average of the target values of the nearest
    training samples, either unweighted or weighted by inverse distance.
    If ``k`` exceeds the number of training samples, all of them are used.
    """

    def __init__(self, k=3, weights='uniform'):
        """
        Parameters:
        k: number of neighbours to average over (>= 1)
        weights: weighting scheme, 'uniform' or 'distance'

        Raises:
        ValueError: if k is smaller than 1
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        self.k = k
        self.weights = weights
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """
        Store a copy of the training data.

        Parameters:
        X: training feature matrix (n_samples, n_features)
        y: training targets (n_samples,)

        Raises:
        ValueError: if X and y do not contain the same number of samples
        """
        X = np.array(X)
        y = np.array(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y have different lengths: {len(X)} != {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def _calculate_distances(self, X_test):
        """Return the (n_test, n_train) matrix of Euclidean distances."""
        X_test = np.asarray(X_test)
        # ||a - b||^2 = ||a||^2 + ||b||^2 - 2*a.b, evaluated for all pairs.
        test_sq = np.sum(X_test**2, axis=1, keepdims=True)
        train_sq = np.sum(self.X_train**2, axis=1)
        cross_term = 2 * np.dot(X_test, self.X_train.T)

        distances = test_sq + train_sq - cross_term
        # Rounding can push tiny squared distances below zero.
        return np.sqrt(np.maximum(distances, 0))

    def predict(self, X_test):
        """
        Predict one continuous value per query row.

        Parameters:
        X_test: query matrix (n_test, n_features)

        Returns:
        array of shape (n_test,) with the predicted values

        Raises:
        ValueError: if weights is neither 'uniform' nor 'distance'
        """
        if self.weights not in ('uniform', 'distance'):
            raise ValueError(
                f"unsupported weights {self.weights!r}; "
                "expected 'uniform' or 'distance'"
            )

        distances = self._calculate_distances(X_test)
        # A stable sort keeps the choice among equidistant neighbours
        # deterministic (earlier training samples win).
        nearest = np.argsort(distances, axis=1, kind="stable")[:, :self.k]
        neighbor_values = self.y_train[nearest]  # (n_test, n_neighbors)

        if self.weights == 'uniform':
            # Plain average of the neighbour targets.
            return np.mean(neighbor_values, axis=1)

        # Inverse-distance weighting: closer neighbours count more. The
        # small constant avoids division by zero for exact matches.
        neighbor_distances = np.take_along_axis(distances, nearest, axis=1)
        weights = 1 / (neighbor_distances + 1e-8)
        return np.sum(weights * neighbor_values, axis=1) / np.sum(weights, axis=1)

# KNN回归示例
def knn_regression_example():
    """
    Compare uniform and distance-weighted KNN regression on a noisy sine.

    Fits KNNRegressor on 100 noisy samples of sin(x) over [0, 10] for
    k in (1, 3, 5, 10) with both weighting schemes and prints the mean
    squared error against the noise-free sine on 50 evenly spaced points.
    """
    np.random.seed(42)

    # Training set: sine curve plus Gaussian noise (std 0.1).
    X = np.linspace(0, 10, 100).reshape(-1, 1)
    noise = np.random.normal(0, 0.1, X.shape[0])
    y = np.sin(X).ravel() + noise

    # Evaluation grid with noise-free targets.
    X_test = np.linspace(0, 10, 50).reshape(-1, 1)
    y_true = np.sin(X_test).ravel()

    print("=== KNN回归结果 ===")

    for k in [1, 3, 5, 10]:
        mse = {}
        for scheme in ('uniform', 'distance'):
            model = KNNRegressor(k=k, weights=scheme)
            model.fit(X, y)
            residuals = y_true - model.predict(X_test)
            mse[scheme] = np.mean(residuals ** 2)

        print(f"K={k}:")
        print(f"  均匀权重 MSE: {mse['uniform']:.6f}")
        print(f"  距离权重 MSE: {mse['distance']:.6f}")

knn_regression_example()

3. 不同距离度量

3.1 常用距离度量

class DistanceMetrics:
    """
    Collection of common distance functions between two vectors.

    Every method is static and accepts NumPy arrays or plain sequences of
    equal length.
    """

    @staticmethod
    def euclidean_distance(x1, x2):
        """Euclidean (L2) distance: square root of the summed squared differences."""
        x1, x2 = np.asarray(x1), np.asarray(x2)
        return np.sqrt(np.sum((x1 - x2) ** 2))

    @staticmethod
    def manhattan_distance(x1, x2):
        """Manhattan (L1) distance: sum of the absolute differences."""
        x1, x2 = np.asarray(x1), np.asarray(x2)
        return np.sum(np.abs(x1 - x2))

    @staticmethod
    def chebyshev_distance(x1, x2):
        """Chebyshev (L-infinity) distance: largest absolute difference."""
        x1, x2 = np.asarray(x1), np.asarray(x2)
        return np.max(np.abs(x1 - x2))

    @staticmethod
    def cosine_distance(x1, x2):
        """
        Cosine distance: 1 minus the cosine of the angle between the vectors.

        Raises:
        ValueError: if either vector has zero length, because the angle is
            then undefined
        """
        x1, x2 = np.asarray(x1), np.asarray(x2)
        norm_product = np.linalg.norm(x1) * np.linalg.norm(x2)
        if norm_product == 0:
            raise ValueError("cosine distance is undefined for zero vectors")

        cosine = np.dot(x1, x2) / norm_product
        # Rounding can push the cosine marginally outside [-1, 1], which
        # would produce a slightly negative distance for parallel vectors.
        return 1 - np.clip(cosine, -1.0, 1.0)

    @staticmethod
    def hamming_distance(x1, x2):
        """
        Hamming distance (for binary/categorical features): the fraction of
        positions at which the two vectors differ.

        Raises:
        ValueError: if the vectors differ in shape or are empty
        """
        # Without the conversion, comparing two Python lists with != gives
        # a single bool instead of an element-wise result.
        x1, x2 = np.asarray(x1), np.asarray(x2)
        if x1.shape != x2.shape:
            raise ValueError(
                f"shape mismatch: {x1.shape} vs {x2.shape}"
            )
        if x1.size == 0:
            raise ValueError("hamming distance is undefined for empty vectors")
        return np.sum(x1 != x2) / len(x1)

# 距离度量比较
def compare_distance_metrics():
    """
    Print four different distances between the points (1, 2, 3) and (4, 5, 6).
    """
    point1 = np.array([1, 2, 3])
    point2 = np.array([4, 5, 6])

    metrics = DistanceMetrics()
    # Output label paired with the metric that produces its value.
    labelled_metrics = [
        ("欧几里得距离", metrics.euclidean_distance),
        ("曼哈顿距离", metrics.manhattan_distance),
        ("切比雪夫距离", metrics.chebyshev_distance),
        ("余弦距离", metrics.cosine_distance),
    ]

    print("=== 不同距离度量比较 ===")
    print(f"点1: {point1}")
    print(f"点2: {point2}")
    for label, metric in labelled_metrics:
        print(f"{label}: {metric(point1, point2):.4f}")

compare_distance_metrics()

4. KNN算法的优缺点

4.1 优点

  1. 简单易懂:算法原理直观,易于理解和实现
  2. 非参数方法:不需要对数据分布做假设,模型直接由训练样本本身决定
  3. 适用于多分类:天然支持多分类问题
  4. 局部敏感:能够处理复杂的决策边界
  5. 在线学习:新数据可以直接加入训练集

4.2 缺点

  1. 计算复杂度高:预测时需要计算与所有训练样本的距离
  2. 存储开销大:需要存储所有训练数据
  3. 对维度敏感:高维数据会导致"维度灾难"
  4. 对噪声敏感:异常点会影响预测结果
  5. 需要特征标准化:不同量纲的特征会影响距离计算

5. KNN算法的优化和变体

5.1 K-D树优化

class KDTree:
    """
    K-D tree for accelerating nearest-neighbour search.

    Each node holds one pivot point. Points on the smaller side of the
    pivot along the node's split dimension go to the left subtree, the
    others to the right subtree. The split dimension cycles with depth.

    Attributes:
    points: all points contained in this subtree
    depth: depth of this node (the root is 0)
    pivot: the point stored at this node (None for an empty tree)
    split_dim / split_value: splitting axis and the pivot's coordinate on
        it (both None for leaves and for an empty tree)
    left / right: child subtrees, or None
    """

    def __init__(self, points, depth=0):
        """
        Build the subtree for the given points.

        Parameters:
        points: array-like of shape (n_points, n_features)
        depth: depth of this node; selects the split dimension
        """
        self.points = np.asarray(points)
        self.depth = depth
        self.left = None
        self.right = None
        self.split_dim = None
        self.split_value = None
        self.pivot = None

        if len(self.points) > 1:
            self._build_tree()
        elif len(self.points) == 1:
            # A single point forms a leaf: it is the pivot and has no split.
            self.pivot = self.points[0]

    def _build_tree(self):
        """Split the points at the median and build both subtrees recursively."""
        n_features = self.points.shape[1]
        self.split_dim = self.depth % n_features

        # Sort along the split dimension and take the median as the pivot.
        order = self.points[:, self.split_dim].argsort()
        sorted_points = self.points[order]
        median_idx = len(sorted_points) // 2

        self.pivot = sorted_points[median_idx]
        self.split_value = sorted_points[median_idx, self.split_dim]

        # The pivot itself stays in this node; the rest goes to the children.
        left_points = sorted_points[:median_idx]
        right_points = sorted_points[median_idx+1:]

        if len(left_points) > 0:
            self.left = KDTree(left_points, self.depth + 1)
        if len(right_points) > 0:
            self.right = KDTree(right_points, self.depth + 1)

    def nearest_neighbors(self, query_point, k=1):
        """
        Find the k points closest to the query (Euclidean distance).

        Walks the tree depth-first, visiting the side of each split that
        contains the query first, and skips a subtree when the distance
        from the query to its splitting plane already exceeds the current
        k-th best distance.

        Parameters:
        query_point: array-like with n_features values
        k: number of neighbours to return

        Returns:
        list of at most k points, nearest first; empty if k < 1 or the
        tree contains no points
        """
        import heapq

        if k < 1 or self.pivot is None:
            return []

        query = np.asarray(query_point, dtype=float).ravel()

        # Max-heap of the best candidates so far, stored as
        # (-distance, visit number, point). The visit number is unique, so
        # tuple comparison never reaches the (non-comparable) arrays.
        best = []
        visited = 0

        # Each stack entry carries a lower bound on the distance from the
        # query to any point inside that subtree.
        stack = [(self, 0.0)]
        while stack:
            node, bound = stack.pop()
            if len(best) == k and bound > -best[0][0]:
                continue  # nothing in this subtree can improve the result

            dist = np.linalg.norm(query - node.pivot)
            if len(best) < k:
                heapq.heappush(best, (-dist, visited, node.pivot))
            elif dist < -best[0][0]:
                heapq.heapreplace(best, (-dist, visited, node.pivot))
            visited += 1

            if node.split_dim is None:
                continue  # leaf

            gap = query[node.split_dim] - node.split_value
            if gap < 0:
                near, far = node.left, node.right
            else:
                near, far = node.right, node.left

            # Push the far side first so the near side is explored first.
            if far is not None:
                stack.append((far, max(bound, abs(gap))))
            if near is not None:
                stack.append((near, bound))

        best.sort(key=lambda entry: (-entry[0], entry[1]))
        return [point for _, _, point in best]

print("=== K-D树优化示例 ===")
print("K-D树可以将KNN搜索的时间复杂度从O(n)降低到O(log n)")

5.2 加权KNN

def weighted_knn_example():
    """
    Compare three neighbour-weighting schemes on one query point.

    Generates 100 random 2-D points labelled 1 when x + y > 0 (else 0),
    finds the 5 nearest neighbours of (0.5, 0.5) and prints the predicted
    score under uniform, inverse-distance and Gaussian weighting.
    """
    np.random.seed(42)

    X = np.random.randn(100, 2)
    y = (X[:, 0] + X[:, 1] > 0).astype(int)
    test_point = np.array([[0.5, 0.5]])

    # The k training points closest to the query.
    k = 5
    distances = np.linalg.norm(X - test_point, axis=1)
    nearest_indices = np.argsort(distances)[:k]
    nearest_distances = distances[nearest_indices]
    nearest_labels = y[nearest_indices]

    def weighted_vote(w):
        # Weighted average of the neighbour labels.
        return np.sum(w * nearest_labels) / np.sum(w)

    print("=== 加权KNN比较 ===")

    # 1. Uniform: every neighbour counts the same.
    uniform_pred = np.mean(nearest_labels)
    print(f"均匀权重预测: {uniform_pred:.4f}")

    # 2. Inverse distance; the small constant avoids division by zero.
    distance_pred = weighted_vote(1 / (nearest_distances + 1e-8))
    print(f"距离权重预测: {distance_pred:.4f}")

    # 3. Gaussian kernel whose bandwidth is the spread of the distances.
    sigma = np.std(nearest_distances)
    gaussian_pred = weighted_vote(np.exp(-nearest_distances**2 / (2 * sigma**2)))
    print(f"高斯权重预测: {gaussian_pred:.4f}")

weighted_knn_example()

6. 实际应用场景

6.1 推荐系统

def recommendation_system_example():
    """
    User-based collaborative filtering demo built on KNN.

    Simulates a sparse 100 x 50 rating matrix (0 means "not rated"), finds
    the 5 users most similar to user 0 by cosine similarity over co-rated
    items, predicts user 0's rating for each unrated item as the
    similarity-weighted average of those neighbours' ratings, and prints
    the neighbours and the five best predictions.
    """
    np.random.seed(42)

    # Simulated ratings: 70% unrated (0), the rest spread over 1..5.
    n_users, n_items = 100, 50
    ratings = np.random.choice([0, 1, 2, 3, 4, 5], size=(n_users, n_items),
                               p=[0.7, 0.05, 0.05, 0.05, 0.05, 0.1])

    def cosine_similarity(user1, user2):
        # Restrict both vectors to the items that both users rated.
        both_rated = (user1 != 0) & (user2 != 0)
        if np.sum(both_rated) == 0:
            return 0

        shared1 = user1[both_rated]
        shared2 = user2[both_rated]

        dot_product = np.dot(shared1, shared2)
        norm1 = np.linalg.norm(shared1)
        norm2 = np.linalg.norm(shared2)
        if norm1 == 0 or norm2 == 0:
            return 0

        return dot_product / (norm1 * norm2)

    target_user = 0
    target_ratings = ratings[target_user]

    # Similarity to every other user; starting at 1 skips the target
    # itself, which relies on the target being user 0.
    similarities = [
        (cosine_similarity(target_ratings, ratings[i]), i)
        for i in range(1, n_users)
    ]

    # Highest similarity first (ties: higher user id first).
    k = 5
    similarities.sort(reverse=True)
    top_neighbours = similarities[:k]
    similar_users = [user_id for _, user_id in top_neighbours]

    print("=== 基于KNN的推荐系统 ===")
    print(f"目标用户: {target_user}")
    print(f"最相似的{k}个用户: {similar_users}")

    # Score each item the target has not rated, using only the
    # neighbours that actually rated it.
    recommendations = []
    for item in np.where(target_ratings == 0)[0]:
        raters = [
            (sim, user_id) for sim, user_id in top_neighbours
            if ratings[user_id, item] > 0
        ]
        weighted_rating = sum(sim * ratings[user_id, item] for sim, user_id in raters)
        total_similarity = sum(abs(sim) for sim, _ in raters)

        if total_similarity > 0:
            recommendations.append((item, weighted_rating / total_similarity))

    # Best predicted rating first.
    recommendations.sort(key=lambda pair: pair[1], reverse=True)
    print(f"推荐物品(前5个): {recommendations[:5]}")

recommendation_system_example()

7. 总结

KNN算法是机器学习中的基础算法,具有以下特点:

核心优势

  • 简单直观:易于理解和实现
  • 无模型假设:不需要对数据分布做假设
  • 适应性强:可用于分类和回归

主要挑战

  • 计算开销:预测时间复杂度高
  • 存储需求:需要保存所有训练数据
  • 特征标准化:必须处理不同量纲的特征

实际应用

  • 推荐系统:协同过滤
  • 图像识别:相似图像检索
  • 文本分类:相似文档匹配
  • 异常检测:识别异常模式

理解KNN算法有助于掌握基于实例的学习方法,为学习更复杂的机器学习算法打下基础。

评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值