1. KNN Algorithm Fundamentals
1.1 What Is KNN?
**K-Nearest Neighbors (KNN)** is an instance-based learning method, also known as lazy learning. Its core idea is:
"You are known by the company you keep" - a sample's label is determined by its K nearest neighbors.
1.2 How the Algorithm Works
- Store all training data (there is no explicit training phase)
- Compute the distance between the query point and every training point
- Select the K points with the smallest distances
- Predict from the labels of those K points (a minimal sketch follows this list):
  - Classification: majority vote
  - Regression: mean of the neighbors' values
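These four steps fit in a few lines of NumPy. Below is a minimal, hedged sketch of a single classification query; the arrays `X_train`, `y_train`, `query` and the choice `k=3` are illustrative only, not from the original text:

```python
import numpy as np

# Illustrative data: four training points, two classes, one query, k=3
X_train = np.array([[1.0, 1.0], [1.2, 0.8], [5.0, 5.0], [5.1, 4.9]])
y_train = np.array([0, 0, 1, 1])
query = np.array([1.1, 0.9])
k = 3

dists = np.linalg.norm(X_train - query, axis=1)  # step 2: distance to every training point
nearest = np.argsort(dists)[:k]                  # step 3: indices of the k closest points
labels, counts = np.unique(y_train[nearest], return_counts=True)
print(labels[np.argmax(counts)])                 # step 4 (classification): majority vote -> 0
```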
1.3 Why Feature Scaling Matters
When features live on very different scales, the larger-scale feature dominates the Euclidean distance. The demo below makes this concrete with age and income:
```python
import numpy as np

def demonstrate_scaling_importance():
    """
    Demonstrate why feature scaling matters for KNN.
    """
    np.random.seed(42)
    # Raw (unscaled) features
    age = np.random.normal(30, 10, 100)           # age: mean 30, std 10
    income = np.random.normal(50000, 15000, 100)  # income: mean 50k, std 15k
    # A query point: age 25, income 45k
    point1 = np.array([25, 45000])
    # Distances on the raw features
    distances_raw = []
    for i in range(len(age)):
        dist = np.sqrt((age[i] - point1[0])**2 + (income[i] - point1[1])**2)
        distances_raw.append(dist)
    # Standardize each feature (zero mean, unit variance)
    age_std = (age - np.mean(age)) / np.std(age)
    income_std = (income - np.mean(income)) / np.std(income)
    point1_std = [(25 - np.mean(age)) / np.std(age),
                  (45000 - np.mean(income)) / np.std(income)]
    # Distances on the standardized features
    distances_std = []
    for i in range(len(age_std)):
        dist = np.sqrt((age_std[i] - point1_std[0])**2 + (income_std[i] - point1_std[1])**2)
        distances_std.append(dist)
    print("=== Why feature scaling matters ===")
    print(f"5 smallest raw distances: {sorted(distances_raw)[:5]}")
    print(f"5 smallest standardized distances: {sorted(distances_std)[:5]}")
    print("\nOn the raw features, the large income values dominate the distance "
          "and the age feature is effectively ignored.")

demonstrate_scaling_importance()
```
2. Implementing KNN
2.1 A Basic KNN Classifier
```python
class KNNClassifier:
    """
    A NumPy implementation of a KNN classifier.
    """
    def __init__(self, k=3):
        """
        Parameters:
            k: number of neighbors
        """
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """
        "Train" the classifier (KNN just stores the data).

        Parameters:
            X: training feature matrix, shape (n_samples, n_features)
            y: training label vector, shape (n_samples,)
        """
        self.X_train = X.copy()
        self.y_train = y.copy()

    def _euclidean_distance(self, point1, point2):
        """Euclidean distance between two points."""
        return np.sqrt(np.sum((point1 - point2) ** 2))

    def _get_neighbors(self, test_point):
        """
        Return the indices of the k nearest training points.
        """
        # Distance from the test point to every training point
        distances = []
        for i, train_point in enumerate(self.X_train):
            dist = self._euclidean_distance(test_point, train_point)
            distances.append((dist, i))
        # Sort by distance and keep the first k
        distances.sort(key=lambda x: x[0])
        neighbors = [distances[i][1] for i in range(self.k)]
        return neighbors

    def predict_single(self, test_point):
        """Predict the class of a single point."""
        neighbors = self._get_neighbors(test_point)
        neighbor_labels = [self.y_train[i] for i in neighbors]
        # Majority vote
        unique_labels, counts = np.unique(neighbor_labels, return_counts=True)
        return unique_labels[np.argmax(counts)]

    def predict(self, X_test):
        """Predict the classes of multiple points."""
        predictions = []
        for test_point in X_test:
            predictions.append(self.predict_single(test_point))
        return np.array(predictions)

    def score(self, X_test, y_test):
        """Accuracy on a test set."""
        predictions = self.predict(X_test)
        return np.mean(predictions == y_test)


def knn_classification_example():
    """
    KNN classification example.
    """
    np.random.seed(42)
    # Class 0: points around (2, 2)
    class0_x = np.random.normal(2, 0.5, 50)
    class0_y = np.random.normal(2, 0.5, 50)
    # Class 1: points around (6, 6)
    class1_x = np.random.normal(6, 0.5, 50)
    class1_y = np.random.normal(6, 0.5, 50)
    # Assemble the dataset
    X = np.column_stack([
        np.concatenate([class0_x, class1_x]),
        np.concatenate([class0_y, class1_y])
    ])
    y = np.concatenate([np.zeros(50), np.ones(50)])
    # Train/test split
    train_size = int(0.8 * len(X))
    indices = np.random.permutation(len(X))
    train_indices = indices[:train_size]
    test_indices = indices[train_size:]
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    # Fit the classifier
    knn = KNNClassifier(k=3)
    knn.fit(X_train, y_train)
    # Predict and evaluate
    predictions = knn.predict(X_test)
    accuracy = knn.score(X_test, y_test)
    print("=== KNN classification results ===")
    print(f"Test accuracy: {accuracy:.4f}")
    print(f"Predictions: {predictions}")
    print(f"True labels: {y_test}")

knn_classification_example()
```
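As a sanity check, the same kind of data can be fed to scikit-learn's KNeighborsClassifier. This is a hedged sketch, assuming scikit-learn is installed; the two well-separated clusters mirror the example above, though the exact random draws differ:

```python
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

np.random.seed(42)
# Two well-separated clusters, as in knn_classification_example
X_demo = np.vstack([np.random.normal(2, 0.5, (50, 2)),
                    np.random.normal(6, 0.5, (50, 2))])
y_demo = np.concatenate([np.zeros(50), np.ones(50)])

sk_knn = KNeighborsClassifier(n_neighbors=3)
sk_knn.fit(X_demo, y_demo)
print(f"scikit-learn training accuracy: {sk_knn.score(X_demo, y_demo):.4f}")
```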
2.2 Optimized KNN (Vectorized Computation)
```python
class OptimizedKNNClassifier:
    """
    Optimized KNN classifier using vectorized distance computation.
    """
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = X.copy()
        self.y_train = y.copy()

    def _calculate_distances(self, X_test):
        """
        Compute all test-to-train distances in one vectorized step.
        """
        if self.distance_metric == 'euclidean':
            # Expand ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a·b and use broadcasting.
            # X_test:  (n_test, n_features)
            # X_train: (n_train, n_features)
            test_sq = np.sum(X_test**2, axis=1, keepdims=True)   # (n_test, 1)
            train_sq = np.sum(self.X_train**2, axis=1)           # (n_train,)
            cross_term = 2 * np.dot(X_test, self.X_train.T)      # (n_test, n_train)
            distances = test_sq + train_sq - cross_term
            # Clip tiny negative values caused by floating-point error
            return np.sqrt(np.maximum(distances, 0))
        elif self.distance_metric == 'manhattan':
            # Manhattan (L1) distance via broadcasting
            return np.sum(np.abs(X_test[:, np.newaxis] - self.X_train), axis=2)
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

    def predict(self, X_test):
        """
        Vectorized prediction.
        """
        distances = self._calculate_distances(X_test)
        # Indices of the k nearest neighbors for every test point
        k_nearest_indices = np.argsort(distances, axis=1)[:, :self.k]
        k_nearest_labels = self.y_train[k_nearest_indices]
        # Majority vote per test point
        predictions = []
        for labels in k_nearest_labels:
            unique_labels, counts = np.unique(labels, return_counts=True)
            predictions.append(unique_labels[np.argmax(counts)])
        return np.array(predictions)

    def predict_proba(self, X_test):
        """
        Class probabilities as neighbor-vote fractions.
        Assumes labels are the integers 0, 1, ..., n_classes - 1.
        """
        distances = self._calculate_distances(X_test)
        k_nearest_indices = np.argsort(distances, axis=1)[:, :self.k]
        k_nearest_labels = self.y_train[k_nearest_indices]
        unique_classes = np.unique(self.y_train)
        probabilities = []
        for labels in k_nearest_labels:
            class_counts = np.bincount(labels.astype(int), minlength=len(unique_classes))
            probabilities.append(class_counts / self.k)
        return np.array(probabilities)


def compare_knn_performance():
    """
    Compare the runtime of the basic and the vectorized implementations.
    """
    import time
    np.random.seed(42)
    # A moderately large dataset
    n_samples = 1000
    n_features = 10
    X = np.random.randn(n_samples, n_features)
    y = np.random.randint(0, 3, n_samples)
    # Test data
    X_test = np.random.randn(100, n_features)
    # Basic version
    knn_basic = KNNClassifier(k=5)
    knn_basic.fit(X, y)
    start_time = time.time()
    pred_basic = knn_basic.predict(X_test)
    time_basic = time.time() - start_time
    # Vectorized version
    knn_opt = OptimizedKNNClassifier(k=5)
    knn_opt.fit(X, y)
    start_time = time.time()
    pred_opt = knn_opt.predict(X_test)
    time_opt = time.time() - start_time
    print("=== Performance comparison ===")
    print(f"Basic version:      {time_basic:.4f} s")
    print(f"Vectorized version: {time_opt:.4f} s")
    print(f"Speedup: {time_basic / time_opt:.2f}x")
    print(f"Predictions identical: {np.array_equal(pred_basic, pred_opt)}")

compare_knn_performance()
```
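The predict_proba method above returns per-class neighbor-vote fractions but is never exercised in the comparison. A minimal usage sketch with illustrative data (as the docstring notes, labels must be the integers 0..n_classes-1):

```python
import numpy as np

# Illustrative usage of predict_proba
np.random.seed(0)
X_demo = np.random.randn(60, 2)
y_demo = (X_demo[:, 0] > 0).astype(int)  # two classes: 0 and 1

knn_proba = OptimizedKNNClassifier(k=5)
knn_proba.fit(X_demo, y_demo)
proba = knn_proba.predict_proba(np.array([[0.1, -0.2]]))
print(proba)  # one row of per-class vote fractions, e.g. [[0.6 0.4]]
```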
2.3 A KNN Regressor
```python
class KNNRegressor:
    """
    KNN regressor.
    """
    def __init__(self, k=3, weights='uniform'):
        """
        Parameters:
            k: number of neighbors
            weights: weighting scheme ('uniform' or 'distance')
        """
        self.k = k
        self.weights = weights
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = X.copy()
        self.y_train = y.copy()

    def _calculate_distances(self, X_test):
        """Vectorized Euclidean distances (same trick as in the classifier)."""
        test_sq = np.sum(X_test**2, axis=1, keepdims=True)
        train_sq = np.sum(self.X_train**2, axis=1)
        cross_term = 2 * np.dot(X_test, self.X_train.T)
        distances = test_sq + train_sq - cross_term
        return np.sqrt(np.maximum(distances, 0))

    def predict(self, X_test):
        """Predict continuous targets."""
        distances = self._calculate_distances(X_test)
        k_nearest_indices = np.argsort(distances, axis=1)[:, :self.k]
        predictions = []
        for i, indices in enumerate(k_nearest_indices):
            neighbor_values = self.y_train[indices]
            if self.weights == 'uniform':
                # Uniform weights: plain average
                pred = np.mean(neighbor_values)
            elif self.weights == 'distance':
                # Distance weights: closer neighbors count more
                neighbor_distances = distances[i, indices]
                # Small epsilon avoids division by zero on exact matches
                weights = 1 / (neighbor_distances + 1e-8)
                pred = np.sum(weights * neighbor_values) / np.sum(weights)
            else:
                raise ValueError(f"Unsupported weights: {self.weights}")
            predictions.append(pred)
        return np.array(predictions)


def knn_regression_example():
    """
    KNN regression example.
    """
    np.random.seed(42)
    # Noisy nonlinear data
    X = np.linspace(0, 10, 100).reshape(-1, 1)
    y = np.sin(X).ravel() + np.random.normal(0, 0.1, X.shape[0])
    # Test data
    X_test = np.linspace(0, 10, 50).reshape(-1, 1)
    y_true = np.sin(X_test).ravel()
    # Compare several values of K
    k_values = [1, 3, 5, 10]
    print("=== KNN regression results ===")
    for k in k_values:
        # Uniform weights
        knn_uniform = KNNRegressor(k=k, weights='uniform')
        knn_uniform.fit(X, y)
        pred_uniform = knn_uniform.predict(X_test)
        mse_uniform = np.mean((y_true - pred_uniform) ** 2)
        # Distance weights
        knn_distance = KNNRegressor(k=k, weights='distance')
        knn_distance.fit(X, y)
        pred_distance = knn_distance.predict(X_test)
        mse_distance = np.mean((y_true - pred_distance) ** 2)
        print(f"K={k}:")
        print(f"  uniform weights  MSE: {mse_uniform:.6f}")
        print(f"  distance weights MSE: {mse_distance:.6f}")

knn_regression_example()
```
3. Distance Metrics
3.1 Common Distance Metrics
```python
class DistanceMetrics:
    """
    Implementations of common distance metrics.
    """
    @staticmethod
    def euclidean_distance(x1, x2):
        """Euclidean (L2) distance."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    @staticmethod
    def manhattan_distance(x1, x2):
        """Manhattan (L1) distance."""
        return np.sum(np.abs(x1 - x2))

    @staticmethod
    def chebyshev_distance(x1, x2):
        """Chebyshev (L-infinity) distance."""
        return np.max(np.abs(x1 - x2))

    @staticmethod
    def cosine_distance(x1, x2):
        """Cosine distance (1 minus cosine similarity)."""
        dot_product = np.dot(x1, x2)
        norm_x1 = np.linalg.norm(x1)
        norm_x2 = np.linalg.norm(x2)
        return 1 - dot_product / (norm_x1 * norm_x2)

    @staticmethod
    def hamming_distance(x1, x2):
        """Hamming distance (for binary/categorical features), as a fraction."""
        return np.sum(x1 != x2) / len(x1)


def compare_distance_metrics():
    """
    Compare the metrics on a pair of example points.
    """
    point1 = np.array([1, 2, 3])
    point2 = np.array([4, 5, 6])
    metrics = DistanceMetrics()
    print("=== Distance metric comparison ===")
    print(f"Point 1: {point1}")
    print(f"Point 2: {point2}")
    print(f"Euclidean distance: {metrics.euclidean_distance(point1, point2):.4f}")
    print(f"Manhattan distance: {metrics.manhattan_distance(point1, point2):.4f}")
    print(f"Chebyshev distance: {metrics.chebyshev_distance(point1, point2):.4f}")
    print(f"Cosine distance:    {metrics.cosine_distance(point1, point2):.4f}")

compare_distance_metrics()
```
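For reference, the same metrics ship with SciPy in scipy.spatial.distance, which can serve as a cross-check. This sketch assumes SciPy is installed:

```python
import numpy as np
from scipy.spatial.distance import euclidean, cityblock, chebyshev, cosine

p1 = np.array([1, 2, 3])
p2 = np.array([4, 5, 6])
print(euclidean(p1, p2))  # ~5.1962, matches euclidean_distance
print(cityblock(p1, p2))  # 9, matches manhattan_distance
print(chebyshev(p1, p2))  # 3, matches chebyshev_distance
print(cosine(p1, p2))     # ~0.0254, matches cosine_distance
```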
4. Strengths and Weaknesses of KNN
4.1 Strengths
- Simple and intuitive: the algorithm is easy to understand and implement
- Non-parametric: no assumptions about the data distribution
- Naturally multiclass: handles multiclass problems without modification
- Locally sensitive: can model complex decision boundaries
- Online-friendly: new samples can be added to the training set directly
4.2 Weaknesses
- High prediction cost: every query computes distances to all training samples
- Large memory footprint: the entire training set must be stored
- Sensitive to dimensionality: high-dimensional data suffers from the "curse of dimensionality" (see the demo after this list)
- Sensitive to noise: outliers can sway the vote
- Requires feature scaling: features on different scales distort the distances
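The curse of dimensionality is easy to see empirically: as the number of dimensions grows, the nearest and farthest neighbors of a random query become nearly equidistant, so "nearest" carries less and less information. A minimal sketch (the sample size and dimensions are illustrative):

```python
import numpy as np

# Distance concentration in high dimensions
np.random.seed(42)
for d in [2, 10, 100, 1000]:
    points = np.random.rand(500, d)  # 500 random points in the unit cube
    query = np.random.rand(d)
    dists = np.linalg.norm(points - query, axis=1)
    # A ratio near 1 means nearest and farthest are almost equally far away
    print(f"d={d:4d}: min/max distance ratio = {dists.min() / dists.max():.3f}")
```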
5. Optimizations and Variants
5.1 K-D Tree Acceleration
```python
class KDTree:
    """
    Simplified K-D tree for accelerating KNN search.
    """
    def __init__(self, points, depth=0):
        self.points = points
        self.depth = depth
        self.left = None
        self.right = None
        self.split_dim = None
        self.split_value = None
        if len(points) > 1:
            self._build_tree()

    def _build_tree(self):
        """Build the tree by splitting on the median, cycling through dimensions."""
        n_features = self.points.shape[1]
        self.split_dim = self.depth % n_features
        # Sort along the split dimension and split at the median
        sorted_points = self.points[self.points[:, self.split_dim].argsort()]
        median_idx = len(sorted_points) // 2
        self.split_value = sorted_points[median_idx, self.split_dim]
        # Recurse into the two halves
        left_points = sorted_points[:median_idx]
        right_points = sorted_points[median_idx+1:]
        if len(left_points) > 0:
            self.left = KDTree(left_points, self.depth + 1)
        if len(right_points) > 0:
            self.right = KDTree(right_points, self.depth + 1)

    def nearest_neighbors(self, query_point, k=1):
        """
        Find the k nearest neighbors.
        NOTE: this is a brute-force placeholder; a real K-D tree search
        descends the tree and prunes branches using the split planes.
        """
        all_distances = []
        for point in self.points:
            dist = np.linalg.norm(query_point - point)
            all_distances.append((dist, point))
        all_distances.sort(key=lambda x: x[0])
        return [point for _, point in all_distances[:k]]

print("=== K-D tree acceleration ===")
print("With a proper pruned search, a K-D tree cuts the average query cost from")
print("O(n) to roughly O(log n) in low dimensions; the benefit fades as the")
print("dimensionality grows.")
```
5.2 Weighted KNN
```python
def weighted_knn_example():
    """
    Weighted KNN example.
    """
    np.random.seed(42)
    # Data: label 1 when x + y > 0
    X = np.random.randn(100, 2)
    y = (X[:, 0] + X[:, 1] > 0).astype(int)
    # Query point
    test_point = np.array([[0.5, 0.5]])
    # Distances to all points
    distances = np.linalg.norm(X - test_point, axis=1)
    # K nearest neighbors
    k = 5
    nearest_indices = np.argsort(distances)[:k]
    nearest_distances = distances[nearest_indices]
    nearest_labels = y[nearest_indices]
    print("=== Weighted KNN comparison ===")
    # 1. Uniform weights
    uniform_pred = np.mean(nearest_labels)
    print(f"Uniform weights:  {uniform_pred:.4f}")
    # 2. Inverse-distance weights
    weights = 1 / (nearest_distances + 1e-8)
    distance_pred = np.sum(weights * nearest_labels) / np.sum(weights)
    print(f"Distance weights: {distance_pred:.4f}")
    # 3. Gaussian weights
    sigma = np.std(nearest_distances)
    gaussian_weights = np.exp(-nearest_distances**2 / (2 * sigma**2))
    gaussian_pred = np.sum(gaussian_weights * nearest_labels) / np.sum(gaussian_weights)
    print(f"Gaussian weights: {gaussian_pred:.4f}")

weighted_knn_example()
```
6. Practical Applications
6.1 Recommender Systems
```python
def recommendation_system_example():
    """
    A KNN-based recommender (user-based collaborative filtering).
    """
    np.random.seed(42)
    # Simulated user-item rating matrix (0 = not rated)
    n_users, n_items = 100, 50
    ratings = np.random.choice([0, 1, 2, 3, 4, 5], size=(n_users, n_items),
                               p=[0.7, 0.05, 0.05, 0.05, 0.05, 0.1])

    def cosine_similarity(user1, user2):
        """Cosine similarity restricted to items both users have rated."""
        common_items = (user1 != 0) & (user2 != 0)
        if np.sum(common_items) == 0:
            return 0
        user1_common = user1[common_items]
        user2_common = user2[common_items]
        dot_product = np.dot(user1_common, user2_common)
        norm1 = np.linalg.norm(user1_common)
        norm2 = np.linalg.norm(user2_common)
        if norm1 == 0 or norm2 == 0:
            return 0
        return dot_product / (norm1 * norm2)

    # Recommend items for user 0
    target_user = 0
    target_ratings = ratings[target_user]
    # Similarity to every other user
    similarities = []
    for i in range(1, n_users):
        sim = cosine_similarity(target_ratings, ratings[i])
        similarities.append((sim, i))
    # Keep the k most similar users
    k = 5
    similarities.sort(reverse=True)
    similar_users = [user_id for _, user_id in similarities[:k]]
    print("=== KNN-based recommendation ===")
    print(f"Target user: {target_user}")
    print(f"{k} most similar users: {similar_users}")
    # Score items the target has not rated, weighted by user similarity
    unrated_items = np.where(target_ratings == 0)[0]
    recommendations = []
    for item in unrated_items:
        weighted_rating = 0
        total_similarity = 0
        for sim, user_id in similarities[:k]:
            if ratings[user_id, item] > 0:
                weighted_rating += sim * ratings[user_id, item]
                total_similarity += abs(sim)
        if total_similarity > 0:
            predicted_rating = weighted_rating / total_similarity
            recommendations.append((item, predicted_rating))
    # Show the top recommendations
    recommendations.sort(key=lambda x: x[1], reverse=True)
    print(f"Top 5 recommended items: {recommendations[:5]}")

recommendation_system_example()
```
7. Summary
KNN is a foundational machine learning algorithm with the following characteristics:
Core strengths
- Simple and intuitive: easy to understand and implement
- No model assumptions: makes no assumptions about the data distribution
- Versatile: works for both classification and regression
Main challenges
- Computational cost: prediction is expensive
- Storage: the full training set must be kept
- Feature scaling: features on different scales must be normalized
Typical applications
- Recommender systems: collaborative filtering
- Image recognition: similar-image retrieval
- Text classification: similar-document matching
- Anomaly detection: spotting unusual patterns
Understanding KNN builds intuition for instance-based learning and lays the groundwork for more advanced machine learning algorithms.
