准备工作
pip install numpy
pip install pandas
pip install scikit-learn
pip install matplotlib
KNN算法
示例一: 鸢尾花数据集分类
- Iris也称鸢尾花卉数据集,是一类多重变量分析的数据集。
- 数据集包含150个数据样本,分为3类,每类50个数据,每个数据包含4个属性(花萼长度,花萼宽度,花瓣长度,花瓣宽度)。
- 可通过花萼长度,花萼宽度,花瓣长度,花瓣宽度4个属性预测鸢尾花卉属于山鸢尾,杂色鸢尾,维吉尼亚鸢尾3个种类中的哪一类。
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Example 1: 3-nearest-neighbour classification of Iris using only the first
# two feature columns (the plots further down label them sepal length and
# sepal width).
iris = load_iris()
X, y = iris.data[:, :2], iris.target

# Hold out 30% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Learn mean/std on the training split only, then apply the same transform to
# both splits so no statistics leak from the test data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# fit() returns the estimator itself, so construction and training can chain.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"模型的预测准确率为: {accuracy * 100:.2f}%")
# Configure matplotlib so the Chinese titles/labels below render.
# SimHei is normally only installed on Windows, so additional CJK-capable
# families are listed as fallbacks; matplotlib picks the first one it can find.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
]
# Draw the minus sign as ASCII '-' because many CJK fonts lack U+2212.
plt.rcParams['axes.unicode_minus'] = False

# Side-by-side scatter plots: training points coloured by their true class,
# test points coloured by the class the classifier predicted for them.
plt.figure(figsize=(12, 6))
panels = (
    ("训练集 - 花萼长度 vs 花萼宽度", X_train, y_train),
    ("测试集 - 花萼长度 vs 花萼宽度", X_test, y_pred),
)
for panel_no, (panel_title, points, colours) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.scatter(points[:, 0], points[:, 1], c=colours, cmap=plt.cm.Paired, edgecolors='k', s=30)
    plt.title(panel_title)
    # X_train / X_test were standardized by StandardScaler before plotting, so
    # the axes show z-scores rather than centimetres; label them accordingly.
    plt.xlabel("花萼长度 (标准化)")
    plt.ylabel("花萼宽度 (标准化)")
plt.tight_layout()
plt.show()
# Decision-boundary plot for the 3-NN Iris classifier.
# X already contains only two feature columns, so these slices do not drop
# anything; they are kept so the *_2d names remain available.
X_train_2d = X_train[:, :2]
X_test_2d = X_test[:, :2]
knn_2d = KNeighborsClassifier(n_neighbors=3)
knn_2d.fit(X_train_2d, y_train)

# Regular grid (step 0.1) spanning the training data with a margin of 1 on
# every side; each grid node is classified to colour the decision regions.
x_min, x_max = X_train_2d[:, 0].min() - 1, X_train_2d[:, 0].max() + 1
y_min, y_max = X_train_2d[:, 1].min() - 1, X_train_2d[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
Z = knn_2d.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Font setup is repeated so this cell renders Chinese text when run on its
# own. SimHei is normally Windows-only, hence the fallback families;
# matplotlib picks the first one it can find.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
]
# Draw the minus sign as ASCII '-' because many CJK fonts lack U+2212.
plt.rcParams['axes.unicode_minus'] = False

# One colour per class, shared by the filled regions and the scatter points.
class_cmap = ListedColormap(['red', 'green', 'blue'])

# A single full-size axes: the earlier 1x2 subplot grid only ever filled its
# left cell, which left half of the canvas blank.
plt.figure(figsize=(8, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=class_cmap)
plt.scatter(X_test_2d[:, 0], X_test_2d[:, 1], c=y_test, edgecolors='k', cmap=class_cmap)
plt.title('KNN决策边界 - 测试集')
# The features were standardized before training, so the axes are z-scores,
# not centimetres.
plt.xlabel('花萼长度 (标准化)')
plt.ylabel('花萼宽度 (标准化)')
plt.tight_layout()
plt.show()
示例二:葡萄酒数据集分类
- load_wine是一个葡萄酒数据集,是一类多重变量分析的数据集。
- 数据集包含178个数据样本,分为3类,第一类59个样本,第二类71个样本,第三类48个样本,每个样本包含13个化学特征(这些化学特征包括酸度、灰分、酒精浓度等)。
- 可通过酸度、灰分、酒精浓度等化学特征预测葡萄酒属于琴酒,雪莉,贝尔莫得3个种类中的哪一类(此处的类别名称仅为示意,数据集中的实际类别标签为 class_0、class_1、class_2)。
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
# Example 2: 5-nearest-neighbour classification of the wine dataset using only
# its first two feature columns (no feature scaling is applied here).
wine = load_wine()
X, y = wine.data[:, :2], wine.target

# Hold out 30% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the estimator itself, so construction and training can chain.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"模型的预测准确率为: {accuracy * 100:.2f}%")
# Decision-region plot for the wine classifier fitted above.
plt.figure(figsize=(8, 6))

# 100 x 100 grid covering the range of both features over ALL samples (not
# just one split), padded by 1 on each side.
feature1_axis = np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 100)
feature2_axis = np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 100)
xx, yy = np.meshgrid(feature1_axis, feature2_axis)

# Classify every grid node, then fold the flat predictions back into the
# grid's 2-D shape so they can be drawn as filled contours.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = knn.predict(grid_points).reshape(xx.shape)

# Translucent class regions underneath, test samples (true labels) on top.
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.3)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=plt.cm.Paired, edgecolors='k')
plt.title('KNN Classification (Wine Dataset)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
示例三:乳腺癌肿瘤数据集分类
- load_breast_cancer是一个乳腺癌肿瘤数据集,是一类多重变量分析的数据集。
- 数据集包含569个数据样本,分为2类,第一类357个样本,第二类212个样本,每个样本包含30个属性(这些属性包括肿瘤的半径、纹理、对称性等)。
- 可通过半径、纹理、对称性等属性预测肿瘤属于良性(B)、恶性(M)2个种类中的哪一类。
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
# Example 3: 5-nearest-neighbour classification of the breast-cancer dataset
# using only its first two feature columns (no feature scaling is applied).
cancer = load_breast_cancer()
X, y = cancer.data[:, :2], cancer.target

# Hold out 30% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the estimator itself, so construction and training can chain.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"模型的预测准确率为: {accuracy * 100:.2f}%")
# Decision-region plot for the breast-cancer classifier fitted above.
plt.figure(figsize=(8, 6))

# 100 x 100 grid covering the range of both features over ALL samples (not
# just one split), padded by 1 on each side.
feature1_axis = np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 100)
feature2_axis = np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 100)
xx, yy = np.meshgrid(feature1_axis, feature2_axis)

# Classify every grid node, then fold the flat predictions back into the
# grid's 2-D shape so they can be drawn as filled contours.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = knn.predict(grid_points).reshape(xx.shape)

# Translucent class regions underneath, test samples (true labels) on top.
plt.contourf(xx, yy, Z, cmap=plt.cm.RdBu, alpha=0.3)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=plt.cm.RdBu, edgecolors='k')
plt.title('KNN Classification (Breast Cancer Dataset)')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()