import matplotlib
matplotlib.use('TkAgg') # 切换到官方Tk后端
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
class PCA:
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Attributes set by ``fit``:
        mean: per-feature mean of the training data, subtracted before projecting.
        components: matrix whose columns are the kept eigenvectors, ordered by
            decreasing eigenvalue.
        explained_variance: covariance eigenvalues matching the kept columns.

    Note:
        ``fit`` returns the projected training data rather than ``self``;
        callers in this file depend on that return value.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.explained_variance = None

    def _check_is_fitted(self):
        """Raise a clear error instead of failing on ``None`` arithmetic."""
        if self.components is None or self.mean is None:
            raise RuntimeError("PCA must be fitted before it can transform data")

    def fit(self, X):
        """Learn the principal axes of ``X`` and return ``X`` projected onto them.

        Args:
            X: 2-D array-like with one sample per row and at least two rows.

        Returns:
            The projection of ``X`` onto the kept components.

        Raises:
            ValueError: if ``X`` is not 2-D or has fewer than two samples
                (the sample covariance is undefined for a single row).
        """
        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (samples x features), got ndim={X.ndim}")
        if X.shape[0] < 2:
            raise ValueError("PCA needs at least two samples to estimate covariance")

        # Centre the data.
        self.mean = np.mean(X, axis=0)
        centered = X - self.mean

        # Covariance across features; np.cov collapses to a 0-d array when
        # there is a single feature, so force a matrix for eigh.
        covariance = np.atleast_2d(np.cov(centered, rowvar=False))

        # The covariance matrix is symmetric, so eigh applies.
        eigenvalues, eigenvectors = np.linalg.eigh(covariance)

        # Order by decreasing eigenvalue and keep the leading n_components
        # (slicing silently keeps fewer if n_components exceeds the feature count).
        order = np.argsort(eigenvalues)[::-1][:self.n_components]
        self.explained_variance = eigenvalues[order]
        self.components = eigenvectors[:, order]

        return np.dot(centered, self.components)

    def transform(self, X):
        """Project ``X`` onto the fitted components after subtracting the training mean.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        self._check_is_fitted()
        return np.dot(np.asarray(X) - self.mean, self.components)

    def fit_transform(self, X):
        """Fit on ``X`` and return its projection (same result as ``fit``)."""
        return self.fit(X)

    def inverse_transform(self, X_pca):
        """Map projected data back to the original feature space.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        self._check_is_fitted()
        return np.dot(np.asarray(X_pca), self.components.T) + self.mean
class KNNClassifier:
    """k-nearest-neighbour classifier using Euclidean distance and majority vote."""

    def __init__(self, k=1):
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training samples and labels.

        Args:
            X: array-like of training samples, one per row.
            y: array-like of labels, one per training sample.

        Raises:
            ValueError: if ``X`` and ``y`` have different lengths.
        """
        # Coerce to arrays so plain lists work: predict() indexes y with an
        # index array, which a Python list does not support.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y must have the same length, got {X.shape[0]} and {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X_test):
        """Return the predicted label for every row of ``X_test``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("KNNClassifier must be fitted before calling predict")

        predictions = []
        for sample in np.asarray(X_test):
            # Euclidean distance from this sample to every training sample.
            distances = np.linalg.norm(self.X_train - sample, axis=1)
            nearest_indices = np.argsort(distances)[:self.k]
            labels, counts = np.unique(self.y_train[nearest_indices], return_counts=True)
            # np.unique returns sorted labels and argmax picks the first
            # maximum, so a tied vote resolves to the smallest label.
            predictions.append(labels[np.argmax(counts)])
        return np.array(predictions)
def load_and_prepare_data():
    """Fetch the Olivetti faces dataset and print a short summary of it.

    Returns:
        A ``(X, y, images)`` tuple holding the dataset's ``data``, ``target``
        and ``images`` attributes, fetched with ``shuffle=True`` and
        ``random_state=42``.
    """
    print("正在加载Olivetti人脸数据集...")
    dataset = fetch_olivetti_faces(shuffle=True, random_state=42)
    X = dataset.data
    y = dataset.target
    images = dataset.images

    class_count = len(np.unique(y))
    summary_lines = (
        f"数据集信息:",
        f"- 总样本数: {X.shape[0]}",
        f"- 特征维度: {X.shape[1]}",
        f"- 类别数: {class_count}",
        # Integer division: average samples per class, rounded down.
        f"- 每类样本数: {len(y) // class_count}",
    )
    for line in summary_lines:
        print(line)

    return X, y, images
def find_optimal_components(X_train, X_test, y_train, y_test, max_components=200,
                            target_accuracy=0.90):
    """Search for the number of principal components that gives the best 1-NN accuracy.

    Candidate counts are 10, 20, ... up to and including
    ``min(max_components, n_features)``. The search stops early at the first
    candidate whose accuracy reaches ``target_accuracy``. A plot of accuracy
    against component count is shown before returning.

    Args:
        X_train, y_train: training samples and labels.
        X_test, y_test: samples and labels each candidate is scored on.
        max_components: largest component count to try (inclusive).
        target_accuracy: accuracy at which the search stops early; also drawn
            as the reference line on the plot.

    Returns:
        ``(best_n_components, best_accuracy)``; ``(0, 0)`` if there were no
        candidates or no candidate scored above zero.

    Note:
        Selection is scored on ``X_test``/``y_test``, so the returned accuracy
        is optimistic if the same split is also used for the final evaluation.
    """
    print("\n正在寻找最优的主成分数量...")

    # +1 makes max_components itself a candidate; a bare range() stop would exclude it.
    upper = min(max_components, X_train.shape[1])
    candidates = range(10, upper + 1, 10)

    tested = []
    accuracies = []
    best_accuracy = 0
    best_n_components = 0

    if candidates:
        # Components are sorted by decreasing eigenvalue, so the projection for
        # n components is the first n columns of the projection for the largest
        # candidate. Fit once instead of repeating the eigendecomposition per candidate.
        pca = PCA(n_components=candidates[-1])
        train_projection = pca.fit(X_train)
        test_projection = pca.transform(X_test)

    for n_comp in candidates:
        # Classify with 1-NN in the n_comp-dimensional subspace.
        knn = KNNClassifier(k=1)
        knn.fit(train_projection[:, :n_comp], y_train)
        y_pred = knn.predict(test_projection[:, :n_comp])

        accuracy = accuracy_score(y_test, y_pred)
        tested.append(n_comp)
        accuracies.append(accuracy)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_n_components = n_comp

        print(f"主成分数: {n_comp:3d}, 准确率: {accuracy:.3f}")

        # Stop as soon as the target accuracy is reached.
        if accuracy >= target_accuracy:
            break

    # Plot accuracy against the component counts that were actually evaluated.
    plt.figure(figsize=(10, 6))
    plt.plot(tested, accuracies, 'b-o', linewidth=2, markersize=6)
    plt.axhline(y=target_accuracy, color='r', linestyle='--',
                label=f'{target_accuracy:.0%}准确率线')
    plt.xlabel('主成分数量')
    plt.ylabel('分类准确率')
    plt.title('PCA维度与分类准确率的关系')
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.show()

    return best_n_components, best_accuracy
def _show_predictions(images_test, y_test, y_pred):
    """Plot up to ten test images titled with true/predicted labels.

    Returns ``(shown, correct)``: how many samples were displayed and how many
    of those were classified correctly.
    """
    _, axes = plt.subplots(2, 5, figsize=(15, 6))
    axes = axes.ravel()
    shown = min(len(axes), len(y_test))
    correct = 0
    # zip stops at the shortest input, so at most len(axes) samples are drawn.
    for ax, image, truth, guess in zip(axes, images_test, y_test, y_pred):
        ax.imshow(image, cmap='gray')
        if guess == truth:
            color, marker = 'green', '✓'
            correct += 1
        else:
            color, marker = 'red', '✗'
        ax.set_title(f'真实: {truth}, 预测: {guess} {marker}',
                     color=color, fontsize=10)
        ax.axis('off')
    # Hide panels left empty when fewer samples than grid cells exist.
    for ax in axes[shown:]:
        ax.axis('off')
    plt.tight_layout()
    plt.show()
    return shown, correct


def _show_eigenfaces(pca, n_components, image_shape):
    """Plot up to 16 leading principal components reshaped to ``image_shape``."""
    n_show = min(16, n_components)
    _, axes = plt.subplots(4, 4, figsize=(12, 12))
    # ravel() walks the grid row by row, matching (i // 4, i % 4) indexing.
    for i, ax in enumerate(axes.ravel()):
        if i < n_show:
            ax.imshow(pca.components[:, i].reshape(image_shape), cmap='gray')
            ax.set_title(f'主成分 {i + 1}')
        # Turn axes off for both drawn and unused panels.
        ax.axis('off')
    plt.tight_layout()
    plt.show()


def _show_reconstruction(pca, X_test_pca, images_test, image_shape, n_components,
                         test_idx=0):
    """Plot one test image next to its reconstruction from the PCA projection."""
    reconstructed = pca.inverse_transform(X_test_pca[test_idx:test_idx + 1])
    reconstructed_image = reconstructed.reshape(image_shape)
    _, axes = plt.subplots(1, 2, figsize=(10, 4))
    axes[0].imshow(images_test[test_idx], cmap='gray')
    axes[0].set_title('原始测试图片')
    axes[0].axis('off')
    axes[1].imshow(reconstructed_image, cmap='gray')
    axes[1].set_title(f'PCA重建图片\n(使用{n_components}个主成分)')
    axes[1].axis('off')
    plt.tight_layout()
    plt.show()


def main():
    """Run the full experiment: load data, select the PCA size, classify, visualise."""
    print("=" * 60)
    print("PCA人脸识别实验 - 目标准确率 > 90%")
    print("=" * 60)

    # 1. Load the data.
    X, y, images = load_and_prepare_data()
    # Use the dataset's own image shape instead of a hard-coded 64x64.
    # Assumes each row of X is the flattened form of the matching entry in
    # `images` — TODO confirm if the dataset is swapped.
    image_shape = images.shape[1:]

    # 2. Stratified split: 80% train, 20% test.
    X_train, X_test, y_train, y_test, images_train, images_test = train_test_split(
        X, y, images, test_size=0.2, random_state=42, stratify=y
    )
    print("\n数据划分:")
    print(f"- 训练集大小: {X_train.shape[0]}")
    print(f"- 测试集大小: {X_test.shape[0]}")

    # 3. Search for the best number of principal components.
    # NOTE(review): the search is scored on the same test split used for the
    # final accuracy below, so that final number is not an unbiased estimate.
    best_n_components, best_accuracy = find_optimal_components(X_train, X_test, y_train, y_test)
    print("\n最优参数:")
    print(f"- 最佳主成分数: {best_n_components}")
    print(f"- 最佳准确率: {best_accuracy:.3f}")

    # 4. Final fit and evaluation with the selected component count.
    print("\n使用最优参数进行最终测试...")
    pca = PCA(n_components=best_n_components)
    X_train_pca = pca.fit(X_train)  # this PCA.fit returns the projected data
    X_test_pca = pca.transform(X_test)

    knn = KNNClassifier(k=1)
    knn.fit(X_train_pca, y_train)
    y_pred = knn.predict(X_test_pca)

    final_accuracy = accuracy_score(y_test, y_pred)
    print("\n最终结果:")
    print(f"- 测试集准确率: {final_accuracy:.3f} ({final_accuracy * 100:.1f}%)")
    print(f"- 是否达到90%目标: {'✓ 达到' if final_accuracy >= 0.90 else '✗ 未达到'}")

    # 5. Show recognition results for a few test samples.
    print("\n显示部分测试样本识别结果:")
    total_display, correct_count = _show_predictions(images_test, y_test, y_pred)
    print(f"显示的{total_display}个样本中,正确识别: {correct_count}个")

    # 6. Show the leading eigenfaces.
    print("\n显示前16个特征脸...")
    _show_eigenfaces(pca, best_n_components, image_shape)

    # 7. Compression statistics.
    original_dim = X_train.shape[1]
    compressed_dim = best_n_components
    compression_ratio = (1 - compressed_dim / original_dim) * 100
    # The search can return 0 components; avoid dividing by zero in that case.
    reduction_factor = original_dim / compressed_dim if compressed_dim else float('inf')
    print("\n压缩统计:")
    print(f"- 原始维度: {original_dim}")
    print(f"- 降维后维度: {compressed_dim}")
    print(f"- 压缩率: {compression_ratio:.1f}%")
    print(f"- 数据量减少: {reduction_factor:.1f} 倍")

    # 8. Reconstruction example.
    print("\n显示重建示例...")
    _show_reconstruction(pca, X_test_pca, images_test, image_shape, best_n_components)
# Script entry point: run the experiment only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
# Try a different set of image data.
# Latest release.