Applications of Singular Values
a. Dimensionality reduction: cut the computational cost of high-dimensional data and make it easier to visualize.
b. Data reconstruction: e.g., reconstructing signals or images. This enables lossy compression: the smaller k is, the higher the compression ratio, but the greater the loss in image quality.
c. Denoising: noise usually corresponds to the smaller singular values, so discarding them and reconstructing the matrix removes some of the noise (see the sketch after this list).
d. Recommender systems: in collaborative filtering, the user-item rating matrix is typically sparse and high-dimensional. SVD (or variants such as FunkSVD and SVD++) can factorize it to uncover latent factors and predict the missing ratings; strictly speaking, this is another form of dimensionality reduction (a minimal latent-factor sketch also follows the list).
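As a hedged illustration of item c, the sketch below builds a low-rank matrix, adds Gaussian noise, and truncates the SVD of the noisy matrix; the rank-2 ground truth and the 0.1 noise scale are arbitrary choices for this demo, not values from the text.

import numpy as np

rng = np.random.default_rng(0)
# Assumed setup: a rank-2 ground-truth matrix plus Gaussian noise
A_clean = rng.standard_normal((50, 2)) @ rng.standard_normal((2, 30))
A_noisy = A_clean + 0.1 * rng.standard_normal((50, 30))

U, sigma, Vt = np.linalg.svd(A_noisy, full_matrices=False)
k = 2  # keep only the dominant singular values; the tail is mostly noise
A_denoised = U[:, :k] @ np.diag(sigma[:k]) @ Vt[:k, :]

err_noisy = np.linalg.norm(A_noisy - A_clean, 'fro') / np.linalg.norm(A_clean, 'fro')
err_denoised = np.linalg.norm(A_denoised - A_clean, 'fro') / np.linalg.norm(A_clean, 'fro')
print('relative error before denoising:', err_noisy)
print('relative error after denoising:', err_denoised)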
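For item d, a minimal latent-factor sketch, assuming a toy user-item matrix where 0 marks a missing rating and using simple per-user mean imputation before the SVD; this is a plain truncated SVD, not FunkSVD or SVD++.

import numpy as np

# Toy 4-user x 5-item rating matrix; 0 = not rated (assumed encoding)
R = np.array([[5, 3, 0, 1, 4],
              [4, 0, 0, 1, 3],
              [1, 1, 0, 5, 4],
              [0, 1, 5, 4, 0]], dtype=float)

# Simple imputation: replace missing entries with each user's mean rating
filled = R.copy()
for i in range(R.shape[0]):
    rated = R[i] > 0
    filled[i, ~rated] = R[i, rated].mean()

U, sigma, Vt = np.linalg.svd(filled, full_matrices=False)
k = 2  # number of latent factors (arbitrary for this demo)
R_hat = U[:, :k] @ np.diag(sigma[:k]) @ Vt[:k, :]
print('predicted ratings for the missing entries:')
print(np.where(R == 0, R_hat, np.nan))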
The first worked example computes a thin SVD of a small tall matrix, rebuilds a rank-k approximation, and reports the relative Frobenius-norm error:

import numpy as np

A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]])
print('original matrix A:')
print(A)

# Thin SVD: U is 5x3, sigma holds 3 singular values, Vt is 3x3
U, sigma, Vt = np.linalg.svd(A, full_matrices=False)
print('singular values sigma:')
print(sigma)

# Keep only the largest singular value (rank-1 approximation)
k = 1
U_k = U[:, :k]
sigma_k = sigma[:k]
Vt_k = Vt[:k, :]
A_approx = U_k @ np.diag(sigma_k) @ Vt_k
print('approximation A_approx keeping the top', k, 'singular value(s):')
print(A_approx)

# Relative reconstruction error in the Frobenius norm
error = np.linalg.norm(A - A_approx, 'fro') / np.linalg.norm(A, 'fro')
print('relative Frobenius-norm approximation error:', error)
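By the Eckart–Young theorem, this truncation is the best rank-k approximation in the Frobenius norm, and the relative error printed above can be read directly off the discarded singular values:

\[
\frac{\lVert A - A_k \rVert_F}{\lVert A \rVert_F}
= \sqrt{\frac{\sum_{i > k} \sigma_i^2}{\sum_i \sigma_i^2}},
\qquad
A_k = \sum_{i=1}^{k} \sigma_i\, u_i v_i^{\top}.
\]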
The next example uses the same idea as a dimensionality-reduction step in a classification pipeline, on synthetic data whose label depends only on the first two features:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

np.random.seed(42)

# Synthetic data: 1000 samples, 50 features
n_samples = 1000
n_features = 50
X = np.random.randn(n_samples, n_features) * 10
Y = (X[:, 0] + X[:, 1] > 0).astype(int)

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)
print(f'training set shape: {X_train.shape}')
print(f'test set shape: {X_test.shape}')

# Fit the SVD on the training split only, then reuse Vt_k for the test split
U_train, sigma_train, Vt_train = np.linalg.svd(X_train, full_matrices=False)
print(f'Vt_train shape: {Vt_train.shape}')

k = 10
Vt_k = Vt_train[:k, :]
print(f'Vt_k shape with k={k}: {Vt_k.shape}')

# Project both splits onto the top-k right singular vectors
X_train_reduced = X_train @ Vt_k.T
print(f'reduced training set shape: {X_train_reduced.shape}')
X_test_reduced = X_test @ Vt_k.T
print(f'reduced test set shape: {X_test_reduced.shape}')

model = LogisticRegression(random_state=42)
model.fit(X_train_reduced, Y_train)
Y_pred = model.predict(X_test_reduced)
accuracy = accuracy_score(Y_test, Y_pred)
print(f'test accuracy: {accuracy}')

# How much of the training matrix the rank-k approximation retains
X_train_approx = U_train[:, :k] @ np.diag(sigma_train[:k]) @ Vt_k
error = np.linalg.norm(X_train - X_train_approx, 'fro') / \
    np.linalg.norm(X_train, 'fro')
print(f'relative Frobenius-norm approximation error on the training set: {error}')
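Rather than fixing k = 10 by hand, one common heuristic is to keep enough singular values to cover a target fraction of the total spectral energy. A minimal sketch, assuming the sigma_train from above and an arbitrary 95% threshold:

# Cumulative "energy" carried by the leading singular values
energy = np.cumsum(sigma_train ** 2) / np.sum(sigma_train ** 2)
k_auto = int(np.searchsorted(energy, 0.95)) + 1  # smallest k with >= 95% energy
print(f'smallest k retaining 95% of the energy: {k_auto}')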
Finally, the same pipeline on a real dataset: the heart.csv heart-disease table, where every column except target is a feature:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

data = pd.read_csv(r'heart.csv')
X = data.drop(['target'], axis=1)
Y = data['target']

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)
print(f'training set shape: {X_train.shape}')
print(f'test set shape: {X_test.shape}')

# As before: fit the SVD on the training split only
U_train, sigma_train, Vt_train = np.linalg.svd(X_train, full_matrices=False)
print(f'Vt_train shape: {Vt_train.shape}')

k = 10
Vt_k = Vt_train[:k, :]
print(f'Vt_k shape with k={k}: {Vt_k.shape}')

X_train_reduced = X_train @ Vt_k.T
print(f'reduced training set shape: {X_train_reduced.shape}')
X_test_reduced = X_test @ Vt_k.T
print(f'reduced test set shape: {X_test_reduced.shape}')

model = LogisticRegression(random_state=42)
model.fit(X_train_reduced, Y_train)
Y_pred = model.predict(X_test_reduced)
accuracy = accuracy_score(Y_test, Y_pred)
print(f'test accuracy: {accuracy}')

X_train_approx = U_train[:, :k] @ np.diag(sigma_train[:k]) @ Vt_k
error = np.linalg.norm(X_train - X_train_approx, 'fro') / \
    np.linalg.norm(X_train, 'fro')
print(f'relative Frobenius-norm approximation error on the training set: {error}')
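sklearn's TruncatedSVD wraps the fit-on-train / transform-both pattern used above. A rough equivalent of the reduction step, assuming the same heart.csv split as above (TruncatedSVD uses a randomized solver, so the projection matches the manual X @ Vt_k.T only up to numerical and sign differences, which do not affect the classifier):

from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# TruncatedSVD(n_components=k) learns the top-k right singular vectors on fit
# and projects with transform, mirroring the manual reduction above
pipe = make_pipeline(TruncatedSVD(n_components=10, random_state=42),
                     LogisticRegression(random_state=42))
pipe.fit(X_train, Y_train)
print(f'pipeline test accuracy: {pipe.score(X_test, Y_test)}')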
