高斯过程详解-CSDN博客

高斯过程

高斯过程是用于求解回归和概率分类问题的一类通用监督学习方法。其优点是：

预测会对观测值进行插值（至少对常规核函数如此）
预测是概率性的（服从高斯分布），因此可以计算经验置信区间
通用性：可以指定不同的核函数

缺点是：

不是稀疏的，即需要使用全部样本和特征信息来执行预测
在高维空间中效率会下降（即特征数超过几十个时）

高斯过程回归

GaussianProcessRegressor针对回归目的实现了高斯过程。使用时需要事先指定高斯过程的先验（例如核函数）。

GPR例子

考虑噪音估计的GPR

这个例子说明了包含WhiteKernel的GPR能够估计数据的噪声水平。对数边缘似然（LML）的图像显示其存在两个局部极大值。

# coding: utf-8
# Gaussian process regression (GPR) with noise-level estimation

import numpy as np

import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

# --- Synthetic data ---------------------------------------------------------
# 20 inputs drawn uniformly from [0, 5]. Targets are 0.5*sin(3x) plus additive
# Gaussian noise (std 0.5). The noise is added OUTSIDE the sine so that it is
# observation noise: that is what WhiteKernel models, and it matches the
# noise-free reference curve 0.5*sin(3x) plotted below.
rng = np.random.RandomState(0)
X = rng.uniform(0, 5, 20)[:, np.newaxis]
y = 0.5 * np.sin(3 * X[:, 0]) + rng.normal(0, 0.5, X.shape[0])

# --- Fit and plot the GP posterior ------------------------------------------
plt.figure()
# Initial kernel: scaled RBF for the signal plus a WhiteKernel whose
# noise_level is a hyperparameter tuned during fitting.
kernel = 1.0 * RBF(length_scale=100.0, length_scale_bounds=(1e-2, 1e3))\
    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
# alpha=0.0 so the noise is represented by WhiteKernel only.
gp = GaussianProcessRegressor(kernel=kernel, alpha=0.0).fit(X, y)

X_ = np.linspace(0, 5, 100)
y_mean, y_cov = gp.predict(X_[:, np.newaxis], return_cov=True)
# Point-wise predictive standard deviation from the covariance diagonal.
y_std = np.sqrt(np.diag(y_cov))
plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
plt.fill_between(X_, y_mean - y_std, y_mean + y_std,
                 alpha=0.5, color='k')
# Noise-free target function, for reference.
plt.plot(X_, 0.5*np.sin(3*X_), 'r', lw=3, zorder=9)
plt.scatter(X[:, 0], y, c='r', s=50, zorder=10, edgecolors=(0, 0, 0))
plt.title('Initial: %s\nOptimum: %s\nLog-Marginal-Likelihood: %s'
          % (kernel, gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta)))
plt.tight_layout()

# --- Log-marginal-likelihood landscape --------------------------------------
plt.figure()
theta0 = np.logspace(-2, 3, 49)   # length-scale axis
theta1 = np.logspace(-2, 0, 50)   # noise-level axis
Theta0, Theta1 = np.meshgrid(theta0, theta1)
# Evaluate the LML on the grid, built directly in meshgrid order so that
# LML[i, j] corresponds to (Theta0[i, j], Theta1[i, j]). The first
# hyperparameter is held fixed at 0.36; theta is passed in log-space.
n_rows, n_cols = Theta0.shape
LML = np.array(
    [[gp.log_marginal_likelihood(np.log([0.36, Theta0[i, j], Theta1[i, j]]))
      for j in range(n_cols)]
     for i in range(n_rows)])

# Colour scale: from the smallest negative LML up to a fixed cap of 50.
vmin = (-LML).min()
vmax = 50
# Rounding to one decimal can collapse neighbouring levels into duplicates;
# np.unique keeps the levels sorted and distinct for plt.contour.
level = np.unique(
    np.around(np.logspace(np.log10(vmin), np.log10(vmax), 50), decimals=1))
plt.contour(Theta0, Theta1, -LML,
            levels=level, norm=LogNorm(vmin=vmin, vmax=vmax))
plt.colorbar()
plt.xscale('log')
plt.yscale('log')
plt.xlabel('Length-scale')
plt.ylabel('Noise-scale')
plt.title('Log-marginal-likelihood')
plt.tight_layout()

plt.show()

GPR和岭回归比较

核岭回归（KRR）和GPR都通过在内部使用“核技巧”来学习目标函数。KRR在由相应核函数诱导的空间中学习一个线性函数，该线性函数对应于原始空间中的一个非线性函数；核空间中的线性函数依据带岭正则项的均方误差损失来选择。GPR使用核来定义目标函数先验分布的协方差，并使用观测到的训练数据来定义似然函数。基于贝叶斯定理，可以得到目标函数上的（高斯）后验分布，并以其均值作为预测。

GPR可以基于边缘似然函数上的梯度上升来选择核的超参数，而KRR需要在交叉验证损失函数上执行网格搜索。另一个主要区别是GPR学到的是目标函数的生成式概率模型，因此能够提供有意义的置信区间以及与预测相伴的后验样本，而KRR仅仅提供预测值。

下例介绍了上述两种方法

# coding: utf-8
# Comparison of kernel ridge and Gaussian process regression

import time

import numpy as np

import matplotlib.pyplot as plt

from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ExpSineSquared

rng = np.random.RandomState(0)

# --- Sample data: 100 points on [0, 15), sine target plus uniform noise -----
X = 15 * rng.rand(100, 1)
y = np.sin(X).ravel()
y += 3 * (0.5 - rng.rand(X.shape[0]))

# --- Kernel ridge regression, hyperparameters chosen by grid search ---------
# The grid covers the regularisation strength and a 10x10 set of periodic
# kernels (length-scale x periodicity).
param_grid = {'alpha': [1e0, 1e-1, 1e-2, 1e-3],
              'kernel': [ExpSineSquared(length_scale, periodicity)
                         for length_scale in np.logspace(-2, 2, 10)
                         for periodicity in np.logspace(0, 2, 10)]}
kr = GridSearchCV(KernelRidge(), param_grid=param_grid)
stime = time.time()
kr.fit(X, y)
print("Time for KRR fitting: %.3f" % (time.time() - stime))

# --- Gaussian process regression --------------------------------------------
# Periodic kernel for the signal plus a WhiteKernel for the noise.
gp_kernel = ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1))\
    + WhiteKernel(1e-1)
gpr = GaussianProcessRegressor(kernel=gp_kernel)
stime = time.time()
gpr.fit(X, y)
print("Time for GPR fitting: %.3f" % (time.time()-stime))

# --- Prediction timings ------------------------------------------------------
# The evaluation grid extends to 20, beyond the training range [0, 15), so the
# plot also shows how both models extrapolate.
X_plot = np.linspace(0, 20, 10000)[:, None]
stime = time.time()
y_kr = kr.predict(X_plot)
print("Time for KRR prediction: %.3f" % (time.time()-stime))

stime = time.time()
y_gpr = gpr.predict(X_plot, return_std=False)
print("Time for GPR prediction: %.3f" % (time.time() - stime))

stime = time.time()
y_gpr, y_std = gpr.predict(X_plot, return_std=True)
print("Time for GPR prediction with standard-deviation: %.3f"
      % (time.time() - stime))

# --- Plot --------------------------------------------------------------------
plt.figure(figsize=(10, 5))
lw = 2
plt.scatter(X, y, c='k', label='data')
plt.plot(X_plot, np.sin(X_plot), color='navy', lw=lw, label='True')
plt.plot(X_plot, y_kr, color='turquoise', lw=lw,
         label='KRR (%s)' % kr.best_params_)
plt.plot(X_plot, y_gpr, color='darkorange', lw=lw,
         label="GPR (%s)" % gpr.kernel_)
# Shade one predictive standard deviation around the GPR mean.
plt.fill_between(X_plot[:, 0], y_gpr - y_std, y_gpr + y_std, color='darkorange',
                 alpha=0.2)
plt.xlabel('data')
plt.ylabel('target')
plt.xlim(0, 20)
plt.ylim(-4, 4)
plt.title('GPR versus Kernel Ridge')
plt.legend(loc='best', scatterpoints=1, prop={'size': 8})
plt.show()

高斯过程分类

GaussianProcessClassifier实现了用于分类的高斯过程，更确切地说是用于概率分类，即测试预测采取类别概率的形式。高斯过程分类器在潜在函数f上放置一个高斯过程先验，然后通过一个连接函数对其进行挤压，从而得到概率分类结果。潜在函数f是所谓的冗余函数（nuisance function），其值无法被观测到，且其本身并不重要；它的作用是让模型能够以更便捷的形式表达，在预测过程中f会被积分消去。GaussianProcessClassifier实现了logistic连接函数，其对应的积分无法解析计算，但在二元分类的情况下很容易近似。

GaussianProcessClassifier可以通过执行基于one-versus-rest或者one-versus-one的训练和预测策略，实现多类别分类。

高斯过程分类的例子

使用GPC做概率预测

下例展示了在不同超参数选择下，使用RBF核的GPC给出的概率预测。

# coding: utf-8
# Probabilistic predictions with Gaussian process classification (GPC)

import numpy as np

from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, log_loss
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# --- Data: 100 uniform points on [0, 5], labelled 1 when x > 2.5 ------------
train_size = 50
rng = np.random.RandomState(0)
X = rng.uniform(0, 5, 100)[:, np.newaxis]
y = np.array(X[:, 0] > 2.5, dtype=int)

# The first `train_size` samples are used for fitting; the rest are only
# shown in the plot as "Test data".
X_train, y_train = X[:train_size], y[:train_size]
X_rest, y_rest = X[train_size:], y[train_size:]

# --- Two classifiers with the same initial kernel ---------------------------
# gp_fix keeps the initial hyperparameters (optimizer disabled);
# gp_opt uses the default optimizer.
gp_fix = GaussianProcessClassifier(kernel=1.0*RBF(length_scale=1.0), optimizer=None)
gp_fix.fit(X_train, y_train)

gp_opt = GaussianProcessClassifier(kernel=1.0*RBF(length_scale=1.0))
gp_opt.fit(X_train, y_train)

# --- Report: log-marginal-likelihood, accuracy and log-loss on training data -
lml_fix = gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta)
lml_opt = gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta)
print("Log Marginal Likelihood (initial): %.3f" % lml_fix)
print("Log Marginal Likelihood (optimized): %.3f" % lml_opt)

acc_fix = accuracy_score(y_train, gp_fix.predict(X_train))
acc_opt = accuracy_score(y_train, gp_opt.predict(X_train))
print("Accuracy: %.3f (initial) %.3f (optimized)" % (acc_fix, acc_opt))

# Column 1 of predict_proba is the probability of class 1.
loss_fix = log_loss(y_train, gp_fix.predict_proba(X_train)[:, 1])
loss_opt = log_loss(y_train, gp_opt.predict_proba(X_train)[:, 1])
print("Log-loss: %.3f (initial) %.3f (optimized)" % (loss_fix, loss_opt))

# --- Plot the class-1 probability curves of both models ---------------------
plt.figure()
plt.scatter(X_train[:, 0], y_train, c='k', label='Train data', edgecolors=(0, 0, 0))
plt.scatter(X_rest[:, 0], y_rest, c='g', label='Test data', edgecolors=(0, 0, 0))
X_ = np.linspace(0, 5, 100)
grid = X_[:, np.newaxis]
curves = (
    (gp_fix, 'r', "Initial kernel: %s"),
    (gp_opt, 'b', 'Optimized kernel: %s'),
)
for model, colour, label_fmt in curves:
    plt.plot(X_, model.predict_proba(grid)[:, 1], colour,
             label=label_fmt % model.kernel_)
plt.xlabel('Feature')
plt.ylabel('Class 1 probability')
plt.xlim(0, 5)
plt.ylim(-0.25, 1.5)
plt.legend(loc='best')

plt.show()