混合精度策略在PBiCGStab算法中的应用
PBiCGStab(预处理双共轭梯度稳定法)是一种常用的Krylov子空间迭代方法,结合混合精度策略可以显著提高计算效率同时保持足够的精度。下面我将介绍如何在PBiCGStab中实现混合精度,并提供示例代码。
混合精度策略概述
混合精度策略的核心思想是:
- 使用较低精度(如单精度)进行大部分计算,提高内存带宽利用率和计算速度
- 在关键计算步骤使用较高精度(如双精度)保持数值稳定性
- 在必要时进行精度转换
PBiCGStab混合精度实现要点
在PBiCGStab中,以下部分适合使用混合精度:
- 矩阵向量乘法:可使用低精度
- 向量内积:建议使用高精度
- 预处理操作:根据预处理类型选择精度
- 标量计算:使用高精度
- 向量更新:可使用低精度
示例代码(C++实现)
#include <iostream>
#include <vector>
#include <cmath>
#include <chrono>
// 定义精度类型
using high_precision = double;
using low_precision = float;
template <typename T>
class Vector {
std::vector<T> data;
public:
Vector(size_t size) : data(size) {}
T& operator[](size_t i) { return data[i]; }
const T& operator[](size_t i) const { return data[i]; }
size_t size() const { return data.size(); }
};
// 矩阵向量乘法 (低精度)
void matVecMult(const Vector<low_precision>& mat_values,
const std::vector<int>& mat_col_indices,
const std::vector<int>& mat_row_ptr,
const Vector<low_precision>& vec,
Vector<low_precision>& result) {
for (int i = 0; i < mat_row_ptr.size() - 1; ++i) {
low_precision sum = 0.0f;
for (int j = mat_row_ptr[i]; j < mat_row_ptr[i + 1]; ++j) {
sum += mat_values[j] * vec[mat_col_indices[j]];
}
result[i] = sum;
}
}
// 向量内积 (高精度)
high_precision dotProduct(const Vector<low_precision>& a, const Vector<low_precision>& b) {
high_precision result = 0.0;
for (size_t i = 0; i < a.size(); ++i) {
result += static_cast<high_precision>(a[i]) * static_cast<high_precision>(b[i]);
}
return result;
}
// PBiCGStab with mixed precision
void mixedPrecisionPBiCGStab(
const Vector<low_precision>& mat_values,
const std::vector<int>& mat_col_indices,
const std::vector<int>& mat_row_ptr,
const Vector<low_precision>& b,
Vector<low_precision>& x,
int max_iter,
high_precision tol) {
size_t n = b.size();
Vector<low_precision> r(n), r0(n), p(n), v(n), s(n), t(n);
// 初始残差计算 (低精度)
matVecMult(mat_values, mat_col_indices, mat_row_ptr, x, v);
for (size_t i = 0; i < n; ++i) {
r[i] = b[i] - v[i];
r0[i] = r[i];
}
high_precision rho = 1.0, alpha = 1.0, omega = 1.0;
high_precision rho_old, beta;
for (int k = 0; k < max_iter; ++k) {
rho_old = rho;
// 内积计算 (转换为高精度)
rho = dotProduct(r0, r);
if (std::abs(rho) < 1e-30) {
std::cout << "Breakdown in rho\n";
break;
}
if (k == 0) {
for (size_t i = 0; i < n; ++i) {
p[i] = r[i];
}
} else {
beta = (rho / rho_old) * (alpha / omega);
for (size_t i = 0; i < n; ++i) {
p[i] = r[i] + beta * (p[i] - omega * v[i]);
}
}
// 矩阵向量乘法 (低精度)
matVecMult(mat_values, mat_col_indices, mat_row_ptr, p, v);
// 内积计算 (高精度)
high_precision r0v = dotProduct(r0, v);
alpha = rho / r0v;
for (size_t i = 0; i < n; ++i) {
s[i] = r[i] - alpha * v[i];
}
// 检查收敛 (使用高精度计算残差范数)
high_precision s_norm = std::sqrt(dotProduct(s, s));
if (s_norm < tol) {
for (size_t i = 0; i < n; ++i) {
x[i] += static_cast<low_precision>(alpha) * p[i];
}
std::cout << "Converged at iteration " << k << "\n";
return;
}
// 矩阵向量乘法 (低精度)
matVecMult(mat_values, mat_col_indices, mat_row_ptr, s, t);
// 内积计算 (高精度)
high_precision tt = dotProduct(t, t);
high_precision ts = dotProduct(t, s);
omega = ts / tt;
for (size_t i = 0; i < n; ++i) {
x[i] += static_cast<low_precision>(alpha) * p[i] + static_cast<low_precision>(omega) * s[i];
r[i] = s[i] - static_cast<low_precision>(omega) * t[i];
}
// 检查收敛
high_precision r_norm = std::sqrt(dotProduct(r, r));
if (r_norm < tol) {
std::cout << "Converged at iteration " << k << "\n";
return;
}
if (std::abs(omega) < 1e-30) {
std::cout << "Breakdown in omega\n";
break;
}
}
std::cout << "Reached maximum iterations\n";
}
int main() {
// 示例: 创建一个简单的稀疏矩阵 (CSR格式)
const int n = 1000;
Vector<low_precision> mat_values(n * 3 - 2);
std::vector<int> mat_col_indices(n * 3 - 2);
std::vector<int> mat_row_ptr(n + 1);
// 填充三对角矩阵
int idx = 0;
for (int i = 0; i < n; ++i) {
mat_row_ptr[i] = idx;
if (i > 0) {
mat_values[idx] = -1.0f;
mat_col_indices[idx] = i - 1;
++idx;
}
mat_values[idx] = 2.0f;
mat_col_indices[idx] = i;
++idx;
if (i < n - 1) {
mat_values[idx] = -1.0f;
mat_col_indices[idx] = i + 1;
++idx;
}
}
mat_row_ptr[n] = idx;
// 创建右侧向量和解向量
Vector<low_precision> b(n), x(n);
for (int i = 0; i < n; ++i) {
b[i] = static_cast<low_precision>(i + 1);
x[i] = 0.0f;
}
// 调用混合精度PBiCGStab
auto start = std::chrono::high_resolution_clock::now();
mixedPrecisionPBiCGStab(mat_values, mat_col_indices, mat_row_ptr, b, x, 1000, 1e-6);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;
std::cout << "Execution time: " << elapsed.count() << " seconds\n";
return 0;
}
Python示例(使用NumPy和SciPy)
import numpy as np
from scipy.sparse import csr_matrix
from time import time
def mixed_precision_pbicgstab(A, b, x0=None, max_iter=1000, tol=1e-6):
"""
Mixed precision PBiCGStab solver
A: scipy sparse matrix (stored in low precision)
b: numpy array (low precision)
"""
# Convert inputs to appropriate precision
A = A.astype(np.float32)
b = b.astype(np.float32)
if x0 is None:
x = np.zeros_like(b, dtype=np.float32)
else:
x = x0.astype(np.float32)
n = len(b)
r = b - A.dot(x)
r0 = r.copy()
rho = alpha = omega = np.float32(1.0)
for k in range(max_iter):
rho_old = rho
# High precision dot product
rho = np.dot(r0.astype(np.float64), r.astype(np.float64)).astype(np.float64)
if abs(rho) < 1e-30:
print("Breakdown in rho")
break
if k == 0:
p = r.copy()
else:
beta = (rho / rho_old) * (alpha / omega)
p = r + beta * (p - omega * v)
# Low precision matrix-vector product
v = A.dot(p)
# High precision dot product
r0v = np.dot(r0.astype(np.float64), v.astype(np.float64)).astype(np.float64)
alpha = rho / r0v
s = r - alpha * v
# Check convergence with high precision norm
s_norm = np.linalg.norm(s.astype(np.float64))
if s_norm < tol:
x += alpha.astype(np.float32) * p
print(f"Converged at iteration {k}")
return x.astype(np.float32)
# Low precision matrix-vector product
t = A.dot(s)
# High precision dot products
tt = np.dot(t.astype(np.float64), t.astype(np.float64))
ts = np.dot(t.astype(np.float64), s.astype(np.float64))
omega = ts / tt
x += alpha.astype(np.float32) * p + omega.astype(np.float32) * s
r = s - omega.astype(np.float32) * t
# Check convergence
r_norm = np.linalg.norm(r.astype(np.float64))
if r_norm < tol:
print(f"Converged at iteration {k}")
return x.astype(np.float32)
if abs(omega) < 1e-30:
print("Breakdown in omega")
break
print("Reached maximum iterations")
return x.astype(np.float32)
# 示例使用
n = 1000
# 创建三对角矩阵
diagonals = [np.ones(n-1)*-1, np.ones(n)*2, np.ones(n-1)*-1]
A = scipy.sparse.diags(diagonals, [-1, 0, 1], format='csr')
b = np.arange(1, n+1, dtype=np.float32)
start = time()
x = mixed_precision_pbicgstab(A, b, max_iter=1000, tol=1e-6)
end = time()
print(f"Execution time: {end - start} seconds")
关键注意事项
- 内积计算:必须使用高精度以避免累积舍入误差
- 收敛判断:残差范数计算应使用高精度
- 预处理:如果使用预处理,预处理矩阵可以保持低精度
- 数据类型转换:注意在精度转换时的性能开销
- 硬件支持:确保硬件支持混合精度计算以获得最佳性能
混合精度策略可以显著提高PBiCGStab的性能,特别是在内存带宽受限的问题上,同时保持足够的数值稳定性。