High-performance computing (HPC) means combining large pools of computing resources (multi-core processors, GPUs, clusters, and so on) to solve complex problems that demand substantial computational power.
C++ is a common choice for high-performance computing because of its efficient memory management and support for parallelism. CUDA (Compute Unified Device Architecture) is NVIDIA's interface for programming NVIDIA GPUs; it lets developers exploit the GPU's parallel processing power efficiently and thereby raise computational performance.
The sections below show how to build high-performance computing programs with C++ and CUDA.
For setting up a C++/CUDA development environment, together with the accompanying code, see this article by another CSDN author: 使用CLion进行cuda编程,并使用cuda-gdb对核函数进行debug,这可能是全网你能够找到的最详细的CLion和cuda编程环境配置教程了 (CSDN blog).
(Figure: screenshot of the sample program running successfully.)
In that example, two vectors are added in parallel with CUDA. The steps are as follows (a minimal sketch follows the list):
1. Create three pointer variables, one for each of the two inputs and one for the output, to refer to the corresponding buffers in GPU memory.
2. Allocate GPU memory for the two input vectors and the output vector.
3. Copy the two input vectors to the GPU.
4. Perform the vector addition on the GPU to produce the output.
5. Copy the output back to the host.
6. Free the GPU memory held by the three pointers.
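A minimal, self-contained sketch of those six steps (the identifiers vectorAdd, h_a, d_a, and so on are illustrative, not taken from the linked article):

#include <cuda_runtime.h>
#include <stdio.h>
// Each thread adds one pair of elements
__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}
int main() {
    const int n = 1024;
    float *h_a = new float[n], *h_b = new float[n], *h_c = new float[n];
    for (int i = 0; i < n; i++) { h_a[i] = (float) i; h_b[i] = 2.0f * i; }
    // Steps 1-2: device pointers plus GPU memory for the two inputs and the output
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, n * sizeof(float));
    cudaMalloc(&d_b, n * sizeof(float));
    cudaMalloc(&d_c, n * sizeof(float));
    // Step 3: copy the input vectors to the GPU
    cudaMemcpy(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, n * sizeof(float), cudaMemcpyHostToDevice);
    // Step 4: add the vectors on the GPU
    int blockSize = 256;
    vectorAdd<<<(n + blockSize - 1) / blockSize, blockSize>>>(d_a, d_b, d_c, n);
    // Step 5: copy the output back to the host
    cudaMemcpy(h_c, d_c, n * sizeof(float), cudaMemcpyDeviceToHost);
    printf("h_c[1] = %f\n", h_c[1]); // expect 3.0
    // Step 6: free the three GPU buffers
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    delete[] h_a; delete[] h_b; delete[] h_c;
    return 0;
}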
Below are complete C++/CUDA implementations of matrix multiplication, a fast Fourier transform (written as a naive DFT), numerical integration, and a vector dot product:
Matrix multiplication:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
// Forward declaration matching the definition below
cudaError_t matrixMulWithCuda(float **A, int m, int n, float **B, int p, float **C);
// Each thread computes one row of C = A * B, where A is m x n and B is n x p
__global__ void matrixMul(const float *A, const float *B, float *C, int m, int n, int p) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < m) {
        for (int k = 0; k < p; ++k) {
            float sum = 0.0f;
            for (int j = 0; j < n; ++j) {
                sum += A[i * n + j] * B[j * p + k];
            }
            C[i * p + k] = sum;
        }
    }
}
// Dynamically allocate a 2D array as one contiguous block with row pointers
float **alloc2DArray(int rows, int cols) {
    float **arr = (float **) malloc(rows * sizeof(float *));
    arr[0] = (float *) malloc(rows * cols * sizeof(float));
    for (int i = 1; i < rows; i++) {
        arr[i] = arr[i - 1] + cols;
    }
    return arr;
}
// Free a 2D array allocated by alloc2DArray
void free2DArray(float **arr) {
    free(arr[0]);
    free(arr);
}
// Matrix multiplication on the GPU; A is m x n, B is n x p, C is m x p
cudaError_t matrixMulWithCuda(float **A, int m, int n, float **B, int p, float **C) {
    float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
    cudaError_t cudaStatus;
    // Allocate GPU memory
    cudaMalloc(&d_A, m * n * sizeof(float));
    cudaMalloc(&d_B, n * p * sizeof(float));
    cudaMalloc(&d_C, m * p * sizeof(float));
    // Copy matrices A and B to GPU memory
    cudaMemcpy(d_A, A[0], m * n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B[0], n * p * sizeof(float), cudaMemcpyHostToDevice);
    // Choose block and grid sizes: one thread per row of C
    int blockSize = 16;
    int gridSize = (m + blockSize - 1) / blockSize;
    // Launch the matrix multiplication kernel
    matrixMul<<<gridSize, blockSize>>>(d_A, d_B, d_C, m, n, p);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // Copy the result matrix C back to CPU memory
    cudaStatus = cudaMemcpy(C[0], d_C, m * p * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
Error:
    // Free GPU memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return cudaStatus;
}
int main() {
    // Define matrix A (m x n)
    int m = 3;
    int n = 2;
    float **A = alloc2DArray(m, n);
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            // Generate nonzero values from the row/column indices
            A[i][j] = (i + 1) * (j + 1);
        }
    }
    // Define matrix B (n x p)
    int p = 4;
    float **B = alloc2DArray(n, p);
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < p; j++) {
            // Generate nonzero values from the row/column indices
            B[i][j] = (i + 1) * (j + 1);
        }
    }
    // Define the result matrix C (m x p)
    float **C = alloc2DArray(m, p);
    // Multiply the matrices on the GPU
    cudaError_t cudaStatus = matrixMulWithCuda(A, m, n, B, p, C);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "matrixMulWithCuda failed!");
        return 1;
    }
    // Print the result matrix C
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < p; j++) {
            printf("%f ", C[i][j]);
        }
        printf("\n");
    }
    // Free host memory
    free2DArray(A);
    free2DArray(B);
    free2DArray(C);
    // Reset the CUDA device
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}
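For real workloads the hand-written kernel above is mainly pedagogical; in practice you would normally call cuBLAS. Below is a minimal sketch of the equivalent call (matmulCublas is a hypothetical helper name; it assumes the cuBLAS headers and library are installed and that you link with -lcublas). cuBLAS stores matrices column-major, so the call requests C^T = B^T * A^T, whose memory layout is exactly the row-major C = A * B:

#include <cublas_v2.h>
// Computes C = A * B for row-major A (m x n), B (n x p), C (m x p),
// all already resident in GPU memory.
void matmulCublas(const float *d_A, const float *d_B, float *d_C,
                  int m, int n, int p) {
    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f, beta = 0.0f;
    // Column-major view: (p x n) * (n x m) = (p x m), i.e. (A*B)^T
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                p, m, n,
                &alpha, d_B, p,
                d_A, n,
                &beta, d_C, p);
    cublasDestroy(handle);
}

The d_A, d_B, and d_C buffers allocated in matrixMulWithCuda above could be passed to such a helper directly.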
Fast Fourier transform (for simplicity, the kernel below computes a naive O(N²) DFT rather than a true FFT):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Complex number struct (same layout as cufftComplex; defined here for demonstration)
struct Complex {
    float real;
    float imag;
};
// Dynamically allocate a 1D Complex array: one contiguous buffer plus a
// pointer to each element (mirrors the 2D allocator used earlier)
Complex **alloc1DComplexArray(int size) {
    Complex **arr = (Complex **) malloc(size * sizeof(Complex *));
    arr[0] = (Complex *) malloc(size * sizeof(Complex));
    for (int i = 1; i < size; i++) {
        arr[i] = arr[i - 1] + 1;
    }
    return arr;
}
// Free a 1D Complex array allocated by alloc1DComplexArray
void free1DComplexArray(Complex **arr) {
    free(arr[0]);
    free(arr);
}
// Kernel: each thread computes one output bin of the DFT
__global__ void simpleFFT(Complex *input, Complex *output, int N) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) {
        Complex sum = {0.0f, 0.0f};
        for (int k = 0; k < N; k++) {
            float angle = -2.0f * M_PI * tid * k / N;
            Complex twiddle = {cosf(angle), sinf(angle)};
            sum.real += input[k].real * twiddle.real - input[k].imag * twiddle.imag;
            sum.imag += input[k].real * twiddle.imag + input[k].imag * twiddle.real;
        }
        output[tid] = sum;
    }
}
int main() {
    int N = 16;
    // Allocate and initialize data on the host
    Complex **h_input = alloc1DComplexArray(N);
    for (int i = 0; i < N; i++) {
        h_input[i]->real = (float) i;
        h_input[i]->imag = 0.0f;
    }
    // Allocate memory on the GPU
    Complex *d_input, *d_output;
    cudaMalloc((void **) &d_input, N * sizeof(Complex));
    cudaMalloc((void **) &d_output, N * sizeof(Complex));
    // Copy the host data to the GPU
    cudaMemcpy(d_input, h_input[0], N * sizeof(Complex), cudaMemcpyHostToDevice);
    // Choose block and grid sizes
    int blockSize = 16;
    int gridSize = (N + blockSize - 1) / blockSize;
    // Launch the DFT kernel
    simpleFFT<<<gridSize, blockSize>>>(d_input, d_output, N);
    // Copy the result back to the host
    Complex **h_output = alloc1DComplexArray(N);
    cudaMemcpy(h_output[0], d_output, N * sizeof(Complex), cudaMemcpyDeviceToHost);
    // Print the result (for demonstration; adjust as needed)
    for (int i = 0; i < N; i++) {
        printf("%.2f + %.2fi ", h_output[i]->real, h_output[i]->imag);
    }
    printf("\n");
    // Free GPU memory
    cudaFree(d_input);
    cudaFree(d_output);
    // Free host memory
    free1DComplexArray(h_input);
    free1DComplexArray(h_output);
    return 0;
}
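The kernel above is a teaching DFT; for an actual FFT you would normally use NVIDIA's cuFFT library. A minimal sketch (assumes cuFFT is installed; link with -lcufft):

#include <cufft.h>
#include <cuda_runtime.h>
#include <stdio.h>
int main() {
    const int N = 16;
    cufftComplex h_data[N];
    for (int i = 0; i < N; i++) { h_data[i].x = (float) i; h_data[i].y = 0.0f; }
    cufftComplex *d_data;
    cudaMalloc(&d_data, N * sizeof(cufftComplex));
    cudaMemcpy(d_data, h_data, N * sizeof(cufftComplex), cudaMemcpyHostToDevice);
    cufftHandle plan;
    cufftPlan1d(&plan, N, CUFFT_C2C, 1);               // 1D complex-to-complex plan
    cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD); // in-place forward FFT
    cudaMemcpy(h_data, d_data, N * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
    for (int i = 0; i < N; i++) printf("%.2f + %.2fi\n", h_data[i].x, h_data[i].y);
    cufftDestroy(plan);
    cudaFree(d_data);
    return 0;
}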
Numerical integration:
#include <iostream>
#include <cstdio> // for device-side printf
#include <cuda_runtime.h>
// Kernel: each thread approximates the integral of f(x)/(1+x^2) over one
// row of m+1 sample points using a simple Riemann sum
__global__ void integralKernel(float *f, float *x, float *result, int n, int m) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        float sum = 0.0f;
        for (int j = 0; j <= m; ++j) {
            float xi = x[i * (m + 1) + j];
            float fxi = f[i * (m + 1) + j];
            float div_result = fxi / (1.0f + xi * xi);
            sum += div_result;
            // Print an intermediate value for debugging
            if (j == 0) {
                printf("Thread %d, i = %d, j = %d, xi = %f, fxi = %f, div_result = %f\n",
                       threadIdx.x, i, j, xi, fxi, div_result);
            }
        }
        // Scale by the average sample spacing h = (b - a) / m; note the
        // endpoints are the first and last samples of this thread's own row
        result[i] = sum * (x[i * (m + 1) + m] - x[i * (m + 1)]) / m;
    }
}
int main() {
    // Initialize data
    int n = 1000;
    // m + 1 sample points per row; here m = n - 1, so each row holds n samples
    int m = n - 1;
    float *f = new float[n * (m + 1)];
    float *x = new float[n * (m + 1)];
    float *result = new float[n];
    // Sample f = 1 on an evenly spaced grid over [0, 1]; the exact integral
    // of 1/(1+x^2) over [0, 1] is pi/4 ≈ 0.7854
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j <= m; ++j) {
            f[i * (m + 1) + j] = 1.0f;
            x[i * (m + 1) + j] = (float) j / (float) m;
        }
    }
    // Allocate GPU memory
    float *d_f, *d_x, *d_result;
    cudaMalloc(&d_f, n * (m + 1) * sizeof(float));
    cudaMalloc(&d_x, n * (m + 1) * sizeof(float));
    cudaMalloc(&d_result, n * sizeof(float));
    // Copy the data to GPU memory
    cudaError_t err;
    err = cudaMemcpy(d_f, f, n * (m + 1) * sizeof(float), cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        std::cerr << "cudaMemcpy d_f failed: " << cudaGetErrorString(err) << std::endl;
    }
    err = cudaMemcpy(d_x, x, n * (m + 1) * sizeof(float), cudaMemcpyHostToDevice);
    if (err != cudaSuccess) {
        std::cerr << "cudaMemcpy d_x failed: " << cudaGetErrorString(err) << std::endl;
    }
    // Choose block and grid sizes
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;
    // Launch the integration kernel
    integralKernel<<<gridSize, blockSize>>>(d_f, d_x, d_result, n, m);
    // Copy the results back to CPU memory
    cudaMemcpy(result, d_result, n * sizeof(float), cudaMemcpyDeviceToHost);
    // Print the results
    for (int i = 0; i < n; ++i) {
        std::cout << "result[" << i << "]: " << result[i] << std::endl;
    }
    // Free GPU memory
    cudaFree(d_f);
    cudaFree(d_x);
    cudaFree(d_result);
    // Free CPU memory
    delete[] f;
    delete[] x;
    delete[] result;
    return 0;
}
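For reference, the quadrature the kernel implements is the plain Riemann sum

\[
\int_a^b \frac{f(x)}{1+x^2}\,dx \;\approx\; h \sum_{j=0}^{m} \frac{f(x_j)}{1+x_j^2},
\qquad h = \frac{b-a}{m},
\]

so with f(x) = 1 sampled on [0, 1], every result[i] should land near π/4 ≈ 0.7854.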
Vector dot product:
#include <iostream>
#include <cstdio>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
// Grid-stride loop: each thread accumulates a partial dot product,
// then adds it to the global result atomically
__global__ void dotProduct(float* a, float* b, int n, float* result) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    float sum = 0;
    while (idx < n) {
        sum += a[idx] * b[idx];
        idx += blockDim.x * gridDim.x;
    }
    atomicAdd(result, sum);
}
int main() {
    const int n = 10000;
    float* a_h = new float[n];      // Host array a
    float* b_h = new float[n];      // Host array b
    float* result_h = new float[1]; // Host result
    // Initialize data
    for (int i = 0; i < n; ++i) {
        a_h[i] = 1.0f;
        b_h[i] = 2.0f;
    }
    float* d_a;      // Device array a
    float* d_b;      // Device array b
    float* d_result; // Device result
    cudaMalloc((void**)&d_a, n * sizeof(float));
    cudaMalloc((void**)&d_b, n * sizeof(float));
    cudaMalloc((void**)&d_result, sizeof(float));
    cudaMemcpy(d_a, a_h, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b_h, n * sizeof(float), cudaMemcpyHostToDevice);
    // d_result must start at zero: cudaMalloc does not zero memory,
    // and the kernel accumulates into it with atomicAdd
    cudaMemset(d_result, 0, sizeof(float));
    // Launch with enough threads to cover the array; the grid-stride
    // loop handles any leftover elements
    int blockSize = 256;
    int gridSize = (n + blockSize - 1) / blockSize;
    dotProduct<<<gridSize, blockSize>>>(d_a, d_b, n, d_result);
    cudaMemcpy(result_h, d_result, sizeof(float), cudaMemcpyDeviceToHost);
    printf("Result: %f\n", *result_h); // Expect 20000.0 (10000 * 1.0 * 2.0)
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_result);
    delete[] a_h;
    delete[] b_h;
    delete[] result_h;
    return 0;
}
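If atomic contention on the single result value ever becomes a bottleneck, a common refinement is a per-block shared-memory reduction, so that each block performs only one atomicAdd. A sketch of that variant (a drop-in replacement for the kernel above; assumes blockDim.x is a power of two):

// Each block reduces its threads' partial sums in shared memory,
// then contributes a single atomicAdd to the global result
__global__ void dotProductReduce(const float* a, const float* b, int n, float* result) {
    extern __shared__ float cache[];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    float sum = 0.0f;
    for (; idx < n; idx += blockDim.x * gridDim.x) {
        sum += a[idx] * b[idx];
    }
    cache[threadIdx.x] = sum;
    __syncthreads();
    // Tree reduction within the block
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (threadIdx.x < stride) {
            cache[threadIdx.x] += cache[threadIdx.x + stride];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        atomicAdd(result, cache[0]);
    }
}

It is launched with the dynamic shared-memory size as the third launch parameter:

dotProductReduce<<<gridSize, blockSize, blockSize * sizeof(float)>>>(d_a, d_b, n, d_result);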