I'm just getting started with OpenMP and CUDA, and wrote a small OpenMP multi-GPU programming example:
//Program purpose: multiply every column of a 5x5 matrix by 2, with both the number of OpenMP threads and the number of GPU devices set to 4; the environment is a GPU node provided by Paratera (北京并行科技)
//Main function
#include "head_file.cuh"
int main()
{
//Get the number of GPU devices
int gpu_num;
cudaGetDeviceCount(&gpu_num);
cout<<"gpu线程数:"<<gpu_num<<endl<<endl;
int size=5;
double**M=new double*[size];
for(int i=0;i<size;i++)
M[i]=new double[size];
//Read the matrix from a file
Create_Matrix_From_File(M,"Matrix.txt");
//Print the matrix
cout<<"Original matrix:"<<endl;
Matrix_Display(M,size,size);
//Distribute the columns across the GPUs
int shang=size/gpu_num; //quotient
int yushu=size%gpu_num; //remainder
int *task_num=new int[gpu_num]();
int *task_st=new int[gpu_num]();
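//Example: with size=5 and gpu_num=4 we get shang=1 and yushu=1, so GPU 0 handles 2 columns
//(starting at column 1) and GPUs 1-3 handle 1 column each, matching the log below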
for(int gpu_id=0;gpu_id<gpu_num;gpu_id++)
{
//First part: GPUs that receive one extra column
if(gpu_id<yushu)
{
task_st[gpu_id]=(shang+1)*gpu_id+1;
task_num[gpu_id]=shang+1;
}
//Second part: the remaining GPUs
else
{
task_st[gpu_id]=(shang+1)*yushu+shang*(gpu_id-yushu)+1;
task_num[gpu_id]=shang;
}
}
cout<<endl<<"列任务分配:"<<endl;
for(int gpu_id=0;gpu_id<gpu_num;gpu_id++)
{
cout<<"gpu_id: "<<gpu_id<<" task_st: "<<task_st[gpu_id]<<" task_num: "<<task_num[gpu_id]<<endl;
}
//Allocate pinned host memory / device memory for h_M
double *h_M[gpu_num], *d_M[gpu_num]; //arrays of pointers (variable-length arrays, a compiler extension)
for(int gpu_id=0;gpu_id<gpu_num;gpu_id++)
{
cudaSetDevice(gpu_id);
//Pinned host memory
cudaMallocHost((void**)&h_M[gpu_id],size*task_num[gpu_id]*sizeof(double));
//Device memory
cudaMalloc((void**)&d_M[gpu_id],size*task_num[gpu_id]*sizeof(double));
}
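//Pinned (page-locked) host memory is used because cudaMemcpyAsync only behaves truly
//asynchronously when the host buffer is page-locked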
//Fill h_M
for(int gpu_id=0;gpu_id<gpu_num;gpu_id++)
{
//h_M=M(1:size,task_st:task_st+task_num-1)
for (int i = 1; i <= size; i++)
for(int j=task_st[gpu_id];j<=task_st[gpu_id]+task_num[gpu_id]-1;j++)
h_M[gpu_id][(i-1)*task_num[gpu_id]+j-task_st[gpu_id]]=M[i-1][j-1];
}
//Print h_M
cout<<endl<<"Contents of h_M:"<<endl;
for(int gpu_id=0;gpu_id<gpu_num;gpu_id++)
{
cout<<"***********************************************"<<endl;
cout<<"gpu_id:"<<gpu_id<<endl<<endl;
cout<<"对应的h_M:"<<endl;
for (int i = 1; i <= size; i++)
{
for(int j=task_st[gpu_id];j<=task_st[gpu_id]+task_num[gpu_id]-1;j++)
{
cout<<h_M[gpu_id][(i-1)*task_num[gpu_id]+j-task_st[gpu_id]]<<" ";
}
cout<<endl;
}
}
//Create one CUDA stream per device
cudaStream_t stream[gpu_num];
for(int gpu_id=0;gpu_id<gpu_num;gpu_id++)
{
cudaSetDevice(gpu_id);
cudaStreamCreate(&stream[gpu_id]);
}
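//A stream is bound to the device that was current when it was created,
//so cudaSetDevice must be called before cudaStreamCreate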
dim3 block(8);
dim3 grid((size + block.x - 1) / block.x);
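//grid is sized to cover all `size` columns; each per-GPU launch only needs task_num columns,
//and the surplus threads simply fail the tid<col check inside the kernel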
cout <<endl<< "线程块的大小block.x:" << block.x << endl;
cout << "线程块的个数grid.x:" << grid.x << endl<<endl;
double t=2;
#pragma omp parallel num_threads(gpu_num) \
default(none) shared(size, h_M, d_M, task_st, task_num, grid, block, stream, t)
{
//OpenMP thread id
int tid = omp_get_thread_num();
printf("openmp线程编号:%d\n",tid);
//Bind this thread to its GPU
cudaSetDevice(tid);
//Copy h_M to d_M (host to device)
cudaMemcpyAsync(d_M[tid],h_M[tid],size*task_num[tid]*sizeof(double),cudaMemcpyHostToDevice,stream[tid]);
Col_Mutiple<<<grid, block, 0, stream[tid]>>>(d_M[tid], size, task_num[tid], t);
//Copy d_M back to h_M (device to host)
cudaMemcpyAsync(h_M[tid],d_M[tid],size*task_num[tid]*sizeof(double),cudaMemcpyDeviceToHost,stream[tid]);
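//The copy in, the kernel and the copy out are all issued to the same stream,
//so they execute in that order on the device without any extra host-side synchronization here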
}
//Synchronize every device before reading the results back on the host
for (int gpu_id = 0; gpu_id < gpu_num; gpu_id++)
{
cudaSetDevice(gpu_id);
cudaDeviceSynchronize();
}
//Destroy the streams
for (int gpu_id = 0; gpu_id < gpu_num; gpu_id++)
{
cudaSetDevice(gpu_id);
cudaStreamDestroy(stream[gpu_id]);
}
//Copy h_M back into M
cout<<endl<<"Copy h_M back into M:"<<endl;
for(int gpu_id=0;gpu_id<gpu_num;gpu_id++)
{
//M(1:size,task_st:task_st+task_num-1)=h_M
for (int i = 1; i <= size; i++)
{
for(int j=task_st[gpu_id];j<=task_st[gpu_id]+task_num[gpu_id]-1;j++)
{
M[i-1][j-1]=h_M[gpu_id][(i-1)*task_num[gpu_id]+j-task_st[gpu_id]];
}
}
}
cout<<"更新之后的M:"<<endl;
Matrix_Display(M,size,size);
return 0;
}
//Header file: head_file.cuh
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <fstream>
#include <omp.h>
#include <iomanip>
//#include "cuda_runtime.h"
//#include "device_launch_parameters.h"
//#include "device_functions.h"
//Namespace
using namespace std;
#define cout_width 10
#define cout_decimal_digits 5
//Row-major indexing macro: maps 1-based (i,j) with leading dimension lda to a 0-based linear offset
#define index(i,j,lda) (((i)-1)*(lda)+(j)-1)
void Create_Matrix_From_File(double**x,string matrix);
void Matrix_Display(double** M, int row, int col);
__global__ void Col_Mutiple(double*d_M, int row, int col, double t);
//Subroutine: kernel
#include "head_file.cuh"
//d_M(1:row,1:col)=d_M(1:row,1:col)*t
__global__ void Col_Mutiple(double *d_M,int row, int col, double t)
{
int tid=blockDim.x*blockIdx.x+threadIdx.x;
printf("gpu线程号(总):%d\n\n",tid);
if(tid<col)
{
int j=tid+1;
//d_M(1:row,j)=d_M(1:row,j)*t
for(int i=1;i<=row;i++)
{
d_M[index(i,j,col)]=t*d_M[index(i,j,col)];
printf("gpu线程号(实际):%d 列号:%d i的值:%d j的值:%d\n",tid,j,i,j);
}
}
}
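As an aside, here is a minimal sketch of an element-per-thread variant of the same scaling step (my own addition, not part of the original program); it assumes the same row-major layout and a grid covering row*col threads:
//One thread per matrix entry instead of one thread per column
__global__ void Matrix_Scale_Elem(double *d_M, int row, int col, double t)
{
//0-based linear index into the row-major buffer
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx < row * col)
d_M[idx] *= t; //scaling every entry by t is the same as scaling every column by t
}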
//Subroutine:
#include "head_file.cuh"
//Create the matrix from a file
void Create_Matrix_From_File(double**x,string matrix)
{
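//Expected file format: one entry per line as "row column value" with 1-based indices (see Matrix.txt below)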
//Open the file
ifstream fin(matrix, ios::in);
//Check whether the file opened successfully
if (!fin)
{
cerr << "Failed to open the file!" << endl << endl;
exit(EXIT_FAILURE);
}
else
{
cout << "File opened successfully!" << endl << endl;
}
//Temporaries for the incoming data
int i, j;
double value;
//Read entries until the stream is exhausted
while (fin >> i >> j >> value)
{
x[i-1][j-1] = value;
}
//Close the file
fin.close();
}
//Subroutine
#include "head_file.cuh"
//Print a matrix
void Matrix_Display(double** M, int row, int col)
{
//Output formatting
cout << setiosflags(ios::fixed) << setiosflags(ios::right) << \
setprecision(cout_decimal_digits);
for (int i = 1; i <= row; i++)
{
for (int j = 1; j <= col; j++)
{
cout << setw(cout_width) << M[i-1][j-1] << " ";
}
cout << endl;
}
}
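None of the CUDA runtime calls above check their return codes. A minimal error-checking sketch (the CHECK_CUDA name is my own, not part of the original program) that could wrap calls such as cudaMalloc or cudaMemcpyAsync:
#include <cstdio>
#include <cstdlib>
//Wrap a CUDA runtime call and abort with a readable message if it fails
#define CHECK_CUDA(call) \
do { \
cudaError_t err_ = (call); \
if (err_ != cudaSuccess) { \
fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err_), __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} \
} while (0)
//Usage example: CHECK_CUDA(cudaSetDevice(gpu_id));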
The Matrix.txt file is as follows:
1 1 2
1 2 3
1 3 4
1 4 5
1 5 6
2 1 3
2 2 4
2 3 5
2 4 6
2 5 7
3 1 4
3 2 5
3 3 6
3 4 7
3 5 8
4 1 5
4 2 6
4 3 7
4 4 8
4 5 9
5 1 6
5 2 7
5 3 8
5 4 9
5 5 10
The resulting test.log file (screen output) is as follows:
Number of GPUs: 4
File opened successfully!
Original matrix:
2.00000 3.00000 4.00000 5.00000 6.00000
3.00000 4.00000 5.00000 6.00000 7.00000
4.00000 5.00000 6.00000 7.00000 8.00000
5.00000 6.00000 7.00000 8.00000 9.00000
6.00000 7.00000 8.00000 9.00000 10.00000
Column task distribution:
gpu_id: 0 task_st: 1 task_num: 2
gpu_id: 1 task_st: 3 task_num: 1
gpu_id: 2 task_st: 4 task_num: 1
gpu_id: 3 task_st: 5 task_num: 1
Contents of h_M:
***********************************************
gpu_id:0
h_M for this GPU:
2.00000 3.00000
3.00000 4.00000
4.00000 5.00000
5.00000 6.00000
6.00000 7.00000
***********************************************
gpu_id:1
h_M for this GPU:
4.00000
5.00000
6.00000
7.00000
8.00000
***********************************************
gpu_id:2
h_M for this GPU:
5.00000
6.00000
7.00000
8.00000
9.00000
***********************************************
gpu_id:3
h_M for this GPU:
6.00000
7.00000
8.00000
9.00000
10.00000
Block size block.x: 8
Number of blocks grid.x: 1
OpenMP thread id: 0
OpenMP thread id: 1
OpenMP thread id: 3
OpenMP thread id: 2
GPU thread id (launched): 0
GPU thread id (launched): 1
GPU thread id (launched): 2
GPU thread id (launched): 3
GPU thread id (launched): 4
GPU thread id (launched): 5
GPU thread id (launched): 6
GPU thread id (launched): 7
GPU thread id (active): 0 column: 1 i: 1 j: 1
GPU thread id (active): 1 column: 2 i: 1 j: 2
GPU thread id (active): 0 column: 1 i: 2 j: 1
GPU thread id (active): 1 column: 2 i: 2 j: 2
GPU thread id (active): 0 column: 1 i: 3 j: 1
GPU thread id (active): 1 column: 2 i: 3 j: 2
GPU thread id (active): 0 column: 1 i: 4 j: 1
GPU thread id (active): 1 column: 2 i: 4 j: 2
GPU thread id (active): 0 column: 1 i: 5 j: 1
GPU thread id (active): 1 column: 2 i: 5 j: 2
Copy h_M back into M:
Updated M:
4.00000 6.00000 8.00000 10.00000 12.00000
6.00000 8.00000 10.00000 12.00000 14.00000
8.00000 10.00000 12.00000 14.00000 16.00000
10.00000 12.00000 14.00000 16.00000 18.00000
12.00000 14.00000 16.00000 18.00000 20.00000