宏定义:
- 一维网络线程:
这里是两个block(线程块),block为线性排列;每个block中的线程(thread)也是线性排列。 - 二维网络线程/三维网络线程(z = 1)
图中为一个线程块内部的线程(这里不用管内部的索引顺序,我的这张图片只是为了说明它是二维排列的,实际的索引顺序不要看这张图.).定义为:
dim3 threads_per_block(4,4,1);
- 三维网络线程(z != 1) 留白
首先我先介绍一下,在一维的网络线程下,网络总线程数与数据量大小关系的三种情况:
一维网络线程
这里赘述一维网络线程是为了抛砖引玉,如此循序渐进地就可以推出二维网络线程的循环思路了.
网络总线程数大于数据量
这时候直接可以用threadIdx.x + blockDim.x*blockIdx.x
遍历所有了.
注意: 不要越界访问数组索引,使用if
网络总线程数小于数据量
这时候,引入for
int stride = gridDim.x*blockDim.x;
for(int i=threadIdx.x + blockDim.x*blockIdx.x;i<len(Array);i+=stride)
{
/*
*/
}
二维网络线程
这里我把今天上午成功解决问题的代码放在这里,花太多时间讲思路对我来说没必要,主要思路也很简单。
但首先要理解的一点是:我这里是把一个一维数组看成二维数组来进行操作的。
#define _CRT_SECURE_NO_WARNINGS
#include "stdio.h"
#include "cuda_runtime.h"
#include "device_functions.h"
#include "device_launch_parameters.h"
#include "driver_types.h"
#include "assert.h"
#define Num 64
/*
@Author Chi
@brief Initializes a 1D array (viewed as a 2D matrix, z = 1) using a grid that may have fewer threads_per_block and fewer num_of_blocks than the data requires.
@param N:dim(matrix)
@param <<<>>>
@return void
*/
__global__ void initArray(int* a, int N, int initValue);
/*
@Author Chi
@brief show the ArrayValue
@param N:dim(matrix)
@param <<<>>>
@return void
*/
__global__ void viewArray(int* a, int N);
inline cudaError_t checkCuda(cudaError_t result);
//矩阵乘法GPU计算(dim3(z=1))
//Matrix multiplication on the GPU with a 2D (z = 1) grid/block layout.
/*
@Author Chi
@brief Computes c = a * b for Num x Num matrices stored as flat row-major
       arrays. Uses nested 2D grid-stride loops, so the kernel is correct
       for any launch configuration, including grids with fewer total
       threads than matrix elements (e.g. the <<<2, threads_per_block>>>
       launch in main).
@param a: left operand, Num*Num ints, row-major
@param b: right operand, Num*Num ints, row-major
@param c: output, Num*Num ints, row-major
@return void
*/
__global__ void matrixMulGPU(int* a, int* b, int* c)
{
    // Starting coordinates of this thread, and the span covered by the
    // whole grid in each dimension (the grid-stride).
    int startRow = blockIdx.x * blockDim.x + threadIdx.x;
    int startCol = blockIdx.y * blockDim.y + threadIdx.y;
    int strideRow = gridDim.x * blockDim.x;
    int strideCol = gridDim.y * blockDim.y;

    // Standard nested grid-stride loops: every (row, col) output element is
    // computed and stored exactly once. The previous diagonal + L-shaped
    // traversal produced the same final result but recomputed the diagonal
    // elements several times and issued a global store to c on every k
    // iteration of the first inner loop.
    for (int row = startRow; row < Num; row += strideRow)
    {
        for (int col = startCol; col < Num; col += strideCol)
        {
            int val = 0;
            for (int k = 0; k < Num; ++k)
                val += a[row * Num + k] * b[k * Num + col];
            c[row * Num + col] = val; // single store per output element
        }
    }
}
int main()
{
    // The 2D matrices a and b are flattened into 1D arrays of Num*Num ints.
    int* a, * b, * c_gpu;
    int size = Num * Num * sizeof(int);
    checkCuda(cudaMallocManaged(&a, size));
    checkCuda(cudaMallocManaged(&b, size));
    checkCuda(cudaMallocManaged(&c_gpu, size));
    dim3 threads_per_block(16, 16, 1);
    // Ceil-style block count so the grid covers all Num x Num elements.
    dim3 num_of_blocks((Num / threads_per_block.x) + 1, (Num / threads_per_block.y) + 1, 1);
    initArray << <num_of_blocks, threads_per_block >> > (a, Num, 3);
    initArray << <num_of_blocks, threads_per_block >> > (b, Num, 5);
    //matrixMulGPU << <num_of_blocks, threads_per_block >> > (a, b, c_gpu);
    // Deliberately launch with too few blocks (2 in x, 1 in y) so the
    // grid-stride loops inside the kernel are actually exercised.
    matrixMulGPU << <2, threads_per_block >> > (a, b, c_gpu);
    checkCuda(cudaGetLastError());      // catch launch-configuration errors
    checkCuda(cudaDeviceSynchronize()); // catch asynchronous execution errors
    viewArray << <num_of_blocks, threads_per_block >> > (c_gpu, Num);
    checkCuda(cudaGetLastError());
    // Synchronize before freeing/exiting so the device-side printf output
    // from viewArray is fully flushed; the original exited without this.
    checkCuda(cudaDeviceSynchronize());
    checkCuda(cudaFree(a));
    checkCuda(cudaFree(b));
    checkCuda(cudaFree(c_gpu));
    return 0;
}
//@param N: matrix dimension (the flat array holds N*N ints)
//Fills an N x N matrix (flattened to 1D) with initValue using nested 2D
//grid-stride loops, so any launch configuration — even one with fewer
//threads than elements — initializes every element.
__global__ void initArray(int* a, int N, int initValue)
{
    int startRow = blockIdx.x * blockDim.x + threadIdx.x;
    int startCol = blockIdx.y * blockDim.y + threadIdx.y;
    int strideRow = gridDim.x * blockDim.x;
    int strideCol = gridDim.y * blockDim.y;
    // Each element is written exactly once. The previous diagonal +
    // L-shaped walk covered the same cells but stored the diagonal
    // elements multiple times.
    for (int row = startRow; row < N; row += strideRow)
        for (int col = startCol; col < N; col += strideCol)
            a[row * N + col] = initValue;
}
//Prints every element of an N x N matrix (flattened to 1D) as
//"flatIndex: value". Nested 2D grid-stride loops guarantee full coverage
//for any launch configuration. The previous diagonal + L-shaped traversal
//printed each diagonal element three times per outer step, duplicating
//lines in the dump; here each element is printed exactly once.
__global__ void viewArray(int* a, int N)
{
    int startRow = blockIdx.x * blockDim.x + threadIdx.x;
    int startCol = blockIdx.y * blockDim.y + threadIdx.y;
    int strideRow = gridDim.x * blockDim.x;
    int strideCol = gridDim.y * blockDim.y;
    for (int row = startRow; row < N; row += strideRow)
        for (int col = startCol; col < N; col += strideCol)
            printf("%d: %d \n", row * N + col, a[row * N + col]);
}
// Report any CUDA runtime error to stderr and trap via assert; the status
// is passed back unchanged so calls can be chained or inspected.
cudaError_t checkCuda(cudaError_t result)
{
    if (result == cudaSuccess)
        return result;
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
    return result;
}
到此结束