#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
using namespace std;
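// Example: single-channel 2D convolution of a 1920x1080 float image on the GPU.
// One CUDA thread computes one output pixel; pixels outside the image are
// treated as zero (zero padding), so the output has the same size as the input.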
int getThreadNum()
{
    cudaDeviceProp prop;
    int count = 0;
    cudaGetDeviceCount(&count);
    std::cout << "Number of GPUs: " << count << '\n';
    cudaGetDeviceProperties(&prop, 0);
    std::cout << "Max threads per block: " << prop.maxThreadsPerBlock << '\n';
    std::cout << "Max grid size: " << prop.maxGridSize[0] << '\t' << prop.maxGridSize[1] << '\t' << prop.maxGridSize[2] << '\n';
    return prop.maxThreadsPerBlock;
}
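// Naive convolution kernel: every thread reads its whole kernelSize x kernelSize
// neighbourhood straight from global memory. This is a sketch kept simple for
// clarity, not a tuned implementation (no shared-memory tiling, no constant
// memory for the kernel coefficients).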
__global__ void conv(float *imgGpu, float *kernelGpu, float *resultGpu, int width, int height, int kernelSize)
{
    // One thread per output pixel.
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    if (id >= width * height)
    {
        return;
    }
    int row = id / width;
    int col = id % width; // fix: the column index is id % width, not id / height
    float sum = 0.0f;
    for (int i = 0; i < kernelSize; ++i)
    {
        for (int j = 0; j < kernelSize; ++j)
        {
            int curRow = row - kernelSize / 2 + i;
            int curCol = col - kernelSize / 2 + j;
            float imgValue = 0.0f;
            // Out-of-range neighbours keep imgValue at 0 (zero padding).
            if (curRow >= 0 && curCol >= 0 && curRow < height && curCol < width)
            {
                imgValue = imgGpu[curRow * width + curCol];
            }
            sum += kernelGpu[i * kernelSize + j] * imgValue;
        }
    }
    // Accumulate in a register and write once, so resultGpu needs no zero-initialization.
    resultGpu[id] = sum;
}
int main()
{
    const int width = 1920;
    const int height = 1080;
    float *img = new float[width * height];
    for (int row = 0; row < height; ++row)
    {
        for (int col = 0; col < width; ++col)
        {
            img[col + row * width] = (col + row) % 255;
        }
    }

    const int kernelSize = 3;
    float *kernel = new float[kernelSize * kernelSize];
    for (int i = 0; i < kernelSize * kernelSize; ++i)
    {
        kernel[i] = i % kernelSize - 1;
    }

    for (int row = 0; row < 10; ++row)
    {
        for (int col = 0; col < 10; ++col)
        {
            std::cout << img[col + row * width] << '\t';
        }
        std::cout << '\n';
    }
    for (int i = 0; i < kernelSize * kernelSize; ++i)
    {
        std::cout << kernel[i] << '\t';
    }
    float *imgGpu = nullptr;
    float *kernelGpu = nullptr;
    float *resultGpu = nullptr;
    cudaMalloc(&imgGpu, height * width * sizeof(float));
    cudaMalloc(&kernelGpu, kernelSize * kernelSize * sizeof(float)); // fix: was cudaMalloc(&kernel, ...), which passed the host pointer
    cudaMalloc(&resultGpu, height * width * sizeof(float));          // fix: the result buffer was never allocated
    cudaMemcpy(imgGpu, img, width * height * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(kernelGpu, kernel, kernelSize * kernelSize * sizeof(float), cudaMemcpyHostToDevice);

    const int threadNum = getThreadNum();
    const int blockNum = (width * height + threadNum - 1) / threadNum; // one thread per pixel, rounded up
    conv<<<blockNum, threadNum>>>(imgGpu, kernelGpu, resultGpu, width, height, kernelSize);

    float *showImg = new float[height * width];
    // cudaMemcpy on the default stream waits for the kernel to finish, then copies the result back.
    cudaMemcpy(showImg, resultGpu, width * height * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(imgGpu);
    cudaFree(kernelGpu);
    cudaFree(resultGpu);
    delete[] showImg;
    delete[] kernel;
    delete[] img;
    return 0;
}
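// A minimal build-and-run sketch, assuming the file is saved as conv.cu and
// nvcc from the CUDA toolkit is on the PATH:
//   nvcc conv.cu -o conv
//   ./conv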