Below are the GPU compute capabilities and the configuration for mixed-programming test example 1 (tested here on a GTX 1050). The example computes the sum of squares of an array and the cube of each array element. (If you run into problems while using the example, feel free to discuss them.)
Compute capabilities of some NVIDIA GPUs (if you are unsure which line applies to your card, see the query sketch after this list):
Tesla V100
# ARCH= -gencode arch=compute_70,code=[sm_70,compute_70]
GTX 1080, GTX 1070, GTX 1060, GTX 1050, GTX 1030, Titan Xp, Tesla P40, Tesla P4
# ARCH= -gencode arch=compute_61,code=sm_61 -gencode arch=compute_61,code=compute_61
GP100/Tesla P100 DGX-1
# ARCH= -gencode arch=compute_60,code=sm_60
For Jetson Tx1 uncomment:
# ARCH= -gencode arch=compute_51,code=[sm_51,compute_51]
For Jetson Tx2 or Drive-PX2 uncomment:
# ARCH= -gencode arch=compute_62,code=[sm_62,compute_62]
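If you are not sure which compute capability your card has, a minimal sketch like the following (assuming a single visible device at index 0) queries it at runtime with cudaGetDeviceProperties, so you can pick the matching -gencode line above; on a GTX 1050 it should report 6.1:

#include <cstdio>
#include "cuda_runtime.h"

int main()
{
    cudaDeviceProp prop;
    // Query the properties of device 0 and report its compute capability.
    cudaError_t err = cudaGetDeviceProperties(&prop, 0);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceProperties failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}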
CMakeLists.txt configuration for building against the CUDA libraries:
cmake_minimum_required(VERSION 2.8)
project(cudafilter2d)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
# OpenCV is needed because the code uses getTickCount()/getTickFrequency() for timing.
find_package(OpenCV 3.4 REQUIRED)
INCLUDE_DIRECTORIES(${OpenCV_INCLUDE_DIRS})
# find_package(CUDA) already loads the FindCUDA module, so a hard-coded
# INCLUDE(/usr/share/cmake-3.5/Modules/FindCUDA.cmake) is not needed.
find_package(CUDA REQUIRED)
# The -gencode flags must match the GPU's compute capability (6.1 for the GTX 1050).
set(CUDA_NVCC_FLAGS -gencode arch=compute_61,code=sm_61;-G;-g)
CUDA_ADD_EXECUTABLE(main main.cpp filter.cu filter.h)
target_link_libraries(main ${OpenCV_LIBS})
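Assuming CMake, OpenCV, and the CUDA toolkit are installed, a typical out-of-source build with this CMakeLists.txt is to create a build directory, run cmake on the project directory from inside it, then run make and execute the generated main binary; the exact steps may vary with your CMake version.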
Test program:
main.cpp:
#include <stdio.h>
#include "filter.h"
#include <opencv2/opencv.hpp>
#include <iostream>
#include <future>
using namespace cv;
using namespace std;

int main()
{
    float data[96], data1[96];
    for (int i = 0; i < 96; i++)
        data[i] = float(i);

    double t1 = (double)getTickCount();
    CUDA_cube();
    // CPU reference for comparison:
    //for (int i = 0; i < 96; i++)
    //{
    //    data1[i] = data[i] * data[i] * data[i];
    //}
    //CUDA_square();
    double t2 = ((double)getTickCount() - t1) / getTickFrequency();
    cout << t2 << endl;
    return 0;
}
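The commented-out loop above is the CPU reference for the cube computation. For a rough CPU-vs-GPU comparison, a standalone variant of main.cpp along these lines could be used (a sketch only: cpu_cube is a hypothetical helper, and the GPU timing includes cudaMalloc/cudaMemcpy overhead because CUDA_cube() allocates and copies internally):

#include <cstdio>
#include <opencv2/opencv.hpp>
#include "filter.h"

// CPU reference: cube every element of the input array.
static void cpu_cube(const float *in, float *out, int n)
{
    for (int i = 0; i < n; i++)
        out[i] = in[i] * in[i] * in[i];
}

int main()
{
    float in[96], out[96];
    for (int i = 0; i < 96; i++)
        in[i] = float(i);

    double t0 = (double)cv::getTickCount();
    cpu_cube(in, out, 96);
    double cpu_s = ((double)cv::getTickCount() - t0) / cv::getTickFrequency();

    t0 = (double)cv::getTickCount();
    CUDA_cube();   // includes device allocation and transfers, not just the kernel
    double gpu_s = ((double)cv::getTickCount() - t0) / cv::getTickFrequency();

    printf("cpu: %f s, gpu (incl. transfers): %f s\n", cpu_s, gpu_s);
    return 0;
}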
filter.h:
#ifndef FILTER_H
#define FILTER_H
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <opencv2/opencv.hpp>
using namespace cv;
extern "C" void CUDA_square();
extern "C" void CUDA_cube();
#endif
filter.cu:
#include "filter.h"
__global__ void square (int *data,int *sum)
{
//int i = threadIdx.x;
for(int i=0;i<100;i++)
{
sum[0] += data[i]*data[i];
}
}
__global__ void cube(float * d_out, float * d_in)
{
int idx = threadIdx.x;
//for(int idx=0;idx<96;idx++)
//{
float f = d_in[idx];
d_out[idx] = f*f*f;
//}
}
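As written, cube indexes only with threadIdx.x, so it works for a single block at most. For arrays larger than one block, the usual pattern combines the block and thread indices and adds a bounds check; a sketch (cube_large is a hypothetical name, not part of the example above):

__global__ void cube_large(float *d_out, const float *d_in, int n)
{
    // Global index across all blocks; the guard keeps extra threads in the
    // last block from reading or writing past the end of the array.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n)
    {
        float f = d_in[idx];
        d_out[idx] = f * f * f;
    }
}

// Example launch: 256 threads per block, enough blocks to cover n elements.
// cube_large<<<(n + 255) / 256, 256>>>(d_out, d_in, n);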
void CUDA_cube()
{
    const int ARRAY_SIZE = 96;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++) {
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];

    // declare GPU memory pointers
    float *d_in;
    float *d_out;

    // allocate GPU memory
    cudaMalloc((void**)&d_in, ARRAY_BYTES);
    cudaMalloc((void**)&d_out, ARRAY_BYTES);

    // transfer the array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

    // launch the kernel: one block of ARRAY_SIZE threads
    double t1 = (double)getTickCount();
    cube<<<1, ARRAY_SIZE>>>(d_out, d_in);
    // kernel launches are asynchronous; synchronize so the timing covers execution
    cudaDeviceSynchronize();
    double t2 = ((double)getTickCount() - t1) / getTickFrequency();
    printf("time:%f\n", t2);

    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cube kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
    }

    // copy back the result array to the CPU
    cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);

    // print out the resulting array
    // for (int i = 0; i < ARRAY_SIZE; i++) {
    //     printf("%f\n", h_out[i]);
    // }

    cudaFree(d_in);
    cudaFree(d_out);
}
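Rather than checking each CUDA call by hand, a small helper macro can wrap the runtime calls above. This is only a sketch; CUDA_CHECK is an assumed name, not something provided by the CUDA toolkit:

#include <cstdio>
#include <cstdlib>
#include "cuda_runtime.h"

// Wraps a CUDA runtime call, prints the error string, and aborts on failure.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err__ = (call);                                           \
        if (err__ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                       \
                    cudaGetErrorString(err__), __FILE__, __LINE__);           \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Usage:
// CUDA_CHECK(cudaMalloc((void**)&d_in, ARRAY_BYTES));
// CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));
// cube<<<1, ARRAY_SIZE>>>(d_out, d_in);
// CUDA_CHECK(cudaGetLastError());
// CUDA_CHECK(cudaDeviceSynchronize());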
void CUDA_square()
{
    int data[100];
    int result[1];
    for (int i = 0; i < 100; i++)
        data[i] = i;

    int *gpudata, *gpusum;
    cudaMalloc((void**)&gpudata, sizeof(int) * 100);
    cudaMalloc((void**)&gpusum, sizeof(int));
    cudaMemcpy(gpudata, data, sizeof(int) * 100, cudaMemcpyHostToDevice);
    // the accumulator must start at zero, otherwise the atomicAdds sum onto garbage
    cudaMemset(gpusum, 0, sizeof(int));

    // one block of 100 threads, one element per thread
    square<<<1, 100>>>(gpudata, gpusum);

    // cudaMemcpy waits for the kernel to finish before copying the result back
    cudaMemcpy(result, gpusum, sizeof(int), cudaMemcpyDeviceToHost);
    printf("result:%d\n", result[0]);

    cudaFree(gpudata);
    cudaFree(gpusum);
}
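atomicAdd is the simplest way to make the sum correct, but for larger arrays a shared-memory tree reduction is the more common pattern. A sketch under the assumptions that square_reduce is a hypothetical kernel name, the whole array fits in one block, and the block size is a power of two:

// Sum of squares with a shared-memory tree reduction inside a single block.
__global__ void square_reduce(const int *data, int *sum, int n)
{
    extern __shared__ int partial[];
    int tid = threadIdx.x;

    // Each thread squares its own element (threads past the end contribute 0).
    partial[tid] = (tid < n) ? data[tid] * data[tid] : 0;
    __syncthreads();

    // Tree reduction: halve the active threads each step, adding pairs of partial sums.
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride)
            partial[tid] += partial[tid + stride];
        __syncthreads();
    }

    // Thread 0 holds the block's total.
    if (tid == 0)
        sum[0] = partial[0];
}

// Example launch for the 100-element array above (128 threads, 128 ints of shared memory):
// square_reduce<<<1, 128, 128 * sizeof(int)>>>(gpudata, gpusum, 100);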