Requirement: compute the sum of all elements of an integer array int array[10000001] and measure the cost of the computation part only.
Test platform: ThinkPad T450, Ubuntu 16.04
Method 1: brute-force summation on the CPU
Code:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

#define N 10000001

/* Returns a monotonic timestamp in microseconds. */
int64_t getClockTimeUs() {
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
        return ts.tv_sec * 1000000LL + ts.tv_nsec / 1000L;
    }
    return 0;
}

int main() {
    int *src = (int *)malloc(N * sizeof(int));
    int result = 0;    /* must be initialized before accumulating */
    for (int i = 0; i < N; i++) src[i] = 1;

    int64_t start = getClockTimeUs();
    for (int k = 0; k < N; k++) {
        result += src[k];
    }
    int64_t end = getClockTimeUs();

    printf("result %d, cost %ld ms\n", result, (long)((end - start) / 1000));
    free(src);
    return 0;
}
Compile:
gcc sum_by_cpu.c -o sum_cpu
Result:
result 10000001, cost 29 ms
Method 2: summation with OpenMP
Code:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <omp.h>

#define N 10000001

/* Returns a monotonic timestamp in microseconds. */
int64_t getClockTimeUs() {
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
        return ts.tv_sec * 1000000LL + ts.tv_nsec / 1000L;
    }
    return 0;
}

int main() {
    int *src = (int *)malloc(N * sizeof(int));
    int result = 0;
    for (int i = 0; i < N; i++) src[i] = 1;

    int64_t start = getClockTimeUs();
    /* Split the loop across threads; each thread accumulates a private
       partial sum, which the reduction clause combines into result. */
    #pragma omp parallel for default(shared) reduction(+:result)
    for (int k = 0; k < N; k++) {
        result += src[k];
    }
    int64_t end = getClockTimeUs();

    printf("result %d, cost %ld ms\n", result, (long)((end - start) / 1000));
    free(src);
    return 0;
}
Compile:
gcc sum_by_openmp.c -fopenmp -o sum_openmp
Result:
result 10000001, cost 14 ms
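The measured time depends on how many threads the OpenMP runtime spawns (by default, typically one per logical core). To see how the result scales, the thread count can be pinned from the shell before running, for example:
OMP_NUM_THREADS=2 ./sum_openmp
OMP_NUM_THREADS=4 ./sum_openmp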
Method 3: summation with CUDA, using a parallel reduction. Each thread block first reduces its slice of the array into shared memory and writes one partial sum per block; a second kernel then reduces those per-block partial sums down to a single value.
Code:
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include "cuda_runtime.h"

#define N 10000001
#define ThreadsPerBlock 1024

/* Stage 1: each block reduces its share of src into shared memory and
   writes one partial sum per block into result[blockIdx.x]. */
__global__ void sum(int *src, int size, int *result) {
    __shared__ int cache[ThreadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cachedIndex = threadIdx.x;

    /* Grid-stride loop: each thread accumulates every
       (gridDim.x * blockDim.x)-th element. */
    int temp = 0;
    while (tid < size) {
        temp += src[tid];
        tid += gridDim.x * blockDim.x;
    }
    cache[cachedIndex] = temp;
    __syncthreads();

    /* Tree reduction in shared memory; assumes blockDim.x is a power of two. */
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cachedIndex < i)
            cache[cachedIndex] += cache[cachedIndex + i];
        __syncthreads();
        i /= 2;
    }
    if (cachedIndex == 0)
        result[blockIdx.x] = cache[0];
}

/* Stage 2: a single block reduces the per-block partial sums into result[0];
   again assumes blockDim.x (here BlocksPerGrid) is a power of two. */
__global__ void sum2(int *result) {
    int cachedIndex = threadIdx.x;
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cachedIndex < i)
            result[cachedIndex] += result[cachedIndex + i];
        __syncthreads();
        i /= 2;
    }
}

int main()
{
    /* One block per ThreadsPerBlock elements (rounded up), capped at
       ThreadsPerBlock so that sum2 can finish the reduction with a single
       block. For N = 10000001 this gives BlocksPerGrid = 1024, a power of
       two as required by the reductions above. */
    int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock;
    BlocksPerGrid = (BlocksPerGrid > ThreadsPerBlock) ? ThreadsPerBlock : BlocksPerGrid;
    printf("BlocksPerGrid %d, ThreadsPerBlock %d\n", BlocksPerGrid, ThreadsPerBlock);

    int *src = (int *)malloc(N * sizeof(int));
    int result = 0;

    // running on CPU
    for (int i = 0; i < N; i++) src[i] = 1;

    // running on GPU
    int *dev_src;
    int *dev_result;
    cudaMalloc((void**)&dev_src, N * sizeof(int));
    cudaMalloc((void**)&dev_result, BlocksPerGrid * sizeof(int));
    cudaMemcpy(dev_src, src, N * sizeof(int), cudaMemcpyHostToDevice);

    sum<<<BlocksPerGrid, ThreadsPerBlock>>>(dev_src, N, dev_result);
    sum2<<<1, BlocksPerGrid>>>(dev_result);

    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(cudaStatus));
    }

    /* The blocking device-to-host copy implicitly waits for both kernels. */
    cudaMemcpy((void *)&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    // check result
    printf("result %d\n", result);

    cudaFree(dev_src);
    cudaFree(dev_result);
    free(src);
    return 0;
}
Compile:
nvcc sum_by_cuda.cu -o sum_cuda
Result: the CPU and GPU execute asynchronously with respect to each other, so the GPU cost cannot simply be measured with the CPU-side timer used above. nvprof can be used to inspect it instead.
$ nvprof ./sum_cuda
BlocksPerGrid 1024, ThreadsPerBlock 1024
==5706== NVPROF is profiling process 5706, command: ./sum_cuda
result 10000001
==5706== Profiling application: ./sum_cuda
==5706== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   90.34%  27.215ms         1  27.215ms  27.215ms  27.215ms  [CUDA memcpy HtoD]
                    9.63%  2.9010ms         1  2.9010ms  2.9010ms  2.9010ms  sum(int*, int, int*)
                    0.02%  6.7840us         1  6.7840us  6.7840us  6.7840us  sum2(int*)
                    0.01%  1.7920us         1  1.7920us  1.7920us  1.7920us  [CUDA memcpy DtoH]
      API calls:   76.90%  114.87ms         2  57.436ms  150.59us  114.72ms  cudaMalloc
                   20.34%  30.378ms         2  15.189ms  3.6222ms  26.755ms  cudaMemcpy
                    2.22%  3.3087ms         2  1.6544ms  243.07us  3.0657ms  cudaFree
                    0.37%  559.65us        97  5.7690us     164ns  246.74us  cuDeviceGetAttribute
                    0.08%  116.75us         1  116.75us  116.75us  116.75us  cuDeviceTotalMem
                    0.05%  78.535us         1  78.535us  78.535us  78.535us  cuDeviceGetName
                    0.03%  43.591us         2  21.795us  9.1640us  34.427us  cudaLaunchKernel
                    0.00%  5.1410us         1  5.1410us  5.1410us  5.1410us  cudaDeviceSynchronize
                    0.00%  4.2410us         1  4.2410us  4.2410us  4.2410us  cuDeviceGetPCIBusId
                    0.00%  2.7160us         3     905ns     176ns  1.5650us  cuDeviceGetCount
                    0.00%  1.3010us         2     650ns     362ns     939ns  cuDeviceGet
                    0.00%     290ns         1     290ns     290ns     290ns  cuDeviceGetUuid
                    0.00%     283ns         1     283ns     283ns     283ns  cudaGetLastError
The two kernels sum() and sum2() are themselves very cheap; the bulk of the cost sits in cudaMalloc() and cudaMemcpy().
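As an alternative to nvprof, the kernel time can also be measured from the host with CUDA events. A minimal sketch of how the two kernel launches above could be bracketed (shown for illustration; not part of the original measurement):
cudaEvent_t evStart, evStop;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);
cudaEventRecord(evStart, 0);                  // record on the default stream before the launches
sum<<<BlocksPerGrid, ThreadsPerBlock>>>(dev_src, N, dev_result);
sum2<<<1, BlocksPerGrid>>>(dev_result);
cudaEventRecord(evStop, 0);                   // record after the launches
cudaEventSynchronize(evStop);                 // wait until the GPU has reached evStop
float ms = 0.0f;
cudaEventElapsedTime(&ms, evStart, evStop);   // elapsed GPU time in milliseconds
printf("kernels cost %.3f ms\n", ms);
cudaEventDestroy(evStart);
cudaEventDestroy(evStop);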
Comparison and summary:
In terms of raw compute time alone, CUDA >> OpenMP > CPU. This is easy to understand: CUDA was built for parallel computation, while OpenMP essentially provides a convenient way to spawn multiple threads and keeps the code compact. However, moving large buffers between the CPU and GPU is genuinely expensive and can cancel out the time saved by CUDA's parallelism: in the profile above, the host-to-device copy alone takes about 27 ms, roughly as long as the entire 29 ms CPU loop. This is worth keeping in mind in practice.
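Since the host-to-device copy dominates, one common mitigation (not measured in this article) is to allocate the host buffer as page-locked memory with cudaMallocHost(), which generally gives higher copy bandwidth than a malloc()'d buffer. A minimal, self-contained sketch of that allocation pattern:
#include <stdio.h>
#include "cuda_runtime.h"
#define N 10000001
int main() {
    int *src, *dev_src;
    cudaMallocHost((void**)&src, N * sizeof(int));   // page-locked (pinned) host buffer
    for (int i = 0; i < N; i++) src[i] = 1;
    cudaMalloc((void**)&dev_src, N * sizeof(int));
    // this copy is typically faster from pinned memory than from pageable memory
    cudaMemcpy(dev_src, src, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaFree(dev_src);
    cudaFreeHost(src);                               // pinned memory must be freed with cudaFreeHost()
    return 0;
}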
Copyright notice: this is an original article by CSDN blogger "BeALinuxGraphicCoder", licensed under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original link: https://blog.youkuaiyun.com/weixin_42263483/article/details/89736603