Requirement: compute the sum of all elements of an integer array int array[10000001] and measure the cost of the computation part only.
Test platform: ThinkPad T450, Ubuntu 16.04
Method 1: brute-force summation on the CPU
Code:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

#define N 10000001

/* Returns a monotonic timestamp in microseconds. */
int64_t getClockTimeUs() {
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
        return ts.tv_sec * 1000000LL + ts.tv_nsec / 1000L;
    }
    return 0;
}

int main() {
    int *src = (int *)malloc(N * sizeof(int));
    int result = 0;    /* must be initialized before accumulating */
    for (int i = 0; i < N; i++) src[i] = 1;

    int64_t start = getClockTimeUs();
    for (int k = 0; k < N; k++) {
        result += src[k];
    }
    int64_t end = getClockTimeUs();

    printf("result %d, cost %ld ms\n", result, (long)((end - start) / 1000));
    free(src);
    return 0;
}
Compile:
gcc sum_by_cpu.c -o sum_cpu
Result:
result 10000001, cost 29 ms
Method 2: summation with OpenMP
Code:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <omp.h>

#define N 10000001

/* Returns a monotonic timestamp in microseconds. */
int64_t getClockTimeUs() {
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
        return ts.tv_sec * 1000000LL + ts.tv_nsec / 1000L;
    }
    return 0;
}

int main() {
    int *src = (int *)malloc(N * sizeof(int));
    int result = 0;
    for (int i = 0; i < N; i++) src[i] = 1;

    int64_t start = getClockTimeUs();
    /* Split the loop across threads; each thread accumulates a private
       partial sum, which the reduction clause combines into result. */
    #pragma omp parallel for default(shared) reduction(+:result)
    for (int k = 0; k < N; k++) {
        result += src[k];
    }
    int64_t end = getClockTimeUs();

    printf("result %d, cost %ld ms\n", result, (long)((end - start) / 1000));
    free(src);
    return 0;
}
Compile:
gcc sum_by_openmp.c -fopenmp -o sum_openmp
Result:
result 10000001, cost 14 ms
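The measured time depends on how many threads the OpenMP runtime spawns (by default, typically one per logical core). To see how the result scales, the thread count can be pinned from the shell before running, for example:
OMP_NUM_THREADS=2 ./sum_openmp
OMP_NUM_THREADS=4 ./sum_openmp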
Method 3: summation with CUDA, using a parallel reduction. Each thread block first reduces its slice of the array into shared memory and writes one partial sum per block; a second kernel then reduces those per-block partial sums down to a single value.
Code:
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include "cuda_runtime.h"

#define N 10000001
#define ThreadsPerBlock 1024

/* Stage 1: each block reduces its share of src into shared memory and
   writes one partial sum per block into result[blockIdx.x]. */
__global__ void sum(int *src, int size, int *result) {
    __shared__ int cache[ThreadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cachedIndex = threadIdx.x;

    /* Grid-stride loop: each thread accumulates every
       (gridDim.x * blockDim.x)-th element. */
    int temp = 0;
    while (tid < size) {
        temp += src[tid];
        tid += gridDim.x * blockDim.x;
    }
    cache[cachedIndex] = temp;
    __syncthreads();

    /* Tree reduction in shared memory; assumes blockDim.x is a power of two. */
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cachedIndex < i)
            cache[cachedIndex] += cache[cachedIndex + i];
        __syncthreads();
        i /= 2;
    }
    if (cachedIndex == 0)
        result[blockIdx.x] = cache[0];
}

/* Stage 2: a single block reduces the per-block partial sums into result[0];
   again assumes blockDim.x (here BlocksPerGrid) is a power of two. */
__global__ void sum2(int *result) {
    int cachedIndex = threadIdx.x;
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cachedIndex < i)
            result[cachedIndex] += result[cachedIndex + i];
        __syncthreads();
        i /= 2;
    }
}

int main()
{
    /* One block per ThreadsPerBlock elements (rounded up), capped at
       ThreadsPerBlock so that sum2 can finish the reduction with a single
       block. For N = 10000001 this gives BlocksPerGrid = 1024, a power of
       two as required by the reductions above. */
    int BlocksPerGrid = (N + ThreadsPerBlock - 1) / ThreadsPerBlock;
    BlocksPerGrid = (BlocksPerGrid > ThreadsPerBlock) ? ThreadsPerBlock : BlocksPerGrid;
    printf("BlocksPerGrid %d, ThreadsPerBlock %d\n", BlocksPerGrid, ThreadsPerBlock);

    int *src = (int *)malloc(N * sizeof(int));
    int result = 0;

    // running on CPU
    for (int i = 0; i < N; i++) src[i] = 1;

    // running on GPU
    int *dev_src;
    int *dev_result;
    cudaMalloc((void**)&dev_src, N * sizeof(int));
    cudaMalloc((void**)&dev_result, BlocksPerGrid * sizeof(int));
    cudaMemcpy(dev_src, src, N * sizeof(int), cudaMemcpyHostToDevice);

    sum<<<BlocksPerGrid, ThreadsPerBlock>>>(dev_src, N, dev_result);
    sum2<<<1, BlocksPerGrid>>>(dev_result);

    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(cudaStatus));
    }

    /* The blocking device-to-host copy implicitly waits for both kernels. */
    cudaMemcpy((void *)&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();

    // check result
    printf("result %d\n", result);

    cudaFree(dev_src);
    cudaFree(dev_result);
    free(src);
    return 0;
}
Compile:
nvcc sum_by_cuda.cu -o sum_cuda
Result: the CPU and GPU execute asynchronously with respect to each other, so the GPU cost cannot simply be measured with the CPU-side timer used above. nvprof can be used to inspect it instead.
$ nvprof ./sum_cuda
BlocksPerGrid 1024, ThreadsPerBlock 1024
==5706== NVPROF is profiling process 5706, command: ./sum_cuda
result 10000001
==5706== Profiling application: ./sum_cuda
==5706== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   90.34%  27.215ms         1  27.215ms  27.215ms  27.215ms  [CUDA memcpy HtoD]
                    9.63%  2.9010ms         1  2.9010ms  2.9010ms  2.9010ms  sum(int*, int, int*)
                    0.02%  6.7840us         1  6.7840us  6.7840us  6.7840us  sum2(int*)
                    0.01%  1.7920us         1  1.7920us  1.7920us  1.7920us  [CUDA memcpy DtoH]
      API calls:   76.90%  114.87ms         2  57.436ms  150.59us  114.72ms  cudaMalloc
                   20.34%  30.378ms         2  15.189ms  3.6222ms  26.755ms  cudaMemcpy
                    2.22%  3.3087ms         2  1.6544ms  243.07us  3.0657ms  cudaFree
                    0.37%  559.65us        97  5.7690us     164ns  246.74us  cuDeviceGetAttribute
                    0.08%  116.75us         1  116.75us  116.75us  116.75us  cuDeviceTotalMem
                    0.05%  78.535us         1  78.535us  78.535us  78.535us  cuDeviceGetName
                    0.03%  43.591us         2  21.795us  9.1640us  34.427us  cudaLaunchKernel
                    0.00%  5.1410us         1  5.1410us  5.1410us  5.1410us  cudaDeviceSynchronize
                    0.00%  4.2410us         1  4.2410us  4.2410us  4.2410us  cuDeviceGetPCIBusId
                    0.00%  2.7160us         3     905ns     176ns  1.5650us  cuDeviceGetCount
                    0.00%  1.3010us         2     650ns     362ns     939ns  cuDeviceGet
                    0.00%     290ns         1     290ns     290ns     290ns  cuDeviceGetUuid
                    0.00%     283ns         1     283ns     283ns     283ns  cudaGetLastError
The two kernels sum() and sum2() are themselves very cheap; the bulk of the cost sits in cudaMalloc() and cudaMemcpy().
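As an alternative to nvprof, the kernel time can also be measured from the host with CUDA events. A minimal sketch of how the two kernel launches above could be bracketed (shown for illustration; not part of the original measurement):
cudaEvent_t evStart, evStop;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);
cudaEventRecord(evStart, 0);                  // record on the default stream before the launches
sum<<<BlocksPerGrid, ThreadsPerBlock>>>(dev_src, N, dev_result);
sum2<<<1, BlocksPerGrid>>>(dev_result);
cudaEventRecord(evStop, 0);                   // record after the launches
cudaEventSynchronize(evStop);                 // wait until the GPU has reached evStop
float ms = 0.0f;
cudaEventElapsedTime(&ms, evStart, evStop);   // elapsed GPU time in milliseconds
printf("kernels cost %.3f ms\n", ms);
cudaEventDestroy(evStart);
cudaEventDestroy(evStop);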
Comparison and summary:
In terms of raw compute time alone, CUDA >> OpenMP > CPU. This is easy to understand: CUDA was built for parallel computation, while OpenMP essentially provides a convenient way to spawn multiple threads and keeps the code compact. However, moving large buffers between the CPU and GPU is genuinely expensive and can cancel out the time saved by CUDA's parallelism: in the profile above, the host-to-device copy alone takes about 27 ms, roughly as long as the entire 29 ms CPU loop. This is worth keeping in mind in practice.
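Since the host-to-device copy dominates, one common mitigation (not measured in this article) is to allocate the host buffer as page-locked memory with cudaMallocHost(), which generally gives higher copy bandwidth than a malloc()'d buffer. A minimal, self-contained sketch of that allocation pattern:
#include <stdio.h>
#include "cuda_runtime.h"
#define N 10000001
int main() {
    int *src, *dev_src;
    cudaMallocHost((void**)&src, N * sizeof(int));   // page-locked (pinned) host buffer
    for (int i = 0; i < N; i++) src[i] = 1;
    cudaMalloc((void**)&dev_src, N * sizeof(int));
    // this copy is typically faster from pinned memory than from pageable memory
    cudaMemcpy(dev_src, src, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaFree(dev_src);
    cudaFreeHost(src);                               // pinned memory must be freed with cudaFreeHost()
    return 0;
}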
Copyright notice: this is an original article by CSDN blogger "BeALinuxGraphicCoder", licensed under the CC 4.0 BY-SA license. Please include the original source link and this notice when reposting.
Original link: https://blog.youkuaiyun.com/weixin_42263483/article/details/89736603