归约方法
#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
#include<device_functions.h>
using namespace std;
// Block-level tree reduction (interleaved addressing): sums the blockDim.x
// input elements of `a` handled by this block into b[0].
// Expects a single 1-D block launch; `a` and `b` must be device pointers
// with at least blockDim.x (<= 101) readable ints in `a`.
__global__ void jia(int a[101], int b[101]) {
	__shared__ int sdata[101];
	int idx = threadIdx.x;
	int x = blockIdx.x * blockDim.x + threadIdx.x;
	// Stage each thread's element into shared memory before reducing.
	sdata[idx] = a[x];
	__syncthreads();
	for (int s = 1; s < blockDim.x; s *= 2) {
		// Guard idx + s: blockDim.x = 101 is not a power of two, so the
		// original unguarded read of sdata[idx + s] went out of bounds
		// (e.g. idx = 100, s = 1 read sdata[101]).
		if (idx % (s * 2) == 0 && idx + s < blockDim.x) {
			sdata[idx] += sdata[idx + s];
		}
		// Barrier stays outside the divergent branch so every thread
		// in the block reaches it.
		__syncthreads();
	}
	if (idx == 0) b[0] = sdata[0];
}
// Aborts main with a readable message if a CUDA API call fails.
#define CUDA_CHECK(call)                                               \
	do {                                                               \
		cudaError_t err_ = (call);                                     \
		if (err_ != cudaSuccess) {                                     \
			cout << "CUDA error: " << cudaGetErrorString(err_) << endl;\
			return 1;                                                  \
		}                                                              \
	} while (0)

// Host driver: fills ha[i] = i for i in [0, 100], reduces on the GPU with a
// single 101-thread block, and prints the sum (expected 5050).
int main() {
	const int N = 101;         // element count and block size
	int ha[N];
	int hb[N];
	int *da = nullptr, *db = nullptr;
	for (int i = 0; i < N; i++)
		ha[i] = i;
	CUDA_CHECK(cudaMalloc((void**)&da, sizeof(int) * N));
	CUDA_CHECK(cudaMalloc((void**)&db, sizeof(int) * N));
	CUDA_CHECK(cudaMemcpy(da, ha, sizeof(int) * N, cudaMemcpyHostToDevice));
	dim3 Block(N);
	jia << <1, Block >> > (da, db);
	// Kernel launches do not return errors directly; check explicitly.
	CUDA_CHECK(cudaGetLastError());
	// Blocking copy of the single result word; also synchronizes the device.
	CUDA_CHECK(cudaMemcpy(hb, db, sizeof(int), cudaMemcpyDeviceToHost));
	cout << hb[0] << endl;
	cudaFree(da);
	cudaFree(db);
	return 0;
}
优化一
在上一方法中，线程是间隔使用的——每轮归约只有一半线程在做有效工作，其余线程空闲，这会造成很大的资源浪费
#include<cuda_runtime_api.h>
#include<cuda_runtime.h>
#include<device_launch_parameters.h>
#include<iostream>
#include<device_functions.h>
using namespace std;
__global__ void jia(int a[101], int b[101]) {
__shared__ int sdata[101];
int idx = threadIdx.x;
int x = blockIdx.x * blockDim.x + threadIdx.x;
sdata[idx] = a[x];
__syncthreads();
f