#include "stdio.h"
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>

// Number of float elements in each input vector; also the upper bound of the
// kernel's element loop.
#define N 1024

// Length of the kernel's per-block shared-memory array, and the block size
// from which the host code derives the number of blocks.
#define threadsPerBlock 512
/**
 * gpu_dot - per-block partial dot product of two float vectors.
 *
 * Every thread accumulates d_a[k] * d_b[k] over the indices k < N it reaches
 * with a grid-stride loop. The per-thread sums are then folded in shared
 * memory, and thread 0 of each block writes the block's total to
 * d_c[blockIdx.x]. The kernel does not combine the per-block totals; d_c
 * holds one partial result per block.
 *
 * Launch layout: 1-D grid of 1-D blocks. blockDim.x must not exceed
 * threadsPerBlock, because the shared array has exactly that many slots.
 * Shared memory: threadsPerBlock floats, statically allocated.
 *
 * @param d_a  first input vector (device memory, at least N floats read)
 * @param d_b  second input vector (device memory, at least N floats read)
 * @param d_c  output (device memory, one float written per block)
 */
__global__ void gpu_dot(float* d_a, float* d_b, float* d_c) {
    // One accumulator slot per thread of the block.
    __shared__ float partial_sum[threadsPerBlock];

    // Slot of this thread inside the shared array.
    const int index = threadIdx.x;

    // Grid-stride accumulation: start at the global thread id and advance by
    // the total number of launched threads until the vector is exhausted.
    float sum = 0.0f;
    for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < N;
         tid += blockDim.x * gridDim.x) {
        sum += d_a[tid] * d_b[tid];
    }

    // Publish the per-thread result, then wait until the whole block has
    // written before any thread reads a neighbour's slot.
    partial_sum[index] = sum;
    __syncthreads();

    // Fold the upper part of the live range onto the lower part until a
    // single value remains. Rounding the half up keeps the middle element
    // alive when the live range is odd, so no element is dropped for block
    // sizes that are not a power of two; for power-of-two sizes the pairing
    // is the same as plain halving.
    for (int active = blockDim.x; active > 1;) {
        const int half = (active + 1) / 2;
        if (index + half < active) {
            partial_sum[index] += partial_sum[index + half];
        }
        // Barrier sits outside the branch so every thread of the block
        // reaches it on every round.
        __syncthreads();
        active = half;
    }

    // Thread 0 holds the block total; store it at this block's output slot.
    if (index == 0) {
        d_c[blockIdx.x] = partial_sum[0];
    }
}

int main(void) {
//Declare Host Arrayfloat*h_a,*h_b, h_c,*partial_sum;//Declare device Arrayfloat*d_a,*d_b,*d_partial_sum;//Calculate total number of blocks per gridint block_calc =(N + threadsPerBlock -1)/ threadsPerBlock;int blocksPerGrid =(32< block_calc ?32: block_calc);// allocate memory on the host side
h_a =(float*)malloc(N *sizeof(float));
h_b =(float*)malloc(N *sizeof(float));
partial_sum =(float*)malloc(blocksPerGrid *sizeof(float));// allocate the memory on the devicecudaMalloc(