/* CUDA 内积简单例程 */
// Number of floats in the per-block shared-memory buffer used by dot();
// main() launches blocks of 256 threads to match this value.
static const int THREAD_DIM = 256;
/*
 * Per-block partial dot product of d_a and d_b.
 *
 * Every thread starts at its global index and advances by the total number
 * of launched threads (blockDim.x * gridDim.x), accumulating d_a[i] * d_b[i]
 * for the elements it visits. The threads of a block then add their partial
 * results together in shared memory, and thread 0 writes the block's sum to
 * d_c[blockIdx.x]. The caller is expected to add up the gridDim.x entries of
 * d_c to obtain the final dot product.
 *
 * Parameters:
 *   d_a, d_b  input vectors, n elements each; dereferenced in device code
 *   d_c       output, one partial sum per block (needs gridDim.x elements)
 *   n         number of elements in d_a and d_b
 *
 * Launch requirements: only the x dimension of grid and block is used, and
 * blockDim.x must not exceed THREAD_DIM because the shared buffer holds
 * exactly THREAD_DIM floats. blockDim.x does not have to be a power of two.
 */
void __global__ dot(const float* d_a, const float* d_b, float* d_c , const int n) {
    const int localIdx  = threadIdx.x;
    const int blockSize = blockDim.x;
    __shared__ float cache[THREAD_DIM];

    // 64-bit index: adding the stride to an int index could overflow when n
    // is close to INT_MAX.
    const long long stride = (long long)blockDim.x * gridDim.x;
    float temp = 0.0f;
    for (long long idx = localIdx + (long long)blockIdx.x * blockDim.x;
         idx < n; idx += stride) {
        temp += d_a[idx] * d_b[idx];
    }

    // Publish each thread's partial sum to the block's shared memory.
    cache[localIdx] = temp;
    __syncthreads();

    // Tree reduction of the block's partial sums. Each pass folds the upper
    // part of the live range [0, active) onto the lower part; using the
    // rounded-up half keeps the middle element when 'active' is odd, so no
    // value is lost for block sizes that are not a power of two.
    for (int active = blockSize; active > 1; ) {
        const int half = (active + 1) / 2;
        if (localIdx < active - half) {
            cache[localIdx] += cache[localIdx + half];
        }
        // Outside the branch: every thread of the block must reach the barrier.
        __syncthreads();
        active = half;
    }

    // One thread per block stores the block's sum.
    if (localIdx == 0) {
        d_c[blockIdx.x] = cache[0];
    }
}
/*
 * Host driver for dot(): fills two vectors with known values, launches the
 * kernel, copies the per-block partial sums back and adds them up on the CPU.
 *
 * Exit status: 0 if the result matches the expected value, 1 if a CUDA call
 * failed, 2 if the computed dot product is wrong.
 */
int main(){
    // Problem size; any positive value works because the kernel strides over
    // the whole input regardless of the grid size.
    const int n = 1 << 20;

    // One thread per element, rounded up to whole blocks.
    const int blocks = (n + THREAD_DIM - 1) / THREAD_DIM;
    dim3 blockPergrid(blocks, 1);
    dim3 threadPerblock(THREAD_DIM, 1);

    // Host data: a[i] = 1 and b[i] = 2, so the expected dot product is 2 * n.
    float* h_a = new float[n];
    float* h_b = new float[n];
    float* h_c = new float[blocks];
    for (int i = 0; i < n; ++i) {
        h_a[i] = 1.0f;
        h_b[i] = 2.0f;
    }

    float* d_a = 0;
    float* d_b = 0;
    float* d_c = 0;

    // Each step runs only if all previous steps succeeded, so the first
    // failure is the one reported and cleanup below always runs.
    cudaError_t status = cudaMalloc(&d_a, n * sizeof(float));
    if (status == cudaSuccess) status = cudaMalloc(&d_b, n * sizeof(float));
    if (status == cudaSuccess) status = cudaMalloc(&d_c, blocks * sizeof(float));
    if (status == cudaSuccess)
        status = cudaMemcpy(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMemcpy(d_b, h_b, n * sizeof(float), cudaMemcpyHostToDevice);
    if (status == cudaSuccess) {
        dot<<<blockPergrid, threadPerblock>>>(d_a, d_b, d_c, n);
        status = cudaGetLastError();   // launch-configuration errors
    }
    // Blocking copy: waits for the kernel and reports execution errors.
    if (status == cudaSuccess)
        status = cudaMemcpy(h_c, d_c, blocks * sizeof(float), cudaMemcpyDeviceToHost);

    // Take the per-block sums and add them up on the CPU.
    double sum = 0.0;
    if (status == cudaSuccess) {
        for (int b = 0; b < blocks; ++b) {
            sum += h_c[b];
        }
    }

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;

    if (status != cudaSuccess) {
        return 1;
    }

    // Compare against the CPU-known answer with a relative tolerance.
    const double expected = 2.0 * n;
    double diff = sum - expected;
    if (diff < 0.0) {
        diff = -diff;
    }
    return (diff <= 1e-6 * expected) ? 0 : 2;
}