Q1:我有N张图片,每张图片需要进行K次处理。我希望调用N个线程,每个线程处理一张图片;对每一张图片,我又希望能够调用K个线程,每个线程处理一次。我该怎么做?求思路。
A1:N张图片对应N个block,每个block对应K个线程。
// Sketch of an element-wise picture add: pic1[idx] += pic2[idx].
//
// This is illustrative pseudo-code: the `..` index expressions are elided and
// must be derived from blockIdx / threadIdx for the chosen launch layout, and
// `index(i,j,k)` stands for a flattening helper that is not shown here.
// A __global__ function must return void, and the body must use the two
// parameters (the original referred to an undeclared `pic`).
__global__ void kernelPicAdd ( uchar *pic1, uchar *pic2 )
{
    // get index
    int i = ..;
    int j = ..;
    int k = ..;
    // Accumulate the second picture into the first, one element per thread.
    pic1 [ index(i,j,k) ] += pic2 [ index(i,j,k) ];
}
// Host-side wrapper sketch: launches kernelPicAdd on the two picture buffers.
// The `...` lines are elided setup / teardown code.
// NOTE(review): ptr1 and ptr2 are handed to the kernel unchanged, so they are
// presumably device pointers — confirm against the caller.
__host__ void MergePics ( uchar *ptr1, uchar *ptr2 )
{
...
// The launch configuration goes inside <<< >>>: grid dimensions first, then
// block dimensions. NOTE(review): gridDim / blockDim are not defined in the
// visible code; they are assumed to be host variables set up in the elided part.
kernelPicAdd <<< gridDim, blockDim >>> (ptr1,ptr2);
...
}
// Launch samplecount*initialcount blocks of K threads each: the kernel uses
// blockIdx.x as the image index and threadIdx.x as the per-image work index.
CalErrorKernel3<<<samplecount*initialcount,K>>>(d_img,d_tempdetaS,d_loc,d_detaSb,d_dis);//K<=512
// Per-image error kernel.
//
// Launch layout: one block per image (blockIdx.x -> imgidx) and one thread per
// table row (threadIdx.x -> weakridx). blockDim.x must equal K, because the
// final reduction reads all K rows of the shared buffer.
// Shared memory: marknum*K floats, statically allocated.
// NOTE(review): with K up to 512, marknum*K*sizeof(float) may exceed the
// static shared-memory limit — confirm marknum against the target device.
//
// Parameters:
//   img       - per-image values, marknum entries per image (indexed imgidx*marknum + offset)
//   tempdetaS - per-image target vector, marknum entries per image
//   loc       - per-thread offset pairs, F*2 entries per thread
//   detaSb    - lookup table indexed [weakridx][fervalue][i] with strides 32*marknum and marknum
//   dis       - output, one value per image
//
// Each thread builds an F-bit code from pairwise comparisons of img values,
// copies the selected marknum-long table row into shared memory, and after a
// barrier thread 0 sums the K rows and writes the scaled Euclidean distance
// between tempdetaS and that sum.
__global__ void CalErrorKernel3(int *img,float*tempdetaS,int*loc,float*detaSb,float*dis)
{
    // Every element is written by its owning thread below before the barrier,
    // so no separate zero-initialisation pass is needed.
    __shared__ float diffS[marknum*K];

    const int imgidx = blockIdx.x;
    const int weakridx = threadIdx.x;
    printf("cal%d\n",imgidx);   // debug trace (device printf is slow; debugging only)

    // Build the F-bit code: one bit per (loc[2p], loc[2p+1]) comparison, MSB first.
    int fervalue = 0;
    for (int i = 0; i < F*2; i += 2)
    {
        const int a = img[imgidx*marknum + loc[weakridx*F*2 + i]];
        const int b = img[imgidx*marknum + loc[weakridx*F*2 + i + 1]];
        fervalue = fervalue * 2 + (a < b ? 1 : 0);
    }

    // Copy this thread's selected table row into its slice of shared memory.
    // NOTE(review): the stride 32 presumably equals 2^F (F == 5) — confirm.
    for (int i = 0; i < marknum; i++)
    {
        diffS[weakridx*marknum + i] = detaSb[weakridx*32*marknum + fervalue*marknum + i];
    }
    // All rows must be visible before any thread reads across them.
    __syncthreads();
    printf("cal diffS end%d\n",imgidx);   // debug trace

    // The result is identical for every thread in the block, so a single
    // thread performs the reduction and the store. The branch contains no
    // barrier, so the divergence is safe.
    if (weakridx == 0)
    {
        double temp = 0;
        for (int j = 0; j < marknum; j++)
        {
            // Sum column j over all K rows.
            float summed = 0;
            for (int jj = 0; jj < K; jj++)
            {
                summed += diffS[jj*marknum + j];
            }
            // Fix: compare against the summed value; the original computed
            // the sum but then used only row 0 (diffS[j]).
            const double d = tempdetaS[imgidx*marknum + j] - summed;
            temp += d * d;
        }
        dis[imgidx] = sqrt(temp)/(samplecount*initialcount);
    }
    printf("calend%d\n",imgidx);   // debug trace
}
804

被折叠的 条评论
为什么被折叠?



