书名:基于 CUDA 的 GPU 并行程序开发指南
imrotateMC 中的 Rotate7 是怎么做优化的?
书里面没有详细说明,下面对照 Rotate6 和 Rotate7 的代码来分析。
/*
 * Rotate6 -- pthread worker: rotates this thread's band of rows of TheImage
 * by RotAngle about the image centre and stores the pixels in CopyImage.
 *
 * tid: pointer to an int holding this thread's index. The band starts at
 *      row index * (ip.Vpixels / NumThreads).
 *      NOTE(review): indices are assumed to run 0..NumThreads-1 -- confirm
 *      against the code that creates the threads.
 *
 * Does not return normally; the thread ends through pthread_exit(NULL).
 *
 * Every rotated coordinate is shrunk by ScaleFactor (shorter side divided by
 * the diagonal) so that the result fits in the image box. Destination pixels
 * that still fall outside the image are skipped.
 */
void *Rotate6(void* tid)
{
    long tn;                        // thread index, later the first row of the band
    long rowsPerThread, endRow;
    int row, col, c, h, v, hp3;
    int NewRow, NewCol;
    double X, Y, newX, newY, ScaleFactor;
    double Diagonal, H, V;
    double CRAS, SRAS, SRAYS, CRAYS;

    tn = *((int *) tid);
    rowsPerThread = ip.Vpixels / NumThreads;        // integer division
    // The thread with the highest index also takes the rows left over when
    // Vpixels is not a multiple of NumThreads; otherwise those rows would
    // never be processed by any thread.
    endRow = (tn == NumThreads - 1) ? ip.Vpixels : (tn + 1) * rowsPerThread;
    tn *= rowsPerThread;

    H = (double) ip.Hpixels;
    V = (double) ip.Vpixels;
    Diagonal = sqrt(H * H + V * V);
    ScaleFactor = (ip.Hpixels > ip.Vpixels) ? V / Diagonal : H / Diagonal;

    // Rotation coefficients with the scale factor already folded in.
    CRAS = ScaleFactor * cos(RotAngle);
    SRAS = ScaleFactor * sin(RotAngle);

    h = ip.Hpixels / 2;             // integer division
    v = ip.Vpixels / 2;
    hp3 = ip.Hpixels * 3;           // each pixel occupies 3 consecutive elements of a row

    for (row = (int) tn; row < endRow; row++) {
        // Y and the two products that depend on it are constant for the row.
        Y = (double) v - (double) row;
        SRAYS = SRAS * Y;
        CRAYS = CRAS * Y;

        // col indexes row elements (step 3), c counts pixels (step 1).
        for (col = 0, c = 0; col < hp3; col += 3, c++) {
            // image column -> Cartesian x relative to the centre
            X = (double) c - (double) h;

            // scaled rotation matrix
            newX = CRAS * X - SRAYS;
            newY = SRAS * X + CRAYS;

            // Cartesian -> image coordinates
            NewCol = (int) newX + h;
            NewRow = v - (int) newY;

            if ((NewCol >= 0) && (NewRow >= 0) && (NewCol < ip.Hpixels) && (NewRow < ip.Vpixels)) {
                NewCol *= 3;
                CopyImage[NewRow][NewCol]     = TheImage[row][col];
                CopyImage[NewRow][NewCol + 1] = TheImage[row][col + 1];
                CopyImage[NewRow][NewCol + 2] = TheImage[row][col + 2];
            }
        }
    }
    pthread_exit(NULL);
}
/*
 * Rotate7 -- pthread worker: rotates this thread's band of rows of TheImage
 * by RotAngle about the image centre and stores the pixels in CopyImage.
 *
 * tid: pointer to an int holding this thread's index. The band starts at
 *      row index * (ip.Vpixels / NumThreads).
 *      NOTE(review): indices are assumed to run 0..NumThreads-1 -- confirm
 *      against the code that creates the threads.
 *
 * Does not return normally; the thread ends through pthread_exit(NULL).
 *
 * The inner loop contains no multiplication. With pixel counter c and
 * X = c - h, the scaled rotation
 *     newX = CRAS*X - SRAYS = CRAS*c - (CRAS*h + SRAYS) = cc - k1
 *     newY = SRAS*X + CRAYS = SRAS*c - (SRAS*h - CRAYS) = ss - k2
 * is evaluated with running sums cc (= CRAS*c) and ss (= SRAS*c) that grow
 * by one addition per pixel, and per-row constants k1 and k2.
 */
void *Rotate7(void* tid)
{
    long tn;                        // thread index, later the first row of the band
    long rowsPerThread, endRow;
    int row, col, h, v, hp3;
    int NewRow, NewCol;
    double cc, ss, k1, k2;
    double Y, newX, newY, ScaleFactor;
    double Diagonal, H, V;
    double CRAS, SRAS, SRAYS, CRAYS;

    tn = *((int *) tid);
    rowsPerThread = ip.Vpixels / NumThreads;        // integer division
    // The thread with the highest index also takes the rows left over when
    // Vpixels is not a multiple of NumThreads; otherwise those rows would
    // never be processed by any thread.
    endRow = (tn == NumThreads - 1) ? ip.Vpixels : (tn + 1) * rowsPerThread;
    tn *= rowsPerThread;

    H = (double) ip.Hpixels;
    V = (double) ip.Vpixels;
    Diagonal = sqrt(H * H + V * V);
    ScaleFactor = (ip.Hpixels > ip.Vpixels) ? V / Diagonal : H / Diagonal;

    // Rotation coefficients with the scale factor already folded in.
    CRAS = ScaleFactor * cos(RotAngle);
    SRAS = ScaleFactor * sin(RotAngle);

    h = ip.Hpixels / 2;             // integer division
    v = ip.Vpixels / 2;
    hp3 = ip.Hpixels * 3;           // each pixel occupies 3 consecutive elements of a row

    for (row = (int) tn; row < endRow; row++) {
        // Everything that depends only on the row is computed once here.
        Y = (double) v - (double) row;
        SRAYS = SRAS * Y;
        CRAYS = CRAS * Y;
        k1 = CRAS * (double) h + SRAYS;
        k2 = SRAS * (double) h - CRAYS;

        // Running sums restart at zero for every row.
        cc = 0.00;
        ss = 0.00;

        for (col = 0; col < hp3; col += 3) {
            newX = cc - k1;
            newY = ss - k2;

            // Cartesian -> image coordinates
            NewCol = (int) newX + h;
            NewRow = v - (int) newY;

            if ((NewCol >= 0) && (NewRow >= 0) && (NewCol < ip.Hpixels) && (NewRow < ip.Vpixels)) {
                NewCol *= 3;
                CopyImage[NewRow][NewCol]     = TheImage[row][col];
                CopyImage[NewRow][NewCol + 1] = TheImage[row][col + 1];
                CopyImage[NewRow][NewCol + 2] = TheImage[row][col + 2];
            }

            // Advance to the next pixel: additions replace CRAS*c and SRAS*c.
            cc += CRAS;
            ss += SRAS;
        }
    }
    pthread_exit(NULL);
}
可见,与 Rotate6 相比,Rotate7 只是多了一行变量定义:
double cc, ss, k1, k2;
下面看看这几个变量是怎么用的。
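可见,它就是把 X 这个变量化简掉了。X 由整数列计数 c 得到(X = c - h),在 Rotate6 中,每个像素要得到 newX、newY,都需要各做一次与 X 的浮点乘法:
newX=CRAS*X-SRAYS;
newY=SRAS*X+CRAYS;
现在直接用 cc、ss,以累加的形式(每处理一个像素执行 cc += CRAS、ss += SRAS)替代了 CRAS、SRAS 与计数变量的乘积;k1、k2 也就是存储一下每一行内不变的偏置而已:k1 = CRAS*h + SRAYS,k2 = SRAS*h - CRAYS。
内层循环里的乘法变成了加法之后,速度就上来了。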