E:优化内部循环的判断函数PixelsIsInPic,将判断展开到内部循环之外,跳过不需要处理的目标像素;
TRotaryClipData类用于寻找旋转需要处理的边界范围;算法思路是首先寻找原图片中心点对应的;
那条扫描线,然后依次向上和向下寻找边界;
如果想要更快速的边界寻找算法,可以考虑利用像素的直线
绘制原理来自动生成边界(有机会的时候再来实现它);
边界寻找算法图示
struct TRotaryClipData{
public:
long src_width;
long src_height;
long dst_width;
long dst_height;
long Ax_16;
long Ay_16;
long Bx_16;
long By_16;
long Cx_16;
long Cy_16;
public:
long out_dst_up_y;
long out_dst_down_y;
long out_src_x0_16;
long out_src_y0_16;
private:
long out_dst_up_x0;
long out_dst_up_x1;
long out_dst_down_x0;
long out_dst_down_x1;
public:
inline long get_up_x0(){ if (out_dst_up_x0<0) return 0; else return out_dst_up_x0; }
inline long get_up_x1(){ if (out_dst_up_x1>=dst_width) return dst_width; else return out_dst_up_x1; }
inline long get_down_x0(){ if (out_dst_down_x0<0) return 0; else return out_dst_down_x0; }
inline long get_down_x1(){ if (out_dst_down_x1>=dst_width) return dst_width; else return out_dst_down_x1; }
inline bool is_in_src(long src_x_16,long src_y_16)
{
return ( ( (src_x_16>=0)&&((src_x_16>>16)<src_width) )
&& ( (src_y_16>=0)&&((src_y_16>>16)<src_height) ) );
}
void find_begin_in(long dst_y,long& out_dst_x,long& src_x_16,long& src_y_16)
{
src_x_16-=Ax_16;
src_y_16-=Ay_16;
while (is_in_src(src_x_16,src_y_16))
{
--out_dst_x;
src_x_16-=Ax_16;
src_y_16-=Ay_16;
}
src_x_16+=Ax_16;
src_y_16+=Ay_16;
}
bool find_begin(long dst_y,long& out_dst_x0,long dst_x1)
{
long test_dst_x0=out_dst_x0-1;
long src_x_16=Ax_16*test_dst_x0 + Bx_16*dst_y + Cx_16;
long src_y_16=Ay_16*test_dst_x0 + By_16*dst_y + Cy_16;
for (long i=test_dst_x0;i<=dst_x1;++i)
{
if (is_in_src(src_x_16,src_y_16))
{
out_dst_x0=i;
if (i==test_dst_x0)
find_begin_in(dst_y,out_dst_x0,src_x_16,src_y_16);
if (out_dst_x0<0)
{
src_x_16-=(Ax_16*out_dst_x0);
src_y_16-=(Ay_16*out_dst_x0);
}
out_src_x0_16=src_x_16;
out_src_y0_16=src_y_16;
return true;
}
else
{
src_x_16+=Ax_16;
src_y_16+=Ay_16;
}
}
return false;
}
void find_end(long dst_y,long dst_x0,long& out_dst_x1)
{
long test_dst_x1=out_dst_x1;
if (test_dst_x1<dst_x0) test_dst_x1=dst_x0;
long src_x_16=Ax_16*test_dst_x1 + Bx_16*dst_y + Cx_16;
long src_y_16=Ay_16*test_dst_x1 + By_16*dst_y + Cy_16;
if (is_in_src(src_x_16,src_y_16))
{
++test_dst_x1;
src_x_16+=Ax_16;
src_y_16+=Ay_16;
while (is_in_src(src_x_16,src_y_16))
{
++test_dst_x1;
src_x_16+=Ax_16;
src_y_16+=Ay_16;
}
out_dst_x1=test_dst_x1;
}
else
{
src_x_16-=Ax_16;
src_y_16-=Ay_16;
while (!is_in_src(src_x_16,src_y_16))
{
--test_dst_x1;
src_x_16-=Ax_16;
src_y_16-=Ay_16;
}
out_dst_x1=test_dst_x1;
}
}
bool inti_clip(double move_x,double move_y)
{
//计算src中心点映射到dst后的坐标
out_dst_down_y=(long)(src_height*0.5+move_y);
out_dst_down_x0=(long)(src_width*0.5+move_x);
out_dst_down_x1=out_dst_down_x0;
//得到初始out_???
if (find_begin(out_dst_down_y,out_dst_down_x0,out_dst_down_x1))
find_end(out_dst_down_y,out_dst_down_x0,out_dst_down_x1);
out_dst_up_y=out_dst_down_y;
out_dst_up_x0=out_dst_down_x0;
out_dst_up_x1=out_dst_down_x1;
return (out_dst_down_x0<out_dst_down_x1);
}
bool next_clip_line_down()
{
++out_dst_down_y;
if (!find_begin(out_dst_down_y,out_dst_down_x0,out_dst_down_x1)) return false;
find_end(out_dst_down_y,out_dst_down_x0,out_dst_down_x1);
return (out_dst_down_x0<out_dst_down_x1);
}
bool next_clip_line_up()
{
--out_dst_up_y;
if (!find_begin(out_dst_up_y,out_dst_up_x0,out_dst_up_x1)) return false;
find_end(out_dst_up_y,out_dst_up_x0,out_dst_up_x1);
return (out_dst_up_x0<out_dst_up_x1);
}
};void PicRotary3_CopyLine(TARGB32* pDstLine,long dstCount,long Ax_16,long Ay_16,
long srcx0_16,long srcy0_16,TARGB32* pSrcLine,long src_byte_width)
{
for (long x=0;x<dstCount;++x)
{
pDstLine[x]=Pixels(pSrcLine,src_byte_width,srcx0_16>>16,srcy0_16>>16);
srcx0_16+=Ax_16;
srcy0_16+=Ay_16;
}
}
void PicRotary3(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double move_y)
{
if ( (fabs(ZoomX*Src.width)<1.0e-4) || (fabs(ZoomY*Src.height)<1.0e-4) ) return; //太小的缩放比例认为已经不可见
double tmprZoomXY=1.0/(ZoomX*ZoomY);
double rZoomX=tmprZoomXY*ZoomY;
double rZoomY=tmprZoomXY*ZoomX;
double sinA,cosA;
SinCos(RotaryAngle,sinA,cosA);
long Ax_16=(long)(rZoomX*cosA*(1<<16));
long Ay_16=(long)(rZoomX*sinA*(1<<16));
long Bx_16=(long)(-rZoomY*sinA*(1<<16));
long By_16=(long)(rZoomY*cosA*(1<<16));
double rx0=Src.width*0.5; //(rx0,ry0)为旋转中心
double ry0=Src.height*0.5;
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16));
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16));
TRotaryClipData rcData;
rcData.Ax_16=Ax_16;
rcData.Bx_16=Bx_16;
rcData.Cx_16=Cx_16;
rcData.Ay_16=Ay_16;
rcData.By_16=By_16;
rcData.Cy_16=Cy_16;
rcData.dst_width=Dst.width;
rcData.dst_height=Dst.height;
rcData.src_width=Src.width;
rcData.src_height=Src.height;
if (!rcData.inti_clip(move_x,move_y)) return;
TARGB32* pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_down_y);
while (true) //to down
{
long y=rcData.out_dst_down_y;
if (y>=Dst.height) break;
if (y>=0)
{
long x0=rcData.get_down_x0();
PicRotary3_CopyLine(&pDstLine[x0],rcData.get_down_x1()-x0,Ax_16,Ay_16,
rcData.out_src_x0_16,rcData.out_src_y0_16,Src.pdata,Src.byte_width);
}
if (!rcData.next_clip_line_down()) break;
((TUInt8*&)pDstLine)+=Dst.byte_width;
}
pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_up_y);
while (rcData.next_clip_line_up()) //to up
{
long y=rcData.out_dst_up_y;
if (y<0) break;
((TUInt8*&)pDstLine)-=Dst.byte_width;
if (y<Dst.height)
{
long x0=rcData.get_up_x0();
PicRotary3_CopyLine(&pDstLine[x0],rcData.get_up_x1()-x0,Ax_16,Ay_16,
rcData.out_src_x0_16,rcData.out_src_y0_16,Src.pdata,Src.byte_width);
}
}
}
////////////////////////////////////////////////////////////////////////////////
//速度测试:
//==============================================================================
// PicRotary3 280.9 fps
////////////////////////////////////////////////////////////////////////////////
F:使用SSE的MOVNTQ指令优化CPU写缓冲
(仅改写了PicRotary3_CopyLine的实现)
void __declspec(naked) __stdcall PicRotarySSE_CopyLine(TARGB32* pDstLine,long dstCount,long Ax_16,long Ay_16,
// [esp+ 4] [esp+ 8] [esp+12] [esp+16]
long srcx0_16,long srcy0_16,TARGB32* pSrcLine,long src_byte_width)
// [esp+20] [esp+24] [esp+28] [esp+32]
{
//利用SSE的MOVNTQ指令优化写缓冲的汇编实现
asm
{
push ebx
push esi
push edi
push ebp
//esp offset 16
mov ebx,dword ptr [esp+ 8+16]
mov esi,dword ptr [esp+32+16]
mov edi,dword ptr [esp+28+16]
mov eax,dword ptr [esp+24+16]
mov ecx,dword ptr [esp+20+16]
dec ebx
xor edx,edx
test ebx,ebx
mov dword ptr [esp+ 8+16],ebx
jle loop_bound
//jmp loop_begin
//align 16
loop_begin:
mov ebx,eax
add eax,dword ptr [esp+16+16]
sar ebx,16
imul ebx,esi
add ebx,edi
mov ebp,ecx
add ecx,dword ptr [esp+12+16]
sar ebp,16
MOVD MM0,dword ptr [ebx+ebp*4]
mov ebx,eax
add eax,dword ptr [esp+16+16]
sar ebx,16
imul ebx,esi
mov ebp,ecx
add ecx,dword ptr [esp+12+16]
sar ebp,16
add ebx,edi
MOVD MM1,dword ptr [ebx+ebp*4]
mov ebp,dword ptr [esp+ 4+16]
PUNPCKlDQ MM0,MM1
mov ebx,dword ptr [esp+ 8+16]
MOVNTQ qword ptr [ebp+edx*4],MM0
add edx,2
cmp edx,ebx
jl loop_begin
EMMS
loop_bound:
cmp edx,ebx
jne loop_bound_end
sar eax,16
imul eax,esi
sar ecx,16
add eax,edi
mov eax,dword ptr [eax+ecx*4]
mov ecx,dword ptr [esp+ 4+16]
mov dword ptr [ecx+edx*4],eax
loop_bound_end:
pop ebp
pop edi
pop esi
pop ebx
ret 32
}
}
void PicRotarySSE(const TPicRegion& Dst,const TPicRegion& Src,double RotaryAngle,double ZoomX,double ZoomY,double move_x,double move_y)
{
if ( (fabs(ZoomX*Src.width)<1.0e-4) || (fabs(ZoomY*Src.height)<1.0e-4) ) return; //太小的缩放比例认为已经不可见
double tmprZoomXY=1.0/(ZoomX*ZoomY);
double rZoomX=tmprZoomXY*ZoomY;
double rZoomY=tmprZoomXY*ZoomX;
double sinA,cosA;
SinCos(RotaryAngle,sinA,cosA);
long Ax_16=(long)(rZoomX*cosA*(1<<16));
long Ay_16=(long)(rZoomX*sinA*(1<<16));
long Bx_16=(long)(-rZoomY*sinA*(1<<16));
long By_16=(long)(rZoomY*cosA*(1<<16));
double rx0=Src.width*0.5; //(rx0,ry0)为旋转中心
double ry0=Src.height*0.5;
long Cx_16=(long)((-(rx0+move_x)*rZoomX*cosA+(ry0+move_y)*rZoomY*sinA+rx0)*(1<<16));
long Cy_16=(long)((-(rx0+move_x)*rZoomX*sinA-(ry0+move_y)*rZoomY*cosA+ry0)*(1<<16));
TRotaryClipData rcData;
rcData.Ax_16=Ax_16;
rcData.Bx_16=Bx_16;
rcData.Cx_16=Cx_16;
rcData.Ay_16=Ay_16;
rcData.By_16=By_16;
rcData.Cy_16=Cy_16;
rcData.dst_width=Dst.width;
rcData.dst_height=Dst.height;
rcData.src_width=Src.width;
rcData.src_height=Src.height;
if (!rcData.inti_clip(move_x,move_y)) return;
TARGB32* pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_down_y);
while (true) //to down
{
long y=rcData.out_dst_down_y;
if (y>=Dst.height) break;
if (y>=0)
{
long x0=rcData.get_down_x0();
PicRotarySSE_CopyLine(&pDstLine[x0],rcData.get_down_x1()-x0,Ax_16,Ay_16,
rcData.out_src_x0_16,rcData.out_src_y0_16,Src.pdata,Src.byte_width);
}
if (!rcData.next_clip_line_down()) break;
((TUInt8*&)pDstLine)+=Dst.byte_width;
}
pDstLine=Dst.pdata;
((TUInt8*&)pDstLine)+=(Dst.byte_width*rcData.out_dst_up_y);
while (rcData.next_clip_line_up()) //to up
{
long y=rcData.out_dst_up_y;
if (y<0) break;
((TUInt8*&)pDstLine)-=Dst.byte_width;
if (y<Dst.height)
{
long x0=rcData.get_up_x0();
PicRotarySSE_CopyLine(&pDstLine[x0],rcData.get_up_x1()-x0,Ax_16,Ay_16,
rcData.out_src_x0_16,rcData.out_src_y0_16,Src.pdata,Src.byte_width);
}
}
asm sfence //刷新写入
}
////////////////////////////////////////////////////////////////////////////////
//速度测试:
//==============================================================================
// PicRotarySEE 306.3 fps
////////////////////////////////////////////////////////////////////////////////