卷积在图像处理中使用很频繁,由于数据量大,计算多,未经优化的卷积算法很慢。利用neon的并行计算,可以对其进行优化。
未经优化的C语言实现:
/*
 * convolve2DSlow: reference (unoptimized) 2-D convolution.
 *
 * Convolves the dataSizeX x dataSizeY image `in` with the (flipped)
 * kernelSizeX x kernelSizeY float kernel and writes the rounded magnitude
 * of each sum to `out`.  Input samples outside the image are ignored
 * (treated as zero), so border pixels use a partial kernel sum.
 *
 * Returns false on NULL pointers or non-positive sizes, true on success.
 */
bool convolve2DSlow(unsigned char* in, unsigned char* out, int dataSizeX, int dataSizeY,
                    float* kernel, int kernelSizeX, int kernelSizeY)
{
    int i, j, m, n, mm, nn;
    int kCenterX, kCenterY;          /* center index of kernel */
    float sum;                       /* temp accumulation buffer */
    int rowIndex, colIndex;

    /* check validity of params (FIX: the Y sizes were previously unchecked) */
    if (!in || !out || !kernel) return false;
    if (dataSizeX <= 0 || dataSizeY <= 0 || kernelSizeX <= 0 || kernelSizeY <= 0)
        return false;

    /* find center position of kernel (half of kernel size) */
    kCenterX = kernelSizeX / 2;
    kCenterY = kernelSizeY / 2;

    for (i = 0; i < dataSizeY; ++i)                  /* rows */
    {
        for (j = 0; j < dataSizeX; ++j)              /* columns */
        {
            sum = 0;                                 /* init to 0 before sum */
            for (m = 0; m < kernelSizeY; ++m)        /* kernel rows */
            {
                mm = kernelSizeY - 1 - m;            /* row index of flipped kernel */
                for (n = 0; n < kernelSizeX; ++n)    /* kernel columns */
                {
                    nn = kernelSizeX - 1 - n;        /* column index of flipped kernel */
                    /* index of input signal, used for checking boundary */
                    rowIndex = i + m - kCenterY;
                    colIndex = j + n - kCenterX;
                    /* ignore input samples which are out of bound */
                    if (rowIndex >= 0 && rowIndex < dataSizeY &&
                        colIndex >= 0 && colIndex < dataSizeX)
                        sum += in[dataSizeX * rowIndex + colIndex] * kernel[kernelSizeX * mm + nn];
                }
            }
            /* store the rounded magnitude: (uchar)(|sum| + 0.5) */
            out[dataSizeX * i + j] = (unsigned char)((float)fabs(sum) + 0.5f);
        }
    }
    return true;
}
neon优化
思路:neon可以一次处理4个32位数据。考虑到卷积核为float类型,灰度像素数据为uint8_t,neon可以一次读取8个像素数据,将像素数据转化为float后进行计算。因为neon只能按内存顺序读取固定数量(8个)数据,为了避免剩余数据和卷积计算中的边界问题,可以将像素矩阵用0填充扩展:每行的头尾分别加上kernelSizeX/2个0数据,并确保每行数据量为8的倍数,不足8的倍数则补0(以增加内存开销来减小代码复杂度和时间复杂度,因为优化目标主要在于提升时间上的效率)
neon优化后的实现:
/*convolve2D_neon:neon优化的二维卷积*/
/*
 * convolve2D_neon: NEON-optimized 2-D convolution.
 *
 * The source image is copied into a zero-padded buffer (kCenterX zeros on
 * each side of every row, kCenterY zero rows top and bottom, and each row
 * padded up to a multiple of 8) so the inner loop can always load 8 pixels
 * at once without boundary tests.  Trades extra memory for speed.
 *
 * dst must hold datasizeX * datasizeY bytes.
 * Returns 0 on success, -1 on bad arguments or allocation failure.
 */
int convolve2D_neon(unsigned char *src, unsigned char *dst, int datasizeX, int datasizeY,
                    float *kernel, int kernelSizeX, int kernelSizeY)
{
    if (!dst || !src || !kernel) return -1;
    if (datasizeX <= 0 || datasizeY <= 0 || kernelSizeX <= 0 || kernelSizeY <= 0) return -1;

    int x, y, kx, ky, mm, nn, yIndex, xIndex;
    float32x4_t k_neon;
    uint8x8_t neon_x_u8;
    uint16x8_t neon_x_u16_8;
    uint16x4_t neon_x_u16_4, neon_x_u16_4_2;
    uint32x4_t neon_x_u32;
    float32x4_t neon_x_f32;
    float32x4_t neon_sum_f32_high, neon_sum_f32_low;
    const float32x4_t half = vdupq_n_f32(0.5f);  /* rounding bias, matches scalar +0.5f */

    int kCenterX = kernelSizeX >> 1;
    int kCenterY = kernelSizeY >> 1;

    int pad = datasizeX % 8;                     /* pad each row up to a multiple of 8 */
    if (pad) pad = 8 - pad;
    int stride = datasizeX + 2 * kCenterX + pad; /* row stride of the padded buffer */
    size_t inSize = (size_t)stride * (datasizeY + 2 * kCenterY);

    /* padded/extended copy of the input, avoids border and remainder handling */
    unsigned char *in = (unsigned char *)malloc(inSize);
    if (!in) return -1;                          /* FIX: allocation result was unchecked */
    /* FIX: original cleared only sizeof(pointer) bytes; the padding must be zero */
    memset(in, 0, inSize);
    for (y = 0; y < datasizeY; y++)
        memcpy(in + (y + kCenterY) * stride + kCenterX, src + y * datasizeX, (size_t)datasizeX);

    for (y = 0; y < datasizeY; y++)
    {
        for (x = 0; x < datasizeX; x += 8)
        {
            neon_sum_f32_high = vdupq_n_f32(0);
            neon_sum_f32_low  = vdupq_n_f32(0);
            for (ky = 0; ky < kernelSizeY; ky++)          /* kernel rows */
            {
                mm = kernelSizeY - 1 - ky;                /* row index of flipped kernel */
                for (kx = 0; kx < kernelSizeX; ++kx)      /* kernel columns */
                {
                    nn = kernelSizeX - 1 - kx;            /* column index of flipped kernel */
                    yIndex = y + ky;
                    xIndex = x + kx;
                    neon_x_u8 = vld1_u8(in + yIndex * stride + xIndex);     /* load 8 pixels */
                    neon_x_u16_8 = vmovl_u8(neon_x_u8);
                    k_neon = vld1q_dup_f32(kernel + kernelSizeX * mm + nn); /* broadcast tap */
                    /* low 4 lanes: widen u16 -> u32, convert to f32, multiply-accumulate */
                    neon_x_u16_4 = vget_low_u16(neon_x_u16_8);
                    neon_x_u32 = vmovl_u16(neon_x_u16_4);
                    neon_x_f32 = vcvtq_f32_u32(neon_x_u32);
                    neon_sum_f32_low = vmlaq_f32(neon_sum_f32_low, neon_x_f32, k_neon);
                    /* high 4 lanes, same treatment */
                    neon_x_u16_4 = vget_high_u16(neon_x_u16_8);
                    neon_x_u32 = vmovl_u16(neon_x_u16_4);
                    neon_x_f32 = vcvtq_f32_u32(neon_x_u32);
                    neon_sum_f32_high = vmlaq_f32(neon_sum_f32_high, neon_x_f32, k_neon);
                }
            }
            /* |sum| + 0.5 then saturating narrow: 2 x float32x4 -> uint8x8.
             * FIX: matches the scalar (unsigned char)(fabs(sum) + 0.5f); without
             * the abs, negative sums were silently clamped to 0 unrounded. */
            neon_x_u32 = vcvtq_u32_f32(vaddq_f32(vabsq_f32(neon_sum_f32_low), half));
            neon_x_u16_4 = vqmovn_u32(neon_x_u32);
            neon_x_u32 = vcvtq_u32_f32(vaddq_f32(vabsq_f32(neon_sum_f32_high), half));
            neon_x_u16_4_2 = vqmovn_u32(neon_x_u32);
            neon_x_u16_8 = vcombine_u16(neon_x_u16_4, neon_x_u16_4_2);
            neon_x_u8 = vqmovn_u16(neon_x_u16_8);
            if (x + 8 <= datasizeX) {
                vst1_u8(dst + y * datasizeX + x, neon_x_u8);   /* full 8-byte store */
            } else {
                /* FIX: partial tail — a full 8-byte store would run past dst
                 * (and past the buffer itself on the last row) */
                unsigned char tail[8];
                vst1_u8(tail, neon_x_u8);
                memcpy(dst + y * datasizeX + x, tail, (size_t)(datasizeX - x));
            }
        }
    }
    free(in);
    return 0;
}
可分离的卷积计算:
如果二维的卷积核矩阵可以分解为两个一维矩阵相乘,那么可以对二维数据先进行x方向(行)上的一维卷积计算,然后再进行y方向(列)上的一维卷积计算
C/C++语言代码实现:
/*
 * convolve2DSeparable: 2-D convolution with a separable kernel.
 *
 * Runs a horizontal 1-D convolution with kernelX into a float scratch
 * buffer, then a vertical 1-D convolution with kernelY into `out`,
 * writing the rounded magnitude of each sum.  Border samples use a
 * partial kernel (out-of-bounds input treated as absent).
 *
 * Returns false on NULL pointers, non-positive sizes, or allocation
 * failure; true on success.
 */
bool convolve2DSeparable(unsigned char* in, unsigned char* out, int dataSizeX, int dataSizeY,
                         float* kernelX, int kSizeX, float* kernelY, int kSizeY)
{
    int i, j, k, m, n;
    float *tmp, *sum;                /* intermediate data buffers */
    unsigned char *inPtr, *outPtr;   /* working pointers */
    float *tmpPtr, *tmpPtr2;         /* working pointers */
    int kCenter, kOffset, endIndex;  /* kernel indices */

    /* check validity of params (FIX: the Y sizes were previously unchecked) */
    if (!in || !out || !kernelX || !kernelY) return false;
    if (dataSizeX <= 0 || dataSizeY <= 0 || kSizeX <= 0 || kSizeY <= 0) return false;

    /* allocate temp storage to keep intermediate results */
    tmp = (float*)malloc(sizeof(float) * (size_t)dataSizeX * (size_t)dataSizeY);
    if (!tmp) return false;
    /* per-column accumulators for the vertical pass */
    sum = (float*)malloc(sizeof(float) * (size_t)dataSizeX);
    if (!sum) { free(tmp); return false; }   /* FIX: original leaked tmp here */

    /* convolve horizontal direction */
    kCenter = kSizeX >> 1;                   /* center index of kernel array */
    endIndex = dataSizeX - kCenter;          /* index where full-kernel convolution stops */
    inPtr = in;
    tmpPtr = tmp;                            /* intermediate results of 1-D horizontal pass */

    for (i = 0; i < dataSizeY; ++i)          /* number of rows */
    {
        kOffset = 0;                         /* starting index of partial kernel varies per sample */
        /* COLUMN FROM index=0 TO index=kCenter-1 (left border, partial kernel) */
        for (j = 0; j < kCenter; ++j)
        {
            *tmpPtr = 0;                     /* init to 0 before accumulation */
            for (k = kCenter + kOffset, m = 0; k >= 0; --k, ++m)
            {
                *tmpPtr += *(inPtr + m) * kernelX[k];
            }
            ++tmpPtr;                        /* next output */
            ++kOffset;                       /* grow the partial kernel */
        }
        /* COLUMN FROM index=kCenter TO index=(dataSizeX-kCenter-1) (full kernel) */
        for (j = kCenter; j < endIndex; ++j)
        {
            *tmpPtr = 0;
            for (k = kSizeX - 1, m = 0; k >= 0; --k, ++m)
            {
                *tmpPtr += *(inPtr + m) * kernelX[k];
            }
            ++inPtr;                         /* next input */
            ++tmpPtr;                        /* next output */
        }
        kOffset = 1;                         /* ending index of partial kernel varies per sample */
        /* COLUMN FROM index=(dataSizeX-kCenter) TO index=(dataSizeX-1) (right border) */
        for (j = endIndex; j < dataSizeX; ++j)
        {
            *tmpPtr = 0;
            for (k = kSizeX - 1, m = 0; k >= kOffset; --k, ++m)
            {
                *tmpPtr += *(inPtr + m) * kernelX[k];
            }
            ++inPtr;                         /* next input */
            ++tmpPtr;                        /* next output */
            ++kOffset;                       /* shrink the partial kernel */
        }
        inPtr += kCenter;                    /* skip to next row */
    }
    /* END OF HORIZONTAL CONVOLUTION */

    /* convolve vertical direction */
    kCenter = kSizeY >> 1;                   /* center index of vertical kernel */
    endIndex = dataSizeY - kCenter;          /* index where full-kernel convolution stops */
    tmpPtr = tmpPtr2 = tmp;
    outPtr = out;

    /* clear accumulators before summing */
    for (i = 0; i < dataSizeX; ++i)
        sum[i] = 0;

    /* ROW FROM index=0 TO index=(kCenter-1) (top border, partial kernel) */
    kOffset = 0;
    for (i = 0; i < kCenter; ++i)
    {
        for (k = kCenter + kOffset; k >= 0; --k)
        {
            for (j = 0; j < dataSizeX; ++j)
            {
                sum[j] += *tmpPtr * kernelY[k];
                ++tmpPtr;
            }
        }
        for (n = 0; n < dataSizeX; ++n)      /* convert and copy from sum to out */
        {
            /* convert negative to positive, round, narrow to byte */
            *outPtr = (unsigned char)((float)fabs(sum[n]) + 0.5f);
            sum[n] = 0;                      /* reset for next summing */
            ++outPtr;
        }
        tmpPtr = tmpPtr2;                    /* reset input pointer */
        ++kOffset;                           /* grow the partial kernel */
    }
    /* ROW FROM index=kCenter TO index=(dataSizeY-kCenter-1) (full kernel) */
    for (i = kCenter; i < endIndex; ++i)
    {
        for (k = kSizeY - 1; k >= 0; --k)
        {
            for (j = 0; j < dataSizeX; ++j)
            {
                sum[j] += *tmpPtr * kernelY[k];
                ++tmpPtr;
            }
        }
        for (n = 0; n < dataSizeX; ++n)
        {
            *outPtr = (unsigned char)((float)fabs(sum[n]) + 0.5f);
            sum[n] = 0;
            ++outPtr;
        }
        /* move to next row */
        tmpPtr2 += dataSizeX;
        tmpPtr = tmpPtr2;
    }
    /* ROW FROM index=(dataSizeY-kCenter) TO index=(dataSizeY-1) (bottom border) */
    kOffset = 1;
    for (i = endIndex; i < dataSizeY; ++i)
    {
        for (k = kSizeY - 1; k >= kOffset; --k)
        {
            for (j = 0; j < dataSizeX; ++j)
            {
                sum[j] += *tmpPtr * kernelY[k];
                ++tmpPtr;
            }
        }
        for (n = 0; n < dataSizeX; ++n)
        {
            *outPtr = (unsigned char)((float)fabs(sum[n]) + 0.5f);
            sum[n] = 0;
            ++outPtr;
        }
        /* move to next row */
        tmpPtr2 += dataSizeX;
        tmpPtr = tmpPtr2;
        ++kOffset;                           /* shrink the partial kernel */
    }
    /* END OF VERTICAL CONVOLUTION */

    free(tmp);
    free(sum);
    return true;
}
可分离二维卷积的neon优化思路:先实现一维的卷积优化,思路相同,一样是填0扩展,避免剩余数据和边界问题,然后并行计算4个像素的卷积计算,由于neon只能一次读取顺序的8个uint8_t数据,只能进行x方向的卷积,所以需要进行矩阵转置(矩阵转置也可以用neon优化,网上说可以提速10倍)
neon优化实现:
/*convolve1D:一维卷积*/
/*
 * convolve1D: NEON-optimized 1-D convolution of a byte sequence.
 *
 * The input is copied into a zero-padded buffer (kCenter zeros in front,
 * zero padding at the back up to a multiple of 8, plus 8 bytes of slack)
 * so that 8-pixel loads never need boundary checks.  Produces 8 output
 * samples per iteration.
 *
 * Returns 0 on success, -1 on bad arguments or allocation failure.
 */
int convolve1D(unsigned char* src, unsigned char* dst, int dataSize, float* kernel, int kernelSize)
{
    if (!src || !dst || !kernel) return -1;
    if (dataSize <= 0 || kernelSize <= 0) return -1;

    int kCenter = kernelSize >> 1;
    int pad = dataSize % 8;                  /* pad data length up to a multiple of 8 */
    int x, kx;
    if (pad) pad = 8 - pad;

    /* +8 slack so the last 8-wide load stays inside the allocation */
    size_t inSize = (size_t)dataSize + 2 * kCenter + pad + 8;
    unsigned char* in = (unsigned char*)malloc(inSize);
    if (!in) return -1;                      /* FIX: allocation result was unchecked */
    memset(in, 0, inSize);                   /* FIX: zero the whole buffer, slack included */
    memcpy(in + kCenter, src, (size_t)dataSize);

    float32x4_t k_neon;
    uint8x8_t neon_x_u8;
    uint16x8_t neon_x_u16_8;
    uint16x4_t neon_x_u16_4, neon_x_u16_4_2;
    uint32x4_t neon_x_u32;
    float32x4_t neon_x_f32;
    float32x4_t neon_sum_f32_high, neon_sum_f32_low;
    const float32x4_t half = vdupq_n_f32(0.5f);  /* rounding bias, matches scalar +0.5f */

    for (x = 0; x < dataSize; x += 8)
    {
        neon_sum_f32_high = vdupq_n_f32(0);
        neon_sum_f32_low  = vdupq_n_f32(0);
        for (kx = 0; kx < kernelSize; kx++)
        {
            neon_x_u8 = vld1_u8(in + x + kx);                     /* load 8 pixels */
            neon_x_u16_8 = vmovl_u8(neon_x_u8);
            k_neon = vld1q_dup_f32(&kernel[kernelSize - kx - 1]); /* broadcast flipped tap */
            /* low 4 lanes: widen u16 -> u32, convert to f32, multiply-accumulate */
            neon_x_u16_4 = vget_low_u16(neon_x_u16_8);
            neon_x_u32 = vmovl_u16(neon_x_u16_4);
            neon_x_f32 = vcvtq_f32_u32(neon_x_u32);
            neon_sum_f32_low = vmlaq_f32(neon_sum_f32_low, neon_x_f32, k_neon);
            /* high 4 lanes, same treatment */
            neon_x_u16_4 = vget_high_u16(neon_x_u16_8);
            neon_x_u32 = vmovl_u16(neon_x_u16_4);
            neon_x_f32 = vcvtq_f32_u32(neon_x_u32);
            neon_sum_f32_high = vmlaq_f32(neon_sum_f32_high, neon_x_f32, k_neon);
        }
        /* |sum| + 0.5 then saturating narrow: 2 x float32x4 -> uint8x8.
         * FIX: matches the scalar (unsigned char)(fabs(sum) + 0.5f). */
        neon_x_u32 = vcvtq_u32_f32(vaddq_f32(vabsq_f32(neon_sum_f32_low), half));
        neon_x_u16_4 = vqmovn_u32(neon_x_u32);
        neon_x_u32 = vcvtq_u32_f32(vaddq_f32(vabsq_f32(neon_sum_f32_high), half));
        neon_x_u16_4_2 = vqmovn_u32(neon_x_u32);
        neon_x_u16_8 = vcombine_u16(neon_x_u16_4, neon_x_u16_4_2);
        neon_x_u8 = vqmovn_u16(neon_x_u16_8);
        if (x + 8 <= dataSize) {
            vst1_u8(dst + x, neon_x_u8);     /* full 8-byte store */
        } else {
            /* FIX: partial tail — a full 8-byte store would run past dst
             * whenever dataSize is not a multiple of 8 */
            unsigned char tail[8];
            vst1_u8(tail, neon_x_u8);
            memcpy(dst + x, tail, (size_t)(dataSize - x));
        }
    }
    free(in);
    return 0;
}
/*convolve2DSeparable:neon优化的可分离二维卷积,先对行一维卷积,再对列进行一维卷积*/
/*
 * convolve2DSeparable (NEON): separable 2-D convolution built on convolve1D.
 *
 * Runs a horizontal 1-D pass over every row, transposes the result,
 * runs the "vertical" pass as another horizontal pass on the transposed
 * image, then transposes back (NEON loads 8 consecutive bytes, so
 * column-wise convolution must be done on transposed data).
 *
 * dst must hold dataSizeX * dataSizeY bytes; it doubles as scratch for
 * the transposed intermediate.  Returns 0 on success, -1 on error.
 */
int convolve2DSeparable(unsigned char* src, unsigned char* dst, int dataSizeX, int dataSizeY,
                        float* kernelX, int kSizeX, float* kernelY, int kSizeY)
{
    int x, y;
    if (!src || !dst || !kernelX || !kernelY) return -1;
    if (dataSizeX <= 0 || dataSizeY <= 0 || kSizeX <= 0 || kSizeY <= 0) return -1;

    unsigned char* in_tmp = (unsigned char*)malloc((size_t)dataSizeX * (size_t)dataSizeY);
    if (!in_tmp) return -1;                  /* FIX: allocation result was unchecked */

    /* horizontal pass: 1-D convolution of each row
     * (FIX: convolve1D failures are now propagated instead of ignored) */
    for (y = 0; y < dataSizeY; y++) {
        if (convolve1D(src + y * dataSizeX, in_tmp + y * dataSizeX, dataSizeX, kernelX, kSizeX) != 0) {
            free(in_tmp);
            return -1;
        }
    }
    /* transpose into dst so columns become rows */
    for (y = 0; y < dataSizeY; y++) {
        for (x = 0; x < dataSizeX; x++) {
            dst[x * dataSizeY + y] = in_tmp[y * dataSizeX + x];
        }
    }
    /* "vertical" pass: 1-D convolution of each transposed row */
    for (x = 0; x < dataSizeX; x++) {
        if (convolve1D(dst + x * dataSizeY, in_tmp + x * dataSizeY, dataSizeY, kernelY, kSizeY) != 0) {
            free(in_tmp);
            return -1;
        }
    }
    /* transpose back into the output layout */
    for (y = 0; y < dataSizeY; y++) {
        for (x = 0; x < dataSizeX; x++) {
            dst[y * dataSizeX + x] = in_tmp[x * dataSizeY + y];
        }
    }
    free(in_tmp);
    return 0;
}
测试:
测试图为1920*1080的灰度图,测试平台为海思3559板子,卷积核大小为5*5,所有值均为0.25(平均模糊卷积),分别对neon优化代码和原来的C语言代码进行普通编译和O3级优化编译,并比较运行时间
编译命令:
arm-hisiv600-linux-g++ -mfloat-abi=softfp -mfpu=neon -o neon_test1080 neon_test.c
arm-hisiv600-linux-g++ -mfloat-abi=softfp -mfpu=neon -O3 neon_test.c
测试结果
测试结果与平台以及编译器相关,经过不同平台的测试,O3优化的加速比在不同平台和编译器下不同,但基本能提速2-10倍,而neon相比于C的提速比,经测试,似乎与处理的图片大小有关,没有进一步测试。