OpenCL Implementation of a Sobel Filter (Row-Column Decomposition)

This post presents two OpenCL implementations of a row/column-separable Sobel filter. In the first version the input image is read directly from global memory and only the intermediate row-filtered results are staged in local memory; the second version additionally caches the input tile in LDS (Local Data Share) to avoid redundant global-memory reads.

1. Mathematical principle of the row-column decomposition

Each 3x3 Sobel kernel is separable: it factors into a 1x3 row filter followed by a 3x1 column filter, so the 2D convolution can be computed as two 1D passes whose intermediate results are shared through local memory.

Horizontal gradient Gx: row_filter = [1 0 -1], col_filter = [1 2 1]

Vertical gradient Gy: row_filter = [1 2 1], col_filter = [1 0 -1]
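
Written as outer products (the sign convention follows the filters above; it is irrelevant for the final result because only absolute values are kept), the two Sobel kernels factor as:

$$
G_x = \begin{bmatrix} 1 \\ 2 \\ 1 \end{bmatrix}
      \begin{bmatrix} 1 & 0 & -1 \end{bmatrix}
    = \begin{bmatrix} 1 & 0 & -1 \\ 2 & 0 & -2 \\ 1 & 0 & -1 \end{bmatrix},
\qquad
G_y = \begin{bmatrix} 1 \\ 0 \\ -1 \end{bmatrix}
      \begin{bmatrix} 1 & 2 & 1 \end{bmatrix}
    = \begin{bmatrix} 1 & 2 & 1 \\ 0 & 0 & 0 \\ -1 & -2 & -1 \end{bmatrix}
$$

Each output pixel therefore needs only a 1x3 row pass and a 3x1 column pass instead of a full 3x3 window, and the row-pass results computed by a work-group can be reused by the column pass via local memory.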

2. Non-LDS implementation (input read directly from global memory)

// padSrc : input image padded with a 1-pixel border on each side (pad_width == width + FILTERSIZE - 1)
// dst    : unpadded output image of size height x width
// LOCAL_XRES, LOCAL_YRES and FILTERSIZE are compile-time constants (e.g. supplied as -D build options)
__kernel void sobel_filter_separable(__global uchar* padSrc, __global uchar* dst, int height, int width, int pad_width)
{
	__local short local_output_x[LOCAL_XRES * (LOCAL_YRES + FILTERSIZE - 1)];
	__local short local_output_y[LOCAL_XRES * (LOCAL_YRES + FILTERSIZE - 1)];

	uint col = get_global_id(0);
    uint row = get_global_id(1);

	// Assumes the global work size is exactly width x height (both multiples of the
	// work-group size), so no work-item returns here while others reach the barrier below.
	if (col >= width || row >= height) return;

	int lid_x = get_local_id(0);
    int lid_y = get_local_id(1);

	/* row-wise */
	// row_filterx = [1 0 -1]
	local_output_x[lid_y * LOCAL_XRES + lid_x] = padSrc[row * pad_width + col] - padSrc[row * pad_width + col + 2];
	// row_filtery = [1 2 1]
	local_output_y[lid_y * LOCAL_XRES + lid_x] = padSrc[row * pad_width + col] + padSrc[row * pad_width + col + 1] * 2 + padSrc[row * pad_width + col + 2];
	           
	if (lid_y < FILTERSIZE - 1) {
		local_output_x[(lid_y + LOCAL_YRES) * LOCAL_XRES + lid_x] = padSrc[(row + LOCAL_YRES) * pad_width + col] - padSrc[(row + LOCAL_YRES) * pad_width + col + 2];
		local_output_y[(lid_y + LOCAL_YRES) * LOCAL_XRES + lid_x] = padSrc[(row + LOCAL_YRES) * pad_width + col] + padSrc[(row + LOCAL_YRES) * pad_width + col + 1] * 2 + padSrc[(row + LOCAL_YRES) * pad_width + col + 2];
	}
	
	barrier(CLK_LOCAL_MEM_FENCE); 

	/* col-wise */
	// col_filterx = [1 2 1]
	ushort sumx = abs(local_output_x[lid_y * LOCAL_XRES + lid_x] + local_output_x[(lid_y + 1) * LOCAL_XRES + lid_x] * 2 + local_output_x[(lid_y + 2) * LOCAL_XRES + lid_x]);
	// col_filtery = [1 0 -1]
	ushort sumy = abs(local_output_y[lid_y * LOCAL_XRES + lid_x] - local_output_y[(lid_y + 2) * LOCAL_XRES + lid_x]);

	// Saturating conversion: values above 255 are clamped instead of wrapping around.
	dst[row * width + col] = convert_uchar_sat(sumx + sumy);
}
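
For completeness, here is a minimal host-side sketch (plain C, OpenCL 1.2 API) of how this kernel could be built and launched. The image size, the file name sobel_separable.cl, the zero-padded border and the macro values LOCAL_XRES = LOCAL_YRES = 16, FILTERSIZE = 3 are all assumptions for illustration; error checking is omitted.

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

#define LOCAL_XRES 16
#define LOCAL_YRES 16
#define FILTERSIZE 3

int main(void)
{
    const int width = 640, height = 480;               /* assumed image size (multiples of 16) */
    const int pad_width  = width  + FILTERSIZE - 1;    /* 1-pixel border on each side          */
    const int pad_height = height + FILTERSIZE - 1;

    /* Zero-padded input and plain output buffers on the host. */
    unsigned char *padSrc = calloc((size_t)pad_width * pad_height, 1);
    unsigned char *dst    = malloc((size_t)width * height);
    /* ... copy the source image into padSrc[(y + 1) * pad_width + (x + 1)] here ... */

    cl_platform_id platform;
    cl_device_id   device;
    cl_int         err;
    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

    cl_context       ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    cl_command_queue q   = clCreateCommandQueue(ctx, device, 0, &err);

    /* Read the kernel source (file name is an assumption). */
    FILE *f = fopen("sobel_separable.cl", "rb");
    fseek(f, 0, SEEK_END);
    long len = ftell(f);
    rewind(f);
    char *src = malloc(len + 1);
    fread(src, 1, (size_t)len, f);
    src[len] = '\0';
    fclose(f);

    cl_program prog = clCreateProgramWithSource(ctx, 1, (const char **)&src, NULL, &err);
    /* The macros referenced inside the kernel are supplied as build options. */
    clBuildProgram(prog, 1, &device, "-DLOCAL_XRES=16 -DLOCAL_YRES=16 -DFILTERSIZE=3", NULL, NULL);
    cl_kernel k = clCreateKernel(prog, "sobel_filter_separable", &err);

    cl_mem dSrc = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 (size_t)pad_width * pad_height, padSrc, &err);
    cl_mem dDst = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (size_t)width * height, NULL, &err);

    clSetKernelArg(k, 0, sizeof(cl_mem), &dSrc);
    clSetKernelArg(k, 1, sizeof(cl_mem), &dDst);
    clSetKernelArg(k, 2, sizeof(int),    &height);
    clSetKernelArg(k, 3, sizeof(int),    &width);
    clSetKernelArg(k, 4, sizeof(int),    &pad_width);

    /* Global size equals width x height, matching the early-return/barrier pattern in the kernel. */
    size_t global[2] = { (size_t)width, (size_t)height };
    size_t local[2]  = { LOCAL_XRES, LOCAL_YRES };
    clEnqueueNDRangeKernel(q, k, 2, NULL, global, local, 0, NULL, NULL);
    clEnqueueReadBuffer(q, dDst, CL_TRUE, 0, (size_t)width * height, dst, 0, NULL, NULL);

    /* ... use dst ... */
    clReleaseMemObject(dSrc); clReleaseMemObject(dDst);
    clReleaseKernel(k); clReleaseProgram(prog);
    clReleaseCommandQueue(q); clReleaseContext(ctx);
    free(src); free(padSrc); free(dst);
    return 0;
}

The LDS kernel in the next section has the same signature, so the same host code works after swapping the kernel name.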

3. LDS implementation (input tile cached in local memory)

// Same interface as sobel_filter_separable; here the input tile is first cached in local memory (LDS).
__kernel void sobel_filter_separable_lds(__global uchar* padSrc, __global uchar* dst, int height, int width, int pad_width)
{
	__local short local_output_x[LOCAL_XRES * (LOCAL_YRES + FILTERSIZE - 1)];
	__local short local_output_y[LOCAL_XRES * (LOCAL_YRES + FILTERSIZE - 1)];
	__local uchar local_input[(LOCAL_XRES + FILTERSIZE - 1) * (LOCAL_YRES + FILTERSIZE - 1)];
	 
	uint col = get_global_id(0);
    uint row = get_global_id(1);

	// Same assumption as above: the global size matches width x height, so this early
	// return never diverges within a work-group before the barriers below.
	if (col >= width || row >= height) return;

	int lid_x = get_local_id(0);
    int lid_y = get_local_id(1);

	int tile_xres = (LOCAL_XRES + FILTERSIZE - 1);
    int tile_yres = (LOCAL_YRES + FILTERSIZE - 1);

    int start_col = get_group_id(0) * LOCAL_XRES; 
    int start_row = get_group_id(1) * LOCAL_YRES; 
 
    // Cooperative load of the (tile_xres x tile_yres) input tile: the LOCAL_XRES * LOCAL_YRES
    // work-items stride through the tile until every padded element has been copied from
    // global memory into local_input.
    int lid = lid_y * LOCAL_XRES + lid_x;
    int gx, gy;

	do {
        gy = lid / tile_xres;                      // row within the tile
        gx = lid - gy * tile_xres;                 // column within the tile

        local_input[lid] = padSrc[(start_row + gy) * pad_width + (start_col + gx)];
        lid += (LOCAL_XRES * LOCAL_YRES);          // stride: one full work-group
    } while (lid < (tile_xres * tile_yres));

    barrier(CLK_LOCAL_MEM_FENCE);

	/* row-wise */
	// row_filterx = [1 0 -1]
	local_output_x[lid_y * LOCAL_XRES + lid_x] = local_input[lid_y * tile_xres + lid_x] - local_input[lid_y * tile_xres + lid_x + 2];
	// row_filtery = [1 2 1]
	local_output_y[lid_y * LOCAL_XRES + lid_x] = local_input[lid_y * tile_xres + lid_x] + local_input[lid_y * tile_xres + lid_x + 1] * 2 + local_input[lid_y * tile_xres + lid_x + 2];
	           
	if (lid_y < FILTERSIZE - 1) {
		local_output_x[(lid_y + LOCAL_YRES) * LOCAL_XRES + lid_x] = local_input[(lid_y + LOCAL_YRES) * tile_xres + lid_x] - local_input[(lid_y + LOCAL_YRES) * tile_xres + lid_x + 2];
		local_output_y[(lid_y + LOCAL_YRES) * LOCAL_XRES + lid_x] = local_input[(lid_y + LOCAL_YRES) * tile_xres + lid_x] + local_input[(lid_y + LOCAL_YRES) * tile_xres + lid_x + 1] * 2 + local_input[(lid_y + LOCAL_YRES) * tile_xres + lid_x + 2];
	}
	
	barrier(CLK_LOCAL_MEM_FENCE); 

	/* col-wise */
	// col_filterx = [1 2 1]
	ushort sumx = abs(local_output_x[lid_y * LOCAL_XRES + lid_x] + local_output_x[(lid_y + 1) * LOCAL_XRES + lid_x] * 2 + local_output_x[(lid_y + 2) * LOCAL_XRES + lid_x]);
	// col_filtery = [1 0 -1]
	ushort sumy = abs(local_output_y[lid_y * LOCAL_XRES + lid_x] - local_output_y[(lid_y + 2) * LOCAL_XRES + lid_x]);

	// Saturating conversion: values above 255 are clamped instead of wrapping around.
	dst[row * width + col] = convert_uchar_sat(sumx + sumy);
}
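
As a quick check on the local-memory budget of this version (assuming, for illustration, LOCAL_XRES = LOCAL_YRES = 16 and FILTERSIZE = 3; the post does not fix these values):

$$
\begin{aligned}
\text{local\_output\_x and local\_output\_y} &: 2 \times 16 \times (16 + 2) \times 2\,\text{B} = 1152\ \text{B} \\
\text{local\_input} &: (16 + 2) \times (16 + 2) \times 1\,\text{B} = 324\ \text{B} \\
\text{total} &: 1476\ \text{B per work-group}
\end{aligned}
$$

This is far below the 32 KB of local memory a work-group can typically use, so LDS capacity does not limit the number of concurrent work-groups here.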

Loading of the input tile into local memory works as follows: the LOCAL_XRES x LOCAL_YRES work-items of a group stride through the (LOCAL_XRES + FILTERSIZE - 1) x (LOCAL_YRES + FILTERSIZE - 1) padded tile in steps of one full work-group, so every tile element is fetched from global memory exactly once; the sketch below illustrates the index mapping.
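
A small standalone C snippet (not kernel code) that replays the same index arithmetic on the host, assuming LOCAL_XRES = LOCAL_YRES = 16 and FILTERSIZE = 3, shows which tile elements a given work-item fetches:

#include <stdio.h>

#define LOCAL_XRES 16
#define LOCAL_YRES 16
#define FILTERSIZE 3

int main(void)
{
    const int tile_xres = LOCAL_XRES + FILTERSIZE - 1;   /* 18 */
    const int tile_yres = LOCAL_YRES + FILTERSIZE - 1;   /* 18 */

    /* Replay the kernel's do-while loop for a few work-items of one work-group. */
    const int sample[] = { 0, 1, 67, 68, 255 };          /* lid = lid_y * LOCAL_XRES + lid_x */
    for (int i = 0; i < 5; ++i) {
        int lid = sample[i];
        printf("work-item %3d loads tile element(s):", sample[i]);
        do {
            int gy = lid / tile_xres;                    /* row inside the padded tile    */
            int gx = lid - gy * tile_xres;               /* column inside the padded tile */
            printf(" (%2d,%2d)", gy, gx);
            lid += LOCAL_XRES * LOCAL_YRES;              /* stride over the whole group   */
        } while (lid < tile_xres * tile_yres);           /* 18 * 18 = 324 tile elements   */
        printf("\n");
    }
    return 0;
}

With these values the padded tile has 18 x 18 = 324 elements while the group has 256 work-items, so work-items 0-67 fetch two elements each and work-items 68-255 fetch one; every tile element is written exactly once before the barrier.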

 
