关于FPGA中cordic算法的设计与关键细节

关于FPGA中cordic算法的设计与关键细节

一 CORDIC算法核心内涵

CORDIC 算法是一种旋转变换算法：通过若干次迭代，使横坐标值的平方趋于 $x^2+y^2$（相差一个固定增益），纵坐标 y 趋于零，z 轴角度趋于原直角坐标 (x,y) 的角度值。它可用于任意两点间距离计算、直角坐标到极坐标的转换，应用方面包括图像边缘灰度值获取等；由于只需移位与加减运算，可在非常大程度上减少浮点运算导致的资源消耗，在硬件环境下应用广泛。
二 算法原理
初始坐标值(x,y)进行若干次迭代运算使y趋于0,先假设初始坐标值[Xi,Yi,Zi],旋转后坐标值[Xj,Yj,Zj],Z表示角度,P值为旋转过程中的增益补偿量,一般取值为0.6072,旋转公式如下
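$$[X_j,\ Y_j,\ Z_j]=\left[\tfrac{1}{P}\sqrt{X_i^2+Y_i^2},\ 0,\ Z_i+\arctan(Y_i/X_i)\right]$$
即经过多次旋转迭代，向量最终被旋转到 x 轴上，总旋转角度为 $\arctan(Y_i/X_i)$；迭代结果的模长带有 $1/P\approx1.647$ 的增益，乘以 $P$ 后即得到真实模长。迭代公式为
$$X_{n+1}=X_n-A_nY_n$$
$$Y_{n+1}=Y_n+A_nX_n$$
$$Z_{n+1}=Z_n-S_n\arctan(2^{-n})$$
其中 $A_n=S_n\cdot 2^{-n}$，$S_n$ 由 $Y_n$ 的符号决定：为使 Y 趋于零，当 $Y_n>0$ 时 $S_n=-1$，当 $Y_n<0$ 时 $S_n=1$。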
由于角度值一般情况下为小数，FPGA 中一般需对其做整型（定点）转换。FPGA 中迭代次数一般达到15次即可；归一化系数设置为2的20次方，即定义 360° 对应 $2^{20}$，则 1 rad 对应 $2^{20}/(2\pi)\approx166886.053$。在 FPGA 中通过建立 arctan 查找表的方式进行迭代运算，其结果误差随输入数据增大而减小。
三 fpga电路设计
分为四大模块,预处理模块,单次运算迭代模块,核心处理模块,后续处理模块
1 预处理模块
首先对输入值取绝对值，使其位于第一象限；然后对 X、Y 进行交换使 X 不小于 Y，即位于第一象限的前半部分（0~45度），同时记录符号与交换信息，代码如下

// An highlighted block
`timescale 1ps/1ps
// Pre-processing stage of the CORDIC pipeline.
//
// Folds a signed (two's-complement) input vector (din_x, din_y) into the
// region 0 <= y <= x and reports how it was folded, in two register stages:
//   stage 1: register |din_x|, |din_y| and both sign bits;
//   stage 2: register dout_x = max(|x|,|y|), dout_y = min(|x|,|y|) and
//            din_info = {y_sign, x_sign, swapped}.
// dout_valid is din_valid delayed by two clocks and is aligned with
// dout_x / dout_y / din_info.
//
// Ports:
//   clk, rst_n     clock and active-low asynchronous reset
//   din_valid      qualifies din_x / din_y
//   din_x, din_y   DW-bit two's-complement inputs
//   dout_x, dout_y DW-bit magnitudes after folding
//   din_info       (output, despite the name) {y_sign, x_sign, swapped}
//   dout_valid     qualifies the outputs
//
// NOTE: the most negative input (1 followed by zeros) has no positive
// two's-complement counterpart; abs() returns the same bit pattern for it.
// NOTE: the data path is a fixed two-stage pipeline; `latency` must stay 2.
module cordic_pre(
	clk,
	rst_n,
	din_valid,
	din_x,
	din_y,
	dout_x,
	dout_y,
	din_info,
	dout_valid
	);
	parameter DW=16;
	parameter latency=2;
	input clk;
	input rst_n;
	input din_valid;
	input [DW-1:0]din_x;
	input [DW-1:0]din_y;
	output reg [DW-1:0]dout_x;
	output reg [DW-1:0]dout_y;
	output reg [2:0]din_info;
	output dout_valid;
	reg [latency-1:0]din_valid_r;
	reg [DW-1:0]abs_x;
	reg [DW-1:0]abs_y;
	wire [DW-1:0]x_swap;
	wire [DW-1:0]y_swap;
	wire swap;
	reg x_signed,y_signed;

	// Two's-complement absolute value: negate when the sign bit is set.
	function [DW-1:0]abs;
		input [DW-1:0]data;
		begin
			if(data[DW-1]==1'b1)
				abs=1'b1+(~data);
			else
				abs=data;
		end
	endfunction

	// Valid shift register: bit 0 is aligned with abs_x/abs_y (stage 1),
	// bit latency-1 is aligned with the output registers (stage 2).
	always@(posedge clk or negedge rst_n)
	begin
		if(!rst_n)
			din_valid_r<={latency{1'b0}};
		else
		begin
			din_valid_r<={din_valid_r[latency-2:0],din_valid};
		end
	end

	// Swap so that the larger magnitude ends up on X.
	assign x_swap=(abs_y>abs_x)?abs_y:abs_x;
	assign y_swap=(abs_y>abs_x)?abs_x:abs_y;
	assign swap=(abs_y>abs_x)?1'b1:1'b0;

	// Stage 1: magnitudes and sign bits.
	always@(posedge clk or negedge rst_n)
	begin
		if(!rst_n)
		begin
			x_signed<=1'b0;
			abs_x<={DW{1'b0}};
			y_signed<=1'b0;
			abs_y<={DW{1'b0}};
		end
		else
		begin
			abs_x<=#1 (abs(din_x));
			x_signed<=din_x[DW-1];
			abs_y<=#1 (abs(din_y));
			y_signed<=din_y[DW-1];
		end
	end

	// Stage 2: folded outputs.
	// The load enable must be the valid bit that travels with abs_x/abs_y,
	// i.e. din_valid_r[0]. Gating on din_valid_r[1] (one clock later) would
	// skip the load for an isolated din_valid pulse and for the first sample
	// of every burst while dout_valid still asserts.
	always@(posedge clk or negedge rst_n)
	begin
		if(!rst_n)
		begin
			dout_x<={DW{1'b0}};
			dout_y<={DW{1'b0}};
			din_info<=3'b0;
		end
		else
		begin
			if(din_valid_r[0]==1'b1)
			begin
				dout_x<=#1 x_swap;
				dout_y<=#1 y_swap;
				din_info<=#1 {y_signed,x_signed,swap};
			end
		end
	end
	assign dout_valid=din_valid_r[latency-1];
	endmodule

2 单次运算迭代模块
得到预处理模块输出后进行迭代运算，核心部分在于移位加法运算：每次迭代需1次取符号、1次查表、2次移位和3次加减运算，运算延迟一个时钟。代码如下

// An highlighted block
// One registered CORDIC micro-rotation (vectoring mode), one clock of latency.
//
// For stage index PIPE_ID the unit steers din_y towards zero:
//   din_y >= 0 : x + (y >>> PIPE_ID),  y - (x >>> PIPE_ID),  z + atan step
//   din_y <  0 : x - (y >>> PIPE_ID),  y + (x >>> PIPE_ID),  z - atan step
// The output registers only update on cycles where din_valid is high;
// dout_valid is din_valid delayed by one clock.
//
// Parameters:
//   DW       width of the x / y data path (two's complement)
//   PIPE_ID  stage index, selects the shift amount and the arctan constant
//            (table entries exist for 0 .. IR_NUM-1)
module cordic_ir_unit(
	clk,
	rst_n,
	din_valid,
	din_x,
	din_y,
	din_z,
	dout_valid,
	dout_x,
	dout_y,
	dout_z
	);
	parameter DW=16;
	parameter PIPE_ID=1;
	localparam DW_NOR=20;
	localparam IR_NUM=15;
	localparam latency=1;

	input clk;
	input rst_n;
	input din_valid;
	input signed[DW-1:0]din_x;
	input signed[DW-1:0]din_y;
	input [DW_NOR-1:0]din_z;
	output dout_valid;
	output reg [DW-1:0]dout_x;
	output reg [DW-1:0]dout_y;
	output reg [DW_NOR-1:0]dout_z;

	// arctan(2^-idx) table; 20'h20000 is the idx==0 (45 degree) entry.
	// Indices outside the table yield X, like an out-of-range array read.
	function [DW_NOR-1:0]atan_step;
		input integer idx;
		begin
			case(idx)
				0:  atan_step=20'h20000;
				1:  atan_step=20'h12E40;
				2:  atan_step=20'h09FB4;
				3:  atan_step=20'h05111;
				4:  atan_step=20'h028B1;
				5:  atan_step=20'h0145D;
				6:  atan_step=20'h00A2F;
				7:  atan_step=20'h00518;
				8:  atan_step=20'h0028C;
				9:  atan_step=20'h00146;
				10: atan_step=20'h000A3;
				11: atan_step=20'h00051;
				12: atan_step=20'h00029;
				13: atan_step=20'h00014;
				14: atan_step=20'h0000A;
				default: atan_step={DW_NOR{1'bx}};
			endcase
		end
	endfunction

	// Angle increment of this stage.
	wire [DW_NOR-1:0]step_z=atan_step(PIPE_ID);

	// Sign-preserving right shifts of the cross terms (a shift by 0 for stage 0).
	wire signed [DW-1:0]y_shr=din_y>>>PIPE_ID;
	wire signed [DW-1:0]x_shr=din_x>>>PIPE_ID;

	// Rotation direction follows the sign bit of din_y.
	wire y_neg=din_y[DW-1];

	wire signed [DW-1:0]next_x=y_neg?(din_x-y_shr):(din_x+y_shr);
	wire signed [DW-1:0]next_y=y_neg?(din_y+x_shr):(din_y-x_shr);
	wire [DW_NOR-1:0]next_z=y_neg?(din_z-step_z):(din_z+step_z);

	reg valid_q;

	always@(posedge clk or negedge rst_n)
	begin
		if(!rst_n)
		begin
			valid_q<=1'b0;
			dout_x<={DW{1'b0}};
			dout_y<={DW{1'b0}};
			dout_z<={DW_NOR{1'b0}};
		end
		else
		begin
			valid_q<=din_valid;
			// Hold the previous result when the input is not valid.
			if(din_valid)
			begin
				dout_x<=next_x;
				dout_y<=next_y;
				dout_z<=next_z;
			end
		end
	end

	assign dout_valid=valid_q;
	endmodule

3,核心处理单元
对单次迭代运算模块进行例化，以菊花链形式连接，上一级输出接下一级输入；输入数据扩展4位小数位，输出为第15次迭代运算值，代码如下

// An highlighted block
// CORDIC iteration core: PIPELINE cordic_ir_unit stages chained back to back.
//
// din_x / din_y are extended with DW_FRAC zero fraction bits on the LSB side,
// then passed through stages with PIPE_ID = 0 .. PIPELINE-1. Each stage
// forwards its own dout_valid to the next stage's din_valid, so the valid
// flag travels with the data. dout_x / dout_z are the outputs of the last
// stage; the final y value is not exported.
//
// Parameters:
//   PIPELINE  number of chained iteration stages
//   DW        width of din_x / din_y / din_z
//   DW_FRAC   number of fraction bits appended to x and y
//   DW_NOR    width of the angle accumulator
//
// NOTE(review): din_z is declared DW bits wide and is zero-extended into the
// DW_NOR-bit angle accumulator below - confirm this is the intended width.
module cordic_core(
	clk,
	rst_n,
	din_valid,
	din_x,
	din_y,
	din_z,
	dout_valid,
	dout_x,
	dout_z
	);
	parameter PIPELINE=15;
	parameter DW=16;
	parameter DW_FRAC=4;
	parameter DW_NOR=20;
	input clk;
	input rst_n;
	input din_valid;
	input [DW-1:0]din_x;
	input [DW-1:0]din_y;
	input [DW-1:0]din_z;
	output [DW+DW_FRAC-1:0]dout_x;
	output [DW_NOR-1:0]dout_z;
	output dout_valid;

	// Index k holds the signals entering stage k (= leaving stage k-1);
	// index PIPELINE holds the final result.
	wire [DW+DW_FRAC-1:0]din_x_frac[PIPELINE:0];
	wire [DW+DW_FRAC-1:0]din_y_frac[PIPELINE:0];
	wire [DW_NOR-1:0]din_z_temp[PIPELINE:0];
	wire dout_valid_temp[PIPELINE:0];

	// Stage-0 inputs: integer part from the ports, fraction bits cleared.
	assign din_x_frac[0][DW+DW_FRAC-1:DW_FRAC]=din_x;
	assign din_x_frac[0][DW_FRAC-1:0]={DW_FRAC{1'b0}};
	assign din_y_frac[0][DW+DW_FRAC-1:DW_FRAC]=din_y;
	assign din_y_frac[0][DW_FRAC-1:0]={DW_FRAC{1'b0}};
	assign din_z_temp[0]=din_z;
	assign dout_valid_temp[0]=din_valid;

	// Parameters are passed with named overrides instead of defparam, and the
	// loop sits directly in the generate region (no bare begin/end wrapper),
	// which keeps the construct portable across tools.
	genvar n;
	generate
		for(n=1;n<=PIPELINE;n=n+1)
		begin:gen_pipeline
			cordic_ir_unit #(
				.DW(DW+DW_FRAC),
				.PIPE_ID(n-1)
			) u_ir_unit(
				.clk(clk),
				.rst_n(rst_n),
				.din_valid(dout_valid_temp[n-1]),
				.din_x(din_x_frac[n-1]),
				.din_y(din_y_frac[n-1]),
				.din_z(din_z_temp[n-1]),
				.dout_valid(dout_valid_temp[n]),
				.dout_x(din_x_frac[n]),
				.dout_y(din_y_frac[n]),
				.dout_z(din_z_temp[n])
			);
		end
	endgenerate

	assign dout_x=din_x_frac[PIPELINE];
	assign dout_z=din_z_temp[PIPELINE];
	assign dout_valid=dout_valid_temp[PIPELINE];
	endmodule

4,后续处理模块
后续处理模块单元实现坐标还原,对象限位置实现还原,处理核需要n-1个时钟完成迭代运算,预处理的象限信息缓存n-1个时钟与处理结果对齐。
在象限还原时,根据预处理逆运算
1,还原x与y交换信息
2,还原y轴
3,还原x轴
运算模块开销3个时钟,代码如下

// An highlighted block
// Post-processing stage of the CORDIC pipeline (3 clocks of latency).
//
// Magnitude path: scales din_x by approximately 0.6073 with shifts and adds
//   g0 = x/2 + x/8, g1 = x/64 + x/512, g2 = g0 - g1, g3 = g2 - g2/4096
// and drives dout_x with g3.
//
// Angle path: undoes the folding recorded in din_info = {y_sign, x_sign,
// swapped}, one step per clock:
//   1. swapped : angle = quarter turn - angle
//   2. x_sign  : angle = half turn    - angle   (mirror about the Y axis)
//   3. y_sign  : angle = full turn    - angle   (mirror about the X axis)
// Angles are DW_NOR-bit unsigned with a full turn equal to 2^20, so the
// full-turn constant wraps to zero.
//
// din_info is sampled every clock and delayed internally: the swap flag is
// taken latency_core clocks after it was presented, the x sign one clock
// later, and the y sign two clocks later, matching the three angle stages.
//
// Ports:
//   clk, rst_n  clock and active-low asynchronous reset
//   din_valid   qualifies din_x / din_z
//   din_x       magnitude before gain correction
//   din_z       angle before unfolding (values with the MSB set are
//               treated as negative and clamped to zero)
//   din_info    {y_sign, x_sign, swapped}
//   dout_valid  din_valid delayed by 3 clocks
//   dout_x      gain-corrected magnitude
//   dout_z      unfolded angle
//
// NOTE(review): DW_FRAC defaults to 14 here; confirm it matches the number
// of fraction bits produced by the stage that drives din_x.
module cordic_post(
	clk,
	rst_n,
	din_valid,
	din_x,
	din_z,
	din_info,
	dout_valid,
	dout_x,
	dout_z
	);
	parameter DW=16;
	parameter DW_FRAC=14;
	parameter DW_NOR=20;
	parameter PIPELINE=15;
	localparam latency_pre=2;
	localparam latency_core=15;
	localparam latency=3;
	localparam const_half_pi=20'h40000;
	localparam const_pi=20'h80000;
	localparam const_double_pi=20'h00000;
	input clk;
	input rst_n;
	input din_valid;
	input [DW+DW_FRAC-1:0]din_x;
	input [DW_NOR-1:0]din_z;
	// Three flag bits; the width is fixed and does not depend on `latency`.
	input [2:0]din_info;
	output [DW+DW_FRAC-1:0]dout_x;
	output reg[DW_NOR-1:0]dout_z;
	output dout_valid;
	integer n;
	reg [DW+DW_FRAC-1:0]gain_temp[0:3];
	wire [DW_NOR-1:0]angle_temp;
	wire [DW_NOR-1:0]angle_valid;
	wire [DW_NOR-1:0]angle_swap;
	reg [2:0]din_info_r[latency_core+latency-1:0];
	reg [latency-1:0]din_valid_r;
	reg [DW_NOR-1:0]angle_swap_r[0:latency-1];
	wire [DW_NOR-1:0]angle_temp_x;
	wire [DW_NOR-1:0]angle_temp_y;

	// Gain correction, three register stages.
	always@(posedge clk or negedge rst_n)
	begin
		if(!rst_n)
		begin
			gain_temp[0]<={(DW+DW_FRAC){1'b0}};
			gain_temp[1]<={(DW+DW_FRAC){1'b0}};
			gain_temp[2]<={(DW+DW_FRAC){1'b0}};
			gain_temp[3]<={(DW+DW_FRAC){1'b0}};
		end
		else
		begin
			if(din_valid==1'b1)
			begin
				gain_temp[0]<=(din_x>>1)+(din_x>>3);
				gain_temp[1]<=(din_x>>6)+(din_x>>9);
			end
			if(din_valid_r[0]==1'b1)
			begin
				gain_temp[2]<=gain_temp[0]-gain_temp[1];
			end
			if(din_valid_r[1]==1'b1)
			begin
				gain_temp[3]<=gain_temp[2]-(gain_temp[2]>>12);
			end
		end
	end
	assign dout_x=gain_temp[3];

	// Valid pipeline. Reset so dout_valid is defined right after reset.
	always@(posedge clk or negedge rst_n)
	begin
		if(!rst_n)
			din_valid_r<={latency{1'b0}};
		else
			din_valid_r<={din_valid_r[latency-2:0],din_valid};
	end

	// Delay line for the folding flags: entry k is din_info from k+1 clocks ago.
	always@(posedge clk)
	begin
		din_info_r[0]<=din_info;
		for(n=1;n<latency_core+latency;n=n+1)
		din_info_r[n]<=din_info_r[n-1];
	end

	// Step 1 (combinational on the input): clamp negative angles to zero,
	// then undo the x/y swap.
	assign angle_valid=(din_valid==1'b1)?din_z:{DW_NOR{1'b0}};
	assign angle_temp=(angle_valid[DW_NOR-1]==1'b1)?({DW_NOR{1'b0}}):angle_valid;
	assign angle_swap=(din_info_r[latency_core-1][0]==1'b1)?(const_half_pi-angle_temp):angle_temp;

	// Step 2: undo the x sign. A negative x mirrors the vector about the Y
	// axis, so the angle becomes (half turn - angle), which needs const_pi.
	assign angle_temp_x=(din_info_r[latency_core][1]==1'b1)?(const_pi-angle_swap_r[0]):angle_swap_r[0];

	// Step 3: undo the y sign. angle_swap_r[1] is one register later than
	// angle_swap_r[0], so its flags come from one delay-line entry further on.
	assign angle_temp_y=(din_info_r[latency_core+1][2]==1'b1)?(const_double_pi-angle_swap_r[1]):angle_swap_r[1];

	always@(posedge clk)
	begin
		angle_swap_r[0]<=angle_swap;
		angle_swap_r[1]<=angle_temp_x;
		dout_z<=angle_temp_y;
	end

	assign dout_valid=din_valid_r[2];
	endmodule

四 总结
cordic运算结果与输入值大小有关,与输入位数有关,最终值X除以16 ,为实际距离值,误差不大于百分之20,通过顶层模块将这四部分连接,总共消耗1288个查找表与1086个寄存器,减少资源消耗,实现预期设计目的。

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值