关于FPGA中cordic算法的设计与关键细节
一 CORDIC算法核心内涵
CORDIC算法是一种基于坐标旋转的迭代算法(本文使用向量模式):通过若干次迭代,使纵坐标y趋于零,横坐标x趋于 $K\sqrt{x^2+y^2}$(K为迭代带来的固定增益,约为1.6468),z趋于原直角坐标(x,y)的角度值。它可用于任意两点间距离计算、直角坐标到极坐标的转换,应用方面包括图像边缘灰度值获取等;由于只需要移位和加减运算,可在很大程度上减少浮点运算导致的资源消耗,在硬件环境下应用广泛。
二 算法原理
对初始坐标值(x,y)进行若干次迭代运算使y趋于0。设初始值为 $[X_i, Y_i, Z_i]$,旋转后的值为 $[X_j, Y_j, Z_j]$,其中Z表示角度;P为旋转过程中的增益补偿量,一般取值为0.6072。旋转结果如下:
$$[X_j,\ Y_j,\ Z_j] = \left[\frac{1}{P}\sqrt{X_i^2+Y_i^2},\ 0,\ Z_i+\arctan\frac{Y_i}{X_i}\right]$$
即经过多次旋转迭代,向量最终旋转到x轴,总旋转角度为 $\arctan(Y_i/X_i)$;迭代得到的X再乘以P,即为真实的模值。迭代公式为:
$$X_{n+1}=X_n-A_nY_n$$
$$Y_{n+1}=Y_n+A_nX_n$$
$$Z_{n+1}=Z_n-S_n\arctan(2^{-n})$$
其中 $A_n=S_n\cdot 2^{-n}$,$S_n$ 由Y的符号决定:为使Y趋于零,当Y大于零时 $S_n=-1$,当Y小于零时 $S_n=1$。
由于角度值一般情况下为小数,FPGA中一般需将其转换为整型(定点数)表示。FPGA中迭代次数一般达到15次即可;归一化系数设置为2的20次方,即定义360°对应 $2^{20}$,则1 rad对应 $2^{20}/(2\pi)\approx 166886.053$。在FPGA中通过建立arctan查找表的方式进行迭代运算,其误差随输入数据增大而减小。
三 fpga电路设计
分为四大模块,预处理模块,单次运算迭代模块,核心处理模块,后续处理模块
1 预处理模块
首先对输入值取绝对值,使其位于第一象限;然后对X、Y进行交换,使X不小于Y,即位于第一象限的前半部分(0~45度)。代码如下:
// An highlighted block
`timescale 1ps/1ps
// cordic_pre -- CORDIC pre-processing stage (fixed two-clock pipeline).
//
// Folds a signed input vector (din_x, din_y) into the range where
// dout_x >= dout_y >= 0, i.e. an angle of 0..45 degrees:
//   stage 1: register |din_x|, |din_y| and both sign bits;
//   stage 2: swap the magnitudes so the larger one is driven on dout_x.
//
// Ports:
//   din_valid  - qualifies din_x/din_y.
//   din_x/y    - two's-complement inputs, DW bits each.
//   dout_x/y   - folded magnitudes, valid while dout_valid is high.
//   din_info   - OUTPUT despite its name: {y_sign, x_sign, swapped}, the
//                folding that was applied, updated together with dout_x/y.
//   dout_valid - din_valid delayed by `latency` clocks.
//
// Parameters:
//   DW      - data width of the x/y ports.
//   latency - length of the valid shift register. The data path below has
//             exactly two register stages, so this must remain 2.
module cordic_pre(
    clk,
    rst_n,
    din_valid,
    din_x,
    din_y,
    dout_x,
    dout_y,
    din_info,
    dout_valid
);
    parameter DW      = 16;
    parameter latency = 2;

    input               clk;
    input               rst_n;
    input               din_valid;
    input  [DW-1:0]     din_x;
    input  [DW-1:0]     din_y;
    output reg [DW-1:0] dout_x;
    output reg [DW-1:0] dout_y;
    output reg [2:0]    din_info;
    output              dout_valid;

    reg  [latency-1:0] din_valid_r;   // din_valid delayed by 1..latency clocks
    reg  [DW-1:0]      abs_x;         // stage-1 magnitude of din_x
    reg  [DW-1:0]      abs_y;         // stage-1 magnitude of din_y
    reg                x_signed;      // stage-1 sign bit of din_x
    reg                y_signed;      // stage-1 sign bit of din_y
    wire [DW-1:0]      x_swap;
    wire [DW-1:0]      y_swap;
    wire               swap;

    // Two's-complement magnitude. The most negative value maps onto itself
    // (bit pattern 100..0), which the unsigned compare below treats as
    // 2^(DW-1).
    function [DW-1:0] abs;
        input [DW-1:0] data;
        if (data[DW-1] == 1'b1)
            abs = 1'b1 + (~data);
        else
            abs = data;
    endfunction

    // Valid pipeline: bit k is din_valid delayed by k+1 clocks.
    always @(posedge clk or negedge rst_n)
    begin
        if (!rst_n)
            din_valid_r <= {latency{1'b0}};
        else
            din_valid_r <= {din_valid_r[latency-2:0], din_valid};
    end

    // Stage 1: magnitudes and signs (free-running, not gated by din_valid).
    always @(posedge clk or negedge rst_n)
    begin
        if (!rst_n)
        begin
            x_signed <= 1'b0;
            abs_x    <= {DW{1'b0}};
            y_signed <= 1'b0;
            abs_y    <= {DW{1'b0}};
        end
        else
        begin
            abs_x    <= #1 (abs(din_x));
            x_signed <= din_x[DW-1];
            abs_y    <= #1 (abs(din_y));
            y_signed <= din_y[DW-1];
        end
    end

    // Swap so that the larger magnitude is always on the x output.
    assign swap   = (abs_y > abs_x) ? 1'b1 : 1'b0;
    assign x_swap = swap ? abs_y : abs_x;
    assign y_swap = swap ? abs_x : abs_y;

    // Stage 2: output registers.
    // The enable is din_valid_r[0]: that bit is high in the very cycle in
    // which abs_x/abs_y hold the sample accepted one clock earlier, so the
    // outputs are loaded on the same edge that raises dout_valid
    // (din_valid_r[1]). Enabling on din_valid_r[1] would load one clock too
    // late, i.e. after dout_valid and from the *next* sample's magnitudes.
    always @(posedge clk or negedge rst_n)
    begin
        if (!rst_n)
        begin
            dout_x   <= {DW{1'b0}};
            dout_y   <= {DW{1'b0}};
            din_info <= 3'b0;
        end
        else if (din_valid_r[0] == 1'b1)
        begin
            dout_x   <= #1 x_swap;
            dout_y   <= #1 y_swap;
            din_info <= #1 {y_signed, x_signed, swap};
        end
    end

    assign dout_valid = din_valid_r[latency-1];
endmodule
2 单次运算迭代模块
得到预处理模块的输出值后进行迭代运算,核心部分在于移位加法运算:每次迭代需1次取符号、1次查表、2次移位和3次加减运算,运算延迟一个时钟。代码如下:
// An highlighted block
// cordic_ir_unit -- one registered CORDIC micro-rotation (vectoring mode).
//
// With i = PIPE_ID, on every clock where din_valid is high:
//   din_y >= 0 :  x' = x + (y >>> i),  y' = y - (x >>> i),  z' = z + atan(2^-i)
//   din_y <  0 :  x' = x - (y >>> i),  y' = y + (x >>> i),  z' = z - atan(2^-i)
// so the y component is driven towards zero while z accumulates the angle.
// Angles are unsigned DW_NOR-bit values with 2^20 representing 360 degrees.
// The outputs keep their previous value while din_valid is low; dout_valid
// is din_valid delayed by one clock.
//
// Parameters:
//   DW      - width of the x/y data path.
//   PIPE_ID - iteration index i (shift amount and angle-table index, 0..14).
module cordic_ir_unit(
    clk,
    rst_n,
    din_valid,
    din_x,
    din_y,
    din_z,
    dout_valid,
    dout_x,
    dout_y,
    dout_z
);
    parameter  DW      = 16;
    parameter  PIPE_ID = 1;
    localparam DW_NOR  = 20;
    localparam IR_NUM  = 15;
    localparam latency = 1;

    input                    clk;
    input                    rst_n;
    input                    din_valid;
    input  signed [DW-1:0]   din_x;
    input  signed [DW-1:0]   din_y;
    input  [DW_NOR-1:0]      din_z;
    output reg [DW-1:0]      dout_x;
    output reg [DW-1:0]      dout_y;
    output reg [DW_NOR-1:0]  dout_z;
    output                   dout_valid;

    reg valid_q;

    // atan(2^-idx) in units of 360deg / 2^20. Indices outside the table
    // yield X, like an out-of-range array read.
    function [DW_NOR-1:0] atan_step;
        input integer idx;
        case (idx)
            0:       atan_step = 20'h20000;
            1:       atan_step = 20'h12E40;
            2:       atan_step = 20'h09FB4;
            3:       atan_step = 20'h05111;
            4:       atan_step = 20'h028B1;
            5:       atan_step = 20'h0145D;
            6:       atan_step = 20'h00A2F;
            7:       atan_step = 20'h00518;
            8:       atan_step = 20'h0028C;
            9:       atan_step = 20'h00146;
            10:      atan_step = 20'h000A3;
            11:      atan_step = 20'h00051;
            12:      atan_step = 20'h00029;
            13:      atan_step = 20'h00014;
            14:      atan_step = 20'h0000A;
            default: atan_step = {DW_NOR{1'bx}};
        endcase
    endfunction

    wire [DW_NOR-1:0] step_angle = atan_step(PIPE_ID);

    // Arithmetic (sign-preserving) right shifts by the iteration index.
    wire signed [DW-1:0] y_shifted = din_y >>> PIPE_ID;
    wire signed [DW-1:0] x_shifted = din_x >>> PIPE_ID;

    // Valid flag: plain one-clock delay.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n)
            valid_q <= 1'b0;
        else
            valid_q <= din_valid;
    end

    assign dout_valid = valid_q;

    // Registered micro-rotation; direction chosen by the sign bit of din_y.
    always @(posedge clk or negedge rst_n) begin
        if (!rst_n) begin
            dout_x <= {DW{1'b0}};
            dout_y <= {DW{1'b0}};
            dout_z <= {DW_NOR{1'b0}};
        end
        else if (din_valid == 1'b1) begin
            if (din_y[DW-1]) begin
                // y negative
                dout_x <= din_x - y_shifted;
                dout_y <= din_y + x_shifted;
                dout_z <= din_z - step_angle;
            end
            else begin
                // y zero or positive
                dout_x <= din_x + y_shifted;
                dout_y <= din_y - x_shifted;
                dout_z <= din_z + step_angle;
            end
        end
    end
endmodule
3,核心处理单元
对单次迭代运算模块进行例化,以菊花链形式连接,上一级输出接下一级输入;输入数据在低位扩展4位小数位,输出为第15次迭代的运算值。代码如下:
// An highlighted block
// cordic_core -- chain of PIPELINE cordic_ir_unit stages.
//
// din_x/din_y are extended with DW_FRAC zero fraction bits on the LSB side
// and fed through PIPELINE stages; stage k (k = 0 .. PIPELINE-1) is
// instantiated with PIPE_ID = k. The x and z outputs of the last stage and
// its valid flag are driven on dout_x / dout_z / dout_valid.
//
// Parameters:
//   PIPELINE - number of chained stages.
//              NOTE(review): the cordic_ir_unit in this file carries a
//              15-entry angle table, so values above 15 are presumably not
//              supported -- confirm before overriding.
//   DW       - width of the din_x/din_y/din_z ports.
//   DW_FRAC  - fraction bits appended to x/y inside the chain.
//   DW_NOR   - width of the angle path.
//
// NOTE(review): din_z is DW bits wide while the angle path is DW_NOR bits;
// the assignment below zero-extends (or truncates) it implicitly.
module cordic_core(
    clk,
    rst_n,
    din_valid,
    din_x,
    din_y,
    din_z,
    dout_valid,
    dout_x,
    dout_z
);
    parameter PIPELINE = 15;
    parameter DW       = 16;
    parameter DW_FRAC  = 4;
    parameter DW_NOR   = 20;

    input                   clk;
    input                   rst_n;
    input                   din_valid;
    input  [DW-1:0]         din_x;
    input  [DW-1:0]         din_y;
    input  [DW-1:0]         din_z;
    output [DW+DW_FRAC-1:0] dout_x;
    output [DW_NOR-1:0]     dout_z;
    output                  dout_valid;

    // Index 0 is the chain input; index k is the output of stage k-1.
    wire [DW+DW_FRAC-1:0] din_x_frac      [PIPELINE:0];
    wire [DW+DW_FRAC-1:0] din_y_frac      [PIPELINE:0];
    wire [DW_NOR-1:0]     din_z_temp      [PIPELINE:0];
    wire                  dout_valid_temp [PIPELINE:0];

    assign din_x_frac[0]      = {din_x, {DW_FRAC{1'b0}}};
    assign din_y_frac[0]      = {din_y, {DW_FRAC{1'b0}}};
    assign din_z_temp[0]      = din_z;
    assign dout_valid_temp[0] = din_valid;

    genvar n;
    generate
        // A bare begin/end directly inside `generate` is not part of
        // Verilog-2005; the always-true `if` keeps the gen_iteration scope
        // (and therefore every hierarchical instance path) legal and intact.
        if (1) begin : gen_iteration
            for (n = 1; n <= PIPELINE; n = n + 1) begin : gen_pipeline
                // Parameters are bound at the instance instead of via defparam.
                cordic_ir_unit #(
                    .DW     (DW + DW_FRAC),
                    .PIPE_ID(n - 1)
                ) cordic_ir_unit (
                    .clk       (clk),
                    .rst_n     (rst_n),
                    .din_valid (dout_valid_temp[n-1]),
                    .din_x     (din_x_frac[n-1]),
                    .din_y     (din_y_frac[n-1]),
                    .din_z     (din_z_temp[n-1]),
                    .dout_valid(dout_valid_temp[n]),
                    .dout_x    (din_x_frac[n]),
                    .dout_y    (din_y_frac[n]),
                    .dout_z    (din_z_temp[n])
                );
            end
        end
    endgenerate

    assign dout_x     = din_x_frac[PIPELINE];
    assign dout_z     = din_z_temp[PIPELINE];
    assign dout_valid = dout_valid_temp[PIPELINE];
endmodule
4,后续处理模块
后续处理模块单元实现坐标还原,对象限位置实现还原,处理核需要n-1个时钟完成迭代运算,预处理的象限信息缓存n-1个时钟与处理结果对齐。
在象限还原时,根据预处理逆运算
1,还原x与y交换信息
2,还原y轴
3,还原x轴
运算模块开销3个时钟,代码如下
// An highlighted block
// cordic_post -- CORDIC post-processing (three-clock pipeline).
//
// 1. Gain compensation of the magnitude:
//      dout_x = din_x * (1/2 + 1/8 - 1/64 - 1/512) * (1 - 1/4096)
//             ~= din_x * 0.6073, computed with shifts and add/sub only.
// 2. Angle restoration, undoing the folding reported on din_info
//    ({y_sign, x_sign, swapped}); angles are unsigned, 2^20 = 360 degrees:
//      swapped : theta -> 90deg  - theta
//      x_sign  : theta -> 180deg - theta
//      y_sign  : theta -> 360deg - theta   (= -theta modulo 2^20)
//    An input angle with its MSB set is treated as negative and clamped to 0.
//
// din_info is delayed internally so that it lines up with din_x/din_z; the
// taps assume din_info leads din_valid by PIPELINE clocks.
//
// Parameters:
//   DW, DW_FRAC - din_x/dout_x are DW+DW_FRAC bits wide.
//                 NOTE(review): DW_FRAC defaults to 14 here but to 4 in
//                 cordic_core -- confirm which value the top level intends.
//   DW_NOR      - angle width; the angle constants below are 20-bit literals.
//   PIPELINE    - latency (in clocks) between din_info and din_valid.
module cordic_post(
    clk,
    rst_n,
    din_valid,
    din_x,
    din_z,
    din_info,
    dout_valid,
    dout_x,
    dout_z
);
    parameter DW       = 16;
    parameter DW_FRAC  = 14;
    parameter DW_NOR   = 20;
    parameter PIPELINE = 15;

    localparam latency_core    = PIPELINE;   // clocks between din_info and din_valid
    localparam latency         = 3;          // clocks spent inside this module
    localparam XW              = DW + DW_FRAC;
    localparam const_half_pi   = 20'h40000;  //  90 degrees
    localparam const_pi        = 20'h80000;  // 180 degrees
    localparam const_double_pi = 20'h00000;  // 360 degrees modulo 2^20

    input                   clk;
    input                   rst_n;
    input                   din_valid;
    input  [XW-1:0]         din_x;
    input  [DW_NOR-1:0]     din_z;
    input  [2:0]            din_info;
    output [XW-1:0]         dout_x;
    output reg [DW_NOR-1:0] dout_z;
    output                  dout_valid;

    integer n;

    reg  [XW-1:0]     gain_temp    [0:3];
    reg  [2:0]        din_info_r   [0:latency_core+1];  // entry k = din_info delayed k+1 clocks
    reg  [latency-1:0] din_valid_r;
    reg  [DW_NOR-1:0] angle_swap_r [0:1];

    wire [DW_NOR-1:0] angle_valid;
    wire [DW_NOR-1:0] angle_temp;
    wire [DW_NOR-1:0] angle_swap;
    wire [DW_NOR-1:0] angle_temp_x;
    wire [DW_NOR-1:0] angle_temp_y;

    // ---------------------------------------------------------------
    // Valid pipeline. Reset so that dout_valid and the gain-stage enables
    // are defined right after reset instead of X for the first clocks.
    // ---------------------------------------------------------------
    always @(posedge clk or negedge rst_n)
    begin
        if (!rst_n)
            din_valid_r <= {latency{1'b0}};
        else
            din_valid_r <= {din_valid_r[latency-2:0], din_valid};
    end

    assign dout_valid = din_valid_r[2];

    // ---------------------------------------------------------------
    // Gain compensation, one partial result per clock.
    // ---------------------------------------------------------------
    always @(posedge clk or negedge rst_n)
    begin
        if (!rst_n)
        begin
            gain_temp[0] <= {XW{1'b0}};
            gain_temp[1] <= {XW{1'b0}};
            gain_temp[2] <= {XW{1'b0}};
            gain_temp[3] <= {XW{1'b0}};
        end
        else
        begin
            if (din_valid == 1'b1)
            begin
                // x/2 + x/8   and   x/64 + x/512
                gain_temp[0] <= {1'b0, din_x[XW-1:1]} + {3'b0, din_x[XW-1:3]};
                gain_temp[1] <= {6'b0, din_x[XW-1:6]} + {9'b0, din_x[XW-1:9]};
            end
            if (din_valid_r[0] == 1'b1)
                gain_temp[2] <= gain_temp[0] - gain_temp[1];
            if (din_valid_r[1] == 1'b1)
                // final correction: * (1 - 2^-12)
                gain_temp[3] <= gain_temp[2] - {12'b0, gain_temp[2][XW-1:12]};
        end
    end

    assign dout_x = gain_temp[3];

    // ---------------------------------------------------------------
    // Folding-info delay line.
    // ---------------------------------------------------------------
    always @(posedge clk)
    begin
        din_info_r[0] <= din_info;
        for (n = 1; n <= latency_core + 1; n = n + 1)
            din_info_r[n] <= din_info_r[n-1];
    end

    // ---------------------------------------------------------------
    // Angle restoration. Each step uses the info tap that is as old as
    // the angle it is applied to.
    // ---------------------------------------------------------------
    assign angle_valid = (din_valid == 1'b1) ? din_z : {DW_NOR{1'b0}};
    assign angle_temp  = (angle_valid[DW_NOR-1] == 1'b1) ? {DW_NOR{1'b0}} : angle_valid;

    // Step 1 (combinational on the input): undo the x/y swap.
    assign angle_swap = (din_info_r[latency_core-1][0] == 1'b1)
                      ? (const_half_pi - angle_temp) : angle_temp;

    // Step 2 (angle is one clock old): x was negative -> mirror to 180deg - theta.
    assign angle_temp_x = (din_info_r[latency_core][1] == 1'b1)
                        ? (const_pi - angle_swap_r[0]) : angle_swap_r[0];

    // Step 3 (angle is two clocks old, hence tap latency_core+1):
    // y was negative -> 360deg - theta.
    assign angle_temp_y = (din_info_r[latency_core+1][2] == 1'b1)
                        ? (const_double_pi - angle_swap_r[1]) : angle_swap_r[1];

    always @(posedge clk)
    begin
        angle_swap_r[0] <= angle_swap;
        angle_swap_r[1] <= angle_temp_x;
        dout_z          <= angle_temp_y;
    end
endmodule
四 总结
CORDIC运算结果的精度与输入值大小、输入位数有关;最终值X除以16(即去掉扩展的4位小数位)即为实际距离值,误差不大于百分之20。通过顶层模块将这四部分连接,总共消耗1288个查找表与1086个寄存器,减少了资源消耗,实现了预期设计目的。