目录
卷积层代码实现(复现Github URL:https://github.com/omarelhedaby/CNN-FPGA )
FPGA-神经网络
LeNet
1前向传播
1.1卷积层功能实现
实现卷积层功能,必须自上而下设计,采用模块化思维,将复杂模块拆分为多个子模块。
卷积层multi filter layer
根据所需的多个特征图,采用多个滤波器进行卷积,每个滤波器输出一个特征图。
由上图可见,第一层卷积输出了6个feature maps,可以构建一个single filter layer模块,该模块的功能为和原图像进行卷积,输出一张特征图。后续在卷积层模块(multi filter layer)中调用多个single filter layer,就可以输出多个特征图。
由于FPGA资源有限,同时调用全部single filter layer模块并不明智,所以采用同时调用两个single filter layer、并行度为2的方法,示意图如下:
单个卷积核single filter layer
输出一张特征图,需要卷积核将整个图像遍历一次。以32*32*1的图像、5*5的卷积核为例,步长为1时,卷积核需要走28*28步。让单个卷积核一步步往下走,固然能节约FPGA计算资源,但耗时过久,需要在时间和空间上找到一个平衡。官方案例中采用14个卷积单位(convUnits)并行的方法,一次计算14个5*5窗口,也就是半行卷积;卷积两次得到一行,共卷积56次(28行×2次),得到单张特征图的结果。
卷积数据分类RFselector
将图像数据以半排为一类,分为56类,给予例化的CU。
卷积单位convUnits
CU就是一个简单的卷积窗口计算,在这个例子中,CU将输入的5*5图像窗口数据和5*5卷积核进行点积计算,最终输出一个16位卷积结果。
处理单元Processing Element
最小的处理单元,分为三个单位,分别为半精度浮点数相乘、相加、寄存器。
1.2卷积层代码实现(复现Github URL:https://github.com/omarelhedaby/CNN-FPGA )
FM16
// FM16 - combinational multiplier for 16-bit floats laid out as
// {sign[15], exponent[14:10] (bias 15), mantissa[9:0]}.
//
// Ports
//   floatA, floatB : operands.
//   product        : floatA * floatB; the mantissa is truncated, not rounded.
//
// Behaviour
//   * If either operand is exactly 16'h0000 the product is 16'h0000.
//   * Every other operand is treated as a normalised number: the hidden
//     leading 1 is always prepended, so subnormals, infinities and NaNs get
//     no special handling.
//   * A result exponent outside 0..31 sets bit 5 of the 6-bit exponent and
//     the product is flushed to 16'h0000.
//
// Every intermediate value is written with a blocking assignment so that the
// normalisation step and the final packing see the values computed earlier in
// the same evaluation. Defaults are assigned first so no latch is inferred.
module FM16(
    input  [15:0] floatA,
    input  [15:0] floatB,
    output reg [15:0] product
);
    reg               sign;                  // sign of the product
    reg signed [5:0]  zhishu;                // biased result exponent, bit 5 = out of range
    reg        [9:0]  weishu;                // result mantissa
    reg        [10:0] fractionA, fractionB;  // {hidden 1, mantissa} of each operand
    reg        [21:0] fraction;              // 11 x 11 bit significand product

    always @(*) begin
        // Defaults: also the result for a zero operand or an out-of-range exponent.
        sign      = 1'b0;
        zhishu    = 6'd0;
        weishu    = 10'd0;
        fractionA = 11'd0;
        fractionB = 11'd0;
        fraction  = 22'd0;
        product   = 16'd0;

        if (floatA != 16'd0 && floatB != 16'd0) begin
            sign = floatA[15] ^ floatB[15];

            // Add the biased exponents and remove one bias. The extra +2 is
            // taken back (as 1 or 2) by the normalisation step below.
            zhishu = floatA[14:10] + floatB[14:10] - 5'd15 + 5'd2;

            fractionA = {1'b1, floatA[9:0]};
            fractionB = {1'b1, floatB[9:0]};
            fraction  = fractionA * fractionB;

            // Both significands are >= 11'h400, so the product is >= 22'h100000:
            // its leading 1 is always in bit 21 or bit 20. Shift that bit out so
            // the mantissa sits in fraction[21:12].
            if (fraction[21]) begin
                fraction = fraction << 1;
                zhishu   = zhishu - 1;
            end
            else begin
                fraction = fraction << 2;
                zhishu   = zhishu - 2;
            end

            weishu = fraction[21:12];

            // Bit 5 set means the exponent does not fit in 5 bits: keep the zero default.
            if (!zhishu[5]) begin
                product = {sign, zhishu[4:0], weishu};
            end
        end
    end
endmodule
FADD
// FADD - combinational adder for 16-bit floats laid out as
// {sign[15], exponent[14:10], mantissa[9:0]}.
//
// Ports
//   floatA, floatB : operands.
//   sum            : floatA + floatB; bits shifted out during alignment are
//                    dropped (no rounding).
//
// Behaviour
//   * If floatA is exactly 16'h0000 the result is floatB, and vice versa.
//   * Every other operand is treated as normalised (hidden leading 1 is
//     always prepended).
//   * Operands of opposite sign and equal magnitude give 16'h0000. Without
//     this case the zero difference would be packed with the operands'
//     exponent and come out as a non-zero power of two.
//   * A result exponent outside 0..31 (bit 5 of the 6-bit exponent set) gives
//     16'h0000.
//
// Defaults are assigned at the top of the block so no latch is inferred.
module FADD(
    input  [15:0] floatA,
    input  [15:0] floatB,
    output reg [15:0] sum
);
    reg               sign;                  // sign of the result
    reg signed [5:0]  zhishu;                // result exponent, bit 5 = out of range
    reg        [9:0]  weishu;                // result mantissa
    reg        [10:0] fractionA, fractionB;  // {hidden 1, mantissa}, later aligned
    reg        [11:0] fraction;              // sum/difference; bit 11 = carry or borrow
    reg        [4:0]  zhishuA, zhishuB;      // operand exponents
    reg        [5:0]  zhishucha;             // exponent difference used for alignment
    reg               cancelled;             // operands cancelled exactly
    integer           k;

    always @(*) begin
        sign      = 1'b0;
        zhishu    = 6'd0;
        weishu    = 10'd0;
        fractionA = 11'd0;
        fractionB = 11'd0;
        fraction  = 12'd0;
        zhishuA   = 5'd0;
        zhishuB   = 5'd0;
        zhishucha = 6'd0;
        cancelled = 1'b0;
        sum       = 16'd0;

        if (floatA == 16'd0) begin
            sum = floatB;
        end
        else if (floatB == 16'd0) begin
            sum = floatA;
        end
        else begin
            zhishuA   = floatA[14:10];
            zhishuB   = floatB[14:10];
            fractionA = {1'b1, floatA[9:0]};
            fractionB = {1'b1, floatB[9:0]};

            // Align the operand with the smaller exponent to the larger one.
            if (zhishuA >= zhishuB) begin
                zhishucha = zhishuA - zhishuB;
                fractionB = fractionB >> zhishucha;
                zhishu    = zhishuA;
            end
            else begin
                zhishucha = zhishuB - zhishuA;
                fractionA = fractionA >> zhishucha;
                zhishu    = zhishuB;
            end

            if (floatA[15] == floatB[15]) begin
                // Same sign: add magnitudes; a carry into bit 11 needs one right shift.
                fraction = fractionA + fractionB;
                if (fraction[11]) begin
                    fraction = fraction >> 1;
                    zhishu   = zhishu + 1;
                end
                sign = floatA[15];
            end
            else begin
                // Opposite signs: positive operand minus negative operand's magnitude.
                fraction = floatA[15] ? (fractionB - fractionA)
                                      : (fractionA - fractionB);
                // A borrow into bit 11 means the negative operand was larger.
                sign = fraction[11];
                if (fraction[11]) begin
                    fraction[10:0] = -fraction[10:0];   // back to magnitude
                end

                if (fraction[10:0] == 11'd0) begin
                    cancelled = 1'b1;
                end
                else begin
                    // Shift left until the leading 1 is in bit 10 (at most 10 steps).
                    for (k = 0; k < 10; k = k + 1) begin
                        if (!fraction[10]) begin
                            fraction = fraction << 1;
                            zhishu   = zhishu - 1;
                        end
                    end
                end
            end

            weishu = fraction[9:0];
            if (cancelled || zhishu[5]) begin
                sum = 16'd0;
            end
            else begin
                sum = {sign, zhishu[4:0], weishu};
            end
        end
    end
endmodule
PE
// PE - multiply-accumulate processing element.
//
// On every rising clock edge: result <= result + floatA * floatB, using the
// combinational FM16 multiplier and FADD adder. A high rst clears result
// asynchronously.
//
// Parameters
//   Data_With : width of every data bus. The FM16 and FADD ports in this file
//               are fixed at 16 bits, so 16 is the only value that connects
//               without a width mismatch.
// Ports
//   floatA, floatB : operands multiplied in the current cycle.
//   clk            : clock; the accumulator updates on the rising edge.
//   rst            : asynchronous, active-high clear of the accumulator.
//   result         : running sum of products.
//
// The buses are declared [Data_With-1:0]. A range of [Data_With:0] would be
// one bit wider than the FM16/FADD ports they connect to.
module PE
#(
    parameter Data_With = 16
)
(
    input  wire [Data_With-1:0] floatA,
    input  wire [Data_With-1:0] floatB,
    input  wire                 clk,
    input  wire                 rst,
    output reg  [Data_With-1:0] result
);
    wire [Data_With-1:0] mulresult;   // floatA * floatB
    wire [Data_With-1:0] addresult;   // mulresult + result

    FM16 FM16_dut (
        .floatA  (floatA),
        .floatB  (floatB),
        .product (mulresult)
    );

    FADD FADD_dut (
        .floatA (mulresult),
        .floatB (result),
        .sum    (addresult)
    );

    // Accumulator register.
    always @(posedge clk or posedge rst) begin
        if (rst == 1'b1) begin
            result <= {Data_With{1'b0}};
        end
        else begin
            result <= addresult;
        end
    end
endmodule
CU
// ConvolutionUnit - dot product of one receptive field with one filter.
//
// After rst is released, one (image, filter) element pair per rising clock
// edge is presented to a PE, which accumulates the products. Once all D*F*F
// pairs have been presented, zeros are fed until the next reset.
//
// Parameters
//   DATA_WIDTH : bits per element.
//   D          : filter depth.
//   F          : filter side length.
// Ports
//   clk, rst : clock and asynchronous active-high reset (also passed to the PE).
//   img      : D*F*F elements of image data; element k is
//              img[k*DATA_WIDTH +: DATA_WIDTH] (ascending bit order).
//   filter   : D*F*F filter weights, same layout as img.
//   result   : accumulator output of the PE.
//
// The element counter is written only with non-blocking assignments, so reset
// and increment do not mix assignment kinds on the same variable.
module ConvolutionUnit#(
    parameter DATA_WIDTH = 16,
    parameter D = 1,
    parameter F = 5
)
(
    input  wire clk, rst,
    input  wire [0:D*F*F*DATA_WIDTH-1] img, filter,
    output wire [0:DATA_WIDTH-1] result
);
    reg [DATA_WIDTH-1:0] imgin;     // image element currently fed to the PE
    reg [DATA_WIDTH-1:0] fiterin;   // filter weight currently fed to the PE
    integer i;                      // index of the next element pair

    PE PE_dut (
        .floatA (imgin),
        .floatB (fiterin),
        .clk    (clk),
        .rst    (rst),
        .result (result)
    );

    always @(posedge clk or posedge rst) begin
        if (rst == 1'b1) begin
            i       <= 0;
            imgin   <= {DATA_WIDTH{1'b0}};
            fiterin <= {DATA_WIDTH{1'b0}};
        end
        else if (i > F*F*D-1) begin
            // Window finished: feed zeros while waiting for the next reset.
            imgin   <= {DATA_WIDTH{1'b0}};
            fiterin <= {DATA_WIDTH{1'b0}};
        end
        else begin
            // i*DATA_WIDTH is the first bit of element i; +:DATA_WIDTH selects
            // DATA_WIDTH bits from there.
            imgin   <= img[i*DATA_WIDTH +: DATA_WIDTH];
            fiterin <= filter[i*DATA_WIDTH +: DATA_WIDTH];
            i       <= i + 1;
        end
    end
endmodule
RF
// Rfselector - combinational receptive-field selector.
//
// For one output row it gathers the image data needed by half of that row's
// convolution windows: (IMGW-F+1)/2 windows, each D*F*F elements.
//
// Parameters
//   IMGW, IMGH : image width and height in elements.
//   DATA_WIDTH : bits per element.
//   D          : image / filter depth.
//   F          : filter side length.
// Ports
//   img            : whole image, ascending bit order; the element at
//                    (plane k, row r, column c) starts at bit
//                    (k*IMGH*IMGW + r*IMGW + c)*DATA_WIDTH.
//   rownumber      : top row of the windows to extract.
//   column         : 0 selects the windows starting at columns
//                    0 .. (IMGW-F+1)/2-1; any other value selects the windows
//                    starting at (IMGW-F+1)/2 onwards.
//   receptiveField : the selected windows back to back. Each window holds, for
//                    every plane, F rows of F elements.
//
// Fixes in this version:
//   * receptiveField is sized ((IMGW-F+1)/2)*D*F*F*DATA_WIDTH bits. The
//     expression D*F*F*DATA_WIDTH-1*((IMGW+1-F)/2) binds the multiplication to
//     the 1 and yields a far narrower vector.
//   * Both halves iterate over columns, so both bounds use IMGW.
module Rfselector
#(
    parameter IMGW = 32,
    parameter IMGH = 32,
    parameter DATA_WIDTH = 16,
    parameter D = 1,
    parameter F = 5
)
(
    input  wire [0:IMGH*IMGW*DATA_WIDTH*D-1] img,
    input  wire [5:0] rownumber, column,
    output reg  [0:((IMGW-F+1)/2)*D*F*F*DATA_WIDTH-1] receptiveField
);
    localparam HALF_COLS = (IMGW-F+1)/2;   // windows produced per call

    // c: window index within the half row, k: plane, i: row inside the window,
    // address: index of the next F-element chunk written to receptiveField,
    // first: starting column of the first window of the selected half.
    integer c, k, i, address, first;

    always @(*) begin
        address = 0;
        first   = (column == 0) ? 0 : HALF_COLS;
        for (c = 0; c < HALF_COLS; c = c + 1) begin
            for (k = 0; k < D; k = k + 1) begin
                for (i = 0; i < F; i = i + 1) begin
                    // Source offset terms:
                    //   rownumber*IMGW*DATA_WIDTH    - top row of the window
                    //   (first+c)*DATA_WIDTH         - window start column (stride 1)
                    //   k*IMGH*IMGW*DATA_WIDTH       - plane k of the image
                    //   i*IMGW*DATA_WIDTH            - row i inside the window
                    // One chunk is the F consecutive elements of that row.
                    receptiveField[address*F*DATA_WIDTH +: DATA_WIDTH*F] =
                        img[rownumber*IMGW*DATA_WIDTH + (first+c)*DATA_WIDTH
                            + k*IMGH*IMGW*DATA_WIDTH + i*IMGW*DATA_WIDTH +: DATA_WIDTH*F];
                    address = address + 1;
                end
            end
        end
    end
endmodule
ConvSingle
// convLayerSingle - convolves an image with ONE filter and produces one
// feature map of (IMGH-F+1) x (IMGW-F+1) elements.
//
// (IMGW-F+1)/2 ConvolutionUnit instances work in parallel, so every output
// row is produced in two passes (left half, then right half). Rfselector
// supplies the image data for the current half row.
//
// Parameters
//   IMGW, IMGH : image width and height in elements.
//   DATA_WIDTH : bits per element.
//   D          : image / filter depth.
//   F          : filter side length.
// Ports
//   img        : whole image, ascending bit order.
//   clk        : clock.
//   rst        : asynchronous active-high reset; restarts the whole feature map.
//   filter     : D*F*F weights, shared by every ConvolutionUnit.
//   outputconv : feature map. Half row n occupies
//                outputconv[n*HALF_ROW_BITS +: HALF_ROW_BITS].
//
// Fixes in this version:
//   * filter is D*F*F*DATA_WIDTH bits wide (the extra D factor did not match
//     the ConvolutionUnit filter port it drives).
//   * The output slice offset and width are ((IMGW-F+1)/2)*DATA_WIDTH, the
//     exact width of outputConvUnits.
//   * The level-sensitive block uses a blocking assignment.
module convLayerSingle
#(
    parameter IMGW = 32,
    parameter IMGH = 32,
    parameter DATA_WIDTH = 16,
    parameter D = 1,
    parameter F = 5
)
(
    input  wire [0:D*IMGH*IMGW*DATA_WIDTH-1] img,
    input  wire clk, rst,
    input  wire [0:D*F*F*DATA_WIDTH-1] filter,
    output reg  [0:(IMGH-F+1)*(IMGW-F+1)*DATA_WIDTH-1] outputconv
);
    localparam HALF_COLS     = (IMGW-F+1)/2;            // parallel ConvolutionUnits
    localparam WINDOW_BITS   = D*F*F*DATA_WIDTH;        // bits of one receptive field
    localparam HALF_ROW_BITS = HALF_COLS*DATA_WIDTH;    // bits of one half row of results

    wire [0:HALF_ROW_BITS-1]         outputConvUnits;   // results of the current half row
    wire [0:HALF_COLS*WINDOW_BITS-1] receptiveField;    // image data for the current half row
    reg  internalReset;                                 // reset of the ConvolutionUnits

    // counter       : clock edges spent on the current half row. One multiply-
    //                 accumulate per edge for D*F*F elements, plus 2 extra edges.
    // outputCounter : index of the half row currently written into outputconv.
    integer counter, outputCounter;

    // rownumber : output row being computed, counts up to IMGH-F+1.
    // column    : 0 for the left half row, (IMGW-F+1)/2 for the right half row.
    reg [5:0] rownumber, column;

    Rfselector
    #(
        .IMGW       (IMGW),
        .IMGH       (IMGH),
        .DATA_WIDTH (DATA_WIDTH),
        .D          (D),
        .F          (F)
    )
    Rfselector_dut (
        .img            (img),
        .rownumber      (rownumber),
        .column         (column),
        .receptiveField (receptiveField)
    );

    genvar n;
    generate
        for (n = 0; n < HALF_COLS; n = n + 1) begin
            ConvolutionUnit
            #(
                .DATA_WIDTH (DATA_WIDTH),
                .D          (D),
                .F          (F)
            )
            CU_dut (
                .clk    (clk),
                .rst    (internalReset),
                // window n of the current half row
                .img    (receptiveField[WINDOW_BITS*n +: WINDOW_BITS]),
                .filter (filter),
                .result (outputConvUnits[n*DATA_WIDTH +: DATA_WIDTH])
            );
        end
    endgenerate

    // Sequencer: walks through the half rows and pulses internalReset between them.
    always @(posedge clk or posedge rst) begin
        if (rst) begin
            internalReset <= 1'b1;
            counter       <= 0;
            outputCounter <= 0;
            rownumber     <= 6'd0;
            column        <= 6'd0;
        end
        else if (rownumber < (IMGH - F + 1)) begin
            if (counter >= D*F*F+2) begin
                // Half row finished: reset the units and move to the next half row.
                internalReset <= 1'b1;
                counter       <= 0;
                outputCounter <= outputCounter + 1;
                if (column == 0) begin
                    column <= HALF_COLS;            // right half of the same row
                end
                else begin
                    column    <= 6'd0;              // left half of the next row
                    rownumber <= rownumber + 1;
                end
            end
            else begin
                counter       <= counter + 1;
                internalReset <= 1'b0;
            end
        end
    end

    // Copies the running results into the slice selected by outputCounter.
    // Only that slice is assigned; the remaining bits keep their previous
    // value, which is how finished half rows are retained.
    always @(*) begin
        outputconv[outputCounter*HALF_ROW_BITS +: HALF_ROW_BITS] = outputConvUnits;
    end
endmodule
ConvMulti
// convLayerMult - convolution layer with K filters, producing K feature maps.
//
// Two convLayerSingle instances run in parallel, so the K filters are
// processed two at a time.
//
// Parameters
//   IMGW, IMGH : image width and height in elements.
//   DATA_WIDTH : bits per element.
//   D          : image / filter depth.
//   F          : filter side length.
//   K          : number of filters.
// Ports
//   img        : whole image, ascending bit order.
//   filter     : K filters of F*F*D elements each, back to back.
//   clk        : clock; the sequencer in this module runs on the FALLING edge.
//   rst        : asynchronous active-high reset.
//   outputconv : K feature maps back to back, two per pass.
//
// Fixes in this version:
//   * filter and inputfilter are sized ...-1. With the extra bit the selected
//     filter slice was right-aligned in a wider vector and every weight was
//     shifted by one bit.
//   * outputconv_single is a wire (it is driven by instance outputs) and has
//     no D factor, matching the convLayerSingle output width.
//   * The sub-layers are reset by internalReset, which is pulsed before each
//     pass, so they restart for every filter pair.
//   * A pass lasts (D*F*F+3) clock edges per half row: convLayerSingle's
//     counter runs 0..D*F*F+2 before it advances.
//   * counter is written only with non-blocking assignments.
//   * Feature map size uses (IMGH-F+1)*(IMGW-F+1).
module convLayerMult
#(
    parameter IMGW = 32,
    parameter IMGH = 32,
    parameter DATA_WIDTH = 16,
    parameter D = 1,
    parameter F = 5,
    parameter K = 6
)
(
    input  wire [0:D*DATA_WIDTH*IMGH*IMGW-1] img,
    input  wire [0:K*F*F*D*DATA_WIDTH-1] filter,
    input  wire clk, rst,
    output reg  [0:(IMGH-F+1)*(IMGW-F+1)*K*DATA_WIDTH-1] outputconv
);
    localparam FILTER_BITS = F*F*D*DATA_WIDTH;                  // one filter
    localparam FMAP_BITS   = (IMGH-F+1)*(IMGW-F+1)*DATA_WIDTH;  // one feature map
    localparam PASS_CYCLES = (D*F*F+3)*(IMGH-F+1)*2 + 1;        // edges per filter pair

    reg  [0:2*FILTER_BITS-1] inputfilter;        // the two filters of the current pass
    wire [0:2*FMAP_BITS-1]   outputconv_single;  // the two feature maps of the current pass

    // counter       : falling edges elapsed in the current pass.
    // outputCounter : index of the current filter pair.
    integer counter, outputCounter;
    reg internalReset;           // reset of the two convLayerSingle instances
    reg [5:0] filternumber;      // number of filters already processed

    genvar n;
    generate
        for (n = 0; n < 2; n = n + 1) begin
            convLayerSingle
            #(
                .IMGW       (IMGW),
                .IMGH       (IMGH),
                .DATA_WIDTH (DATA_WIDTH),
                .D          (D),
                .F          (F)
            )
            convLayerSingle_dut (
                .img        (img),
                .clk        (clk),
                .rst        (internalReset),
                .filter     (inputfilter[n*FILTER_BITS +: FILTER_BITS]),
                .outputconv (outputconv_single[n*FMAP_BITS +: FMAP_BITS])
            );
        end
    endgenerate

    // Sequencer: one pass per filter pair, with a reset pulse between passes.
    always @(negedge clk or posedge rst) begin
        if (rst) begin
            counter       <= 0;
            outputCounter <= 0;
            internalReset <= 1'b1;
            filternumber  <= 6'd0;
        end
        else if (filternumber < K) begin
            if (counter >= PASS_CYCLES) begin
                internalReset <= 1'b1;
                counter       <= 0;
                outputCounter <= outputCounter + 1;
                filternumber  <= filternumber + 2;   // two feature maps per pass
            end
            else begin
                counter       <= counter + 1;
                internalReset <= 1'b0;
            end
        end
    end

    // Selects the filters of the current pair and stores the pair's feature
    // maps in their slot. After the last pair outputCounter points past the
    // end of filter and outputconv; the original author notes that this is
    // not reported as an error and does not affect the results.
    always @(*) begin
        inputfilter = filter[outputCounter*2*FILTER_BITS +: 2*FILTER_BITS];
        outputconv[outputCounter*2*FMAP_BITS +: 2*FMAP_BITS] = outputconv_single;
    end
endmodule