cpu之pc_reg

// pc_reg: program-counter register of the CPU.
//
// On every rising edge of clk, pc is updated according to the first
// matching rule below (highest priority first):
//   1. rst == 1                 -> pc is cleared to 32'h0 (synchronous reset)
//   2. jump is not 0            -> pc is loaded with pc2
//   3. stall_id is not 0        -> pc is loaded with pc1 - 4
//   4. stall_branch_t == 2'b10  -> pc is loaded with pc1 - 4
//   5. otherwise                -> pc is loaded with pc1
//
// Ports:
//   clk            - clock; all updates happen on its rising edge
//   rst            - synchronous, active-high reset
//   stall_id       - when non-zero, pc1 - 4 is loaded instead of pc1
//                    (presumably a stall request from the decode stage -
//                    confirm against the driving module)
//   jump           - selects pc2 as the next pc
//   stall_branch_t - the value 2'b10 also selects pc1 - 4
//   pc1            - candidate next pc for the non-jump path
//                    (NOTE(review): looks like the sequential next address,
//                    so pc1 - 4 re-issues the current one - confirm)
//   pc2            - candidate next pc dedicated to jump instructions
//   pc             - registered program counter
module pc_reg(
	input wire clk,
	input wire rst,
	input wire stall_id,
	input wire jump,
	input wire[1:0] stall_branch_t,
	input wire[31:0] pc1,
	input wire[31:0] pc2, // dedicated to jump instructions
	output reg[31:0] pc
);

always @(posedge clk) begin
	if (rst==1'b1) begin
		// Non-blocking, like every other write to pc in this block: mixing
		// blocking and non-blocking assignments to the same register in
		// clocked logic can cause simulation races.
		pc <= 32'h0;
	end else begin
		if (jump!=1'b1) begin
			if (stall_id==1'b0) begin
				if (stall_branch_t==2'b10) begin
					pc <= pc1-32'h4;
				end else begin
					pc <= pc1;
				end
			end else begin
				// stall_id is asserted: load pc1 - 4 instead of pc1.
				pc <= pc1-32'h4;
			end
		end else begin
			// Jump path: take the dedicated jump target.
			pc <= pc2;
		end
	end
end

endmodule

#include <immintrin.h> #include <stdio.h> // 用于调试时的printf // M, N, K 维度定义 // A: M x K, 列主序 // B: K x N, 行主序 // C: M x N, 行主序 /* 矩阵元素访问宏 */ // A_T 是列主序 #define A_T(i,j,lda,aptr) (aptr)[(j)*(lda) + (i)] // B 是行主序 #define B_RM(i,j,ldb,bptr) (bptr)[(i)*(ldb) + (j)] // C 是行主序 #define C_RM(i,j,ldc,cptr) (cptr)[(i)*(ldc) + (j)] // min 函数宏 #define min( i, j ) ( (i)<(j) ? (i): (j) ) // 分块大小 #ifndef MC #define MC 240 // M 维度的块大小 (A的行,C的行) 8的倍数 #endif #ifndef NC #define NC 768 // N 维度的块大小 (B的列,C的列) 24的倍数 #endif #ifndef KC #define KC 128 // K 维度的块大小 (A的列,B的行) #endif #define M_TILE 8 #define N_TILE 24 // AddDot static void AddDot8x24( int k_iter, const double *a_tile, int lda_full, /* M_dim for A */ const double *b_tile, int ldb_full, /* N_dim for B */ double *c_tile, int ldc_full ); /* N_dim for C */ // Kernel_McNcKc 函数:计算 C 的一个 MC x NC 子块 static void Kernel_McNcKc( int current_mc, int current_nc, int current_kc, const double *a_block, int global_lda, const double *b_block, int global_ldb, double *c_block, int global_ldc) { int i_block_loop, j_block_loop; // 每次处理一个 M_TILE x N_TILE 的tile for (i_block_loop = 0; i_block_loop < current_mc; i_block_loop += M_TILE) { for (j_block_loop = 0; j_block_loop < current_nc; j_block_loop += N_TILE) { const double *a_tile_ptr = &A_T(i_block_loop, 0, global_lda, a_block); const double *b_tile_ptr = &B_RM(0, j_block_loop, global_ldb, b_block); double *c_tile_ptr = &C_RM(i_block_loop, j_block_loop, global_ldc, c_block); AddDot8x24(current_kc, /*即K维度子块的大小 */ a_tile_ptr, global_lda, b_tile_ptr, global_ldb, c_tile_ptr, global_ldc); } } } void MY_MMult( int m_dim, int n_dim, int k_dim, const double *global_a_ptr, int lda_full, /* A的真实lda (等于M) */ const double *global_b_rm_ptr, int ldb_full, /* B的真实列数 (等于N) */ double *global_c_rm_ptr, int ldc_full ) /* C的真实列数 (等于N) */ { int ic_loop, pc_loop, jc_loop; int current_mc_val, current_nc_val, current_kc_val; //先n for (jc_loop = 0; jc_loop < n_dim; jc_loop += NC) { current_nc_val = min(n_dim - jc_loop, NC); 
//相当于没有 for (pc_loop = 0; pc_loop < k_dim; pc_loop += KC) { current_kc_val = min(k_dim - pc_loop, KC); const double *a_panel_for_kc_block = &A_T(0, pc_loop, lda_full, global_a_ptr); const double *b_panel_for_kc_block = &B_RM(pc_loop, 0, ldb_full, global_b_rm_ptr); for (ic_loop = 0; ic_loop < m_dim; ic_loop += MC) { current_mc_val = min(m_dim - ic_loop, MC); // A: A(ic_loop : ic_loop+current_mc_val-1, pc_loop : pc_loop+current_kc_val-1) // B: B(pc_loop : pc_loop+current_kc_val-1, jc_loop : jc_loop+current_nc_val-1) // C: C(ic_loop : ic_loop+current_mc_val-1, jc_loop : jc_loop+current_nc_val-1) // Calculate start pointers for the panel/sub-block for Kernel_McNcKc // A panel starts at global A [ic_loop, pc_loop] const double *a_panel_ptr = &A_T(ic_loop, 0, lda_full, a_panel_for_kc_block); // B panel starts at global B [pc_loop, jc_loop] const double *b_panel_ptr = &B_RM(0, jc_loop, ldb_full, b_panel_for_kc_block); // C sub-block starts at global C [ic_loop, jc_loop] double *c_sub_tile_ptr = &C_RM(ic_loop, jc_loop, ldc_full, global_c_rm_ptr); Kernel_McNcKc(current_mc_val, current_nc_val, current_kc_val, a_panel_ptr, lda_full, b_panel_ptr, ldb_full, c_sub_tile_ptr, ldc_full); } } } } static void AddDot8x24( int k_iter, const double *a_tile, int lda_full, const double *b_tile, int ldb_full, double *c_tile, int ldc_full ) { __m512d c_reg_00, c_reg_01, c_reg_02; __m512d c_reg_10, c_reg_11, c_reg_12; __m512d c_reg_20, c_reg_21, c_reg_22; __m512d c_reg_30, c_reg_31, c_reg_32; __m512d c_reg_40, c_reg_41, c_reg_42; __m512d c_reg_50, c_reg_51, c_reg_52; __m512d c_reg_60, c_reg_61, c_reg_62; __m512d c_reg_70, c_reg_71, c_reg_72; c_reg_00 = _mm512_load_pd(c_tile + 0*ldc_full + 0*8); c_reg_01 = _mm512_load_pd(c_tile + 0*ldc_full + 1*8); c_reg_02 = _mm512_load_pd(c_tile + 0*ldc_full + 2*8); c_reg_10 = _mm512_load_pd(c_tile + 1*ldc_full + 0*8); c_reg_11 = _mm512_load_pd(c_tile + 1*ldc_full + 1*8); c_reg_12 = _mm512_load_pd(c_tile + 1*ldc_full + 2*8); c_reg_20 = _mm512_load_pd(c_tile 
+ 2*ldc_full + 0*8); c_reg_21 = _mm512_load_pd(c_tile + 2*ldc_full + 1*8); c_reg_22 = _mm512_load_pd(c_tile + 2*ldc_full + 2*8); c_reg_30 = _mm512_load_pd(c_tile + 3*ldc_full + 0*8); c_reg_31 = _mm512_load_pd(c_tile + 3*ldc_full + 1*8); c_reg_32 = _mm512_load_pd(c_tile + 3*ldc_full + 2*8); c_reg_40 = _mm512_load_pd(c_tile + 4*ldc_full + 0*8); c_reg_41 = _mm512_load_pd(c_tile + 4*ldc_full + 1*8); c_reg_42 = _mm512_load_pd(c_tile + 4*ldc_full + 2*8); c_reg_50 = _mm512_load_pd(c_tile + 5*ldc_full + 0*8); c_reg_51 = _mm512_load_pd(c_tile + 5*ldc_full + 1*8); c_reg_52 = _mm512_load_pd(c_tile + 5*ldc_full + 2*8); c_reg_60 = _mm512_load_pd(c_tile + 6*ldc_full + 0*8); c_reg_61 = _mm512_load_pd(c_tile + 6*ldc_full + 1*8); c_reg_62 = _mm512_load_pd(c_tile + 6*ldc_full + 2*8); c_reg_70 = _mm512_load_pd(c_tile + 7*ldc_full + 0*8); c_reg_71 = _mm512_load_pd(c_tile + 7*ldc_full + 1*8); c_reg_72 = _mm512_load_pd(c_tile + 7*ldc_full + 2*8); __m512d a_bcast_0, a_bcast_1, a_bcast_2, a_bcast_3; __m512d b_vec_0, b_vec_1, b_vec_2; const double *a_k_ptr = a_tile; const double *b_k_ptr = b_tile; for (int p = 0; p < k_iter; ++p) { b_vec_0 = _mm512_load_pd(b_k_ptr + 0*8); b_vec_1 = _mm512_load_pd(b_k_ptr + 1*8); b_vec_2 = _mm512_load_pd(b_k_ptr + 2*8); a_bcast_0 = _mm512_set1_pd(*(a_k_ptr + 0)); a_bcast_1 = _mm512_set1_pd(*(a_k_ptr + 1)); a_bcast_2 = _mm512_set1_pd(*(a_k_ptr + 2)); a_bcast_3 = _mm512_set1_pd(*(a_k_ptr + 3)); c_reg_00 = _mm512_fmadd_pd(a_bcast_0, b_vec_0, c_reg_00); c_reg_01 = _mm512_fmadd_pd(a_bcast_0, b_vec_1, c_reg_01); c_reg_02 = _mm512_fmadd_pd(a_bcast_0, b_vec_2, c_reg_02); c_reg_10 = _mm512_fmadd_pd(a_bcast_1, b_vec_0, c_reg_10); c_reg_11 = _mm512_fmadd_pd(a_bcast_1, b_vec_1, c_reg_11); c_reg_12 = _mm512_fmadd_pd(a_bcast_1, b_vec_2, c_reg_12); c_reg_20 = _mm512_fmadd_pd(a_bcast_2, b_vec_0, c_reg_20); c_reg_21 = _mm512_fmadd_pd(a_bcast_2, b_vec_1, c_reg_21); c_reg_22 = _mm512_fmadd_pd(a_bcast_2, b_vec_2, c_reg_22); c_reg_30 = _mm512_fmadd_pd(a_bcast_3, b_vec_0, 
c_reg_30); c_reg_31 = _mm512_fmadd_pd(a_bcast_3, b_vec_1, c_reg_31); c_reg_32 = _mm512_fmadd_pd(a_bcast_3, b_vec_2, c_reg_32); a_bcast_0 = _mm512_set1_pd(*(a_k_ptr + 4)); a_bcast_1 = _mm512_set1_pd(*(a_k_ptr + 5)); a_bcast_2 = _mm512_set1_pd(*(a_k_ptr + 6)); a_bcast_3 = _mm512_set1_pd(*(a_k_ptr + 7)); c_reg_40 = _mm512_fmadd_pd(a_bcast_0, b_vec_0, c_reg_40); c_reg_41 = _mm512_fmadd_pd(a_bcast_0, b_vec_1, c_reg_41); c_reg_42 = _mm512_fmadd_pd(a_bcast_0, b_vec_2, c_reg_42); c_reg_50 = _mm512_fmadd_pd(a_bcast_1, b_vec_0, c_reg_50); c_reg_51 = _mm512_fmadd_pd(a_bcast_1, b_vec_1, c_reg_51); c_reg_52 = _mm512_fmadd_pd(a_bcast_1, b_vec_2, c_reg_52); c_reg_60 = _mm512_fmadd_pd(a_bcast_2, b_vec_0, c_reg_60); c_reg_61 = _mm512_fmadd_pd(a_bcast_2, b_vec_1, c_reg_61); c_reg_62 = _mm512_fmadd_pd(a_bcast_2, b_vec_2, c_reg_62); c_reg_70 = _mm512_fmadd_pd(a_bcast_3, b_vec_0, c_reg_70); c_reg_71 = _mm512_fmadd_pd(a_bcast_3, b_vec_1, c_reg_71); c_reg_72 = _mm512_fmadd_pd(a_bcast_3, b_vec_2, c_reg_72); a_k_ptr += lda_full; b_k_ptr += ldb_full; } _mm512_store_pd(c_tile + 0*ldc_full + 0*8, c_reg_00); _mm512_store_pd(c_tile + 0*ldc_full + 1*8, c_reg_01); _mm512_store_pd(c_tile + 0*ldc_full + 2*8, c_reg_02); _mm512_store_pd(c_tile + 1*ldc_full + 0*8, c_reg_10); _mm512_store_pd(c_tile + 1*ldc_full + 1*8, c_reg_11); _mm512_store_pd(c_tile + 1*ldc_full + 2*8, c_reg_12); _mm512_store_pd(c_tile + 2*ldc_full + 0*8, c_reg_20); _mm512_store_pd(c_tile + 2*ldc_full + 1*8, c_reg_21); _mm512_store_pd(c_tile + 2*ldc_full + 2*8, c_reg_22); _mm512_store_pd(c_tile + 3*ldc_full + 0*8, c_reg_30); _mm512_store_pd(c_tile + 3*ldc_full + 1*8, c_reg_31); _mm512_store_pd(c_tile + 3*ldc_full + 2*8, c_reg_32); _mm512_store_pd(c_tile + 4*ldc_full + 0*8, c_reg_40); _mm512_store_pd(c_tile + 4*ldc_full + 1*8, c_reg_41); _mm512_store_pd(c_tile + 4*ldc_full + 2*8, c_reg_42); _mm512_store_pd(c_tile + 5*ldc_full + 0*8, c_reg_50); _mm512_store_pd(c_tile + 5*ldc_full + 1*8, c_reg_51); _mm512_store_pd(c_tile + 5*ldc_full + 
2*8, c_reg_52); _mm512_store_pd(c_tile + 6*ldc_full + 0*8, c_reg_60); _mm512_store_pd(c_tile + 6*ldc_full + 1*8, c_reg_61); _mm512_store_pd(c_tile + 6*ldc_full + 2*8, c_reg_62); _mm512_store_pd(c_tile + 7*ldc_full + 0*8, c_reg_70); _mm512_store_pd(c_tile + 7*ldc_full + 1*8, c_reg_71); _mm512_store_pd(c_tile + 7*ldc_full + 2*8, c_reg_72); } 请告诉我这段代码完成了什么优化工作,请详细说明
最新发布
05-29
def forward(self, predictions, targets): pred_map, pred_reg = nms_hm(predictions['det_cls']), predictions['det_reg'] # calib = targets[0]["calib"] # # debug # bev_map = nms_hm((targets[0].get_field("bev_map").unsqueeze(0))) # scores, inds, clses, ys, xs = self._topk(bev_map, K=self.max_detection) # show_bevmap(pred_map[0].cpu().numpy(), index=targets[0]["filename"]) batch, _, _, _ = pred_map.size() scores, inds, clses, ys, xs = self._topk(pred_map, K=self.max_detection) #[bev_c[0]-x, bev_c[0]-y, z, obj.l, obj.h, obj.w, np.sin(rot_y), np.cos(rot_y)] reg = pred_reg[:, :2] z = pred_reg[:, 2].unsqueeze(1) dim = pred_reg[:, 3:6] ori = pred_reg[:, 6:] reg = self._transpose_and_gather_feat(reg, inds) reg = reg.view(batch, self.max_detection, 2) xs = xs.view(batch, self.max_detection, 1) + reg[:, :, 0:1] ys = ys.view(batch, self.max_detection, 1) + reg[:, :, 1:2] ori = self._transpose_and_gather_feat(ori, inds) ori = ori.view(batch, self.max_detection, 16) # dim of the box dim = self._transpose_and_gather_feat(dim, inds) dim = dim.view(batch, self.max_detection, 3) dim = self.decode_dimension(clses, dim) # height in the bev z = self._transpose_and_gather_feat(z, inds) z = z.view(batch, self.max_detection, 1) z += dim[:,:, 1].unsqueeze(-1) / 2 dim = dim.roll(shifts=-1, dims=2) # class label clses = clses.view(batch, self.max_detection).float() scores = scores.view(batch, self.max_detection) xs = xs.view( batch, self.max_detection, 1) * (self.pc_range[3]-self.pc_range[0]) / self.bev_map_size[0] + self.pc_range[0] ys = (self.bev_map_size[1]-ys.view( batch, self.max_detection, 1)) * (self.pc_range[4]-self.pc_range[1]) / self.bev_map_size[1] + self.pc_range[1] final_box_preds = torch.cat([dim, xs, z, ys], dim=2) final_scores = scores final_preds = clses # use score threshold if self.score_threshold is not None: thresh_mask = final_scores > self.score_threshold results =
03-25
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值