Addendum:
The Group Normalization assignment appears to have been newly added to CS231n in 2018.
https://www.jianshu.com/p/aaeb9cd4d70c
import numpy as np

def spatial_groupnorm_forward(x, gamma, beta, G, gn_param):
    out, cache = None, None
    eps = gn_param.get('eps', 1e-5)
    ###########################################################################
    # TODO: Implement the forward pass for spatial group normalization.      #
    # This will be extremely similar to the layer norm implementation.       #
    # In particular, think about how you could transform the matrix so that  #
    # the bulk of the code is similar to both train-time batch normalization #
    # and layer normalization!                                               #
    ###########################################################################
    N, C, H, W = x.shape
    x_group = np.reshape(x, (N, G, C // G, H, W))           # split the C channels into G groups
    mean = np.mean(x_group, axis=(2, 3, 4), keepdims=True)  # per-group mean
    var = np.var(x_group, axis=(2, 3, 4), keepdims=True)    # per-group variance
    x_groupnorm = (x_group - mean) / np.sqrt(var + eps)     # normalize within each group
    x_norm = np.reshape(x_groupnorm, (N, C, H, W))          # restore the original shape
    out = x_norm * gamma + beta                             # per-channel scale and shift
    cache = (G, x, x_norm, mean, var, beta, gamma, eps)
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    return out, cache
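As a quick sanity check (a made-up example, not part of the assignment), with gamma = 1 and beta = 0 every group of every sample should come out with roughly zero mean and unit standard deviation:

np.random.seed(0)
N, C, H, W, G = 2, 6, 4, 5, 3
x = 4.0 * np.random.randn(N, C, H, W) + 10.0
gamma = np.ones((1, C, 1, 1))   # gamma/beta broadcast per channel
beta = np.zeros((1, C, 1, 1))

out, _ = spatial_groupnorm_forward(x, gamma, beta, G, {})
out_group = out.reshape(N, G, C // G, H, W)
print(out_group.mean(axis=(2, 3, 4)))  # close to 0 for every (sample, group)
print(out_group.std(axis=(2, 3, 4)))   # close to 1 for every (sample, group)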
def spatial_groupnorm_backward(dout, cache):
    dx, dgamma, dbeta = None, None, None
    ###########################################################################
    # TODO: Implement the backward pass for spatial group normalization.     #
    # This will be extremely similar to the layer norm implementation.       #
    ###########################################################################
    N, C, H, W = dout.shape
    G, x, x_norm, mean, var, beta, gamma, eps = cache
    # dbeta, dgamma
    dbeta = np.sum(dout, axis=(0, 2, 3), keepdims=True)
    dgamma = np.sum(dout * x_norm, axis=(0, 2, 3), keepdims=True)
    # compute dx_group, shape (N, G, C // G, H, W)
    # dx_groupnorm
    dx_norm = dout * gamma
    dx_groupnorm = dx_norm.reshape((N, G, C // G, H, W))
    # dvar
    x_group = x.reshape((N, G, C // G, H, W))
    dvar = np.sum(dx_groupnorm * -1.0 / 2 * (x_group - mean) / (var + eps) ** (3.0 / 2),
                  axis=(2, 3, 4), keepdims=True)
    # dmean
    N_GROUP = C // G * H * W
    dmean1 = np.sum(dx_groupnorm * -1.0 / np.sqrt(var + eps), axis=(2, 3, 4), keepdims=True)
    dmean2_var = dvar * -2.0 / N_GROUP * np.sum(x_group - mean, axis=(2, 3, 4), keepdims=True)
    dmean = dmean1 + dmean2_var
    # dx_group
    dx_group1 = dx_groupnorm * 1.0 / np.sqrt(var + eps)
    dx_group2_mean = dmean * 1.0 / N_GROUP
    dx_group3_var = dvar * 2.0 / N_GROUP * (x_group - mean)
    dx_group = dx_group1 + dx_group2_mean + dx_group3_var
    # reshape back to (N, C, H, W) to get dx
    dx = dx_group.reshape((N, C, H, W))
    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    return dx, dgamma, dbeta
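To convince yourself the backward pass is right, a centered-difference gradient check works well. The num_grad helper below is a hypothetical stand-in written here for illustration (the assignment provides a similar eval_numerical_gradient_array):

def num_grad(f, x, df, h=1e-5):
    # numerically estimate d(sum(f(x) * df)) / dx by central differences
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        ix = it.multi_index
        old = x[ix]
        x[ix] = old + h
        pos = f(x)
        x[ix] = old - h
        neg = f(x)
        x[ix] = old
        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad

np.random.seed(1)
N, C, H, W, G = 2, 6, 4, 5, 2
x = np.random.randn(N, C, H, W)
gamma = np.random.randn(1, C, 1, 1)
beta = np.random.randn(1, C, 1, 1)
dout = np.random.randn(N, C, H, W)

_, cache = spatial_groupnorm_forward(x, gamma, beta, G, {})
dx, dgamma, dbeta = spatial_groupnorm_backward(dout, cache)
dx_num = num_grad(lambda v: spatial_groupnorm_forward(v, gamma, beta, G, {})[0], x, dout)
print(np.max(np.abs(dx - dx_num)))  # should be tiny, around 1e-8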
CS231n notes for reference:
http://cs231n.github.io/optimization-2/#sigmoid
http://cthorey.github.io./backpropagation/
https://blog.youkuaiyun.com/kevin_hee/article/details/80783698
Main text begins:
def batchnorm_backward(dout, cache):
    # unfold the variables stored in cache
    xhat, gamma, xmu, ivar, sqrtvar, var, eps = cache
    # get the dimensions of the input/output
    N, D = dout.shape
    # step 9
    dbeta = np.sum(dout, axis=0)
    dgammax = dout  # not necessary, but more understandable
    # step 8
    dgamma = np.sum(dgammax * xhat, axis=0)
    dxhat = dgammax * gamma
    # step 7
    divar = np.sum(dxhat * xmu, axis=0)
    dxmu1 = dxhat * ivar
    # step 6
    dsqrtvar = -1. / (sqrtvar ** 2) * divar
    # step 5
    dvar = 0.5 * 1. / np.sqrt(var + eps) * dsqrtvar
    # step 4
    dsq = 1. / N * np.ones((N, D)) * dvar
    # step 3
    dxmu2 = 2 * xmu * dsq
    # step 2
    dx1 = (dxmu1 + dxmu2)
    dmu = -1 * np.sum(dxmu1 + dxmu2, axis=0)
    # step 1
    dx2 = 1. / N * np.ones((N, D)) * dmu
    # step 0
    dx = dx1 + dx2
    return dx, dgamma, dbeta
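For context, here is a sketch of the train-time forward pass that produces the cache tuple unpacked above, staged the same way as the kratzert writeup linked earlier. The signature is simplified for illustration (the actual assignment passes a bn_param dict with train/test modes); the point is that backward step k undoes forward step k:

def batchnorm_forward(x, gamma, beta, eps=1e-5):
    # simplified train-time-only signature, assumed for this sketch
    N, D = x.shape
    # step 1: per-feature mean
    mu = 1. / N * np.sum(x, axis=0)
    # step 2: subtract the mean
    xmu = x - mu
    # step 3: squared deviations
    sq = xmu ** 2
    # step 4: variance
    var = 1. / N * np.sum(sq, axis=0)
    # step 5: add eps for numerical stability, then take the square root
    sqrtvar = np.sqrt(var + eps)
    # step 6: invert
    ivar = 1. / sqrtvar
    # step 7: normalize
    xhat = xmu * ivar
    # step 8: scale
    gammax = gamma * xhat
    # step 9: shift
    out = gammax + beta
    # store intermediates in exactly the order the backward pass unpacks them
    cache = (xhat, gamma, xmu, ivar, sqrtvar, var, eps)
    return out, cache

Caching every intermediate of the forward graph is what lets the backward pass stay a mechanical walk back through the nine steps instead of one big fused formula.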