dl4mt: GRU language model training, code walkthrough

GitHub code link: https://github.com/nyu-dl/dl4mt-tutorial/blob/master/session0/lm.py
If I have misunderstood anything below, please point it out.
# -*- coding: utf-8 -*-
'''
Build a simple neural language model using GRU units
'''
import theano
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

import cPickle as pkl
import ipdb
import numpy
import copy

import os
import warnings
import sys
import time

from collections import OrderedDict

from data_iterator import TextIterator

profile = False


# push parameters to Theano shared variables
def zipp(params, tparams):
    for kk, vv in params.iteritems():
        tparams[kk].set_value(vv)


# pull parameters from Theano shared variables
def unzip(zipped):
    new_params = OrderedDict()
    for kk, vv in zipped.iteritems():
        new_params[kk] = vv.get_value()
    return new_params


# get the list of parameters: Note that tparams must be OrderedDict
def itemlist(tparams):
    return [vv for kk, vv in tparams.iteritems()]


# dropout
def dropout_layer(state_before, use_noise, trng):
    proj = tensor.switch(
        use_noise,
        state_before * trng.binomial(state_before.shape, p=0.5, n=1,
                                     dtype=state_before.dtype),
        state_before * 0.5)
    return proj


# make prefix-appended name
def _p(pp, name):
    return '%s_%s' % (pp, name)


# initialize Theano shared variables according to the initial parameters;
# every parameter becomes a shared variable
def init_tparams(params):
    tparams = OrderedDict()
    for kk, pp in params.iteritems():
        tparams[kk] = theano.shared(params[kk], name=kk)
    return tparams


# load parameters
def load_params(path, params):
    pp = numpy.load(path)
    for kk, vv in params.iteritems():
        if kk not in pp:
            warnings.warn('%s is not in the archive' % kk)
            continue
        params[kk] = pp[kk]
    return params


# layers: 'name': ('parameter initializer', 'feedforward')
layers = {'ff': ('param_init_fflayer', 'fflayer'),
          'gru': ('param_init_gru', 'gru_layer'),
          }


def get_layer(name):
    fns = layers[name]
    return (eval(fns[0]), eval(fns[1]))


# orthogonal initialization for weights
# see Saxe et al. ICLR'14
# the orthogonal factor is taken from a singular value decomposition of a
# random Gaussian matrix
def ortho_weight(ndim):
    W = numpy.random.randn(ndim, ndim)
    u, s, v = numpy.linalg.svd(W)
    return u.astype('float32')


# weight initializer, normal by default
# nin is the input dimension (e.g. n_words, the vocabulary size, for the
# embedding matrix) and nout the output dimension (e.g. dim_word, the
# word-vector size)
def norm_weight(nin, nout=None, scale=0.01, ortho=True):
    if nout is None:
        nout = nin
    if nout == nin and ortho:
        # square case: use the orthogonal initializer above
        W = ortho_weight(nin)
    else:
        # otherwise draw a scaled Gaussian matrix of the requested shape
        W = scale * numpy.random.randn(nin, nout)
    return W.astype('float32')


def tanh(x):
    return tensor.tanh(x)


def linear(x):
    return x
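# Illustration (not in the original lm.py): a quick check that ortho_weight
# returns an orthogonal matrix and that norm_weight falls back to a scaled
# Gaussian matrix when the requested shape is rectangular.  Uses only numpy,
# which is already imported above.
def _demo_weight_init():
    W = ortho_weight(4)
    # W.T.dot(W) is (numerically) the 4x4 identity
    print numpy.allclose(W.T.dot(W), numpy.eye(4), atol=1e-4)
    # rectangular case: shape (3, 5), entries drawn from N(0, 0.01^2)
    print norm_weight(3, 5).shape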
""" concat_size = sum(tt.shape[axis] for tt in tensor_list) output_shape = () for k in range(axis): output_shape += (tensor_list[0].shape[k],) output_shape += (concat_size,) for k in range(axis + 1, tensor_list[0].ndim): output_shape += (tensor_list[0].shape[k],) out = tensor.zeros(output_shape) offset = 0 for tt in tensor_list: indices = () for k in range(axis): indices += (slice(None),) indices += (slice(offset, offset + tt.shape[axis]),) for k in range(axis + 1, tensor_list[0].ndim): indices += (slice(None),) out = tensor.set_subtensor(out[indices], tt) offset += tt.shape[axis] return out # batch preparation, returns padded batch and mask def prepare_data(seqs_x, maxlen=None, n_words=30000): # x: a list of sentences lengths_x = [len(s) for s in seqs_x] # filter according to mexlen if maxlen is not None: new_seqs_x = [] new_lengths_x = [] for l_x, s_x in zip(lengths_x, seqs_x): if l_x < maxlen: new_seqs_x.append(s_x) new_lengths_x.append(l_x) lengths_x = new_lengths_x seqs_x = new_seqs_x if len(lengths_x) < 1: return None, None, None, None n_samples = len(seqs_x) maxlen_x = numpy.max(lengths_x) + 1 x = numpy.zeros((maxlen_x, n_samples)).astype('int64') x_mask = numpy.zeros((maxlen_x, n_samples)).astype('float32') ''' /*这里,将原来的seqs中的每一行加入到新的x的相应的列中,如果原来的这行的长度(也就是step) 小于maxlen(新矩阵x的行数,也是原来step最长的样本的step, 那么seqs的那行转换成的x的列不足的部分就是0,相当于seqs中的行转置了一下,变成新的x的一列, 长度不够后边就补0,而mask就记录了新的x矩阵每个位置是否有值,有值得话mask相应的位置就是1,没有的话就是0)*/ ''' for idx, s_x in enumerate(seqs_x): x[:lengths_x[idx], idx] = s_x x_mask[:lengths_x[idx]+1, idx] = 1. return x, x_mask # feedforward layer: affine transformation + point-wise nonlinearity #初始化每一层的W和b def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None, ortho=True): if nin is None: nin = options['dim_proj'] if nout is None: nout = options['dim_proj'] params[_p(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho) params[_p(prefix, 'b')] = numpy.zeros((nout,)).astype('float32') return params def fflayer(tparams, state_below, options, prefix='rconv', activ='lambda x: tensor.tanh(x)', **kwargs): return eval(activ)( tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]) # GRU layer def param_init_gru(options, params, prefix='gru', nin=None, dim=None): #dim_proj / *word,embedding的维数和隐藏层的维数,用默认值。(word embedding是一种将一个词转成一个向量的过程,这里不去深究) * / #nin = dim_word = 512, n_out = dim = 1024 if nin is None: nin = options['dim_proj'] if dim is None: dim = options['dim_proj'] # embedding to gates transformation weights, biases ''' # concatenate 函数 a = [[1,2],[3,4]] b = [[5,6],[7,8]] numpy.concatenate([a,b],axis=1) array([[1, 2, 5, 6], [3, 4, 7, 8]]) np.concatenate([aa,bb],axis=0) array([[1, 2], [3, 4], [5, 6], [7, 8]]) ''' #W.shape = nin*(nin*dim) 512 * (512*1024) W = numpy.concatenate([norm_weight(nin, dim), norm_weight(nin, dim)], axis=1) params[_p(prefix, 'W')] = W #b.shape 2*dim 2048 params[_p(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32') # recurrent transformation weights for gates #ortho_weight(dim),shape = dim * dim 1024 * (1024*1024) U = numpy.concatenate([ortho_weight(dim), ortho_weight(dim)], axis=1) params[_p(prefix, 'U')] = U # embedding to hidden state proposal weights, biases #Wx.shaope = dim_word*dim 512 * 1024 Wx = norm_weight(nin, dim) params[_p(prefix, 'Wx')] = Wx params[_p(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32') # recurrent transformation weights for hidden state proposal # Ux.shape = dim * dim Ux = ortho_weight(dim) params[_p(prefix, 'Ux')] = Ux return params def gru_layer(tparams, state_below, 
def gru_layer(tparams, state_below, options, prefix='gru', mask=None,
              one_step=False, init_state=None, **kwargs):
    if one_step:
        assert init_state, 'previous state must be provided'

    # state_below holds the input word embeddings for this layer.  Its first
    # dimension is the number of timesteps; if it is 3-D the second
    # dimension is the number of samples in the batch, otherwise there is
    # only one sample.
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = state_below.shape[0]

    dim = tparams[_p(prefix, 'Ux')].shape[1]

    if mask is None:
        mask = tensor.alloc(1., state_below.shape[0], 1)

    # utility function to slice a tensor
    # both gates are computed in a single matrix product; _slice then
    # separates the pre-activations of each gate
    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    # state_below is the input word embeddings
    # input to the gates, concatenated
    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
        tparams[_p(prefix, 'b')]
    # input to compute the hidden state proposal
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
        tparams[_p(prefix, 'bx')]

    # step function to be used by scan
    # arguments    | sequences |outputs-info| non-seqs
    #
    # The recurrence computed at each step, written to match the code below
    # (here u_t gates how much of the previous state is kept):
    #   r_t  = sigmoid(W_r x_t + U_r h_{t-1})
    #   u_t  = sigmoid(W_u x_t + U_u h_{t-1})
    #   h~_t = tanh(Wx x_t + Ux (r_t * h_{t-1}))
    #   h_t  = u_t * h_{t-1} + (1 - u_t) * h~_t
    def _step_slice(m_, x_, xx_, h_, U, Ux):
        # x_ is state_below_ (emb*W + b); add the recurrent contribution
        preact = tensor.dot(h_, U)
        preact += x_

        # reset and update gates, computed together and then sliced apart
        r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
        u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

        # compute the hidden state proposal
        # xx_ is state_belowx (emb*Wx + bx)
        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        # hidden state proposal
        h = tensor.tanh(preactx)

        # leaky integrate and obtain next hidden state
        # wherever the mask is 0 (padding), the previous hidden state is
        # carried over unchanged
        h = u * h_ + (1. - u) * h
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h

    # prepare scan arguments
    seqs = [mask, state_below_, state_belowx]
    _step = _step_slice
    shared_vars = [tparams[_p(prefix, 'U')],
                   tparams[_p(prefix, 'Ux')]]

    # set initial state to all zeros
    if init_state is None:
        # tensor.alloc creates a tensor of shape (n_samples, dim)
        init_state = tensor.unbroadcast(tensor.alloc(0., n_samples, dim), 0)

    # theano.scan applies _step to one row of mask/state_below_/state_belowx
    # at a time; outputs_info supplies the initial hidden state, and the
    # hidden state returned at each step is fed back in as h_ at the next
    # step.  After nsteps iterations the stacked hidden states are returned
    # in rval.
    if one_step:  # sampling: a single step, no scan
        rval = _step(*(seqs + [init_state] + shared_vars))
    else:  # training
        rval, updates = theano.scan(_step,
                                    sequences=seqs,
                                    outputs_info=[init_state],
                                    non_sequences=shared_vars,
                                    name=_p(prefix, '_layers'),
                                    n_steps=nsteps,
                                    profile=profile,
                                    strict=True)
    rval = [rval]
    return rval
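# Illustration (not in the original lm.py): one GRU step for a single sample
# in plain numpy, folding in the input projections that gru_layer precomputes
# as state_below_ and state_belowx.  W, b, U, Wx, bx, Ux have the shapes
# created by param_init_gru; the gates are computed in one product and then
# sliced apart, exactly as in _step_slice.
def _demo_gru_step(x_t, h_prev, W, b, U, Wx, bx, Ux):
    def sigmoid(z):
        return 1. / (1. + numpy.exp(-z))
    dim = Ux.shape[1]
    preact = numpy.dot(x_t, W) + b + numpy.dot(h_prev, U)
    r = sigmoid(preact[:dim])          # reset gate
    u = sigmoid(preact[dim:2 * dim])   # update gate (how much old state to keep)
    h_tilde = numpy.tanh(numpy.dot(x_t, Wx) + bx + numpy.dot(h_prev, Ux) * r)
    return u * h_prev + (1. - u) * h_tilde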
# initialize all parameters
def init_params(options):
    params = OrderedDict()

    # embedding
    # A random embedding matrix of shape n_words x dim_word (e.g.
    # 30000 x 512) is created: the vocabulary has n_words entries, and each
    # word ID selects one row, i.e. the 512-dimensional vector for that
    # word.  For example, if the ID of a word is 5, that word is represented
    # by row 5 of params['Wemb'].
    params['Wemb'] = norm_weight(options['n_words'], options['dim_word'])

    # Parameter shapes created below (with dim_word=512, dim=1024,
    # n_words=30000):
    #   Wemb          : n_words  x dim_word  (30000 x 512)
    #   encoder       : dim_word x dim       (512 x 1024)
    #   ff_logit_lstm : dim      x dim_word  (1024 x 512)
    #   ff_logit_prev : dim_word x dim_word  (512 x 512)
    #   ff_logit      : dim_word x n_words   (512 x 30000)

    # options['encoder'] is 'gru', so get_layer(options['encoder'])[0] is
    # param_init_gru, which creates W, b, U, Wx, bx and Ux for the encoder
    params = get_layer(options['encoder'])[0](options,
                                              params,
                                              prefix='encoder',
                                              nin=options['dim_word'],
                                              dim=options['dim'])
    # readout: param_init_fflayer initializes the W and b of each
    # feedforward layer
    params = get_layer('ff')[0](options, params, prefix='ff_logit_lstm',
                                nin=options['dim'],
                                nout=options['dim_word'], ortho=False)
    params = get_layer('ff')[0](options, params, prefix='ff_logit_prev',
                                nin=options['dim_word'],
                                nout=options['dim_word'], ortho=False)
    params = get_layer('ff')[0](options, params, prefix='ff_logit',
                                nin=options['dim_word'],
                                nout=options['n_words'])

    return params


# build a training model
def build_model(tparams, options):
    opt_ret = dict()

    trng = RandomStreams(1234)
    use_noise = theano.shared(numpy.float32(0.))

    # description string: #words x #samples
    x = tensor.matrix('x', dtype='int64')
    x_mask = tensor.matrix('x_mask', dtype='float32')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    # input
    # tparams now holds shared variables; the flattened word IDs index rows
    # of Wemb, and the result is reshaped to
    # (n_timesteps, n_samples, dim_word)
    emb = tparams['Wemb'][x.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])

    # shift the embeddings one step forward in time, so the input at step t
    # is the embedding of word t-1 and the model predicts word t from the
    # preceding words
    emb_shifted = tensor.zeros_like(emb)
    emb_shifted = tensor.set_subtensor(emb_shifted[1:], emb[:-1])
    emb = emb_shifted
    opt_ret['emb'] = emb

    # pass through gru layer, recurrence here
    # get_layer(options['encoder'])[1] is gru_layer, which runs the scan and
    # returns the sequence of hidden states
    proj = get_layer(options['encoder'])[1](tparams, emb, options,
                                            prefix='encoder',
                                            mask=x_mask)
    proj_h = proj[0]
    opt_ret['proj_h'] = proj_h

    # compute word probabilities; get_layer('ff')[1] is fflayer
    # logit_lstm = proj_h * ff_logit_lstm_W + ff_logit_lstm_b
    logit_lstm = get_layer('ff')[1](tparams, proj_h, options,
                                    prefix='ff_logit_lstm', activ='linear')
    # logit_prev = emb * ff_logit_prev_W + ff_logit_prev_b
    logit_prev = get_layer('ff')[1](tparams, emb, options,
                                    prefix='ff_logit_prev', activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev)
    # logit = logit * ff_logit_W + ff_logit_b
    logit = get_layer('ff')[1](tparams, logit, options, prefix='ff_logit',
                               activ='linear')
    logit_shp = logit.shape
    probs = tensor.nnet.softmax(
        logit.reshape([logit_shp[0]*logit_shp[1], logit_shp[2]]))

    # cost: negative log-probability of each target word
    x_flat = x.flatten()
    x_flat_idx = tensor.arange(x_flat.shape[0]) * options['n_words'] + x_flat
    cost = -tensor.log(probs.flatten()[x_flat_idx])
    cost = cost.reshape([x.shape[0], x.shape[1]])
    opt_ret['cost_per_sample'] = cost
    cost = (cost * x_mask).sum(0)

    return trng, use_noise, x, x_mask, opt_ret, cost
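# Illustration (not in the original lm.py): the indexing trick used for the
# cost above.  After the softmax, probs has one row per (timestep, sample)
# pair and n_words columns; flattening it and indexing with
# row * n_words + word_id picks out the probability assigned to each target
# word.  The numbers below are made up for the example.
def _demo_cost_indexing():
    n_words = 4
    probs = numpy.array([[0.1, 0.2, 0.3, 0.4],
                         [0.7, 0.1, 0.1, 0.1]])
    x_flat = numpy.array([2, 0])              # target word IDs, one per row
    idx = numpy.arange(x_flat.shape[0]) * n_words + x_flat
    print probs.flatten()[idx]                # [0.3, 0.7]
    print -numpy.log(probs.flatten()[idx])    # per-word negative log-likelihood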
# build a sampler
def build_sampler(tparams, options, trng):
    # x: 1 x 1
    y = tensor.vector('y_sampler', dtype='int64')
    init_state = tensor.matrix('init_state', dtype='float32')

    # if it's the first word, emb should be all zero
    emb = tensor.switch(y[:, None] < 0,
                        tensor.alloc(0., 1, tparams['Wemb'].shape[1]),
                        tparams['Wemb'][y])

    # apply one step of gru layer (one_step=True, so no scan)
    proj = get_layer(options['encoder'])[1](tparams, emb, options,
                                            prefix='encoder',
                                            mask=None,
                                            one_step=True,
                                            init_state=init_state)
    next_state = proj[0]

    # compute the output probability dist and sample
    logit_lstm = get_layer('ff')[1](tparams, next_state, options,
                                    prefix='ff_logit_lstm', activ='linear')
    logit_prev = get_layer('ff')[1](tparams, emb, options,
                                    prefix='ff_logit_prev', activ='linear')
    logit = tensor.tanh(logit_lstm + logit_prev)
    logit = get_layer('ff')[1](tparams, logit, options,
                               prefix='ff_logit', activ='linear')
    next_probs = tensor.nnet.softmax(logit)
    next_sample = trng.multinomial(pvals=next_probs).argmax(1)

    # next word probability
    print 'Building f_next..',
    inps = [y, init_state]
    outs = [next_probs, next_sample, next_state]
    f_next = theano.function(inps, outs, name='f_next', profile=profile)
    print 'Done'

    return f_next


# generate sample
def gen_sample(tparams, f_next, options, trng=None, maxlen=30, argmax=False):
    sample = []
    sample_score = 0

    # initial token is indicated by a -1 and initial state is zero
    next_w = -1 * numpy.ones((1,)).astype('int64')
    next_state = numpy.zeros((1, options['dim'])).astype('float32')

    for ii in xrange(maxlen):
        inps = [next_w, next_state]
        ret = f_next(*inps)
        next_p, next_w, next_state = ret[0], ret[1], ret[2]

        if argmax:
            nw = next_p[0].argmax()
        else:
            nw = next_w[0]
        sample.append(nw)
        sample_score += next_p[0, nw]
        if nw == 0:
            break

    return sample, sample_score


# calculate the log probabilities on a given corpus using the language model
def pred_probs(f_log_probs, prepare_data, options, iterator, verbose=True):
    probs = []

    n_done = 0

    for x in iterator:
        n_done += len(x)

        x, x_mask = prepare_data(x, n_words=options['n_words'])

        pprobs = f_log_probs(x, x_mask)
        for pp in pprobs:
            probs.append(pp)

        if numpy.isnan(numpy.mean(probs)):
            ipdb.set_trace()

        if verbose:
            print >>sys.stderr, '%d samples computed' % (n_done)

    return numpy.array(probs)
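# Illustration (not in the original lm.py): why
# trng.multinomial(pvals=...).argmax(1) in build_sampler yields a sampled
# word index.  A multinomial draw with n=1 is a one-hot vector, and argmax
# recovers the index of the 1.  Sketched here with numpy's RandomState
# instead of Theano's trng.
def _demo_multinomial_sampling():
    rng = numpy.random.RandomState(0)
    pvals = numpy.array([0.1, 0.6, 0.3])
    one_hot = rng.multinomial(1, pvals)   # e.g. [0, 1, 0]
    print one_hot.argmax()                # the sampled word index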
# optimizers
# each one maps (lr, tparams, grads, inputs (list), cost) to the pair of
# compiled functions (f_grad_shared, f_update)
def adam(lr, tparams, grads, inp, cost, beta1=0.9, beta2=0.999, e=1e-8):

    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup, profile=profile)

    updates = []

    t_prev = theano.shared(numpy.float32(0.))
    t = t_prev + 1.
    lr_t = lr * tensor.sqrt(1. - beta2**t) / (1. - beta1**t)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0., p.name + '_mean')
        v = theano.shared(p.get_value() * 0., p.name + '_variance')
        m_t = beta1 * m + (1. - beta1) * g
        v_t = beta2 * v + (1. - beta2) * g**2
        step = lr_t * m_t / (tensor.sqrt(v_t) + e)
        p_t = p - step
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((t_prev, t))

    f_update = theano.function([lr], [], updates=updates,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update


def adadelta(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up,
                                    profile=profile)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [], updates=ru2up+param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update


def rmsprop(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g)
            for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup+rgup+rg2up,
                                    profile=profile)

    updir = [theano.shared(p.get_value() * numpy.float32(0.),
                           name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(itemlist(tparams), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new+param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update


# note: unlike the optimizers above, sgd takes the inputs unpacked as
# (x, mask, y), so selecting it through eval(optimizer) in train() would
# require adapting that call
def sgd(lr, tparams, grads, x, mask, y, cost):
    # allocate gradients and set them all to zero
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.iteritems()]

    # create gradient copying list,
    # from grads (tensor variable) to gshared (shared variable)
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    # compile theano function to compute cost and copy gradients
    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup,
                                    profile=profile)

    # define the update step rule
    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]

    # compile a function for update
    f_update = theano.function([lr], [], updates=pup, profile=profile)

    return f_grad_shared, f_update
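# Illustration (not in the original lm.py): one Adam update for a single
# parameter in plain numpy, following the same update rule as adam() above
# (the bias correction is folded into the step size lr_t).  The learning
# rate default here is just a placeholder.
def _demo_adam_step(p, g, m, v, t, lr=0.0002, beta1=0.9, beta2=0.999, e=1e-8):
    t = t + 1.
    lr_t = lr * numpy.sqrt(1. - beta2 ** t) / (1. - beta1 ** t)
    m = beta1 * m + (1. - beta1) * g           # running mean of the gradient
    v = beta2 * v + (1. - beta2) * g ** 2      # running mean of its square
    p = p - lr_t * m / (numpy.sqrt(v) + e)     # scaled step
    return p, m, v, t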
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of GRU units
          encoder='gru',  # identifier of the encoder layer
          patience=10,  # early stopping patience: stop if the validation
                        # error has not improved for this many validations
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,  # display training progress every dispFreq updates
          decay_c=0.,  # L2 weight decay penalty (applied to all parameters)
          lrate=0.01,
          n_words=100000,  # vocabulary size: words are replaced by their IDs
                           # in the dictionary, rarer words by the UNK ID
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',  # 'sgd', 'adam', 'adadelta' or 'rmsprop'
          batch_size=16,  # minibatch size for training
          valid_batch_size=16,  # minibatch size for validation
          saveto='model.npz',  # file in which the best model (parameters and
                               # error history) is saved
          validFreq=1000,  # validation frequency (in updates)
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq
                           # updates
          dataset='/data/lisatmp3/chokyun/wikipedia/extracted/wiki.tok.txt.gz',
          valid_dataset='../data/dev/newstest2011.en.tok',
          dictionary='/data/lisatmp3/chokyun/wikipedia/extracted/'
                     'wiki.tok.txt.gz.pkl',
          use_dropout=False,
          reload_=False):

    # Model options
    # locals().copy() turns the keyword arguments above into a dictionary,
    # e.g. model_options['use_dropout'] = False; this dictionary is what is
    # passed to the layer-building functions
    model_options = locals().copy()

    # load the dictionary mapping words to integer IDs
    with open(dictionary, 'rb') as f:
        worddicts = pkl.load(f)

    # invert the dictionary so IDs can be mapped back to words when printing
    # samples
    worddicts_r = dict()
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    # reload options: if a saved configuration exists, use it
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    # data iterators over the training and validation corpora
    train = TextIterator(dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    # initialize all parameters from the model options
    params = init_params(model_options)

    # reload parameters from a previous run if available
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    # create shared variables for the parameters
    tparams = init_tparams(params)

    # build the symbolic computational graph: the GRU recurrence and the cost
    trng, use_noise, \
        x, x_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask]

    print 'Building sampler'
    f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # eval(optimizer) picks one of the optimizer functions above (rmsprop by
    # default); it returns the function that computes the cost and gradients
    # and the function that applies the parameter update
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_count = 0

    # if the validation/saving/sampling frequencies are not set, default
    # them to once per epoch
    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    # Training loop
    uidx = 0
    estop = False
    bad_counter = 0
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # pad batch and create mask: each sentence becomes one column of
            # x, and x_mask is 1 wherever that position holds a real token
            x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words)

            # skip the minibatch if nothing in it survived the maxlen filter
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask)

            # do the update on parameters with learning rate lrate
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(5):
                    sample, score = gen_sample(tparams, f_next,
                                               model_options, trng=trng,
                                               maxlen=30, argmax=False)
                    print 'Sample ', jj, ': ',
                    ss = sample
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r:
                            print worddicts_r[vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                # each validation records the validation error; if the
                # current error is not lower than the best error seen at
                # least `patience` validations ago, bad_counter is
                # incremented, and once bad_counter exceeds patience,
                # training stops early
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break
        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data,
                           model_options, valid).mean()

    print 'Valid ', valid_err

    # save the best model found during training
    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err


if __name__ == '__main__':
    pass
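The `if __name__ == '__main__'` block is a stub, so `train()` has to be called from somewhere else. A rough, hypothetical invocation with small settings (the paths and sizes below are placeholders, not values taken from the tutorial):

    train(dim_word=128, dim=256, n_words=10000,
          optimizer='rmsprop', batch_size=16, maxlen=50,
          dataset='data/corpus.tok.txt.gz',
          valid_dataset='data/valid.tok.txt',
          dictionary='data/corpus.tok.txt.gz.pkl',
          saveto='model.npz')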