The following code examples, extracted from open-source Python projects, illustrate how to use lasagne.layers.Gate().
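Before the examples, here is a minimal sketch (not taken from any of the projects below; the layer sizes are made up) of what a Gate does: it bundles the initializers for W_in, W_hid, the optional peephole weight W_cell, the bias b, and the gate nonlinearity, and a Gate instance is then passed per gate to a recurrent layer such as LSTMLayer or GRULayer.

from lasagne import init, nonlinearities
from lasagne.layers import InputLayer, LSTMLayer, Gate

# Hypothetical input shape: (batch, sequence length, features).
l_in = InputLayer((None, 20, 10))
# Forget gate with its bias initialized to 1, a common LSTM heuristic.
forgetgate = Gate(W_in=init.GlorotUniform(), W_hid=init.GlorotUniform(),
                  b=init.Constant(1.))
# Cell input: W_cell=None disables the peephole; tanh is the usual cell nonlinearity.
cell = Gate(W_cell=None, nonlinearity=nonlinearities.tanh)
l_lstm = LSTMLayer(l_in, num_units=32, forgetgate=forgetgate, cell=cell)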
def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=False):
    if cell_parameters is None:
        cell_parameters = Gate()
    if gate_parameters is None:
        gate_parameters = Gate()

    l_lstm = LSTMLayer(
        l_incoming, hidden_units, peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name=name)
    return l_lstm
def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name):
    if cell_parameters is None:
        cell_parameters = Gate()
    if gate_parameters is None:
        gate_parameters = Gate()

    l_lstm = LSTMLayer(
        l_incoming, hidden_units,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='f_{}'.format(name))
    return l_lstm
def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True):
    if cell_parameters is None:
        cell_parameters = Gate()
    if gate_parameters is None:
        gate_parameters = Gate()

    l_lstm = LSTMLayer(
        l_incoming, hidden_units, peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='f_{}'.format(name))
    return l_lstm
def create_lstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True):
    if cell_parameters is None:
        cell_parameters = Gate()
    if gate_parameters is None:
        gate_parameters = Gate()

    l_lstm = LSTMLayer(
        l_incoming, hidden_units,
        # We need to specify a separate input for masks
        mask_input=l_mask, peepholes=use_peepholes,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='f_{}'.format(name))
    return l_lstm
def exe_maxru(length, num_units, position, binominal):
    batch_size = BATCH_SIZE

    input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
    target_var = T.ivector(name='targets')

    layer_input = lasagne.layers.InputLayer(shape=(None, length, 1), input_var=input_var, name='input')

    time_updategate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)

    time_update = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                       b=lasagne.init.Constant(0.), nonlinearity=nonlinearities.tanh)

    resetgate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                     W_cell=lasagne.init.GlorotUniform())

    updategate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                      W_cell=lasagne.init.GlorotUniform())

    hiden_update = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                        b=lasagne.init.Constant(0.), nonlinearity=nonlinearities.tanh)

    layer_taru = MAXRULayer(layer_input, num_units, max_length=length,
                            P_time=lasagne.init.GlorotUniform(), nonlinearity=nonlinearities.tanh,
                            resetgate=resetgate, updategate=updategate, hidden_update=hiden_update,
                            time_updategate=time_updategate, time_update=time_update,
                            only_return_final=True, name='MAXRU', p=0.)

    # W = layer_taru.W_hid_to_hidden_update.sum()
    # U = layer_taru.W_in_to_hidden_update.sum()
    # b = layer_taru.b_hidden_update.sum()

    layer_output = DenseLayer(layer_taru, num_units=1, nonlinearity=nonlinearities.sigmoid, name='output')

    return train(layer_output, input_var, target_var, batch_size, length, position, binominal)
def exe_lstm(use_embedd, length, num_units, position, binominal):
    batch_size = BATCH_SIZE

    input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
    target_var = T.ivector(name='targets')

    layer_input = lasagne.layers.InputLayer(shape=(None, length, 1), input_var=input_var, name='input')

    if use_embedd:
        layer_position = construct_position_input(batch_size, length, num_units)
        layer_input = lasagne.layers.concat([layer_input, layer_position], axis=2)

    ingate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                  W_cell=lasagne.init.Uniform(range=0.1))

    outgate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                   W_cell=lasagne.init.Uniform(range=0.1))

    # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
    forgetgate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                      W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))

    # now use tanh for nonlinear function of cell, need to try pure linear cell
    cell = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                b=lasagne.init.Constant(0.), nonlinearity=nonlinearities.tanh)

    layer_lstm = LSTMLayer(layer_input, num_units, ingate=ingate, forgetgate=forgetgate, cell=cell,
                           outgate=outgate, peepholes=False, nonlinearity=nonlinearities.tanh,
                           only_return_final=True, name='LSTM')

    # W = layer_lstm.W_hid_to_cell.sum()
    # U = layer_lstm.W_in_to_cell.sum()
    # b = layer_lstm.b_cell.sum()

    layer_output = DenseLayer(layer_lstm, num_units=1, nonlinearity=nonlinearities.sigmoid, name='output')

    return train(layer_output, layer_lstm, input_var, target_var, batch_size, length, position, binominal)
def exe_gru(use_embedd, length, num_units, position, binominal, reset_input):
    batch_size = BATCH_SIZE

    input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
    target_var = T.ivector(name='targets')

    layer_input = lasagne.layers.InputLayer(shape=(batch_size, length, 1), input_var=input_var, name='input')

    if use_embedd:
        layer_position = construct_position_input(batch_size, length, num_units)
        layer_input = lasagne.layers.concat([layer_input, layer_position], axis=2)

    resetgate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)

    updategate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)

    hiden_update = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                        b=lasagne.init.Constant(0.), nonlinearity=nonlinearities.tanh)

    layer_gru = GRULayer_ANA(layer_input, num_units, resetgate=resetgate, updategate=updategate,
                             hidden_update=hiden_update, reset_input=reset_input,
                             only_return_final=True, name='GRU')

    # W = layer_gru.W_hid_to_hidden_update.sum()
    # U = layer_gru.W_in_to_hidden_update.sum()
    # b = layer_gru.b_hidden_update.sum()

    layer_output = DenseLayer(layer_gru, num_units=1, nonlinearity=nonlinearities.sigmoid, name='output')

    return train(layer_output, layer_gru, input_var, target_var, batch_size, length, position, binominal)
def __init__(self, incoming, num_units,
             ingate=Gate(), forgetgate=Gate(), cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
             outgate=Gate(), nonlinearity=nonlinearities.tanh,
             cell_init=init.Constant(0.), hid_init=init.Constant(0.),
             backwards=False, learn_init=False, peepholes=True, gradient_steps=-1,
             grad_clipping=0, unroll_scan=False, precompute_input=True, mask_input=None, **kwargs):
    super(CustomLSTMEncoder, self).__init__(incoming, num_units, ingate, forgetgate, cell, outgate, nonlinearity,
                                            cell_init, hid_init, backwards, learn_init, peepholes, gradient_steps,
                                            grad_clipping, unroll_scan, precompute_input, mask_input, False,
                                            **kwargs)
def test_lnlstm_passthrough():
    # Tests that the LSTM can simply pass through its input
    l_in = InputLayer((4, 5, 6))
    zero = lasagne.init.Constant(0.)
    one = lasagne.init.Constant(1.)
    pass_gate = Gate(zero, zero, zero, one, None)
    no_gate = Gate(zero, zero, zero, zero, None)
    in_pass_gate = Gate(
        np.eye(6).astype(theano.config.floatX), zero, zero, zero, None)
    l_rec = LNLSTMLayer(
        l_in, 6, pass_gate, no_gate, in_pass_gate, pass_gate, None)
    out = lasagne.layers.get_output(l_rec)
    inp = np.arange(4*5*6).reshape(4, 5, 6).astype(theano.config.floatX)
    # np.testing.assert_almost_equal(out.eval({l_in.input_var: inp}), inp)
def test_lstm_passthrough():
    # Tests that the LSTM can simply pass through its input
    l_in = InputLayer((4, 5, 6))
    zero = lasagne.init.Constant(0.)
    one = lasagne.init.Constant(1.)
    pass_gate = Gate(zero, zero, zero, one, None)
    no_gate = Gate(zero, zero, zero, zero, None)
    in_pass_gate = Gate(
        np.eye(6).astype(theano.config.floatX), zero, zero, zero, None)
    l_rec = LSTMLayer(
        l_in, 6, pass_gate, no_gate, in_pass_gate, pass_gate, None)
    out = lasagne.layers.get_output(l_rec)
    inp = np.arange(4*5*6).reshape(4, 5, 6).astype(theano.config.floatX)
    np.testing.assert_almost_equal(out.eval({l_in.input_var: inp}), inp)
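A note on how the two passthrough tests above work, going by Gate's positional signature Gate(W_in, W_hid, W_cell, b, nonlinearity): with all weight initializers set to zero and nonlinearity=None (which Lasagne treats as the identity), a gate's activation reduces to its constant bias, so pass_gate always outputs 1 and no_gate always outputs 0, while in_pass_gate feeds the input through an identity W_in. The input and output gates therefore pass everything, the forget gate discards the previous cell state, and the cell update is just the raw input, which is why the assertion expects the layer output to equal the input.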
def generate_lstm_parameters():
    gate_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)
    return gate_parameters, cell_parameters
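The helper above only builds the two parameter bundles. As a hypothetical usage sketch (the call below is not part of the original example and the sizes are made up), the shared gate parameters would go to the three sigmoid gates and the cell parameters to the cell input of an LSTMLayer, which is how the surrounding examples use them:

from lasagne.layers import InputLayer, LSTMLayer

gate_parameters, cell_parameters = generate_lstm_parameters()
l_in = InputLayer((None, 20, 10))  # (batch, seq_len, features), made-up sizes
l_lstm = LSTMLayer(l_in, num_units=250,
                   ingate=gate_parameters, forgetgate=gate_parameters,
                   outgate=gate_parameters, cell=cell_parameters)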
def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=False):
    if cell_parameters is None:
        cell_parameters = Gate()
    if gate_parameters is None:
        gate_parameters = Gate()

    l_lstm = LSTMLayer(
        l_incoming, hidden_units, peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='f_{}'.format(name))

    # The "backwards" layer is the same as the first,
    # except that the backwards argument is set to True.
    l_lstm_back = LSTMLayer(
        l_incoming, hidden_units, ingate=gate_parameters, peepholes=use_peepholes,
        mask_input=l_mask, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name))

    return l_lstm, l_lstm_back
def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name):
    if cell_parameters is None:
        cell_parameters = Gate()
    if gate_parameters is None:
        gate_parameters = Gate()

    l_lstm = LSTMLayer(
        l_incoming, hidden_units,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='f_{}'.format(name))

    # The "backwards" layer is the same as the first,
    # except that the backwards argument is set to True.
    l_lstm_back = LSTMLayer(
        l_incoming, hidden_units, ingate=gate_parameters,
        mask_input=l_mask, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name))

    return l_lstm, l_lstm_back
def create_model(input_shape, input_var, mask_shape, mask_var, lstm_size=250, output_classes=26,
                 w_init=las.init.Orthogonal()):
    gate_parameters = Gate(
        W_in=w_init, W_hid=w_init,
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init, W_hid=w_init,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_in = InputLayer(input_shape, input_var, 'input')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')

    f_lstm, b_lstm = create_blstm(l_in, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm')

    l_sum = ElemwiseSumLayer([f_lstm, b_lstm], name='sum')
    l_forward_slice1 = SliceLayer(l_sum, -1, 1, name='slice1')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes.
    l_out = DenseLayer(
        l_forward_slice1, num_units=output_classes,
        nonlinearity=las.nonlinearities.softmax, name='output')

    return l_out
def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True):
    if cell_parameters is None:
        cell_parameters = Gate()
    if gate_parameters is None:
        gate_parameters = Gate()

    l_lstm = LSTMLayer(
        l_incoming, hidden_units,
        # We need to specify a separate input for masks
        mask_input=l_mask, peepholes=use_peepholes,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='f_{}'.format(name))

    # The "backwards" layer is the same as the first,
    # except that the backwards argument is set to True.
    l_lstm_back = LSTMLayer(
        l_incoming, hidden_units, ingate=gate_parameters, peepholes=use_peepholes,
        mask_input=l_mask, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name))

    return l_lstm, l_lstm_back
def create_model(input_shape, input_var, mask_shape, mask_var, window, lstm_size=250, output_classes=26,
                 w_init=las.init.GlorotUniform(), use_peepholes=False, use_blstm=True):
    gate_parameters = Gate(
        W_in=w_init, W_hid=w_init,
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init, W_hid=w_init,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_in = InputLayer(input_shape, input_var, 'input')
    l_mask = InputLayer(mask_shape, mask_var, name='mask')
    symbolic_seqlen = l_in.input_var.shape[1]

    l_delta = DeltaLayer(l_in, window, name='delta')

    if use_blstm:
        f_lstm, b_lstm = create_blstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm',
                                      use_peepholes)
        l_sum = ElemwiseSumLayer([f_lstm, b_lstm], name='sum')

        # reshape to (num_examples * seq_len, lstm_size)
        l_reshape = ReshapeLayer(l_sum, (-1, lstm_size), name='reshape')
    else:
        l_lstm = create_lstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm', use_peepholes)
        l_reshape = ReshapeLayer(l_lstm, (-1, lstm_size), name='reshape')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes.
    l_softmax = DenseLayer(
        l_reshape, num_units=output_classes,
        nonlinearity=las.nonlinearities.softmax, name='softmax')

    l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes), name='output')

    return l_out
def create_blstm(l_incoming, l_mask, hidden_units, cell_parameters, gate_parameters, name, use_peepholes=True):
    if cell_parameters is None:
        cell_parameters = Gate()
    if gate_parameters is None:
        gate_parameters = Gate()

    l_lstm = LSTMLayer(
        l_incoming, hidden_units, peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='f_{}'.format(name))

    # The "backwards" layer is the same as the first,
    # except that the backwards argument is set to True.
    l_lstm_back = LSTMLayer(
        l_incoming, hidden_units, ingate=gate_parameters, peepholes=use_peepholes,
        mask_input=l_mask, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        learn_init=True, grad_clipping=5., backwards=True, name='b_{}'.format(name))

    return l_lstm, l_lstm_back
def create_model(substreams, mask_shape, mask_var, lstm_size=250, output_classes=26,
                 fusiontype='concat', w_init_fn=las.init.Orthogonal(), use_peepholes=True):
    gate_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_mask = InputLayer(mask_shape, mask_var, 'mask')
    symbolic_seqlen_raw = l_mask.input_var.shape[1]

    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    if fusiontype == 'adasum':
        l_fuse = AdaptiveElemwiseSumLayer(substreams, name='adasum1')
    elif fusiontype == 'sum':
        l_fuse = ElemwiseSumLayer(substreams, name='sum1')
    elif fusiontype == 'concat':
        l_fuse = ConcatLayer(substreams, axis=-1, name='concat')

    f_lstm_agg, b_lstm_agg = create_blstm(l_fuse, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm_agg')
    l_sum2 = ElemwiseSumLayer([f_lstm_agg, b_lstm_agg], name='sum2')

    # reshape to (num_examples * seq_len, lstm_size)
    l_reshape3 = ReshapeLayer(l_sum2, (-1, lstm_size), name='reshape3')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes.
    l_softmax = DenseLayer(
        l_reshape3, num_units=output_classes,
        nonlinearity=las.nonlinearities.softmax, name='softmax')

    l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen_raw, output_classes), name='output')

    return l_out, l_fuse
def build_recur_dropout(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p):
    # Construct Bi-directional LSTM-CNNs-CRF with recurrent dropout.
    # first get some necessary dimensions or parameters
    conv_window = 3
    # shape = [batch, n-step, c_dim, char_length]

    # construct convolution layer
    # shape = [batch, n-step, c_filters, output_length]
    cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
                                    nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # infer the pool size for pooling (pool size should go through all time step of cnn)
    _, _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    # shape = [batch, n-step, c_filters, 1]
    pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
    # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))

    # finally, concatenate the two incoming layers together.
    # shape = [batch, n-step, c_filter&w_dim]
    incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)

    # dropout for incoming
    incoming = lasagne.layers.DropoutLayer(incoming, p=p, shared_axes=(1,))

    ingate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                          W_cell=lasagne.init.Uniform(range=0.1))
    outgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                           W_cell=lasagne.init.Uniform(range=0.1))
    # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
    forgetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                              W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
    # now use tanh for nonlinear function of cell, need to try pure linear cell
    cell_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                        nonlinearity=nonlinearities.tanh)
    lstm_forward = LSTMLayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
                             nonlinearity=nonlinearities.tanh, peepholes=False,
                             ingate=ingate_forward, outgate=outgate_forward,
                             forgetgate=forgetgate_forward, cell=cell_forward, p=p, name='forward')

    ingate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                           W_cell=lasagne.init.Uniform(range=0.1))
    outgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                            W_cell=lasagne.init.Uniform(range=0.1))
    # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
    forgetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                               W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
    # now use tanh for nonlinear function of cell, need to try pure linear cell
    cell_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                         nonlinearity=nonlinearities.tanh)
    lstm_backward = LSTMLayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
                              nonlinearity=nonlinearities.tanh, peepholes=False, backwards=True,
                              ingate=ingate_backward, outgate=outgate_backward,
                              forgetgate=forgetgate_backward, cell=cell_backward, p=p, name='backward')

    # concatenate the outputs of forward and backward LSTMs to combine them.
    bi_lstm_cnn = lasagne.layers.concat([lstm_forward, lstm_backward], axis=2, name="bi-lstm")

    # shape = [batch, n-step, num_units]
    bi_lstm_cnn = lasagne.layers.DropoutLayer(bi_lstm_cnn, p=p, shared_axes=(1,))

    return ChainCRFLayer(bi_lstm_cnn, num_labels, mask_input=mask)
def build_RNN(architec, layer_input, layer_mask, num_units, grad_clipping):
    def build_GRU(reset_input):
        resetgate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
        updategate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
        hiden_update = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                            b=lasagne.init.Constant(0.), nonlinearity=nonlinearities.tanh)
        return GRULayer(layer_input, num_units, mask_input=layer_mask, grad_clipping=grad_clipping,
                        resetgate=resetgate, updategate=updategate, hidden_update=hiden_update,
                        reset_input=reset_input, only_return_final=True, p=0.5, name='GRU')

    def build_LSTM():
        ingate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                      W_cell=lasagne.init.Uniform(range=0.1))
        outgate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                       W_cell=lasagne.init.Uniform(range=0.1))
        # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
        forgetgate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                          W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
        # now use tanh for nonlinear function of cell, need to try pure linear cell
        cell = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                    b=lasagne.init.Constant(0.), nonlinearity=nonlinearities.tanh)
        return LSTMLayer(layer_input, num_units, mask_input=layer_mask, grad_clipping=grad_clipping,
                         ingate=ingate, forgetgate=forgetgate, cell=cell, outgate=outgate,
                         peepholes=False, nonlinearity=nonlinearities.tanh, only_return_final=True,
                         p=0.5, name='LSTM')

    def build_SGRU():
        resetgate_hidden = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                W_cell=lasagne.init.GlorotUniform())
        resetgate_input = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                               W_cell=lasagne.init.GlorotUniform())
        updategate = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                          W_cell=lasagne.init.GlorotUniform())
        hidden_update = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                             b=lasagne.init.Constant(0.), nonlinearity=nonlinearities.tanh)
        return SGRULayer(layer_input, num_units, mask_input=layer_mask, grad_clipping=grad_clipping,
                         resetgate_input=resetgate_input, resetgate_hidden=resetgate_hidden,
                         updategate=updategate, hidden_update=hidden_update,
                         only_return_final=True, p=0.5, name='SGRU')

    if architec == 'gru0':
        return build_GRU(False)
    elif architec == 'gru1':
        return build_GRU(True)
    elif architec == 'lstm':
        return build_LSTM()
    elif architec == 'sgru':
        return build_SGRU()
    else:
        raise ValueError('unknown architecture: %s' % architec)
def build_std_dropout_gru(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p,
                          reset_input):
    # Construct Bi-directional LSTM-CNNs-CRF with standard dropout.
    # first get some necessary dimensions or parameters
    conv_window = 3
    # shape = [batch, n-step, c_dim, char_length]
    incoming1 = lasagne.layers.DropoutLayer(incoming1, p=p)

    # construct convolution layer
    # shape = [batch, n-step, c_filters, output_length]
    cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
                                    nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # infer the pool size for pooling (pool size should go through all time step of cnn)
    _, _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    # shape = [batch, n-step, c_filters, 1]
    pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
    # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))

    # finally, concatenate the two incoming layers together.
    # shape = [batch, n-step, c_filter&w_dim]
    incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)

    # dropout for incoming
    incoming = lasagne.layers.DropoutLayer(incoming, p=0.2)

    resetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                 W_cell=None, nonlinearity=nonlinearities.tanh)
    gru_forward = GRULayer(incoming, num_units, mask_input=mask, resetgate=resetgate_forward,
                           updategate=updategate_forward, hidden_update=hidden_update_forward,
                           grad_clipping=grad_clipping, reset_input=reset_input, name='forward')

    resetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                  W_cell=None, nonlinearity=nonlinearities.tanh)
    gru_backward = GRULayer(incoming, num_units, mask_input=mask, backwards=True,
                            resetgate=resetgate_backward, updategate=updategate_backward,
                            hidden_update=hidden_update_backward, grad_clipping=grad_clipping,
                            reset_input=reset_input, name='backward')

    # concatenate the outputs of forward and backward LSTMs to combine them.
    bi_gru_cnn = lasagne.layers.concat([gru_forward, gru_backward], axis=2, name="bi-gru")

    bi_gru_cnn = lasagne.layers.DropoutLayer(bi_gru_cnn, p=p)

    # reshape bi-rnn-cnn to [batch * max_length, num_units]
    bi_gru_cnn = lasagne.layers.reshape(bi_gru_cnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_gru_cnn, num_units=num_labels,
                                             nonlinearity=nonlinearities.softmax, name='softmax')

    return layer_output
def build_std_dropout_sgru(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p):
    # Construct Bi-directional LSTM-CNNs-CRF with standard dropout.
    # first get some necessary dimensions or parameters
    conv_window = 3
    # shape = [batch, n-step, c_dim, char_length]
    incoming1 = lasagne.layers.DropoutLayer(incoming1, p=p)

    # construct convolution layer
    # shape = [batch, n-step, c_filters, output_length]
    cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
                                    nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # infer the pool size for pooling (pool size should go through all time step of cnn)
    _, _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    # shape = [batch, n-step, c_filters, 1]
    pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
    # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))

    # finally, concatenate the two incoming layers together.
    # shape = [batch, n-step, c_filter&w_dim]
    incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)

    # dropout for incoming
    incoming = lasagne.layers.DropoutLayer(incoming, p=0.2)

    resetgate_input_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=None)
    resetgate_hidden_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                    W_cell=None)
    updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                 W_cell=None, nonlinearity=nonlinearities.tanh)
    sgru_forward = SGRULayer(incoming, num_units, mask_input=mask,
                             resetgate_input=resetgate_input_forward, resetgate_hidden=resetgate_hidden_forward,
                             updategate=updategate_forward, hidden_update=hidden_update_forward,
                             grad_clipping=grad_clipping, name='forward')

    resetgate_input_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                    W_cell=None)
    resetgate_hidden_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                     W_cell=None)
    updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                  W_cell=None, nonlinearity=nonlinearities.tanh)
    sgru_backward = SGRULayer(incoming, num_units, mask_input=mask, backwards=True,
                              resetgate_input=resetgate_input_backward,
                              resetgate_hidden=resetgate_hidden_backward,
                              updategate=updategate_backward, hidden_update=hidden_update_backward,
                              grad_clipping=grad_clipping, name='backward')

    # concatenate the outputs of forward and backward LSTMs to combine them.
    bi_sgru_cnn = lasagne.layers.concat([sgru_forward, sgru_backward], axis=2, name="bi-sgru")

    bi_sgru_cnn = lasagne.layers.DropoutLayer(bi_sgru_cnn, p=p)

    # reshape bi-rnn-cnn to [batch * max_length, num_units]
    bi_sgru_cnn = lasagne.layers.reshape(bi_sgru_cnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_sgru_cnn, num_units=num_labels,
                                             nonlinearity=nonlinearities.softmax, name='softmax')

    return layer_output
def build_recur_dropout_gru(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p,
                            reset_input):
    # Construct Bi-directional LSTM-CNNs-CRF with recurrent dropout.
    # first get some necessary dimensions or parameters
    conv_window = 3
    # shape = [batch, n-step, c_dim, char_length]

    # construct convolution layer
    # shape = [batch, n-step, c_filters, output_length]
    cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
                                    nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # infer the pool size for pooling (pool size should go through all time step of cnn)
    _, _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    # shape = [batch, n-step, c_filters, 1]
    pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
    # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))

    # finally, concatenate the two incoming layers together.
    # shape = [batch, n-step, c_filter&w_dim]
    incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)

    # dropout for incoming
    incoming = lasagne.layers.DropoutLayer(incoming, p=0.2, shared_axes=(1,))

    resetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                 W_cell=None, nonlinearity=nonlinearities.tanh)
    gru_forward = GRULayer(incoming, num_units, mask_input=mask, resetgate=resetgate_forward,
                           updategate=updategate_forward, hidden_update=hidden_update_forward,
                           grad_clipping=grad_clipping, reset_input=reset_input, p=p, name='forward')

    resetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                  W_cell=None, nonlinearity=nonlinearities.tanh)
    gru_backward = GRULayer(incoming, num_units, mask_input=mask, backwards=True,
                            resetgate=resetgate_backward, updategate=updategate_backward,
                            hidden_update=hidden_update_backward, grad_clipping=grad_clipping,
                            reset_input=reset_input, p=p, name='backward')

    # concatenate the outputs of forward and backward LSTMs to combine them.
    bi_gru_cnn = lasagne.layers.concat([gru_forward, gru_backward], axis=2, name="bi-gru")

    # shape = [batch, n-step, num_units]
    bi_gru_cnn = lasagne.layers.DropoutLayer(bi_gru_cnn, p=p, shared_axes=(1,))

    # reshape bi-rnn-cnn to [batch * max_length, num_units]
    bi_gru_cnn = lasagne.layers.reshape(bi_gru_cnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_gru_cnn, num_units=num_labels,
                                             nonlinearity=nonlinearities.softmax, name='softmax')

    return layer_output
def build_recur_dropout_sgru(incoming1, incoming2, num_units, num_labels, mask, grad_clipping, num_filters, p):
    # Construct Bi-directional LSTM-CNNs-CRF with recurrent dropout.
    # first get some necessary dimensions or parameters
    conv_window = 3
    # shape = [batch, n-step, c_dim, char_length]

    # construct convolution layer
    # shape = [batch, n-step, c_filters, output_length]
    cnn_layer = ConvTimeStep1DLayer(incoming1, num_filters=num_filters, filter_size=conv_window, pad='full',
                                    nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # infer the pool size for pooling (pool size should go through all time step of cnn)
    _, _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    # shape = [batch, n-step, c_filters, 1]
    pool_layer = PoolTimeStep1DLayer(cnn_layer, pool_size=pool_size)
    # reshape: [batch, n-step, c_filters, 1] --> [batch, n-step, c_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, ([0], [1], [2]))

    # finally, concatenate the two incoming layers together.
    # shape = [batch, n-step, c_filter&w_dim]
    incoming = lasagne.layers.concat([output_cnn_layer, incoming2], axis=2)

    # dropout for incoming
    incoming = lasagne.layers.DropoutLayer(incoming, p=0.2, shared_axes=(1,))

    resetgate_input_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                   W_cell=None)
    resetgate_hidden_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                    W_cell=None)
    updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                 W_cell=None, nonlinearity=nonlinearities.tanh)
    sgru_forward = SGRULayer(incoming, num_units, mask_input=mask,
                             resetgate_input=resetgate_input_forward, resetgate_hidden=resetgate_hidden_forward,
                             updategate=updategate_forward, hidden_update=hidden_update_forward,
                             grad_clipping=grad_clipping, p=p, name='forward')

    resetgate_input_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                    W_cell=None)
    resetgate_hidden_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                     W_cell=None)
    updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None)
    hidden_update_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                                  W_cell=None, nonlinearity=nonlinearities.tanh)
    sgru_backward = SGRULayer(incoming, num_units, mask_input=mask, backwards=True,
                              resetgate_input=resetgate_input_backward,
                              resetgate_hidden=resetgate_hidden_backward,
                              updategate=updategate_backward, hidden_update=hidden_update_backward,
                              grad_clipping=grad_clipping, p=p, name='backward')

    # concatenate the outputs of forward and backward LSTMs to combine them.
    bi_sgru_cnn = lasagne.layers.concat([sgru_forward, sgru_backward], axis=2, name="bi-sgru")

    # shape = [batch, n-step, num_units]
    bi_sgru_cnn = lasagne.layers.DropoutLayer(bi_sgru_cnn, p=p, shared_axes=(1,))

    # reshape bi-rnn-cnn to [batch * max_length, num_units]
    bi_sgru_cnn = lasagne.layers.reshape(bi_sgru_cnn, (-1, [2]))

    # construct output layer (dense layer with softmax)
    layer_output = lasagne.layers.DenseLayer(bi_sgru_cnn, num_units=num_labels,
                                             nonlinearity=nonlinearities.softmax, name='softmax')

    return layer_output
def __init__(self, incoming, num_units,
             ingate=Gate(), forgetgate=Gate(), cell=Gate(W_cell=None, nonlinearity=nonlinearities.tanh),
             outgate=Gate(), nonlinearity=nonlinearities.tanh,
             cell_init=init.Constant(0.), hid_init=init.Constant(0.),
             backwards=False, learn_init=False, peepholes=True, gradient_steps=-1,
             grad_clipping=0, precompute_input=True, mask_input=None,
             encoder_mask_input=None, attention=False, word_by_word=False, **kwargs):
    super(CustomLSTMDecoder, self).__init__(incoming, num_units, ingate, forgetgate, cell, outgate, nonlinearity,
                                            cell_init, hid_init, backwards, learn_init, peepholes, gradient_steps,
                                            grad_clipping, False, precompute_input, mask_input, True, **kwargs)
    self.attention = attention
    self.word_by_word = word_by_word
    # encoder mask
    self.encoder_mask_incoming_index = -1
    if encoder_mask_input is not None:
        self.input_layers.append(encoder_mask_input)
        self.input_shapes.append(encoder_mask_input.output_shape)
        self.encoder_mask_incoming_index = len(self.input_layers) - 1
    # check encoder
    if not isinstance(self.cell_init, CustomLSTMEncoder) \
            or self.num_units != self.cell_init.num_units:
        raise ValueError('cell_init must be CustomLSTMEncoder'
                         ' and num_units should equal')
    self.r_init = None
    self.r_init = self.add_param(init.Constant(0.), (1, num_units), name="r_init",
                                 trainable=False, regularizable=False)
    if self.word_by_word:
        # rewrites
        self.attention = True
    if self.attention:
        if not isinstance(encoder_mask_input, lasagne.layers.Layer):
            raise ValueError('Attention mechanism needs encoder mask layer')
        # initializes attention weights
        self.W_y_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'V_pointer')
        self.W_h_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_h_attend')
        # doesn't need transpose
        self.w_attend = self.add_param(init.Normal(0.1), (num_units, 1), 'v_pointer')
        self.W_p_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_p_attend')
        self.W_x_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_x_attend')
        if self.word_by_word:
            self.W_r_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_r_attend')
            self.W_t_attend = self.add_param(init.Normal(0.1), (num_units, num_units), 'W_t_attend')
def build_BiLSTM(incoming, num_units, mask=None, grad_clipping=0, precompute_input=True, peepholes=False,
                 dropout=True, in_to_out=False):
    # construct the forward and backward rnns. Now, Ws are initialized by Glorot initializer with default arguments.
    # Need to try other initializers for specific tasks.

    # dropout for incoming
    if dropout:
        incoming = lasagne.layers.DropoutLayer(incoming, p=0.5)

    ingate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                          W_cell=lasagne.init.Uniform(range=0.1))
    outgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                           W_cell=lasagne.init.Uniform(range=0.1))
    # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
    forgetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                              W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
    # now use tanh for nonlinear function of cell, need to try pure linear cell
    cell_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                        nonlinearity=nonlinearities.tanh)
    lstm_forward = lasagne.layers.LSTMLayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
                                            nonlinearity=nonlinearities.tanh, peepholes=peepholes,
                                            precompute_input=precompute_input,
                                            ingate=ingate_forward, outgate=outgate_forward,
                                            forgetgate=forgetgate_forward, cell=cell_forward, name='forward')

    ingate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                           W_cell=lasagne.init.Uniform(range=0.1))
    outgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                            W_cell=lasagne.init.Uniform(range=0.1))
    # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
    forgetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                               W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
    # now use tanh for nonlinear function of cell, need to try pure linear cell
    cell_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                         nonlinearity=nonlinearities.tanh)
    lstm_backward = lasagne.layers.LSTMLayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
                                             nonlinearity=nonlinearities.tanh, peepholes=peepholes,
                                             precompute_input=precompute_input, backwards=True,
                                             ingate=ingate_backward, outgate=outgate_backward,
                                             forgetgate=forgetgate_backward, cell=cell_backward, name='backward')

    # concatenate the outputs of forward and backward RNNs to combine them.
    concat = lasagne.layers.concat([lstm_forward, lstm_backward], axis=2, name="bi-lstm")

    # dropout for output
    if dropout:
        concat = lasagne.layers.DropoutLayer(concat, p=0.5)

    if in_to_out:
        concat = lasagne.layers.concat([concat, incoming], axis=2)

    # the shape of BiRNN output (concat) is (batch_size, input_length, 2 * num_hidden_units)
    return concat
def build_BiGRU(incoming, num_units, mask=None, grad_clipping=0, precompute_input=True, dropout=True,
                in_to_out=False):
    # construct the forward and backward grus. Now, Ws are initialized by Glorot initializer with default arguments.
    # Need to try other initializers for specific tasks.

    # dropout for incoming
    if dropout:
        incoming = lasagne.layers.DropoutLayer(incoming, p=0.5)

    # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
    resetgate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                             W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
    updategate_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                              W_cell=lasagne.init.Uniform(range=0.1))
    # now use tanh for nonlinear function of hidden gate
    hidden_forward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                          nonlinearity=nonlinearities.tanh)
    gru_forward = lasagne.layers.GRULayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
                                          precompute_input=precompute_input,
                                          resetgate=resetgate_forward, updategate=updategate_forward,
                                          hidden_update=hidden_forward, name='forward')

    # according to Jozefowicz et al.(2015), init bias of forget gate to 1.
    resetgate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                              W_cell=lasagne.init.Uniform(range=0.1), b=lasagne.init.Constant(1.))
    updategate_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(),
                               W_cell=lasagne.init.Uniform(range=0.1))
    # now use tanh for nonlinear function of hidden gate
    hidden_backward = Gate(W_in=lasagne.init.GlorotUniform(), W_hid=lasagne.init.GlorotUniform(), W_cell=None,
                           nonlinearity=nonlinearities.tanh)
    gru_backward = lasagne.layers.GRULayer(incoming, num_units, mask_input=mask, grad_clipping=grad_clipping,
                                           precompute_input=precompute_input, backwards=True,
                                           resetgate=resetgate_backward, updategate=updategate_backward,
                                           hidden_update=hidden_backward, name='backward')

    # concatenate the outputs of forward and backward GRUs to combine them.
    concat = lasagne.layers.concat([gru_forward, gru_backward], axis=2, name="bi-gru")

    # dropout for output
    if dropout:
        concat = lasagne.layers.DropoutLayer(concat, p=0.5)

    if in_to_out:
        concat = lasagne.layers.concat([concat, incoming], axis=2)

    # the shape of BiRNN output (concat) is (batch_size, input_length, 2 * num_hidden_units)
    return concat
def create_model(dbn, input_shape, input_var, mask_shape, mask_var, lstm_size=250, win=T.iscalar('theta'),
                 output_classes=26, w_init_fn=GlorotUniform(), use_peepholes=False, use_blstm=True):
    weights, biases, shapes, nonlinearities = dbn

    gate_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_in = InputLayer(input_shape, input_var, 'input')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')
    symbolic_batchsize = l_in.input_var.shape[0]
    symbolic_seqlen = l_in.input_var.shape[1]

    l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
    l_encoder = create_pretrained_encoder(l_reshape1, weights, biases, shapes, nonlinearities,
                                          ['fc1', 'fc2', 'fc3', 'bottleneck'])
    encoder_len = las.layers.get_output_shape(l_encoder)[-1]
    l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2')
    l_delta = DeltaLayer(l_reshape2, win, name='delta')

    if use_blstm:
        l_lstm, l_lstm_back = create_blstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'blstm1',
                                           use_peepholes)

        # We'll combine the forward and backward layer output by summing.
        # Merge layers take in lists of layers to merge as input.
        l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1')

        # reshape, flatten to 2 dimensions to run softmax on all timesteps
        l_reshape3 = ReshapeLayer(l_sum1, (-1, lstm_size), name='reshape3')
    else:
        l_lstm = create_lstm(l_delta, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm', use_peepholes)
        l_reshape3 = ReshapeLayer(l_lstm, (-1, lstm_size), name='reshape3')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes.
    l_softmax = DenseLayer(
        l_reshape3, num_units=output_classes,
        nonlinearity=las.nonlinearities.softmax, name='softmax')

    l_out = ReshapeLayer(l_softmax, (-1, symbolic_seqlen, output_classes), name='output')

    return l_out
def create_model(dbn, input_shape, input_var, mask_shape, mask_var, lstm_size=250, output_classes=26):
    dbn_layers = dbn.get_all_layers()
    weights = []
    biases = []
    weights.append(dbn_layers[1].W.astype('float32'))
    weights.append(dbn_layers[2].W.astype('float32'))
    weights.append(dbn_layers[3].W.astype('float32'))
    weights.append(dbn_layers[4].W.astype('float32'))
    biases.append(dbn_layers[1].b.astype('float32'))
    biases.append(dbn_layers[2].b.astype('float32'))
    biases.append(dbn_layers[3].b.astype('float32'))
    biases.append(dbn_layers[4].b.astype('float32'))

    gate_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=las.init.Orthogonal(), W_hid=las.init.Orthogonal(),
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_in = InputLayer(input_shape, input_var, 'input')
    l_mask = InputLayer(mask_shape, mask_var, 'mask')
    symbolic_batchsize = l_in.input_var.shape[0]
    symbolic_seqlen = l_in.input_var.shape[1]

    l_reshape1 = ReshapeLayer(l_in, (-1, input_shape[-1]), name='reshape1')
    l_encoder = create_pretrained_encoder(weights, biases, l_reshape1)
    encoder_len = las.layers.get_output_shape(l_encoder)[-1]
    l_reshape2 = ReshapeLayer(l_encoder, (symbolic_batchsize, symbolic_seqlen, encoder_len), name='reshape2')

    # l_delta = DeltaLayer(l_reshape2, win, name='delta')
    # l_lstm = create_lstm(l_reshape2, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm1')
    l_lstm, l_lstm_back = create_blstm(l_reshape2, l_mask, lstm_size, cell_parameters, gate_parameters, 'lstm1')

    # We'll combine the forward and backward layer output by summing.
    # Merge layers take in lists of layers to merge as input.
    l_sum1 = ElemwiseSumLayer([l_lstm, l_lstm_back], name='sum1')
    l_forward_slice1 = SliceLayer(l_sum1, -1, 1, name='slice1')

    # Now, we can apply feed-forward layers as usual.
    # We want the network to predict a classification for the sequence,
    # so we'll use the number of classes.
    l_out = DenseLayer(
        l_forward_slice1, num_units=output_classes,
        nonlinearity=las.nonlinearities.softmax, name='output')

    return l_out
def create_pretrained_substream(weights, biases, input_shape, input_var, mask_shape, mask_var, name,
                                lstm_size=250, win=T.iscalar('theta'), nonlinearity=rectify,
                                w_init_fn=las.init.Orthogonal(), use_peepholes=True):
    gate_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        b=las.init.Constant(0.))
    cell_parameters = Gate(
        W_in=w_init_fn, W_hid=w_init_fn,
        # Setting W_cell to None denotes that no cell connection will be used.
        W_cell=None, b=las.init.Constant(0.),
        # By convention, the cell nonlinearity is tanh in an LSTM.
        nonlinearity=tanh)

    l_input = InputLayer(input_shape, input_var, 'input_'+name)
    l_mask = InputLayer(mask_shape, mask_var, 'mask')
    symbolic_batchsize_raw = l_input.input_var.shape[0]
    symbolic_seqlen_raw = l_input.input_var.shape[1]

    l_reshape1_raw = ReshapeLayer(l_input, (-1, input_shape[-1]), name='reshape1_'+name)
    l_encoder_raw = create_pretrained_encoder(l_reshape1_raw, weights, biases,
                                              [2000, 1000, 500, 50],
                                              [nonlinearity, nonlinearity, nonlinearity, linear],
                                              ['fc1_'+name, 'fc2_'+name, 'fc3_'+name, 'bottleneck_'+name])
    input_len = las.layers.get_output_shape(l_encoder_raw)[-1]

    l_reshape2 = ReshapeLayer(l_encoder_raw,
                              (symbolic_batchsize_raw, symbolic_seqlen_raw, input_len),
                              name='reshape2_'+name)
    l_delta = DeltaLayer(l_reshape2, win, name='delta_'+name)

    l_lstm = LSTMLayer(
        l_delta, int(lstm_size), peepholes=use_peepholes,
        # We need to specify a separate input for masks
        mask_input=l_mask,
        # Here, we supply the gate parameters for each gate
        ingate=gate_parameters, forgetgate=gate_parameters,
        cell=cell_parameters, outgate=gate_parameters,
        # We'll learn the initialization and use gradient clipping
        learn_init=True, grad_clipping=5., name='lstm_'+name)

    return l_lstm