from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import paddle.fluid as fluid


def stacked_lstmp_model(feature,
                        label,
                        hidden_dim,
                        proj_dim,
                        stacked_num,
                        class_num,
                        parallel=False,
                        is_train=True):
    """
    The model for DeepASR. The main structure is composed of stacked
    identical LSTMP (LSTM with recurrent projection) layers.

    When running in the training and validation phases, the feeding
    dictionary is {'feature', 'label'}, fed by LoDTensors for the feature
    data and the label data respectively. In inference, only `feature`
    is needed.

    Args:
        feature(Variable): The feature data, a LoDTensor.
        label(Variable): The label data, a LoDTensor.
        hidden_dim(int): The hidden state's dimension of the LSTMP layer.
        proj_dim(int): The projection size of the LSTMP layer.
        stacked_num(int): The number of stacked LSTMP layers.
        class_num(int): The number of output classes.
        parallel(bool): Run in parallel or not, default `False`.
            (Accepted for interface compatibility; not used inside
            this function.)
        is_train(bool): Run in training phase or not, default `True`.
    """
    # Front end: one conv + max-pool block over the input frames.
    conv1 = fluid.layers.conv2d(
        input=feature,
        num_filters=32,
        filter_size=3,
        stride=1,
        padding=1,
        bias_attr=True,
        act="relu")

    pool1 = fluid.layers.pool2d(
        conv1, pool_size=3, pool_type="max", pool_stride=2, pool_padding=0)

    # Stacked LSTMP layers: an fc projects into the four LSTM gates,
    # dynamic_lstmp runs the recurrence with a recurrent projection, and
    # batch norm on the projection output feeds the next layer.
    stack_input = pool1
    for i in range(stacked_num):
        fc = fluid.layers.fc(input=stack_input,
                             size=hidden_dim * 4,
                             bias_attr=None)
        proj, cell = fluid.layers.dynamic_lstmp(
            input=fc,
            size=hidden_dim * 4,
            proj_size=proj_dim,
            bias_attr=True,
            use_peepholes=True,
            is_reverse=False,
            cell_activation="tanh",
            proj_activation="tanh")
        bn = fluid.layers.batch_norm(
            input=proj,
            is_test=not is_train,
            momentum=0.9,
            epsilon=1e-05,
            data_layout='NCHW')
        stack_input = bn

    # Output layer: softmax over the target classes.
    prediction = fluid.layers.fc(input=stack_input,
                                 size=class_num,
                                 act='softmax')

    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label)

    return prediction, avg_cost, acc
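
# Usage sketch (not part of the original module): builds the network inside
# a fresh program so it can be run or inspected. The feature shape
# [-1, 3, 11, 120] (channels x spliced context x frame dimension, with one
# LoD level for variable-length utterances) and every dimension passed to
# stacked_lstmp_model below are illustrative assumptions, not values
# prescribed by this file.
def _build_demo_program():
    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        # The leading -1 marks the variable batch dimension; lod_level=1
        # marks each sample as a variable-length frame sequence.
        feature = fluid.layers.data(
            name='feature',
            shape=[-1, 3, 11, 120],  # assumed layout, see note above
            dtype='float32',
            lod_level=1)
        label = fluid.layers.data(
            name='label', shape=[-1, 1], dtype='int64', lod_level=1)
        prediction, avg_cost, acc = stacked_lstmp_model(
            feature,
            label,
            hidden_dim=1024,   # assumed
            proj_dim=512,      # assumed
            stacked_num=5,     # assumed
            class_num=1749)    # assumed
    return main_prog, startup_prog, [avg_cost, acc]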