diff --git a/.travis.yml b/.travis.yml
index a5559a796351bb10eacd4739f719b054e085160c..347fb321d761c10d9eecb1e5098ef7cc205ac181 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,3 +1,4 @@
+group: deprecated-2017Q2
 language: cpp
 cache: ccache
 sudo: required
diff --git a/.travis/unittest.sh b/.travis/unittest.sh
index 23f15094c8e51132cff350e786654e3bab29cee1..ad223eb4a9c1f57896762ad38d0b3fa5de5c496b 100755
--- a/.travis/unittest.sh
+++ b/.travis/unittest.sh
@@ -8,8 +8,8 @@ abort(){
 unittest(){
     cd $1 > /dev/null
-    if [ -f "requirements.txt" ]; then
-        pip install -r requirements.txt
+    if [ -f "setup.sh" ]; then
+        sh setup.sh
     fi
     if [ $? != 0 ]; then
         exit 1
diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md
index 23e0b412b59da4ccfea7a4ce4303faec479ff234..0cdb203d21ef5fa854a011f2f0381078cabcb874 100644
--- a/deep_speech_2/README.md
+++ b/deep_speech_2/README.md
@@ -5,7 +5,7 @@
 Please replace `$PADDLE_INSTALL_DIR` with your own PaddlePaddle installation directory.

 ```
-pip install -r requirements.txt
+sh setup.sh
 export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH
 ```
diff --git a/deep_speech_2/data_utils/data.py b/deep_speech_2/data_utils/data.py
index 424343a48ffa579a8ab465794987f957de36abdb..44af7ffaa999c618a7dcd4884f528ef60e59eefe 100644
--- a/deep_speech_2/data_utils/data.py
+++ b/deep_speech_2/data_utils/data.py
@@ -7,6 +7,7 @@ from __future__ import print_function

 import random
 import numpy as np
+import multiprocessing
 import paddle.v2 as paddle
 from data_utils import utils
 from data_utils.augmentor.augmentation import AugmentationPipeline
@@ -44,6 +45,8 @@ class DataGenerator(object):
     :type max_freq: None|float
     :param specgram_type: Specgram feature type. Options: 'linear'.
     :type specgram_type: str
+    :param num_threads: Number of CPU threads for processing data.
+    :type num_threads: int
     :param random_seed: Random seed.
     :type random_seed: int
     """
@@ -58,6 +61,7 @@ class DataGenerator(object):
                  window_ms=20.0,
                  max_freq=None,
                  specgram_type='linear',
+                 num_threads=multiprocessing.cpu_count(),
                  random_seed=0):
         self._max_duration = max_duration
         self._min_duration = min_duration
@@ -70,6 +74,7 @@ class DataGenerator(object):
             stride_ms=stride_ms,
             window_ms=window_ms,
             max_freq=max_freq)
+        self._num_threads = num_threads
         self._rng = random.Random(random_seed)
         self._epoch = 0
@@ -207,10 +212,14 @@

         def reader():
             for instance in manifest:
-                yield self._process_utterance(instance["audio_filepath"],
-                                              instance["text"])
+                yield instance

-        return reader
+        def mapper(instance):
+            return self._process_utterance(instance["audio_filepath"],
+                                           instance["text"])
+
+        return paddle.reader.xmap_readers(
+            mapper, reader, self._num_threads, 1024, order=True)

     def _padding_batch(self, batch, padding_to=-1, flatten=False):
         """
diff --git a/deep_speech_2/data_utils/speech.py b/deep_speech_2/data_utils/speech.py
index fc031ff46f4a3820e3b13f7804c91b33948712d1..568e4443ba557149505dfb4de6f230b4962e332a 100644
--- a/deep_speech_2/data_utils/speech.py
+++ b/deep_speech_2/data_utils/speech.py
@@ -94,7 +94,7 @@ class SpeechSegment(AudioSegment):
         return cls(samples, sample_rate, transcripts)

     @classmethod
-    def slice_from_file(cls, filepath, start=None, end=None, transcript):
+    def slice_from_file(cls, filepath, transcript, start=None, end=None):
        """Loads a small section of a speech segment, without having to load
        the entire file into memory, which can be incredibly wasteful.
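The change to `data.py` above replaces the single-threaded reader with `paddle.reader.xmap_readers`, which runs the mapper over a pool of worker threads while keeping the output order stable. Below is a minimal sketch of the same pattern outside `DataGenerator`; the inline manifest and the trivial mapper are hypothetical stand-ins for the real audio-processing code:

```python
import multiprocessing
import paddle.v2 as paddle

# hypothetical manifest; in data.py every instance carries
# an "audio_filepath" and a "text" field
manifest = [{"audio_filepath": "a.wav", "text": "hello"},
            {"audio_filepath": "b.wav", "text": "world"}]


def reader():
    # yield raw instances only; the expensive feature extraction is
    # deferred to the mapper so it can run in parallel
    for instance in manifest:
        yield instance


def mapper(instance):
    # stand-in for self._process_utterance(...)
    return instance["audio_filepath"], instance["text"]


# xmap_readers(mapper, reader, process_num, buffer_size, order):
# order=True keeps outputs in the same order as the input stream,
# mirroring the call added to DataGenerator above
parallel_reader = paddle.reader.xmap_readers(
    mapper, reader, multiprocessing.cpu_count(), 1024, order=True)

for sample in parallel_reader():
    print(sample)
```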
diff --git a/deep_speech_2/infer.py b/deep_speech_2/infer.py
index 06449ab05c7960ec78acc9ce5bb664cf1058a845..71518133a347c459bbcf2670fa5d1dc226a619c8 100644
--- a/deep_speech_2/infer.py
+++ b/deep_speech_2/infer.py
@@ -6,6 +6,7 @@ from __future__ import print_function
 import argparse
 import gzip
 import distutils.util
+import multiprocessing
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
 from model import deep_speech2
@@ -38,6 +39,11 @@ parser.add_argument(
     default=True,
     type=distutils.util.strtobool,
     help="Use gpu or not. (default: %(default)s)")
+parser.add_argument(
+    "--num_threads_data",
+    default=multiprocessing.cpu_count(),
+    type=int,
+    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
 parser.add_argument(
     "--mean_std_filepath",
     default='mean_std.npz',
@@ -67,7 +73,8 @@ def infer():
     data_generator = DataGenerator(
         vocab_filepath=args.vocab_filepath,
         mean_std_filepath=args.mean_std_filepath,
-        augmentation_config='{}')
+        augmentation_config='{}',
+        num_threads=args.num_threads_data)

     # create network config
     # paddle.data_type.dense_array is used for variable batch input.
diff --git a/deep_speech_2/requirements.txt b/deep_speech_2/requirements.txt
index c37e88ffe75dfda401726f485fd9928cbb477fab..0183ecf01fc1eb2507e00b3b97a6db92b6e2258e 100644
--- a/deep_speech_2/requirements.txt
+++ b/deep_speech_2/requirements.txt
@@ -1,4 +1,3 @@
 SoundFile==0.9.0.post1
 wget==3.2
-scikits.samplerate==0.3.3
-scipy==0.13.0b1
+scipy==0.13.1
diff --git a/deep_speech_2/setup.sh b/deep_speech_2/setup.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1ae2a5eee0f9cfd5b4318b29cf037165f78f2b73
--- /dev/null
+++ b/deep_speech_2/setup.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# install python dependencies
+if [ -f 'requirements.txt' ]; then
+    pip install -r requirements.txt
+fi
+if [ $? != 0 ]; then
+    echo "Installing python dependencies failed!"
+    exit 1
+fi
+
+# install scikits.samplerate
+curl -O "http://www.mega-nerd.com/SRC/libsamplerate-0.1.9.tar.gz"
+if [ $? != 0 ]; then
+    echo "Downloading libsamplerate-0.1.9.tar.gz failed!"
+    exit 1
+fi
+tar -xvf libsamplerate-0.1.9.tar.gz
+cd libsamplerate-0.1.9
+./configure && make && make install
+cd -
+rm -rf libsamplerate-0.1.9
+rm libsamplerate-0.1.9.tar.gz
+pip install scikits.samplerate==0.3.3
+if [ $? != 0 ]; then
+    echo "Installing scikits.samplerate failed!"
+    exit 1
+fi
+
+echo "Installed all dependencies successfully."
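A quick sanity check after `setup.sh` finishes is to import the audio packages it installs and resample a synthetic tone. This is only a sketch: the 440 Hz tone and the 0.5 ratio are arbitrary, and it assumes `scikits.samplerate.resample(data, ratio, converter_type)` as the entry point of the package built against libsamplerate above:

```python
import numpy as np
import soundfile  # from SoundFile==0.9.0.post1 in requirements.txt
from scikits.samplerate import resample  # assumed API of scikits.samplerate

# a 100 ms, 440 Hz test tone at 16 kHz (arbitrary values)
sample_rate = 16000
t = np.arange(int(0.1 * sample_rate)) / float(sample_rate)
tone = np.sin(2 * np.pi * 440 * t).astype("float32")

# halve the sample rate with the high-quality sinc converter
downsampled = resample(tone, 0.5, "sinc_best")
print("%d samples in, %d samples out" % (len(tone), len(downsampled)))
```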
(default: %(default)s)") +parser.add_argument( + "--num_threads_data", + default=multiprocessing.cpu_count(), + type=int, + help="Number of cpu threads for preprocessing data. (default: %(default)s)") parser.add_argument( "--mean_std_filepath", default='mean_std.npz', @@ -107,7 +125,10 @@ def train(): return DataGenerator( vocab_filepath=args.vocab_filepath, mean_std_filepath=args.mean_std_filepath, - augmentation_config=args.augmentation_config) + augmentation_config=args.augmentation_config, + max_duration=args.max_duration, + min_duration=args.min_duration, + num_threads=args.num_threads_data) train_generator = data_generator() test_generator = data_generator() diff --git a/language_model/network_conf.py b/language_model/network_conf.py index 1135ab64087eeafa1675702ba83523b4ad403a3c..e53ca66cc1adc49c6e7dcb50b77219030e74681d 100644 --- a/language_model/network_conf.py +++ b/language_model/network_conf.py @@ -51,56 +51,41 @@ def rnn_lm(vocab_size, emb_dim, rnn_type, hidden_size, num_layer): return cost, output -def ngram_lm(vocab_size, emb_dim, hidden_size, num_layer): +def ngram_lm(vocab_size, emb_dim, hidden_size, num_layer, gram_num=4): """ N-Gram language model definition. :param vocab_size: size of vocab. :param emb_dim: embedding vector's dimension. :param hidden_size: size of unit. - :param num_layer: layer number. + :param num_layer: number of hidden layers. + :param gram_size: gram number in n-gram method :return: cost and output layer of model. """ assert emb_dim > 0 and hidden_size > 0 and vocab_size > 0 and num_layer > 0 - def wordemb(inlayer): - wordemb = paddle.layer.table_projection( - input=inlayer, - size=emb_dim, - param_attr=paddle.attr.Param( - name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)) - return wordemb - # input layers - first_word = paddle.layer.data( - name="first_word", type=paddle.data_type.integer_value(vocab_size)) - second_word = paddle.layer.data( - name="second_word", type=paddle.data_type.integer_value(vocab_size)) - third_word = paddle.layer.data( - name="third_word", type=paddle.data_type.integer_value(vocab_size)) - fourth_word = paddle.layer.data( - name="fourth_word", type=paddle.data_type.integer_value(vocab_size)) + emb_layers = [] + for i in range(gram_num): + word = paddle.layer.data( + name="__word%02d__" % (i + 1), + type=paddle.data_type.integer_value(vocab_size)) + emb = paddle.layer.embedding( + input=word, + size=emb_dim, + param_attr=paddle.attr.Param(name="_proj", initial_std=1e-3)) + emb_layers.append(emb) next_word = paddle.layer.data( - name="next_word", type=paddle.data_type.integer_value(vocab_size)) - - # embedding layer - first_emb = wordemb(first_word) - second_emb = wordemb(second_word) - third_emb = wordemb(third_word) - fourth_emb = wordemb(fourth_word) - - context_emb = paddle.layer.concat( - input=[first_emb, second_emb, third_emb, fourth_emb]) + name="__next_word__", type=paddle.data_type.integer_value(vocab_size)) # hidden layer - hidden = paddle.layer.fc( - input=context_emb, size=hidden_size, act=paddle.activation.Relu()) - for _ in range(num_layer - 1): + for i in range(num_layer): hidden = paddle.layer.fc( - input=hidden, size=hidden_size, act=paddle.activation.Relu()) + input=hidden if i else paddle.layer.concat(input=emb_layers), + size=hidden_size, + act=paddle.activation.Relu()) - # fc(full connected) and output layer predict_word = paddle.layer.fc( input=[hidden], size=vocab_size, act=paddle.activation.Softmax()) diff --git a/nmt_without_attention/README.md b/nmt_without_attention/README.md index 
diff --git a/nmt_without_attention/README.md b/nmt_without_attention/README.md
index 609126dd872ca15170fa371469019d4057aa8b43..4c63a2589fbc2f4b9649e38a3edc58969409336a 100644
--- a/nmt_without_attention/README.md
+++ b/nmt_without_attention/README.md
@@ -1,8 +1,6 @@
 # Neural Machine Translation Model

 ## Background
-- The [Machine Translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md) chapter of the PaddleBook has already introduced the Encoder-Decoder architecture with an attention mechanism; this example introduces the Encoder-Decoder architecture without attention. For more on the attention mechanism, readers may refer to the PaddleBook and reference \[[3](#参考文献)].
-
 Machine translation uses computers to convert a source language into an equivalent expression in a target language. It is an important research direction in natural language processing with broad application demand, and its implementation has kept evolving. Traditional machine translation methods are mostly based on rules or statistical models, requiring manually specified translation rules or hand-designed linguistic features, so translation quality depends on how well humans understand the source and target languages. In recent years, the emergence and rapid development of deep learning have made automatic feature learning possible. Deep learning first succeeded in image recognition and speech recognition, and then set off a wave of research in natural language processing fields such as machine translation. Deep learning models for machine translation directly learn the mapping from the source language to the target language, greatly reducing human involvement in the learning process while significantly improving translation quality. This example shows how to build an end-to-end Neural Machine Translation (NMT) model in PaddlePaddle using a Recurrent Neural Network (RNN).

 ## Model Overview
@@ -53,14 +51,15 @@ The original RNN structure stores the hidden state in a single vector, but an RNN of this structure
 In PaddlePaddle, the bidirectional encoder can be implemented conveniently by calling the relevant APIs:

 ```python
-#### Encoder
 src_word_id = paddle.layer.data(
     name='source_language_word',
     type=paddle.data_type.integer_value_sequence(source_dict_dim))
+
+# source embedding
 src_embedding = paddle.layer.embedding(
     input=src_word_id, size=word_vector_dim)
-# use bidirectional_gru
+
+# bidirectional GRU as the encoder
 encoded_vector = paddle.networks.bidirectional_gru(
     input=src_embedding,
     size=encoder_size,
@@ -86,18 +85,17 @@

 ### Decoder without Attention Mechanism

+The [Machine Translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md) chapter of the PaddleBook has already introduced the Encoder-Decoder architecture with an attention mechanism; this example introduces the Encoder-Decoder architecture without attention. For more on the attention mechanism, readers may refer to the PaddleBook and reference \[[3](#参考文献)].
 PaddlePaddle already provides good implementations of the popular RNN units, and they can be called directly. To perform custom operations at each time step of an RNN, use PaddlePaddle's `recurrent_layer_group`: first define the single-step logic as a function, then use `recurrent_group()` to apply it repeatedly over the whole sequence. The decoder without attention in this example is implemented with `recurrent_layer_group`; the single-step function `gru_decoder_without_attention()` is shown below:

 ```python
-#### Decoder
+# the initial state for the decoder GRU
 encoder_last = paddle.layer.last_seq(input=encoded_vector)
-encoder_last_projected = paddle.layer.mixed(
-    size=decoder_size,
-    act=paddle.activation.Tanh(),
-    input=paddle.layer.full_matrix_projection(input=encoder_last))
+encoder_last_projected = paddle.layer.fc(
+    size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last)

-# gru step
+# the step function for the decoder GRU
 def gru_decoder_without_attention(enc_vec, current_word):
     '''
     Step function for gru decoder

     :type enc_vec: layer object
     :type current_word: layer object
     '''
     decoder_mem = paddle.layer.memory(
-        name='gru_decoder',
-        size=decoder_size,
-        boot_layer=encoder_last_projected)
+        name="gru_decoder",
+        size=decoder_size,
+        boot_layer=encoder_last_projected)

     context = paddle.layer.last_seq(input=enc_vec)

-    decoder_inputs = paddle.layer.mixed(
-        size=decoder_size * 3,
-        input=[
-            paddle.layer.full_matrix_projection(input=context),
-            paddle.layer.full_matrix_projection(input=current_word)
-        ])
+    decoder_inputs = paddle.layer.fc(
+        size=decoder_size * 3, input=[context, current_word])

     gru_step = paddle.layer.gru_step(
-        name='gru_decoder',
+        name="gru_decoder",
         act=paddle.activation.Tanh(),
         gate_act=paddle.activation.Sigmoid(),
         input=decoder_inputs,
         output_mem=decoder_mem,
         size=decoder_size)

-    out = paddle.layer.mixed(
+    out = paddle.layer.fc(
         size=target_dict_dim,
         bias_attr=True,
         act=paddle.activation.Softmax(),
-        input=paddle.layer.full_matrix_projection(input=gru_step))
-    return out
+        input=gru_step)
+    return out
 ```
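The main simplification in this hunk replaces `paddle.layer.mixed` plus explicit `full_matrix_projection`s with `paddle.layer.fc`. Below is a standalone sketch contrasting the two forms, assuming (as the refactor implies) that `fc` with a list input learns one projection per input and sums the results; `decoder_size` and the two data layers are illustrative stand-ins for the real `context` and `current_word`:

```python
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

decoder_size = 512
# stand-ins for the layers available inside the step function
context = paddle.layer.data(
    name="context_demo", type=paddle.data_type.dense_vector(decoder_size))
current_word = paddle.layer.data(
    name="word_emb_demo", type=paddle.data_type.dense_vector(decoder_size))

# before: projections summed explicitly inside a mixed layer
decoder_inputs_old = paddle.layer.mixed(
    size=decoder_size * 3,
    input=[
        paddle.layer.full_matrix_projection(input=context),
        paddle.layer.full_matrix_projection(input=current_word)
    ])

# after: fc projects each input and adds the results; decoder_size * 3
# covers the GRU's two gates plus its candidate state
decoder_inputs_new = paddle.layer.fc(
    size=decoder_size * 3, input=[context, current_word])
```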
 The decoder behaves very differently during model training and testing:
@@ -144,34 +138,14 @@ def gru_decoder_without_attention(enc_vec, current_word):
 The training and generation logic is implemented in the `if-else` branches below:

 ```python
-decoder_group_name = "decoder_group"
-group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+group_input1 = paddle.layer.StaticInput(input=encoded_vector)
 group_inputs = [group_input1]

-if not generating:
-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_without_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
-else:
+decoder_group_name = "decoder_group"
+if is_generating:
     trg_embedding = paddle.layer.GeneratedInput(
         size=target_dict_dim,
-        embedding_name='_target_language_embedding',
+        embedding_name="_target_language_embedding",
         embedding_size=word_vector_dim)
     group_inputs.append(trg_embedding)
@@ -185,6 +159,26 @@
         max_length=max_length)

     return beam_gen
+else:
+    trg_embedding = paddle.layer.embedding(
+        input=paddle.layer.data(
+            name="target_language_word",
+            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
+    group_inputs.append(trg_embedding)
+
+    decoder = paddle.layer.recurrent_group(
+        name=decoder_group_name,
+        step=gru_decoder_without_attention,
+        input=group_inputs)
+
+    lbl = paddle.layer.data(
+        name="target_language_next_word",
+        type=paddle.data_type.integer_value_sequence(target_dict_dim))
+    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+    return cost
 ```

 ## Data Preparation
@@ -208,13 +202,16 @@ parameters = paddle.parameters.create(cost)
 **b) Set the optimization strategy for training and define the training data `reader`**

 ```python
-# define optimize method and trainer
+# define optimization method
 optimizer = paddle.optimizer.RMSProp(
     learning_rate=1e-3,
     gradient_clipping_threshold=10.0,
     regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+# define the trainer instance
 trainer = paddle.trainer.SGD(
     cost=cost, parameters=parameters, update_equation=optimizer)
+
 # define data reader
 wmt14_reader = paddle.batch(
     paddle.reader.shuffle(
@@ -225,20 +222,19 @@
 **c) Define the event handler to print intermediate training results and save model snapshots**

 ```python
-# define event_handler callback
+# define the event_handler callback
 def event_handler(event):
     if isinstance(event, paddle.event.EndIteration):
-        if event.batch_id % 100 == 0 and event.batch_id > 0:
-            with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' %
-                           event.batch_id, 'w') as f:
+        if not event.batch_id % 100 and event.batch_id:
+            with gzip.open(
+                    os.path.join(save_dir_path,
+                                 "nmt_without_att_%05d_batch_%05d.tar.gz" %
+                                 (event.pass_id, event.batch_id)), "w") as f:
                 parameters.to_tar(f)

-        if event.batch_id % 10 == 0:
-            print "\nPass %d, Batch %d, Cost%f, %s" % (
-                event.pass_id, event.batch_id, event.cost, event.metrics)
-        else:
-            sys.stdout.write('.')
-            sys.stdout.flush()
+        if event.batch_id and not event.batch_id % 10:
+            logger.info("Pass %d, Batch %d, Cost %f, %s" % (
+                event.pass_id, event.batch_id, event.cost, event.metrics))
 ```

 **d) Start training**
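The hunk cuts off at this heading; for reference, the call that actually starts training continues the running example above and matches `train.py` at the end of this patch:

```python
# start training: two passes over the shuffled WMT-14 reader,
# with event_handler invoked after every batch
trainer.train(
    reader=wmt14_reader, event_handler=event_handler, num_passes=2)
```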
@@ -300,26 +296,22 @@ beam_result = paddle.infer(

 **c) Load the source and target language dictionaries, convert sentences represented as `id` sequences back into the original languages, and print the results**

 ```python
-# get the dictionary
-src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
-
-# the delimited element of generated sequences is -1,
-# the first element of each generated sequence is the sequence length
-seq_list = []
-seq = []
-for w in beam_result[1]:
-    if w != -1:
-        seq.append(w)
-    else:
-        seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-        seq = []
-
-prob = beam_result[0]
-for i in xrange(len(gen_data)):
-    print "\n*******************************************************\n"
-    print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
+beam_result = inferer.infer(input=test_batch, field=["prob", "id"])
+
+gen_sen_idx = np.where(beam_result[1] == -1)[0]
+assert len(gen_sen_idx) == len(test_batch) * beam_size
+
+start_pos, end_pos = 1, 0
+for i, sample in enumerate(test_batch):
+    print(" ".join([
+        src_dict[w] for w in sample[0][1:-1]
+    ]))  # skip the start and end marks when printing the source sentence
     for j in xrange(beam_size):
-        print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+        end_pos = gen_sen_idx[i * beam_size + j]
+        print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+            trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+        start_pos = end_pos + 2
+    print("\n")
 ```

 Running generation is similar to running training; simply execute
@@ -327,23 +319,20 @@
 ```bash
 python generate.py
 ```
-and the translations for the test data are generated automatically.
-With the beam search width set to 3, for a French input sentence
-```text
-src: Elles connaissent leur entreprise mieux que personne .
-```
-
-the corresponding English translations are
+With the beam search width set to 3 and a French sentence as input, translations for the test data are generated automatically, in the following format:

 ```text
-prob = -3.754819: They know their business better than anyone .
-prob = -4.445528: They know their businesses better than anyone .
-prob = -5.026885: They know their business better than anybody .
-```
+Elles connaissent leur entreprise mieux que personne .
+-3.754819 They know their business better than anyone .
+-4.445528 They know their businesses better than anyone .
+-5.026885 They know their business better than anybody .
-* `prob` is the score of the generated sentence, followed by the translation it produced;
-* `<s>` marks the beginning of a sentence and `<e>` the end of one; any word not found in the dictionary is replaced by `<unk>`.
+```
+- The first line is the input source-language sentence.
+- Lines 2 through `beam_size + 1` are the `beam_size` translation candidates generated by beam search.
+  - Each of these lines contains two columns separated by "\t": the first column is the log probability of the sentence, and the second column is the text of the translation.
+  - `<s>` marks the beginning of a sentence and `<e>` the end of one; any word not found in the dictionary is replaced by `<unk>`.

 At this point we have implemented a basic machine translation model on PaddlePaddle. As we can see, PaddlePaddle provides a flexible and rich set of APIs that makes it easy to configure all kinds of complex networks. Machine translation itself is also a fast-moving field in which new methods and ideas keep emerging. After working through this example, interested readers can try to implement more sophisticated and better-performing translation models on top of PaddlePaddle.
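The new parsing code relies on the layout of `beam_result`: `beam_result[0]` is a `len(test_batch) x beam_size` matrix of log probabilities, and `beam_result[1]` is a single flat id sequence in which each candidate starts after a begin mark and is terminated by `-1`. A toy sketch of the slicing arithmetic, with hypothetical ids:

```python
import numpy as np

beam_size = 2
# hypothetical flattened ids for one source sentence and beam_size = 2:
# each candidate is <begin mark> w w ... followed by the -1 delimiter
ids = np.array([0, 11, 12, 13, -1, 0, 21, 22, -1])

gen_sen_idx = np.where(ids == -1)[0]  # -> [4, 8]

start_pos, end_pos = 1, 0
for j in range(beam_size):
    end_pos = gen_sen_idx[j]
    print(ids[start_pos:end_pos])  # word ids of candidate j
    start_pos = end_pos + 2        # skip the -1 and the next begin mark
```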
diff --git a/nmt_without_attention/generate.py b/nmt_without_attention/generate.py
index e3c220685e2bc75a13f57d4f38faa28cb31a4f65..074db9e93ee459b8b37b55aed47af6473750aca0 100644
--- a/nmt_without_attention/generate.py
+++ b/nmt_without_attention/generate.py
@@ -1,32 +1,37 @@
 #!/usr/bin/env python
 import os
+import gzip
+import logging
+import numpy as np

-from network_conf import *
+import paddle.v2 as paddle
+from network_conf import seq2seq_net

+logger = logging.getLogger("paddle")
+logger.setLevel(logging.WARNING)


 def infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict):
     beam_result = inferer.infer(input=test_batch, field=["prob", "id"])

-    # the delimited element of generated sequences is -1,
-    # the first element of each generated sequence is the sequence length
-    seq_list, seq = [], []
-    for w in beam_result[1]:
-        if w != -1:
-            seq.append(w)
-        else:
-            seq_list.append(" ".join([trg_dict.get(w) for w in seq[1:]]))
-            seq = []
-
-    prob = beam_result[0]
+    gen_sen_idx = np.where(beam_result[1] == -1)[0]
+    assert len(gen_sen_idx) == len(test_batch) * beam_size
+
+    start_pos, end_pos = 1, 0
     for i, sample in enumerate(test_batch):
-        print("src:", " ".join([src_dict.get(w) for w in sample[0]]), "\n")
+        print(" ".join([
+            src_dict[w] for w in sample[0][1:-1]
+        ]))  # skip the start and end marks when printing the source sentence
         for j in xrange(beam_size):
-            print("prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j])
+            end_pos = gen_sen_idx[i * beam_size + j]
+            print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+                trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+            start_pos = end_pos + 2
         print("\n")


-def generate(source_dict_dim, target_dict_dim, model_path, batch_size):
+def generate(source_dict_dim, target_dict_dim, model_path, beam_size,
+             batch_size):
     """
-    Generating function for NMT
+    Sequence generation for NMT.

     :param source_dict_dim: size of source dictionary
     :type source_dict_dim: int
     :param target_dict_dim: size of target dictionary
     :type target_dict_dim: int
     :param model_path: path for initial model
     :type model_path: string
+    :param beam_size: the expansion width in each generation step
+    :type beam_size: int
+    :param batch_size: the number of training examples in one forward pass
+    :type batch_size: int
     """
     assert os.path.exists(model_path), "trained model does not exist."

     # step 1: prepare dictionary
     src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
-    beam_size = 5

     # step 2: load the trained model
-    paddle.init(use_gpu=True, trainer_count=1)
+    paddle.init(use_gpu=False, trainer_count=1)
     with gzip.open(model_path) as f:
         parameters = paddle.parameters.Parameters.from_tar(f)
     beam_gen = seq2seq_net(
@@ -72,5 +80,6 @@ if __name__ == "__main__":
     generate(
         source_dict_dim=3000,
         target_dict_dim=3000,
-        batch_size=5,
-        model_path="models/nmt_without_att_params_batch_00001.tar.gz")
+        batch_size=20,
+        beam_size=5,
+        model_path="models/nmt_without_att_params_batch_00347.tar.gz")
diff --git a/nmt_without_attention/index.html b/nmt_without_attention/index.html
index b68bdb5b6d1c84f8cde6f66583788f678c79e858..b6ddaac64203423ac4d428b2fd965672aa98baff 100644
--- a/nmt_without_attention/index.html
+++ b/nmt_without_attention/index.html
@@ -43,8 +43,6 @@
 # Neural Machine Translation Model

 ## Background
-- The [Machine Translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md) chapter of the PaddleBook has already introduced the Encoder-Decoder architecture with an attention mechanism; this example introduces the Encoder-Decoder architecture without attention. For more on the attention mechanism, readers may refer to the PaddleBook and reference \[[3](#参考文献)].
-
 Machine translation uses computers to convert a source language into an equivalent expression in a target language. It is an important research direction in natural language processing with broad application demand, and its implementation has kept evolving. Traditional machine translation methods are mostly based on rules or statistical models, requiring manually specified translation rules or hand-designed linguistic features, so translation quality depends on how well humans understand the source and target languages. In recent years, the emergence and rapid development of deep learning have made automatic feature learning possible. Deep learning first succeeded in image recognition and speech recognition, and then set off a wave of research in natural language processing fields such as machine translation. Deep learning models for machine translation directly learn the mapping from the source language to the target language, greatly reducing human involvement in the learning process while significantly improving translation quality. This example shows how to build an end-to-end Neural Machine Translation (NMT) model in PaddlePaddle using a Recurrent Neural Network (RNN).

 ## Model Overview
@@ -95,14 +93,15 @@ The original RNN structure stores the hidden state in a single vector, but an RNN of this structure
 In PaddlePaddle, the bidirectional encoder can be implemented conveniently by calling the relevant APIs:

 ```python
-#### Encoder
 src_word_id = paddle.layer.data(
     name='source_language_word',
     type=paddle.data_type.integer_value_sequence(source_dict_dim))
+
+# source embedding
 src_embedding = paddle.layer.embedding(
     input=src_word_id, size=word_vector_dim)
-# use bidirectional_gru
+
+# bidirectional GRU as the encoder
 encoded_vector = paddle.networks.bidirectional_gru(
     input=src_embedding,
     size=encoder_size,
@@ -128,18 +127,17 @@

 ### Decoder without Attention Mechanism

+The [Machine Translation](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md) chapter of the PaddleBook has already introduced the Encoder-Decoder architecture with an attention mechanism; this example introduces the Encoder-Decoder architecture without attention. For more on the attention mechanism, readers may refer to the PaddleBook and reference \[[3](#参考文献)].
 PaddlePaddle already provides good implementations of the popular RNN units, and they can be called directly. To perform custom operations at each time step of an RNN, use PaddlePaddle's `recurrent_layer_group`: first define the single-step logic as a function, then use `recurrent_group()` to apply it repeatedly over the whole sequence. The decoder without attention in this example is implemented with `recurrent_layer_group`; the single-step function `gru_decoder_without_attention()` is shown below:

 ```python
-#### Decoder
+# the initial state for the decoder GRU
 encoder_last = paddle.layer.last_seq(input=encoded_vector)
-encoder_last_projected = paddle.layer.mixed(
-    size=decoder_size,
-    act=paddle.activation.Tanh(),
-    input=paddle.layer.full_matrix_projection(input=encoder_last))
+encoder_last_projected = paddle.layer.fc(
+    size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last)

-# gru step
+# the step function for the decoder GRU
 def gru_decoder_without_attention(enc_vec, current_word):
     '''
     Step function for gru decoder

     :type enc_vec: layer object
     :type current_word: layer object
     '''
     decoder_mem = paddle.layer.memory(
-        name='gru_decoder',
-        size=decoder_size,
-        boot_layer=encoder_last_projected)
+        name="gru_decoder",
+        size=decoder_size,
+        boot_layer=encoder_last_projected)

     context = paddle.layer.last_seq(input=enc_vec)
-    decoder_inputs = paddle.layer.mixed(
-        size=decoder_size * 3,
-        input=[
-            paddle.layer.full_matrix_projection(input=context),
-            paddle.layer.full_matrix_projection(input=current_word)
-        ])
+    decoder_inputs = paddle.layer.fc(
+        size=decoder_size * 3, input=[context, current_word])

     gru_step = paddle.layer.gru_step(
-        name='gru_decoder',
+        name="gru_decoder",
         act=paddle.activation.Tanh(),
         gate_act=paddle.activation.Sigmoid(),
         input=decoder_inputs,
         output_mem=decoder_mem,
         size=decoder_size)

-    out = paddle.layer.mixed(
+    out = paddle.layer.fc(
         size=target_dict_dim,
         bias_attr=True,
         act=paddle.activation.Softmax(),
-        input=paddle.layer.full_matrix_projection(input=gru_step))
-    return out
+        input=gru_step)
+    return out
 ```

 The decoder behaves very differently during model training and testing:
@@ -186,34 +180,14 @@ def gru_decoder_without_attention(enc_vec, current_word):
 The training and generation logic is implemented in the `if-else` branches below:

 ```python
-decoder_group_name = "decoder_group"
-group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+group_input1 = paddle.layer.StaticInput(input=encoded_vector)
 group_inputs = [group_input1]

-if not generating:
-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_without_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
-else:
+decoder_group_name = "decoder_group"
+if is_generating:
     trg_embedding = paddle.layer.GeneratedInput(
         size=target_dict_dim,
-        embedding_name='_target_language_embedding',
+        embedding_name="_target_language_embedding",
         embedding_size=word_vector_dim)
     group_inputs.append(trg_embedding)
@@ -227,6 +201,26 @@
         max_length=max_length)

     return beam_gen
+else:
+    trg_embedding = paddle.layer.embedding(
+        input=paddle.layer.data(
+            name="target_language_word",
+            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
+    group_inputs.append(trg_embedding)
+
+    decoder = paddle.layer.recurrent_group(
+        name=decoder_group_name,
+        step=gru_decoder_without_attention,
+        input=group_inputs)
+
+    lbl = paddle.layer.data(
+        name="target_language_next_word",
+        type=paddle.data_type.integer_value_sequence(target_dict_dim))
+    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+    return cost
 ```

 ## Data Preparation
@@ -250,13 +244,16 @@ parameters = paddle.parameters.create(cost)
 **b) Set the optimization strategy for training and define the training data `reader`**

 ```python
-# define optimize method and trainer
+# define optimization method
 optimizer = paddle.optimizer.RMSProp(
     learning_rate=1e-3,
     gradient_clipping_threshold=10.0,
     regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+# define the trainer instance
 trainer = paddle.trainer.SGD(
     cost=cost, parameters=parameters, update_equation=optimizer)
+
 # define data reader
 wmt14_reader = paddle.batch(
     paddle.reader.shuffle(
@@ -267,20 +264,19 @@
 **c) Define the event handler to print intermediate training results and save model snapshots**

 ```python
-# define event_handler callback
+# define the event_handler callback
 def event_handler(event):
     if isinstance(event, paddle.event.EndIteration):
-        if event.batch_id % 100 == 0 and event.batch_id > 0:
-            with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' %
-                           event.batch_id, 'w') as f:
+        if not event.batch_id % 100 and event.batch_id:
+            with gzip.open(
+                    os.path.join(save_dir_path,
+                                 "nmt_without_att_%05d_batch_%05d.tar.gz" %
+                                 (event.pass_id, event.batch_id)), "w") as f:
                 parameters.to_tar(f)

-        if event.batch_id % 10 == 0:
-            print "\nPass %d, Batch %d, Cost%f, %s" % (
-                event.pass_id, event.batch_id, event.cost, event.metrics)
-        else:
-            sys.stdout.write('.')
-            sys.stdout.flush()
+        if event.batch_id and not event.batch_id % 10:
+            logger.info("Pass %d, Batch %d, Cost %f, %s" % (
+                event.pass_id, event.batch_id, event.cost, event.metrics))
 ```

 **d) Start training**
@@ -342,26 +338,22 @@ beam_result = paddle.infer(

 **c) Load the source and target language dictionaries, convert sentences represented as `id` sequences back into the original languages, and print the results**

 ```python
-# get the dictionary
-src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
-
-# the delimited element of generated sequences is -1,
-# the first element of each generated sequence is the sequence length
-seq_list = []
-seq = []
-for w in beam_result[1]:
-    if w != -1:
-        seq.append(w)
-    else:
-        seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-        seq = []
-
-prob = beam_result[0]
-for i in xrange(len(gen_data)):
-    print "\n*******************************************************\n"
-    print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
+beam_result = inferer.infer(input=test_batch, field=["prob", "id"])
+
+gen_sen_idx = np.where(beam_result[1] == -1)[0]
+assert len(gen_sen_idx) == len(test_batch) * beam_size
+
+start_pos, end_pos = 1, 0
+for i, sample in enumerate(test_batch):
+    print(" ".join([
+        src_dict[w] for w in sample[0][1:-1]
+    ]))  # skip the start and end marks when printing the source sentence
     for j in xrange(beam_size):
-        print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+        end_pos = gen_sen_idx[i * beam_size + j]
+        print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
+            trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
+        start_pos = end_pos + 2
+    print("\n")
 ```

 Running generation is similar to running training; simply execute
@@ -369,23 +361,20 @@
 ```bash
 python generate.py
 ```
-and the translations for the test data are generated automatically.
-With the beam search width set to 3, for a French input sentence
-```text
-src: Elles connaissent leur entreprise mieux que personne .
-```
-
-the corresponding English translations are
+With the beam search width set to 3 and a French sentence as input, translations for the test data are generated automatically, in the following format:

 ```text
-prob = -3.754819: They know their business better than anyone .
-prob = -4.445528: They know their businesses better than anyone .
-prob = -5.026885: They know their business better than anybody .
-```
+Elles connaissent leur entreprise mieux que personne .
+-3.754819 They know their business better than anyone .
+-4.445528 They know their businesses better than anyone .
+-5.026885 They know their business better than anybody .
-* `prob` is the score of the generated sentence, followed by the translation it produced;
-* `<s>` marks the beginning of a sentence and `<e>` the end of one; any word not found in the dictionary is replaced by `<unk>`.
+```
+- The first line is the input source-language sentence.
+- Lines 2 through `beam_size + 1` are the `beam_size` translation candidates generated by beam search.
+  - Each of these lines contains two columns separated by "\t": the first column is the log probability of the sentence, and the second column is the text of the translation.
+  - `<s>` marks the beginning of a sentence and `<e>` the end of one; any word not found in the dictionary is replaced by `<unk>`.

 At this point we have implemented a basic machine translation model on PaddlePaddle. As we can see, PaddlePaddle provides a flexible and rich set of APIs that makes it easy to configure all kinds of complex networks. Machine translation itself is also a fast-moving field in which new methods and ideas keep emerging. After working through this example, interested readers can try to implement more sophisticated and better-performing translation models on top of PaddlePaddle.
diff --git a/nmt_without_attention/train.py b/nmt_without_attention/train.py
index 6b847fa8e04334407f127fb3a5ed78f610111fb1..9600df8e5b70cca90543062b040e6dddc540440c 100644
--- a/nmt_without_attention/train.py
+++ b/nmt_without_attention/train.py
@@ -1,56 +1,66 @@
 #!/usr/bin/env python
+import os
+import gzip
+import logging
+import paddle.v2 as paddle

-from network_conf import *
+from network_conf import seq2seq_net

+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)

-def train(source_dict_dim, target_dict_dim):
+
+def train(save_dir_path, source_dict_dim, target_dict_dim):
     '''
     Training function for NMT

+    :param save_dir_path: path of the directory to save the trained models.
+    :type save_dir_path: str
     :param source_dict_dim: size of source dictionary
     :type source_dict_dim: int
     :param target_dict_dim: size of target dictionary
     :type target_dict_dim: int
     '''
-    # initialize model
+    if not os.path.exists(save_dir_path):
+        os.mkdir(save_dir_path)
+
+    # initialize PaddlePaddle
     paddle.init(use_gpu=False, trainer_count=1)

     cost = seq2seq_net(source_dict_dim, target_dict_dim)
     parameters = paddle.parameters.create(cost)

-    # define optimize method and trainer
+    # define optimization method and the trainer instance
     optimizer = paddle.optimizer.RMSProp(
         learning_rate=1e-3,
         gradient_clipping_threshold=10.0,
         regularization=paddle.optimizer.L2Regularization(rate=8e-4))

     trainer = paddle.trainer.SGD(
         cost=cost, parameters=parameters, update_equation=optimizer)
+
     # define data reader
     wmt14_reader = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192),
         batch_size=8)

-    # define event_handler callback
+    # define the event_handler callback
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
-            if not event.batch_id % 500 and event.batch_id:
-                with gzip.open("models/nmt_without_att_params_batch_%05d.tar.gz"
-                               % event.batch_id, "w") as f:
+            if not event.batch_id % 100 and event.batch_id:
+                with gzip.open(
+                        os.path.join(save_dir_path,
+                                     "nmt_without_att_%05d_batch_%05d.tar.gz" %
+                                     (event.pass_id, event.batch_id)), "w") as f:
                     parameters.to_tar(f)

             if event.batch_id and not event.batch_id % 10:
-                print("\nPass %d, Batch %d, Cost %f, %s" %
-                      (event.pass_id, event.batch_id, event.cost,
-                       event.metrics))
-            else:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-
-    # start to train
+                logger.info("Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics))
+
+    # start training
     trainer.train(
         reader=wmt14_reader, event_handler=event_handler, num_passes=2)


 if __name__ == '__main__':
-    train(source_dict_dim=3000, target_dict_dim=3000)
+    train(save_dir_path="models", source_dict_dim=3000, target_dict_dim=3000)
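Once training has written snapshots under `models/`, they can be reloaded for generation the same way `generate.py` does. A minimal sketch (the file name below is illustrative, following the `nmt_without_att_%05d_batch_%05d.tar.gz` pattern used in `event_handler`):

```python
import gzip
import paddle.v2 as paddle

# illustrative snapshot name produced by the event handler above
model_path = "models/nmt_without_att_00000_batch_00100.tar.gz"

with gzip.open(model_path) as f:
    parameters = paddle.parameters.Parameters.from_tar(f)
```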