提交 ddaba7fb 编写于 作者: C caoying03

Merge branch 'develop' into refine_seq2seq

group: deprecated-2017Q2
language: cpp language: cpp
cache: ccache cache: ccache
sudo: required sudo: required
......
...@@ -8,8 +8,8 @@ abort(){ ...@@ -8,8 +8,8 @@ abort(){
unittest(){ unittest(){
cd $1 > /dev/null cd $1 > /dev/null
if [ -f "requirements.txt" ]; then if [ -f "setup.sh" ]; then
pip install -r requirements.txt sh setup.sh
fi fi
if [ $? != 0 ]; then if [ $? != 0 ]; then
exit 1 exit 1
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory. Please replace `$PADDLE_INSTALL_DIR` with your own paddle installation directory.
``` ```
pip install -r requirements.txt sh setup.sh
export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH
``` ```
......
...@@ -7,6 +7,7 @@ from __future__ import print_function ...@@ -7,6 +7,7 @@ from __future__ import print_function
import random import random
import numpy as np import numpy as np
import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils import utils from data_utils import utils
from data_utils.augmentor.augmentation import AugmentationPipeline from data_utils.augmentor.augmentation import AugmentationPipeline
...@@ -44,6 +45,8 @@ class DataGenerator(object): ...@@ -44,6 +45,8 @@ class DataGenerator(object):
:types max_freq: None|float :types max_freq: None|float
:param specgram_type: Specgram feature type. Options: 'linear'. :param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str :type specgram_type: str
:param num_threads: Number of CPU threads for processing data.
:type num_threads: int
:param random_seed: Random seed. :param random_seed: Random seed.
:type random_seed: int :type random_seed: int
""" """
...@@ -58,6 +61,7 @@ class DataGenerator(object): ...@@ -58,6 +61,7 @@ class DataGenerator(object):
window_ms=20.0, window_ms=20.0,
max_freq=None, max_freq=None,
specgram_type='linear', specgram_type='linear',
num_threads=multiprocessing.cpu_count(),
random_seed=0): random_seed=0):
self._max_duration = max_duration self._max_duration = max_duration
self._min_duration = min_duration self._min_duration = min_duration
...@@ -70,6 +74,7 @@ class DataGenerator(object): ...@@ -70,6 +74,7 @@ class DataGenerator(object):
stride_ms=stride_ms, stride_ms=stride_ms,
window_ms=window_ms, window_ms=window_ms,
max_freq=max_freq) max_freq=max_freq)
self._num_threads = num_threads
self._rng = random.Random(random_seed) self._rng = random.Random(random_seed)
self._epoch = 0 self._epoch = 0
...@@ -207,10 +212,14 @@ class DataGenerator(object): ...@@ -207,10 +212,14 @@ class DataGenerator(object):
def reader(): def reader():
for instance in manifest: for instance in manifest:
yield self._process_utterance(instance["audio_filepath"], yield instance
instance["text"])
return reader def mapper(instance):
return self._process_utterance(instance["audio_filepath"],
instance["text"])
return paddle.reader.xmap_readers(
mapper, reader, self._num_threads, 1024, order=True)
def _padding_batch(self, batch, padding_to=-1, flatten=False): def _padding_batch(self, batch, padding_to=-1, flatten=False):
""" """
......
...@@ -94,7 +94,7 @@ class SpeechSegment(AudioSegment): ...@@ -94,7 +94,7 @@ class SpeechSegment(AudioSegment):
return cls(samples, sample_rate, transcripts) return cls(samples, sample_rate, transcripts)
@classmethod @classmethod
def slice_from_file(cls, filepath, start=None, end=None, transcript): def slice_from_file(cls, filepath, transcript, start=None, end=None):
"""Loads a small section of an speech without having to load """Loads a small section of an speech without having to load
the entire file into the memory which can be incredibly wasteful. the entire file into the memory which can be incredibly wasteful.
......
...@@ -6,6 +6,7 @@ from __future__ import print_function ...@@ -6,6 +6,7 @@ from __future__ import print_function
import argparse import argparse
import gzip import gzip
import distutils.util import distutils.util
import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from model import deep_speech2 from model import deep_speech2
...@@ -38,6 +39,11 @@ parser.add_argument( ...@@ -38,6 +39,11 @@ parser.add_argument(
default=True, default=True,
type=distutils.util.strtobool, type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)") help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count(),
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--mean_std_filepath", "--mean_std_filepath",
default='mean_std.npz', default='mean_std.npz',
...@@ -67,7 +73,8 @@ def infer(): ...@@ -67,7 +73,8 @@ def infer():
data_generator = DataGenerator( data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_filepath,
augmentation_config='{}') augmentation_config='{}',
num_threads=args.num_threads_data)
# create network config # create network config
# paddle.data_type.dense_array is used for variable batch input. # paddle.data_type.dense_array is used for variable batch input.
......
SoundFile==0.9.0.post1 SoundFile==0.9.0.post1
wget==3.2 wget==3.2
scikits.samplerate==0.3.3 scipy==0.13.1
scipy==0.13.0b1
#!/bin/bash
# Installs the Python packages listed in requirements.txt, then builds
# libsamplerate 0.1.9 from source and installs its Python binding
# scikits.samplerate. Exits with status 1 as soon as any step fails.
#
# Keep this POSIX-sh compatible: the script is invoked as `sh setup.sh`,
# which bypasses the bash shebang.

# install python dependencies
if [ -f 'requirements.txt' ]; then
    pip install -r requirements.txt
    if [ $? != 0 ]; then
        echo "Install python dependencies failed !!!"
        exit 1
    fi
fi

# install scikits.samplerate
# -f: return non-zero on an HTTP error instead of saving the server's error
# page under the tarball's name and reporting success.
curl -f -O "http://www.mega-nerd.com/SRC/libsamplerate-0.1.9.tar.gz"
if [ $? != 0 ]; then
    echo "Download libsamplerate-0.1.9.tar.gz failed !!!"
    exit 1
fi

tar -xvf libsamplerate-0.1.9.tar.gz
if [ $? != 0 ]; then
    echo "Extract libsamplerate-0.1.9.tar.gz failed !!!"
    rm -rf libsamplerate-0.1.9
    rm -f libsamplerate-0.1.9.tar.gz
    exit 1
fi

# Build inside a subshell: the caller's working directory is never changed,
# and a failed `cd` can no longer cause ./configure to run in the wrong place.
# The status is saved so the temporary files are removed on success or failure.
(cd libsamplerate-0.1.9 && ./configure && make && make install)
build_status=$?
rm -rf libsamplerate-0.1.9
rm libsamplerate-0.1.9.tar.gz
if [ "$build_status" != 0 ]; then
    echo "Build libsamplerate-0.1.9 failed !!!"
    exit 1
fi

pip install scikits.samplerate==0.3.3
if [ $? != 0 ]; then
    echo "Install scikits.samplerate failed !!!"
    exit 1
fi

echo "Install all dependencies successfully."
...@@ -9,6 +9,7 @@ import argparse ...@@ -9,6 +9,7 @@ import argparse
import gzip import gzip
import time import time
import distutils.util import distutils.util
import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from model import deep_speech2 from model import deep_speech2
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
...@@ -52,6 +53,18 @@ parser.add_argument( ...@@ -52,6 +53,18 @@ parser.add_argument(
default=True, default=True,
type=distutils.util.strtobool, type=distutils.util.strtobool,
help="Use sortagrad or not. (default: %(default)s)") help="Use sortagrad or not. (default: %(default)s)")
parser.add_argument(
"--max_duration",
default=100.0,
type=float,
help="Audios with duration larger than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--min_duration",
default=0.0,
type=float,
help="Audios with duration smaller than this will be discarded. "
"(default: %(default)s)")
parser.add_argument( parser.add_argument(
"--shuffle_method", "--shuffle_method",
default='instance_shuffle', default='instance_shuffle',
...@@ -63,6 +76,11 @@ parser.add_argument( ...@@ -63,6 +76,11 @@ parser.add_argument(
default=4, default=4,
type=int, type=int,
help="Trainer number. (default: %(default)s)") help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count(),
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument( parser.add_argument(
"--mean_std_filepath", "--mean_std_filepath",
default='mean_std.npz', default='mean_std.npz',
...@@ -107,7 +125,10 @@ def train(): ...@@ -107,7 +125,10 @@ def train():
return DataGenerator( return DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_filepath,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_filepath,
augmentation_config=args.augmentation_config) augmentation_config=args.augmentation_config,
max_duration=args.max_duration,
min_duration=args.min_duration,
num_threads=args.num_threads_data)
train_generator = data_generator() train_generator = data_generator()
test_generator = data_generator() test_generator = data_generator()
......
...@@ -51,56 +51,41 @@ def rnn_lm(vocab_size, emb_dim, rnn_type, hidden_size, num_layer): ...@@ -51,56 +51,41 @@ def rnn_lm(vocab_size, emb_dim, rnn_type, hidden_size, num_layer):
return cost, output return cost, output
def ngram_lm(vocab_size, emb_dim, hidden_size, num_layer): def ngram_lm(vocab_size, emb_dim, hidden_size, num_layer, gram_num=4):
""" """
N-Gram language model definition. N-Gram language model definition.
:param vocab_size: size of vocab. :param vocab_size: size of vocab.
:param emb_dim: embedding vector's dimension. :param emb_dim: embedding vector's dimension.
:param hidden_size: size of unit. :param hidden_size: size of unit.
:param num_layer: layer number. :param num_layer: number of hidden layers.
:param gram_num: number of context words fed to the model as input.
:return: cost and output layer of model. :return: cost and output layer of model.
""" """
assert emb_dim > 0 and hidden_size > 0 and vocab_size > 0 and num_layer > 0 assert emb_dim > 0 and hidden_size > 0 and vocab_size > 0 and num_layer > 0
def wordemb(inlayer):
wordemb = paddle.layer.table_projection(
input=inlayer,
size=emb_dim,
param_attr=paddle.attr.Param(
name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0))
return wordemb
# input layers # input layers
first_word = paddle.layer.data( emb_layers = []
name="first_word", type=paddle.data_type.integer_value(vocab_size)) for i in range(gram_num):
second_word = paddle.layer.data( word = paddle.layer.data(
name="second_word", type=paddle.data_type.integer_value(vocab_size)) name="__word%02d__" % (i + 1),
third_word = paddle.layer.data( type=paddle.data_type.integer_value(vocab_size))
name="third_word", type=paddle.data_type.integer_value(vocab_size)) emb = paddle.layer.embedding(
fourth_word = paddle.layer.data( input=word,
name="fourth_word", type=paddle.data_type.integer_value(vocab_size)) size=emb_dim,
param_attr=paddle.attr.Param(name="_proj", initial_std=1e-3))
emb_layers.append(emb)
next_word = paddle.layer.data( next_word = paddle.layer.data(
name="next_word", type=paddle.data_type.integer_value(vocab_size)) name="__next_word__", type=paddle.data_type.integer_value(vocab_size))
# embedding layer
first_emb = wordemb(first_word)
second_emb = wordemb(second_word)
third_emb = wordemb(third_word)
fourth_emb = wordemb(fourth_word)
context_emb = paddle.layer.concat(
input=[first_emb, second_emb, third_emb, fourth_emb])
# hidden layer # hidden layer
hidden = paddle.layer.fc( for i in range(num_layer):
input=context_emb, size=hidden_size, act=paddle.activation.Relu())
for _ in range(num_layer - 1):
hidden = paddle.layer.fc( hidden = paddle.layer.fc(
input=hidden, size=hidden_size, act=paddle.activation.Relu()) input=hidden if i else paddle.layer.concat(input=emb_layers),
size=hidden_size,
act=paddle.activation.Relu())
# fc(full connected) and output layer
predict_word = paddle.layer.fc( predict_word = paddle.layer.fc(
input=[hidden], size=vocab_size, act=paddle.activation.Softmax()) input=[hidden], size=vocab_size, act=paddle.activation.Softmax())
......
# 神经网络机器翻译模型 # 神经网络机器翻译模型
## 背景介绍 ## 背景介绍
- PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。
机器翻译利用计算机将源语言转换成目标语言的同义表达,是自然语言处理中重要的研究方向,有着广泛的应用需求,其实现方式也经历了不断地演化。传统机器翻译方法主要基于规则或统计模型,需要人为地指定翻译规则或设计语言特征,效果依赖于人对源语言与目标语言的理解程度。近些年来,深度学习的提出与迅速发展使得特征的自动学习成为可能。深度学习首先在图像识别和语音识别中取得成功,进而在机器翻译等自然语言处理领域中掀起了研究热潮。机器翻译中的深度学习模型直接学习源语言到目标语言的映射,大为减少了学习过程中人的介入,同时显著地提高了翻译质量。本例介绍在PaddlePaddle中如何利用循环神经网络(Recurrent Neural Network, RNN)构建一个端到端(End-to-End)的神经网络机器翻译(Neural Machine Translation, NMT)模型。 机器翻译利用计算机将源语言转换成目标语言的同义表达,是自然语言处理中重要的研究方向,有着广泛的应用需求,其实现方式也经历了不断地演化。传统机器翻译方法主要基于规则或统计模型,需要人为地指定翻译规则或设计语言特征,效果依赖于人对源语言与目标语言的理解程度。近些年来,深度学习的提出与迅速发展使得特征的自动学习成为可能。深度学习首先在图像识别和语音识别中取得成功,进而在机器翻译等自然语言处理领域中掀起了研究热潮。机器翻译中的深度学习模型直接学习源语言到目标语言的映射,大为减少了学习过程中人的介入,同时显著地提高了翻译质量。本例介绍在PaddlePaddle中如何利用循环神经网络(Recurrent Neural Network, RNN)构建一个端到端(End-to-End)的神经网络机器翻译(Neural Machine Translation, NMT)模型。
## 模型概览 ## 模型概览
...@@ -53,14 +51,15 @@ RNN 的原始结构用一个向量来存储隐状态,然而这种结构的 RNN ...@@ -53,14 +51,15 @@ RNN 的原始结构用一个向量来存储隐状态,然而这种结构的 RNN
在 PaddlePaddle 中,双向编码器可以很方便地调用相关 APIs 实现: 在 PaddlePaddle 中,双向编码器可以很方便地调用相关 APIs 实现:
```python ```python
#### Encoder
src_word_id = paddle.layer.data( src_word_id = paddle.layer.data(
name='source_language_word', name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim)) type=paddle.data_type.integer_value_sequence(source_dict_dim))
# source embedding # source embedding
src_embedding = paddle.layer.embedding( src_embedding = paddle.layer.embedding(
input=src_word_id, size=word_vector_dim) input=src_word_id, size=word_vector_dim)
# use bidirectional_gru
# # bidirectional GRU as encoder
encoded_vector = paddle.networks.bidirectional_gru( encoded_vector = paddle.networks.bidirectional_gru(
input=src_embedding, input=src_embedding,
size=encoder_size, size=encoder_size,
...@@ -86,18 +85,17 @@ encoded_vector = paddle.networks.bidirectional_gru( ...@@ -86,18 +85,17 @@ encoded_vector = paddle.networks.bidirectional_gru(
### 无注意力机制的解码器 ### 无注意力机制的解码器
-PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。
对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下: 对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下:
```python ```python
#### Decoder # the initialization state for decoder GRU
encoder_last = paddle.layer.last_seq(input=encoded_vector) encoder_last = paddle.layer.last_seq(input=encoded_vector)
encoder_last_projected = paddle.layer.mixed( encoder_last_projected = paddle.layer.fc(
size=decoder_size, size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last)
act=paddle.activation.Tanh(),
input=paddle.layer.full_matrix_projection(input=encoder_last))
# gru step # the step function for decoder GRU
def gru_decoder_without_attention(enc_vec, current_word): def gru_decoder_without_attention(enc_vec, current_word):
''' '''
Step function for gru decoder Step function for gru decoder
...@@ -107,33 +105,29 @@ def gru_decoder_without_attention(enc_vec, current_word): ...@@ -107,33 +105,29 @@ def gru_decoder_without_attention(enc_vec, current_word):
:type current_word: layer object :type current_word: layer object
''' '''
decoder_mem = paddle.layer.memory( decoder_mem = paddle.layer.memory(
name='gru_decoder', name="gru_decoder",
size=decoder_size, size=decoder_size,
boot_layer=encoder_last_projected) boot_layer=encoder_last_projected)
context = paddle.layer.last_seq(input=enc_vec) context = paddle.layer.last_seq(input=enc_vec)
decoder_inputs = paddle.layer.mixed( decoder_inputs = paddle.layer.fc(
size=decoder_size * 3, size=decoder_size * 3, input=[context, current_word])
input=[
paddle.layer.full_matrix_projection(input=context),
paddle.layer.full_matrix_projection(input=current_word)
])
gru_step = paddle.layer.gru_step( gru_step = paddle.layer.gru_step(
name='gru_decoder', name="gru_decoder",
act=paddle.activation.Tanh(), act=paddle.activation.Tanh(),
gate_act=paddle.activation.Sigmoid(), gate_act=paddle.activation.Sigmoid(),
input=decoder_inputs, input=decoder_inputs,
output_mem=decoder_mem, output_mem=decoder_mem,
size=decoder_size) size=decoder_size)
out = paddle.layer.mixed( out = paddle.layer.fc(
size=target_dict_dim, size=target_dict_dim,
bias_attr=True, bias_attr=True,
act=paddle.activation.Softmax(), act=paddle.activation.Softmax(),
input=paddle.layer.full_matrix_projection(input=gru_step)) input=gru_step)
return out return out
``` ```
在模型训练和测试阶段,解码器的行为有很大的不同: 在模型训练和测试阶段,解码器的行为有很大的不同:
...@@ -144,34 +138,14 @@ def gru_decoder_without_attention(enc_vec, current_word): ...@@ -144,34 +138,14 @@ def gru_decoder_without_attention(enc_vec, current_word):
训练和生成的逻辑分别实现在如下的`if-else`条件分支中: 训练和生成的逻辑分别实现在如下的`if-else`条件分支中:
```python ```python
decoder_group_name = "decoder_group" group_input1 = paddle.layer.StaticInput(input=encoded_vector)
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_inputs = [group_input1] group_inputs = [group_input1]
if not generating:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
else:
decoder_group_name = "decoder_group"
if is_generating:
trg_embedding = paddle.layer.GeneratedInput( trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim, size=target_dict_dim,
embedding_name='_target_language_embedding', embedding_name="_target_language_embedding",
embedding_size=word_vector_dim) embedding_size=word_vector_dim)
group_inputs.append(trg_embedding) group_inputs.append(trg_embedding)
...@@ -185,6 +159,26 @@ else: ...@@ -185,6 +159,26 @@ else:
max_length=max_length) max_length=max_length)
return beam_gen return beam_gen
else:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name="target_language_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
group_inputs.append(trg_embedding)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs)
lbl = paddle.layer.data(
name="target_language_next_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
``` ```
## 数据准备 ## 数据准备
...@@ -208,13 +202,16 @@ parameters = paddle.parameters.create(cost) ...@@ -208,13 +202,16 @@ parameters = paddle.parameters.create(cost)
**b) 设定训练过程中的优化策略、定义训练数据读取 `reader`** **b) 设定训练过程中的优化策略、定义训练数据读取 `reader`**
```python ```python
# define optimize method and trainer # define optimization method
optimizer = paddle.optimizer.RMSProp( optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3, learning_rate=1e-3,
gradient_clipping_threshold=10.0, gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4)) regularization=paddle.optimizer.L2Regularization(rate=8e-4))
# define the trainer instance
trainer = paddle.trainer.SGD( trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer) cost=cost, parameters=parameters, update_equation=optimizer)
# define data reader # define data reader
wmt14_reader = paddle.batch( wmt14_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
...@@ -225,20 +222,19 @@ wmt14_reader = paddle.batch( ...@@ -225,20 +222,19 @@ wmt14_reader = paddle.batch(
**c) 定义事件句柄,打印训练中间结果、保存模型快照** **c) 定义事件句柄,打印训练中间结果、保存模型快照**
```python ```python
# define event_handler callback # define the event_handler callback
def event_handler(event): def event_handler(event):
if isinstance(event, paddle.event.EndIteration): if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0 and event.batch_id > 0: if not event.batch_id % 100 and event.batch_id:
with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' % with gzip.open(
event.batch_id, 'w') as f: os.path.join(save_path,
"nmt_without_att_%05d_batch_%05d.tar.gz" %
(event.pass_id, event.batch_id)), "w") as f:
parameters.to_tar(f) parameters.to_tar(f)
if event.batch_id % 10 == 0: if event.batch_id and not event.batch_id % 10:
print "\nPass %d, Batch %d, Cost%f, %s" % ( logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics) event.pass_id, event.batch_id, event.cost, event.metrics))
else:
sys.stdout.write('.')
sys.stdout.flush()
``` ```
**d) 开始训练** **d) 开始训练**
...@@ -300,26 +296,22 @@ beam_result = paddle.infer( ...@@ -300,26 +296,22 @@ beam_result = paddle.infer(
**c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果** **c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果**
```python ```python
# get the dictionary beam_result = inferer.infer(input=test_batch, field=["prob", "id"])
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
gen_sen_idx = np.where(beam_result[1] == -1)[0]
# the delimited element of generated sequences is -1, assert len(gen_sen_idx) == len(test_batch) * beam_size
# the first element of each generated sequence is the sequence length
seq_list = [] start_pos, end_pos = 1, 0
seq = [] for i, sample in enumerate(test_batch):
for w in beam_result[1]: print(" ".join([
if w != -1: src_dict[w] for w in sample[0][1:-1]
seq.append(w) ])) # skip the start and ending mark when print the source sentence
else:
seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
seq = []
prob = beam_result[0]
for i in xrange(len(gen_data)):
print "\n*******************************************************\n"
print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
for j in xrange(beam_size): for j in xrange(beam_size):
print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] end_pos = gen_sen_idx[i * beam_size + j]
print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
start_pos = end_pos + 2
print("\n")
``` ```
模型测试的执行与模型训练类似,只需执行 模型测试的执行与模型训练类似,只需执行
...@@ -327,23 +319,20 @@ for i in xrange(len(gen_data)): ...@@ -327,23 +319,20 @@ for i in xrange(len(gen_data)):
```bash ```bash
python generate.py python generate.py
``` ```
则自动为测试数据生成了对应的翻译结果。
设置beam search的宽度为3,输入某个法文句子
```text 设置beam search的宽度为3,输入为一个法文句子,则自动为测试数据生成对应的翻译结果,输出格式如下:
src: <s> Elles connaissent leur entreprise mieux que personne . <e>
```
其对应的英文翻译结果为
```text ```text
prob = -3.754819: They know their business better than anyone . <e> Elles connaissent leur entreprise mieux que personne .
prob = -4.445528: They know their businesses better than anyone . <e> -3.754819 They know their business better than anyone . <e>
prob = -5.026885: They know their business better than anybody . <e> -4.445528 They know their businesses better than anyone . <e>
``` -5.026885 They know their business better than anybody . <e>
* `prob`表示生成句子的得分,随之其后则是翻译生成的句子; ```
* `<s>` 表示句子的开始,`<e>`表示一个句子的结束,如果出现了在词典中未包含的词,则用`<unk>`替代。 - 第一行为输入的源语言句子。
- 第二 ~ `beam_size + 1` 行是柱搜索生成的 `beam_size` 条翻译结果
- 一行之内以“\t”分隔为两列,第一列是句子的log 概率,第二列是翻译结果的文本。
- `<s>` 表示句子的开始,`<e>`表示一个句子的结束,如果出现了在词典中未包含的词,则用`<unk>`替代。
至此,我们在 PaddlePaddle 上实现了一个初步的机器翻译模型。我们可以看到,PaddlePaddle 提供了灵活丰富的API供大家选择和使用,使得我们能够很方便完成各种复杂网络的配置。机器翻译本身也是个快速发展的领域,各种新方法新思想在不断涌现。在学习完本例后,读者若有兴趣和余力,可基于 PaddlePaddle 平台实现更为复杂、性能更优的机器翻译模型。 至此,我们在 PaddlePaddle 上实现了一个初步的机器翻译模型。我们可以看到,PaddlePaddle 提供了灵活丰富的API供大家选择和使用,使得我们能够很方便完成各种复杂网络的配置。机器翻译本身也是个快速发展的领域,各种新方法新思想在不断涌现。在学习完本例后,读者若有兴趣和余力,可基于 PaddlePaddle 平台实现更为复杂、性能更优的机器翻译模型。
......
#!/usr/bin/env python #!/usr/bin/env python
import os import os
from network_conf import * import logging
import numpy as np
from network_conf import seq2seq_net
logger = logging.getLogger("paddle")
logger.setLevel(logging.WARNING)
def infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict): def infer_a_batch(inferer, test_batch, beam_size, src_dict, trg_dict):
beam_result = inferer.infer(input=test_batch, field=["prob", "id"]) beam_result = inferer.infer(input=test_batch, field=["prob", "id"])
# the delimited element of generated sequences is -1, gen_sen_idx = np.where(beam_result[1] == -1)[0]
# the first element of each generated sequence is the sequence length assert len(gen_sen_idx) == len(test_batch) * beam_size
seq_list, seq = [], []
for w in beam_result[1]: start_pos, end_pos = 1, 0
if w != -1:
seq.append(w)
else:
seq_list.append(" ".join([trg_dict.get(w) for w in seq[1:]]))
seq = []
prob = beam_result[0]
for i, sample in enumerate(test_batch): for i, sample in enumerate(test_batch):
print("src:", " ".join([src_dict.get(w) for w in sample[0]]), "\n") print(" ".join([
src_dict[w] for w in sample[0][1:-1]
])) # skip the start and ending mark when print the source sentence
for j in xrange(beam_size): for j in xrange(beam_size):
print("prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]) end_pos = gen_sen_idx[i * beam_size + j]
print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
start_pos = end_pos + 2
print("\n") print("\n")
def generate(source_dict_dim, target_dict_dim, model_path, batch_size): def generate(source_dict_dim, target_dict_dim, model_path, beam_size,
batch_size):
""" """
Generating function for NMT sequence generation for NMT
:param source_dict_dim: size of source dictionary :param source_dict_dim: size of source dictionary
:type source_dict_dim: int :type source_dict_dim: int
...@@ -34,16 +39,19 @@ def generate(source_dict_dim, target_dict_dim, model_path, batch_size): ...@@ -34,16 +39,19 @@ def generate(source_dict_dim, target_dict_dim, model_path, batch_size):
:type target_dict_dim: int :type target_dict_dim: int
:param model_path: path for inital model :param model_path: path for inital model
:type model_path: string :type model_path: string
:param beam_size: the expansion width in each generation step
:type beam_size: int
:param batch_size: the number of training examples in one forward pass
:type batch_size: int
""" """
assert os.path.exists(model_path), "trained model does not exist." assert os.path.exists(model_path), "trained model does not exist."
# step 1: prepare dictionary # step 1: prepare dictionary
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim) src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
beam_size = 5
# step 2: load the trained model # step 2: load the trained model
paddle.init(use_gpu=True, trainer_count=1) paddle.init(use_gpu=False, trainer_count=1)
with gzip.open(model_path) as f: with gzip.open(model_path) as f:
parameters = paddle.parameters.Parameters.from_tar(f) parameters = paddle.parameters.Parameters.from_tar(f)
beam_gen = seq2seq_net( beam_gen = seq2seq_net(
...@@ -72,5 +80,6 @@ if __name__ == "__main__": ...@@ -72,5 +80,6 @@ if __name__ == "__main__":
generate( generate(
source_dict_dim=3000, source_dict_dim=3000,
target_dict_dim=3000, target_dict_dim=3000,
batch_size=5, batch_size=20,
model_path="models/nmt_without_att_params_batch_00001.tar.gz") beam_size=5,
model_path="models/nmt_without_att_params_batch_00347.tar.gz")
...@@ -43,8 +43,6 @@ ...@@ -43,8 +43,6 @@
# 神经网络机器翻译模型 # 神经网络机器翻译模型
## 背景介绍 ## 背景介绍
- PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。
机器翻译利用计算机将源语言转换成目标语言的同义表达,是自然语言处理中重要的研究方向,有着广泛的应用需求,其实现方式也经历了不断地演化。传统机器翻译方法主要基于规则或统计模型,需要人为地指定翻译规则或设计语言特征,效果依赖于人对源语言与目标语言的理解程度。近些年来,深度学习的提出与迅速发展使得特征的自动学习成为可能。深度学习首先在图像识别和语音识别中取得成功,进而在机器翻译等自然语言处理领域中掀起了研究热潮。机器翻译中的深度学习模型直接学习源语言到目标语言的映射,大为减少了学习过程中人的介入,同时显著地提高了翻译质量。本例介绍在PaddlePaddle中如何利用循环神经网络(Recurrent Neural Network, RNN)构建一个端到端(End-to-End)的神经网络机器翻译(Neural Machine Translation, NMT)模型。 机器翻译利用计算机将源语言转换成目标语言的同义表达,是自然语言处理中重要的研究方向,有着广泛的应用需求,其实现方式也经历了不断地演化。传统机器翻译方法主要基于规则或统计模型,需要人为地指定翻译规则或设计语言特征,效果依赖于人对源语言与目标语言的理解程度。近些年来,深度学习的提出与迅速发展使得特征的自动学习成为可能。深度学习首先在图像识别和语音识别中取得成功,进而在机器翻译等自然语言处理领域中掀起了研究热潮。机器翻译中的深度学习模型直接学习源语言到目标语言的映射,大为减少了学习过程中人的介入,同时显著地提高了翻译质量。本例介绍在PaddlePaddle中如何利用循环神经网络(Recurrent Neural Network, RNN)构建一个端到端(End-to-End)的神经网络机器翻译(Neural Machine Translation, NMT)模型。
## 模型概览 ## 模型概览
...@@ -95,14 +93,15 @@ RNN 的原始结构用一个向量来存储隐状态,然而这种结构的 RNN ...@@ -95,14 +93,15 @@ RNN 的原始结构用一个向量来存储隐状态,然而这种结构的 RNN
在 PaddlePaddle 中,双向编码器可以很方便地调用相关 APIs 实现: 在 PaddlePaddle 中,双向编码器可以很方便地调用相关 APIs 实现:
```python ```python
#### Encoder
src_word_id = paddle.layer.data( src_word_id = paddle.layer.data(
name='source_language_word', name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim)) type=paddle.data_type.integer_value_sequence(source_dict_dim))
# source embedding # source embedding
src_embedding = paddle.layer.embedding( src_embedding = paddle.layer.embedding(
input=src_word_id, size=word_vector_dim) input=src_word_id, size=word_vector_dim)
# use bidirectional_gru
# # bidirectional GRU as encoder
encoded_vector = paddle.networks.bidirectional_gru( encoded_vector = paddle.networks.bidirectional_gru(
input=src_embedding, input=src_embedding,
size=encoder_size, size=encoder_size,
...@@ -128,18 +127,17 @@ encoded_vector = paddle.networks.bidirectional_gru( ...@@ -128,18 +127,17 @@ encoded_vector = paddle.networks.bidirectional_gru(
### 无注意力机制的解码器 ### 无注意力机制的解码器
-PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。
对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下: 对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下:
```python ```python
#### Decoder # the initialization state for decoder GRU
encoder_last = paddle.layer.last_seq(input=encoded_vector) encoder_last = paddle.layer.last_seq(input=encoded_vector)
encoder_last_projected = paddle.layer.mixed( encoder_last_projected = paddle.layer.fc(
size=decoder_size, size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last)
act=paddle.activation.Tanh(),
input=paddle.layer.full_matrix_projection(input=encoder_last))
# gru step # the step function for decoder GRU
def gru_decoder_without_attention(enc_vec, current_word): def gru_decoder_without_attention(enc_vec, current_word):
''' '''
Step function for gru decoder Step function for gru decoder
...@@ -149,33 +147,29 @@ def gru_decoder_without_attention(enc_vec, current_word): ...@@ -149,33 +147,29 @@ def gru_decoder_without_attention(enc_vec, current_word):
:type current_word: layer object :type current_word: layer object
''' '''
decoder_mem = paddle.layer.memory( decoder_mem = paddle.layer.memory(
name='gru_decoder', name="gru_decoder",
size=decoder_size, size=decoder_size,
boot_layer=encoder_last_projected) boot_layer=encoder_last_projected)
context = paddle.layer.last_seq(input=enc_vec) context = paddle.layer.last_seq(input=enc_vec)
decoder_inputs = paddle.layer.mixed( decoder_inputs = paddle.layer.fc(
size=decoder_size * 3, size=decoder_size * 3, input=[context, current_word])
input=[
paddle.layer.full_matrix_projection(input=context),
paddle.layer.full_matrix_projection(input=current_word)
])
gru_step = paddle.layer.gru_step( gru_step = paddle.layer.gru_step(
name='gru_decoder', name="gru_decoder",
act=paddle.activation.Tanh(), act=paddle.activation.Tanh(),
gate_act=paddle.activation.Sigmoid(), gate_act=paddle.activation.Sigmoid(),
input=decoder_inputs, input=decoder_inputs,
output_mem=decoder_mem, output_mem=decoder_mem,
size=decoder_size) size=decoder_size)
out = paddle.layer.mixed( out = paddle.layer.fc(
size=target_dict_dim, size=target_dict_dim,
bias_attr=True, bias_attr=True,
act=paddle.activation.Softmax(), act=paddle.activation.Softmax(),
input=paddle.layer.full_matrix_projection(input=gru_step)) input=gru_step)
return out return out
``` ```
在模型训练和测试阶段,解码器的行为有很大的不同: 在模型训练和测试阶段,解码器的行为有很大的不同:
...@@ -186,34 +180,14 @@ def gru_decoder_without_attention(enc_vec, current_word): ...@@ -186,34 +180,14 @@ def gru_decoder_without_attention(enc_vec, current_word):
训练和生成的逻辑分别实现在如下的`if-else`条件分支中: 训练和生成的逻辑分别实现在如下的`if-else`条件分支中:
```python ```python
decoder_group_name = "decoder_group" group_input1 = paddle.layer.StaticInput(input=encoded_vector)
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_inputs = [group_input1] group_inputs = [group_input1]
if not generating:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
else:
decoder_group_name = "decoder_group"
if is_generating:
trg_embedding = paddle.layer.GeneratedInput( trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim, size=target_dict_dim,
embedding_name='_target_language_embedding', embedding_name="_target_language_embedding",
embedding_size=word_vector_dim) embedding_size=word_vector_dim)
group_inputs.append(trg_embedding) group_inputs.append(trg_embedding)
...@@ -227,6 +201,26 @@ else: ...@@ -227,6 +201,26 @@ else:
max_length=max_length) max_length=max_length)
return beam_gen return beam_gen
else:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name="target_language_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
group_inputs.append(trg_embedding)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs)
lbl = paddle.layer.data(
name="target_language_next_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
``` ```
## 数据准备 ## 数据准备
...@@ -250,13 +244,16 @@ parameters = paddle.parameters.create(cost) ...@@ -250,13 +244,16 @@ parameters = paddle.parameters.create(cost)
**b) 设定训练过程中的优化策略、定义训练数据读取 `reader`** **b) 设定训练过程中的优化策略、定义训练数据读取 `reader`**
```python ```python
# define optimize method and trainer # define optimization method
optimizer = paddle.optimizer.RMSProp( optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3, learning_rate=1e-3,
gradient_clipping_threshold=10.0, gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4)) regularization=paddle.optimizer.L2Regularization(rate=8e-4))
# define the trainer instance
trainer = paddle.trainer.SGD( trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer) cost=cost, parameters=parameters, update_equation=optimizer)
# define data reader # define data reader
wmt14_reader = paddle.batch( wmt14_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
...@@ -267,20 +264,19 @@ wmt14_reader = paddle.batch( ...@@ -267,20 +264,19 @@ wmt14_reader = paddle.batch(
**c) 定义事件句柄,打印训练中间结果、保存模型快照** **c) 定义事件句柄,打印训练中间结果、保存模型快照**
```python ```python
# define event_handler callback # define the event_handler callback
def event_handler(event): def event_handler(event):
if isinstance(event, paddle.event.EndIteration): if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0 and event.batch_id > 0: if not event.batch_id % 100 and event.batch_id:
with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' % with gzip.open(
event.batch_id, 'w') as f: os.path.join(save_path,
"nmt_without_att_%05d_batch_%05d.tar.gz" %
event.pass_id, event.batch_id), "w") as f:
parameters.to_tar(f) parameters.to_tar(f)
if event.batch_id % 10 == 0: if event.batch_id and not event.batch_id % 10:
print "\nPass %d, Batch %d, Cost%f, %s" % ( logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics) event.pass_id, event.batch_id, event.cost, event.metrics))
else:
sys.stdout.write('.')
sys.stdout.flush()
``` ```
**d) 开始训练** **d) 开始训练**
...@@ -342,26 +338,22 @@ beam_result = paddle.infer( ...@@ -342,26 +338,22 @@ beam_result = paddle.infer(
**c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果** **c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果**
```python ```python
# get the dictionary beam_result = inferer.infer(input=test_batch, field=["prob", "id"])
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
gen_sen_idx = np.where(beam_result[1] == -1)[0]
# the delimited element of generated sequences is -1, assert len(gen_sen_idx) == len(test_batch) * beam_size
# the first element of each generated sequence is the sequence length
seq_list = [] start_pos, end_pos = 1, 0
seq = [] for i, sample in enumerate(test_batch):
for w in beam_result[1]: print(" ".join([
if w != -1: src_dict[w] for w in sample[0][1:-1]
seq.append(w) ])) # skip the start and end marks when printing the source sentence
else:
seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
seq = []
prob = beam_result[0]
for i in xrange(len(gen_data)):
print "\n*******************************************************\n"
print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
for j in xrange(beam_size): for j in xrange(beam_size):
print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j] end_pos = gen_sen_idx[i * beam_size + j]
print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
start_pos = end_pos + 2
print("\n")
``` ```
模型测试的执行与模型训练类似,只需执行 模型测试的执行与模型训练类似,只需执行
...@@ -369,23 +361,20 @@ for i in xrange(len(gen_data)): ...@@ -369,23 +361,20 @@ for i in xrange(len(gen_data)):
```bash ```bash
python generate.py python generate.py
``` ```
则自动为测试数据生成了对应的翻译结果。
设置beam search的宽度为3,输入某个法文句子
```text 设置beam search的宽度为3,输入为一个法文句子,则自动为测试数据生成对应的翻译结果,输出格式如下:
src: <s> Elles connaissent leur entreprise mieux que personne . <e>
```
其对应的英文翻译结果为
```text ```text
prob = -3.754819: They know their business better than anyone . <e> Elles connaissent leur entreprise mieux que personne .
prob = -4.445528: They know their businesses better than anyone . <e> -3.754819 They know their business better than anyone . <e>
prob = -5.026885: They know their business better than anybody . <e> -4.445528 They know their businesses better than anyone . <e>
``` -5.026885 They know their business better than anybody . <e>
* `prob`表示生成句子的得分,随之其后则是翻译生成的句子; ```
* `<s>` 表示句子的开始,`<e>`表示一个句子的结束,如果出现了在词典中未包含的词,则用`<unk>`替代。 - 第一行为输入的源语言句子。
- 第二 ~ `beam_size + 1` 行是柱搜索生成的 `beam_size` 条翻译结果。
- 一行之内以“\t”分隔为两列,第一列是句子的 log 概率,第二列是翻译结果的文本。
- `<s>` 表示句子的开始,`<e>`表示一个句子的结束,如果出现了在词典中未包含的词,则用`<unk>`替代。
至此,我们在 PaddlePaddle 上实现了一个初步的机器翻译模型。我们可以看到,PaddlePaddle 提供了灵活丰富的API供大家选择和使用,使得我们能够很方便完成各种复杂网络的配置。机器翻译本身也是个快速发展的领域,各种新方法新思想在不断涌现。在学习完本例后,读者若有兴趣和余力,可基于 PaddlePaddle 平台实现更为复杂、性能更优的机器翻译模型。 至此,我们在 PaddlePaddle 上实现了一个初步的机器翻译模型。我们可以看到,PaddlePaddle 提供了灵活丰富的API供大家选择和使用,使得我们能够很方便完成各种复杂网络的配置。机器翻译本身也是个快速发展的领域,各种新方法新思想在不断涌现。在学习完本例后,读者若有兴趣和余力,可基于 PaddlePaddle 平台实现更为复杂、性能更优的机器翻译模型。
......
#!/usr/bin/env python #!/usr/bin/env python
import os
import logging
import paddle.v2 as paddle
from network_conf import * from network_conf import seq2seq_net
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def train(source_dict_dim, target_dict_dim):
def train(save_dir_path, source_dict_dim, target_dict_dim):
''' '''
Training function for NMT Training function for NMT
:param save_dir_path: path of the directory to save the trained models.
:type save_dir_path: str
:param source_dict_dim: size of source dictionary :param source_dict_dim: size of source dictionary
:type source_dict_dim: int :type source_dict_dim: int
:param target_dict_dim: size of target dictionary :param target_dict_dim: size of target dictionary
:type target_dict_dim: int :type target_dict_dim: int
''' '''
# initialize model if not os.path.exists(save_dir_path):
os.mkdir(save_dir_path)
# initialize PaddlePaddle
paddle.init(use_gpu=False, trainer_count=1) paddle.init(use_gpu=False, trainer_count=1)
cost = seq2seq_net(source_dict_dim, target_dict_dim) cost = seq2seq_net(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost) parameters = paddle.parameters.create(cost)
# define optimize method and trainer # define optimization method and the trainer instance
optimizer = paddle.optimizer.RMSProp( optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3, learning_rate=1e-3,
gradient_clipping_threshold=10.0, gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4)) regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD( trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer) cost=cost, parameters=parameters, update_equation=optimizer)
# define data reader # define data reader
wmt14_reader = paddle.batch( wmt14_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192), paddle.dataset.wmt14.train(source_dict_dim), buf_size=8192),
batch_size=8) batch_size=8)
# define event_handler callback # define the event_handler callback
def event_handler(event): def event_handler(event):
if isinstance(event, paddle.event.EndIteration): if isinstance(event, paddle.event.EndIteration):
if not event.batch_id % 500 and event.batch_id: if not event.batch_id % 100 and event.batch_id:
with gzip.open("models/nmt_without_att_params_batch_%05d.tar.gz" with gzip.open(
% event.batch_id, "w") as f: os.path.join(save_path,
"nmt_without_att_%05d_batch_%05d.tar.gz" %
event.pass_id, event.batch_id), "w") as f:
parameters.to_tar(f) parameters.to_tar(f)
if event.batch_id and not event.batch_id % 10: if event.batch_id and not event.batch_id % 10:
print("\nPass %d, Batch %d, Cost %f, %s" % logger.info("Pass %d, Batch %d, Cost %f, %s" % (
(event.pass_id, event.batch_id, event.cost, event.pass_id, event.batch_id, event.cost, event.metrics))
event.metrics))
else: # start training
sys.stdout.write('.')
sys.stdout.flush()
# start to train
trainer.train( trainer.train(
reader=wmt14_reader, event_handler=event_handler, num_passes=2) reader=wmt14_reader, event_handler=event_handler, num_passes=2)
if __name__ == '__main__': if __name__ == '__main__':
train(source_dict_dim=3000, target_dict_dim=3000) train(save_dir_path="models", source_dict_dim=3000, target_dict_dim=3000)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册