Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
models
提交
ddaba7fb
M
models
项目概览
PaddlePaddle
/
models
1 年多 前同步成功
通知
222
Star
6828
Fork
2962
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
602
列表
看板
标记
里程碑
合并请求
255
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
models
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
602
Issue
602
列表
看板
标记
里程碑
合并请求
255
合并请求
255
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
ddaba7fb
编写于
6月 22, 2017
作者:
C
caoying03
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' into refine_seq2seq
上级
555e0899
1aaee801
变更
14
隐藏空白更改
内联
并排
Showing
14 changed file
with
310 addition
and
261 deletion
+310
-261
.travis.yml
.travis.yml
+1
-0
.travis/unittest.sh
.travis/unittest.sh
+2
-2
deep_speech_2/README.md
deep_speech_2/README.md
+1
-1
deep_speech_2/data_utils/data.py
deep_speech_2/data_utils/data.py
+12
-3
deep_speech_2/data_utils/speech.py
deep_speech_2/data_utils/speech.py
+1
-1
deep_speech_2/infer.py
deep_speech_2/infer.py
+8
-1
deep_speech_2/requirements.txt
deep_speech_2/requirements.txt
+1
-2
deep_speech_2/setup.sh
deep_speech_2/setup.sh
+30
-0
deep_speech_2/train.py
deep_speech_2/train.py
+22
-1
language_model/network_conf.py
language_model/network_conf.py
+18
-33
nmt_without_attention/README.md
nmt_without_attention/README.md
+79
-90
nmt_without_attention/generate.py
nmt_without_attention/generate.py
+29
-20
nmt_without_attention/index.html
nmt_without_attention/index.html
+79
-90
nmt_without_attention/train.py
nmt_without_attention/train.py
+27
-17
未找到文件。
.travis.yml
浏览文件 @
ddaba7fb
group
:
deprecated-2017Q2
language
:
cpp
cache
:
ccache
sudo
:
required
...
...
.travis/unittest.sh
浏览文件 @
ddaba7fb
...
...
@@ -8,8 +8,8 @@ abort(){
unittest
(){
cd
$1
>
/dev/null
if
[
-f
"
requirements.txt
"
]
;
then
pip
install
-r
requirements.txt
if
[
-f
"
setup.sh
"
]
;
then
sh setup.sh
fi
if
[
$?
!=
0
]
;
then
exit
1
...
...
deep_speech_2/README.md
浏览文件 @
ddaba7fb
...
...
@@ -5,7 +5,7 @@
Please replace
`$PADDLE_INSTALL_DIR`
with your own paddle installation directory.
```
pip install -r requirements.txt
sh setup.sh
export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH
```
...
...
deep_speech_2/data_utils/data.py
浏览文件 @
ddaba7fb
...
...
@@ -7,6 +7,7 @@ from __future__ import print_function
import
random
import
numpy
as
np
import
multiprocessing
import
paddle.v2
as
paddle
from
data_utils
import
utils
from
data_utils.augmentor.augmentation
import
AugmentationPipeline
...
...
@@ -44,6 +45,8 @@ class DataGenerator(object):
:types max_freq: None|float
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param num_threads: Number of CPU threads for processing data.
:type num_threads: int
:param random_seed: Random seed.
:type random_seed: int
"""
...
...
@@ -58,6 +61,7 @@ class DataGenerator(object):
window_ms
=
20.0
,
max_freq
=
None
,
specgram_type
=
'linear'
,
num_threads
=
multiprocessing
.
cpu_count
(),
random_seed
=
0
):
self
.
_max_duration
=
max_duration
self
.
_min_duration
=
min_duration
...
...
@@ -70,6 +74,7 @@ class DataGenerator(object):
stride_ms
=
stride_ms
,
window_ms
=
window_ms
,
max_freq
=
max_freq
)
self
.
_num_threads
=
num_threads
self
.
_rng
=
random
.
Random
(
random_seed
)
self
.
_epoch
=
0
...
...
@@ -207,10 +212,14 @@ class DataGenerator(object):
def
reader
():
for
instance
in
manifest
:
yield
self
.
_process_utterance
(
instance
[
"audio_filepath"
],
instance
[
"text"
])
yield
instance
return
reader
def
mapper
(
instance
):
return
self
.
_process_utterance
(
instance
[
"audio_filepath"
],
instance
[
"text"
])
return
paddle
.
reader
.
xmap_readers
(
mapper
,
reader
,
self
.
_num_threads
,
1024
,
order
=
True
)
def
_padding_batch
(
self
,
batch
,
padding_to
=-
1
,
flatten
=
False
):
"""
...
...
deep_speech_2/data_utils/speech.py
浏览文件 @
ddaba7fb
...
...
@@ -94,7 +94,7 @@ class SpeechSegment(AudioSegment):
return
cls
(
samples
,
sample_rate
,
transcripts
)
@
classmethod
def
slice_from_file
(
cls
,
filepath
,
start
=
None
,
end
=
None
,
transcript
):
def
slice_from_file
(
cls
,
filepath
,
transcript
,
start
=
None
,
end
=
None
):
"""Loads a small section of an speech without having to load
the entire file into the memory which can be incredibly wasteful.
...
...
deep_speech_2/infer.py
浏览文件 @
ddaba7fb
...
...
@@ -6,6 +6,7 @@ from __future__ import print_function
import
argparse
import
gzip
import
distutils.util
import
multiprocessing
import
paddle.v2
as
paddle
from
data_utils.data
import
DataGenerator
from
model
import
deep_speech2
...
...
@@ -38,6 +39,11 @@ parser.add_argument(
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use gpu or not. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_threads_data"
,
default
=
multiprocessing
.
cpu_count
(),
type
=
int
,
help
=
"Number of cpu threads for preprocessing data. (default: %(default)s)"
)
parser
.
add_argument
(
"--mean_std_filepath"
,
default
=
'mean_std.npz'
,
...
...
@@ -67,7 +73,8 @@ def infer():
data_generator
=
DataGenerator
(
vocab_filepath
=
args
.
vocab_filepath
,
mean_std_filepath
=
args
.
mean_std_filepath
,
augmentation_config
=
'{}'
)
augmentation_config
=
'{}'
,
num_threads
=
args
.
num_threads_data
)
# create network config
# paddle.data_type.dense_array is used for variable batch input.
...
...
deep_speech_2/requirements.txt
浏览文件 @
ddaba7fb
SoundFile==0.9.0.post1
wget==3.2
scikits.samplerate==0.3.3
scipy==0.13.0b1
scipy==0.13.1
deep_speech_2/setup.sh
0 → 100644
浏览文件 @
ddaba7fb
#!/bin/bash
# install python dependencies
if
[
-f
'requirements.txt'
]
;
then
pip
install
-r
requirements.txt
fi
if
[
$?
!=
0
]
;
then
echo
"Install python dependencies failed !!!"
exit
1
fi
# install scikits.samplerate
curl
-O
"http://www.mega-nerd.com/SRC/libsamplerate-0.1.9.tar.gz"
if
[
$?
!=
0
]
;
then
echo
"Download libsamplerate-0.1.9.tar.gz failed !!!"
exit
1
fi
tar
-xvf
libsamplerate-0.1.9.tar.gz
cd
libsamplerate-0.1.9
./configure
&&
make
&&
make
install
cd
-
rm
-rf
libsamplerate-0.1.9
rm
libsamplerate-0.1.9.tar.gz
pip
install
scikits.samplerate
==
0.3.3
if
[
$?
!=
0
]
;
then
echo
"Install scikits.samplerate failed !!!"
exit
1
fi
echo
"Install all dependencies successfully."
deep_speech_2/train.py
浏览文件 @
ddaba7fb
...
...
@@ -9,6 +9,7 @@ import argparse
import
gzip
import
time
import
distutils.util
import
multiprocessing
import
paddle.v2
as
paddle
from
model
import
deep_speech2
from
data_utils.data
import
DataGenerator
...
...
@@ -52,6 +53,18 @@ parser.add_argument(
default
=
True
,
type
=
distutils
.
util
.
strtobool
,
help
=
"Use sortagrad or not. (default: %(default)s)"
)
parser
.
add_argument
(
"--max_duration"
,
default
=
100.0
,
type
=
float
,
help
=
"Audios with duration larger than this will be discarded. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--min_duration"
,
default
=
0.0
,
type
=
float
,
help
=
"Audios with duration smaller than this will be discarded. "
"(default: %(default)s)"
)
parser
.
add_argument
(
"--shuffle_method"
,
default
=
'instance_shuffle'
,
...
...
@@ -63,6 +76,11 @@ parser.add_argument(
default
=
4
,
type
=
int
,
help
=
"Trainer number. (default: %(default)s)"
)
parser
.
add_argument
(
"--num_threads_data"
,
default
=
multiprocessing
.
cpu_count
(),
type
=
int
,
help
=
"Number of cpu threads for preprocessing data. (default: %(default)s)"
)
parser
.
add_argument
(
"--mean_std_filepath"
,
default
=
'mean_std.npz'
,
...
...
@@ -107,7 +125,10 @@ def train():
return
DataGenerator
(
vocab_filepath
=
args
.
vocab_filepath
,
mean_std_filepath
=
args
.
mean_std_filepath
,
augmentation_config
=
args
.
augmentation_config
)
augmentation_config
=
args
.
augmentation_config
,
max_duration
=
args
.
max_duration
,
min_duration
=
args
.
min_duration
,
num_threads
=
args
.
num_threads_data
)
train_generator
=
data_generator
()
test_generator
=
data_generator
()
...
...
language_model/network_conf.py
浏览文件 @
ddaba7fb
...
...
@@ -51,56 +51,41 @@ def rnn_lm(vocab_size, emb_dim, rnn_type, hidden_size, num_layer):
return
cost
,
output
def
ngram_lm
(
vocab_size
,
emb_dim
,
hidden_size
,
num_layer
):
def
ngram_lm
(
vocab_size
,
emb_dim
,
hidden_size
,
num_layer
,
gram_num
=
4
):
"""
N-Gram language model definition.
:param vocab_size: size of vocab.
:param emb_dim: embedding vector's dimension.
:param hidden_size: size of unit.
:param num_layer: layer number.
:param num_layer: number of hidden layers.
:param gram_num: gram number in n-gram method
:return: cost and output layer of model.
"""
assert
emb_dim
>
0
and
hidden_size
>
0
and
vocab_size
>
0
and
num_layer
>
0
def
wordemb
(
inlayer
):
wordemb
=
paddle
.
layer
.
table_projection
(
input
=
inlayer
,
size
=
emb_dim
,
param_attr
=
paddle
.
attr
.
Param
(
name
=
"_proj"
,
initial_std
=
0.001
,
learning_rate
=
1
,
l2_rate
=
0
))
return
wordemb
# input layers
first_word
=
paddle
.
layer
.
data
(
name
=
"first_word"
,
type
=
paddle
.
data_type
.
integer_value
(
vocab_size
))
second_word
=
paddle
.
layer
.
data
(
name
=
"second_word"
,
type
=
paddle
.
data_type
.
integer_value
(
vocab_size
))
third_word
=
paddle
.
layer
.
data
(
name
=
"third_word"
,
type
=
paddle
.
data_type
.
integer_value
(
vocab_size
))
fourth_word
=
paddle
.
layer
.
data
(
name
=
"fourth_word"
,
type
=
paddle
.
data_type
.
integer_value
(
vocab_size
))
emb_layers
=
[]
for
i
in
range
(
gram_num
):
word
=
paddle
.
layer
.
data
(
name
=
"__word%02d__"
%
(
i
+
1
),
type
=
paddle
.
data_type
.
integer_value
(
vocab_size
))
emb
=
paddle
.
layer
.
embedding
(
input
=
word
,
size
=
emb_dim
,
param_attr
=
paddle
.
attr
.
Param
(
name
=
"_proj"
,
initial_std
=
1e-3
))
emb_layers
.
append
(
emb
)
next_word
=
paddle
.
layer
.
data
(
name
=
"next_word"
,
type
=
paddle
.
data_type
.
integer_value
(
vocab_size
))
# embedding layer
first_emb
=
wordemb
(
first_word
)
second_emb
=
wordemb
(
second_word
)
third_emb
=
wordemb
(
third_word
)
fourth_emb
=
wordemb
(
fourth_word
)
context_emb
=
paddle
.
layer
.
concat
(
input
=
[
first_emb
,
second_emb
,
third_emb
,
fourth_emb
])
name
=
"__next_word__"
,
type
=
paddle
.
data_type
.
integer_value
(
vocab_size
))
# hidden layer
hidden
=
paddle
.
layer
.
fc
(
input
=
context_emb
,
size
=
hidden_size
,
act
=
paddle
.
activation
.
Relu
())
for
_
in
range
(
num_layer
-
1
):
for
i
in
range
(
num_layer
):
hidden
=
paddle
.
layer
.
fc
(
input
=
hidden
,
size
=
hidden_size
,
act
=
paddle
.
activation
.
Relu
())
input
=
hidden
if
i
else
paddle
.
layer
.
concat
(
input
=
emb_layers
),
size
=
hidden_size
,
act
=
paddle
.
activation
.
Relu
())
# fc(full connected) and output layer
predict_word
=
paddle
.
layer
.
fc
(
input
=
[
hidden
],
size
=
vocab_size
,
act
=
paddle
.
activation
.
Softmax
())
...
...
nmt_without_attention/README.md
浏览文件 @
ddaba7fb
# 神经网络机器翻译模型
## 背景介绍
-
PaddleBook中
[
机器翻译
](
https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md
)
的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献
\[
[
3
](
#参考文献
)
]。
机器翻译利用计算机将源语言转换成目标语言的同义表达,是自然语言处理中重要的研究方向,有着广泛的应用需求,其实现方式也经历了不断地演化。传统机器翻译方法主要基于规则或统计模型,需要人为地指定翻译规则或设计语言特征,效果依赖于人对源语言与目标语言的理解程度。近些年来,深度学习的提出与迅速发展使得特征的自动学习成为可能。深度学习首先在图像识别和语音识别中取得成功,进而在机器翻译等自然语言处理领域中掀起了研究热潮。机器翻译中的深度学习模型直接学习源语言到目标语言的映射,大为减少了学习过程中人的介入,同时显著地提高了翻译质量。本例介绍在PaddlePaddle中如何利用循环神经网络(Recurrent Neural Network, RNN)构建一个端到端(End-to-End)的神经网络机器翻译(Neural Machine Translation, NMT)模型。
## 模型概览
...
...
@@ -53,14 +51,15 @@ RNN 的原始结构用一个向量来存储隐状态,然而这种结构的 RNN
在 PaddlePaddle 中,双向编码器可以很方便地调用相关 APIs 实现:
```
python
#### Encoder
src_word_id
=
paddle
.
layer
.
data
(
name
=
'source_language_word'
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
source_dict_dim
))
# source embedding
src_embedding
=
paddle
.
layer
.
embedding
(
input
=
src_word_id
,
size
=
word_vector_dim
)
# use bidirectional_gru
# bidirectional GRU as encoder
encoded_vector
=
paddle
.
networks
.
bidirectional_gru
(
input
=
src_embedding
,
size
=
encoder_size
,
...
...
@@ -86,18 +85,17 @@ encoded_vector = paddle.networks.bidirectional_gru(
### 无注意力机制的解码器
-PaddleBook中
[
机器翻译
](
https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md
)
的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献
\[
[
3
](
#参考文献
)
]。
对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的
`recurrent_layer_group`
。首先,自定义单步逻辑函数,再利用函数
`recurrent_group()`
循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用
`recurrent_layer_group`
来实现,其中,单步逻辑函数
`gru_decoder_without_attention()`
相关代码如下:
```
python
#
### Decoder
#
the initialization state for decoder GRU
encoder_last
=
paddle
.
layer
.
last_seq
(
input
=
encoded_vector
)
encoder_last_projected
=
paddle
.
layer
.
mixed
(
size
=
decoder_size
,
act
=
paddle
.
activation
.
Tanh
(),
input
=
paddle
.
layer
.
full_matrix_projection
(
input
=
encoder_last
))
encoder_last_projected
=
paddle
.
layer
.
fc
(
size
=
decoder_size
,
act
=
paddle
.
activation
.
Tanh
(),
input
=
encoder_last
)
#
gru step
#
the step function for decoder GRU
def
gru_decoder_without_attention
(
enc_vec
,
current_word
):
'''
Step function for gru decoder
...
...
@@ -107,33 +105,29 @@ def gru_decoder_without_attention(enc_vec, current_word):
:type current_word: layer object
'''
decoder_mem
=
paddle
.
layer
.
memory
(
name
=
'gru_decoder'
,
size
=
decoder_size
,
boot_layer
=
encoder_last_projected
)
name
=
"gru_decoder"
,
size
=
decoder_size
,
boot_layer
=
encoder_last_projected
)
context
=
paddle
.
layer
.
last_seq
(
input
=
enc_vec
)
decoder_inputs
=
paddle
.
layer
.
mixed
(
size
=
decoder_size
*
3
,
input
=
[
paddle
.
layer
.
full_matrix_projection
(
input
=
context
),
paddle
.
layer
.
full_matrix_projection
(
input
=
current_word
)
])
decoder_inputs
=
paddle
.
layer
.
fc
(
size
=
decoder_size
*
3
,
input
=
[
context
,
current_word
])
gru_step
=
paddle
.
layer
.
gru_step
(
name
=
'gru_decoder'
,
name
=
"gru_decoder"
,
act
=
paddle
.
activation
.
Tanh
(),
gate_act
=
paddle
.
activation
.
Sigmoid
(),
input
=
decoder_inputs
,
output_mem
=
decoder_mem
,
size
=
decoder_size
)
out
=
paddle
.
layer
.
mixed
(
out
=
paddle
.
layer
.
fc
(
size
=
target_dict_dim
,
bias_attr
=
True
,
act
=
paddle
.
activation
.
Softmax
(),
input
=
paddle
.
layer
.
full_matrix_projection
(
input
=
gru_step
)
)
return
out
input
=
gru_step
)
return
out
```
在模型训练和测试阶段,解码器的行为有很大的不同:
...
...
@@ -144,34 +138,14 @@ def gru_decoder_without_attention(enc_vec, current_word):
训练和生成的逻辑分别实现在如下的
`if-else`
条件分支中:
```
python
decoder_group_name
=
"decoder_group"
group_input1
=
paddle
.
layer
.
StaticInput
(
input
=
encoded_vector
,
is_seq
=
True
)
group_input1
=
paddle
.
layer
.
StaticInput
(
input
=
encoded_vector
)
group_inputs
=
[
group_input1
]
if
not
generating
:
trg_embedding
=
paddle
.
layer
.
embedding
(
input
=
paddle
.
layer
.
data
(
name
=
'target_language_word'
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
target_dict_dim
)),
size
=
word_vector_dim
,
param_attr
=
paddle
.
attr
.
ParamAttr
(
name
=
'_target_language_embedding'
))
group_inputs
.
append
(
trg_embedding
)
decoder
=
paddle
.
layer
.
recurrent_group
(
name
=
decoder_group_name
,
step
=
gru_decoder_without_attention
,
input
=
group_inputs
)
lbl
=
paddle
.
layer
.
data
(
name
=
'target_language_next_word'
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
target_dict_dim
))
cost
=
paddle
.
layer
.
classification_cost
(
input
=
decoder
,
label
=
lbl
)
return
cost
else
:
decoder_group_name
=
"decoder_group"
if
is_generating
:
trg_embedding
=
paddle
.
layer
.
GeneratedInput
(
size
=
target_dict_dim
,
embedding_name
=
'_target_language_embedding'
,
embedding_name
=
"_target_language_embedding"
,
embedding_size
=
word_vector_dim
)
group_inputs
.
append
(
trg_embedding
)
...
...
@@ -185,6 +159,26 @@ else:
max_length
=
max_length
)
return
beam_gen
else
:
trg_embedding
=
paddle
.
layer
.
embedding
(
input
=
paddle
.
layer
.
data
(
name
=
"target_language_word"
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
target_dict_dim
)),
size
=
word_vector_dim
,
param_attr
=
paddle
.
attr
.
ParamAttr
(
name
=
"_target_language_embedding"
))
group_inputs
.
append
(
trg_embedding
)
decoder
=
paddle
.
layer
.
recurrent_group
(
name
=
decoder_group_name
,
step
=
gru_decoder_without_attention
,
input
=
group_inputs
)
lbl
=
paddle
.
layer
.
data
(
name
=
"target_language_next_word"
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
target_dict_dim
))
cost
=
paddle
.
layer
.
classification_cost
(
input
=
decoder
,
label
=
lbl
)
return
cost
```
## 数据准备
...
...
@@ -208,13 +202,16 @@ parameters = paddle.parameters.create(cost)
**b) 设定训练过程中的优化策略、定义训练数据读取 `reader`**
```
python
# define optimiz
e method and trainer
# define optimiz
ation method
optimizer
=
paddle
.
optimizer
.
RMSProp
(
learning_rate
=
1e-3
,
gradient_clipping_threshold
=
10.0
,
regularization
=
paddle
.
optimizer
.
L2Regularization
(
rate
=
8e-4
))
# define the trainer instance
trainer
=
paddle
.
trainer
.
SGD
(
cost
=
cost
,
parameters
=
parameters
,
update_equation
=
optimizer
)
# define data reader
wmt14_reader
=
paddle
.
batch
(
paddle
.
reader
.
shuffle
(
...
...
@@ -225,20 +222,19 @@ wmt14_reader = paddle.batch(
**c) 定义事件句柄,打印训练中间结果、保存模型快照**
```
python
# define event_handler callback
# define
the
event_handler callback
def
event_handler
(
event
):
if
isinstance
(
event
,
paddle
.
event
.
EndIteration
):
if
event
.
batch_id
%
100
==
0
and
event
.
batch_id
>
0
:
with
gzip
.
open
(
'models/nmt_without_att_params_batch_%d.tar.gz'
%
event
.
batch_id
,
'w'
)
as
f
:
if
not
event
.
batch_id
%
100
and
event
.
batch_id
:
with
gzip
.
open
(
os
.
path
.
join
(
save_path
,
"nmt_without_att_%05d_batch_%05d.tar.gz"
%
event
.
pass_id
,
event
.
batch_id
),
"w"
)
as
f
:
parameters
.
to_tar
(
f
)
if
event
.
batch_id
%
10
==
0
:
print
"
\n
Pass %d, Batch %d, Cost%f, %s"
%
(
event
.
pass_id
,
event
.
batch_id
,
event
.
cost
,
event
.
metrics
)
else
:
sys
.
stdout
.
write
(
'.'
)
sys
.
stdout
.
flush
()
if
event
.
batch_id
and
not
event
.
batch_id
%
10
:
logger
.
info
(
"Pass %d, Batch %d, Cost %f, %s"
%
(
event
.
pass_id
,
event
.
batch_id
,
event
.
cost
,
event
.
metrics
))
```
**d) 开始训练**
...
...
@@ -300,26 +296,22 @@ beam_result = paddle.infer(
**c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果**
```
python
# get the dictionary
src_dict
,
trg_dict
=
paddle
.
dataset
.
wmt14
.
get_dict
(
source_dict_dim
)
# the delimited element of generated sequences is -1,
# the first element of each generated sequence is the sequence length
seq_list
=
[]
seq
=
[]
for
w
in
beam_result
[
1
]:
if
w
!=
-
1
:
seq
.
append
(
w
)
else
:
seq_list
.
append
(
' '
.
join
([
trg_dict
.
get
(
w
)
for
w
in
seq
[
1
:]]))
seq
=
[]
prob
=
beam_result
[
0
]
for
i
in
xrange
(
len
(
gen_data
)):
print
"
\n
*******************************************************
\n
"
print
"src:"
,
' '
.
join
([
src_dict
.
get
(
w
)
for
w
in
gen_data
[
i
][
0
]]),
"
\n
"
beam_result
=
inferer
.
infer
(
input
=
test_batch
,
field
=
[
"prob"
,
"id"
])
gen_sen_idx
=
np
.
where
(
beam_result
[
1
]
==
-
1
)[
0
]
assert
len
(
gen_sen_idx
)
==
len
(
test_batch
)
*
beam_size
start_pos
,
end_pos
=
1
,
0
for
i
,
sample
in
enumerate
(
test_batch
):
print
(
" "
.
join
([
src_dict
[
w
]
for
w
in
sample
[
0
][
1
:
-
1
]
]))
# skip the start and ending mark when print the source sentence
for
j
in
xrange
(
beam_size
):
print
"prob = %f:"
%
(
prob
[
i
][
j
]),
seq_list
[
i
*
beam_size
+
j
]
end_pos
=
gen_sen_idx
[
i
*
beam_size
+
j
]
print
(
"%.4f
\t
%s"
%
(
beam_result
[
0
][
i
][
j
],
" "
.
join
(
trg_dict
[
w
]
for
w
in
beam_result
[
1
][
start_pos
:
end_pos
])))
start_pos
=
end_pos
+
2
print
(
"
\n
"
)
```
模型测试的执行与模型训练类似,只需执行
...
...
@@ -327,23 +319,20 @@ for i in xrange(len(gen_data)):
```
bash
python generate.py
```
则自动为测试数据生成了对应的翻译结果。
设置beam search的宽度为3,输入某个法文句子
```
text
src: <s> Elles connaissent leur entreprise mieux que personne . <e>
```
其对应的英文翻译结果为
设置beam search的宽度为3,输入为一个法文句子,则自动为测试数据生成对应的翻译结果,输出格式如下:
```
text
prob = -3.754819: They know their business better than anyone . <e>
prob = -4.445528: They know their businesse
s better than anyone . <e>
prob = -5.026885: They know their business better than anybody
. <e>
```
Elles connaissent leur entreprise mieux que personne .
-3.754819 They know their busines
s better than anyone . <e>
-4.445528 They know their businesses better than anyone
. <e>
-5.026885 They know their business better than anybody . <e>
*
`prob`
表示生成句子的得分,随之其后则是翻译生成的句子;
*
`<s>`
表示句子的开始,
`<e>`
表示一个句子的结束,如果出现了在词典中未包含的词,则用
`<unk>`
替代。
```
-
第一行为输入的源语言句子。
-
第二 ~
`beam_size + 1`
行是柱搜索生成的
`beam_size`
条翻译结果
-
一行之内以“
\t
”分隔为两列,第一列是句子的log 概率,第二列是翻译结果的文本。
-
`<s>`
表示句子的开始,
`<e>`
表示一个句子的结束,如果出现了在词典中未包含的词,则用
`<unk>`
替代。
至此,我们在 PaddlePaddle 上实现了一个初步的机器翻译模型。我们可以看到,PaddlePaddle 提供了灵活丰富的API供大家选择和使用,使得我们能够很方便完成各种复杂网络的配置。机器翻译本身也是个快速发展的领域,各种新方法新思想在不断涌现。在学习完本例后,读者若有兴趣和余力,可基于 PaddlePaddle 平台实现更为复杂、性能更优的机器翻译模型。
...
...
nmt_without_attention/generate.py
浏览文件 @
ddaba7fb
#!/usr/bin/env python
import
os
from
network_conf
import
*
import
logging
import
numpy
as
np
from
network_conf
import
seq2seq_net
logger
=
logging
.
getLogger
(
"paddle"
)
logger
.
setLevel
(
logging
.
WARNING
)
def
infer_a_batch
(
inferer
,
test_batch
,
beam_size
,
src_dict
,
trg_dict
):
beam_result
=
inferer
.
infer
(
input
=
test_batch
,
field
=
[
"prob"
,
"id"
])
# the delimited element of generated sequences is -1,
# the first element of each generated sequence is the sequence length
seq_list
,
seq
=
[],
[]
for
w
in
beam_result
[
1
]:
if
w
!=
-
1
:
seq
.
append
(
w
)
else
:
seq_list
.
append
(
" "
.
join
([
trg_dict
.
get
(
w
)
for
w
in
seq
[
1
:]]))
seq
=
[]
prob
=
beam_result
[
0
]
gen_sen_idx
=
np
.
where
(
beam_result
[
1
]
==
-
1
)[
0
]
assert
len
(
gen_sen_idx
)
==
len
(
test_batch
)
*
beam_size
start_pos
,
end_pos
=
1
,
0
for
i
,
sample
in
enumerate
(
test_batch
):
print
(
"src:"
,
" "
.
join
([
src_dict
.
get
(
w
)
for
w
in
sample
[
0
]]),
"
\n
"
)
print
(
" "
.
join
([
src_dict
[
w
]
for
w
in
sample
[
0
][
1
:
-
1
]
]))
# skip the start and ending mark when print the source sentence
for
j
in
xrange
(
beam_size
):
print
(
"prob = %f:"
%
(
prob
[
i
][
j
]),
seq_list
[
i
*
beam_size
+
j
])
end_pos
=
gen_sen_idx
[
i
*
beam_size
+
j
]
print
(
"%.4f
\t
%s"
%
(
beam_result
[
0
][
i
][
j
],
" "
.
join
(
trg_dict
[
w
]
for
w
in
beam_result
[
1
][
start_pos
:
end_pos
])))
start_pos
=
end_pos
+
2
print
(
"
\n
"
)
def
generate
(
source_dict_dim
,
target_dict_dim
,
model_path
,
batch_size
):
def
generate
(
source_dict_dim
,
target_dict_dim
,
model_path
,
beam_size
,
batch_size
):
"""
Generating func
tion for NMT
sequence genera
tion for NMT
:param source_dict_dim: size of source dictionary
:type source_dict_dim: int
...
...
@@ -34,16 +39,19 @@ def generate(source_dict_dim, target_dict_dim, model_path, batch_size):
:type target_dict_dim: int
:param model_path: path for initial model
:type model_path: string
:param beam_size: the expansion width in each generation step
:type beam_size: int
:param batch_size: the number of training examples in one forward pass
:type batch_size: int
"""
assert
os
.
path
.
exists
(
model_path
),
"trained model does not exist."
# step 1: prepare dictionary
src_dict
,
trg_dict
=
paddle
.
dataset
.
wmt14
.
get_dict
(
source_dict_dim
)
beam_size
=
5
# step 2: load the trained model
paddle
.
init
(
use_gpu
=
Tru
e
,
trainer_count
=
1
)
paddle
.
init
(
use_gpu
=
Fals
e
,
trainer_count
=
1
)
with
gzip
.
open
(
model_path
)
as
f
:
parameters
=
paddle
.
parameters
.
Parameters
.
from_tar
(
f
)
beam_gen
=
seq2seq_net
(
...
...
@@ -72,5 +80,6 @@ if __name__ == "__main__":
generate
(
source_dict_dim
=
3000
,
target_dict_dim
=
3000
,
batch_size
=
5
,
model_path
=
"models/nmt_without_att_params_batch_00001.tar.gz"
)
batch_size
=
20
,
beam_size
=
5
,
model_path
=
"models/nmt_without_att_params_batch_00347.tar.gz"
)
nmt_without_attention/index.html
浏览文件 @
ddaba7fb
...
...
@@ -43,8 +43,6 @@
# 神经网络机器翻译模型
## 背景介绍
- PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。
机器翻译利用计算机将源语言转换成目标语言的同义表达,是自然语言处理中重要的研究方向,有着广泛的应用需求,其实现方式也经历了不断地演化。传统机器翻译方法主要基于规则或统计模型,需要人为地指定翻译规则或设计语言特征,效果依赖于人对源语言与目标语言的理解程度。近些年来,深度学习的提出与迅速发展使得特征的自动学习成为可能。深度学习首先在图像识别和语音识别中取得成功,进而在机器翻译等自然语言处理领域中掀起了研究热潮。机器翻译中的深度学习模型直接学习源语言到目标语言的映射,大为减少了学习过程中人的介入,同时显著地提高了翻译质量。本例介绍在PaddlePaddle中如何利用循环神经网络(Recurrent Neural Network, RNN)构建一个端到端(End-to-End)的神经网络机器翻译(Neural Machine Translation, NMT)模型。
## 模型概览
...
...
@@ -95,14 +93,15 @@ RNN 的原始结构用一个向量来存储隐状态,然而这种结构的 RNN
在 PaddlePaddle 中,双向编码器可以很方便地调用相关 APIs 实现:
```python
#### Encoder
src_word_id = paddle.layer.data(
name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim))
# source embedding
src_embedding = paddle.layer.embedding(
input=src_word_id, size=word_vector_dim)
# use bidirectional_gru
# bidirectional GRU as encoder
encoded_vector = paddle.networks.bidirectional_gru(
input=src_embedding,
size=encoder_size,
...
...
@@ -128,18 +127,17 @@ encoded_vector = paddle.networks.bidirectional_gru(
### 无注意力机制的解码器
-PaddleBook中[机器翻译](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.cn.md)的相关章节中,已介绍了带注意力机制(Attention Mechanism)的 Encoder-Decoder 结构,本例则介绍的是不带注意力机制的 Encoder-Decoder 结构。关于注意力机制,读者可进一步参考 PaddleBook 和参考文献\[[3](#参考文献)]。
对于流行的RNN单元,PaddlePaddle 已有很好的实现均可直接调用。如果希望在 RNN 每一个时间步实现某些自定义操作,可使用 PaddlePaddle 中的`recurrent_layer_group`。首先,自定义单步逻辑函数,再利用函数 `recurrent_group()` 循环调用单步逻辑函数处理整个序列。本例中的无注意力机制的解码器便是使用`recurrent_layer_group`来实现,其中,单步逻辑函数`gru_decoder_without_attention()`相关代码如下:
```python
#
### Decoder
#
the initialization state for decoder GRU
encoder_last = paddle.layer.last_seq(input=encoded_vector)
encoder_last_projected = paddle.layer.mixed(
size=decoder_size,
act=paddle.activation.Tanh(),
input=paddle.layer.full_matrix_projection(input=encoder_last))
encoder_last_projected = paddle.layer.fc(
size=decoder_size, act=paddle.activation.Tanh(), input=encoder_last)
#
gru step
#
the step function for decoder GRU
def gru_decoder_without_attention(enc_vec, current_word):
'''
Step function for gru decoder
...
...
@@ -149,33 +147,29 @@ def gru_decoder_without_attention(enc_vec, current_word):
:type current_word: layer object
'''
decoder_mem = paddle.layer.memory(
name='gru_decoder'
,
size=decoder_size,
boot_layer=encoder_last_projected)
name="gru_decoder"
,
size=decoder_size,
boot_layer=encoder_last_projected)
context = paddle.layer.last_seq(input=enc_vec)
decoder_inputs = paddle.layer.mixed(
size=decoder_size * 3,
input=[
paddle.layer.full_matrix_projection(input=context),
paddle.layer.full_matrix_projection(input=current_word)
])
decoder_inputs = paddle.layer.fc(
size=decoder_size * 3, input=[context, current_word])
gru_step = paddle.layer.gru_step(
name=
'gru_decoder'
,
name=
"gru_decoder"
,
act=paddle.activation.Tanh(),
gate_act=paddle.activation.Sigmoid(),
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
out = paddle.layer.mixed
(
out = paddle.layer.fc
(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax(),
input=
paddle.layer.full_matrix_projection(input=gru_step)
)
return out
input=
gru_step
)
return out
```
在模型训练和测试阶段,解码器的行为有很大的不同:
...
...
@@ -186,34 +180,14 @@ def gru_decoder_without_attention(enc_vec, current_word):
训练和生成的逻辑分别实现在如下的`if-else`条件分支中:
```python
decoder_group_name = "decoder_group"
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_input1 = paddle.layer.StaticInput(input=encoded_vector)
group_inputs = [group_input1]
if not generating:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
else:
decoder_group_name = "decoder_group"
if is_generating:
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name=
'_target_language_embedding'
,
embedding_name=
"_target_language_embedding"
,
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
...
...
@@ -227,6 +201,26 @@ else:
max_length=max_length)
return beam_gen
else:
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name="target_language_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name="_target_language_embedding"))
group_inputs.append(trg_embedding)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_without_attention,
input=group_inputs)
lbl = paddle.layer.data(
name="target_language_next_word",
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
```
## 数据准备
...
...
@@ -250,13 +244,16 @@ parameters = paddle.parameters.create(cost)
**b) 设定训练过程中的优化策略、定义训练数据读取 `reader`**
```python
# define optimiz
e method and trainer
# define optimiz
ation method
optimizer = paddle.optimizer.RMSProp(
learning_rate=1e-3,
gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4))
# define the trainer instance
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=optimizer)
# define data reader
wmt14_reader = paddle.batch(
paddle.reader.shuffle(
...
...
@@ -267,20 +264,19 @@ wmt14_reader = paddle.batch(
**c) 定义事件句柄,打印训练中间结果、保存模型快照**
```python
# define event_handler callback
# define
the
event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0 and event.batch_id > 0:
with gzip.open('models/nmt_without_att_params_batch_%d.tar.gz' %
event.batch_id, 'w') as f:
if not event.batch_id % 100 and event.batch_id:
with gzip.open(
os.path.join(save_path,
"nmt_without_att_%05d_batch_%05d.tar.gz" %
event.pass_id, event.batch_id), "w") as f:
parameters.to_tar(f)
if event.batch_id % 10 == 0:
print "\nPass %d, Batch %d, Cost%f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
if event.batch_id and not event.batch_id % 10:
logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
```
**d) 开始训练**
...
...
@@ -342,26 +338,22 @@ beam_result = paddle.infer(
**c) 加载源语言和目标语言词典,将`id`序列表示的句子转化成原语言并输出结果**
```python
# get the dictionary
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(source_dict_dim)
# the delimited element of generated sequences is -1,
# the first element of each generated sequence is the sequence length
seq_list = []
seq = []
for w in beam_result[1]:
if w != -1:
seq.append(w)
else:
seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
seq = []
prob = beam_result[0]
for i in xrange(len(gen_data)):
print "\n*******************************************************\n"
print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
beam_result = inferer.infer(input=test_batch, field=["prob", "id"])
gen_sen_idx = np.where(beam_result[1] == -1)[0]
assert len(gen_sen_idx) == len(test_batch) * beam_size
start_pos, end_pos = 1, 0
for i, sample in enumerate(test_batch):
print(" ".join([
src_dict[w] for w in sample[0][1:-1]
])) # skip the start and ending mark when print the source sentence
for j in xrange(beam_size):
print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
end_pos = gen_sen_idx[i * beam_size + j]
print("%.4f\t%s" % (beam_result[0][i][j], " ".join(
trg_dict[w] for w in beam_result[1][start_pos:end_pos])))
start_pos = end_pos + 2
print("\n")
```
模型测试的执行与模型训练类似,只需执行
...
...
@@ -369,23 +361,20 @@ for i in xrange(len(gen_data)):
```bash
python generate.py
```
则自动为测试数据生成了对应的翻译结果。
设置beam search的宽度为3,输入某个法文句子
```text
src:
<s>
Elles connaissent leur entreprise mieux que personne .
<e>
```
其对应的英文翻译结果为
设置beam search的宽度为3,输入为一个法文句子,则自动为测试数据生成对应的翻译结果,输出格式如下:
```text
prob = -3.754819: They know their business better than anyone .
<e>
prob = -4.445528: They know their businesse
s better than anyone .
<e>
prob = -5.026885: They know their business better than anybody
.
<e>
```
Elles connaissent leur entreprise mieux que personne .
-3.754819 They know their business better than anyone . <e>
-4.445528 They know their businesses better than anyone . <e>
-5.026885 They know their business better than anybody . <e>
* `prob`表示生成句子的得分,随之其后则是翻译生成的句子;
* `
<s>
` 表示句子的开始,`
<e>
`表示一个句子的结束,如果出现了在词典中未包含的词,则用`
<unk>
`替代。
```
- 第一行为输入的源语言句子。
- 第二 ~ `beam_size + 1` 行是柱搜索生成的 `beam_size` 条翻译结果。
- 一行之内以“\t”分隔为两列,第一列是句子的log 概率,第二列是翻译结果的文本。
- `<s>` 表示句子的开始,`<e>`表示一个句子的结束,如果出现了在词典中未包含的词,则用`<unk>`替代。
至此,我们在 PaddlePaddle 上实现了一个初步的机器翻译模型。我们可以看到,PaddlePaddle 提供了灵活丰富的API供大家选择和使用,使得我们能够很方便完成各种复杂网络的配置。机器翻译本身也是个快速发展的领域,各种新方法新思想在不断涌现。在学习完本例后,读者若有兴趣和余力,可基于 PaddlePaddle 平台实现更为复杂、性能更优的机器翻译模型。
...
...
nmt_without_attention/train.py
浏览文件 @
ddaba7fb
#!/usr/bin/env python
import
os
import
logging
import
paddle.v2
as
paddle
from
network_conf
import
*
from
network_conf
import
seq2seq_net
logger
=
logging
.
getLogger
(
"paddle"
)
logger
.
setLevel
(
logging
.
INFO
)
def
train
(
source_dict_dim
,
target_dict_dim
):
def
train
(
save_dir_path
,
source_dict_dim
,
target_dict_dim
):
'''
Training function for NMT
:param save_dir_path: path of the directory to save the trained models.
:type save_dir_path: str
:param source_dict_dim: size of source dictionary
:type source_dict_dim: int
:param target_dict_dim: size of target dictionary
:type target_dict_dim: int
'''
# initialize model
if
not
os
.
path
.
exists
(
save_dir_path
):
os
.
mkdir
(
save_dir_path
)
# initialize PaddlePaddle
paddle
.
init
(
use_gpu
=
False
,
trainer_count
=
1
)
cost
=
seq2seq_net
(
source_dict_dim
,
target_dict_dim
)
parameters
=
paddle
.
parameters
.
create
(
cost
)
# define optimiz
e method and trainer
# define optimiz
ation method and the trainer instance
optimizer
=
paddle
.
optimizer
.
RMSProp
(
learning_rate
=
1e-3
,
gradient_clipping_threshold
=
10.0
,
regularization
=
paddle
.
optimizer
.
L2Regularization
(
rate
=
8e-4
))
trainer
=
paddle
.
trainer
.
SGD
(
cost
=
cost
,
parameters
=
parameters
,
update_equation
=
optimizer
)
# define data reader
wmt14_reader
=
paddle
.
batch
(
paddle
.
reader
.
shuffle
(
paddle
.
dataset
.
wmt14
.
train
(
source_dict_dim
),
buf_size
=
8192
),
batch_size
=
8
)
# define event_handler callback
# define
the
event_handler callback
def
event_handler
(
event
):
if
isinstance
(
event
,
paddle
.
event
.
EndIteration
):
if
not
event
.
batch_id
%
500
and
event
.
batch_id
:
with
gzip
.
open
(
"models/nmt_without_att_params_batch_%05d.tar.gz"
%
event
.
batch_id
,
"w"
)
as
f
:
if
not
event
.
batch_id
%
100
and
event
.
batch_id
:
with
gzip
.
open
(
os
.
path
.
join
(
save_path
,
"nmt_without_att_%05d_batch_%05d.tar.gz"
%
event
.
pass_id
,
event
.
batch_id
),
"w"
)
as
f
:
parameters
.
to_tar
(
f
)
if
event
.
batch_id
and
not
event
.
batch_id
%
10
:
print
(
"
\n
Pass %d, Batch %d, Cost %f, %s"
%
(
event
.
pass_id
,
event
.
batch_id
,
event
.
cost
,
event
.
metrics
))
else
:
sys
.
stdout
.
write
(
'.'
)
sys
.
stdout
.
flush
()
# start to train
logger
.
info
(
"Pass %d, Batch %d, Cost %f, %s"
%
(
event
.
pass_id
,
event
.
batch_id
,
event
.
cost
,
event
.
metrics
))
# start training
trainer
.
train
(
reader
=
wmt14_reader
,
event_handler
=
event_handler
,
num_passes
=
2
)
if
__name__
==
'__main__'
:
train
(
source_dict_dim
=
3000
,
target_dict_dim
=
3000
)
train
(
s
ave_dir_path
=
"models"
,
s
ource_dict_dim
=
3000
,
target_dict_dim
=
3000
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录