PaddlePaddle / models · commit 63416ffa
Authored May 09, 2017 by Xinghai Sun
Add model configuration for machine translation with external memory.
Parent: 367e1231

Showing 1 changed file with 567 additions and 0 deletions.

mt_with_external_memory/mt_with_external_memory.py (new file, mode 100644)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2 as paddle
import sys
import gzip

dict_size = 30000
word_vec_dim = 512
hidden_size = 1024
batch_size = 5
memory_slot_num = 8
beam_size = 40
infer_data_num = 3
class ExternalMemory(object):
    """
    External neural memory class, with differentiable write/read heads.

    A simplified Neural Turing Machine (NTM) with only content-based
    addressing (including content addressing and interpolation, but excluding
    convolutional shift and sharpening). It can serve as an external memory
    bank, with differentiable write/read head controllers responsible for
    storing and reading information flow dynamically as the model needs. Here,
    simple feedforward neural networks are used as the write/read head
    controllers.

    For more technical details, please refer to the
    `NTM paper <https://arxiv.org/abs/1410.5401>`_.

    :param name: Memory name.
    :type name: basestring
    :param mem_slot_size: Size of memory slot/vector.
    :type mem_slot_size: int
    :param boot_layer: Boot layer for initializing memory. Sequence layer
                       with sequence length indicating the number of memory
                       slots, and size as mem_slot_size.
    :type boot_layer: LayerOutput
    :param readonly: If true, the memory is read-only, and the write function
                     cannot be called. Default is false.
    :type readonly: bool
    """
    def __init__(self, name, mem_slot_size, boot_layer, readonly=False):
        self.name = name
        self.mem_slot_size = mem_slot_size
        self.readonly = readonly
        self.external_memory = paddle.layer.memory(
            name=self.name,
            size=self.mem_slot_size,
            is_seq=True,
            boot_layer=boot_layer)
        # set memory to constant when readonly=True
        if self.readonly:
            self.updated_external_memory = paddle.layer.mixed(
                name=self.name,
                input=[
                    paddle.layer.identity_projection(input=self.external_memory)
                ],
                size=self.mem_slot_size)
    def __content_addressing__(self, key_vector):
        """
        Get head's addressing weight via content-based addressing.
        """
        # content-based addressing: a=tanh(W*M + U*key)
        key_projection = paddle.layer.fc(
            input=key_vector,
            size=self.mem_slot_size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        key_proj_expanded = paddle.layer.expand(
            input=key_projection, expand_as=self.external_memory)
        memory_projection = paddle.layer.fc(
            input=self.external_memory,
            size=self.mem_slot_size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        merged = paddle.layer.addto(
            input=[key_proj_expanded, memory_projection],
            act=paddle.activation.Tanh())
        # softmax addressing weight: w=softmax(v^T a)
        addressing_weight = paddle.layer.fc(
            input=merged,
            size=1,
            act=paddle.activation.SequenceSoftmax(),
            bias_attr=False)
        return addressing_weight
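    # Note (clarifying comment): SequenceSoftmax normalizes the scalar score of
    # each memory slot across the sequence dimension, i.e. across slots, so
    # __content_addressing__ returns one addressing weight per slot and the
    # weights of all slots sum to one.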
    def __interpolation__(self, key_vector, addressing_weight):
        """
        Interpolate between previous and current addressing weights.
        """
        # prepare interpolation scalar gate: g=sigmoid(W*key)
        gate = paddle.layer.fc(
            input=key_vector,
            size=1,
            act=paddle.activation.Sigmoid(),
            bias_attr=False)
        # interpolation: w_t = g*w_t+(1-g)*w_{t-1}
        last_addressing_weight = paddle.layer.memory(
            name=self.name + "_addressing_weight", size=1, is_seq=True)
        gated_addressing_weight = paddle.layer.addto(
            name=self.name + "_addressing_weight",
            input=[
                last_addressing_weight,
                paddle.layer.scaling(weight=gate, input=addressing_weight),
                paddle.layer.mixed(
                    input=paddle.layer.dotmul_operator(
                        a=gate, b=last_addressing_weight, scale=-1.0),
                    size=1)
            ],
            act=paddle.activation.Tanh())
        return gated_addressing_weight
    def __get_addressing_weight__(self, key_vector):
        """
        Get final addressing weight for read/write heads, including content
        addressing and interpolation.
        """
        # current content-based addressing
        addressing_weight = self.__content_addressing__(key_vector)
        return addressing_weight
        # NOTE: the early return above bypasses the interpolation step below,
        # leaving it as dead code; only content-based addressing is active.
        # interpolation with previous addressing weight
        return self.__interpolation__(key_vector, addressing_weight)
    def write(self, write_key):
        """
        Write head for external memory.

        :param write_key: Key vector for write head to generate writing
                          content and addressing signals.
        :type write_key: LayerOutput
        """
        # check readonly
        if self.readonly:
            raise ValueError("ExternalMemory with readonly=True cannot write.")
        # get addressing weight for write head
        write_weight = self.__get_addressing_weight__(write_key)
        # prepare add_vector and erase_vector
        erase_vector = paddle.layer.fc(
            input=write_key,
            size=self.mem_slot_size,
            act=paddle.activation.Sigmoid(),
            bias_attr=False)
        add_vector = paddle.layer.fc(
            input=write_key,
            size=self.mem_slot_size,
            act=paddle.activation.Sigmoid(),
            bias_attr=False)
        erase_vector_expand = paddle.layer.expand(
            input=erase_vector, expand_as=self.external_memory)
        add_vector_expand = paddle.layer.expand(
            input=add_vector, expand_as=self.external_memory)
        # prepare scaled add part and erase part
        scaled_erase_vector_expand = paddle.layer.scaling(
            weight=write_weight, input=erase_vector_expand)
        erase_memory_part = paddle.layer.mixed(
            input=paddle.layer.dotmul_operator(
                a=self.external_memory,
                b=scaled_erase_vector_expand,
                scale=-1.0))
        add_memory_part = paddle.layer.scaling(
            weight=write_weight, input=add_vector_expand)
        # update external memory
        self.updated_external_memory = paddle.layer.addto(
            input=[self.external_memory, add_memory_part, erase_memory_part],
            name=self.name)
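    # In equation form: with per-slot write weight w[i] and erase/add vectors
    # e and a (each of size mem_slot_size), the update above implements
    #     M_t[i] = M_{t-1}[i] * (1 - w[i] * e) + w[i] * a
    # i.e. the standard NTM erase-then-add write, fused into a single addto.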
    def read(self, read_key):
        """
        Read head for external memory.

        :param read_key: Key vector for read head to generate addressing
                         signals.
        :type read_key: LayerOutput
        :return: Content read from external memory.
        :rtype: LayerOutput
        """
        # get addressing weight for read head
        read_weight = self.__get_addressing_weight__(read_key)
        # read content from external memory
        scaled = paddle.layer.scaling(
            weight=read_weight, input=self.updated_external_memory)
        return paddle.layer.pooling(
            input=scaled, pooling_type=paddle.pooling.Sum())
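# A minimal usage sketch (an illustrative comment, not code from this commit):
# inside a recurrent step function one would write
#
#     memory = ExternalMemory(
#         name="demo_memory", mem_slot_size=128, boot_layer=boot, readonly=False)
#     memory.write(controller_state)       # erase-then-add update of all slots
#     read_vec = memory.read(controller_state)  # weighted sum over all slots
#
# where `boot` is a sequence layer whose length fixes the slot count, and
# `demo_memory`/`controller_state` are hypothetical names; see
# recurrent_decoder_step below for the actual call sites.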
def bidirectional_gru_encoder(input, size, word_vec_dim):
    """
    Bidirectional GRU encoder.
    """
    # token embedding
    embeddings = paddle.layer.embedding(
        input=input,
        size=word_vec_dim,
        param_attr=paddle.attr.ParamAttr(name='_encoder_word_embedding'))
    # token-level forward and backward encoding for attentions
    forward = paddle.networks.simple_gru(
        input=embeddings, size=size, reverse=False)
    backward = paddle.networks.simple_gru(
        input=embeddings, size=size, reverse=True)
    merged = paddle.layer.concat(input=[forward, backward])
    # sequence-level encoding
    backward_first = paddle.layer.first_seq(input=backward)
    return merged, backward_first
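# Returned values: `merged` holds one vector of size 2 * size per source token
# (later used as the read-only unbounded memory bank), while `backward_first`,
# the first step of the backward GRU, summarizes the whole sentence and is used
# to boot the decoder state.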
def memory_enhanced_decoder(input, target, initial_state, source_context, size,
                            word_vec_dim, dict_size, is_generating, beam_size):
    """
    Memory-enhanced GRU decoder.

    The "external memory" refers to two types of memories:
    - Unbounded memory: i.e. the vanilla attention mechanism.
    - Bounded memory: i.e. the external memory in an NTM.
    Both types of external memory can be implemented with the ExternalMemory
    class, and both are included in this enhanced seq2seq model.

    Here, the bounded memory takes the place of the "state" vector in RNNs. The
    state vector in RNNs is a very successful design, enriching the model with
    the capability to "remember" things in the long run (across multiple
    sequence steps). However, such a vector state is limited to a very small
    memory bandwidth. The bounded memory introduced here can easily increase
    the memory capacity at a linear complexity cost (rather than quadratic for
    a vector state). Besides, the attention mechanism (with unbounded memory)
    also serves as an external memory bank encoding the source input.

    Notice that we take the attention mechanism as a special form of external
    memory, with a read-only memory bank initialized from encoder states and a
    content-based addressing read head responsible for generating the
    attentional context. From this viewpoint, we gain a better understanding of
    the attention mechanism and other types of external memory, and it also
    enables a concise and unified implementation of both.

    For more technical details about external memory, please refer to
    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
    For more technical details about this memory-enhanced decoder, please
    refer to `Memory-enhanced Decoder for Neural Machine Translation
    <https://arxiv.org/abs/1606.02003>`_. This implementation closely follows
    the paper, with minor differences.

    Also, we reversed the read-write order to skip potential problems in the
    PaddlePaddle V2 APIs; see the
    `issue <https://github.com/PaddlePaddle/Paddle/issues/2061>`_.
    """
    # prepare initial bounded and unbounded memory
    bounded_memory_slot_init = paddle.layer.fc(
        input=paddle.layer.pooling(
            input=source_context, pooling_type=paddle.pooling.Avg()),
        size=size,
        act=paddle.activation.Sigmoid())
    bounded_memory_init = paddle.layer.expand(
        input=bounded_memory_slot_init,
        expand_as=paddle.layer.data(
            name='bounded_memory_template',
            type=paddle.data_type.integer_value_sequence(0)))
    unbounded_memory_init = source_context
    # prepare step function for recurrent group
    def recurrent_decoder_step(cur_embedding):
        # create hidden state, bounded and unbounded memory.
        state = paddle.layer.memory(
            name="gru_decoder", size=size, boot_layer=initial_state)
        bounded_memory = ExternalMemory(
            name="bounded_memory",
            mem_slot_size=size,
            boot_layer=bounded_memory_init,
            readonly=False)
        unbounded_memory = ExternalMemory(
            name="unbounded_memory",
            mem_slot_size=size * 2,
            boot_layer=unbounded_memory_init,
            readonly=True)
        # write bounded memory
        bounded_memory.write(state)
        # read bounded memory
        bounded_memory_read = bounded_memory.read(state)
        # prepare key for unbounded memory
        key_for_unbounded_memory = paddle.layer.fc(
            input=[bounded_memory_read, cur_embedding],
            size=size,
            act=paddle.activation.Tanh(),
            bias_attr=False)
        # read unbounded memory (i.e. attention mechanism)
        context = unbounded_memory.read(key_for_unbounded_memory)
        # gated recurrent unit
        gru_inputs = paddle.layer.fc(
            input=[context, cur_embedding, bounded_memory_read],
            size=size * 3,
            act=paddle.activation.Linear(),
            bias_attr=False)
        gru_output = paddle.layer.gru_step(
            name="gru_decoder", input=gru_inputs, output_mem=state, size=size)
        # step output
        return paddle.layer.fc(
            input=[gru_output, context, cur_embedding],
            size=dict_size,
            act=paddle.activation.Softmax(),
            bias_attr=True)
    if not is_generating:
        target_embeddings = paddle.layer.embedding(
            input=input,
            size=word_vec_dim,
            param_attr=paddle.attr.ParamAttr(name="_decoder_word_embedding"))
        decoder_result = paddle.layer.recurrent_group(
            name="decoder_group",
            step=recurrent_decoder_step,
            input=[target_embeddings])
        cost = paddle.layer.classification_cost(
            input=decoder_result, label=target)
        return cost
    else:
        target_embeddings = paddle.layer.GeneratedInputV2(
            size=dict_size,
            embedding_name="_decoder_word_embedding",
            embedding_size=word_vec_dim)
        beam_gen = paddle.layer.beam_search(
            name="decoder_group",
            step=recurrent_decoder_step,
            input=[target_embeddings],
            bos_id=0,
            eos_id=1,
            beam_size=beam_size,
            max_length=100)
        return beam_gen
def memory_enhanced_seq2seq(encoder_input, decoder_input, decoder_target,
                            hidden_size, word_vec_dim, dict_size,
                            is_generating, beam_size):
    """
    Seq2Seq model enhanced with external memory.

    The "external memory" refers to two types of memories:
    - Unbounded memory: i.e. the vanilla attention mechanism.
    - Bounded memory: i.e. the external memory in an NTM.
    Both types of external memory can be implemented with the ExternalMemory
    class, and both are included in this enhanced seq2seq model.

    Here, the bounded memory takes the place of the "state" vector in RNNs. The
    state vector in RNNs is a very successful design, enriching the model with
    the capability to "remember" things in the long run (across multiple
    sequence steps). However, such a vector state is limited to a very small
    memory bandwidth. The bounded memory introduced here can easily increase
    the memory capacity at a linear complexity cost (rather than quadratic for
    a vector state). Besides, the attention mechanism (with unbounded memory)
    also serves as an external memory bank encoding the source input.

    Notice that we take the attention mechanism as a special form of external
    memory, with a read-only memory bank initialized from encoder states and a
    content-based addressing read head responsible for generating the
    attentional context. From this viewpoint, we gain a better understanding of
    the attention mechanism and other types of external memory, and it also
    enables a concise and unified implementation of both.

    For more technical details about external memory, please refer to
    `Neural Turing Machines <https://arxiv.org/abs/1410.5401>`_.
    For more technical details about this memory-enhanced decoder, please
    refer to `Memory-enhanced Decoder for Neural Machine Translation
    <https://arxiv.org/abs/1606.02003>`_. This implementation closely follows
    the paper, with minor differences.

    Also, we reversed the read-write order to skip potential problems in the
    PaddlePaddle V2 APIs; see the
    `issue <https://github.com/PaddlePaddle/Paddle/issues/2061>`_.
    """
    # encoder
    context_encodings, sequence_encoding = bidirectional_gru_encoder(
        input=encoder_input, size=hidden_size, word_vec_dim=word_vec_dim)
    # decoder
    return memory_enhanced_decoder(
        input=decoder_input,
        target=decoder_target,
        initial_state=sequence_encoding,
        source_context=context_encodings,
        size=hidden_size,
        word_vec_dim=word_vec_dim,
        dict_size=dict_size,
        is_generating=is_generating,
        beam_size=beam_size)
def parse_beam_result(beam_result, dictionary):
    """
    Beam result parser.
    """
    sentence_list = []
    sentence = []
    for word in beam_result[1]:
        if word != -1:
            sentence.append(word)
        else:
            sentence_list.append(
                ' '.join([dictionary.get(word) for word in sentence[1:]]))
            sentence = []
    beam_probs = beam_result[0]
    beam_size = len(beam_probs[0])
    beam_sentences = [
        sentence_list[i:i + beam_size]
        for i in range(0, len(sentence_list), beam_size)
    ]
    return beam_probs, beam_sentences
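# As consumed above, beam_result (the output of paddle.infer with
# field=['prob', 'id'], see infer() below) is a pair: beam_result[0] holds
# beam_size candidate probabilities per source sentence, and beam_result[1] is
# a flat word-id stream in which -1 terminates each candidate sentence.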
def reader_append_wrapper(reader, append_tuple):
    """
    Data reader wrapper for appending extra data to an existing reader.
    """

    def new_reader():
        for ins in reader():
            yield ins + append_tuple

    return new_reader
def train(num_passes):
    """
    For training.
    """
    # create network config
    source_words = paddle.layer.data(
        name="source_words",
        type=paddle.data_type.integer_value_sequence(dict_size))
    target_words = paddle.layer.data(
        name="target_words",
        type=paddle.data_type.integer_value_sequence(dict_size))
    target_next_words = paddle.layer.data(
        name='target_next_words',
        type=paddle.data_type.integer_value_sequence(dict_size))
    cost = memory_enhanced_seq2seq(
        encoder_input=source_words,
        decoder_input=target_words,
        decoder_target=target_next_words,
        hidden_size=hidden_size,
        word_vec_dim=word_vec_dim,
        dict_size=dict_size,
        is_generating=False,
        beam_size=beam_size)
    # create parameters and optimizer
    parameters = paddle.parameters.create(cost)
    optimizer = paddle.optimizer.Adam(
        learning_rate=5e-5,
        gradient_clipping_threshold=5,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)
    # create data readers
    feeding = {
        "source_words": 0,
        "target_words": 1,
        "target_next_words": 2,
        "bounded_memory_template": 3
    }
    train_append_reader = reader_append_wrapper(
        reader=paddle.dataset.wmt14.train(dict_size),
        append_tuple=([0] * memory_slot_num, ))
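    # The appended tuple adds a dummy integer sequence of length
    # memory_slot_num to every instance; it feeds the
    # "bounded_memory_template" data layer, whose sequence *length* (not its
    # values) fixes the number of bounded-memory slots.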
    train_batch_reader = paddle.batch(
        reader=paddle.reader.shuffle(
            reader=train_append_reader, buf_size=8192),
        batch_size=batch_size)
    test_append_reader = reader_append_wrapper(
        reader=paddle.dataset.wmt14.test(dict_size),
        append_tuple=([0] * memory_slot_num, ))
    test_batch_reader = paddle.batch(
        reader=paddle.reader.shuffle(
            reader=test_append_reader, buf_size=8192),
        batch_size=batch_size)
    # create event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
                print "Pass: %d, Batch: %d, TrainCost: %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics)
                with gzip.open("params.tar.gz", 'w') as f:
                    parameters.to_tar(f)
            else:
                sys.stdout.write('.')
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_batch_reader, feeding=feeding)
            print "Pass: %d, TestCost: %f, %s" % (
                event.pass_id, event.cost, result.metrics)
            with gzip.open("params.tar.gz", 'w') as f:
                parameters.to_tar(f)

    # run train
    trainer.train(
        reader=train_batch_reader,
        event_handler=event_handler,
        num_passes=num_passes,
        feeding=feeding)
def infer():
    """
    For inference.
    """
    # create network config
    source_words = paddle.layer.data(
        name="source_words",
        type=paddle.data_type.integer_value_sequence(dict_size))
    beam_gen = memory_enhanced_seq2seq(
        encoder_input=source_words,
        decoder_input=None,
        decoder_target=None,
        hidden_size=hidden_size,
        word_vec_dim=word_vec_dim,
        dict_size=dict_size,
        is_generating=True,
        beam_size=beam_size)
    # load parameters
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open("params.tar.gz"))
    # prepare infer data
    infer_data = []
    test_append_reader = reader_append_wrapper(
        reader=paddle.dataset.wmt14.test(dict_size),
        append_tuple=([0] * memory_slot_num, ))
    for i, item in enumerate(test_append_reader()):
        if i < infer_data_num:
            infer_data.append((item[0], item[3], ))
    # run inference
    beam_result = paddle.infer(
        output_layer=beam_gen,
        parameters=parameters,
        input=infer_data,
        field=['prob', 'id'])
    # parse beam result and print
    source_dict, target_dict = paddle.dataset.wmt14.get_dict(dict_size)
    beam_probs, beam_sentences = parse_beam_result(beam_result, target_dict)
    for i in xrange(infer_data_num):
        print "\n*******************************************************\n"
        print "src:", ' '.join(
            [source_dict.get(word) for word in infer_data[i][0]]), "\n"
        for j in xrange(beam_size):
            print "prob = %f : %s" % (beam_probs[i][j], beam_sentences[i][j])
def main():
    paddle.init(use_gpu=False, trainer_count=1)
    train(num_passes=1)
    infer()


if __name__ == '__main__':
    main()
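# Usage (a minimal sketch, assuming a Python 2 environment with the matching
# paddlepaddle v2 release installed):
#
#     python mt_with_external_memory.py
#
# This trains for one pass over the WMT-14 data, checkpointing parameters to
# params.tar.gz, and then beam-decodes infer_data_num test sentences.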