Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
05541295
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
05541295
编写于
3月 06, 2017
作者:
Y
Yi Wang
提交者:
GitHub
3月 06, 2017
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1560 from jacquesqiao/seq2seq-dataset
optimize Seq2seq dataset
上级
ca62c104
1940f58f
变更
3
显示空白变更内容
内联
并排
Showing
3 changed file
with
170 addition
and
269 deletion
+170
-269
demo/seqToseq/api_train_v2.py
demo/seqToseq/api_train_v2.py
+95
-65
demo/seqToseq/seqToseq_net_v2.py
demo/seqToseq/seqToseq_net_v2.py
+0
-92
python/paddle/v2/dataset/wmt14.py
python/paddle/v2/dataset/wmt14.py
+75
-112
未找到文件。
demo/seqToseq/api_train_v2.py
浏览文件 @
05541295
import
os
import
paddle.v2
as
paddle
from
seqToseq_net_v2
import
seqToseq_net_v2
# Data Definiation.
# TODO:This code should be merged to dataset package.
data_dir
=
"./data/pre-wmt14"
src_lang_dict
=
os
.
path
.
join
(
data_dir
,
'src.dict'
)
trg_lang_dict
=
os
.
path
.
join
(
data_dir
,
'trg.dict'
)
source_dict_dim
=
len
(
open
(
src_lang_dict
,
"r"
).
readlines
())
target_dict_dim
=
len
(
open
(
trg_lang_dict
,
"r"
).
readlines
())
def
read_to_dict
(
dict_path
):
with
open
(
dict_path
,
"r"
)
as
fin
:
out_dict
=
{
line
.
strip
():
line_count
for
line_count
,
line
in
enumerate
(
fin
)
}
return
out_dict
src_dict
=
read_to_dict
(
src_lang_dict
)
trg_dict
=
read_to_dict
(
trg_lang_dict
)
train_list
=
os
.
path
.
join
(
data_dir
,
'train.list'
)
test_list
=
os
.
path
.
join
(
data_dir
,
'test.list'
)
UNK_IDX
=
2
START
=
"<s>"
END
=
"<e>"
def
_get_ids
(
s
,
dictionary
):
words
=
s
.
strip
().
split
()
return
[
dictionary
[
START
]]
+
\
[
dictionary
.
get
(
w
,
UNK_IDX
)
for
w
in
words
]
+
\
[
dictionary
[
END
]]
def
train_reader
(
file_name
):
def
reader
():
with
open
(
file_name
,
'r'
)
as
f
:
for
line_count
,
line
in
enumerate
(
f
):
line_split
=
line
.
strip
().
split
(
'
\t
'
)
if
len
(
line_split
)
!=
2
:
continue
src_seq
=
line_split
[
0
]
# one source sequence
src_ids
=
_get_ids
(
src_seq
,
src_dict
)
trg_seq
=
line_split
[
1
]
# one target sequence
trg_words
=
trg_seq
.
split
()
trg_ids
=
[
trg_dict
.
get
(
w
,
UNK_IDX
)
for
w
in
trg_words
]
# remove sequence whose length > 80 in training mode
if
len
(
src_ids
)
>
80
or
len
(
trg_ids
)
>
80
:
continue
trg_ids_next
=
trg_ids
+
[
trg_dict
[
END
]]
trg_ids
=
[
trg_dict
[
START
]]
+
trg_ids
yield
src_ids
,
trg_ids
,
trg_ids_next
return
reader
def
seqToseq_net
(
source_dict_dim
,
target_dict_dim
):
### Network Architecture
word_vector_dim
=
512
# dimension of word vector
decoder_size
=
512
# dimension of hidden unit in GRU Decoder network
encoder_size
=
512
# dimension of hidden unit in GRU Encoder network
#### Encoder
src_word_id
=
paddle
.
layer
.
data
(
name
=
'source_language_word'
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
source_dict_dim
))
src_embedding
=
paddle
.
layer
.
embedding
(
input
=
src_word_id
,
size
=
word_vector_dim
,
param_attr
=
paddle
.
attr
.
ParamAttr
(
name
=
'_source_language_embedding'
))
src_forward
=
paddle
.
networks
.
simple_gru
(
input
=
src_embedding
,
size
=
encoder_size
)
src_backward
=
paddle
.
networks
.
simple_gru
(
input
=
src_embedding
,
size
=
encoder_size
,
reverse
=
True
)
encoded_vector
=
paddle
.
layer
.
concat
(
input
=
[
src_forward
,
src_backward
])
#### Decoder
with
paddle
.
layer
.
mixed
(
size
=
decoder_size
)
as
encoded_proj
:
encoded_proj
+=
paddle
.
layer
.
full_matrix_projection
(
input
=
encoded_vector
)
backward_first
=
paddle
.
layer
.
first_seq
(
input
=
src_backward
)
with
paddle
.
layer
.
mixed
(
size
=
decoder_size
,
act
=
paddle
.
activation
.
Tanh
())
as
decoder_boot
:
decoder_boot
+=
paddle
.
layer
.
full_matrix_projection
(
input
=
backward_first
)
def
gru_decoder_with_attention
(
enc_vec
,
enc_proj
,
current_word
):
decoder_mem
=
paddle
.
layer
.
memory
(
name
=
'gru_decoder'
,
size
=
decoder_size
,
boot_layer
=
decoder_boot
)
context
=
paddle
.
networks
.
simple_attention
(
encoded_sequence
=
enc_vec
,
encoded_proj
=
enc_proj
,
decoder_state
=
decoder_mem
)
with
paddle
.
layer
.
mixed
(
size
=
decoder_size
*
3
)
as
decoder_inputs
:
decoder_inputs
+=
paddle
.
layer
.
full_matrix_projection
(
input
=
context
)
decoder_inputs
+=
paddle
.
layer
.
full_matrix_projection
(
input
=
current_word
)
gru_step
=
paddle
.
layer
.
gru_step
(
name
=
'gru_decoder'
,
input
=
decoder_inputs
,
output_mem
=
decoder_mem
,
size
=
decoder_size
)
with
paddle
.
layer
.
mixed
(
size
=
target_dict_dim
,
bias_attr
=
True
,
act
=
paddle
.
activation
.
Softmax
())
as
out
:
out
+=
paddle
.
layer
.
full_matrix_projection
(
input
=
gru_step
)
return
out
decoder_group_name
=
"decoder_group"
group_input1
=
paddle
.
layer
.
StaticInputV2
(
input
=
encoded_vector
,
is_seq
=
True
)
group_input2
=
paddle
.
layer
.
StaticInputV2
(
input
=
encoded_proj
,
is_seq
=
True
)
group_inputs
=
[
group_input1
,
group_input2
]
trg_embedding
=
paddle
.
layer
.
embedding
(
input
=
paddle
.
layer
.
data
(
name
=
'target_language_word'
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
target_dict_dim
)),
size
=
word_vector_dim
,
param_attr
=
paddle
.
attr
.
ParamAttr
(
name
=
'_target_language_embedding'
))
group_inputs
.
append
(
trg_embedding
)
# For decoder equipped with attention mechanism, in training,
# target embeding (the groudtruth) is the data input,
# while encoded source sequence is accessed to as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder
=
paddle
.
layer
.
recurrent_group
(
name
=
decoder_group_name
,
step
=
gru_decoder_with_attention
,
input
=
group_inputs
)
lbl
=
paddle
.
layer
.
data
(
name
=
'target_language_next_word'
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
target_dict_dim
))
cost
=
paddle
.
layer
.
classification_cost
(
input
=
decoder
,
label
=
lbl
)
return
cost
def
main
():
paddle
.
init
(
use_gpu
=
False
,
trainer_count
=
1
)
# source and target dict dim.
dict_size
=
30000
source_dict_dim
=
target_dict_dim
=
dict_size
# define network topology
cost
=
seqToseq_net
_v2
(
source_dict_dim
,
target_dict_dim
)
cost
=
seqToseq_net
(
source_dict_dim
,
target_dict_dim
)
parameters
=
paddle
.
parameters
.
create
(
cost
)
# define optimize method and trainer
...
...
@@ -88,7 +118,7 @@ def main():
wmt14_reader
=
paddle
.
batch
(
paddle
.
reader
.
shuffle
(
train_reader
(
"data/pre-wmt14/train/train"
),
buf_size
=
8192
),
paddle
.
dataset
.
wmt14
.
train
(
dict_size
=
dict_size
),
buf_size
=
8192
),
batch_size
=
5
)
# define event_handler callback
...
...
demo/seqToseq/seqToseq_net_v2.py
已删除
100644 → 0
浏览文件 @
ca62c104
import
paddle.v2
as
paddle
def
seqToseq_net_v2
(
source_dict_dim
,
target_dict_dim
):
### Network Architecture
word_vector_dim
=
512
# dimension of word vector
decoder_size
=
512
# dimension of hidden unit in GRU Decoder network
encoder_size
=
512
# dimension of hidden unit in GRU Encoder network
#### Encoder
src_word_id
=
paddle
.
layer
.
data
(
name
=
'source_language_word'
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
source_dict_dim
))
src_embedding
=
paddle
.
layer
.
embedding
(
input
=
src_word_id
,
size
=
word_vector_dim
,
param_attr
=
paddle
.
attr
.
ParamAttr
(
name
=
'_source_language_embedding'
))
src_forward
=
paddle
.
networks
.
simple_gru
(
input
=
src_embedding
,
size
=
encoder_size
)
src_backward
=
paddle
.
networks
.
simple_gru
(
input
=
src_embedding
,
size
=
encoder_size
,
reverse
=
True
)
encoded_vector
=
paddle
.
layer
.
concat
(
input
=
[
src_forward
,
src_backward
])
#### Decoder
with
paddle
.
layer
.
mixed
(
size
=
decoder_size
)
as
encoded_proj
:
encoded_proj
+=
paddle
.
layer
.
full_matrix_projection
(
input
=
encoded_vector
)
backward_first
=
paddle
.
layer
.
first_seq
(
input
=
src_backward
)
with
paddle
.
layer
.
mixed
(
size
=
decoder_size
,
act
=
paddle
.
activation
.
Tanh
())
as
decoder_boot
:
decoder_boot
+=
paddle
.
layer
.
full_matrix_projection
(
input
=
backward_first
)
def
gru_decoder_with_attention
(
enc_vec
,
enc_proj
,
current_word
):
decoder_mem
=
paddle
.
layer
.
memory
(
name
=
'gru_decoder'
,
size
=
decoder_size
,
boot_layer
=
decoder_boot
)
context
=
paddle
.
networks
.
simple_attention
(
encoded_sequence
=
enc_vec
,
encoded_proj
=
enc_proj
,
decoder_state
=
decoder_mem
)
with
paddle
.
layer
.
mixed
(
size
=
decoder_size
*
3
)
as
decoder_inputs
:
decoder_inputs
+=
paddle
.
layer
.
full_matrix_projection
(
input
=
context
)
decoder_inputs
+=
paddle
.
layer
.
full_matrix_projection
(
input
=
current_word
)
gru_step
=
paddle
.
layer
.
gru_step
(
name
=
'gru_decoder'
,
input
=
decoder_inputs
,
output_mem
=
decoder_mem
,
size
=
decoder_size
)
with
paddle
.
layer
.
mixed
(
size
=
target_dict_dim
,
bias_attr
=
True
,
act
=
paddle
.
activation
.
Softmax
())
as
out
:
out
+=
paddle
.
layer
.
full_matrix_projection
(
input
=
gru_step
)
return
out
decoder_group_name
=
"decoder_group"
group_input1
=
paddle
.
layer
.
StaticInputV2
(
input
=
encoded_vector
,
is_seq
=
True
)
group_input2
=
paddle
.
layer
.
StaticInputV2
(
input
=
encoded_proj
,
is_seq
=
True
)
group_inputs
=
[
group_input1
,
group_input2
]
trg_embedding
=
paddle
.
layer
.
embedding
(
input
=
paddle
.
layer
.
data
(
name
=
'target_language_word'
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
target_dict_dim
)),
size
=
word_vector_dim
,
param_attr
=
paddle
.
attr
.
ParamAttr
(
name
=
'_target_language_embedding'
))
group_inputs
.
append
(
trg_embedding
)
# For decoder equipped with attention mechanism, in training,
# target embeding (the groudtruth) is the data input,
# while encoded source sequence is accessed to as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder
=
paddle
.
layer
.
recurrent_group
(
name
=
decoder_group_name
,
step
=
gru_decoder_with_attention
,
input
=
group_inputs
)
lbl
=
paddle
.
layer
.
data
(
name
=
'target_language_next_word'
,
type
=
paddle
.
data_type
.
integer_value_sequence
(
target_dict_dim
))
cost
=
paddle
.
layer
.
classification_cost
(
input
=
decoder
,
label
=
lbl
)
return
cost
python/paddle/v2/dataset/wmt14.py
浏览文件 @
05541295
...
...
@@ -14,129 +14,92 @@
"""
wmt14 dataset
"""
import
paddle.v2.dataset.common
import
tarfile
import
os.path
import
itertools
import
paddle.v2.dataset.common
__all__
=
[
'train'
,
'test'
,
'build_dict'
]
URL_DEV_TEST
=
'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
MD5_DEV_TEST
=
'7d7897317ddd8ba0ae5c5fa7248d3ff5'
URL_TRAIN
=
'http://localhost:8000/train.tgz'
MD5_TRAIN
=
'72de99da2830ea5a3a2c4eb36092bbc7'
def
word_count
(
f
,
word_freq
=
None
):
add
=
paddle
.
v2
.
dataset
.
common
.
dict_add
if
word_freq
==
None
:
word_freq
=
{}
for
l
in
f
:
for
w
in
l
.
strip
().
split
():
add
(
word_freq
,
w
)
add
(
word_freq
,
'<s>'
)
add
(
word_freq
,
'<e>'
)
return
word_freq
def
get_word_dix
(
word_freq
):
TYPO_FREQ
=
50
word_freq
=
filter
(
lambda
x
:
x
[
1
]
>
TYPO_FREQ
,
word_freq
.
items
())
word_freq_sorted
=
sorted
(
word_freq
,
key
=
lambda
x
:
(
-
x
[
1
],
x
[
0
]))
words
,
_
=
list
(
zip
(
*
word_freq_sorted
))
word_idx
=
dict
(
zip
(
words
,
xrange
(
len
(
words
))))
word_idx
[
'<unk>'
]
=
len
(
words
)
return
word_idx
def
get_word_freq
(
train
,
dev
):
word_freq
=
word_count
(
train
,
word_count
(
dev
))
if
'<unk>'
in
word_freq
:
# remove <unk> for now, since we will set it as last index
del
word_freq
[
'<unk>'
]
return
word_freq
def
build_dict
():
base_dir
=
'./wmt14-data'
train_en_filename
=
base_dir
+
'/train/train.en'
train_fr_filename
=
base_dir
+
'/train/train.fr'
dev_en_filename
=
base_dir
+
'/dev/ntst1213.en'
dev_fr_filename
=
base_dir
+
'/dev/ntst1213.fr'
if
not
os
.
path
.
exists
(
train_en_filename
)
or
not
os
.
path
.
exists
(
train_fr_filename
):
with
tarfile
.
open
(
paddle
.
v2
.
dataset
.
common
.
download
(
URL_TRAIN
,
'wmt14'
,
MD5_TRAIN
))
as
tf
:
tf
.
extractall
(
base_dir
)
if
not
os
.
path
.
exists
(
dev_en_filename
)
or
not
os
.
path
.
exists
(
dev_fr_filename
):
with
tarfile
.
open
(
paddle
.
v2
.
dataset
.
common
.
download
(
URL_DEV_TEST
,
'wmt14'
,
MD5_DEV_TEST
))
as
tf
:
tf
.
extractall
(
base_dir
)
f_en
=
open
(
train_en_filename
)
f_fr
=
open
(
train_fr_filename
)
f_en_dev
=
open
(
dev_en_filename
)
f_fr_dev
=
open
(
dev_fr_filename
)
word_freq_en
=
get_word_freq
(
f_en
,
f_en_dev
)
word_freq_fr
=
get_word_freq
(
f_fr
,
f_fr_dev
)
f_en
.
close
()
f_fr
.
close
()
f_en_dev
.
close
()
f_fr_dev
.
close
()
return
get_word_dix
(
word_freq_en
),
get_word_dix
(
word_freq_fr
)
# this is a small set of data for test. The original data is too large and will be add later.
URL_TRAIN
=
'http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
MD5_TRAIN
=
'a755315dd01c2c35bde29a744ede23a6'
START
=
"<s>"
END
=
"<e>"
UNK
=
"<unk>"
UNK_IDX
=
2
def
__read_to_dict__
(
tar_file
,
dict_size
):
def
__to_dict__
(
fd
,
size
):
out_dict
=
dict
()
for
line_count
,
line
in
enumerate
(
fd
):
if
line_count
<
size
:
out_dict
[
line
.
strip
()]
=
line_count
else
:
break
return
out_dict
with
tarfile
.
open
(
tar_file
,
mode
=
'r'
)
as
f
:
names
=
[
each_item
.
name
for
each_item
in
f
if
each_item
.
name
.
endswith
(
"src.dict"
)
]
assert
len
(
names
)
==
1
src_dict
=
__to_dict__
(
f
.
extractfile
(
names
[
0
]),
dict_size
)
names
=
[
each_item
.
name
for
each_item
in
f
if
each_item
.
name
.
endswith
(
"trg.dict"
)
]
assert
len
(
names
)
==
1
trg_dict
=
__to_dict__
(
f
.
extractfile
(
names
[
0
]),
dict_size
)
return
src_dict
,
trg_dict
def
reader_creator
(
directory
,
path_en
,
path_fr
,
URL
,
MD5
,
dict_en
,
dict_fr
):
def
reader_creator
(
tar_file
,
file_name
,
dict_size
):
def
reader
():
if
not
os
.
path
.
exists
(
path_en
)
or
not
os
.
path
.
exists
(
path_fr
):
with
tarfile
.
open
(
paddle
.
v2
.
dataset
.
common
.
download
(
URL
,
'wmt14'
,
MD5
))
as
tf
:
tf
.
extractall
(
directory
)
f_en
=
open
(
path_en
)
f_fr
=
open
(
path_fr
)
UNK_en
=
dict_en
[
'<unk>'
]
UNK_fr
=
dict_fr
[
'<unk>'
]
for
en
,
fr
in
itertools
.
izip
(
f_en
,
f_fr
):
src_ids
=
[
dict_en
.
get
(
w
,
UNK_en
)
for
w
in
en
.
strip
().
split
()]
tar_ids
=
[
dict_fr
.
get
(
w
,
UNK_fr
)
for
w
in
[
'<s>'
]
+
fr
.
strip
().
split
()
+
[
'<e>'
]
src_dict
,
trg_dict
=
__read_to_dict__
(
tar_file
,
dict_size
)
with
tarfile
.
open
(
tar_file
,
mode
=
'r'
)
as
f
:
names
=
[
each_item
.
name
for
each_item
in
f
if
each_item
.
name
.
endswith
(
file_name
)
]
for
name
in
names
:
for
line
in
f
.
extractfile
(
name
):
line_split
=
line
.
strip
().
split
(
'
\t
'
)
if
len
(
line_split
)
!=
2
:
continue
src_seq
=
line_split
[
0
]
# one source sequence
src_words
=
src_seq
.
split
()
src_ids
=
[
src_dict
.
get
(
w
,
UNK_IDX
)
for
w
in
[
START
]
+
src_words
+
[
END
]
]
trg_seq
=
line_split
[
1
]
# one target sequence
trg_words
=
trg_seq
.
split
()
trg_ids
=
[
trg_dict
.
get
(
w
,
UNK_IDX
)
for
w
in
trg_words
]
# remove sequence whose length > 80 in training mode
if
len
(
src_ids
)
==
0
or
len
(
tar_ids
)
<=
1
or
len
(
src_ids
)
>
80
or
len
(
tar_ids
)
>
80
:
if
len
(
src_ids
)
>
80
or
len
(
trg_ids
)
>
80
:
continue
trg_ids_next
=
trg_ids
+
[
trg_dict
[
END
]]
trg_ids
=
[
trg_dict
[
START
]]
+
trg_ids
yield
src_ids
,
tar_ids
[:
-
1
],
tar_ids
[
1
:]
f_en
.
close
()
f_fr
.
close
()
yield
src_ids
,
trg_ids
,
trg_ids_next
return
reader
def
train
(
dict_en
,
dict_fr
):
directory
=
'./wmt14-data'
return
reader_creator
(
directory
,
directory
+
'/train/train.en'
,
directory
+
'/train/train.fr'
,
URL_TRAIN
,
MD5_TRAIN
,
dict_en
,
dict_fr
)
def
train
(
dict_size
):
return
reader_creator
(
paddle
.
v2
.
dataset
.
common
.
download
(
URL_TRAIN
,
'wmt14'
,
MD5_TRAIN
),
'train/train'
,
dict_size
)
def
test
(
dict_en
,
dict_fr
):
directory
=
'./wmt14-data'
return
reader_creator
(
directory
,
directory
+
'/dev/ntst1213.en'
,
directory
+
'/dev/ntst1213.fr'
,
URL_DEV_TEST
,
MD5_DEV_TEST
,
dict_en
,
dict_fr
)
def
test
(
dict_size
):
return
reader_creator
(
paddle
.
v2
.
dataset
.
common
.
download
(
URL_TRAIN
,
'wmt14'
,
MD5_TRAIN
),
'test/test'
,
dict_size
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录