PaddlePaddle / models
Commit d30a28c7
Authored Jul 25, 2017 by caoying03
proj init.
Parent: d42adaca

Showing 6 changed files with 544 additions and 0 deletions (+544 -0)
globally_normalized_reader/.gitignore        +3   -0
globally_normalized_reader/basic_modules.py  +135 -0
globally_normalized_reader/config.py         +25  -0
globally_normalized_reader/model.py          +199 -0
globally_normalized_reader/reader.py         +43  -0
globally_normalized_reader/train.py          +139 -0
globally_normalized_reader/.gitignore
0 → 100644
data
*.txt
*.pyc
globally_normalized_reader/basic_modules.py
0 → 100755
#!/usr/bin/env python
#coding=utf-8

import pdb
import collections

import paddle.v2 as paddle
from paddle.v2.layer import parse_network

__all__ = [
    "stacked_bidirectional_lstm",
    "lstm_by_nested_sequence",
]


def stacked_bidirectional_lstm(inputs, size, depth, drop_rate=0., prefix=""):
    if not isinstance(inputs, collections.Sequence):
        inputs = [inputs]

    lstm_last = []
    for dirt in ["fwd", "bwd"]:
        for i in range(depth):
            input_proj = paddle.layer.mixed(
                name="%s_in_proj_%0d_%s__" % (prefix, i, dirt),
                size=size * 4,
                bias_attr=paddle.attr.Param(initial_std=0.),
                input=[paddle.layer.full_matrix_projection(lstm)] if i else [
                    paddle.layer.full_matrix_projection(in_layer)
                    for in_layer in inputs
                ])
            lstm = paddle.layer.lstmemory(
                input=input_proj,
                bias_attr=paddle.attr.Param(initial_std=0.),
                param_attr=paddle.attr.Param(initial_std=5e-4),
                reverse=(dirt == "bwd"))
        lstm_last.append(lstm)

    final_states = paddle.layer.concat(input=[
        paddle.layer.last_seq(input=lstm_last[0]),
        paddle.layer.first_seq(input=lstm_last[1]),
    ])

    return final_states, paddle.layer.concat(
        input=lstm_last,
        layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=drop_rate),
    )


def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
    '''
    This is an LSTM implemented with a nested recurrent_group.

    A paragraph is a naturally nested sequence:
    1. each paragraph is a sequence of sentences;
    2. each sentence is a sequence of words.

    This function uses the nested recurrent_group to implement the LSTM:
    1. The outer group iterates over the sentences in a paragraph.
    2. The inner group iterates over the words in a sentence.
    3. An LSTM is used to encode each sentence, and its final output is used
       to initialize the memory of the LSTM that encodes the next sentence.
    4. Parameters are shared among these sentence-encoding LSTMs.
    5. Consequently, this function is equivalent to concatenating all the
       sentences in a paragraph into one (long) sentence and encoding this
       new long sentence with a single LSTM.
    '''

    def lstm_outer_step(lstm_group_input, hidden_dim, reverse, name=''):
        outer_memory = paddle.layer.memory(
            name="__inner_%s_last__" % name, size=hidden_dim)

        def lstm_inner_step(input_layer, hidden_dim, reverse, name):
            inner_memory = paddle.layer.memory(
                name="__inner_state_%s__" % name,
                size=hidden_dim,
                boot_layer=outer_memory)
            input_proj = paddle.layer.fc(
                size=hidden_dim * 4, bias_attr=False, input=input_layer)

            return paddle.networks.lstmemory_unit(
                input=input_proj,
                name="__inner_state_%s__" % name,
                out_memory=inner_memory,
                size=hidden_dim,
                act=paddle.activation.Tanh(),
                gate_act=paddle.activation.Sigmoid(),
                state_act=paddle.activation.Tanh())

        inner_out = paddle.layer.recurrent_group(
            name="__inner_%s__" % name,
            step=lstm_inner_step,
            reverse=reverse,
            input=[lstm_group_input, hidden_dim, reverse, name])

        if reverse:
            inner_last_output = paddle.layer.first_seq(
                input=inner_out,
                name="__inner_%s_last__" % name,
                agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
        else:
            inner_last_output = paddle.layer.last_seq(
                input=inner_out,
                name="__inner_%s_last__" % name,
                agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
        return inner_out

    return paddle.layer.recurrent_group(
        input=[
            paddle.layer.SubsequenceInput(input_layer), hidden_dim, reverse,
            name
        ],
        step=lstm_outer_step,
        name="__outter_%s__" % name,
        reverse=reverse)


def stacked_bi_lstm_by_nested_seq(input_layer, depth, hidden_dim, prefix=""):
    lstm_final_outs = []
    for dirt in ["fwd", "bwd"]:
        for i in range(depth):
            lstm_out = lstm_by_nested_sequence(
                input_layer=(lstm_out if i else input_layer),
                hidden_dim=hidden_dim,
                name="__%s_%s_%02d__" % (prefix, dirt, i),
                reverse=(dirt == "bwd"))
        lstm_final_outs.append(lstm_out)
    return paddle.layer.concat(input=lstm_final_outs)


if __name__ == "__main__":
    vocab_size = 1024
    emb_dim = 128
    embedding = paddle.layer.embedding(
        input=paddle.layer.data(
            name="word",
            type=paddle.data_type.integer_value_sub_sequence(vocab_size)),
        size=emb_dim)

    print(parse_network(
        stacked_bi_lstm_by_nested_seq(
            input_layer=embedding, depth=3, hidden_dim=128, prefix="__lstm")))
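The docstring of `lstm_by_nested_sequence` claims that booting each sentence's LSTM from the previous sentence's final state makes the nested pass equivalent to running one LSTM over all the sentences concatenated. Below is a minimal NumPy sketch of that claim with a hand-rolled toy cell (`lstm_step` is hypothetical and unrelated to Paddle's implementation): as long as no state is reset at sentence boundaries, the per-sentence loop and the flat loop end in identical states.

import numpy as np

def lstm_step(x, h, c, W, U, b):
    """One step of a toy LSTM cell (illustrative only)."""
    hidden = h.shape[0]
    z = W.dot(x) + U.dot(h) + b
    i = 1. / (1. + np.exp(-z[:hidden]))
    f = 1. / (1. + np.exp(-z[hidden:2 * hidden]))
    o = 1. / (1. + np.exp(-z[2 * hidden:3 * hidden]))
    g = np.tanh(z[3 * hidden:])
    c = f * c + i * g
    h = o * np.tanh(c)
    return h, c

rng = np.random.RandomState(0)
emb_dim, hidden = 4, 3
W = rng.randn(4 * hidden, emb_dim)
U = rng.randn(4 * hidden, hidden)
b = rng.randn(4 * hidden)

# a "paragraph": three sentences of different lengths
sentences = [rng.randn(n, emb_dim) for n in (5, 2, 7)]

# 1) one LSTM over the concatenation of all sentences
h = c = np.zeros(hidden)
for x in np.concatenate(sentences):
    h, c = lstm_step(x, h, c, W, U, b)
flat_final = h

# 2) one loop per sentence, carrying the state across sentence boundaries
#    (the role the outer recurrent_group's memory plays above)
h = c = np.zeros(hidden)
for sent in sentences:
    for x in sent:
        h, c = lstm_step(x, h, c, W, U, b)
nested_final = h

print(np.allclose(flat_final, nested_final))  # True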
globally_normalized_reader/config.py
0 → 100644
#!/usr/bin/env python
#coding=utf-8

__all__ = ["ModelConfig"]


class ModelConfig(object):
    beam_size = 3

    vocab_size = 102400
    embedding_dim = 256
    embedding_droprate = 0.3

    lstm_depth = 3
    lstm_hidden_dim = 300
    lstm_hidden_droprate = 0.3

    passage_indep_embedding_dim = 300
    passage_aligned_embedding_dim = 128

    beam_size = 5


class TrainerConfig(object):
    learning_rate = 1e-3

    data_dir = "data/featurized"
globally_normalized_reader/model.py
0 → 100755
#!/usr/bin/env python
#coding=utf-8

import pdb

import paddle.v2 as paddle
from paddle.v2.layer import parse_network

import basic_modules
from config import ModelConfig

__all__ = ["GNR"]


def build_pretrained_embedding(name, data_type, vocab_size, emb_dim,
                               emb_drop=0.):
    one_hot_input = paddle.layer.data(
        name=name, type=paddle.data_type.integer_value_sequence(vocab_size))
    return paddle.layer.embedding(
        input=one_hot_input,
        size=emb_dim,
        param_attr=paddle.attr.Param(
            name="GloveVectors", is_static=True),
        layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=emb_drop),
    )


def encode_question(input_embedding, config, prefix):
    lstm_final, lstm_outs = basic_modules.stacked_bidirectional_lstm(
        inputs=input_embedding,
        size=config.lstm_hidden_dim,
        depth=config.lstm_depth,
        drop_rate=config.lstm_hidden_droprate,
        prefix=prefix)

    # passage-independent embeddings
    candidates = paddle.layer.fc(
        input=lstm_outs,
        bias_attr=False,
        size=config.passage_indep_embedding_dim,
        act=paddle.activation.Linear())
    weights = paddle.layer.fc(
        input=lstm_outs, size=1, act=paddle.activation.SequenceSoftmax())
    weighted_candidates = paddle.layer.scaling(
        input=candidates, weight=weights)
    passage_indep_embedding = paddle.layer.pooling(
        input=weighted_candidates, pooling_type=paddle.pooling.Sum())

    return paddle.layer.concat(
        input=[lstm_final, passage_indep_embedding]), lstm_outs


def question_aligned_passage_embedding(question_lstm_outs, document_embeddings,
                                       config):
    def outer_sentence_step(document_embeddings, question_lstm_outs, config):
        '''
        Inside this recurrent_group, document_embeddings has been scattered
        into a sequence of sentence embeddings.
        '''

        def inner_word_step(word_embedding, question_lstm_outs,
                            question_outs_proj, config):
            '''
            Inside this recurrent_group, the sentence embedding has been
            scattered into word embeddings.
            '''
            doc_word_expand = paddle.layer.expand(
                input=word_embedding,
                expand_as=question_lstm_outs,
                expand_level=paddle.layer.ExpandLevel.FROM_NO_SEQUENCE)

            weights = paddle.layer.fc(
                input=[question_lstm_outs, doc_word_expand],
                size=1,
                act=paddle.activation.SequenceSoftmax())
            weighted_candidates = paddle.layer.scaling(
                input=question_outs_proj, weight=weights)
            return paddle.layer.pooling(
                input=weighted_candidates, pooling_type=paddle.pooling.Sum())

        question_outs_proj = paddle.layer.fc(
            input=question_lstm_outs,
            bias_attr=False,
            size=config.passage_aligned_embedding_dim)
        return paddle.layer.recurrent_group(
            input=[
                paddle.layer.SubsequenceInput(document_embeddings),
                paddle.layer.StaticInput(question_lstm_outs),
                paddle.layer.StaticInput(question_outs_proj),
                config,
            ],
            step=inner_word_step,
            name="iter_over_word")

    return paddle.layer.recurrent_group(
        input=[
            paddle.layer.SubsequenceInput(document_embeddings),
            paddle.layer.StaticInput(question_lstm_outs), config
        ],
        step=outer_sentence_step,
        name="iter_over_sen")


def encode_documents(input_embedding, same_as_question, question_vector,
                     question_lstm_outs, config, prefix):
    question_expanded = paddle.layer.expand(
        input=question_vector,
        expand_as=input_embedding,
        expand_level=paddle.layer.ExpandLevel.FROM_NO_SEQUENCE)
    question_aligned_embedding = question_aligned_passage_embedding(
        question_lstm_outs, input_embedding, config)
    return paddle.layer.concat(input=[
        input_embedding, question_expanded, same_as_question,
        question_aligned_embedding
    ])


def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config):
    last_state_of_sentence = paddle.layer.last_seq(
        input=doc_lstm_outs, agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)

    # HERE do not use the sequence softmax activation.
    sentence_scores = paddle.layer.fc(
        input=last_state_of_sentence, size=1, act=paddle.activation.Exp())
    topk_sentence_ids = paddle.layer.kmax_sequence_score(
        input=sentence_scores, beam_size=config.beam_size)
    topk_sen = paddle.layer.sub_nested_seq(
        input=last_state_of_sentence, selected_indices=topk_sentence_ids)

    # expand beam to search start positions on selected sentences
    start_pos_scores = paddle.layer.fc(
        input=topk_sen, size=1, act=paddle.activation.Exp())
    topk_start_pos_ids = paddle.layer.kmax_sequence_score(
        input=sentence_scores, beam_size=config.beam_size)
    topk_start_spans = paddle.layer.seq_slice(
        input=topk_sen, starts=topk_start_pos_ids, ends=None)

    # expand beam to search end positions on selected start spans
    _, end_span_embedding = basic_modules.stacked_bidirectional_lstm(
        inputs=topk_start_spans,
        size=config.lstm_hidden_dim,
        depth=config.lstm_depth,
        drop_rate=config.lstm_hidden_droprate,
        prefix="__end_span_embeddings__")
    end_pos_scores = paddle.layer.fc(
        input=end_span_embedding, size=1, act=paddle.activation.Exp())
    topk_end_pos_ids = paddle.layer.kmax_sequence_score(
        input=end_pos_scores, beam_size=config.beam_size)

    cost = paddle.layer.cross_entropy_over_beam(
        input=[
            sentence_scores, topk_sentence_ids, start_pos_scores,
            topk_start_pos_ids, end_pos_scores, topk_end_pos_ids
        ],
        label=[sentence_idx, start_idx, end_idx])
    return cost


def GNR(config):
    # encoding question words
    question_embeddings = build_pretrained_embedding(
        "question", paddle.data_type.integer_value_sequence, config.vocab_size,
        config.embedding_dim, config.embedding_droprate)
    question_vector, question_lstm_outs = encode_question(
        input_embedding=question_embeddings, config=config, prefix="__ques")

    # encoding document words
    document_embeddings = build_pretrained_embedding(
        "documents", paddle.data_type.integer_value_sub_sequence,
        config.vocab_size, config.embedding_dim, config.embedding_droprate)
    same_as_question = paddle.layer.data(
        name="same_as_question",
        type=paddle.data_type.integer_value_sub_sequence(2))

    document_words_ecoding = encode_documents(
        input_embedding=document_embeddings,
        question_vector=question_vector,
        question_lstm_outs=question_lstm_outs,
        same_as_question=same_as_question,
        config=config,
        prefix="__doc")

    doc_lstm_outs = basic_modules.stacked_bi_lstm_by_nested_seq(
        input_layer=document_words_ecoding,
        hidden_dim=config.lstm_hidden_dim,
        depth=config.lstm_depth,
        prefix="__doc_lstm")

    # define labels
    sentence_idx = paddle.layer.data(
        name="sen_idx", type=paddle.data_type.integer_value(1))
    start_idx = paddle.layer.data(
        name="start_idx", type=paddle.data_type.integer_value(1))
    end_idx = paddle.layer.data(
        name="end_idx", type=paddle.data_type.integer_value(1))

    return search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx,
                         config)


if __name__ == "__main__":
    print(parse_network(GNR(ModelConfig)))
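`search_answer` factorizes answer search into three chained k-max selections: score whole sentences and keep the top `beam_size`, score start positions inside the kept sentences, then score end positions on the span that follows each kept start. The NumPy sketch below walks through that decomposition on toy scores; the additive scoring and the helper arrays are purely illustrative assumptions, not the globally normalized objective that `cross_entropy_over_beam` trains above.

import numpy as np

rng = np.random.RandomState(1)
beam_size = 2

# toy document: 4 sentences with 6, 3, 5, 4 words
sent_lens = [6, 3, 5, 4]
word_scores = [rng.rand(n) for n in sent_lens]                # toy per-word scores
sentence_scores = np.array([s.mean() for s in word_scores])   # toy per-sentence scores

# stage 1: keep the top `beam_size` sentences (kmax over sentence scores)
top_sents = np.argsort(-sentence_scores)[:beam_size]

candidates = []
for s in top_sents:
    # stage 2: keep the top `beam_size` start positions inside this sentence
    starts = np.argsort(-word_scores[s])[:beam_size]
    for start in starts:
        # stage 3: keep the top `beam_size` end positions at or after the start
        # (the Paddle code slices the sentence from the start with seq_slice)
        tail = word_scores[s][start:]
        ends = start + np.argsort(-tail)[:beam_size]
        for end in ends:
            score = sentence_scores[s] + word_scores[s][start] + word_scores[s][end]
            candidates.append(((s, start, end), score))

# the surviving (sentence, start, end) spans, best first
for span, score in sorted(candidates, key=lambda t: -t[1])[:5]:
    print(span, round(score, 3))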
globally_normalized_reader/reader.py
0 → 100755
#!/usr/bin/env python
#coding=utf-8

import pdb
import os
import random
import json


def train_reader(data_list, is_train=True):
    def reader():
        # shuffle the data list again on every pass
        if is_train:
            random.shuffle(data_list)

        for train_sample in data_list:
            data = json.load(open(train_sample, "r"))

            sent_len = data['sent_lengths']
            doc_len = len(data['context'])

            same_as_question_word = [
                [[x]] for x in data['same_as_question_word']
            ]

            ans_sentence = [0] * doc_len
            ans_sentence[data['ans_sentence']] = 1

            ans_start = [0] * doc_len
            ans_start[data['ans_start']] = 1

            ans_end = [0] * doc_len
            ans_end[data['ans_end']] = 1

            yield (data['question'], data['context'], same_as_question_word,
                   ans_sentence, ans_start, ans_end)

    return reader


if __name__ == "__main__":
    from train import choose_samples

    train_list, dev_list = choose_samples("data/featurized")
    for i, item in enumerate(train_reader(train_list)()):
        print(item)
        if i > 5:
            break
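`train_reader` expects one JSON file per sample containing at least the keys read above (`question`, `context`, `same_as_question_word`, `sent_lengths`, `ans_sentence`, `ans_start`, `ans_end`). A toy sample and the tuple the reader yields for it are sketched below; every value is made up only so the reader's indexing goes through, since the real featurization step is not part of this commit, and the snippet assumes it is run next to reader.py.

import json
import tempfile

import reader  # the module above

# a made-up featurized sample: a 2-sentence context with 4 + 3 word ids
sample = {
    "question": [12, 7, 45],                  # word ids of the question
    "context": [[3, 9, 27, 4], [8, 31, 5]],   # word ids, one list per sentence
    "same_as_question_word": [0, 1, 0, 0, 0, 0, 1],
    "sent_lengths": [4, 3],
    "ans_sentence": 1,
    "ans_start": 0,
    "ans_end": 1,
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(sample, f)
    path = f.name

question, context, same_as_q, ans_sen, ans_start, ans_end = next(
    reader.train_reader([path], is_train=False)())
print(ans_sen)    # [0, 1] -- one-hot over the document
print(ans_start)  # [1, 0]
print(ans_end)    # [0, 1]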
globally_normalized_reader/train.py
0 → 100755
#!/usr/bin/env python
#coding=utf-8
from __future__ import print_function

import pdb
import os
import sys
import logging
import random
import glob
import gzip

import reader

import paddle.v2 as paddle
from paddle.v2.layer import parse_network

from model import GNR
from config import ModelConfig, TrainerConfig

logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)


def load_pretrained_parameters(path, height, width):
    return


def save_model(save_path, parameters):
    with gzip.open(save_path, "w") as f:
        parameters.to_tar(f)


def load_initial_model(model_path, parameters):
    with gzip.open(model_path, "rb") as f:
        parameters.init_from_tar(f)


def choose_samples(path):
    """
    Load filenames for train, dev, and augmented samples.
    """
    if not os.path.exists(os.path.join(path, "train")):
        print(
            "Non-existent directory as input path: {}".format(path),
            file=sys.stderr)
        sys.exit(1)

    # Get paths to all samples that we want to load.
    train_samples = glob.glob(os.path.join(path, "train", "*"))
    valid_samples = glob.glob(os.path.join(path, "dev", "*"))

    train_samples.sort()
    valid_samples.sort()

    random.shuffle(train_samples)

    return train_samples, valid_samples


def build_reader(data_dir):
    """
    Build the data reader for this model.
    """
    train_samples, valid_samples = choose_samples(data_dir)
    pdb.set_trace()

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.train_reader(train_samples), buf_size=102400),
        batch_size=config.batch_size)

    # testing data is not shuffled
    test_reader = paddle.batch(
        reader.train_reader(
            valid_samples, is_train=False),
        batch_size=config.batch_size)
    return train_reader, test_reader


def build_event_handler(config, parameters, trainer, test_reader):
    """
    Build the event handler for this model.
    """

    # End batch and end pass event handler
    def event_handler(event):
        """The event handler."""
        if isinstance(event, paddle.event.EndIteration):
            if (not event.batch_id % 100) and event.batch_id:
                save_model("checkpoint_param.latest.tar.gz", parameters)

            if not event.batch_id % 5:
                logger.info("Pass %d, Batch %d, Cost %f, %s" %
                            (event.pass_id, event.batch_id, event.cost,
                             event.metrics))

        if isinstance(event, paddle.event.EndPass):
            save_model(config.param_save_filename_format % event.pass_id,
                       parameters)
            with gzip.open(param_path, 'w') as handle:
                parameters.to_tar(handle)

            result = trainer.test(reader=test_reader)
            logger.info("Test with Pass %d, %s" %
                        (event.pass_id, result.metrics))

    return event_handler


def train(model_config, trainer_config):
    paddle.init(use_gpu=True, trainer_count=1)

    # define the optimizer
    optimizer = paddle.optimizer.Adam(
        learning_rate=trainer_config.learning_rate,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3),
        model_average=paddle.optimizer.ModelAverage(average_window=0.5))

    # define network topology
    losses = GNR(model_config)

    parameters = paddle.parameters.create(losses)
    # print(parse_network(losses))

    trainer = paddle.trainer.SGD(
        cost=losses, parameters=parameters, update_equation=optimizer)
    """
    parameters.set('GloveVectors',
                   load_pretrained_parameters(parameter_path, height, width))
    """

    # define data reader
    train_reader, test_reader = build_reader(trainer_config.data_dir)

    event_handler = build_event_handler(conf, parameters, trainer, test_reader)
    trainer.train(
        reader=train_reader,
        num_passes=conf.epochs,
        event_handler=event_handler)


if __name__ == "__main__":
    train(ModelConfig, TrainerConfig)
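`load_pretrained_parameters` is still an empty stub and the `parameters.set('GloveVectors', ...)` call in `train` is commented out. One possible way to fill the stub is sketched below; it assumes a plain-text embedding file with one whitespace-separated `word v1 ... v_width` line per word, ordered by word id, which is an assumption about the data rather than anything defined in this commit.

import numpy as np

def load_pretrained_parameters(path, height, width):
    """Load a (height x width) embedding matrix from a text file.

    Assumes one whitespace-separated line per word, ordered by word id:
        word v1 v2 ... v_width
    Rows beyond `height` are ignored; missing rows stay zero.
    """
    embeddings = np.zeros((height, width), dtype="float32")
    with open(path) as f:
        for row, line in enumerate(f):
            if row >= height:
                break
            values = line.rstrip().split()
            embeddings[row] = np.asarray(values[1:width + 1], dtype="float32")
    return embeddings

With something like this in place, the commented-out `parameters.set('GloveVectors', ...)` block in `train` could be enabled once the embedding path and matrix shape are known.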