Commit 52c076c7 in PaddlePaddle/models
change download.sh

Authored Sep 27, 2018 by xuezhong
Parent: 1cf25cdb

Showing 12 changed files with 238 additions and 51477 deletions.
Changed files:
- fluid/machine_reading_comprehesion/DuReader/args.py (+4 -0)
- fluid/machine_reading_comprehesion/DuReader/data/demo/devset/search.dev.json (+0 -100)
- fluid/machine_reading_comprehesion/DuReader/data/demo/testset/search.test.json (+0 -100)
- fluid/machine_reading_comprehesion/DuReader/data/demo/trainset/search.train.json (+0 -100)
- fluid/machine_reading_comprehesion/DuReader/data/demo/vocab/vocab.data (+0 -50800)
- fluid/machine_reading_comprehesion/DuReader/data/download.sh (+1 -3)
- fluid/machine_reading_comprehesion/DuReader/data/md5sum.txt (+0 -1)
- fluid/machine_reading_comprehesion/DuReader/run.py (+36 -1)
- fluid/machine_reading_comprehesion/DuReader/run.sh (+1 -0)
- fluid/machine_reading_comprehesion/DuReader/run_demo.sh (+0 -18)
- fluid/machine_reading_comprehesion/DuReader/train.py (+0 -354)
- fluid/machine_reading_comprehesion/DuReader/vocab.py (+196 -0)
fluid/machine_reading_comprehesion/DuReader/args.py

```diff
@@ -22,6 +22,10 @@ import distutils.util
 
 def parse_args():
     parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        '--prepare',
+        action='store_true',
+        help='create the directories, prepare the vocabulary and embeddings')
     parser.add_argument(
         '--train',
         action='store_true',
```
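For reference, a tiny stand-alone sketch (not part of the commit) of the `store_true` pattern used by the new `--prepare` flag; the parser and values below are illustrative only:

```python
import argparse

# Illustrative parser mirroring the pattern above; not the project's real args.py.
parser = argparse.ArgumentParser(description='DuReader argument sketch')
parser.add_argument('--prepare', action='store_true',
                    help='create the directories, prepare the vocabulary and embeddings')

print(parser.parse_args(['--prepare']).prepare)  # True when the flag is given
print(parser.parse_args([]).prepare)             # False by default
```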
fluid/machine_reading_comprehesion/DuReader/data/demo/devset/search.dev.json
deleted (100644 → 0); diff collapsed

fluid/machine_reading_comprehesion/DuReader/data/demo/testset/search.test.json
deleted (100644 → 0); diff collapsed

fluid/machine_reading_comprehesion/DuReader/data/demo/trainset/search.train.json
deleted (100644 → 0); diff collapsed

fluid/machine_reading_comprehesion/DuReader/data/demo/vocab/vocab.data
deleted (100644 → 0); diff collapsed
fluid/machine_reading_comprehesion/DuReader/data/download.sh

```diff
@@ -20,12 +20,10 @@ if [[ -d preprocessed ]] && [[ -d raw ]]; then
     echo "data exist"
     exit 0
 else
-    wget -c https://aipedataset.cdn.bcebos.com/dureader/dureader_raw.zip
-    wget -c https://aipedataset.cdn.bcebos.com/dureader/dureader_preprocessed.zip
+    wget -c --no-check-certificate http://dureader.gz.bcebos.com/dureader_preprocessed.zip
 fi
 
 if md5sum --status -c md5sum.txt; then
-    unzip dureader_raw.zip
     unzip dureader_preprocessed.zip
 else
     echo "download data error!" >> /dev/stderr
```
fluid/machine_reading_comprehesion/DuReader/data/md5sum.txt

```diff
-50633b5e5fda12d86e825a5c738d0ca8  dureader_raw.zip
 7a4c28026f7dc94e8135d17203c63664  dureader_preprocessed.zip
```
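download.sh validates the archive with `md5sum --status -c md5sum.txt` before unzipping. Below is a rough Python equivalent of that check for the remaining entry, a sketch that assumes dureader_preprocessed.zip has already been downloaded into the current directory:

```python
import hashlib

# Expected checksum taken from md5sum.txt above; the file path is an assumption.
EXPECTED_MD5 = "7a4c28026f7dc94e8135d17203c63664"

def md5_of(path, chunk_size=1 << 20):
    # Stream the file in chunks so large archives do not need to fit in memory.
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

if md5_of("dureader_preprocessed.zip") != EXPECTED_MD5:
    raise SystemExit("download data error!")
```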
fluid/machine_reading_comprehesion/DuReader/run.py

@@ -41,7 +41,7 @@ import logging

```python
import pickle

from utils import normalize
from utils import compute_bleu_rouge
from vocab import Vocab


def prepare_batch_input(insts, args):
    doc_num = args.doc_num
```

@@ -437,6 +437,39 @@ def predict(logger, args):

```python
        inference_program, avg_cost, s_probs, e_probs, feed_order, place,
        vocab, brc_data, logger, args)


def prepare(logger, args):
    """
    checks data, creates the directories, prepare the vocabulary and embeddings
    """
    logger.info('Checking the data files...')
    for data_path in args.trainset + args.devset + args.testset:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)
    logger.info('Preparing the directories...')
    for dir_path in [args.vocab_dir, args.save_dir, args.result_dir]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    logger.info('Building vocabulary...')
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.trainset, args.devset, args.testset)
    vocab = Vocab(lower=True)
    for word in brc_data.word_iter('train'):
        vocab.add(word)

    unfiltered_vocab_size = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=2)
    filtered_num = unfiltered_vocab_size - vocab.size()
    logger.info('After filter {} tokens, the final vocab size is {}'.format(
        filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    vocab.randomly_init_embeddings(args.embed_size)

    logger.info('Saving vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'wb') as fout:
        pickle.dump(vocab, fout)

    logger.info('Done with preparing!')


if __name__ == '__main__':
    args = parse_args()
```

@@ -460,6 +493,8 @@ if __name__ == '__main__':

```python
    logger.addHandler(console_handler)
    args = parse_args()
    logger.info('Running with args : {}'.format(args))

    if args.prepare:
        prepare(logger, args)
    if args.train:
        train(logger, args)
    if args.evaluate:
```
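The new prepare() step pickles the built Vocab to <vocab_dir>/vocab.data, and the training path loads it back before building the model. A minimal sketch of that round trip, assuming vocab.py is importable and using a hypothetical data/vocab directory in place of --vocab_dir:

```python
import os
import pickle

from vocab import Vocab

# Hypothetical path for illustration; run.py takes this from --vocab_dir.
vocab_dir = 'data/vocab'
os.makedirs(vocab_dir, exist_ok=True)

# What prepare() does: build a Vocab and pickle it.
vocab = Vocab(lower=True)
for word in ['question', 'answer', 'passage']:   # assumed toy tokens
    vocab.add(word)
with open(os.path.join(vocab_dir, 'vocab.data'), 'wb') as fout:
    pickle.dump(vocab, fout)

# What the training entry point does: load it back before building the model.
with open(os.path.join(vocab_dir, 'vocab.data'), 'rb') as fin:
    vocab = pickle.load(fin)
print(vocab.size())
```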
fluid/machine_reading_comprehesion/DuReader/run.sh

```diff
+export CUDA_VISIBLE_DEVICES=1
 python run.py \
     --trainset 'data/preprocessed/trainset/search.train.json' \
     'data/preprocessed/trainset/zhidao.train.json' \
```
fluid/machine_reading_comprehesion/DuReader/run_demo.sh
deleted (100644 → 0)

```bash
python run.py \
    --trainset 'data/demo/trainset/search.train.json' \
    --devset 'data/demo/devset/search.dev.json' \
    --testset 'data/demo/testset/search.test.json' \
    --vocab_dir 'data/demo/vocab' \
    --use_gpu true \
    --save_dir ./models \
    --pass_num 10 \
    --learning_rate 0.001 \
    --batch_size 8 \
    --embed_size 300 \
    --hidden_size 150 \
    --max_p_num 5 \
    --max_p_len 500 \
    --max_q_len 60 \
    --max_a_len 200 \
    --drop_rate 0.2 \
    --log_interval 1  $@ \
```
fluid/machine_reading_comprehesion/DuReader/train.py
deleted (100644 → 0)

```python
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import time
import os
import random

import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
from paddle.fluid.executor import Executor

import sys
if sys.version[0] == '2':
    reload(sys)
    sys.setdefaultencoding("utf-8")
sys.path.append('..')

from args import *
import rc_model
from dataset import BRCDataset
import logging
import pickle
from utils import normalize
from utils import compute_bleu_rouge


def prepare_batch_input(insts, args):
    doc_num = args.doc_num
    batch_size = len(insts['raw_data'])
    new_insts = []

    for i in range(batch_size):
        p_id = []
        q_id = []
        p_ids = []
        q_ids = []
        p_len = 0
        for j in range(i * doc_num, (i + 1) * doc_num):
            p_ids.append(insts['passage_token_ids'][j])
            p_id = p_id + insts['passage_token_ids'][j]
            q_ids.append(insts['question_token_ids'][j])
            q_id = q_id + insts['question_token_ids'][j]
        p_len = len(p_id)

        def _get_label(idx, ref_len):
            ret = [0.0] * ref_len
            if idx >= 0 and idx < ref_len:
                ret[idx] = 1.0
            return [[x] for x in ret]

        start_label = _get_label(insts['start_id'][i], p_len)
        end_label = _get_label(insts['end_id'][i], p_len)
        new_inst = q_ids + [start_label, end_label] + p_ids
        new_insts.append(new_inst)
    return new_insts


def LodTensor_Array(lod_tensor):
    lod = lod_tensor.lod()
    array = np.array(lod_tensor)
    new_array = []
    for i in range(len(lod[0]) - 1):
        new_array.append(array[lod[0][i]:lod[0][i + 1]])
    return new_array


def print_para(train_prog, train_exe, logger, args):
    if args.para_print:
        param_list = train_prog.block(0).all_parameters()
        param_name_list = [p.name for p in param_list]
        num_sum = 0
        for p_name in param_name_list:
            p_array = np.array(train_exe.scope.find_var(p_name).get_tensor())
            param_num = np.prod(p_array.shape)
            num_sum = num_sum + param_num
            print("param: {0}, mean={1} max={2} min={3} num={4} {5}".format(
                p_name, p_array.mean(), p_array.max(), p_array.min(),
                p_array.shape, param_num))
        print("total param num: {0}".format(num_sum))


def find_best_answer_for_passage(start_probs, end_probs, passage_len, args):
    """
    Finds the best answer with the maximum start_prob * end_prob from a single passage
    """
    if passage_len is None:
        passage_len = len(start_probs)
    else:
        passage_len = min(len(start_probs), passage_len)
    best_start, best_end, max_prob = -1, -1, 0
    for start_idx in range(passage_len):
        for ans_len in range(args.max_a_len):
            end_idx = start_idx + ans_len
            if end_idx >= passage_len:
                continue
            prob = start_probs[start_idx] * end_probs[end_idx]
            if prob > max_prob:
                best_start = start_idx
                best_end = end_idx
                max_prob = prob
    return (best_start, best_end), max_prob


def find_best_answer(sample, start_prob, end_prob, padded_p_len, args):
    """
    Finds the best answer for a sample given start_prob and end_prob for each position.
    This will call find_best_answer_for_passage because there are multiple passages in a sample
    """
    best_p_idx, best_span, best_score = None, None, 0
    for p_idx, passage in enumerate(sample['passages']):
        if p_idx >= args.max_p_num:
            continue
        passage_len = min(args.max_p_len, len(passage['passage_tokens']))
        answer_span, score = find_best_answer_for_passage(
            start_prob[p_idx * padded_p_len:(p_idx + 1) * padded_p_len],
            end_prob[p_idx * padded_p_len:(p_idx + 1) * padded_p_len],
            passage_len, args)
        if score > best_score:
            best_score = score
            best_p_idx = p_idx
            best_span = answer_span
    if best_p_idx is None or best_span is None:
        best_answer = ''
    else:
        best_answer = ''.join(
            sample['passages'][best_p_idx]['passage_tokens'][best_span[0]:
                                                             best_span[1] + 1])
    return best_answer


def validation(exe, inference_program, avg_cost, s_probs, e_probs, feed_order,
               place, vocab, brc_data, args):
    """
    """
    # Use test set as validation each pass
    total_loss = 0.0
    count = 0
    pred_answers, ref_answers = [], []

    val_feed_list = [
        inference_program.global_block().var(var_name)
        for var_name in feed_order
    ]
    val_feeder = fluid.DataFeeder(val_feed_list, place)
    pad_id = vocab.get_id(vocab.pad_token)
    dev_batches = brc_data.gen_mini_batches(
        'dev', args.batch_size, pad_id, shuffle=False)

    for batch_id, batch in enumerate(dev_batches, 1):
        feed_data = prepare_batch_input(batch, args)
        val_fetch_outs = exe.run(inference_program,
                                 feed=val_feeder.feed(feed_data),
                                 fetch_list=[avg_cost, s_probs, e_probs],
                                 return_numpy=False)
        total_loss += np.array(val_fetch_outs[0])[0]
        start_probs = LodTensor_Array(val_fetch_outs[1])
        end_probs = LodTensor_Array(val_fetch_outs[2])
        count += len(batch['raw_data'])

        padded_p_len = len(batch['passage_token_ids'][0])
        for sample, start_prob, end_prob in zip(batch['raw_data'], start_probs,
                                                end_probs):
            best_answer = find_best_answer(sample, start_prob, end_prob,
                                           padded_p_len, args)
            pred_answers.append({
                'question_id': sample['question_id'],
                'question_type': sample['question_type'],
                'answers': [best_answer],
                'entity_answers': [[]],
                'yesno_answers': []
            })
            if 'answers' in sample:
                ref_answers.append({
                    'question_id': sample['question_id'],
                    'question_type': sample['question_type'],
                    'answers': sample['answers'],
                    'entity_answers': [[]],
                    'yesno_answers': []
                })

    ave_loss = 1.0 * total_loss / count
    # compute the bleu and rouge scores if reference answers is provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    return ave_loss, bleu_rouge


def train():
    args = parse_args()

    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    fluid.framework.default_startup_program().random_seed = args.random_seed
    fluid.default_main_program().random_seed = args.random_seed

    logger = logging.getLogger("brc")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if args.log_path:
        file_handler = logging.FileHandler(args.log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    logger.info('Running with args : {}'.format(args))

    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        vocab = pickle.load(fin)
        logger.info('vocab size is {} and embed dim is {}'.format(vocab.size(
        ), vocab.embed_dim))
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.trainset, args.devset)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')

    # build model
    avg_cost, s_probs, e_probs, feed_order = rc_model.rc_model(
        args.hidden_size, vocab, args)
    # clone from default main program and use it as the validation program
    main_program = fluid.default_main_program()
    inference_program = fluid.default_main_program().clone(for_test=True)

    # build optimizer
    if args.optim == 'sgd':
        optimizer = fluid.optimizer.SGD(learning_rate=args.learning_rate)
    elif args.optim == 'adam':
        optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    elif args.optim == 'rprop':
        optimizer = fluid.optimizer.RMSPropOptimizer(
            learning_rate=args.learning_rate)
    else:
        logger.error('Unsupported optimizer: {}'.format(args.optim))
        exit(-1)
    optimizer.minimize(avg_cost)

    # initialize parameters
    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    if args.load_dir:
        logger.info('load from {}'.format(args.load_dir))
        fluid.io.load_persistables(exe, args.load_dir)
    else:
        exe.run(framework.default_startup_program())
        embedding_para = fluid.global_scope().find_var(
            'embedding_para').get_tensor()
        embedding_para.set(vocab.embeddings.astype(np.float32), place)

    # prepare data
    feed_list = [
        main_program.global_block().var(var_name) for var_name in feed_order
    ]
    feeder = fluid.DataFeeder(feed_list, place)

    logger.info('Training the model...')
    parallel_executor = fluid.ParallelExecutor(
        use_cuda=bool(args.use_gpu), loss_name=avg_cost.name)
    print_para(fluid.framework.default_main_program(), parallel_executor,
               logger, args)

    for pass_id in range(1, args.pass_num + 1):
        pass_start_time = time.time()
        pad_id = vocab.get_id(vocab.pad_token)
        train_batches = brc_data.gen_mini_batches(
            'train', args.batch_size, pad_id, shuffle=True)
        log_every_n_batch, n_batch_loss = args.log_interval, 0
        total_num, total_loss = 0, 0
        for batch_id, batch in enumerate(train_batches, 1):
            input_data_dict = prepare_batch_input(batch, args)
            fetch_outs = parallel_executor.run(
                feed=feeder.feed(input_data_dict),
                fetch_list=[avg_cost.name],
                return_numpy=False)
            cost_train = np.array(fetch_outs[0])[0]
            total_num += len(batch['raw_data'])
            n_batch_loss += cost_train
            total_loss += cost_train * len(batch['raw_data'])
            if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0:
                print_para(fluid.framework.default_main_program(),
                           parallel_executor, logger, args)
                logger.info('Average loss from batch {} to {} is {}'.format(
                    batch_id - log_every_n_batch + 1, batch_id, "%.10f" % (
                        n_batch_loss / log_every_n_batch)))
                n_batch_loss = 0
            if args.dev_interval > 0 and batch_id % args.dev_interval == 0:
                eval_loss, bleu_rouge = validation(
                    exe, inference_program, avg_cost, s_probs, e_probs,
                    feed_order, place, vocab, brc_data, args)
                logger.info('Dev eval loss {}'.format(eval_loss))
                logger.info('Dev eval result: {}'.format(bleu_rouge))
        pass_end_time = time.time()

        logger.info('Evaluating the model after epoch {}'.format(pass_id))
        if brc_data.dev_set is not None:
            eval_loss, bleu_rouge = validation(
                exe, inference_program, avg_cost, s_probs, e_probs, feed_order,
                place, vocab, brc_data, args)
            logger.info('Dev eval loss {}'.format(eval_loss))
            logger.info('Dev eval result: {}'.format(bleu_rouge))
        else:
            logger.warning('No dev set is loaded for evaluation in the dataset!')
        time_consumed = pass_end_time - pass_start_time

        logger.info('Average train loss for epoch {} is {}'.format(
            pass_id, "%.10f" % (1.0 * total_loss / total_num)))

        if pass_id % args.save_interval == 0:
            model_path = os.path.join(args.save_dir, str(pass_id))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(
                executor=exe, dirname=model_path, main_program=main_program)


if __name__ == '__main__':
    train()
```
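For intuition, the span search in find_best_answer_for_passage above scores every (start, end) pair with start_probs[start] * end_probs[end], subject to the answer-length cap. Here is a toy NumPy sketch with made-up probabilities (the arrays and max_a_len value are assumptions for the example, not data from the project):

```python
import numpy as np

# Toy illustration of the span search: pick (start, end) maximizing
# start_probs[start] * end_probs[end] with the answer length capped at max_a_len.
start_probs = np.array([0.1, 0.6, 0.2, 0.1])   # assumed toy values
end_probs = np.array([0.1, 0.1, 0.7, 0.1])     # assumed toy values
max_a_len = 3                                  # assumed cap on answer length

best_span, best_prob = (-1, -1), 0.0
for start in range(len(start_probs)):
    for length in range(max_a_len):
        end = start + length
        if end >= len(end_probs):
            continue
        prob = start_probs[start] * end_probs[end]
        if prob > best_prob:
            best_span, best_prob = (start, end), prob

print(best_span, best_prob)  # (1, 2) with probability 0.42
```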
fluid/machine_reading_comprehesion/DuReader/vocab.py
new file (0 → 100644)

```python
# -*- coding:utf8 -*-
# ==============================================================================
# Copyright 2017 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
This module implements the Vocab class for converting string to id and back
"""

import numpy as np


class Vocab(object):
    """
    Implements a vocabulary to store the tokens in the data, with their corresponding embeddings.
    """

    def __init__(self, filename=None, initial_tokens=None, lower=False):
        self.id2token = {}
        self.token2id = {}
        self.token_cnt = {}
        self.lower = lower

        self.embed_dim = None
        self.embeddings = None

        self.pad_token = '<blank>'
        self.unk_token = '<unk>'

        self.initial_tokens = initial_tokens if initial_tokens is not None else []
        self.initial_tokens.extend([self.pad_token, self.unk_token])
        for token in self.initial_tokens:
            self.add(token)

        if filename is not None:
            self.load_from_file(filename)

    def size(self):
        """
        get the size of vocabulary
        Returns:
            an integer indicating the size
        """
        return len(self.id2token)

    def load_from_file(self, file_path):
        """
        loads the vocab from file_path
        Args:
            file_path: a file with a word in each line
        """
        for line in open(file_path, 'r'):
            token = line.rstrip('\n')
            self.add(token)

    def get_id(self, token):
        """
        gets the id of a token, returns the id of unk token if token is not in vocab
        Args:
            key: a string indicating the word
        Returns:
            an integer
        """
        token = token.lower() if self.lower else token
        try:
            return self.token2id[token]
        except KeyError:
            return self.token2id[self.unk_token]

    def get_token(self, idx):
        """
        gets the token corresponding to idx, returns unk token if idx is not in vocab
        Args:
            idx: an integer
        returns:
            a token string
        """
        try:
            return self.id2token[idx]
        except KeyError:
            return self.unk_token

    def add(self, token, cnt=1):
        """
        adds the token to vocab
        Args:
            token: a string
            cnt: a num indicating the count of the token to add, default is 1
        """
        token = token.lower() if self.lower else token
        if token in self.token2id:
            idx = self.token2id[token]
        else:
            idx = len(self.id2token)
            self.id2token[idx] = token
            self.token2id[token] = idx
        if cnt > 0:
            if token in self.token_cnt:
                self.token_cnt[token] += cnt
            else:
                self.token_cnt[token] = cnt
        return idx

    def filter_tokens_by_cnt(self, min_cnt):
        """
        filter the tokens in vocab by their count
        Args:
            min_cnt: tokens with frequency less than min_cnt is filtered
        """
        filtered_tokens = [
            token for token in self.token2id if self.token_cnt[token] >= min_cnt
        ]
        # rebuild the token x id map
        self.token2id = {}
        self.id2token = {}
        for token in self.initial_tokens:
            self.add(token, cnt=0)
        for token in filtered_tokens:
            self.add(token, cnt=0)

    def randomly_init_embeddings(self, embed_dim):
        """
        randomly initializes the embeddings for each token
        Args:
            embed_dim: the size of the embedding for each token
        """
        self.embed_dim = embed_dim
        self.embeddings = np.random.rand(self.size(), embed_dim)
        for token in [self.pad_token, self.unk_token]:
            self.embeddings[self.get_id(token)] = np.zeros([self.embed_dim])

    def load_pretrained_embeddings(self, embedding_path):
        """
        loads the pretrained embeddings from embedding_path,
        tokens not in pretrained embeddings will be filtered
        Args:
            embedding_path: the path of the pretrained embedding file
        """
        trained_embeddings = {}
        with open(embedding_path, 'r') as fin:
            for line in fin:
                contents = line.strip().split()
                token = contents[0].decode('utf8')
                if token not in self.token2id:
                    continue
                trained_embeddings[token] = list(map(float, contents[1:]))
                if self.embed_dim is None:
                    self.embed_dim = len(contents) - 1
        filtered_tokens = trained_embeddings.keys()
        # rebuild the token x id map
        self.token2id = {}
        self.id2token = {}
        for token in self.initial_tokens:
            self.add(token, cnt=0)
        for token in filtered_tokens:
            self.add(token, cnt=0)
        # load embeddings
        self.embeddings = np.zeros([self.size(), self.embed_dim])
        for token in self.token2id.keys():
            if token in trained_embeddings:
                self.embeddings[self.get_id(token)] = trained_embeddings[token]

    def convert_to_ids(self, tokens):
        """
        Convert a list of tokens to ids, use unk_token if the token is not in vocab.
        Args:
            tokens: a list of token
        Returns:
            a list of ids
        """
        vec = [self.get_id(label) for label in tokens]
        return vec

    def recover_from_ids(self, ids, stop_id=None):
        """
        Convert a list of ids to tokens, stop converting if the stop_id is encountered
        Args:
            ids: a list of ids to convert
            stop_id: the stop id, default is None
        Returns:
            a list of tokens
        """
        tokens = []
        for i in ids:
            tokens += [self.get_token(i)]
            if stop_id is not None and i == stop_id:
                break
        return tokens
```
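A short usage sketch of the new Vocab class (toy tokens, not from the dataset), mirroring how run.py's prepare() builds, filters, and embeds the vocabulary; it assumes vocab.py is on the import path:

```python
from vocab import Vocab

# Toy example; in prepare() the tokens come from BRCDataset.word_iter('train').
vocab = Vocab(lower=True)
for word in ['DuReader', 'is', 'a', 'dataset', 'a']:
    vocab.add(word)

vocab.filter_tokens_by_cnt(min_cnt=2)   # keeps 'a' plus the <blank> and <unk> tokens
vocab.randomly_init_embeddings(300)     # random vectors; pad/unk rows are zeroed

print(vocab.size())                                        # 3
print(vocab.convert_to_ids(['a', 'never-seen-token']))     # OOV maps to <unk>'s id
print(vocab.recover_from_ids(vocab.convert_to_ids(['a']))) # ['a']
```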