提交 18e03956 编写于 作者: Y Yibing Liu

Merge branch 'develop' of upstream into kaldi_decoder

...@@ -218,8 +218,6 @@ class AsyncDataReader(object): ...@@ -218,8 +218,6 @@ class AsyncDataReader(object):
self._sample_proc_num = self._proc_num - 2 self._sample_proc_num = self._proc_num - 2
self._verbose = verbose self._verbose = verbose
self._force_exit = ForceExitWrapper(self._manager.Value('b', False)) self._force_exit = ForceExitWrapper(self._manager.Value('b', False))
self._pool_manager = SharedMemoryPoolManager(self._batch_buffer_size *
3, self._manager)
def generate_bucket_list(self, is_shuffle): def generate_bucket_list(self, is_shuffle):
if self._block_info_list is None: if self._block_info_list is None:
...@@ -424,6 +422,9 @@ class AsyncDataReader(object): ...@@ -424,6 +422,9 @@ class AsyncDataReader(object):
sample_queue = self._start_async_processing() sample_queue = self._start_async_processing()
batch_queue = self._manager.Queue(self._batch_buffer_size) batch_queue = self._manager.Queue(self._batch_buffer_size)
self._pool_manager = SharedMemoryPoolManager(self._batch_buffer_size *
3, self._manager)
assembling_proc = DaemonProcessGroup( assembling_proc = DaemonProcessGroup(
proc_num=1, proc_num=1,
target=batch_assembling_task, target=batch_assembling_task,
...@@ -439,3 +440,6 @@ class AsyncDataReader(object): ...@@ -439,3 +440,6 @@ class AsyncDataReader(object):
if isinstance(batch_data, EpochEndSignal): if isinstance(batch_data, EpochEndSignal):
break break
yield batch_data yield batch_data
# clean the shared memory
del self._pool_manager
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import sys import sys, time
from six import reraise from six import reraise
from tblib import Traceback from tblib import Traceback
from multiprocessing import Manager, Process from multiprocessing import Manager, Process
...@@ -161,9 +161,10 @@ class SharedMemoryPoolManager(object): ...@@ -161,9 +161,10 @@ class SharedMemoryPoolManager(object):
def __init__(self, pool_size, manager, name_prefix='/deep_asr'): def __init__(self, pool_size, manager, name_prefix='/deep_asr'):
self._names = [] self._names = []
self._dict = manager.dict() self._dict = manager.dict()
self._time_prefix = time.strftime('%Y%m%d%H%M%S')
for i in xrange(pool_size): for i in xrange(pool_size):
name = name_prefix + '_' + str(i) name = name_prefix + '_' + self._time_prefix + '_' + str(i)
self._dict[name] = SharedNDArray(name) self._dict[name] = SharedNDArray(name)
self._names.append(name) self._names.append(name)
......
...@@ -3,18 +3,37 @@ class TrainTaskConfig(object): ...@@ -3,18 +3,37 @@ class TrainTaskConfig(object):
# the epoch number to train. # the epoch number to train.
pass_num = 2 pass_num = 2
# number of sequences contained in a mini-batch. # the number of sequences contained in a mini-batch.
batch_size = 64 batch_size = 64
# the hyper params for Adam optimizer. # the hyper parameters for Adam optimizer.
learning_rate = 0.001 learning_rate = 0.001
beta1 = 0.9 beta1 = 0.9
beta2 = 0.98 beta2 = 0.98
eps = 1e-9 eps = 1e-9
# the params for learning rate scheduling # the parameters for learning rate scheduling.
warmup_steps = 4000 warmup_steps = 4000
# the directory for saving trained models.
model_dir = "trained_models"
class InferTaskConfig(object):
use_gpu = False
# the number of examples in one run for sequence generation.
# currently the batch size can only be set to 1.
batch_size = 1
# the parameters for beam search.
beam_size = 5
max_length = 30
# the number of decoded sentences to output.
n_best = 1
# the directory for loading the trained model.
model_path = "trained_models/pass_1.infer.model"
class ModelHyperParams(object): class ModelHyperParams(object):
# Dictionary size for source and target language. This model directly uses # Dictionary size for source and target language. This model directly uses
...@@ -33,6 +52,11 @@ class ModelHyperParams(object): ...@@ -33,6 +52,11 @@ class ModelHyperParams(object):
# index for <pad> token in target language. # index for <pad> token in target language.
trg_pad_idx = trg_vocab_size trg_pad_idx = trg_vocab_size
# index for <bos> token
bos_idx = 0
# index for <eos> token
eos_idx = 1
# position value corresponding to the <pad> token. # position value corresponding to the <pad> token.
pos_pad_idx = 0 pos_pad_idx = 0
...@@ -64,14 +88,21 @@ pos_enc_param_names = ( ...@@ -64,14 +88,21 @@ pos_enc_param_names = (
"src_pos_enc_table", "src_pos_enc_table",
"trg_pos_enc_table", ) "trg_pos_enc_table", )
# Names of all data layers listed in order. # Names of all data layers in encoder listed in order.
input_data_names = ( encoder_input_data_names = (
"src_word", "src_word",
"src_pos", "src_pos",
"src_slf_attn_bias", )
# Names of all data layers in decoder listed in order.
decoder_input_data_names = (
"trg_word", "trg_word",
"trg_pos", "trg_pos",
"src_slf_attn_bias",
"trg_slf_attn_bias", "trg_slf_attn_bias",
"trg_src_attn_bias", "trg_src_attn_bias",
"enc_output", )
# Names of label related data layers listed in order.
label_data_names = (
"lbl_word", "lbl_word",
"lbl_weight", ) "lbl_weight", )
import numpy as np
import paddle.v2 as paddle
import paddle.fluid as fluid
import model
from model import wrap_encoder as encoder
from model import wrap_decoder as decoder
from config import InferTaskConfig, ModelHyperParams, \
encoder_input_data_names, decoder_input_data_names
from train import pad_batch_data
def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
decoder, dec_in_names, dec_out_names, beam_size, max_length,
n_best, batch_size, n_head, src_pad_idx, trg_pad_idx,
bos_idx, eos_idx):
"""
Run the encoder program once and run the decoder program multiple times to
implement beam search externally.
"""
# Prepare data for encoder and run the encoder.
enc_in_data = pad_batch_data(
src_words,
src_pad_idx,
n_head,
is_target=False,
return_pos=True,
return_attn_bias=True,
return_max_len=True)
enc_output = exe.run(encoder,
feed=dict(zip(enc_in_names, enc_in_data)),
fetch_list=enc_out_names)[0]
# Beam Search.
# To store the beam info.
scores = np.zeros((batch_size, beam_size), dtype="float32")
prev_branchs = [[]] * batch_size
next_ids = [[]] * batch_size
# Use beam_map to map the instance idx in batch to beam idx, since the
# size of feeded batch is changing.
beam_map = range(batch_size)
def beam_backtrace(prev_branchs, next_ids, n_best=beam_size, add_bos=True):
"""
Decode and select n_best sequences for one instance by backtrace.
"""
seqs = []
for i in range(n_best):
k = i
seq = []
for j in range(len(prev_branchs) - 1, -1, -1):
seq.append(next_ids[j][k])
k = prev_branchs[j][k]
seq = seq[::-1]
seq = [bos_idx] + seq if add_bos else seq
seqs.append(seq)
return seqs
def init_dec_in_data(batch_size, beam_size, enc_in_data, enc_output):
"""
Initialize the input data for decoder.
"""
trg_words = np.array(
[[bos_idx]] * batch_size * beam_size, dtype="int64")
trg_pos = np.array([[1]] * batch_size * beam_size, dtype="int64")
src_max_length, src_slf_attn_bias, trg_max_len = enc_in_data[
-1], enc_in_data[-2], 1
# This is used to remove attention on subsequent words.
trg_slf_attn_bias = np.ones((batch_size * beam_size, trg_max_len,
trg_max_len))
trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape(
[-1, 1, trg_max_len, trg_max_len])
trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) *
[-1e9]).astype("float32")
# This is used to remove attention on the paddings of source sequences.
trg_src_attn_bias = np.tile(
src_slf_attn_bias[:, :, ::src_max_length, :],
[beam_size, 1, trg_max_len, 1])
enc_output = np.tile(enc_output, [beam_size, 1, 1])
return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output
def update_dec_in_data(dec_in_data, next_ids, active_beams):
"""
Update the input data of decoder mainly by slicing from the previous
input data and dropping the finished instance beams.
"""
trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = dec_in_data
trg_cur_len = len(next_ids[0]) + 1 # include the <bos>
trg_words = np.array(
[
beam_backtrace(
prev_branchs[beam_idx], next_ids[beam_idx], add_bos=True)
for beam_idx in active_beams
],
dtype="int64")
trg_words = trg_words.reshape([-1, 1])
trg_pos = np.array(
[range(1, trg_cur_len + 1)] * len(active_beams) * beam_size,
dtype="int64").reshape([-1, 1])
active_beams_indice = (
(np.array(active_beams) * beam_size)[:, np.newaxis] +
np.array(range(beam_size))[np.newaxis, :]).flatten()
# This is used to remove attention on subsequent words.
trg_slf_attn_bias = np.ones((len(active_beams) * beam_size, trg_cur_len,
trg_cur_len))
trg_slf_attn_bias = np.triu(trg_slf_attn_bias, 1).reshape(
[-1, 1, trg_cur_len, trg_cur_len])
trg_slf_attn_bias = (np.tile(trg_slf_attn_bias, [1, n_head, 1, 1]) *
[-1e9]).astype("float32")
# This is used to remove attention on the paddings of source sequences.
trg_src_attn_bias = np.tile(trg_src_attn_bias[
active_beams_indice, :, ::trg_src_attn_bias.shape[2], :],
[1, 1, trg_cur_len, 1])
enc_output = enc_output[active_beams_indice, :, :]
return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output
dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data,
enc_output)
for i in range(max_length):
predict_all = exe.run(decoder,
feed=dict(zip(dec_in_names, dec_in_data)),
fetch_list=dec_out_names)[0]
predict_all = np.log(
predict_all.reshape([len(beam_map) * beam_size, i + 1, -1])[:,
-1, :])
predict_all = (predict_all + scores[beam_map].reshape(
[len(beam_map) * beam_size, -1])).reshape(
[len(beam_map), beam_size, -1])
active_beams = []
for inst_idx, beam_idx in enumerate(beam_map):
predict = (predict_all[inst_idx, :, :]
if i != 0 else predict_all[inst_idx, 0, :]).flatten()
top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:]
top_scores_ids = top_k_indice[np.argsort(predict[top_k_indice])[::
-1]]
top_scores = predict[top_scores_ids]
scores[beam_idx] = top_scores
prev_branchs[beam_idx].append(top_scores_ids /
predict_all.shape[-1])
next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1])
if next_ids[beam_idx][-1][0] != eos_idx:
active_beams.append(beam_idx)
beam_map = active_beams
if len(beam_map) == 0:
break
dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams)
# Decode beams and select n_best sequences for each instance by backtrace.
seqs = [beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best)]
return seqs, scores[:, :n_best].tolist()
def main():
place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
# The current program desc is coupled with batch_size and the only
# supported batch size is 1 currently.
encoder_program = fluid.Program()
model.batch_size = InferTaskConfig.batch_size
with fluid.program_guard(main_program=encoder_program):
enc_output = encoder(
ModelHyperParams.src_vocab_size + 1,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
ModelHyperParams.src_pad_idx, ModelHyperParams.pos_pad_idx)
model.batch_size = InferTaskConfig.batch_size * InferTaskConfig.beam_size
decoder_program = fluid.Program()
with fluid.program_guard(main_program=decoder_program):
predict = decoder(
ModelHyperParams.trg_vocab_size + 1,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
# Load model parameters of encoder and decoder separately from the saved
# transformer model.
encoder_var_names = []
for op in encoder_program.block(0).ops:
encoder_var_names += op.input_arg_names
encoder_param_names = filter(
lambda var_name: isinstance(encoder_program.block(0).var(var_name),
fluid.framework.Parameter),
encoder_var_names)
encoder_params = map(encoder_program.block(0).var, encoder_param_names)
decoder_var_names = []
for op in decoder_program.block(0).ops:
decoder_var_names += op.input_arg_names
decoder_param_names = filter(
lambda var_name: isinstance(decoder_program.block(0).var(var_name),
fluid.framework.Parameter),
decoder_var_names)
decoder_params = map(decoder_program.block(0).var, decoder_param_names)
fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=encoder_params)
fluid.io.load_vars(exe, InferTaskConfig.model_path, vars=decoder_params)
# This is used here to set dropout to the test mode.
encoder_program = fluid.io.get_inference_program(
target_vars=[enc_output], main_program=encoder_program)
decoder_program = fluid.io.get_inference_program(
target_vars=[predict], main_program=decoder_program)
test_data = paddle.batch(
paddle.dataset.wmt16.test(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
batch_size=InferTaskConfig.batch_size)
trg_idx2word = paddle.dataset.wmt16.get_dict(
"de", dict_size=ModelHyperParams.trg_vocab_size, reverse=True)
for batch_id, data in enumerate(test_data()):
batch_seqs, batch_scores = translate_batch(
exe, [item[0] for item in data], encoder_program,
encoder_input_data_names, [enc_output.name], decoder_program,
decoder_input_data_names, [predict.name], InferTaskConfig.beam_size,
InferTaskConfig.max_length, InferTaskConfig.n_best,
len(data), ModelHyperParams.n_head, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.bos_idx,
ModelHyperParams.eos_idx)
for i in range(len(batch_seqs)):
seqs = batch_seqs[i]
scores = batch_scores[i]
for seq in seqs:
print(" ".join([trg_idx2word[idx] for idx in seq]))
if __name__ == "__main__":
main()
...@@ -4,7 +4,8 @@ import numpy as np ...@@ -4,7 +4,8 @@ import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
from config import TrainTaskConfig, input_data_names, pos_enc_param_names from config import TrainTaskConfig, pos_enc_param_names, \
encoder_input_data_names, decoder_input_data_names, label_data_names
# FIXME(guosheng): Remove out the batch_size from the model. # FIXME(guosheng): Remove out the batch_size from the model.
batch_size = TrainTaskConfig.batch_size batch_size = TrainTaskConfig.batch_size
...@@ -127,7 +128,9 @@ def multi_head_attention(queries, ...@@ -127,7 +128,9 @@ def multi_head_attention(queries,
scaled_q = layers.scale(x=q, scale=d_model**-0.5) scaled_q = layers.scale(x=q, scale=d_model**-0.5)
product = layers.matmul(x=scaled_q, y=k, transpose_y=True) product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
weights = __softmax(layers.elementwise_add(x=product, y=attn_bias)) weights = __softmax(
layers.elementwise_add(
x=product, y=attn_bias) if attn_bias else product)
if dropout_rate: if dropout_rate:
weights = layers.dropout( weights = layers.dropout(
weights, dropout_prob=dropout_rate, is_test=False) weights, dropout_prob=dropout_rate, is_test=False)
...@@ -280,8 +283,15 @@ def encoder(enc_input, ...@@ -280,8 +283,15 @@ def encoder(enc_input,
encoder_layer. encoder_layer.
""" """
for i in range(n_layer): for i in range(n_layer):
enc_output = encoder_layer(enc_input, attn_bias, n_head, d_key, d_value, enc_output = encoder_layer(
d_model, d_inner_hid, dropout_rate) enc_input,
attn_bias,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate, )
enc_input = enc_output enc_input = enc_output
return enc_output return enc_output
...@@ -373,75 +383,142 @@ def decoder(dec_input, ...@@ -373,75 +383,142 @@ def decoder(dec_input,
return dec_output return dec_output
def transformer( def make_inputs(input_data_names,
src_vocab_size,
trg_vocab_size,
max_length,
n_layer,
n_head, n_head,
d_key,
d_value,
d_model, d_model,
d_inner_hid, batch_size,
dropout_rate, max_length,
src_pad_idx, is_pos,
trg_pad_idx, slf_attn_bias_flag,
pos_pad_idx, ): src_attn_bias_flag,
enc_output_flag=False):
"""
Define the input data layers for the transformer model.
"""
input_layers = []
# The shapes here act as placeholder. # The shapes here act as placeholder.
# The shapes set here is to pass the infer-shape in compile time. The actual # The shapes set here is to pass the infer-shape in compile time.
# shape of src_word in run time is: word = layers.data(
# [batch_size * max_src_length_in_a_batch, 1]. name=input_data_names[len(input_layers)],
src_word = layers.data(
name=input_data_names[0],
shape=[batch_size * max_length, 1],
dtype="int64",
append_batch_size=False)
# The actual shape of src_pos in runtime is:
# [batch_size * max_src_length_in_a_batch, 1].
src_pos = layers.data(
name=input_data_names[1],
shape=[batch_size * max_length, 1],
dtype="int64",
append_batch_size=False)
# The actual shape of trg_word is in runtime is:
# [batch_size * max_trg_length_in_a_batch, 1].
trg_word = layers.data(
name=input_data_names[2],
shape=[batch_size * max_length, 1], shape=[batch_size * max_length, 1],
dtype="int64", dtype="int64",
append_batch_size=False) append_batch_size=False)
# The actual shape of trg_pos in runtime is: input_layers += [word]
# [batch_size * max_trg_length_in_a_batch, 1]. # This is used for position data or label weight.
trg_pos = layers.data( pos = layers.data(
name=input_data_names[3], name=input_data_names[len(input_layers)],
shape=[batch_size * max_length, 1], shape=[batch_size * max_length, 1],
dtype="int64", dtype="int64" if is_pos else "float32",
append_batch_size=False) append_batch_size=False)
# The actual shape of src_slf_attn_bias in runtime is: input_layers += [pos]
# [batch_size, n_head, max_src_length_in_a_batch, max_src_length_in_a_batch]. if slf_attn_bias_flag:
# This input is used to remove attention weights on paddings. # This input is used to remove attention weights on paddings for the
src_slf_attn_bias = layers.data( # encoder and to remove attention weights on subsequent words for the
name=input_data_names[4], # decoder.
slf_attn_bias = layers.data(
name=input_data_names[len(input_layers)],
shape=[batch_size, n_head, max_length, max_length], shape=[batch_size, n_head, max_length, max_length],
dtype="float32", dtype="float32",
append_batch_size=False) append_batch_size=False)
# The actual shape of trg_slf_attn_bias in runtime is: input_layers += [slf_attn_bias]
# [batch_size, n_head, max_trg_length_in_batch, max_trg_length_in_batch]. if src_attn_bias_flag:
# This is used to remove attention weights on paddings and subsequent words. # This input is used to remove attention weights on paddings.
trg_slf_attn_bias = layers.data( src_attn_bias = layers.data(
name=input_data_names[5], name=input_data_names[len(input_layers)],
shape=[batch_size, n_head, max_length, max_length], shape=[batch_size, n_head, max_length, max_length],
dtype="float32", dtype="float32",
append_batch_size=False) append_batch_size=False)
# The actual shape of trg_src_attn_bias in runtime is: input_layers += [src_attn_bias]
# [batch_size, n_head, max_trg_length_in_batch, max_src_length_in_batch]. if enc_output_flag:
# This is used to remove attention weights on paddings. enc_output = layers.data(
trg_src_attn_bias = layers.data( name=input_data_names[len(input_layers)],
name=input_data_names[6], shape=[batch_size, max_length, d_model],
shape=[batch_size, n_head, max_length, max_length],
dtype="float32", dtype="float32",
append_batch_size=False) append_batch_size=False)
input_layers += [enc_output]
return input_layers
def transformer(
src_vocab_size,
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
src_pad_idx,
trg_pad_idx,
pos_pad_idx, ):
enc_input_layers = make_inputs(encoder_input_data_names, n_head, d_model,
batch_size, max_length, True, True, False)
enc_output = wrap_encoder(
src_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
src_pad_idx,
pos_pad_idx,
enc_input_layers, )
dec_input_layers = make_inputs(decoder_input_data_names, n_head, d_model,
batch_size, max_length, True, True, True)
predict = wrap_decoder(
trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
trg_pad_idx,
pos_pad_idx,
dec_input_layers,
enc_output, )
# Padding index do not contribute to the total loss. The weights is used to
# cancel padding index in calculating the loss.
gold, weights = make_inputs(label_data_names, n_head, d_model, batch_size,
max_length, False, False, False)
cost = layers.cross_entropy(input=predict, label=gold)
weighted_cost = cost * weights
return layers.reduce_sum(weighted_cost), predict
def wrap_encoder(src_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
src_pad_idx,
pos_pad_idx,
enc_input_layers=None):
"""
The wrapper assembles together all needed layers for the encoder.
"""
if enc_input_layers is None:
# This is used to implement independent encoder program in inference.
src_word, src_pos, src_slf_attn_bias = make_inputs(
encoder_input_data_names, n_head, d_model, batch_size, max_length,
True, True, False)
else:
src_word, src_pos, src_slf_attn_bias = enc_input_layers
enc_input = prepare_encoder( enc_input = prepare_encoder(
src_word, src_word,
src_pos, src_pos,
...@@ -460,6 +537,32 @@ def transformer( ...@@ -460,6 +537,32 @@ def transformer(
d_model, d_model,
d_inner_hid, d_inner_hid,
dropout_rate, ) dropout_rate, )
return enc_output
def wrap_decoder(trg_vocab_size,
max_length,
n_layer,
n_head,
d_key,
d_value,
d_model,
d_inner_hid,
dropout_rate,
trg_pad_idx,
pos_pad_idx,
dec_input_layers=None,
enc_output=None):
"""
The wrapper assembles together all needed layers for the decoder.
"""
if dec_input_layers is None:
# This is used to implement independent decoder program in inference.
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = make_inputs(
decoder_input_data_names, n_head, d_model, batch_size, max_length,
True, True, True, True)
else:
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_input_layers
dec_input = prepare_decoder( dec_input = prepare_decoder(
trg_word, trg_word,
...@@ -482,32 +585,11 @@ def transformer( ...@@ -482,32 +585,11 @@ def transformer(
d_inner_hid, d_inner_hid,
dropout_rate, ) dropout_rate, )
# TODO(guosheng): Share the weight matrix between the embedding layers and
# the pre-softmax linear transformation.
predict = layers.reshape( predict = layers.reshape(
x=layers.fc(input=dec_output, x=layers.fc(input=dec_output,
size=trg_vocab_size, size=trg_vocab_size,
param_attr=fluid.initializer.Xavier(uniform=False),
bias_attr=False, bias_attr=False,
num_flatten_dims=2), num_flatten_dims=2),
shape=[-1, trg_vocab_size], shape=[-1, trg_vocab_size],
act="softmax") act="softmax")
# The actual shape of gold in runtime is: return predict
# [batch_size * max_trg_length_in_a_batch, 1].
gold = layers.data(
name=input_data_names[7],
shape=[batch_size * max_length, 1],
dtype="int64",
append_batch_size=False)
cost = layers.cross_entropy(input=predict, label=gold)
# The actual shape of weights in runtime is:
# [batch_size * max_trg_length_in_a_batch, 1].
# Padding index do not contribute to the total loss. This Weight is used to
# cancel padding index in calculating the loss.
weights = layers.data(
name=input_data_names[8],
shape=[batch_size * max_length, 1],
dtype="float32",
append_batch_size=False)
weighted_cost = cost * weights
return layers.reduce_sum(weighted_cost)
import os
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
...@@ -5,21 +6,13 @@ import paddle.fluid as fluid ...@@ -5,21 +6,13 @@ import paddle.fluid as fluid
from model import transformer, position_encoding_init from model import transformer, position_encoding_init
from optim import LearningRateScheduler from optim import LearningRateScheduler
from config import TrainTaskConfig, ModelHyperParams, \ from config import TrainTaskConfig, ModelHyperParams, pos_enc_param_names, \
pos_enc_param_names, input_data_names encoder_input_data_names, decoder_input_data_names, label_data_names
def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, def pad_batch_data(insts,
max_length, n_head, place):
"""
Pad the instances to the max sequence length in batch, and generate the
corresponding position data and attention bias. Then, convert the numpy
data to tensors and return a dict mapping names to tensors.
"""
input_dict = {}
def __pad_batch_data(insts,
pad_idx, pad_idx,
n_head,
is_target=False, is_target=False,
return_pos=True, return_pos=True,
return_attn_bias=True, return_attn_bias=True,
...@@ -35,8 +28,7 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, ...@@ -35,8 +28,7 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
return_list += [inst_data.astype("int64").reshape([-1, 1])] return_list += [inst_data.astype("int64").reshape([-1, 1])]
if return_pos: if return_pos:
inst_pos = np.array([[ inst_pos = np.array([[
pos_i + 1 if w_i != pad_idx else 0 pos_i + 1 if w_i != pad_idx else 0 for pos_i, w_i in enumerate(inst)
for pos_i, w_i in enumerate(inst)
] for inst in inst_data]) ] for inst in inst_data])
return_list += [inst_pos.astype("int64").reshape([-1, 1])] return_list += [inst_pos.astype("int64").reshape([-1, 1])]
...@@ -44,8 +36,7 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, ...@@ -44,8 +36,7 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
if is_target: if is_target:
# This is used to avoid attention on paddings and subsequent # This is used to avoid attention on paddings and subsequent
# words. # words.
slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, max_len))
max_len))
slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
[-1, 1, max_len, max_len]) [-1, 1, max_len, max_len])
slf_attn_bias_data = np.tile(slf_attn_bias_data, slf_attn_bias_data = np.tile(slf_attn_bias_data,
...@@ -63,28 +54,26 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx, ...@@ -63,28 +54,26 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
return_list += [max_len] return_list += [max_len]
return return_list if len(return_list) > 1 else return_list[0] return return_list if len(return_list) > 1 else return_list[0]
def data_to_tensor(data_list, name_list, input_dict, place):
assert len(data_list) == len(name_list) def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
for i in range(len(name_list)): max_length, n_head):
tensor = fluid.LoDTensor() """
tensor.set(data_list[i], place) Put all padded data needed by training into a dict.
input_dict[name_list[i]] = tensor """
src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data(
src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( [inst[0] for inst in insts], src_pad_idx, n_head, is_target=False)
[inst[0] for inst in insts], src_pad_idx, is_target=False) trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data(
trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True)
[inst[1] for inst in insts], trg_pad_idx, is_target=True)
trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
[1, 1, trg_max_len, 1]).astype("float32") [1, 1, trg_max_len, 1]).astype("float32")
lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False, lbl_word = pad_batch_data([inst[2] for inst in insts], trg_pad_idx, n_head,
False, False, False) False, False, False, False)
lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
input_dict = dict(
data_to_tensor([ zip(input_data_names, [
src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
], input_data_names, input_dict, place) ]))
return input_dict return input_dict
...@@ -92,7 +81,7 @@ def main(): ...@@ -92,7 +81,7 @@ def main():
place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace() place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
cost = transformer( cost, predict = transformer(
ModelHyperParams.src_vocab_size + 1, ModelHyperParams.src_vocab_size + 1,
ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer, ModelHyperParams.n_head, ModelHyperParams.n_layer, ModelHyperParams.n_head,
...@@ -118,6 +107,31 @@ def main(): ...@@ -118,6 +107,31 @@ def main():
buf_size=100000), buf_size=100000),
batch_size=TrainTaskConfig.batch_size) batch_size=TrainTaskConfig.batch_size)
# Program to do validation.
test_program = fluid.default_main_program().clone()
with fluid.program_guard(test_program):
test_program = fluid.io.get_inference_program([cost])
val_data = paddle.batch(
paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size),
batch_size=TrainTaskConfig.batch_size)
def test(exe):
test_costs = []
for batch_id, data in enumerate(val_data()):
if len(data) != TrainTaskConfig.batch_size:
continue
data_input = prepare_batch_input(
data, encoder_input_data_names + decoder_input_data_names[:-1] +
label_data_names, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length,
ModelHyperParams.n_head)
test_cost = exe.run(test_program,
feed=data_input,
fetch_list=[cost])[0]
test_costs.append(test_cost)
return np.mean(test_costs)
# Initialize the parameters. # Initialize the parameters.
exe.run(fluid.framework.default_startup_program()) exe.run(fluid.framework.default_startup_program())
for pos_enc_param_name in pos_enc_param_names: for pos_enc_param_name in pos_enc_param_names:
...@@ -134,9 +148,10 @@ def main(): ...@@ -134,9 +148,10 @@ def main():
if len(data) != TrainTaskConfig.batch_size: if len(data) != TrainTaskConfig.batch_size:
continue continue
data_input = prepare_batch_input( data_input = prepare_batch_input(
data, input_data_names, ModelHyperParams.src_pad_idx, data, encoder_input_data_names + decoder_input_data_names[:-1] +
label_data_names, ModelHyperParams.src_pad_idx,
ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length, ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length,
ModelHyperParams.n_head, place) ModelHyperParams.n_head)
lr_scheduler.update_learning_rate(data_input) lr_scheduler.update_learning_rate(data_input)
outs = exe.run(fluid.framework.default_main_program(), outs = exe.run(fluid.framework.default_main_program(),
feed=data_input, feed=data_input,
...@@ -145,6 +160,14 @@ def main(): ...@@ -145,6 +160,14 @@ def main():
cost_val = np.array(outs[0]) cost_val = np.array(outs[0])
print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) + print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) +
" cost = " + str(cost_val)) " cost = " + str(cost_val))
# Validate and save the model for inference.
val_cost = test(exe)
print("pass_id = " + str(pass_id) + " val_cost = " + str(val_cost))
fluid.io.save_inference_model(
os.path.join(TrainTaskConfig.model_dir,
"pass_" + str(pass_id) + ".infer.model"),
encoder_input_data_names + decoder_input_data_names[:-1],
[predict], exe)
if __name__ == "__main__": if __name__ == "__main__":
......
from PIL import Image from PIL import Image, ImageEnhance
import numpy as np import numpy as np
import random import random
import math import math
...@@ -159,3 +159,77 @@ def crop_image(img, bbox_labels, sample_bbox, image_width, image_height): ...@@ -159,3 +159,77 @@ def crop_image(img, bbox_labels, sample_bbox, image_width, image_height):
sample_img = img[ymin:ymax, xmin:xmax] sample_img = img[ymin:ymax, xmin:xmax]
sample_labels = transform_labels(bbox_labels, sample_bbox) sample_labels = transform_labels(bbox_labels, sample_bbox)
return sample_img, sample_labels return sample_img, sample_labels
def random_brightness(img, settings):
prob = random.uniform(0, 1)
if prob < settings._brightness_prob:
delta = random.uniform(-settings._brightness_delta,
settings._brightness_delta) + 1
img = ImageEnhance.Brightness(img).enhance(delta)
return img
def random_contrast(img, settings):
prob = random.uniform(0, 1)
if prob < settings._contrast_prob:
delta = random.uniform(-settings._contrast_delta,
settings._contrast_delta) + 1
img = ImageEnhance.Contrast(img).enhance(delta)
return img
def random_saturation(img, settings):
prob = random.uniform(0, 1)
if prob < settings._saturation_prob:
delta = random.uniform(-settings._saturation_delta,
settings._saturation_delta) + 1
img = ImageEnhance.Color(img).enhance(delta)
return img
def random_hue(img, settings):
prob = random.uniform(0, 1)
if prob < settings._hue_prob:
delta = random.uniform(-settings._hue_delta, settings._hue_delta)
img_hsv = np.array(img.convert('HSV'))
img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta
img = Image.fromarray(img_hsv, mode='HSV').convert('RGB')
return img
def distort_image(img, settings):
prob = random.uniform(0, 1)
# Apply different distort order
if prob > 0.5:
img = random_brightness(img, settings)
img = random_contrast(img, settings)
img = random_saturation(img, settings)
img = random_hue(img, settings)
else:
img = random_brightness(img, settings)
img = random_saturation(img, settings)
img = random_hue(img, settings)
img = random_contrast(img, settings)
return img
def expand_image(img, bbox_labels, img_width, img_height, settings):
prob = random.uniform(0, 1)
if prob < settings._hue_prob:
expand_ratio = random.uniform(1, settings._expand_max_ratio)
if expand_ratio - 1 >= 0.01:
height = int(img_height * expand_ratio)
width = int(img_width * expand_ratio)
h_off = math.floor(random.uniform(0, height - img_height))
w_off = math.floor(random.uniform(0, width - img_width))
expand_bbox = bbox(-w_off / img_width, -h_off / img_height,
(width - w_off) / img_width,
(height - h_off) / img_height)
expand_img = np.ones((height, width, 3))
expand_img = np.uint8(expand_img * np.squeeze(settings._img_mean))
expand_img = Image.fromarray(expand_img)
expand_img.paste(img, (int(w_off), int(h_off)))
bbox_labels = transform_labels(bbox_labels, expand_bbox)
return expand_img, bbox_labels
return img, bbox_labels
...@@ -22,17 +22,38 @@ import os ...@@ -22,17 +22,38 @@ import os
class Settings(object): class Settings(object):
def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value): def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value,
apply_distort, apply_expand):
self._data_dir = data_dir self._data_dir = data_dir
self._label_list = [] self._label_list = []
label_fpath = os.path.join(data_dir, label_file) label_fpath = os.path.join(data_dir, label_file)
for line in open(label_fpath): for line in open(label_fpath):
self._label_list.append(line.strip()) self._label_list.append(line.strip())
self._apply_distort = apply_distort
self._apply_expand = apply_expand
self._resize_height = resize_h self._resize_height = resize_h
self._resize_width = resize_w self._resize_width = resize_w
self._img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype( self._img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype(
'float32') 'float32')
self._expand_prob = 0.5
self._expand_max_ratio = 4
self._hue_prob = 0.5
self._hue_delta = 18
self._contrast_prob = 0.5
self._contrast_delta = 0.5
self._saturation_prob = 0.5
self._saturation_delta = 0.5
self._brightness_prob = 0.5
self._brightness_delta = 0.125
@property
def apply_distort(self):
return self._apply_expand
@property
def apply_distort(self):
return self._apply_distort
@property @property
def data_dir(self): def data_dir(self):
...@@ -71,7 +92,6 @@ def _reader_creator(settings, file_list, mode, shuffle): ...@@ -71,7 +92,6 @@ def _reader_creator(settings, file_list, mode, shuffle):
img = Image.open(img_path) img = Image.open(img_path)
img_width, img_height = img.size img_width, img_height = img.size
img = np.array(img)
# layout: label | xmin | ymin | xmax | ymax | difficult # layout: label | xmin | ymin | xmax | ymax | difficult
if mode == 'train' or mode == 'test': if mode == 'train' or mode == 'test':
...@@ -99,6 +119,12 @@ def _reader_creator(settings, file_list, mode, shuffle): ...@@ -99,6 +119,12 @@ def _reader_creator(settings, file_list, mode, shuffle):
sample_labels = bbox_labels sample_labels = bbox_labels
if mode == 'train': if mode == 'train':
if settings._apply_distort:
img = image_util.distort_image(img, settings)
if settings._apply_expand:
img, bbox_labels = image_util.expand_image(
img, bbox_labels, img_width, img_height,
settings)
batch_sampler = [] batch_sampler = []
# hard-code here # hard-code here
batch_sampler.append( batch_sampler.append(
...@@ -126,6 +152,7 @@ def _reader_creator(settings, file_list, mode, shuffle): ...@@ -126,6 +152,7 @@ def _reader_creator(settings, file_list, mode, shuffle):
sampled_bbox = image_util.generate_batch_samples( sampled_bbox = image_util.generate_batch_samples(
batch_sampler, bbox_labels, img_width, img_height) batch_sampler, bbox_labels, img_width, img_height)
img = np.array(img)
if len(sampled_bbox) > 0: if len(sampled_bbox) > 0:
idx = int(random.uniform(0, len(sampled_bbox))) idx = int(random.uniform(0, len(sampled_bbox)))
img, sample_labels = image_util.crop_image( img, sample_labels = image_util.crop_image(
......
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import os
import reader import reader
import numpy as np
import load_model as load_model import load_model as load_model
from mobilenet_ssd import mobile_net from mobilenet_ssd import mobile_net
from utility import add_arguments, print_arguments
import os
import numpy as np
import argparse
import functools
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('parallel', bool, True, "Whether use parallel training.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
# yapf: disable
def train(train_file_list,
def train(args,
train_file_list,
val_file_list, val_file_list,
data_args, data_args,
learning_rate, learning_rate,
...@@ -25,16 +36,37 @@ def train(train_file_list, ...@@ -25,16 +36,37 @@ def train(train_file_list,
difficult = fluid.layers.data( difficult = fluid.layers.data(
name='gt_difficult', shape=[1], dtype='int32', lod_level=1) name='gt_difficult', shape=[1], dtype='int32', lod_level=1)
mbox_locs, mbox_confs, box, box_var = mobile_net(image, image_shape) if args.parallel:
places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places)
with pd.do():
image_ = pd.read_input(image)
gt_box_ = pd.read_input(gt_box)
gt_label_ = pd.read_input(gt_label)
difficult_ = pd.read_input(difficult)
locs, confs, box, box_var = mobile_net(image_, image_shape)
loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_,
box, box_var)
pd.write_output(loss)
pd.write_output(locs)
pd.write_output(confs)
pd.write_output(box)
pd.write_output(box_var)
loss, locs, confs, box, box_var = pd()
loss = fluid.layers.reduce_sum(loss)
else:
locs, confs, box, box_var = mobile_net(image, image_shape)
nmsed_out = fluid.layers.detection_output( nmsed_out = fluid.layers.detection_output(
mbox_locs, mbox_confs, box, box_var, nms_threshold=0.45) locs, mbox_confs, box, box_var, nms_threshold=0.45)
loss_vec = fluid.layers.ssd_loss(mbox_locs, mbox_confs, gt_box, gt_label, loss = fluid.layers.ssd_loss(locs, mbox_confs, gt_box, gt_label,
box, box_var) box, box_var)
loss = fluid.layers.nn.reduce_sum(loss_vec) loss = fluid.layers.reduce_sum(loss)
map_eval = None
test_program = fluid.default_main_program().clone(for_test=True) test_program = fluid.default_main_program().clone(for_test=True)
with fluid.program_guard(test_program): with fluid.program_guard(test_program):
nmsed_out = fluid.layers.detection_output(
locs, confs, box, box_var, nms_threshold=0.45)
map_eval = fluid.evaluator.DetectionMAP( map_eval = fluid.evaluator.DetectionMAP(
nmsed_out, nmsed_out,
gt_label, gt_label,
...@@ -45,22 +77,20 @@ def train(train_file_list, ...@@ -45,22 +77,20 @@ def train(train_file_list,
evaluate_difficult=False, evaluate_difficult=False,
ap_version='11point') ap_version='11point')
optimizer = fluid.optimizer.Momentum( boundaries = [40000, 60000]
learning_rate=fluid.layers.exponential_decay( values = [0.001, 0.0005, 0.00025]
learning_rate=learning_rate, optimizer = fluid.optimizer.RMSProp(
decay_steps=40000, learning_rate=fluid.layers.piecewise_decay(boundaries, values),
decay_rate=0.1,
staircase=True),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(0.00005), ) regularization=fluid.regularizer.L2Decay(0.00005), )
optimizer.minimize(loss) optimizer.minimize(loss)
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
load_model.load_paddlev1_vars(place) load_model.load_and_set_vars(place)
#load_model.load_paddlev1_vars(place)
train_reader = paddle.batch( train_reader = paddle.batch(
reader.train(data_args, train_file_list), batch_size=batch_size) reader.train(data_args, train_file_list), batch_size=batch_size)
test_reader = paddle.batch( test_reader = paddle.batch(
...@@ -85,6 +115,7 @@ def train(train_file_list, ...@@ -85,6 +115,7 @@ def train(train_file_list,
loss_v = exe.run(fluid.default_main_program(), loss_v = exe.run(fluid.default_main_program(),
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[loss]) fetch_list=[loss])
if batch_id % 20 == 0:
print("Pass {0}, batch {1}, loss {2}" print("Pass {0}, batch {1}, loss {2}"
.format(pass_id, batch_id, loss_v[0])) .format(pass_id, batch_id, loss_v[0]))
test(pass_id) test(pass_id)
...@@ -97,13 +128,17 @@ def train(train_file_list, ...@@ -97,13 +128,17 @@ def train(train_file_list,
if __name__ == '__main__': if __name__ == '__main__':
args = parser.parse_args()
print_arguments(args)
data_args = reader.Settings( data_args = reader.Settings(
data_dir='./data', data_dir='./data',
label_file='label_list', label_file='label_list',
apply_distort=True,
apply_expand=True,
resize_h=300, resize_h=300,
resize_w=300, resize_w=300,
mean_value=[127.5, 127.5, 127.5]) mean_value=[127.5, 127.5, 127.5])
train( train(args,
train_file_list='./data/trainval.txt', train_file_list='./data/trainval.txt',
val_file_list='./data/test.txt', val_file_list='./data/test.txt',
data_args=data_args, data_args=data_args,
......
"""Contains common utility functions."""
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import numpy as np
from paddle.fluid import core
def print_arguments(args):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
...@@ -26,7 +26,12 @@ def conv_bn_pool(input, ...@@ -26,7 +26,12 @@ def conv_bn_pool(input,
bias_attr=bias, bias_attr=bias,
is_test=is_test) is_test=is_test)
tmp = fluid.layers.pool2d( tmp = fluid.layers.pool2d(
input=tmp, pool_size=2, pool_type='max', pool_stride=2, use_cudnn=True) input=tmp,
pool_size=2,
pool_type='max',
pool_stride=2,
use_cudnn=True,
ceil_mode=True)
return tmp return tmp
...@@ -136,26 +141,61 @@ def encoder_net(images, ...@@ -136,26 +141,61 @@ def encoder_net(images,
def ctc_train_net(images, label, args, num_classes): def ctc_train_net(images, label, args, num_classes):
regularizer = fluid.regularizer.L2Decay(args.l2) regularizer = fluid.regularizer.L2Decay(args.l2)
gradient_clip = None gradient_clip = None
if args.parallel:
places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places)
with pd.do():
images_ = pd.read_input(images)
label_ = pd.read_input(label)
fc_out = encoder_net( fc_out = encoder_net(
images, images_,
num_classes, num_classes,
regularizer=regularizer, regularizer=regularizer,
gradient_clip=gradient_clip) gradient_clip=gradient_clip)
cost = fluid.layers.warpctc( cost = fluid.layers.warpctc(
input=fc_out, label=label, blank=num_classes, norm_by_times=True) input=fc_out,
label=label_,
blank=num_classes,
norm_by_times=True)
sum_cost = fluid.layers.reduce_sum(cost) sum_cost = fluid.layers.reduce_sum(cost)
optimizer = fluid.optimizer.Momentum( decoded_out = fluid.layers.ctc_greedy_decoder(
learning_rate=args.learning_rate, momentum=args.momentum) input=fc_out, blank=num_classes)
optimizer.minimize(sum_cost)
pd.write_output(sum_cost)
pd.write_output(decoded_out)
sum_cost, decoded_out = pd()
sum_cost = fluid.layers.reduce_sum(sum_cost)
else:
fc_out = encoder_net(
images,
num_classes,
regularizer=regularizer,
gradient_clip=gradient_clip)
cost = fluid.layers.warpctc(
input=fc_out, label=label, blank=num_classes, norm_by_times=True)
sum_cost = fluid.layers.reduce_sum(cost)
decoded_out = fluid.layers.ctc_greedy_decoder( decoded_out = fluid.layers.ctc_greedy_decoder(
input=fc_out, blank=num_classes) input=fc_out, blank=num_classes)
casted_label = fluid.layers.cast(x=label, dtype='int64') casted_label = fluid.layers.cast(x=label, dtype='int64')
error_evaluator = fluid.evaluator.EditDistance( error_evaluator = fluid.evaluator.EditDistance(
input=decoded_out, label=casted_label) input=decoded_out, label=casted_label)
return sum_cost, error_evaluator
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
inference_program = fluid.io.get_inference_program(error_evaluator)
optimizer = fluid.optimizer.Momentum(
learning_rate=args.learning_rate, momentum=args.momentum)
_, params_grads = optimizer.minimize(sum_cost)
return sum_cost, error_evaluator, inference_program
def ctc_infer(images, num_classes): def ctc_infer(images, num_classes):
......
"""Trainer for OCR CTC model.""" """Trainer for OCR CTC model."""
import paddle.v2 as paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import dummy_reader import dummy_reader
import ctc_reader import ctc_reader
...@@ -24,12 +23,12 @@ add_arg('momentum', float, 0.9, "Momentum.") ...@@ -24,12 +23,12 @@ add_arg('momentum', float, 0.9, "Momentum.")
add_arg('rnn_hidden_size',int, 200, "Hidden size of rnn layers.") add_arg('rnn_hidden_size',int, 200, "Hidden size of rnn layers.")
add_arg('device', int, 0, "Device id.'-1' means running on CPU" add_arg('device', int, 0, "Device id.'-1' means running on CPU"
"while '0' means GPU-0.") "while '0' means GPU-0.")
add_arg('parallel', bool, True, "Whether use parallel training.")
# yapf: disable # yapf: disable
def load_parameter(place): def load_parameter(place):
params = load_param('./name.map', './data/model/results_without_avg_window/pass-00000/') params = load_param('./name.map', './data/model/results_without_avg_window/pass-00000/')
for name in params: for name in params:
# print "param: %s" % name
t = fluid.global_scope().find_var(name).get_tensor() t = fluid.global_scope().find_var(name).get_tensor()
t.set(params[name], place) t.set(params[name], place)
...@@ -41,7 +40,8 @@ def train(args, data_reader=dummy_reader): ...@@ -41,7 +40,8 @@ def train(args, data_reader=dummy_reader):
# define network # define network
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int32', lod_level=1) label = fluid.layers.data(name='label', shape=[1], dtype='int32', lod_level=1)
sum_cost, error_evaluator = ctc_train_net(images, label, args, num_classes) sum_cost, error_evaluator, inference_program = ctc_train_net(images, label, args, num_classes)
# data reader # data reader
train_reader = data_reader.train(args.batch_size) train_reader = data_reader.train(args.batch_size)
test_reader = data_reader.test() test_reader = data_reader.test()
...@@ -51,11 +51,8 @@ def train(args, data_reader=dummy_reader): ...@@ -51,11 +51,8 @@ def train(args, data_reader=dummy_reader):
place = fluid.CUDAPlace(args.device) place = fluid.CUDAPlace(args.device)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
#load_parameter(place) #load_parameter(place)
inference_program = fluid.io.get_inference_program(error_evaluator)
for pass_id in range(args.pass_num): for pass_id in range(args.pass_num):
error_evaluator.reset(exe) error_evaluator.reset(exe)
batch_id = 1 batch_id = 1
...@@ -78,7 +75,6 @@ def train(args, data_reader=dummy_reader): ...@@ -78,7 +75,6 @@ def train(args, data_reader=dummy_reader):
sys.stdout.flush() sys.stdout.flush()
batch_id += 1 batch_id += 1
# evaluate model on test data
error_evaluator.reset(exe) error_evaluator.reset(exe)
for data in test_reader(): for data in test_reader():
exe.run(inference_program, feed=get_feeder_data(data, place)) exe.run(inference_program, feed=get_feeder_data(data, place))
......
# 命名实体识别
以下是本例的简要目录结构及说明:
```text
.
├── data # 存储运行本例所依赖的数据,从外部获取
├── network_conf.py # 模型定义
├── reader.py # 数据读取接口, 从外部获取
├── README.md # 文档
├── train.py # 训练脚本
├── infer.py # 预测脚本
├── utils.py # 定义通用的函数, 从外部获取
└── utils_extend.py # 对utils.py的拓展
```
## 简介,模型详解
在PaddlePaddle v2版本[命名实体识别](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md)中对于命名实体识别任务有较详细的介绍,在本例中不再重复介绍。
在模型上,我们沿用了v2版本的模型结构,唯一区别是我们使用LSTM代替原始的RNN。
## 数据获取
请参考PaddlePaddle v2版本[命名实体识别](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md) 一节中数据获取方式,将该例中的data文件夹拷贝至本例目录下,运行其中的download.sh脚本获取训练和测试数据。
## 通用脚本获取
请将PaddlePaddle v2版本[命名实体识别](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/README.md)中提供的用于数据读取的文件[reader.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/reader.py)以及包含字典导入等通用功能的文件[utils.py](https://github.com/PaddlePaddle/models/blob/develop/sequence_tagging_for_ner/utils.py)复制到本目录下。本例将会使用到这两个脚本。
## 训练
1. 运行 `sh data/download.sh`
2. 修改 `train.py``main` 函数,指定数据路径
```python
main(
train_data_file="data/train",
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt",
emb_file="data/wordVectors.txt",
model_save_dir="models",
num_passes=1000,
use_gpu=False,
parallel=False)
```
3. 运行命令 `python train.py`**需要注意:直接运行使用的是示例数据,请替换真实的标记数据。**
```text
Pass 127, Batch 9525, Cost 4.0867705, Precision 0.3954984, Recall 0.37846154, F1_score0.38679245
Pass 127, Batch 9530, Cost 3.137265, Precision 0.42971888, Recall 0.38351256, F1_score0.405303
Pass 127, Batch 9535, Cost 3.6240938, Precision 0.4272152, Recall 0.41795665, F1_score0.4225352
Pass 127, Batch 9540, Cost 3.5352352, Precision 0.48464164, Recall 0.4536741, F1_score0.46864685
Pass 127, Batch 9545, Cost 4.1130385, Precision 0.40131578, Recall 0.3836478, F1_score0.39228293
Pass 127, Batch 9550, Cost 3.6826708, Precision 0.43333334, Recall 0.43730888, F1_score0.43531203
Pass 127, Batch 9555, Cost 3.6363933, Precision 0.42424244, Recall 0.3962264, F1_score0.4097561
Pass 127, Batch 9560, Cost 3.6101768, Precision 0.51363635, Recall 0.353125, F1_score0.41851854
Pass 127, Batch 9565, Cost 3.5935276, Precision 0.5152439, Recall 0.5, F1_score0.5075075
Pass 127, Batch 9570, Cost 3.4987144, Precision 0.5, Recall 0.4330218, F1_score0.46410686
Pass 127, Batch 9575, Cost 3.4659843, Precision 0.39864865, Recall 0.38064516, F1_score0.38943896
Pass 127, Batch 9580, Cost 3.1702557, Precision 0.5, Recall 0.4490446, F1_score0.47315437
Pass 127, Batch 9585, Cost 3.1587276, Precision 0.49377593, Recall 0.4089347, F1_score0.4473684
Pass 127, Batch 9590, Cost 3.5043538, Precision 0.4556962, Recall 0.4600639, F1_score0.45786962
Pass 127, Batch 9595, Cost 2.981989, Precision 0.44981414, Recall 0.45149255, F1_score0.4506518
[TrainSet] pass_id:127 pass_precision:[0.46023396] pass_recall:[0.43197003] pass_f1_score:[0.44565433]
[TestSet] pass_id:127 pass_precision:[0.4708409] pass_recall:[0.47971722] pass_f1_score:[0.4752376]
```
## 预测
1. 修改 [infer.py](./infer.py)`infer` 函数,指定:需要测试的模型的路径、测试数据、字典文件,预测标记文件的路径,默认参数如下:
```python
infer(
model_path="models/params_pass_0",
batch_size=6,
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt",
use_gpu=False
)
```
2. 在终端运行 `python infer.py`,开始测试,会看到如下预测结果(以下为训练70个pass所得模型的部分预测结果):
```text
leicestershire B-ORG B-LOC
extended O O
their O O
first O O
innings O O
by O O
DGDG O O
runs O O
before O O
being O O
bowled O O
out O O
for O O
296 O O
with O O
england B-LOC B-LOC
discard O O
andy B-PER B-PER
caddick I-PER I-PER
taking O O
three O O
for O O
DGDG O O
. O O
```
输出分为三列,以“\t” 分隔,第一列是输入的词语,第二列是标准结果,第三列为生成的标记结果。多条输入序列之间以空行分隔。
## 结果示例
<p align="center">
<img src="imgs/convergence_curve.png" width="80%" align="center"/><br/>
图1. 学习曲线, 横轴表示训练轮数,纵轴表示F1值
</p>
import numpy as np
import paddle.fluid as fluid
import paddle.v2 as paddle
from network_conf import ner_net
import reader
from utils import load_dict, load_reverse_dict
from utils_extend import to_lodtensor
def infer(model_path, batch_size, test_data_file, vocab_file, target_file,
use_gpu):
"""
use the model under model_path to predict the test data, the result will be printed on the screen
return nothing
"""
word_dict = load_dict(vocab_file)
word_reverse_dict = load_reverse_dict(vocab_file)
label_dict = load_dict(target_file)
label_reverse_dict = load_reverse_dict(target_file)
test_data = paddle.batch(
reader.data_reader(test_data_file, word_dict, label_dict),
batch_size=batch_size)
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(model_path, exe)
for data in test_data():
word = to_lodtensor(map(lambda x: x[0], data), place)
mark = to_lodtensor(map(lambda x: x[1], data), place)
target = to_lodtensor(map(lambda x: x[2], data), place)
crf_decode = exe.run(
inference_program,
feed={"word": word,
"mark": mark,
"target": target},
fetch_list=fetch_targets,
return_numpy=False)
lod_info = (crf_decode[0].lod())[0]
np_data = np.array(crf_decode[0])
assert len(data) == len(lod_info) - 1
for sen_index in xrange(len(data)):
assert len(data[sen_index][0]) == lod_info[
sen_index + 1] - lod_info[sen_index]
word_index = 0
for tag_index in xrange(lod_info[sen_index],
lod_info[sen_index + 1]):
word = word_reverse_dict[data[sen_index][0][word_index]]
gold_tag = label_reverse_dict[data[sen_index][2][
word_index]]
tag = label_reverse_dict[np_data[tag_index][0]]
print word + "\t" + gold_tag + "\t" + tag
word_index += 1
print ""
if __name__ == "__main__":
infer(
model_path="models/params_pass_0",
batch_size=6,
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt",
use_gpu=False)
import math
import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer
from utils import logger, load_dict, get_embedding
def ner_net(word_dict_len, label_dict_len, parallel, stack_num=2):
mark_dict_len = 2
word_dim = 50
mark_dim = 5
hidden_dim = 300
IS_SPARSE = True
embedding_name = 'emb'
def _net_conf(word, mark, target):
word_embedding = fluid.layers.embedding(
input=word,
size=[word_dict_len, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(
name=embedding_name, trainable=False))
mark_embedding = fluid.layers.embedding(
input=mark,
size=[mark_dict_len, mark_dim],
dtype='float32',
is_sparse=IS_SPARSE)
word_caps_vector = fluid.layers.concat(
input=[word_embedding, mark_embedding], axis=1)
mix_hidden_lr = 1
rnn_para_attr = fluid.ParamAttr(
initializer=NormalInitializer(
loc=0.0, scale=0.0),
learning_rate=mix_hidden_lr)
hidden_para_attr = fluid.ParamAttr(
initializer=NormalInitializer(
loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)),
learning_rate=mix_hidden_lr)
hidden = fluid.layers.fc(
input=word_caps_vector,
name="__hidden00__",
size=hidden_dim,
act="tanh",
bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))),
param_attr=fluid.ParamAttr(initializer=NormalInitializer(
loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))))
fea = []
for direction in ["fwd", "bwd"]:
for i in range(stack_num):
if i != 0:
hidden = fluid.layers.fc(
name="__hidden%02d_%s__" % (i, direction),
size=hidden_dim,
act="stanh",
bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
loc=0.0, scale=1.0)),
input=[hidden, rnn[0], rnn[1]],
param_attr=[
hidden_para_attr, rnn_para_attr, rnn_para_attr
])
rnn = fluid.layers.dynamic_lstm(
name="__rnn%02d_%s__" % (i, direction),
input=hidden,
size=hidden_dim,
candidate_activation='relu',
gate_activation='sigmoid',
cell_activation='sigmoid',
bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
loc=0.0, scale=1.0)),
is_reverse=(i % 2) if direction == "fwd" else not i % 2,
param_attr=rnn_para_attr)
fea += [hidden, rnn[0], rnn[1]]
rnn_fea = fluid.layers.fc(
size=hidden_dim,
bias_attr=fluid.ParamAttr(initializer=NormalInitializer(
loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))),
act="stanh",
input=fea,
param_attr=[hidden_para_attr, rnn_para_attr, rnn_para_attr] * 2)
emission = fluid.layers.fc(
size=label_dict_len,
input=rnn_fea,
param_attr=fluid.ParamAttr(initializer=NormalInitializer(
loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3))))
crf_cost = fluid.layers.linear_chain_crf(
input=emission,
label=target,
param_attr=fluid.ParamAttr(
name='crfw',
initializer=NormalInitializer(
loc=0.0, scale=(1. / math.sqrt(hidden_dim) / 3)),
learning_rate=mix_hidden_lr))
avg_cost = fluid.layers.mean(x=crf_cost)
return avg_cost, emission
word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
mark = fluid.layers.data(name='mark', shape=[1], dtype='int64', lod_level=1)
target = fluid.layers.data(
name="target", shape=[1], dtype='int64', lod_level=1)
if parallel:
places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places)
with pd.do():
word_ = pd.read_input(word)
mark_ = pd.read_input(mark)
target_ = pd.read_input(target)
avg_cost, emission_base = _net_conf(word_, mark_, target_)
pd.write_output(avg_cost)
pd.write_output(emission_base)
avg_cost_list, emission = pd()
avg_cost = fluid.layers.mean(x=avg_cost_list)
emission.stop_gradient = True
else:
avg_cost, emission = _net_conf(word, mark, target)
return avg_cost, emission, word, mark, target
import os
import math
import numpy as np
import paddle.v2 as paddle
import paddle.fluid as fluid
import reader
from network_conf import ner_net
from utils import logger, load_dict
from utils_extend import to_lodtensor, get_embedding
def test(exe, chunk_evaluator, inference_program, test_data, place):
chunk_evaluator.reset(exe)
for data in test_data():
word = to_lodtensor(map(lambda x: x[0], data), place)
mark = to_lodtensor(map(lambda x: x[1], data), place)
target = to_lodtensor(map(lambda x: x[2], data), place)
acc = exe.run(inference_program,
feed={"word": word,
"mark": mark,
"target": target})
return chunk_evaluator.eval(exe)
def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
model_save_dir, num_passes, use_gpu, parallel):
if not os.path.exists(model_save_dir):
os.mkdir(model_save_dir)
BATCH_SIZE = 200
word_dict = load_dict(vocab_file)
label_dict = load_dict(target_file)
word_vector_values = get_embedding(emb_file)
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
avg_cost, feature_out, word, mark, target = ner_net(
word_dict_len, label_dict_len, parallel)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
sgd_optimizer.minimize(avg_cost)
crf_decode = fluid.layers.crf_decoding(
input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
chunk_evaluator = fluid.evaluator.ChunkEvaluator(
input=crf_decode,
label=target,
chunk_scheme="IOB",
num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
test_target = chunk_evaluator.metrics + chunk_evaluator.states
inference_program = fluid.io.get_inference_program(test_target)
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(train_data_file, word_dict, label_dict),
buf_size=20000),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(
paddle.reader.shuffle(
reader.data_reader(test_data_file, word_dict, label_dict),
buf_size=20000),
batch_size=BATCH_SIZE)
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
embedding_name = 'emb'
embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
embedding_param.set(word_vector_values, place)
batch_id = 0
for pass_id in xrange(num_passes):
chunk_evaluator.reset(exe)
for data in train_reader():
cost, batch_precision, batch_recall, batch_f1_score = exe.run(
fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + chunk_evaluator.metrics)
if batch_id % 5 == 0:
print("Pass " + str(pass_id) + ", Batch " + str(
batch_id) + ", Cost " + str(cost[0]) + ", Precision " + str(
batch_precision[0]) + ", Recall " + str(batch_recall[0])
+ ", F1_score" + str(batch_f1_score[0]))
batch_id = batch_id + 1
pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" + str(
pass_precision) + " pass_recall:" + str(pass_recall) +
" pass_f1_score:" + str(pass_f1_score))
pass_precision, pass_recall, pass_f1_score = test(
exe, chunk_evaluator, inference_program, test_reader, place)
print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" + str(
pass_precision) + " pass_recall:" + str(pass_recall) +
" pass_f1_score:" + str(pass_f1_score))
save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
fluid.io.save_inference_model(save_dirname, ['word', 'mark', 'target'],
[crf_decode], exe)
if __name__ == "__main__":
main(
train_data_file="data/train",
test_data_file="data/test",
vocab_file="data/vocab.txt",
target_file="data/target.txt",
emb_file="data/wordVectors.txt",
model_save_dir="models",
num_passes=1000,
use_gpu=False,
parallel=False)
import numpy as np
import paddle.fluid as fluid
def get_embedding(emb_file='data/wordVectors.txt'):
"""
Get the trained word vector.
"""
return np.loadtxt(emb_file, dtype='float32')
def to_lodtensor(data, place):
"""
convert data to lodtensor
"""
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
此目录中代码示例PaddlePaddle所需版本至少为v0.11.0。如果您使用的PaddlePaddle版本早于v0.11.0, [请更新](http://www.paddlepaddle.org/docs/develop/documentation/en/build_and_install/pip_install_en.html).
---
# 全球标准化阅读器
该模型实现以下功能:
Jonathan Raiman and John Miller. Globally Normalized Reader. Empirical Methods in Natural Language Processing (EMNLP), 2017
如果您在研究中使用数据集/代码,请引用上述论文:
```text
@inproceedings{raiman2015gnr,
author={Raiman, Jonathan and Miller, John},
booktitle={Empirical Methods in Natural Language Processing (EMNLP)},
title={Globally Normalized Reader},
year={2017},
}
```
您也可以访问 https://github.com/baidu-research/GloballyNormalizedReader 以获取更多信息。
# 安装
1. 请使用 [docker image](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) 安装最新的PaddlePaddle,运行方法:
```bash
docker pull paddledev/paddle
```
2. 下载所有必要的数据,运行方法:
```bash
cd data && ./download.sh && cd ..
```
3. 预处理并特征化数据:
```bash
python featurize.py --datadir data --outdir data/featurized --glove-path data/glove.840B.300d.txt
```
# 模型训练
- 根据需要修改config.py来配置模型,然后运行:
```bash
python train.py 2>&1 | tee train.log
```
# 使用训练过的模型推断
- 运行以下训练模型来推断:
```bash
python infer.py \
--model_path models/pass_00000.tar.gz \
--data_dir data/featurized/ \
--batch_size 2 \
--use_gpu 0 \
--trainer_count 1 \
2>&1 | tee infer.log
```
运行本目录下的程序示例需要使用PaddlePaddle v0.10.0 版本。如果您的PaddlePaddle安装版本低于此要求,请按照[安装文档](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html)中的说明更新PaddlePaddle安装版本。 运行本目录下的程序示例需要使用PaddlePaddle v0.10.0 版本。如果您的PaddlePaddle安装版本低于此要求,请按照[安装文档](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html)中的说明更新PaddlePaddle安装版本。
--- ---
...@@ -25,16 +25,16 @@ ...@@ -25,16 +25,16 @@
命名实体识别(Named Entity Recognition,NER)又称作“专名识别”,是指识别文本中具有特定意义的实体,主要包括人名、地名、机构名、专有名词等,是自然语言处理研究的一个基础问题。NER任务通常包括实体边界识别、确定实体类别两部分,可以将其作为序列标注问题解决。 命名实体识别(Named Entity Recognition,NER)又称作“专名识别”,是指识别文本中具有特定意义的实体,主要包括人名、地名、机构名、专有名词等,是自然语言处理研究的一个基础问题。NER任务通常包括实体边界识别、确定实体类别两部分,可以将其作为序列标注问题解决。
序列标注可以分为Sequence Classification、Segment Classification和Temporal Classification三类[[1](#参考文献)],本例只考虑Segment Classification,即对输入序列中的每个元素在输出序列中给出对应的标签。对于NER任务,由于需要标识边界,一般采用[BIO标注方法](http://book.paddlepaddle.org/07.label_semantic_roles/)定义的标签集,如下是一个NER的标注结果示例: 序列标注可以分为Sequence Classification、Segment Classification和Temporal Classification三类[[1](#参考文献)],本例只考虑Segment Classification,即对输入序列中的每个元素在输出序列中给出对应的标签。对于NER任务,由于需要标识边界,一般采用[BIO标注方法](http://www.paddlepaddle.org/docs/develop/book/07.label_semantic_roles/index.cn.html)定义的标签集,如下是一个NER的标注结果示例:
<p align="center"> <p align="center">
<img src="images/ner_label_ins.png" width="80%" align="center"/><br/> <img src="images/ner_label_ins.png" width="80%" align="center"/><br/>
图1. BIO标注方法示例 图1. BIO标注方法示例
</p> </p>
根据序列标注结果可以直接得到实体边界和实体类别。类似的,分词、词性标注、语块识别、[语义角色标注](http://book.paddlepaddle.org/07.label_semantic_roles/index.cn.html)等任务都可通过序列标注来解决。使用神经网络模型解决问题的思路通常是:前层网络学习输入的特征表示,网络的最后一层在特征基础上完成最终的任务;对于序列标注问题,通常:使用基于RNN的网络结构学习特征,将学习到的特征接入CRF完成序列标注。实际上是将传统CRF中的线性模型换成了非线性神经网络。沿用CRF的出发点是:CRF使用句子级别的似然概率,能够更好的解决标记偏置问题[[2](#参考文献)]。本例也将基于此思路建立模型。虽然,这里以NER任务作为示例,但所给出的模型可以应用到其他各种序列标注任务中。 根据序列标注结果可以直接得到实体边界和实体类别。类似的,分词、词性标注、语块识别、[语义角色标注](http://www.paddlepaddle.org/docs/develop/book/07.label_semantic_roles/index.cn.html)等任务都可通过序列标注来解决。使用神经网络模型解决问题的思路通常是:前层网络学习输入的特征表示,网络的最后一层在特征基础上完成最终的任务;对于序列标注问题,通常:使用基于RNN的网络结构学习特征,将学习到的特征接入CRF完成序列标注。实际上是将传统CRF中的线性模型换成了非线性神经网络。沿用CRF的出发点是:CRF使用句子级别的似然概率,能够更好的解决标记偏置问题[[2](#参考文献)]。本例也将基于此思路建立模型。虽然,这里以NER任务作为示例,但所给出的模型可以应用到其他各种序列标注任务中。
由于序列标注问题的广泛性,产生了[CRF](http://book.paddlepaddle.org/07.label_semantic_roles/index.cn.html)等经典的序列模型,这些模型大多只能使用局部信息或需要人工设计特征。随着深度学习研究的发展,循环神经网络(Recurrent Neural Network,RNN等 序列模型能够处理序列元素之间前后关联问题,能够从原始输入文本中学习特征表示,而更加适合序列标注任务,更多相关知识可参考PaddleBook中[语义角色标注](https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/README.cn.md)一课。 由于序列标注问题的广泛性,产生了[CRF](http://www.paddlepaddle.org/docs/develop/book/07.label_semantic_roles/index.cn.html)等经典的序列模型,这些模型大多只能使用局部信息或需要人工设计特征。随着深度学习研究的发展,循环神经网络(Recurrent Neural Network,RNN等 序列模型能够处理序列元素之间前后关联问题,能够从原始输入文本中学习特征表示,而更加适合序列标注任务,更多相关知识可参考PaddleBook中[语义角色标注](https://github.com/PaddlePaddle/book/blob/develop/07.label_semantic_roles/README.cn.md)一课。
## 模型详解 ## 模型详解
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册