提交 50816a2d 编写于 作者: 0 0YuanZhang0

update_sequence_tagging

上级 dc437431
...@@ -25,7 +25,8 @@ from hapi.text.text import TransformerDecoderLayer as TransformerDecoderLayer ...@@ -25,7 +25,8 @@ from hapi.text.text import TransformerDecoderLayer as TransformerDecoderLayer
from hapi.text.text import TransformerEncoder as TransformerEncoder from hapi.text.text import TransformerEncoder as TransformerEncoder
from hapi.text.text import TransformerDecoder as TransformerDecoder from hapi.text.text import TransformerDecoder as TransformerDecoder
from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearchDecoder from hapi.text.text import TransformerBeamSearchDecoder as TransformerBeamSearchDecoder
from hapi.text.text import DynamicGRU as DynamicGRU from hapi.text.text import GRUCell as GRUCell
from hapi.text.text import GRUEncoderCell as GRUEncoderCell
from hapi.text.text import BiGRU as BiGRU from hapi.text.text import BiGRU as BiGRU
from hapi.text.text import Linear_chain_crf as Linear_chain_crf from hapi.text.text import Linear_chain_crf as Linear_chain_crf
from hapi.text.text import Crf_decoding as Crf_decoding from hapi.text.text import Crf_decoding as Crf_decoding
......
...@@ -22,7 +22,7 @@ import sys ...@@ -22,7 +22,7 @@ import sys
if six.PY2: if six.PY2:
reload(sys) reload(sys)
sys.setdefaultencoding('utf8') sys.setdefaultencoding('utf8')
import ast import ast
import time import time
import argparse as argparse import argparse as argparse
...@@ -31,8 +31,6 @@ import multiprocessing ...@@ -31,8 +31,6 @@ import multiprocessing
import collections import collections
import copy import copy
import six
import sys
from functools import partial, reduce from functools import partial, reduce
import paddle import paddle
...@@ -46,11 +44,12 @@ from paddle.fluid import layers ...@@ -46,11 +44,12 @@ from paddle.fluid import layers
from paddle.fluid.dygraph import Layer from paddle.fluid.dygraph import Layer
from paddle.fluid.layers import BeamSearchDecoder from paddle.fluid.layers import BeamSearchDecoder
__all__ = [ __all__ = [
'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode', 'RNNCell', 'BasicLSTMCell', 'BasicGRUCell', 'RNN', 'DynamicDecode',
'BeamSearchDecoder', 'MultiHeadAttention', 'FFN', 'BeamSearchDecoder', 'MultiHeadAttention', 'FFN',
'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer',
'TransformerDecoder', 'TransformerBeamSearchDecoder', 'DynamicGRU', 'TransformerDecoder', 'TransformerBeamSearchDecoder', 'GRUCell', 'GRUEncoderCell',
'BiGRU', 'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging' 'BiGRU', 'Linear_chain_crf', 'Crf_decoding', 'SequenceTagging'
] ]
...@@ -220,19 +219,7 @@ class BasicLSTMCell(RNNCell): ...@@ -220,19 +219,7 @@ class BasicLSTMCell(RNNCell):
gate_activation=None, gate_activation=None,
activation=None, activation=None,
forget_bias=1.0, forget_bias=1.0,
dtype='float32', dtype='float32'):
forget_gate_weights={"w": None,
"h": None,
"b": None},
input_gate_weights={"w": None,
"h": None,
"b": None},
output_gate_weights={"w": None,
"h": None,
"b": None},
cell_weights={"w": None,
"h": None,
"b": None}):
super(BasicLSTMCell, self).__init__() super(BasicLSTMCell, self).__init__()
self._hidden_size = hidden_size self._hidden_size = hidden_size
...@@ -246,187 +233,18 @@ class BasicLSTMCell(RNNCell): ...@@ -246,187 +233,18 @@ class BasicLSTMCell(RNNCell):
self._dtype = dtype self._dtype = dtype
self._input_size = input_size self._input_size = input_size
assert isinstance(forget_gate_weights, dict) self._weight = self.create_parameter(
assert isinstance(input_gate_weights, dict) attr=self._param_attr,
assert isinstance(output_gate_weights, dict) shape=[
assert isinstance(cell_weights, dict) self._input_size + self._hidden_size, 4 * self._hidden_size
],
# forgot get parameters dtype=self._dtype)
if "w" in forget_gate_weights and forget_gate_weights["w"] is not None:
self.fg_w = forget_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_forget_gate_w"
else:
tmp_param_attr = self._param_attr
self.fg_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in forget_gate_weights and forget_gate_weights["h"] is not None:
self.fg_h = forget_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_forget_gate_h"
else:
tmp_param_attr = self._param_attr
self.fg_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in forget_gate_weights and forget_gate_weights["b"] is not None:
self.fg_b = forget_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_forget_gate_b"
else:
tmp_param_attr = self._bias_attr
self.fg_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# input gate parameters
if "w" in input_gate_weights and input_gate_weights["w"] is not None:
self.ig_w = input_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_input_gate_w"
else:
tmp_param_attr = self._param_attr
self.ig_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in input_gate_weights and input_gate_weights["h"] is not None:
self.ig_h = input_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_input_gate_h"
else:
tmp_param_attr = self._param_attr
self.ig_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in input_gate_weights and input_gate_weights["b"] is not None:
self.ig_b = input_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_input_gate_b"
else:
tmp_param_attr = self._bias_attr
self.ig_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# output gate parameters
if "w" in output_gate_weights and output_gate_weights["w"] is not None:
self.og_w = output_gate_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_output_gate_w"
else:
tmp_param_attr = self._param_attr
self.og_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in output_gate_weights and output_gate_weights["h"] is not None:
self.og_h = output_gate_weights["h"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_output_gate_h"
else:
tmp_param_attr = self._param_attr
self.og_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in output_gate_weights and output_gate_weights["b"] is not None:
self.og_b = output_gate_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_output_gate_b"
else:
tmp_param_attr = self._bias_attr
self.og_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# cell parameters
if "w" in cell_weights and cell_weights["w"] is not None:
self.c_w = cell_weights["w"]
else:
if self._param_attr is not None and self._param_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._param_attr)
tmp_param_attr.name += "_cell_w"
else:
tmp_param_attr = self._param_attr
self.c_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in cell_weights and cell_weights["h"] is not None: self._bias = self.create_parameter(
self.c_h = cell_weights["h"] attr=self._bias_attr,
else: shape=[4 * self._hidden_size],
if self._param_attr is not None and self._param_attr.name is not None: dtype=self._dtype,
tmp_param_attr = copy.deepcopy(self._param_attr) is_bias=True)
tmp_param_attr.name += "_cell_h"
else:
tmp_param_attr = self._param_attr
self.c_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in cell_weights and cell_weights["b"] is not None:
self.c_b = cell_weights["b"]
else:
if self._bias_attr is not None and self._bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(self._bias_attr)
tmp_param_attr.name += "_cell_b"
else:
tmp_param_attr = self._bias_attr
self.c_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# the weight is concated here in order to make the computation more efficent.
weight_w = fluid.layers.concat(
[self.ig_w, self.c_w, self.fg_w, self.og_w], axis=-1)
weight_h = fluid.layers.concat(
[self.ig_h, self.c_h, self.fg_h, self.og_h], axis=-1)
self._weight = fluid.layers.concat([weight_w, weight_h], axis=0)
self._bias = fluid.layers.concat(
[self.ig_b, self.c_b, self.fg_b, self.og_b])
def forward(self, input, state): def forward(self, input, state):
pre_hidden, pre_cell = state pre_hidden, pre_cell = state
...@@ -490,30 +308,16 @@ class BasicGRUCell(RNNCell): ...@@ -490,30 +308,16 @@ class BasicGRUCell(RNNCell):
bias_attr=None, bias_attr=None,
gate_activation=None, gate_activation=None,
activation=None, activation=None,
dtype='float32', dtype='float32'):
update_gate_weights={"w": None,
"h": None,
"b": None},
reset_gate_weights={"w": None,
"h": None,
"b": None},
cell_weights={"w": None,
"h": None,
"b": None}):
super(BasicGRUCell, self).__init__() super(BasicGRUCell, self).__init__()
self._input_size = input_size self._input_size = input_size
self._hiden_size = hidden_size self._hidden_size = hidden_size
self._param_attr = param_attr self._param_attr = param_attr
self._bias_attr = bias_attr self._bias_attr = bias_attr
self._gate_activation = gate_activation or layers.sigmoid self._gate_activation = gate_activation or layers.sigmoid
self._activation = activation or layers.tanh self._activation = activation or layers.tanh
self._dtype = dtype self._dtype = dtype
assert isinstance(update_gate_weights, dict)
assert isinstance(reset_gate_weights, dict)
assert isinstance(cell_weights, dict)
if self._param_attr is not None and self._param_attr.name is not None: if self._param_attr is not None and self._param_attr.name is not None:
gate_param_attr = copy.deepcopy(self._param_attr) gate_param_attr = copy.deepcopy(self._param_attr)
candidate_param_attr = copy.deepcopy(self._param_attr) candidate_param_attr = copy.deepcopy(self._param_attr)
...@@ -523,6 +327,16 @@ class BasicGRUCell(RNNCell): ...@@ -523,6 +327,16 @@ class BasicGRUCell(RNNCell):
gate_param_attr = self._param_attr gate_param_attr = self._param_attr
candidate_param_attr = self._param_attr candidate_param_attr = self._param_attr
self._gate_weight = self.create_parameter(
attr=gate_param_attr,
shape=[self._input_size + self._hidden_size, 2 * self._hidden_size],
dtype=self._dtype)
self._candidate_weight = self.create_parameter(
attr=candidate_param_attr,
shape=[self._input_size + self._hidden_size, self._hidden_size],
dtype=self._dtype)
if self._bias_attr is not None and self._bias_attr.name is not None: if self._bias_attr is not None and self._bias_attr.name is not None:
gate_bias_attr = copy.deepcopy(self._bias_attr) gate_bias_attr = copy.deepcopy(self._bias_attr)
candidate_bias_attr = copy.deepcopy(self._bias_attr) candidate_bias_attr = copy.deepcopy(self._bias_attr)
...@@ -532,139 +346,16 @@ class BasicGRUCell(RNNCell): ...@@ -532,139 +346,16 @@ class BasicGRUCell(RNNCell):
gate_bias_attr = self._bias_attr gate_bias_attr = self._bias_attr
candidate_bias_attr = self._bias_attr candidate_bias_attr = self._bias_attr
# create the parameters of gates in gru self._gate_bias = self.create_parameter(
if "w" in update_gate_weights and update_gate_weights["w"] is not None: attr=gate_bias_attr,
self.ug_w = update_gate_weights["w"] shape=[2 * self._hidden_size],
else: dtype=self._dtype,
if gate_param_attr is not None and gate_param_attr.name is not None: is_bias=True)
tmp_param_attr = copy.deepcopy(gate_param_attr) self._candidate_bias = self.create_parameter(
tmp_param_attr.name += "_update_gate_w" attr=candidate_bias_attr,
else: shape=[self._hidden_size],
tmp_param_attr = gate_param_attr dtype=self._dtype,
self.ug_w = self.create_parameter( is_bias=True)
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in update_gate_weights and update_gate_weights["h"] is not None:
self.ug_h = update_gate_weights["h"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_update_gate_h"
else:
tmp_param_attr = gate_param_attr
self.ug_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in update_gate_weights and update_gate_weights["b"] is not None:
self.ug_b = update_gate_weights["b"]
else:
if gate_bias_attr is not None and gate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_bias_attr)
tmp_param_attr.name += "_update_gate_b"
else:
tmp_param_attr = gate_bias_attr
self.ug_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# reset gate parameters
if "w" in reset_gate_weights and reset_gate_weights["w"] is not None:
self.rg_w = reset_gate_weights["w"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_reset_gate_w"
else:
tmp_param_attr = gate_param_attr
self.rg_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in reset_gate_weights and reset_gate_weights["h"] is not None:
self.rg_h = reset_gate_weights["h"]
else:
if gate_param_attr is not None and gate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_param_attr)
tmp_param_attr.name += "_reset_gate_h"
else:
tmp_param_attr = gate_param_attr
self.rg_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in reset_gate_weights and reset_gate_weights["b"] is not None:
self.rg_b = reused_params["b"]
else:
if gate_bias_attr is not None and gate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(gate_bias_attr)
tmp_param_attr.name += "_reset_gate_b"
else:
tmp_param_attr = gate_bias_attr
self.rg_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
# cell parameters
if "w" in cell_weights and cell_weights["w"] is not None:
self.c_w = cell_weights["w"]
else:
if candidate_param_attr is not None and candidate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_param_attr)
tmp_param_attr.name += "_cell_w"
else:
tmp_param_attr = gate_param_attr
self.c_w = self.create_parameter(
attr=tmp_param_attr,
shape=[self._input_size, self._hidden_size],
dtype=self._dtype)
if "h" in cell_weights and cell_weights["h"] is not None:
self.c_h = cell_weights["h"]
else:
if candidate_param_attr is not None and candidate_param_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_param_attr)
tmp_param_attr.name += "_cell_h"
else:
tmp_param_attr = gate_param_attr
self.c_h = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size, self._hidden_size],
dtype=self._dtype)
if "b" in cell_weights and cell_weights["b"] is not None:
self.c_b = cell_weights["b"]
else:
if candidate_bias_attr is not None and candidate_bias_attr.name is not None:
tmp_param_attr = copy.deepcopy(candidate_bias_attr)
tmp_param_attr.name += "_cell_b"
else:
tmp_param_attr = gate_bias_attr
self.c_b = self.create_parameter(
attr=tmp_param_attr,
shape=[self._hidden_size],
dtype=self._dtype,
is_bias=True)
rg_weights = layers.concat([self.rg_w, self.rg_h], axis=0)
ug_weights = layers.concat([self.ug_w, self.ug_h], axis=0)
self._gate_weight = layers.concat([rg_weights, ug_weights], axis=-1)
self._candidate_weight = layers.concat([self.c_w, self.c_h], axis=0)
self._gate_bias = layers.concat([self.rg_b, self.ug_b], axis=0)
self._candidate_bias = self.c_b
def forward(self, input, state): def forward(self, input, state):
pre_hidden = state pre_hidden = state
...@@ -870,7 +561,6 @@ class DynamicDecode(Layer): ...@@ -870,7 +561,6 @@ class DynamicDecode(Layer):
# To confirm states.finished/finished be consistent with # To confirm states.finished/finished be consistent with
# next_finished. # next_finished.
layers.assign(next_finished, finished) layers.assign(next_finished, finished)
next_sequence_lengths = layers.elementwise_add( next_sequence_lengths = layers.elementwise_add(
sequence_lengths, sequence_lengths,
layers.cast( layers.cast(
...@@ -1010,11 +700,7 @@ class PrePostProcessLayer(Layer): ...@@ -1010,11 +700,7 @@ class PrePostProcessLayer(Layer):
PrePostProcessLayer PrePostProcessLayer
""" """
def __init__(self, def __init__(self, process_cmd, d_model, dropout_rate):
process_cmd,
d_model,
dropout_rate,
reused_layer_norm=None):
super(PrePostProcessLayer, self).__init__() super(PrePostProcessLayer, self).__init__()
self.process_cmd = process_cmd self.process_cmd = process_cmd
self.functors = [] self.functors = []
...@@ -1022,21 +708,16 @@ class PrePostProcessLayer(Layer): ...@@ -1022,21 +708,16 @@ class PrePostProcessLayer(Layer):
if cmd == "a": # add residual connection if cmd == "a": # add residual connection
self.functors.append(lambda x, y: x + y if y else x) self.functors.append(lambda x, y: x + y if y else x)
elif cmd == "n": # add layer normalization elif cmd == "n": # add layer normalization
if reused_layer_norm is not None:
layer_norm = reused_layer_norm
else:
layer_norm = LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))
self.functors.append( self.functors.append(
self.add_sublayer( self.add_sublayer(
"layer_norm_%d" % len( "layer_norm_%d" % len(
self.sublayers(include_sublayers=False)), self.sublayers(include_sublayers=False)),
layer_norm)) LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(1.)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(0.)))))
elif cmd == "d": # add dropout elif cmd == "d": # add dropout
self.functors.append(lambda x: layers.dropout( self.functors.append(lambda x: layers.dropout(
x, dropout_prob=dropout_rate, is_test=False) x, dropout_prob=dropout_rate, is_test=False)
...@@ -1056,48 +737,21 @@ class MultiHeadAttention(Layer): ...@@ -1056,48 +737,21 @@ class MultiHeadAttention(Layer):
Multi-Head Attention Multi-Head Attention
""" """
def __init__(self, def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.):
d_key,
d_value,
d_model,
n_head=1,
dropout_rate=0.0,
reused_query_fc=None,
reused_key_fc=None,
reused_value_fc=None,
reused_proj_fc=None):
super(MultiHeadAttention, self).__init__() super(MultiHeadAttention, self).__init__()
self.n_head = n_head self.n_head = n_head
self.d_key = d_key self.d_key = d_key
self.d_value = d_value self.d_value = d_value
self.d_model = d_model self.d_model = d_model
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self.q_fc = Linear(
if reused_query_fc is not None: input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.q_fc = reused_query_fc self.k_fc = Linear(
else: input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.q_fc = Linear( self.v_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False) input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
if reused_key_fc is not None: self.proj_fc = Linear(
self.k_fc = reused_key_fc input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
else:
self.k_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
if reused_value_fc is not None:
self.v_fc = reused_value_fc
else:
self.v_fc = Linear(
input_dim=d_model,
output_dim=d_value * n_head,
bias_attr=False)
if reused_proj_fc is not None:
self.proj_fc = reused_proj_fc
else:
self.proj_fc = Linear(
input_dim=d_value * n_head,
output_dim=d_model,
bias_attr=False)
def _prepare_qkv(self, queries, keys, values, cache=None): def _prepare_qkv(self, queries, keys, values, cache=None):
if keys is None: # self-attention if keys is None: # self-attention
...@@ -1174,24 +828,12 @@ class FFN(Layer): ...@@ -1174,24 +828,12 @@ class FFN(Layer):
Feed-Forward Network Feed-Forward Network
""" """
def __init__(self, def __init__(self, d_inner_hid, d_model, dropout_rate):
d_inner_hid,
d_model,
dropout_rate,
fc1_act="relu",
reused_fc1=None,
reused_fc2=None):
super(FFN, self).__init__() super(FFN, self).__init__()
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
if reused_fc1 is not None: self.fc1 = Linear(
self.fc1 = reused_fc1 input_dim=d_model, output_dim=d_inner_hid, act="relu")
else: self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
self.fc1 = Linear(
input_dim=d_model, output_dim=d_inner_hid, act=fc1_act)
if reused_fc2 is not None:
self.fc2 = reused_fc2
else:
self.fc2 = Linear(input_dim=d_inner_hid, output_dim=d_model)
def forward(self, x): def forward(self, x):
hidden = self.fc1(x) hidden = self.fc1(x)
...@@ -1217,52 +859,22 @@ class TransformerEncoderLayer(Layer): ...@@ -1217,52 +859,22 @@ class TransformerEncoderLayer(Layer):
attention_dropout, attention_dropout,
relu_dropout, relu_dropout,
preprocess_cmd="n", preprocess_cmd="n",
postprocess_cmd="da", postprocess_cmd="da"):
ffn_fc1_act="relu",
reused_pre_selatt_layernorm=None,
reused_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_selfatt_layernorm=None,
reused_pre_ffn_layernorm=None,
reused_ffn_weights={"reused_fc1": None,
"reused_fc2": None},
reused_post_ffn_layernorm=None):
super(TransformerEncoderLayer, self).__init__() super(TransformerEncoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_pre_selatt_layernorm) self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
self.self_attn = MultiHeadAttention( attention_dropout)
d_key, self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
d_value, prepostprocess_dropout)
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_multihead_att_weights["reused_query_fc"],
reused_key_fc=reused_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_multihead_att_weights["reused_value_fc"],
reused_proj_fc=reused_multihead_att_weights["reused_proj_fc"])
self.postprocesser1 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_selfatt_layernorm)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_pre_ffn_layernorm) self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
self.ffn = FFN(d_inner_hid,
d_model,
relu_dropout,
fc1_act=ffn_fc1_act,
reused_fc1=reused_ffn_weights["reused_fc1"],
reused_fc2=reused_ffn_weights["reused_fc2"])
self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_post_ffn_layernorm)
def forward(self, enc_input, attn_bias): def forward(self, enc_input, attn_bias):
attn_output = self.self_attn( attn_output = self.self_attn(
...@@ -1290,8 +902,7 @@ class TransformerEncoder(Layer): ...@@ -1290,8 +902,7 @@ class TransformerEncoder(Layer):
attention_dropout, attention_dropout,
relu_dropout, relu_dropout,
preprocess_cmd="n", preprocess_cmd="n",
postprocess_cmd="da", postprocess_cmd="da"):
ffn_fc1_act="relu"):
super(TransformerEncoder, self).__init__() super(TransformerEncoder, self).__init__()
...@@ -1301,17 +912,9 @@ class TransformerEncoder(Layer): ...@@ -1301,17 +912,9 @@ class TransformerEncoder(Layer):
self.add_sublayer( self.add_sublayer(
"layer_%d" % i, "layer_%d" % i,
TransformerEncoderLayer( TransformerEncoderLayer(
n_head, n_head, d_key, d_value, d_model, d_inner_hid,
d_key, prepostprocess_dropout, attention_dropout,
d_value, relu_dropout, preprocess_cmd, postprocess_cmd)))
d_model,
d_inner_hid,
prepostprocess_dropout,
attention_dropout,
relu_dropout,
preprocess_cmd,
postprocess_cmd,
ffn_fc1_act=ffn_fc1_act)))
self.processer = PrePostProcessLayer(preprocess_cmd, d_model, self.processer = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout) prepostprocess_dropout)
...@@ -1338,79 +941,28 @@ class TransformerDecoderLayer(Layer): ...@@ -1338,79 +941,28 @@ class TransformerDecoderLayer(Layer):
attention_dropout, attention_dropout,
relu_dropout, relu_dropout,
preprocess_cmd="n", preprocess_cmd="n",
postprocess_cmd="da", postprocess_cmd="da"):
reused_pre_selfatt_layernorm=None,
reused_self_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_selfatt_layernorm=None,
reused_pre_crossatt_layernorm=None,
reused_cross_multihead_att_weights={
"reused_query_fc": None,
"reused_key_fc": None,
"reused_value_fc": None,
"reused_proj_fc": None
},
reused_post_crossatt_layernorm=None,
reused_pre_ffn_layernorm=None,
reused_ffn_weights={"reused_fc1": None,
"reused_fc2": None},
reused_post_ffn_layernorm=None):
super(TransformerDecoderLayer, self).__init__() super(TransformerDecoderLayer, self).__init__()
self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_pre_selfatt_layernorm) self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
self.self_attn = MultiHeadAttention( attention_dropout)
d_key, self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model,
d_value, prepostprocess_dropout)
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_self_multihead_att_weights[
"reused_query_fc"],
reused_key_fc=reused_self_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_self_multihead_att_weights[
"reused_value_fc"],
reused_proj_fc=reused_self_multihead_att_weights["reused_proj_fc"])
self.postprocesser1 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_selfatt_layernorm)
self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_pre_crossatt_layernorm) self.cross_attn = MultiHeadAttention(d_key, d_value, d_model, n_head,
self.cross_attn = MultiHeadAttention( attention_dropout)
d_key, self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model,
d_value, prepostprocess_dropout)
d_model,
n_head,
attention_dropout,
reused_query_fc=reused_cross_multihead_att_weights[
"reused_query_fc"],
reused_key_fc=reused_cross_multihead_att_weights["reused_key_fc"],
reused_value_fc=reused_cross_multihead_att_weights[
"reused_value_fc"],
reused_proj_fc=reused_cross_multihead_att_weights[
"reused_proj_fc"])
self.postprocesser2 = PrePostProcessLayer(
postprocess_cmd, d_model, prepostprocess_dropout,
reused_post_crossatt_layernorm)
self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model, self.preprocesser3 = PrePostProcessLayer(preprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_pre_ffn_layernorm) self.ffn = FFN(d_inner_hid, d_model, relu_dropout)
self.ffn = FFN(d_inner_hid,
d_model,
relu_dropout,
reused_fc1=reused_ffn_weights["reused_fc1"],
reused_fc2=reused_ffn_weights["reused_fc2"])
self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model, self.postprocesser3 = PrePostProcessLayer(postprocess_cmd, d_model,
prepostprocess_dropout, prepostprocess_dropout)
reused_post_ffn_layernorm)
def forward(self, def forward(self,
dec_input, dec_input,
...@@ -1479,98 +1031,99 @@ class TransformerDecoder(Layer): ...@@ -1479,98 +1031,99 @@ class TransformerDecoder(Layer):
] ]
class DynamicGRU(fluid.dygraph.Layer):
class GRUCell(RNNCell):
def __init__(self, def __init__(self,
size, input_size,
h_0=None, hidden_size,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
is_reverse=False,
gate_activation='sigmoid', gate_activation='sigmoid',
candidate_activation='tanh', candidate_activation='tanh',
origin_mode=False, origin_mode=False):
init_size=None): super(GRUCell, self).__init__()
super(DynamicGRU, self).__init__() self.hidden_size = hidden_size
self.fc_layer = Linear(
input_size,
hidden_size * 3,
param_attr=param_attr)
self.gru_unit = GRUUnit( self.gru_unit = GRUUnit(
size * 3, hidden_size * 3,
param_attr=param_attr, param_attr=param_attr,
bias_attr=bias_attr, bias_attr=bias_attr,
activation=candidate_activation, activation=candidate_activation,
gate_activation=gate_activation, gate_activation=gate_activation,
origin_mode=origin_mode) origin_mode=origin_mode)
self.size = size def forward(self, inputs, states):
self.h_0 = h_0 # for GRUCell, `step_outputs` and `new_states` both are hidden
self.is_reverse = is_reverse x = self.fc_layer(inputs)
hidden, _, _ = self.gru_unit(x, states)
return hidden, hidden
def forward(self, inputs): @property
hidden = self.h_0 def state_shape(self):
res = [] return [self.hidden_size]
for i in range(inputs.shape[1]):
if self.is_reverse: class GRUEncoderCell(RNNCell):
i = inputs.shape[1] - 1 - i def __init__(self,
input_ = inputs[:, i:i + 1, :] num_layers,
input_ = fluid.layers.reshape( input_size,
input_, [-1, input_.shape[2]], inplace=False) hidden_size,
hidden, reset, gate = self.gru_unit(input_, hidden) dropout_prob=0.,
hidden_ = fluid.layers.reshape( init_scale=0.1):
hidden, [-1, 1, hidden.shape[1]], inplace=False) super(GRUEncoderCell, self).__init__()
res.append(hidden_) self.dropout_prob = dropout_prob
if self.is_reverse: # use add_sublayer to add multi-layers
res = res[::-1] self.gru_cells = []
res = fluid.layers.concat(res, axis=1) for i in range(num_layers):
return res self.gru_cells.append(
self.add_sublayer(
"gru_%d" % i,
#BasicGRUCell(
GRUCell(
input_size=input_size if i == 0 else hidden_size,
hidden_size=hidden_size,
param_attr=fluid.ParamAttr(initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))))
def forward(self, step_input, states):
new_states = []
for i, gru_cell in enumerate(self.gru_cells):
out, state = gru_cell(step_input, states[i])
step_input = layers.dropout(
out,
self.dropout_prob,
dropout_implementation='upscale_in_train'
) if self.dropout_prob > 0 else out
new_states.append(step_input)
return step_input, new_states
@property
def state_shape(self):
return [cell.state_shape for cell in self.gru_cells]
class BiGRU(fluid.dygraph.Layer): class BiGRU(fluid.dygraph.Layer):
def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None): def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
super(BiGRU, self).__init__() super(BiGRU, self).__init__()
self.gru = RNN(GRUEncoderCell(1, input_dim,
grnn_hidden_dim, 0.0, init_bound),
is_reverse=False,
time_major=False)
self.pre_gru = Linear( self.gru_r = RNN(GRUEncoderCell(1, input_dim,
input_dim=input_dim, grnn_hidden_dim, 0.0, init_bound),
output_dim=grnn_hidden_dim * 3, is_reverse=True,
param_attr=fluid.ParamAttr( time_major=False)
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.gru = DynamicGRU(
size=grnn_hidden_dim,
h_0=h_0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.pre_gru_r = Linear(
input_dim=input_dim,
output_dim=grnn_hidden_dim * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
self.gru_r = DynamicGRU(
size=grnn_hidden_dim,
is_reverse=True,
h_0=h_0,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=-init_bound, high=init_bound),
regularizer=fluid.regularizer.L2DecayRegularizer(
regularization_coeff=1e-4)))
def forward(self, input_feature): def forward(self, input_feature):
res_pre_gru = self.pre_gru(input_feature) pre_gru, pre_state = self.gru(input_feature)
res_gru = self.gru(res_pre_gru) gru_r, r_state = self.gru_r(input_feature)
res_pre_gru_r = self.pre_gru_r(input_feature) bi_merge = fluid.layers.concat(input=[pre_gru, gru_r], axis=-1)
res_gru_r = self.gru_r(res_pre_gru_r)
bi_merge = fluid.layers.concat(input=[res_gru, res_gru_r], axis=-1)
return bi_merge return bi_merge
...@@ -1610,7 +1163,7 @@ class Linear_chain_crf(fluid.dygraph.Layer): ...@@ -1610,7 +1163,7 @@ class Linear_chain_crf(fluid.dygraph.Layer):
"Transition": self._transition, "Transition": self._transition,
"Label": [label] "Label": [label]
} }
if length: if length is not None:
this_inputs['Length'] = [length] this_inputs['Length'] = [length]
self._helper.append_op( self._helper.append_op(
type='linear_chain_crf', type='linear_chain_crf',
...@@ -1655,7 +1208,7 @@ class Crf_decoding(fluid.dygraph.Layer): ...@@ -1655,7 +1208,7 @@ class Crf_decoding(fluid.dygraph.Layer):
"Transition": self._transition, "Transition": self._transition,
"Label": label "Label": label
} }
if length: if length is not None:
this_inputs['Length'] = [length] this_inputs['Length'] = [length]
self._helper.append_op( self._helper.append_op(
type='crf_decoding', type='crf_decoding',
...@@ -1767,7 +1320,7 @@ class SequenceTagging(fluid.dygraph.Layer): ...@@ -1767,7 +1320,7 @@ class SequenceTagging(fluid.dygraph.Layer):
emission = self.fc(bigru_output) emission = self.fc(bigru_output)
if target: if target is not None:
crf_cost = self.linear_chain_crf( crf_cost = self.linear_chain_crf(
input=emission, label=target, length=lengths) input=emission, label=target, length=lengths)
avg_cost = fluid.layers.mean(x=crf_cost) avg_cost = fluid.layers.mean(x=crf_cost)
...@@ -1775,5 +1328,6 @@ class SequenceTagging(fluid.dygraph.Layer): ...@@ -1775,5 +1328,6 @@ class SequenceTagging(fluid.dygraph.Layer):
crf_decode = self.crf_decoding(input=emission, length=lengths) crf_decode = self.crf_decoding(input=emission, length=lengths)
return crf_decode, avg_cost, lengths return crf_decode, avg_cost, lengths
else: else:
self.linear_chain_crf.weight = self.crf_decoding.weight
crf_decode = self.crf_decoding(input=emission, length=lengths) crf_decode = self.crf_decoding(input=emission, length=lengths)
return crf_decode, lengths return crf_decode, lengths
...@@ -6,7 +6,7 @@ Sequence Tagging,是一个序列标注模型,模型可用于实现,分词 ...@@ -6,7 +6,7 @@ Sequence Tagging,是一个序列标注模型,模型可用于实现,分词
|模型|Precision|Recall|F1-score| |模型|Precision|Recall|F1-score|
|:-:|:-:|:-:|:-:| |:-:|:-:|:-:|:-:|
|Lexical Analysis|89.2%|89.4%|89.3%| |Lexical Analysis|88.26%|89.20%|88.73%|
## 2. 快速开始 ## 2. 快速开始
...@@ -139,7 +139,7 @@ python predict.py \ ...@@ -139,7 +139,7 @@ python predict.py \
--init_from_checkpoint model_baseline/params \ --init_from_checkpoint model_baseline/params \
--output_file predict.result \ --output_file predict.result \
--mode predict \ --mode predict \
--device gpu \ --device cpu \
-d -d
# -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数 # -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数
...@@ -157,7 +157,7 @@ python eval.py \ ...@@ -157,7 +157,7 @@ python eval.py \
--label_dict_path ./conf/tag.dic \ --label_dict_path ./conf/tag.dic \
--word_rep_dict_path ./conf/q2b.dic \ --word_rep_dict_path ./conf/q2b.dic \
--init_from_checkpoint ./model_baseline/params \ --init_from_checkpoint ./model_baseline/params \
--device gpu \ --device cpu \
-d -d
# -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数 # -d: 是否使用动态图模式进行训练,如果使用静态图训练,命令行请删除-d参数
...@@ -189,7 +189,10 @@ python eval.py \ ...@@ -189,7 +189,10 @@ python eval.py \
### 模型原理介绍 ### 模型原理介绍
上面介绍的模型原理如下图所示:<br /> 上面介绍的模型原理如下图所示:<br />
![GRU-CRF-MODEL](./images/gru-crf-model.png) <p align="center">
<img src="./images/gru-crf-model.png" width = "340" height = "300" /> <br />
Overall Architecture of GRU-CRF-MODEL
</p>
### 数据格式 ### 数据格式
训练使用的数据可以由用户根据实际的应用场景,自己组织数据。除了第一行是 `text_a\tlabel` 固定的开头,后面的每行数据都是由两列组成,以制表符分隔,第一列是 utf-8 编码的中文文本,以 `\002` 分割,第二列是对应每个字的标注,以 `\002` 分隔。我们采用 IOB2 标注体系,即以 X-B 作为类型为 X 的词的开始,以 X-I 作为类型为 X 的词的持续,以 O 表示不关注的字(实际上,在词性、专名联合标注中,不存在 O )。示例如下: 训练使用的数据可以由用户根据实际的应用场景,自己组织数据。除了第一行是 `text_a\tlabel` 固定的开头,后面的每行数据都是由两列组成,以制表符分隔,第一列是 utf-8 编码的中文文本,以 `\002` 分割,第二列是对应每个字的标注,以 `\002` 分隔。我们采用 IOB2 标注体系,即以 X-B 作为类型为 X 的词的开始,以 X-I 作为类型为 X 的词的持续,以 O 表示不关注的字(实际上,在词性、专名联合标注中,不存在 O )。示例如下:
......
...@@ -25,8 +25,9 @@ import math ...@@ -25,8 +25,9 @@ import math
import argparse import argparse
import numpy as np import numpy as np
from train import SeqTagging, Chunk_eval from train import SeqTagging
from utils.check import check_gpu, check_version from utils.check import check_gpu, check_version
from utils.metrics import chunk_count
from reader import LacDataset, create_lexnet_data_generator, create_dataloader from reader import LacDataset, create_lexnet_data_generator, create_dataloader
work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) work_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
...@@ -42,14 +43,13 @@ def main(args): ...@@ -42,14 +43,13 @@ def main(args):
place = set_device(args.device) place = set_device(args.device)
fluid.enable_dygraph(place) if args.dynamic else None fluid.enable_dygraph(place) if args.dynamic else None
inputs = [Input([None, args.max_seq_len], 'int64', name='words'), inputs = [Input([None, None], 'int64', name='words'),
Input([None], 'int64', name='length')] Input([None], 'int64', name='length')]
feed_list = None if args.dynamic else [x.forward() for x in inputs] feed_list = None if args.dynamic else [x.forward() for x in inputs]
dataset = LacDataset(args) dataset = LacDataset(args)
eval_path = args.test_file eval_path = args.test_file
chunk_eval = Chunk_eval(int(math.ceil((dataset.num_labels - 1) / 2.0)), "IOB")
chunk_evaluator = fluid.metrics.ChunkEvaluator() chunk_evaluator = fluid.metrics.ChunkEvaluator()
chunk_evaluator.reset() chunk_evaluator.reset()
...@@ -69,25 +69,23 @@ def main(args): ...@@ -69,25 +69,23 @@ def main(args):
model.mode = "test" model.mode = "test"
model.prepare(inputs=inputs) model.prepare(inputs=inputs)
model.load(args.init_from_checkpoint, skip_mismatch=True)
model.load(args.init_from_checkpoint)
for data in eval_dataset():
f = open(args.output_file, "wb") if len(data) == 1:
for data in eval_dataset(): batch_data = data[0]
words, lens, targets, targets = data targets = np.array(batch_data[2])
crf_decode, length = model.test(inputs=flatten(data)) else:
crf_decode = fluid.dygraph.to_variable(crf_decode) batch_data = data
length = fluid.dygraph.to_variable(length) targets = batch_data[2].numpy()
(num_infer_chunks, num_label_chunks, num_correct_chunks) = chunk_eval( inputs_data = [batch_data[0], batch_data[1]]
input=crf_decode, crf_decode, length = model.test(inputs=inputs_data)
label=targets, num_infer_chunks, num_label_chunks, num_correct_chunks = chunk_count(crf_decode, targets, length, dataset.id2label_dict)
seq_length=length) chunk_evaluator.update(num_infer_chunks, num_label_chunks, num_correct_chunks)
print(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
chunk_evaluator.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy())
precision, recall, f1 = chunk_evaluator.eval() precision, recall, f1 = chunk_evaluator.eval()
print("[test] P: %.5f, R: %.5f, F1: %.5f" % (precision, recall, f1)) print("[test] P: %.5f, R: %.5f, F1: %.5f" % (precision, recall, f1))
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser("sequence tagging training") parser = argparse.ArgumentParser("sequence tagging training")
...@@ -176,7 +174,8 @@ if __name__ == '__main__': ...@@ -176,7 +174,8 @@ if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
print(args) print(args)
check_gpu(args.device) use_gpu = True if args.device == "gpu" else False
check_gpu(use_gpu)
check_version() check_version()
main(args) main(args)
...@@ -42,7 +42,7 @@ def main(args): ...@@ -42,7 +42,7 @@ def main(args):
place = set_device(args.device) place = set_device(args.device)
fluid.enable_dygraph(place) if args.dynamic else None fluid.enable_dygraph(place) if args.dynamic else None
inputs = [Input([None, args.max_seq_len], 'int64', name='words'), inputs = [Input([None, None], 'int64', name='words'),
Input([None], 'int64', name='length')] Input([None], 'int64', name='length')]
feed_list = None if args.dynamic else [x.forward() for x in inputs] feed_list = None if args.dynamic else [x.forward() for x in inputs]
...@@ -70,8 +70,11 @@ def main(args): ...@@ -70,8 +70,11 @@ def main(args):
f = open(args.output_file, "wb") f = open(args.output_file, "wb")
for data in predict_dataset(): for data in predict_dataset():
results, length = model.test(inputs=flatten(data)) if len(data) == 1:
#length_list = np.fromstring(length, dtype=str) input_data = data[0]
else:
input_data = data
results, length = model.test(inputs=flatten(input_data))
for i in range(len(results)): for i in range(len(results)):
word_len = length[i] word_len = length[i]
word_ids = results[i][: word_len] word_ids = results[i][: word_len]
...@@ -162,7 +165,8 @@ if __name__ == '__main__': ...@@ -162,7 +165,8 @@ if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
print(args) print(args)
check_gpu(args.device) use_gpu = True if args.device == "gpu" else False
check_gpu(use_gpu)
check_version() check_version()
main(args) main(args)
...@@ -21,7 +21,7 @@ from __future__ import print_function ...@@ -21,7 +21,7 @@ from __future__ import print_function
import io import io
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle
class LacDataset(object): class LacDataset(object):
...@@ -120,7 +120,7 @@ class LacDataset(object): ...@@ -120,7 +120,7 @@ class LacDataset(object):
def wrapper(): def wrapper():
fread = io.open(filename, "r", encoding="utf-8") fread = io.open(filename, "r", encoding="utf-8")
if mode == "train" or mode == "test": if mode == "train":
headline = next(fread) headline = next(fread)
headline = headline.strip().split('\t') headline = headline.strip().split('\t')
assert len(headline) == 2 and headline[0] == "text_a" and headline[ assert len(headline) == 2 and headline[0] == "text_a" and headline[
...@@ -133,6 +133,8 @@ class LacDataset(object): ...@@ -133,6 +133,8 @@ class LacDataset(object):
word_ids = self.word_to_ids(words.split("\002")) word_ids = self.word_to_ids(words.split("\002"))
label_ids = self.label_to_ids(labels.split("\002")) label_ids = self.label_to_ids(labels.split("\002"))
assert len(word_ids) == len(label_ids) assert len(word_ids) == len(label_ids)
words_len = np.int64(len(word_ids))
word_ids = word_ids[0:max_seq_len] word_ids = word_ids[0:max_seq_len]
words_len = np.int64(len(word_ids)) words_len = np.int64(len(word_ids))
word_ids += [0 for _ in range(max_seq_len - words_len)] word_ids += [0 for _ in range(max_seq_len - words_len)]
...@@ -140,6 +142,21 @@ class LacDataset(object): ...@@ -140,6 +142,21 @@ class LacDataset(object):
label_ids += [0 for _ in range(max_seq_len - words_len)] label_ids += [0 for _ in range(max_seq_len - words_len)]
assert len(word_ids) == len(label_ids) assert len(word_ids) == len(label_ids)
yield word_ids, label_ids, words_len yield word_ids, label_ids, words_len
elif mode == "test":
headline = next(fread)
headline = headline.strip().split('\t')
assert len(headline) == 2 and headline[0] == "text_a" and headline[
1] == "label"
buf = []
for line in fread:
words, labels = line.strip("\n").split("\t")
if len(words) < 1:
continue
word_ids = self.word_to_ids(words.split("\002"))
label_ids = self.label_to_ids(labels.split("\002"))
assert len(word_ids) == len(label_ids)
words_len = np.int64(len(word_ids))
yield word_ids, label_ids, words_len
else: else:
for line in fread: for line in fread:
words = line.strip("\n").split('\t')[0] words = line.strip("\n").split('\t')[0]
...@@ -157,9 +174,16 @@ class LacDataset(object): ...@@ -157,9 +174,16 @@ class LacDataset(object):
return wrapper return wrapper
def create_lexnet_data_generator(args, reader, file_name, place, mode="train"): def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
def padding_data(max_len, batch_data):
padding_batch_data = []
for data in batch_data:
data += [0 for _ in range(max_len - len(data))]
padding_batch_data.append(data)
return padding_batch_data
def wrapper(): def wrapper():
if mode == "train" or mode == "test": if mode == "train":
batch_words, batch_labels, seq_lens = [], [], [] batch_words, batch_labels, seq_lens = [], [], []
for epoch in xrange(args.epoch): for epoch in xrange(args.epoch):
for instance in reader.file_reader( for instance in reader.file_reader(
...@@ -169,12 +193,32 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"): ...@@ -169,12 +193,32 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
batch_words.append(words) batch_words.append(words)
batch_labels.append(labels) batch_labels.append(labels)
seq_lens.append(words_len) seq_lens.append(words_len)
if len(seq_lens) == args.batch_size: if len(seq_lens) == args.batch_size:
yield batch_words, seq_lens, batch_labels, batch_labels yield batch_words, seq_lens, batch_labels, batch_labels
batch_words, batch_labels, seq_lens = [], [], [] batch_words, batch_labels, seq_lens = [], [], []
if len(seq_lens) > 0: if len(seq_lens) > 0:
yield batch_words, seq_lens, batch_labels, batch_labels yield batch_words, seq_lens, batch_labels, batch_labels
elif mode == "test":
batch_words, batch_labels, seq_lens, max_len = [], [], [], 0
for instance in reader.file_reader(
file_name, mode, max_seq_len=args.max_seq_len)():
words, labels, words_len = instance
max_len = words_len if words_len > max_len else max_len
if len(seq_lens) < args.batch_size:
batch_words.append(words)
seq_lens.append(words_len)
batch_labels.append(labels)
if len(seq_lens) == args.batch_size:
padding_batch_words = padding_data(max_len, batch_words)
padding_batch_labels = padding_data(max_len, batch_labels)
yield padding_batch_words, seq_lens, padding_batch_labels, padding_batch_labels
batch_words, batch_labels, seq_lens, max_len = [], [], [], 0
if len(seq_lens) > 0:
padding_batch_words = padding_data(max_len, batch_words)
padding_batch_labels = padding_data(max_len, batch_labels)
yield padding_batch_words, seq_lens, padding_batch_labels, padding_batch_labels
else: else:
batch_words, seq_lens, max_len = [], [], 0 batch_words, seq_lens, max_len = [], [], 0
for instance in reader.file_reader( for instance in reader.file_reader(
...@@ -183,20 +227,13 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"): ...@@ -183,20 +227,13 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
if len(seq_lens) < args.batch_size: if len(seq_lens) < args.batch_size:
batch_words.append(words) batch_words.append(words)
seq_lens.append(words_len) seq_lens.append(words_len)
if words_len > max_len: max_len = words_len if words_len > max_len else max_len
max_len = words_len if len(seq_lens) == args.batch_size:
if len(seq_lens) == args.batch_size: padding_batch_words = padding_data(max_len, batch_words)
padding_batch_words = []
for words in batch_words:
words += [0 for _ in range(max_len - len(words))]
padding_batch_words.append(words)
yield padding_batch_words, seq_lens yield padding_batch_words, seq_lens
batch_words, seq_lens, max_len = [], [], 0 batch_words, seq_lens, max_len = [], [], 0
if len(seq_lens) > 0: if len(seq_lens) > 0:
padding_batch_words = [] padding_batch_words = padding_data(max_len, batch_words)
for words in batch_words:
words += [0 for _ in range(max_len - len(words))]
padding_batch_words.append(words)
yield padding_batch_words, seq_lens yield padding_batch_words, seq_lens
return wrapper return wrapper
...@@ -204,13 +241,13 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"): ...@@ -204,13 +241,13 @@ def create_lexnet_data_generator(args, reader, file_name, place, mode="train"):
def create_dataloader(generator, place, feed_list=None): def create_dataloader(generator, place, feed_list=None):
if not feed_list: if not feed_list:
data_loader = fluid.io.DataLoader.from_generator( data_loader = paddle.io.DataLoader.from_generator(
capacity=50, capacity=50,
use_double_buffer=True, use_double_buffer=True,
iterable=True, iterable=True,
return_list=True) return_list=True)
else: else:
data_loader = fluid.io.DataLoader.from_generator( data_loader = paddle.io.DataLoader.from_generator(
feed_list=feed_list, feed_list=feed_list,
capacity=50, capacity=50,
use_double_buffer=True, use_double_buffer=True,
......
...@@ -154,9 +154,10 @@ class ChunkEval(Metric): ...@@ -154,9 +154,10 @@ class ChunkEval(Metric):
int(math.ceil((num_labels - 1) / 2.0)), "IOB") int(math.ceil((num_labels - 1) / 2.0)), "IOB")
self.reset() self.reset()
def add_metric_op(self, pred, label, *args, **kwargs): def add_metric_op(self, *args):
crf_decode = pred[0] crf_decode = args[0]
lengths = pred[2] lengths = args[2]
label = args[3]
(num_infer_chunks, num_label_chunks, (num_infer_chunks, num_label_chunks,
num_correct_chunks) = self.chunk_eval( num_correct_chunks) = self.chunk_eval(
input=crf_decode, label=label, seq_length=lengths) input=crf_decode, label=label, seq_length=lengths)
...@@ -204,11 +205,11 @@ def main(args): ...@@ -204,11 +205,11 @@ def main(args):
place = set_device(args.device) place = set_device(args.device)
fluid.enable_dygraph(place) if args.dynamic else None fluid.enable_dygraph(place) if args.dynamic else None
inputs = [Input([None, args.max_seq_len], 'int64', name='words'), inputs = [Input([None, None], 'int64', name='words'),
Input([None], 'int64', name='length'), Input([None], 'int64', name='length'),
Input([None, args.max_seq_len], 'int64', name='target')] Input([None, None], 'int64', name='target')]
labels = [Input([None, args.max_seq_len], 'int64', name='labels')] labels = [Input([None, None], 'int64', name='labels')]
feed_list = None if args.dynamic else [x.forward() for x in inputs + labels] feed_list = None if args.dynamic else [x.forward() for x in inputs + labels]
dataset = LacDataset(args) dataset = LacDataset(args)
...@@ -343,7 +344,8 @@ if __name__ == '__main__': ...@@ -343,7 +344,8 @@ if __name__ == '__main__':
args = parser.parse_args() args = parser.parse_args()
print(args) print(args)
check_gpu(args.device) use_gpu = True if args.device == "gpu" else False
check_gpu(use_gpu)
check_version() check_version()
main(args) main(args)
# Multi-GPU training launcher for the sequence tagging (BiGRU-CRF) model.
# Runs train.py on 4 GPUs through paddle.distributed.launch; the trailing
# -d flag switches training to dynamic-graph (dygraph) mode.
export CUDA_VISIBLE_DEVICES=0,1,2,3
# NOTE(review): hyper-parameter values below (hidden dims, lr, batch size,
# epochs) presumably reproduce the released baseline — confirm before changing.
python -m paddle.distributed.launch --selected_gpus=0,1,2,3 train.py \
--train_file ./data/train.tsv \
--test_file ./data/test.tsv \
--word_dict_path ./data/word.dic \
--label_dict_path ./data/tag.dic \
--word_rep_dict_path ./data/q2b.dic \
--device gpu \
--grnn_hidden_dim 128 \
--word_emb_dim 128 \
--bigru_num 2 \
--base_learning_rate 1e-3 \
--batch_size 300 \
--epoch 10 \
--save_dir ./model \
-d
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import paddle.fluid as fluid
__all__ = ['chunk_count', "build_chunk"]
def build_chunk(data_list, id2label_dict):
    """Group a sequence of label ids into chunks (entities).

    Args:
        data_list: iterable of integer label ids for one sequence.
        id2label_dict: dict mapping str(id) -> IOB2 tag string such as
            "PER-B", "PER-I", or "O".

    Returns:
        dict mapping "start_end" (inclusive token indices) to the chunk
        type (the tag prefix before "-"). "O" runs are recorded as type
        "O"; since labels and predictions are chunked the same way, they
        cancel out in the precision/recall comparison.
    """
    tag_list = [id2label_dict.get(str(id)) for id in data_list]
    # NOTE(review): an id missing from id2label_dict yields None here and
    # would crash on .endswith below — assumes the dict covers every id.
    ner_dict = {}
    ner_str = ""
    ner_start = 0
    for i in range(len(tag_list)):
        tag = tag_list[i]
        if tag == u"O":
            # Close the chunk that ended at i - 1, then open an "O" run.
            if i != 0:
                key = "%d_%d" % (ner_start, i - 1)
                ner_dict[key] = ner_str
            ner_start = i
            ner_str = tag
        elif tag.endswith(u"B"):
            # X-B always starts a new chunk of type X.
            if i != 0:
                key = "%d_%d" % (ner_start, i - 1)
                ner_dict[key] = ner_str
            ner_start = i
            ner_str = tag.split('-')[0]
        elif tag.endswith(u"I"):
            # X-I continues the current chunk only when the type matches;
            # a mismatched X-I starts a new chunk (IOB2 repair).
            if tag.split('-')[0] != ner_str:
                if i != 0:
                    key = "%d_%d" % (ner_start, i - 1)
                    ner_dict[key] = ner_str
                ner_start = i
                ner_str = tag.split('-')[0]
    # Bug fix: flush the trailing chunk. The original only emitted a chunk
    # when a *later* tag opened a new one, so the last chunk of every
    # sequence was silently dropped from the metric.
    if tag_list:
        key = "%d_%d" % (ner_start, len(tag_list) - 1)
        ner_dict[key] = ner_str
    return ner_dict
def chunk_count(infer_numpy, label_numpy, seq_len, id2label_dict):
    """Accumulate chunk statistics for precision/recall/F1.

    Args:
        infer_numpy: [batch, max_len] array of predicted label ids.
        label_numpy: [batch, max_len] array of gold label ids.
        seq_len: per-row true sequence lengths (padding is sliced away).
        id2label_dict: dict mapping str(id) -> IOB2 tag string.

    Returns:
        (num_infer_chunks, num_label_chunks, num_correct_chunks) — a chunk
        is correct only when both its span and its type match.
    """
    assert infer_numpy.shape[0] == label_numpy.shape[0]
    infer_total = 0
    label_total = 0
    correct_total = 0
    for row, (infer_row, label_row) in enumerate(zip(infer_numpy, label_numpy)):
        length = seq_len[row]
        infer_chunks = build_chunk(infer_row[:length], id2label_dict)
        label_chunks = build_chunk(label_row[:length], id2label_dict)
        infer_total += len(infer_chunks)
        label_total += len(label_chunks)
        # A span string ("start_end") must exist in both with the same type.
        correct_total += sum(
            1 for span, kind in infer_chunks.items()
            if label_chunks.get(span) == kind)
    return infer_total, label_total, correct_total
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册