未验证 提交 851c8c52 编写于 作者: 王肖 提交者: GitHub

fix bugs of PaddleNLP/similarity_net & dygraph/similarity_net (#4362)

* Update README.md (#4267)

* test=develop (#4269)

* 3d use new api (#4275)

* PointNet++ and PointRCNN use new API

* Update Readme of Dygraph BERT (#4277)

Fix some typos.

* Update run_classifier_multi_gpu.sh (#4279)

remove the CUDA_VISIBLE_DEVICES

* Update README.md (#4280)

* add similarity_net dygraph

* fix similarity_net dygraph

* fix bugs of dygraph/similarity_net

* Fix some bugs running on the GPU of dygraph/similarity_net

* fix a bug in pointwise mode of dygraph/similarity_net

* fix a bug of paddleNLP/similarity_net

* fix a bug and remove unused files of dygraph/similarity_net
Co-authored-by: pkpk <xiyzhouang@gmail.com>
Co-authored-by: Kaipeng Deng <dengkaipeng@baidu.com>
上级 53ea4b36
...@@ -53,12 +53,12 @@ def create_model(args, pyreader_name, is_inference=False, is_pointwise=False): ...@@ -53,12 +53,12 @@ def create_model(args, pyreader_name, is_inference=False, is_pointwise=False):
""" """
if is_inference: if is_inference:
inf_pyreader = fluid.layers.py_reader( inf_pyreader = fluid.layers.py_reader(
capacity=16, capacity=16,
shapes=([-1, 1], [-1, 1]), shapes=([-1], [-1]),
dtypes=('int64', 'int64'), dtypes=('int64', 'int64'),
lod_levels=(1, 1), lod_levels=(1, 1),
name=pyreader_name, name=pyreader_name,
use_double_buffer=False) use_double_buffer=False)
left, pos_right = fluid.layers.read_file(inf_pyreader) left, pos_right = fluid.layers.read_file(inf_pyreader)
return inf_pyreader, left, pos_right return inf_pyreader, left, pos_right
...@@ -66,27 +66,26 @@ def create_model(args, pyreader_name, is_inference=False, is_pointwise=False): ...@@ -66,27 +66,26 @@ def create_model(args, pyreader_name, is_inference=False, is_pointwise=False):
else: else:
if is_pointwise: if is_pointwise:
pointwise_pyreader = fluid.layers.py_reader( pointwise_pyreader = fluid.layers.py_reader(
capacity=16, capacity=16,
shapes=([-1, 1], [-1, 1], [-1, 1]), shapes=([-1], [-1], [-1]),
dtypes=('int64', 'int64', 'int64'), dtypes=('int64', 'int64', 'int64'),
lod_levels=(1, 1, 0), lod_levels=(1, 1, 0),
name=pyreader_name, name=pyreader_name,
use_double_buffer=False) use_double_buffer=False)
left, right, label = fluid.layers.read_file(pointwise_pyreader) left, right, label = fluid.layers.read_file(pointwise_pyreader)
return pointwise_pyreader, left, right, label return pointwise_pyreader, left, right, label
else: else:
pairwise_pyreader = fluid.layers.py_reader( pairwise_pyreader = fluid.layers.py_reader(
capacity=16, capacity=16,
shapes=([-1, 1], [-1, 1], [-1, 1]), shapes=([-1], [-1], [-1]),
dtypes=('int64', 'int64', 'int64'), dtypes=('int64', 'int64', 'int64'),
lod_levels=(1, 1, 1), lod_levels=(1, 1, 1),
name=pyreader_name, name=pyreader_name,
use_double_buffer=False) use_double_buffer=False)
left, pos_right, neg_right = fluid.layers.read_file( left, pos_right, neg_right = fluid.layers.read_file(pairwise_pyreader)
pairwise_pyreader)
return pairwise_pyreader, left, pos_right, neg_right return pairwise_pyreader, left, pos_right, neg_right
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
base layers
"""
from paddle.fluid import layers
from paddle.fluid.dygraph import Layer
from paddle.fluid.dygraph import GRUUnit
from paddle.fluid.dygraph.base import to_variable
# import numpy as np
# import logging
class DynamicGRU(Layer):
    """Run a single ``GRUUnit`` step by step along axis 1 of the input.

    Args:
        size: Hidden size. The wrapped ``GRUUnit`` is constructed with
            ``size * 3``.
        param_attr: Forwarded to ``GRUUnit`` as ``param_attr``.
        bias_attr: Forwarded to ``GRUUnit`` as ``bias_attr``.
        is_reverse: When true, the steps are consumed from the last index
            to the first, and the collected outputs are flipped back so
            that they line up with the input order.
        gate_activation: Forwarded to ``GRUUnit`` as ``gate_activation``.
        candidate_activation: Forwarded to ``GRUUnit`` as ``activation``.
        h_0: Hidden state handed to the unit on the first step.
        origin_mode: Forwarded to ``GRUUnit`` as ``origin_mode``.
        init_size: Not used by this class; kept so existing callers that
            pass it keep working.
    """

    def __init__(self,
                 size,
                 param_attr=None,
                 bias_attr=None,
                 is_reverse=False,
                 gate_activation='sigmoid',
                 candidate_activation='tanh',
                 h_0=None,
                 origin_mode=False,
                 init_size=None):
        super(DynamicGRU, self).__init__()
        self.gru_unit = GRUUnit(
            size * 3,
            param_attr=param_attr,
            bias_attr=bias_attr,
            activation=candidate_activation,
            gate_activation=gate_activation,
            origin_mode=origin_mode)
        self.size = size
        self.h_0 = h_0
        self.is_reverse = is_reverse

    def forward(self, inputs):
        """Return the hidden states of every step, concatenated on axis 1.

        ``inputs`` is indexed as a rank-3 tensor whose axis 1 is iterated
        step by step (presumably [batch, seq_len, feature] -- confirm
        against the caller).
        """
        num_steps = inputs.shape[1]
        hidden = self.h_0
        outputs = []
        for step in range(num_steps):
            # In reverse mode walk the sequence from its last step.
            idx = num_steps - 1 - step if self.is_reverse else step
            step_input = inputs[:, idx:idx + 1, :]
            # Only ``layers`` is imported in this module, so the ops are
            # reached through it rather than through ``fluid.layers``.
            step_input = layers.reshape(
                step_input, [-1, step_input.shape[2]], inplace=False)
            hidden, _reset, _gate = self.gru_unit(step_input, hidden)
            outputs.append(
                layers.reshape(
                    hidden, [-1, 1, hidden.shape[1]], inplace=False))
        if self.is_reverse:
            # Put the outputs back into the original step order.
            outputs = outputs[::-1]
        return layers.concat(outputs, axis=1)
\ No newline at end of file
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid import layers, unique_name
from paddle.fluid.dygraph import Layer
from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper
from paddle.fluid.layers.control_flow import StaticRNN
# Names exported by a star-import of this module.
__all__ = ['BasicGRUUnit', 'basic_gru', 'BasicLSTMUnit', 'basic_lstm']
class BasicGRUUnit(Layer):
    """
    ****
    A single GRU step assembled from basic operators.

    The step follows the equations below.

    .. math::
        u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u)

        r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r)

        m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m)

        h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)

    Args:
        name_scope(string): Name scope used to identify parameters and biases.
        hidden_size(integer): Hidden size of the unit.
        param_attr(ParamAttr|None): Attribute passed to ``create_parameter``
            for both weight matrices. Default: None.
        bias_attr(ParamAttr|None): Attribute passed to ``create_parameter``
            for both bias vectors. Default: None.
        gate_activation(function|None): Activation applied to the gates
            (actGate). ``layers.sigmoid`` is used when None.
        activation(function|None): Activation applied to the candidate
            (actNode). ``layers.tanh`` is used when None.
        dtype(string): Data type of the parameters.

    Examples:

        .. code-block:: python

            import paddle.fluid.layers as layers
            from paddle.fluid.contrib.layers import BasicGRUUnit

            input_size = 128
            hidden_size = 256
            input = layers.data( name = "input", shape = [-1, input_size], dtype='float32')
            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')

            gru_unit = BasicGRUUnit( "gru_unit", hidden_size )

            new_hidden = gru_unit( input, pre_hidden )
    """

    def __init__(self,
                 name_scope,
                 hidden_size,
                 param_attr=None,
                 bias_attr=None,
                 gate_activation=None,
                 activation=None,
                 dtype='float32'):
        super(BasicGRUUnit, self).__init__(name_scope, dtype)
        # The legacy _full_name / _helper pair is kept so that static graph
        # save and load keep working.
        scoped_name = name_scope + "/" + type(self).__name__
        self._full_name = unique_name.generate(scoped_name)
        self._helper = LayerObjectHelper(self._full_name)

        self._name = name_scope
        self._hiden_size = hidden_size
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._gate_activation = gate_activation or layers.sigmoid
        self._activation = activation or layers.tanh
        self._dtype = dtype

    def _build_once(self, input, pre_hidden):
        """Create the weights; the input width is read from ``input``."""
        self._input_size = input.shape[-1]
        assert (self._input_size > 0)

        hidden = self._hiden_size
        fan_in = self._input_size + hidden

        # Reset and update gates share one weight matrix and one bias.
        self._gate_weight = self.create_parameter(
            attr=self._param_attr,
            shape=[fan_in, 2 * hidden],
            dtype=self._dtype)
        self._candidate_weight = self.create_parameter(
            attr=self._param_attr,
            shape=[fan_in, hidden],
            dtype=self._dtype)
        self._gate_bias = self.create_parameter(
            attr=self._bias_attr,
            shape=[2 * hidden],
            dtype=self._dtype,
            is_bias=True)
        self._candidate_bias = self.create_parameter(
            attr=self._bias_attr,
            shape=[hidden],
            dtype=self._dtype,
            is_bias=True)

    def forward(self, input, pre_hidden):
        """Compute the next hidden state from ``input`` and ``pre_hidden``."""
        joined = layers.concat([input, pre_hidden], 1)
        gates = layers.matmul(x=joined, y=self._gate_weight)
        gates = layers.elementwise_add(gates, self._gate_bias)
        gates = self._gate_activation(gates)
        reset_gate, update_gate = layers.split(
            gates, num_or_sections=2, dim=1)

        # The candidate sees the previous state scaled by the reset gate.
        gated_hidden = reset_gate * pre_hidden
        candidate = layers.matmul(
            layers.concat([input, gated_hidden], 1), self._candidate_weight)
        candidate = layers.elementwise_add(candidate, self._candidate_bias)
        candidate = self._activation(candidate)

        return update_gate * pre_hidden + (1 - update_gate) * candidate
def basic_gru(input,
              init_hidden,
              hidden_size,
              num_layers=1,
              sequence_length=None,
              dropout_prob=0.0,
              bidirectional=False,
              batch_first=True,
              param_attr=None,
              bias_attr=None,
              gate_activation=None,
              activation=None,
              dtype='float32',
              name='basic_gru'):
    """
    GRU built from basic operators; supports multiple layers and a
    bidirectional mode.

    .. math::
        u_t & = actGate(W_ux xu_{t} + W_uh h_{t-1} + b_u)

        r_t & = actGate(W_rx xr_{t} + W_rh h_{t-1} + b_r)

        m_t & = actNode(W_cx xm_t + W_ch dot(r_t, h_{t-1}) + b_m)

        h_t & = dot(u_t, h_{t-1}) + dot((1-u_t), m_t)

    Args:
        input (Variable): GRU input tensor.
            If batch_first = False, shape should be ( seq_len x batch_size x input_size ).
            If batch_first = True, shape should be ( batch_size x seq_len x input_size ).
        init_hidden (Variable|None): Initial hidden state of the GRU, with shape
            ( num_layers x batch_size x hidden_size ), or
            ( num_layers*2 x batch_size x hidden_size ) when bidirectional. It is
            reshaped to ( num_layers x direc_num x batch_size x hidden_size ) before use.
            If it is None, the memories are created from the input batch instead.
        hidden_size (int): Hidden size of the GRU.
        num_layers (int): Total number of layers of the GRU.
        sequence_length (Variable|None): A tensor (shape [batch_size]) holding the
            real length of each instance. It is converted to a mask so that padded
            steps keep the previous hidden state. None means there is no padding.
        dropout_prob (float|0.0): Dropout probability. Dropout is applied to the
            output of each layer, NOT between time steps.
        bidirectional (bool|False): Whether the GRU is bidirectional.
        batch_first (bool|True): Layout of the input and output tensors. If True
            the layout is [batch_size, seq_len, feature]; if False it is
            [seq_len, batch_size, feature]. The batch-major form costs extra
            transposes.
        param_attr (ParamAttr|None): Parameter attribute for the learnable weight
            matrices; forwarded to every BasicGRUUnit. Default: None.
        bias_attr (ParamAttr|None): Parameter attribute for the biases; forwarded
            to every BasicGRUUnit. Default: None.
        gate_activation (function|None): Activation function for the gates
            (actGate). Default: 'fluid.layers.sigmoid'
        activation (function|None): Activation function for the cell (actNode).
            Default: 'fluid.layers.tanh'
        dtype (string): Data type used in the units.
        name (string): Name used to identify parameters and biases.

    Returns:
        rnn_out(Tensor), last_hidden(Tensor)
            - rnn_out is the GRU hidden output with shape
              ( seq_len x batch_size x hidden_size ); the last dimension is
              hidden_size*2 when bidirectional. With batch_first = True the
              first two axes are swapped.
            - last_hidden is the hidden state of the last step with shape
              ( num_layers x batch_size x hidden_size ), or
              ( num_layers*2 x batch_size x hidden_size ) when bidirectional.

    Examples:
        .. code-block:: python

            import paddle.fluid.layers as layers
            from paddle.fluid.contrib.layers import basic_gru

            batch_size = 20
            input_size = 128
            hidden_size = 256
            num_layers = 2
            dropout = 0.5
            bidirectional = True
            batch_first = False

            input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32')
            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')
            sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32')

            rnn_out, last_hidden = basic_gru(
                input, pre_hidden, hidden_size, num_layers = num_layers,
                sequence_length = sequence_length, dropout_prob=dropout,
                bidirectional = bidirectional, batch_first = batch_first)
    """
    fw_unit_list = []
    for i in range(num_layers):
        new_name = name + "_layers_" + str(i)
        fw_unit_list.append(
            BasicGRUUnit(new_name, hidden_size, param_attr, bias_attr,
                         gate_activation, activation, dtype))
    if bidirectional:
        bw_unit_list = []
        for i in range(num_layers):
            new_name = name + "_reverse_layers_" + str(i)
            bw_unit_list.append(
                BasicGRUUnit(new_name, hidden_size, param_attr, bias_attr,
                             gate_activation, activation, dtype))

    # The recurrence below works on time-major data.
    if batch_first:
        input = layers.transpose(input, [1, 0, 2])

    # Optional tensors are compared against None explicitly: the truth value
    # of a tensor object is not a reliable "was it supplied" test.
    mask = None
    if sequence_length is not None:
        max_seq_len = layers.shape(input)[0]
        mask = layers.sequence_mask(
            sequence_length, maxlen=max_seq_len, dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    direc_num = 2 if bidirectional else 1
    if init_hidden is not None:
        init_hidden = layers.reshape(
            init_hidden, shape=[num_layers, direc_num, -1, hidden_size])

    use_dropout = dropout_prob is not None and dropout_prob > 0.0

    def get_single_direction_output(rnn_input,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
        """Run all layers over ``rnn_input`` in one direction."""
        rnn = StaticRNN()
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask is not None:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden is not None:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                else:
                    pre_hidden = rnn.memory(
                        batch_ref=rnn_input,
                        shape=[-1, hidden_size],
                        ref_batch_dim_idx=1)

                new_hidden = unit_list[i](step_input, pre_hidden)

                if mask is not None:
                    # mask * new + (1 - mask) * previous: padded steps carry
                    # the previous hidden state forward unchanged.
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                            pre_hidden, (step_mask - 1), axis=0)
                rnn.update_memory(pre_hidden, new_hidden)

                rnn.step_output(new_hidden)

                step_input = new_hidden
                if use_dropout:
                    step_input = layers.dropout(
                        step_input,
                        dropout_prob=dropout_prob, )

            # One extra output: the (possibly dropped-out) top layer result.
            rnn.step_output(step_input)

        rnn_out = rnn()

        # rnn_out[i] is the hidden sequence of layer i; the trailing entry
        # is the sequence output registered after the layer loop.
        rnn_output = rnn_out[-1]
        last_hidden_array = []
        for i in range(num_layers):
            last_hidden = rnn_out[i]
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)

        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(
            last_hidden_output, shape=[num_layers, -1, hidden_size])

        return rnn_output, last_hidden_output

    # seq_len, batch_size, hidden_size
    fw_rnn_out, fw_last_hidden = get_single_direction_output(
        input, fw_unit_list, mask, direc_index=0)

    if bidirectional:
        bw_input = layers.reverse(input, axis=[0])
        bw_mask = None
        if mask is not None:
            bw_mask = layers.reverse(mask, axis=[0])
        bw_rnn_out, bw_last_hidden = get_single_direction_output(
            bw_input, bw_unit_list, bw_mask, direc_index=1)

        # Flip the backward outputs so both directions share one time order.
        bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0])

        rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2)
        last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1)
        last_hidden = layers.reshape(
            last_hidden, shape=[num_layers * direc_num, -1, hidden_size])
    else:
        rnn_out = fw_rnn_out
        last_hidden = fw_last_hidden

    if batch_first:
        rnn_out = layers.transpose(rnn_out, [1, 0, 2])
    return rnn_out, last_hidden
def basic_lstm(input,
               init_hidden,
               init_cell,
               hidden_size,
               num_layers=1,
               sequence_length=None,
               dropout_prob=0.0,
               bidirectional=False,
               batch_first=True,
               param_attr=None,
               bias_attr=None,
               gate_activation=None,
               activation=None,
               forget_bias=1.0,
               dtype='float32',
               name='basic_lstm'):
    r"""
    LSTM built from basic operators; supports multiple layers and a
    bidirectional mode.

    .. math::
           i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)

           f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )

           o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)

           \tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)

           c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c_t}

           h_t &= o_t \odot tanh(c_t)

    Args:
        input (Variable): LSTM input tensor.
            If batch_first = False, shape should be ( seq_len x batch_size x input_size ).
            If batch_first = True, shape should be ( batch_size x seq_len x input_size ).
        init_hidden (Variable|None): Initial hidden state of the LSTM, with shape
            ( num_layers x batch_size x hidden_size ), or
            ( num_layers*2 x batch_size x hidden_size ) when bidirectional. It is
            reshaped to ( num_layers x direc_num x batch_size x hidden_size ) before use.
            If it is None, the memories are created from the input batch instead.
        init_cell (Variable|None): Initial cell state of the LSTM, same layout as
            init_hidden. It is only used when init_hidden is given.
        hidden_size (int): Hidden size of the LSTM.
        num_layers (int): Total number of layers of the LSTM.
        sequence_length (Variable|None): A tensor (shape [batch_size]) holding the
            real length of each instance. It is converted to a mask so that padded
            steps keep the previous hidden and cell state. None means there is no
            padding.
        dropout_prob (float|0.0): Dropout probability. Dropout is applied to the
            output of each layer, NOT between time steps.
        bidirectional (bool|False): Whether the LSTM is bidirectional.
        batch_first (bool|True): Layout of the input and output tensors. If True
            the layout is [batch_size, seq_len, feature]; if False it is
            [seq_len, batch_size, feature]. The batch-major form costs extra
            transposes.
        param_attr (ParamAttr|None): Parameter attribute for the learnable weight
            matrices; forwarded to every BasicLSTMUnit. Default: None.
        bias_attr (ParamAttr|None): Parameter attribute for the biases; forwarded
            to every BasicLSTMUnit. Default: None.
        gate_activation (function|None): Activation function for the gates
            (actGate). Default: 'fluid.layers.sigmoid'
        activation (function|None): Activation function for the cell (actNode).
            Default: 'fluid.layers.tanh'
        forget_bias (float|1.0): Forget bias used to compute the forget gate.
        dtype (string): Data type used in the units.
        name (string): Name used to identify parameters and biases.

    Returns:
        rnn_out(Tensor), last_hidden(Tensor), last_cell(Tensor)
            - rnn_out is the LSTM hidden output with shape
              ( seq_len x batch_size x hidden_size ); the last dimension is
              hidden_size*2 when bidirectional. With batch_first = True the
              first two axes are swapped.
            - last_hidden is the hidden state of the last step with shape
              ( num_layers x batch_size x hidden_size ), or
              ( num_layers*2 x batch_size x hidden_size ) when bidirectional.
            - last_cell is the cell state of the last step, with the same
              layout as last_hidden.

    Examples:
        .. code-block:: python

            import paddle.fluid.layers as layers
            from paddle.fluid.contrib.layers import basic_lstm

            batch_size = 20
            input_size = 128
            hidden_size = 256
            num_layers = 2
            dropout = 0.5
            bidirectional = True
            batch_first = False

            input = layers.data( name = "input", shape = [-1, batch_size, input_size], dtype='float32')
            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')
            pre_cell = layers.data( name = "pre_cell", shape=[-1, hidden_size], dtype='float32')
            sequence_length = layers.data( name="sequence_length", shape=[-1], dtype='int32')

            rnn_out, last_hidden, last_cell = basic_lstm(
                input, pre_hidden, pre_cell,
                hidden_size, num_layers = num_layers,
                sequence_length = sequence_length, dropout_prob=dropout,
                bidirectional = bidirectional, batch_first = batch_first)
    """
    fw_unit_list = []
    for i in range(num_layers):
        new_name = name + "_layers_" + str(i)
        fw_unit_list.append(
            BasicLSTMUnit(
                new_name,
                hidden_size,
                param_attr=param_attr,
                bias_attr=bias_attr,
                gate_activation=gate_activation,
                activation=activation,
                forget_bias=forget_bias,
                dtype=dtype))
    if bidirectional:
        bw_unit_list = []
        for i in range(num_layers):
            new_name = name + "_reverse_layers_" + str(i)
            bw_unit_list.append(
                BasicLSTMUnit(
                    new_name,
                    hidden_size,
                    param_attr=param_attr,
                    bias_attr=bias_attr,
                    gate_activation=gate_activation,
                    activation=activation,
                    forget_bias=forget_bias,
                    dtype=dtype))

    # The recurrence below works on time-major data.
    if batch_first:
        input = layers.transpose(input, [1, 0, 2])

    # Optional tensors are compared against None explicitly: the truth value
    # of a tensor object is not a reliable "was it supplied" test.
    mask = None
    if sequence_length is not None:
        max_seq_len = layers.shape(input)[0]
        mask = layers.sequence_mask(
            sequence_length, maxlen=max_seq_len, dtype='float32')
        mask = layers.transpose(mask, [1, 0])

    direc_num = 2 if bidirectional else 1

    # convert to [num_layers, direc_num, batch_size, hidden_size]
    if init_hidden is not None:
        init_hidden = layers.reshape(
            init_hidden, shape=[num_layers, direc_num, -1, hidden_size])
        init_cell = layers.reshape(
            init_cell, shape=[num_layers, direc_num, -1, hidden_size])

    use_dropout = dropout_prob is not None and dropout_prob > 0.0

    def get_single_direction_output(rnn_input,
                                    unit_list,
                                    mask=None,
                                    direc_index=0):
        """Run all layers over ``rnn_input`` in one direction."""
        rnn = StaticRNN()
        with rnn.step():
            step_input = rnn.step_input(rnn_input)

            if mask is not None:
                step_mask = rnn.step_input(mask)

            for i in range(num_layers):
                if init_hidden is not None:
                    pre_hidden = rnn.memory(init=init_hidden[i, direc_index])
                    pre_cell = rnn.memory(init=init_cell[i, direc_index])
                else:
                    pre_hidden = rnn.memory(
                        batch_ref=rnn_input, shape=[-1, hidden_size])
                    pre_cell = rnn.memory(
                        batch_ref=rnn_input, shape=[-1, hidden_size])

                new_hidden, new_cell = unit_list[i](step_input, pre_hidden,
                                                    pre_cell)

                if mask is not None:
                    # mask * new + (1 - mask) * previous: padded steps carry
                    # the previous hidden and cell state forward unchanged.
                    new_hidden = layers.elementwise_mul(
                        new_hidden, step_mask, axis=0) - layers.elementwise_mul(
                            pre_hidden, (step_mask - 1), axis=0)
                    new_cell = layers.elementwise_mul(
                        new_cell, step_mask, axis=0) - layers.elementwise_mul(
                            pre_cell, (step_mask - 1), axis=0)

                rnn.update_memory(pre_hidden, new_hidden)
                rnn.update_memory(pre_cell, new_cell)

                rnn.step_output(new_hidden)
                rnn.step_output(new_cell)

                step_input = new_hidden
                if use_dropout:
                    step_input = layers.dropout(
                        step_input,
                        dropout_prob=dropout_prob,
                        dropout_implementation='upscale_in_train')

            # One extra output: the (possibly dropped-out) top layer result.
            rnn.step_output(step_input)

        rnn_out = rnn()

        # Each layer registered two outputs (hidden, then cell), so layer i
        # lives at indices 2*i and 2*i + 1; the trailing entry is the
        # sequence output registered after the layer loop.
        rnn_output = rnn_out[-1]
        last_hidden_array = []
        last_cell_array = []
        for i in range(num_layers):
            last_hidden = rnn_out[i * 2]
            last_hidden = last_hidden[-1]
            last_hidden_array.append(last_hidden)
            last_cell = rnn_out[i * 2 + 1]
            last_cell = last_cell[-1]
            last_cell_array.append(last_cell)

        last_hidden_output = layers.concat(last_hidden_array, axis=0)
        last_hidden_output = layers.reshape(
            last_hidden_output, shape=[num_layers, -1, hidden_size])
        last_cell_output = layers.concat(last_cell_array, axis=0)
        last_cell_output = layers.reshape(
            last_cell_output, shape=[num_layers, -1, hidden_size])

        return rnn_output, last_hidden_output, last_cell_output

    # seq_len, batch_size, hidden_size
    fw_rnn_out, fw_last_hidden, fw_last_cell = get_single_direction_output(
        input, fw_unit_list, mask, direc_index=0)

    if bidirectional:
        bw_input = layers.reverse(input, axis=[0])
        bw_mask = None
        if mask is not None:
            bw_mask = layers.reverse(mask, axis=[0])
        bw_rnn_out, bw_last_hidden, bw_last_cell = get_single_direction_output(
            bw_input, bw_unit_list, bw_mask, direc_index=1)

        # Flip the backward outputs so both directions share one time order.
        bw_rnn_out = layers.reverse(bw_rnn_out, axis=[0])

        rnn_out = layers.concat([fw_rnn_out, bw_rnn_out], axis=2)
        last_hidden = layers.concat([fw_last_hidden, bw_last_hidden], axis=1)
        last_hidden = layers.reshape(
            last_hidden, shape=[num_layers * direc_num, -1, hidden_size])

        last_cell = layers.concat([fw_last_cell, bw_last_cell], axis=1)
        last_cell = layers.reshape(
            last_cell, shape=[num_layers * direc_num, -1, hidden_size])
    else:
        rnn_out = fw_rnn_out
        last_hidden = fw_last_hidden
        last_cell = fw_last_cell

    if batch_first:
        rnn_out = layers.transpose(rnn_out, [1, 0, 2])
    return rnn_out, last_hidden, last_cell
class BasicLSTMUnit(Layer):
    r"""
    ****
    A single LSTM step assembled from basic operators.

    The step follows the equations below.

    .. math::

           i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + b_i)

           f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + b_f + forget_bias )

           o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + b_o)

           \tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + b_c)

           c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c_t}

           h_t &= o_t \odot tanh(c_t)

    - The $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
      of weights from the input to the input gate).
    - The b terms denote bias vectors (e.g. $b_i$ is the input gate bias).
    - :math:`\sigma` is the gate activation (logistic sigmoid by default).
    - $i, f, o$ and $c$ are the input gate, forget gate, output gate,
      and cell activation vectors, respectively, all of which have the same
      size as the cell output activation vector $h$.
    - :math:`\odot` is the element-wise product of the vectors.
    - :math:`tanh` is the cell activation (tanh by default).
    - :math:`\tilde{c_t}` is also called the candidate hidden state,
      which is computed from the current input and the previous hidden state.

    Args:
        name_scope(string): Name scope used to identify parameter and bias names.
        hidden_size(integer): Hidden size of the unit.
        param_attr(ParamAttr|None): Attribute passed to ``create_parameter``
            for the weight matrix. Default: None.
        bias_attr(ParamAttr|None): Attribute passed to ``create_parameter``
            for the bias vector. Default: None.
        gate_activation(function|None): Activation applied to the input,
            forget and output gates (actGate). ``layers.sigmoid`` is used
            when None.
        activation(function|None): Activation applied to the candidate and
            to the new cell state (actNode). ``layers.tanh`` is used when None.
        forget_bias(float|1.0): Constant added to the forget gate input.
        dtype(string): Data type of the parameters.

    Examples:

        .. code-block:: python

            import paddle.fluid.layers as layers
            from paddle.fluid.contrib.layers import BasicLSTMUnit

            input_size = 128
            hidden_size = 256
            input = layers.data( name = "input", shape = [-1, input_size], dtype='float32')
            pre_hidden = layers.data( name = "pre_hidden", shape=[-1, hidden_size], dtype='float32')
            pre_cell = layers.data( name = "pre_cell", shape=[-1, hidden_size], dtype='float32')

            lstm_unit = BasicLSTMUnit( "lstm_unit", hidden_size)

            new_hidden, new_cell = lstm_unit( input, pre_hidden, pre_cell )
    """

    def __init__(self,
                 name_scope,
                 hidden_size,
                 param_attr=None,
                 bias_attr=None,
                 gate_activation=None,
                 activation=None,
                 forget_bias=1.0,
                 dtype='float32'):
        super(BasicLSTMUnit, self).__init__(name_scope, dtype)
        # reserve old school _full_name and _helper for static graph save load
        self._full_name = unique_name.generate(name_scope + "/" +
                                               self.__class__.__name__)
        self._helper = LayerObjectHelper(self._full_name)

        self._name = name_scope
        self._hiden_size = hidden_size
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._gate_activation = gate_activation or layers.sigmoid
        self._activation = activation or layers.tanh
        # The forget bias is materialised once as a one-element constant.
        self._forget_bias = layers.fill_constant(
            [1], dtype=dtype, value=forget_bias)
        self._forget_bias.stop_gradient = False
        self._dtype = dtype

    def _build_once(self, input, pre_hidden, pre_cell):
        """Create the weights; the input width is read from ``input``."""
        self._input_size = input.shape[-1]
        assert (self._input_size > 0)

        # One matrix and one bias hold all four gate blocks side by side.
        self._weight = self.create_parameter(
            attr=self._param_attr,
            shape=[self._input_size + self._hiden_size, 4 * self._hiden_size],
            dtype=self._dtype)

        self._bias = self.create_parameter(
            attr=self._bias_attr,
            shape=[4 * self._hiden_size],
            dtype=self._dtype,
            is_bias=True)

    def forward(self, input, pre_hidden, pre_cell):
        """Return ``(new_hidden, new_cell)`` for one step."""
        # Use the activations chosen at construction time. They fall back to
        # sigmoid / tanh, so the default behaviour is unchanged, while
        # caller-supplied activations are no longer silently ignored.
        gate_act = self._gate_activation
        cell_act = self._activation

        concat_input_hidden = layers.concat([input, pre_hidden], 1)
        gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
        gate_input = layers.elementwise_add(gate_input, self._bias)

        # Blocks in order: input gate, candidate, forget gate, output gate.
        i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
        new_cell = layers.elementwise_add(
            layers.elementwise_mul(
                pre_cell,
                gate_act(layers.elementwise_add(f, self._forget_bias))),
            layers.elementwise_mul(gate_act(i), cell_act(j)))
        new_hidden = cell_act(new_cell) * gate_act(o)

        return new_hidden, new_cell
...@@ -42,7 +42,7 @@ class MMDNN(Layer): ...@@ -42,7 +42,7 @@ class MMDNN(Layer):
self.dpool_size1 = int(config['net']['dpool_size_left']) self.dpool_size1 = int(config['net']['dpool_size_left'])
self.dpool_size2 = int(config['net']['dpool_size_right']) self.dpool_size2 = int(config['net']['dpool_size_right'])
self.hidden_size = int(config['net']['hidden_size']) self.hidden_size = int(config['net']['hidden_size'])
self.seq_len = int(conf_dict["seq_len"]) self.seq_len = int(config["seq_len"])
self.seq_len1 = self.seq_len self.seq_len1 = self.seq_len
#int(config['max_len_left']) #int(config['max_len_left'])
self.seq_len2 = self.seq_len self.seq_len2 = self.seq_len
...@@ -157,7 +157,7 @@ class MMDNN(Layer): ...@@ -157,7 +157,7 @@ class MMDNN(Layer):
conv = self.conv(emb_expand) conv = self.conv(emb_expand)
if mask is not None: if mask is not None:
cross_mask = fluid.layers.stack(x=[mask] * self.kernel_size, axis=0) cross_mask = fluid.layers.stack(x=[mask] * self.kernel_size, axis=0)
cross_mask = fluid.layers.stack(x=[cross] * conv.shape[1], axis=1) cross_mask = fluid.layers.stack(x=[cross_mask] * conv.shape[0], axis=0)
conv = cross_mask * conv + (1 - cross_mask) * (-2**self.seq_len + 1) conv = cross_mask * conv + (1 - cross_mask) * (-2**self.seq_len + 1)
pool = self.pool_layer(conv) pool = self.pool_layer(conv)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
network layers
"""
import paddle.fluid as fluid
import paddle.fluid.param_attr as attr
class EmbeddingLayer(object):
    """
    Thin wrapper that builds an embedding through ``fluid.dygraph.Embedding``.
    """

    def __init__(self, dict_size, emb_dim, name="emb"):
        """
        Remember the embedding configuration.

        Args:
            dict_size: first entry of the ``size`` list passed to the embedding
            emb_dim: second entry of the ``size`` list passed to the embedding
            name: name given to the embedding's parameter attribute
        """
        self.dict_size, self.emb_dim, self.name = dict_size, emb_dim, name

    def ops(self, input):
        """
        Invoke ``fluid.dygraph.Embedding`` with ``input`` and return the result.

        NOTE(review): ``fluid.dygraph.Embedding`` is presumably a layer class
        whose constructor does not take ``input`` -- confirm this call against
        the Paddle version in use.
        """
        table_shape = [self.dict_size, self.emb_dim]
        weight_attr = attr.ParamAttr(name=self.name)
        return fluid.dygraph.Embedding(
            input=input,
            size=table_shape,
            is_sparse=True,
            param_attr=weight_attr)
class SequencePoolLayer(object):
    """
    Thin wrapper that pools its input through ``fluid.dygraph.Pool2D``.
    """

    def __init__(self, pool_type):
        """
        Remember the pooling type.

        Args:
            pool_type: value forwarded as ``pool_type`` to the pooling call
        """
        self.pool_type = pool_type

    def ops(self, input):
        """
        Invoke ``fluid.dygraph.Pool2D`` with ``input`` and return the result.

        NOTE(review): ``fluid.dygraph.Pool2D`` is presumably a layer class
        whose constructor does not take ``input`` -- confirm this call against
        the Paddle version in use.
        """
        return fluid.dygraph.Pool2D(input=input, pool_type=self.pool_type)
class FCLayer(object):
    """
    Thin wrapper around ``fluid.dygraph.FC`` (fully connected layer).
    """

    def __init__(self, fc_dim, act, name="fc"):
        """
        Remember the fully connected layer configuration.

        Args:
            fc_dim: value forwarded as ``size``
            act: value forwarded as ``act``
            name: layer name; also the prefix of the ``.w`` / ``.b`` parameters
        """
        self.fc_dim, self.act, self.name = fc_dim, act, name

    def ops(self, input):
        """
        Invoke ``fluid.dygraph.FC`` with ``input`` and return the result.

        The weight is named ``"<name>.w"`` and the bias ``"<name>.b"``.

        NOTE(review): ``fluid.dygraph.FC`` is presumably a layer class whose
        constructor takes neither ``input`` nor ``name`` -- confirm this call
        against the Paddle version in use.
        """
        weight_attr = attr.ParamAttr(name="%s.w" % self.name)
        bias_attr = attr.ParamAttr(name="%s.b" % self.name)
        return fluid.dygraph.FC(
            input=input,
            size=self.fc_dim,
            param_attr=weight_attr,
            bias_attr=bias_attr,
            act=self.act,
            name=self.name)
class DynamicGRULayer(object):
    """
    Projection followed by ``fluid.layers.dynamic_gru``.
    """

    def __init__(self, gru_dim, name="dyn_gru"):
        """
        Remember the GRU configuration.

        Args:
            gru_dim: value forwarded as ``size`` to ``dynamic_gru``
            name: prefix used for all parameter names created in ``ops``
        """
        self.gru_dim, self.name = gru_dim, name

    def ops(self, input):
        """
        Project ``input`` to ``gru_dim * 3`` and feed it to ``dynamic_gru``.

        The projection parameters are named ``"<name>_fc.w"`` / ``"<name>_fc.b"``
        and the GRU parameters ``"<name>.w"`` / ``"<name>.b"``.
        """
        prefix = self.name
        # Projection width is three times gru_dim (presumably one slice per
        # GRU gate -- TODO confirm against the dynamic_gru documentation).
        projected = fluid.dygraph.FC(
            input=input,
            size=self.gru_dim * 3,
            param_attr=attr.ParamAttr(name="%s_fc.w" % prefix),
            bias_attr=attr.ParamAttr(name="%s_fc.b" % prefix))
        return fluid.layers.dynamic_gru(
            input=projected,
            size=self.gru_dim,
            param_attr=attr.ParamAttr(name="%s.w" % prefix),
            bias_attr=attr.ParamAttr(name="%s.b" % prefix))
class DynamicLSTMLayer(object):
    """
    Projection followed by ``fluid.layers.dynamic_lstm``.
    """

    def __init__(self, lstm_dim, name="dyn_lstm"):
        """
        Remember the LSTM configuration.

        Args:
            lstm_dim: base dimension; ``ops`` uses ``lstm_dim * 4`` as ``size``
            name: prefix used for all parameter names created in ``ops``
        """
        self.lstm_dim, self.name = lstm_dim, name

    def ops(self, input):
        """
        Project ``input`` to ``lstm_dim * 4`` and feed it to ``dynamic_lstm``.

        Only the first element of the pair returned by ``dynamic_lstm`` is
        returned; the second is discarded.
        """
        prefix = self.name
        width = self.lstm_dim * 4  # same size is used for projection and LSTM
        projected = fluid.dygraph.FC(
            input=input,
            size=width,
            param_attr=attr.ParamAttr(name="%s_fc.w" % prefix),
            bias_attr=attr.ParamAttr(name="%s_fc.b" % prefix))
        outputs = fluid.layers.dynamic_lstm(
            input=projected,
            size=self.lstm_dim * 4,
            param_attr=attr.ParamAttr(name="%s.w" % prefix),
            bias_attr=attr.ParamAttr(name="%s.b" % prefix))
        hidden, _ = outputs
        return hidden
class SequenceLastStepLayer(object):
    """
    Thin wrapper around ``fluid.layers.sequence_last_step``.
    """

    def __init__(self):
        """
        Nothing to configure.
        """

    def ops(self, input):
        """
        Return ``fluid.layers.sequence_last_step(input)``.
        """
        return fluid.layers.sequence_last_step(input)
class SequenceConvPoolLayer(object):
    """
    Thin wrapper around ``fluid.nets.sequence_conv_pool``.
    """

    def __init__(self, filter_size, num_filters, name):
        """
        Remember the convolution configuration.

        Args:
            filter_size: convolution kernel size
            num_filters: number of convolution kernels
            name: name given to the parameter attribute
        """
        self.filter_size, self.num_filters, self.name = (
            filter_size, num_filters, name)

    def ops(self, input):
        """
        Run ``sequence_conv_pool`` on ``input`` with ``act="relu"``.
        """
        kernel_attr = attr.ParamAttr(name=self.name)
        return fluid.nets.sequence_conv_pool(
            input=input,
            filter_size=self.filter_size,
            num_filters=self.num_filters,
            param_attr=kernel_attr,
            act="relu")
class DataLayer(object):
    """
    Thin wrapper around ``fluid.layers.data``.
    """

    def __init__(self):
        """
        Nothing to configure.
        """

    def ops(self, name, shape, dtype, lod_level=0):
        """
        Declare a data variable and return it.

        All four arguments are forwarded by keyword to ``fluid.layers.data``.
        """
        # Original author's note: no change needed here.
        return fluid.layers.data(
            name=name, shape=shape, dtype=dtype, lod_level=lod_level)
class ConcatLayer(object):
    """
    Thin wrapper around ``fluid.layers.concat``.
    """

    def __init__(self, axis):
        """
        Remember the concatenation axis.

        Args:
            axis: value forwarded as ``axis`` to ``fluid.layers.concat``
        """
        self.axis = axis

    def ops(self, inputs):
        """
        Return ``inputs`` concatenated along the stored axis.
        """
        return fluid.layers.concat(inputs, axis=self.axis)
class ReduceMeanLayer(object):
    """
    Thin wrapper around ``fluid.layers.reduce_mean``.
    """

    def __init__(self):
        """
        Nothing to configure.
        """

    def ops(self, input):
        """
        Return ``fluid.layers.reduce_mean(input)``.
        """
        return fluid.layers.reduce_mean(input)
class CrossEntropyLayer(object):
    """
    Thin wrapper around ``fluid.layers.cross_entropy``.
    """

    def __init__(self, name="cross_entropy"):
        """
        Nothing to configure; ``name`` is accepted but not stored.
        """

    def ops(self, input, label):
        """
        Return ``fluid.layers.cross_entropy`` of ``input`` against ``label``.
        """
        # Original author's note: no change needed here.
        return fluid.layers.cross_entropy(input=input, label=label)
class SoftmaxWithCrossEntropyLayer(object):
    """
    Thin wrapper around ``fluid.layers.softmax_with_cross_entropy``.
    """

    def __init__(self, name="softmax_with_cross_entropy"):
        """
        Nothing to configure; ``name`` is accepted but not stored.
        """

    def ops(self, input, label):
        """
        Return the loss computed from ``input`` (passed as ``logits``) and
        ``label``.
        """
        # Original author's note: no change needed here.
        return fluid.layers.softmax_with_cross_entropy(
            logits=input, label=label)
class CosSimLayer(object):
    """
    Thin wrapper around ``fluid.layers.cos_sim`` (cosine similarity).
    """

    def __init__(self):
        """
        Nothing to configure.
        """

    def ops(self, x, y):
        """
        Return ``fluid.layers.cos_sim(x, y)``.
        """
        return fluid.layers.cos_sim(x, y)
class ElementwiseMaxLayer(object):
    """
    Thin wrapper around ``fluid.layers.elementwise_max``.
    """

    def __init__(self):
        """
        Nothing to configure.
        """

    def ops(self, x, y):
        """
        Return ``fluid.layers.elementwise_max(x, y)``.
        """
        return fluid.layers.elementwise_max(x, y)
class ElementwiseAddLayer(object):
    """
    Thin wrapper around ``fluid.layers.elementwise_add``.
    """

    def __init__(self):
        """
        Nothing to configure.
        """

    def ops(self, x, y):
        """
        Return ``fluid.layers.elementwise_add(x, y)``.
        """
        return fluid.layers.elementwise_add(x, y)
class ElementwiseSubLayer(object):
    """
    Thin wrapper around ``fluid.layers.elementwise_sub`` (subtraction).
    """

    def __init__(self):
        """
        Nothing to configure.
        """

    def ops(self, x, y):
        """
        Return ``fluid.layers.elementwise_sub(x, y)``.
        """
        return fluid.layers.elementwise_sub(x, y)
class ConstantLayer(object):
    """
    Thin wrapper around ``fluid.layers.fill_constant_batch_size_like``.
    """

    def __init__(self):
        """
        Nothing to configure.
        """

    def ops(self, input, shape, dtype, value):
        """
        Return the result of ``fill_constant_batch_size_like``.

        The four arguments are forwarded positionally, in the order given.
        """
        return fluid.layers.fill_constant_batch_size_like(
            input, shape, dtype, value)
class SigmoidLayer(object):
    """
    Thin wrapper around ``fluid.layers.sigmoid``.
    """

    def __init__(self):
        """
        Nothing to configure.
        """

    def ops(self, input):
        """
        Return ``fluid.layers.sigmoid(input)``.
        """
        return fluid.layers.sigmoid(input)
class SoftsignLayer(object):
    """
    Thin wrapper around ``fluid.layers.softsign``.
    """

    def __init__(self):
        """
        Nothing to configure.
        """

    def ops(self, input):
        """
        Return ``fluid.layers.softsign(input)``.
        """
        return fluid.layers.softsign(input)
# class MatmulLayer(object):
# def __init__(self, transpose_x, transpose_y):
# self.transpose_x = transpose_x
# self.transpose_y = transpose_y
# def ops(self, x, y):
# matmul = fluid.layers.matmul(x, y, self.transpose_x, self.transpose_y)
# return matmul
# class Conv2dLayer(object):
# def __init__(self, num_filters, filter_size, act, name):
# self.num_filters = num_filters
# self.filter_size = filter_size
# self.act = act
# self.name = name
# def ops(self, input):
# conv = fluid.layers.conv2d(input, self.num_filters, self.filter_size, param_attr=attr.ParamAttr(name="%s.w" % self.name), bias_attr=attr.ParamAttr(name="%s.b" % self.name), act=self.act)
# return conv
# class Pool2dLayer(object):
# def __init__(self, pool_size, pool_type):
# self.pool_size = pool_size
# self.pool_type = pool_type
# def ops(self, input):
# pool = fluid.layers.pool2d(input, self.pool_size, self.pool_type)
# return pool
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册