diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 9f010b15927770c2a47e656656d672b36c54ec44..1a387b276c5bf33139737015ef35a7cb140556d4 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -5484,7 +5484,7 @@ def row_conv(input, future_context_size, param_attr=None, act=None):
     """
     helper = LayerHelper('row_conv', **locals())
     dtype = helper.input_dtype()
-    filter_shape = [future_context_size + 1, input.shape[1]]
+    filter_shape = [future_context_size + 1, input.shape[-1]]
     filter_param = helper.create_parameter(
         attr=helper.param_attr, shape=filter_shape, dtype=dtype)
     out = helper.create_variable_for_type_inference(dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid.py b/python/paddle/fluid/tests/unittests/test_hsigmoid.py
new file mode 100644
index 0000000000000000000000000000000000000000..80937640c2d2fde6d2ba30b7bccc396566801e63
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid.py
@@ -0,0 +1,219 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.nn.functional as F
+import paddle.fluid.initializer as I
+import numpy as np
+import unittest
+
+
+class HSigmoidTestCase(unittest.TestCase):
+    def __init__(self,
+                 methodName="runTest",
+                 batch_size=4,
+                 feature_size=6,
+                 num_classes=8,
+                 labels=None,
+                 path_code=None,
+                 path_table=None,
+                 is_sparse=False,
+                 dtype="float32"):
+        super(HSigmoidTestCase, self).__init__(methodName=methodName)
+        self.batch_size = batch_size
+        self.feature_size = feature_size
+        self.num_classes = num_classes
+        self.dtype = dtype
+        self.is_sparse = is_sparse
+
+        self.labels = labels
+        self.path_code = path_code
+        self.path_table = path_table
+        self.is_custom = path_code is not None and path_table is not None
+
+    def setUp(self):
+        input_shape = (self.batch_size, self.feature_size)
+        self.input = np.random.uniform(
+            -1, 1, size=input_shape).astype(self.dtype)
+        if self.labels is None:
+            self.labels = np.random.randint(
+                0, self.num_classes, size=(self.batch_size, 1)).astype(np.int64)
+        C = self.num_classes if self.is_custom else self.num_classes - 1
+        self.weight_shape = (C, self.feature_size)
+        self.weight = np.random.randn(*self.weight_shape).astype(self.dtype)
+        self.bias_shape = (C, 1)
+        self.bias = np.random.randn(*self.bias_shape).astype(self.dtype)
+
+    def fluid_layer(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                x = fluid.data(
+                    "input", [-1, self.feature_size], dtype=self.dtype)
+                label = fluid.data("labels", [-1, 1], dtype="int64")
+                if self.is_custom:
+                    path_table = fluid.data(
+                        "path_table", [-1, -1], dtype="int64")
+                    path_code = fluid.data("path_code", [-1, -1], dtype="int64")
+                else:
+                    path_table = path_code = None
+                y = fluid.layers.hsigmoid(
+                    x,
+                    label,
+                    self.num_classes,
+                    param_attr=I.NumpyArrayInitializer(self.weight),
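+                    # identical numpy weights and biases keep the fluid,
+                    # functional, and nn.Layer code paths numerically comparable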
+                    bias_attr=I.NumpyArrayInitializer(self.bias),
+                    path_table=path_table,
+                    path_code=path_code,
+                    is_custom=self.is_custom,
+                    is_sparse=self.is_sparse, )
+        exe = fluid.Executor(place)
+        exe.run(start)
+        feed_dict = {"input": self.input, "labels": self.labels}
+        if self.is_custom:
+            feed_dict["path_code"] = self.path_code
+            feed_dict["path_table"] = self.path_table
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
+        return y_np
+
+    def functional(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                x = fluid.data(
+                    "input", [-1, self.feature_size], dtype=self.dtype)
+                label = fluid.data("labels", [-1, 1], dtype="int64")
+                if self.is_custom:
+                    path_table = fluid.data(
+                        "path_table", [-1, -1], dtype="int64")
+                    path_code = fluid.data("path_code", [-1, -1], dtype="int64")
+                else:
+                    path_table = path_code = None
+                w = fluid.data("weight", self.weight_shape, dtype=self.dtype)
+                b = fluid.data("bias", self.bias_shape, dtype=self.dtype)
+                y = F.hsigmoid(
+                    x,
+                    label,
+                    w,
+                    b,
+                    self.num_classes,
+                    is_sparse=self.is_sparse,
+                    path_table=path_table,
+                    path_code=path_code)
+
+        exe = fluid.Executor(place)
+        exe.run(start)
+        feed_dict = {
+            "input": self.input,
+            "labels": self.labels,
+            "weight": self.weight,
+            "bias": self.bias
+        }
+        if self.is_custom:
+            feed_dict["path_code"] = self.path_code
+            feed_dict["path_table"] = self.path_table
+        y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
+        return y_np
+
+    def nn_layer(self, place):
+        with dg.guard(place):
+            x_var = dg.to_variable(self.input)
+            label_var = dg.to_variable(self.labels)
+            if self.is_custom:
+                path_code_var = dg.to_variable(self.path_code)
+                path_table_var = dg.to_variable(self.path_table)
+            else:
+                path_code_var = path_table_var = None
+            hierarchical_softmax = nn.HSigmoid(
+                self.feature_size,
+                self.num_classes,
+                is_custom=self.is_custom,
+                is_sparse=self.is_sparse,
+                param_attr=I.NumpyArrayInitializer(self.weight),
+                bias_attr=I.NumpyArrayInitializer(self.bias),
+                dtype=self.dtype)
+            y_var = hierarchical_softmax(
+                x_var,
+                label_var,
+                path_table=path_table_var,
+                path_code=path_code_var)
+            y_np = y_var.numpy()
+            return y_np
+
+    def _test_equivalence(self, place):
+        result1 = self.fluid_layer(place)
+        result2 = self.functional(place)
+        result3 = self.nn_layer(place)
+        np.testing.assert_array_almost_equal(result1, result2)
+        np.testing.assert_array_almost_equal(result2, result3)
+
+    def runTest(self):
+        place = fluid.CPUPlace()
+        self._test_equivalence(place)
+
+
+class HSigmoidTestErrorCase(HSigmoidTestCase):
+    def runTest(self):
+        place = fluid.CPUPlace()
+        with dg.guard(place):
+            with self.assertRaises(ValueError):
+                self.nn_layer()
+
+    def nn_layer(self):
+        x_var = dg.to_variable(self.input)
+        label_var = dg.to_variable(self.labels)
+        if self.is_custom:
+            path_code_var = dg.to_variable(self.path_code)
+            path_table_var = dg.to_variable(self.path_table)
+        else:
+            path_code_var = path_table_var = None
+        hierarchical_softmax = nn.HSigmoid(
+            self.feature_size,
+            self.num_classes,
+            is_custom=self.is_custom,
+            param_attr=I.NumpyArrayInitializer(self.weight),
+            bias_attr=I.NumpyArrayInitializer(self.bias),
+            dtype=self.dtype)
+        y_var = hierarchical_softmax(
+            x_var,
+            label_var,
+            path_table=path_table_var,
+            path_code=path_code_var)
+        y_np = y_var.numpy()
+        return y_np
+
+
+def load_tests(loader, standard_tests, pattern):
+    suite = unittest.TestSuite()
+    suite.addTest(HSigmoidTestCase(methodName="runTest"))
+    suite.addTest(
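+        # custom-tree case: labels resolve through the hand-built
+        # path_table/path_code rows below; -1 pads the tail of shorter paths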
+        HSigmoidTestCase(
+            methodName="runTest",
+            batch_size=4,
+            feature_size=6,
+            num_classes=8,
+            labels=np.array([0, 1, 4, 5]).astype(np.int64),
+            path_table=np.array([(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (
+                0, 1, 4, -1, -1), (0, 2, -1, -1, -1)]).astype(np.int64),
+            path_code=np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
+                1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]).astype(np.int64)))
+    suite.addTest(HSigmoidTestErrorCase(methodName="runTest", num_classes=1))
+    return suite
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_row_conv.py b/python/paddle/fluid/tests/unittests/test_row_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..abec23c7f658a9c97460fac6a9399ebc83162077
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_row_conv.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from paddle import fluid, nn
+import paddle.fluid.dygraph as dg
+import paddle.fluid.initializer as I
+import paddle.nn.functional as F
+import unittest
+
+
+class RowConvTestCase(unittest.TestCase):
+    def __init__(self,
+                 methodName='runTest',
+                 batch_size=4,
+                 num_channels=8,
+                 time_steps=12,
+                 context_size=3,
+                 act=None,
+                 dtype="float32"):
+        super(RowConvTestCase, self).__init__(methodName=methodName)
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.time_steps = time_steps
+        self.context_size = context_size
+        self.act = act
+        self.dtype = dtype
+
+    def setUp(self):
+        input_shape = (self.batch_size, self.time_steps, self.num_channels)
+        self.input = np.random.uniform(size=input_shape).astype(self.dtype)
+        self.weight_shape = weight_shape = (self.context_size + 1,
+                                            self.num_channels)
+        self.weight = np.random.uniform(size=weight_shape).astype(self.dtype)
+
+    def fluid_layer(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                x = fluid.data(
+                    "input", [-1, -1, self.num_channels], dtype=self.dtype)
+                y = fluid.layers.row_conv(
+                    x,
+                    self.context_size,
+                    param_attr=I.NumpyArrayInitializer(self.weight),
+                    act=self.act)
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main, feed={"input": self.input}, fetch_list=[y])
+        return y_np
+
+    def functional_declarative(self, place):
+        main = fluid.Program()
+        start = fluid.Program()
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, start):
+                x = fluid.data(
+                    "input", [-1, -1, self.num_channels], dtype=self.dtype)
+                w = fluid.data("weight", self.weight_shape, dtype=self.dtype)
+                y = F.row_conv(x, w, act=self.act)
+        exe = fluid.Executor(place)
+        exe.run(start)
+        y_np, = exe.run(main,
+                        feed={"input": self.input,
+                              "weight": self.weight},
+                        fetch_list=[y])
+        return y_np
+
+    def functional_imperative(self, place):
+        with dg.guard(place):
+            x_var = dg.to_variable(self.input)
+            w_var = dg.to_variable(self.weight)
+            y_var = F.row_conv(
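+                # under dg.guard, F.row_conv dispatches to core.ops.row_conv
+                # and runs eagerly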
+                x_var, w_var, act=self.act)
+            y_np = y_var.numpy()
+            return y_np
+
+    def nn_layer(self, place):
+        with dg.guard(place):
+            x_var = dg.to_variable(self.input)
+            conv = nn.RowConv(
+                self.num_channels,
+                self.context_size,
+                param_attr=I.NumpyArrayInitializer(self.weight),
+                act=self.act,
+                dtype=self.dtype)
+            y_var = conv(x_var)
+            y_np = y_var.numpy()
+            return y_np
+
+    def _test_equivalence(self, place):
+        result1 = self.fluid_layer(place)
+        result2 = self.functional_declarative(place)
+        result3 = self.functional_imperative(place)
+        result4 = self.nn_layer(place)
+        np.testing.assert_array_almost_equal(result1, result2)
+        np.testing.assert_array_almost_equal(result2, result3)
+        np.testing.assert_array_almost_equal(result3, result4)
+
+    def runTest(self):
+        place = fluid.CPUPlace()
+        self._test_equivalence(place)
+
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+            self._test_equivalence(place)
+
+
+def load_tests(loader, standard_tests, pattern):
+    suite = unittest.TestSuite()
+    suite.addTest(RowConvTestCase(methodName="runTest"))
+    suite.addTest(RowConvTestCase(methodName="runTest", act="sigmoid"))
+    suite.addTest(
+        RowConvTestCase(
+            methodName="runTest", context_size=5, act="sigmoid"))
+    return suite
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index e8e0aa3fd1f559fa6f8d2b2e4e20c7b77dbd2826..ca755fdb725fcdfaf80a128d02be4a670e5c5e5d 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -80,10 +80,13 @@ from .layer.loss import BCELoss #DEFINE_ALIAS
 # from .layer.norm import LayerNorm #DEFINE_ALIAS
 from .layer.norm import InstanceNorm #DEFINE_ALIAS
 # from .layer.norm import SpectralNorm #DEFINE_ALIAS
+from .layer.activation import HSigmoid #DEFINE_ALIAS
 # from .layer.activation import PReLU #DEFINE_ALIAS
 from .layer.activation import ReLU #DEFINE_ALIAS
 from .layer.activation import Sigmoid #DEFINE_ALIAS
 # from .layer.activation import Softmax #DEFINE_ALIAS
+# from .layer.activation import LogSoftmax #DEFINE_ALIAS
+from .layer.extension import RowConv #DEFINE_ALIAS
 from .layer.activation import LogSoftmax #DEFINE_ALIAS
 # from .layer.rnn import RNNCell #DEFINE_ALIAS
 # from .layer.rnn import GRUCell #DEFINE_ALIAS
@@ -184,7 +187,7 @@ from .functional.conv import conv3d_transpose #DEFINE_ALIAS
 # from .functional.activation import hard_shrink #DEFINE_ALIAS
 # from .functional.activation import hard_sigmoid #DEFINE_ALIAS
 # from .functional.activation import hard_swish #DEFINE_ALIAS
-# from .functional.activation import hsigmoid #DEFINE_ALIAS
+from .functional.activation import hsigmoid #DEFINE_ALIAS
 # from .functional.activation import leaky_relu #DEFINE_ALIAS
 # from .functional.activation import logsigmoid #DEFINE_ALIAS
 # from .functional.activation import maxout #DEFINE_ALIAS
@@ -211,7 +214,7 @@ from .functional.activation import log_softmax #DEFINE_ALIAS
 # from .functional.extension import multiclass_nms #DEFINE_ALIAS
 # from .functional.extension import polygon_box_transform #DEFINE_ALIAS
 # from .functional.extension import random_crop #DEFINE_ALIAS
-# from .functional.extension import row_conv #DEFINE_ALIAS
+from .functional.extension import row_conv #DEFINE_ALIAS
 # from .functional.extension import rpn_target_assign #DEFINE_ALIAS
 # from .functional.extension import similarity_focus #DEFINE_ALIAS
 # from .functional.extension import target_assign #DEFINE_ALIAS
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
index 1a66af1882db3a6f4d1900f0b708e062dfcb9228..7de9b7db0a81ad5d8b2338ff7bec6030a36f0386 100644
--- a/python/paddle/nn/functional/__init__.py
+++ b/python/paddle/nn/functional/__init__.py
@@ -110,7 +110,7 @@ from . import activation
 # from .activation import hard_shrink #DEFINE_ALIAS
 # from .activation import hard_sigmoid #DEFINE_ALIAS
 # from .activation import hard_swish #DEFINE_ALIAS
-# from .activation import hsigmoid #DEFINE_ALIAS
+from .activation import hsigmoid #DEFINE_ALIAS
 # from .activation import leaky_relu #DEFINE_ALIAS
 # from .activation import logsigmoid #DEFINE_ALIAS
 # from .activation import maxout #DEFINE_ALIAS
@@ -137,7 +137,7 @@ from .activation import log_softmax #DEFINE_ALIAS
 # from .extension import multiclass_nms #DEFINE_ALIAS
 # from .extension import polygon_box_transform #DEFINE_ALIAS
 # from .extension import random_crop #DEFINE_ALIAS
-# from .extension import row_conv #DEFINE_ALIAS
+from .extension import row_conv #DEFINE_ALIAS
 # from .extension import rpn_target_assign #DEFINE_ALIAS
 # from .extension import similarity_focus #DEFINE_ALIAS
 # from .extension import target_assign #DEFINE_ALIAS
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index 8a236867b601462ac69d966d9147f9fabffa5b9a..47cac5517b42a28477eab44dd8afb67a01b4676f 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -12,40 +12,159 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# TODO: define activation functions of neural network
+__all__ = [
+    # 'brelu',
+    # 'elu',
+    # 'erf',
+    # 'gelu',
+    # 'hard_shrink',
+    # 'hard_sigmoid',
+    # 'hard_swish',
+    'hsigmoid',
+    # 'leaky_relu',
+    # 'logsigmoid',
+    # 'maxout',
+    # 'prelu',
+    'relu',
+    # 'relu6',
+    # 'selu',
+    'sigmoid',
+    # 'soft_relu',
+    # 'softmax',
+    # 'softplus',
+    # 'softshrink',
+    # 'softsign',
+    # 'swish',
+    # 'tanh_shrink',
+    # 'thresholded_relu',
+    'log_softmax'
+]
 import warnings
 from ...fluid.layer_helper import LayerHelper
 from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_
 from ...fluid import core
 from ...fluid.data_feeder import check_variable_and_dtype
-# TODO: define activation functions of neural network
-__all__ = [
-    # 'brelu',
-    # 'elu',
-    # 'erf',
-    # 'gelu',
-    # 'hard_shrink',
-    # 'hard_sigmoid',
-    # 'hard_swish',
-    # 'hsigmoid',
-    # 'leaky_relu',
-    # 'logsigmoid',
-    # 'maxout',
-    # 'prelu',
-    'relu',
-    # 'relu6',
-    # 'selu',
-    'sigmoid',
-    # 'soft_relu',
-    # 'softmax',
-    # 'softplus',
-    # 'softshrink',
-    # 'softsign',
-    # 'swish',
-    # 'tanh_shrink',
-    # 'thresholded_relu',
-    'log_softmax',
-]
+
+def hsigmoid(input,
+             label,
+             weight,
+             bias,
+             num_classes,
+             path_table=None,
+             path_code=None,
+             is_sparse=False):
+    """
+    The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
+    and speed up model training, especially the training of language models.
+    Each leaf node of the complete binary tree represents a class (word) and each non-leaf node acts as a binary classifier.
+    For each class (word), there is a unique path from the root to it; hsigmoid calculates the cost for each non-leaf node on
+    the path and sums them to get the total cost.
+    Compared to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
+    represents the number of classes or the size of the word dict.
+
+    The OP supports a default tree and a custom tree. For the default tree, you can refer to
+    `Hierarchical Probabilistic Neural Network Language Model`_. For the custom
+    tree, you need to pass :attr:`path_table` and :attr:`path_code`, and do the following steps (take the language model as an example):
+
+    1. Use a custom word dict to build a binary tree; each leaf node should be a word in the word dict.
+    2. Create a dict mapping word_id -> path from the word to the root node; we call it path_table.
+    3. Create a dict mapping word_id -> code of the path from the word to the root node; we call it path_code.
+       A code is the label of a binary classifier: 1 indicates true, 0 indicates false.
+    4. Now each word has a path and a code along the path; you can pass a batch of paths and codes related
+       to the same batch of inputs.
+
+    Parameters:
+        input (Variable): A tensor with the shape [N, D], where N is the size of the mini-batch
+            and D is the feature size. Its data type supports float32 and float64.
+        label (Variable): A tensor that contains the labels of the training data. Its shape is [N, 1]
+            and its data type is int64.
+        weight (Variable): A tensor with shape (num_classes - 1, D) if the default tree is used (path_code and path_table are None), or (num_classes, D) if a custom tree is used.
+        bias (Variable): A tensor with shape (num_classes - 1, 1) if the default tree is used (path_code and path_table are None), or (num_classes, 1) if a custom tree is used.
+        num_classes (int): The number of classes or the size of the word dict, must not be less than 2.
+            If the default tree is used (:attr:`path_table` and :attr:`path_code` are None),
+            :attr:`num_classes` should not be None. If the custom tree is used,
+            :attr:`num_classes` should be the number of non-leaf nodes, which is the number of
+            classes handled by the binary classifiers.
+        path_table (Variable, optional): A tensor that stores each batch of samples' path from leaf to root
+            node, its shape is [N, L] and its data type is int64, where L is the length of the path. For each sample i,
+            path_table[i] is an array-like structure and each element in this array is an index into the parent
+            nodes' weight matrix. Default: None.
+        path_code (Variable, optional): A tensor that stores each batch of samples' code of the path from leaf
+            to root node, its shape is [N, L] and its data type is int64, the same as :attr:`path_table`.
+            Each path code consists of the codes of the nodes on the path from the leaf to the root. Default: None.
+        is_sparse (bool, optional): Whether to use sparse updating instead of dense updating. If True, the
+            gradients of W and input will be sparse. Default: False.
+
+    Returns:
+        Variable: A tensor with the cost of the hierarchical sigmoid; its shape is [N, 1] and its data type is the same as :attr:`input`.
+
+    Examples:
+
+        .. code-block:: python
+
+            from paddle import fluid, nn
+            import paddle.fluid.dygraph as dg
+            import paddle.nn.functional as F
+            import numpy as np
+
+            main = fluid.Program()
+            start = fluid.Program()
+            feature_size = 6
+            num_classes = 8
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, start):
+                    x = fluid.data("input", [-1, feature_size],
+                                   dtype="float32")
+                    label = fluid.data("labels", [-1, 1], dtype="int64")
+                    w = fluid.data("weight", (num_classes - 1, feature_size), dtype="float32")
+                    b = fluid.data("bias", (num_classes - 1, 1), dtype="float32")
+                    y = F.hsigmoid(x, label, w, b, num_classes)
+
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(start)
+            feed_dict = {
+                "input": np.random.randn(4, feature_size).astype(np.float32),
+                "labels": np.random.randint(0, num_classes, (4, 1)).astype(np.int64),
+                "weight": np.random.randn(num_classes - 1, feature_size).astype(np.float32),
+                "bias": np.random.randn(num_classes - 1, 1).astype(np.float32),
+            }
+            y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
+            print(y_np.shape)
+
+            # (4, 1)
+    """
+
+    attrs = {
+        "num_classes": num_classes,
+        "is_sparse": is_sparse,
+        "remote_prefetch": is_sparse
+    }
+
+    inputs = {
+        "X": input,
+        "W": weight,
+        "Bias": bias,
+        "PathTable": path_table,
+        "PathCode": path_code,
+        "Label": label
+    }
+
+    helper = LayerHelper('hierarchical_sigmoid', **locals())
+    dtype = helper.input_dtype()
+
+    out = helper.create_variable_for_type_inference(dtype)
+    pre_out = helper.create_variable_for_type_inference(dtype)
+    outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight}
+
+    helper.append_op(
+        type="hierarchical_sigmoid",
+        inputs=inputs,
+        outputs=outputs,
+        attrs=attrs)
+    return out
 
 
 def relu(input, inplace=False, name=None):
diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py
index 892ea8f7cfa4d0f69c7a3056164cad89c6b84bbf..e00349c8cdaed940d53d643d0cb1651a19194202 100644
--- a/python/paddle/nn/functional/extension.py
+++ b/python/paddle/nn/functional/extension.py
@@ -12,20 +12,92 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: define the extention functions
-# __all__ = ['add_position_encoding',
-#            'autoincreased_step_counter',
-#            'continuous_value_model',
-#            'filter_by_instag',
-#            'linear_chain_crf',
-#            'merge_selected_rows',
-#            'multiclass_nms',
-#            'polygon_box_transform',
-#            'random_crop',
-#            'row_conv',
-#            'rpn_target_assign',
-#            'similarity_focus',
-#            'target_assign',
-#            'temporal_shift',
-#            'warpctc',
-#            'diag_embed']
+# TODO: define the extension functions
+__all__ = [
+    # 'add_position_encoding',
+    # 'autoincreased_step_counter',
+    # 'continuous_value_model',
+    # 'filter_by_instag',
+    # 'linear_chain_crf',
+    # 'merge_selected_rows',
+    # 'multiclass_nms',
+    # 'polygon_box_transform',
+    # 'random_crop',
+    'row_conv',
+    # 'rpn_target_assign',
+    # 'similarity_focus',
+    # 'target_assign',
+    # 'temporal_shift',
+    # 'warpctc',
+    # 'diag_embed'
+]
+
+from ...fluid import core, dygraph_utils
+from ...fluid.framework import in_dygraph_mode
+from ...fluid.layer_helper import LayerHelper
+from ...fluid.layers.layer_function_generator import templatedoc
+
+
+@templatedoc()
+def row_conv(input, weight, act=None):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): The input (X) is a LoDTensor or Tensor. A LoDTensor (X)
+            supports variable time-length input sequences. The underlying
+            tensor in this LoDTensor is a matrix with shape (T, D), where
+            T is the total time steps in this mini-batch and D is the input
+            data dimension.
+            If the input is a padded mini-batch, the shape of the input is
+            (N, T, D), where N is the batch size, T is the max time steps in
+            the batch, and D is the input data dimension.
+        weight (Variable): The weight. A Tensor with shape
+            (future_context_size + 1, D), where future_context_size is the
+            context size of the RowConv operator.
+        act (str): Non-linear activation to be applied to the output variable.
+
+    Returns:
+        ${out_comment}.
+
+    Examples:
+        .. code-block:: python
+
+            from paddle import fluid, nn
+            import paddle.fluid.dygraph as dg
+            import paddle.nn.functional as F
+            import numpy as np
+
+            batch_size = 4
+            time_steps = 8
+            feature_size = 6
+            context_size = 4
+            x = np.random.randn(batch_size, time_steps, feature_size).astype(np.float32)
+            weight = np.random.randn(context_size + 1, feature_size).astype(np.float32)
+
+            place = fluid.CPUPlace()
+            with dg.guard(place):
+                x_var = dg.to_variable(x)
+                w_var = dg.to_variable(weight)
+                y_var = F.row_conv(x_var, w_var)
+                y_np = y_var.numpy()
+
+            print(y_np.shape)
+
+            # (4, 8, 6)
+    """
+
+    if in_dygraph_mode():
+        pre_act = core.ops.row_conv(input, weight)
+        out = dygraph_utils._append_activation_in_dygraph(pre_act, act)
+        return out
+    else:
+        helper = LayerHelper('row_conv', **locals())
+        dtype = helper.input_dtype()
+
+        inputs = {'X': [input], 'Filter': [weight]}
+        pre_act = helper.create_variable_for_type_inference(dtype)
+        outputs = {'Out': [pre_act]}
+        helper.append_op(type='row_conv', inputs=inputs, outputs=outputs)
+        out = helper.append_activation(pre_act)
+        return out
diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py
index 689cc857ef614cd7c57200afa88c9205d5a02748..ff8208f350e6b178adbae1e1684c9aeed08d3273 100644
--- a/python/paddle/nn/layer/__init__.py
+++ b/python/paddle/nn/layer/__init__.py
@@ -17,9 +17,11 @@
 from . import activation
 from . import loss
 from . import conv
+from . import extension
 from . import norm
 
 from .activation import *
 from .loss import *
 from .conv import *
+from .extension import *
 from .norm import *
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index ad465b9f0b86e54ba1f41f500233bb3c637ee0cd..73d7084fdba1b0735542f31d974f7ed68503e681 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -12,20 +12,156 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ...fluid.dygraph import layers
-from ...fluid import core
-from ...fluid.framework import in_dygraph_mode
-from .. import functional
-
-# TODO: define activation functions of neural network
+# TODO: define activation functions of neural network
 __all__ = [
     # 'PReLU',
     'ReLU',
     'Sigmoid',
     # 'Softmax',
     'LogSoftmax',
+    'HSigmoid'
 ]
 
+from ...fluid.dygraph import layers
+from ...fluid import core
+from ...fluid.framework import in_dygraph_mode
+from .. import functional
+
+
+class HSigmoid(layers.Layer):
+    """
+
+    Hierarchical Sigmoid Layer.
+
+    The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity
+    and speed up model training, especially the training of language models.
+    Each leaf node of the complete binary tree represents a class (word) and each non-leaf node acts as a binary classifier.
+    For each class (word), there is a unique path from the root to it; hsigmoid calculates the cost for each non-leaf node on
+    the path and sums them to get the total cost.
+    Compared to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N`
+    represents the number of classes or the size of the word dict.
+
+    The OP supports a default tree and a custom tree. For the default tree, you can refer to
+    `Hierarchical Probabilistic Neural Network Language Model`_. For the custom
+    tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example):
+
+    1. Use a custom word dict to build a binary tree; each leaf node should be a word in the word dict.
+    2. Create a dict mapping word_id -> path from the word to the root node; we call it path_table.
+    3. Create a dict mapping word_id -> code of the path from the word to the root node; we call it path_code.
+       A code is the label of a binary classifier: 1 indicates true, 0 indicates false.
+    4. Now each word has a path and a code along the path; you can pass a batch of paths and codes related
+       to the same batch of inputs.
+
+    Parameters:
+        feature_size (int): The feature size.
+        num_classes (int): The number of classes or the size of the word dict, must not be less than 2.
+            If the default tree is used (:attr:`is_custom` is set to False), :attr:`num_classes`
+            should not be None. If the custom tree is used (:attr:`is_custom` is set to True),
+            :attr:`num_classes` should be the number of non-leaf nodes, which is the number of
+            classes handled by the binary classifiers.
+        param_attr (ParamAttr, optional): The parameter attribute for the learnable parameters/weights
+            of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create a
+            ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is
+            initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool, optional): The parameter attribute for the bias of hsigmoid. If it
+            is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr,
+            hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not
+            set, the bias is initialized to zero. Default: None.
+        is_custom (bool, optional): Whether to use a custom binary tree. If True, `path_table` and
+            `path_code` should be passed to the forward method; otherwise `path_table` and `path_code`
+            should not be passed. Default: False.
+        is_sparse (bool, optional): Whether to use sparse updating instead of dense updating. If True, the
+            gradients of W and input will be sparse. Default: False.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            from paddle import fluid, nn
+            import paddle.fluid.dygraph as dg
+            import paddle.nn.functional as F
+            import numpy as np
+
+            main = fluid.Program()
+            start = fluid.Program()
+            feature_size = 6
+            num_classes = 8
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, start):
+                    x = fluid.data("input", [-1, feature_size],
+                                   dtype="float32")
+                    label = fluid.data("labels", [-1, 1], dtype="int64")
+                    hsm = nn.HSigmoid(feature_size, num_classes)
+                    y = hsm(x, label)
+
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(start)
+            feed_dict = {
+                "input": np.random.randn(4, feature_size).astype(np.float32),
+                "labels": np.random.randint(0, num_classes, (4, 1)).astype(np.int64),
+            }
+            y_np, = exe.run(main, feed=feed_dict, fetch_list=[y])
+            print(y_np.shape)
+
+            # (4, 1)
+    """
+
+    def __init__(self,
+                 feature_size,
+                 num_classes,
+                 param_attr=None,
+                 bias_attr=None,
+                 is_custom=False,
+                 is_sparse=False,
+                 dtype="float32"):
+        super(HSigmoid, self).__init__()
+        if (num_classes < 2) and (not is_custom):
+            raise ValueError(
+                "num_classes must not be less than 2 with default tree")
+
+        if (not is_custom) and (is_sparse):
+            print("Sparse mode should not be used without a custom tree")
+            is_sparse = False
+
+        self._feature_size = feature_size
+        self._num_classes = num_classes
+        self._is_custom = is_custom
+        self._is_sparse = is_sparse
+
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+
+        self._dtype = dtype
+
+        remote_prefetch = is_sparse
+        if remote_prefetch:
+            print("With sparse mode, if your model has only"
+                  " small parameters, prefetch may cause speed down")
+
+        C = self._num_classes if is_custom else self._num_classes - 1
+        self.weight = self.create_parameter(
+            [C, self._feature_size],
+            attr=self._param_attr,
+            is_bias=False,
+            dtype=self._dtype)
+        self.bias = self.create_parameter(
+            [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype)
+
+    def forward(self, input, label, path_table=None, path_code=None):
+        out = functional.hsigmoid(
+            input,
+            label,
+            self.weight,
+            self.bias,
+            self._num_classes,
+            path_table=path_table,
+            path_code=path_code,
+            is_sparse=self._is_sparse)
+        return out
+
 
 class ReLU(layers.Layer):
     """
@@ -40,10 +176,10 @@ class ReLU(layers.Layer):
         ``ReLU`` are the same variable. Otherwise, the
         input and output of ``ReLU`` are different variables. Default False. Note that if x is
         more than one OPs' input, inplace must be False.
-    
+
     Returns:
         None
-    
+
     Examples:
         .. code-block:: python
diff --git a/python/paddle/nn/layer/extension.py b/python/paddle/nn/layer/extension.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a3779cc4bdc58586c390d9c3bea195de9403e3b
--- /dev/null
+++ b/python/paddle/nn/layer/extension.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ["RowConv"]
+
+from ...fluid.dygraph import layers
+from .. import functional as F
+
+
+class RowConv(layers.Layer):
+    """
+    **Row-convolution operator**
+
+    The row convolution is also called lookahead convolution. This operator was
+    introduced in the following paper for
+    `DeepSpeech2`_.
+
+    The main motivation is that a bidirectional RNN, useful in DeepSpeech-like
+    speech models, learns representations for a sequence by performing a
+    forward and a backward pass through the entire sequence. However, unlike
+    unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online
+    and low-latency setting. The lookahead convolution incorporates information
+    from future subsequences in a computationally efficient manner to improve
+    unidirectional recurrent neural networks. The row convolution operator is
+    different from the 1D sequence convolution, and is computed as follows:
+
+    Given an input sequence X of length t and input dimension D, and a filter
+    (W) of size context * D, the output sequence is convolved as:
+
+    .. math::
+        out_{i} = \\sum_{j=i}^{i + context - 1} X_{j} \\cdot W_{j-i}
+
+    For more details about row_conv, please refer to the design document.
+
+    Parameters:
+        num_channels (int): Input data's feature size.
+        future_context_size (int): Future context size. Please note, the shape
+            of the convolution kernel is [future_context_size + 1, D].
+        param_attr (ParamAttr): Attributes of the parameter, including
+            name and initializer. Default: None.
+        act (str): Non-linear activation to be applied to the output variable. Default: None.
+        dtype (str, optional): Data type, it can be "float32". Default: "float32".
+
+    Attributes:
+        weight (Parameter): shape [future_context_size + 1, D], the learnable
+            weight (convolution kernel) of this layer.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            from paddle import fluid, nn
+            import paddle.fluid.dygraph as dg
+            import paddle.nn.functional as F
+            import numpy as np
+
+            batch_size = 4
+            time_steps = 8
+            feature_size = 6
+            context_size = 4
+
+            x = np.random.randn(batch_size, time_steps, feature_size).astype(np.float32)
+
+            place = fluid.CPUPlace()
+            with dg.guard(place):
+                x_var = dg.to_variable(x)
+                conv = nn.RowConv(feature_size, context_size)
+                y_var = conv(x_var)
+                y_np = y_var.numpy()
+            print(y_np.shape)
+
+            # (4, 8, 6)
+    """
+
+    def __init__(self,
+                 num_channels,
+                 future_context_size,
+                 param_attr=None,
+                 act=None,
+                 dtype="float32"):
+        super(RowConv, self).__init__()
+        self._dtype = dtype
+        self._param_attr = param_attr
+        self._act = act
+
+        filter_shape = [future_context_size + 1, num_channels]
+        self.weight = self.create_parameter(
+            filter_shape, attr=param_attr, dtype=dtype)
+
+    def forward(self, input):
+        out = F.row_conv(input, self.weight, act=self._act)
+        return out