diff --git a/doc/design/python_api.md b/doc/design/python_api.md index 5c68354274d577c29e054fdd1d3b17c7ea9b4cf9..6213da65c8c5931bc16e42574b8628b676424873 100644 --- a/doc/design/python_api.md +++ b/doc/design/python_api.md @@ -15,9 +15,9 @@ Please be aware that these Python classes need to maintain some construction-tim ### Program -A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator needs to be able to access variables in its ancessor blocks. +A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks. -Whenever we create a block, we need set its parent block to the current block, so the Python class `Program` needs to maintain a data member `current_block`. +Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`. ```python class Program(objects): @@ -81,13 +81,13 @@ class Block(objects): self.ops.prepend(Operator(self, ...)) ``` -`create_parameter` is necessary because parameters are global variables, those defined in the global block, but can be created in some sub-blocks, e.g., an FC layer in the step block of an RNN operator. +`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator. -`prepand_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. +`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. ### Operator -The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer output shape from input shape. +The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes. ```python class Operator(object): @@ -105,7 +105,7 @@ class Operator(object): return self.proto.type() ``` -`Operator` creates the `OpDesc` message in C++ space, so could it call the `InferShape` function, which is in C++. +`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++. ### Variable @@ -128,7 +128,7 @@ class Variable(object): self.writer = None ``` -Please be aware of `self.writer`, that tracks operator who creates the variable. It possible that there are more than one operators who write a variable, but in Python space, each writes to a variable is represented by a Variable class. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**. +Please be aware of `self.writer`, that tracks operator who creates the variable. It possible that there are more than one operators who write a variable, but in Python space, each write to a variable is represented by a Variable class. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**. ### Parameter @@ -155,7 +155,7 @@ class Parameter(Variable): initialize_op_attrs) ``` -When users create a parameter, s/he can call +When users create a parameter, they can call ```python program.create_parameter( diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index 951c7afbc14e2d9119169c1351d38ff0b67bdc5b..d696cdf5ad68118c98577fcaf1aed1cf013df296 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -106,6 +106,7 @@ message LoDTensorDesc { message VarDesc { required string name = 1; optional LoDTensorDesc lod_tensor = 2; + optional bool persistable = 3 [ default = false ]; } message BlockDesc { diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ea2ff3c50306c0b0db4c769129b6e7f2bab3a7ee --- /dev/null +++ b/paddle/operators/adagrad_op.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/adagrad_op.h" + +namespace paddle { +namespace operators { + +class AdagradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContextBase *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment"), + "Input(Moment) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdagradOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdagradOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), + "Output(MomentOut) of AdagradOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "LearningRate should have one element"); + auto param_dims = ctx->GetInputDim("Param"); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdagradOp should have the same dimension."); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment"), + "Param and Moment input of AdagradOp should have the same dimension."); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("MomentOut", param_dims); + } +}; + +class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AdagradOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Param", "(Tensor) Input parameter"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("Moment", "(Tensor) Second moment"); + AddInput("LearningRate", "(Tensor) Learning rate"); + + AddOutput("ParamOut", "(Tensor) Output parameter"); + AddOutput("MomentOut", "(Tensor) Output second moment"); + + AddAttr("epsilon", + "(float, default 1.0e-6) " + "Constant for numerical stability") + .SetDefault(1.0e-6f); + AddComment(R"DOC( + +Adaptive Gradient Algorithm (Adagrad). + +moment_out = moment + grad * grad +param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon) + +The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) +does not have the epsilon attribute. It is added here for numerical stability +by avoiding division by zero. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); +REGISTER_OP_CPU_KERNEL(adagrad, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a5b7951121360f78612f9008a522235104708112 --- /dev/null +++ b/paddle/operators/adagrad_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/adagrad_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(adagrad, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c5d8f751d3527f89b96d4274328ba0bb5f6efa44 --- /dev/null +++ b/paddle/operators/adagrad_op.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class AdagradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out_tensor = ctx.Output("ParamOut"); + auto moment_out_tensor = ctx.Output("MomentOut"); + + param_out_tensor->mutable_data(ctx.GetPlace()); + moment_out_tensor->mutable_data(ctx.GetPlace()); + + float epsilon = ctx.Attr("epsilon"); + + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto place = ctx.GetEigenDevice(); + + moment_out.device(place) = moment + grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(place) = + param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/framework/tests/test_adagrad_op.py b/python/paddle/v2/framework/tests/test_adagrad_op.py new file mode 100644 index 0000000000000000000000000000000000000000..66bad349e59b608cb3cc965401c81ef4c716b318 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_adagrad_op.py @@ -0,0 +1,69 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestAdagradOp1(OpTest): + ''' Test Adagrad operator with explicit attributes + ''' + + def setUp(self): + self.op_type = "adagrad" + + param = np.random.random((123, 321)).astype("float32") + grad = np.random.random((123, 321)).astype("float32") + moment = np.zeros((123, 321)).astype("float32") + lr = 0.01 + epsilon = 1e-8 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment': moment, + 'LearningRate': np.array([lr]).astype("float32") + } + + self.attrs = {'epsilon': epsilon} + + moment_out = moment + grad * grad + param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon) + + self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out} + + def test_check_output(self): + self.check_output() + + +class TestAdagradOp2(OpTest): + ''' Test Adagrad operator with default attributes + ''' + + def setUp(self): + self.op_type = "adagrad" + + param = np.random.random((123, 321)).astype("float32") + grad = np.random.random((123, 321)).astype("float32") + moment = np.zeros((123, 321)).astype("float32") + lr = 0.01 + epsilon = 1e-6 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment': moment, + 'LearningRate': np.array([lr]).astype("float32") + } + + self.attrs = {'epsilon': epsilon} + + moment_out = moment + grad * grad + param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon) + + self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out} + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main()