diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index f0601877c5d819710553da302382bb53c45701ff..a719da2560291dbc7e98aadfae41d4692d8afcad 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -183,15 +183,20 @@ set(DEPS_OPS array_to_lod_tensor_op lstm_op tensor_array_read_write_op - gru_op) + gru_op + adagrad_op + sgd_op) + op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cross_entropy_op DEPS cross_entropy) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) +op_library(sum_op DEPS selected_rows_functor) +op_library(sgd_op DEPS selected_rows_functor) +op_library(adagrad_op DEPS selected_rows_functor) op_library(conv_op DEPS vol2col) -op_library(sum_op DEPS net_op selected_rows_functor) op_library(pool_op DEPS pooling) op_library(pool_with_index_op DEPS pooling) op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index 8d1a2b7938d2c6607cbeb3cecb72d1d5b83dd8b9..d6686e3ef3165976cf4c077a7a0f213082aa7716 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -14,6 +14,11 @@ limitations under the License. */ #include "paddle/operators/adagrad_op.h" +#include + +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/selected_rows_functor.h" + namespace paddle { namespace operators { @@ -21,7 +26,7 @@ class AdagradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { + void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Param"), "Input(Param) of AdagradOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Grad"), @@ -54,8 +59,8 @@ class AdagradOp : public framework::OperatorWithKernel { class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { public: - AdagradOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + AdagradOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Param", "(Tensor) Input parameter"); AddInput("Grad", "(Tensor) Input gradient"); @@ -87,10 +92,85 @@ for numerical stability to avoid the division by zero error. )DOC"); } }; + +namespace { +size_t FindPos(const std::vector& rows, int64_t value) { + return std::find(rows.begin(), rows.end(), value) - rows.begin(); +} +} // namespace + +template +struct SparseAdagradFunctor { + void operator()(const platform::DeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param) { + // 1. g_m.rows = set(g.rows) + auto grad_rows = grad.rows(); + std::set row_set(grad_rows.begin(), grad_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto grad_width = grad.value().dims()[1]; + std::unique_ptr grad_merge{ + new framework::SelectedRows()}; + grad_merge->set_rows(merge_rows); + grad_merge->set_height(grad.height()); + grad_merge->mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), grad_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, grad_merge->mutable_value(), 0.0); + + auto* grad_merge_data = grad_merge->mutable_value()->data(); + auto* grad_data = grad.value().data(); + + for (size_t i = 0; i < grad_rows.size(); i++) { + size_t grad_merge_i = FindPos(merge_rows, grad_rows[i]); + for (int64_t j = 0; j < grad_width; j++) { + grad_merge_data[grad_merge_i * grad_width + j] += + grad_data[i * grad_width + j]; + } + } + + // 2. m += g_m * g_m + std::unique_ptr grad_square{ + new framework::SelectedRows()}; + grad_square->set_rows(grad_merge->rows()); + grad_square->set_height(grad_merge->height()); + grad_square->mutable_value()->mutable_data(grad_merge->value().dims(), + context.GetPlace()); + auto gs = + framework::EigenVector::Flatten(*(grad_square->mutable_value())); + auto gm = framework::EigenVector::Flatten(grad_merge->value()); + gs.device(*context.GetEigenDevice()) = gm * gm; + + math::SelectedRowsAddToTensor functor; + functor(context, *grad_square, moment); + + // 3. update parameter + auto* lr = learning_rate.data(); + auto* param_data = param->data(); + auto* moment_data = moment->data(); + + for (size_t i = 0; i < merge_rows.size(); i++) { + for (int64_t j = 0; j < grad_width; j++) { + param_data[merge_rows[i] * grad_width + j] -= + lr[0] * grad_merge_data[i * grad_width + j] / + (std::sqrt(moment_data[merge_rows[i] * grad_width + j]) + epsilon); + } + } + } +}; + +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); -REGISTER_OP_CPU_KERNEL(adagrad, - ops::AdagradOpKernel); +REGISTER_OP_CPU_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index a5b7951121360f78612f9008a522235104708112..5b869e6bc5f4604ba6055ffd62fa21e4a1f41b93 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -14,7 +14,138 @@ #define EIGEN_USE_GPU #include "paddle/operators/adagrad_op.h" +#include "paddle/operators/math/selected_rows_functor.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +namespace { + +template +__global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows, + T* grad_merge, const int64_t* grad_merge_rows, + size_t grad_merge_rows_size, + int64_t row_numel) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + __shared__ size_t grad_merge_idx; + + if (tid == 0) { + for (size_t i = 0; i < grad_merge_rows_size; i++) { + if (grad_rows[ty] == grad_merge_rows[i]) { + grad_merge_idx = i; + } + } + } + + __syncthreads(); + + grad += ty * row_numel; + grad_merge += grad_merge_idx * row_numel; + for (int index = tid; index < row_numel; index += block_size) { + paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]); + } +} + +template +__global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, + const T* learning_rate, T* param, + T* moment, int64_t row_numel, + T epsilon) { + const int ty = blockIdx.y; + int tid = threadIdx.x; + + grad += ty * row_numel; + param += rows[ty] * row_numel; + moment += rows[ty] * row_numel; + + for (int index = tid; index < row_numel; index += block_size) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. + paddle::platform::CudaAtomicAdd(param + index, + -1.0 * learning_rate[0] * grad[index] / + (sqrt(moment[index]) + epsilon)); + } +} +} // namespace + +template +struct SparseAdagradFunctor { + void operator()(const platform::DeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param) { + // 1. g_m.rows = set(g.rows) + auto grad_rows = grad.rows(); + std::set row_set(grad_rows.begin(), grad_rows.end()); + std::vector merge_rows(row_set.begin(), row_set.end()); + + auto grad_width = grad.value().dims()[1]; + std::unique_ptr grad_merge{ + new framework::SelectedRows()}; + grad_merge->set_rows(merge_rows); + grad_merge->set_height(grad.height()); + grad_merge->mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), grad_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, grad_merge->mutable_value(), 0.0); + + auto* grad_merge_data = grad_merge->mutable_value()->data(); + auto* grad_data = grad.value().data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + dim3 grid1(1, grad_rows.size()); + + MergeGradKernel< + T, 256><<(context) + .stream()>>>(grad_data, grad.rows().data(), + grad_merge_data, grad_merge->rows().data(), + grad_merge->rows().size(), grad_width); + + // 2. m += g_m * g_m + std::unique_ptr grad_square{ + new framework::SelectedRows()}; + grad_square->set_rows(grad_merge->rows()); + grad_square->set_height(grad_merge->height()); + grad_square->mutable_value()->mutable_data(grad_merge->value().dims(), + context.GetPlace()); + auto gs = + framework::EigenVector::Flatten(*(grad_square->mutable_value())); + auto gm = framework::EigenVector::Flatten(grad_merge->value()); + gs.device(*context.GetEigenDevice()) = gm * gm; + + math::SelectedRowsAddToTensor functor; + functor(context, *grad_square, moment); + + // 3. update parameter + auto* lr = learning_rate.data(); + auto* param_data = param->data(); + auto* moment_data = moment->data(); + + dim3 grid2(1, merge_rows.size()); + SparseAdagradFunctorKernel< + T, 256><<(context) + .stream()>>>(grad_merge_data, grad_merge->rows().data(), + lr, param_data, + moment_data, grad_width, epsilon); + } +}; + +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adagrad, - ops::AdagradOpKernel); +REGISTER_OP_GPU_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h index c5d8f751d3527f89b96d4274328ba0bb5f6efa44..4d4a6434c7c472d8ceb01edfc4050fbb009d6c9f 100644 --- a/paddle/operators/adagrad_op.h +++ b/paddle/operators/adagrad_op.h @@ -19,35 +19,59 @@ limitations under the License. */ namespace paddle { namespace operators { +template +struct SparseAdagradFunctor { + void operator()(const platform::DeviceContext& context, + const framework::SelectedRows& grad, + const framework::Tensor& learning_rate, T epsilon, + framework::Tensor* moment, framework::Tensor* param); +}; + template class AdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment_out_tensor = ctx.Output("MomentOut"); + auto* param_out_tensor = ctx.Output("ParamOut"); + auto* moment_out_tensor = ctx.Output("MomentOut"); param_out_tensor->mutable_data(ctx.GetPlace()); moment_out_tensor->mutable_data(ctx.GetPlace()); - float epsilon = ctx.Attr("epsilon"); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - auto moment = framework::EigenVector::Flatten( - *ctx.Input("Moment")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); - - moment_out.device(place) = moment + grad * grad; - Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = - param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + T epsilon = static_cast(ctx.Attr("epsilon")); + + auto* grad_var = ctx.InputVar("Grad"); + if (grad_var->IsType()) { + auto param = framework::EigenVector::Flatten( + *ctx.Input("Param")); + auto grad = framework::EigenVector::Flatten( + *ctx.Input("Grad")); + auto moment = framework::EigenVector::Flatten( + *ctx.Input("Moment")); + auto lr = framework::EigenVector::Flatten( + *ctx.Input("LearningRate")); + + auto param_out = framework::EigenVector::Flatten(*param_out_tensor); + auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); + auto place = ctx.GetEigenDevice(); + + moment_out.device(place) = moment + grad * grad; + Eigen::DSizes m_dsize(moment_out_tensor->numel()); + param_out.device(place) = + param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); + } else if (grad_var->IsType()) { + auto* param_tensor = ctx.Input("Param"); + PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); + + auto* moment_tensor = ctx.Input("Moment"); + PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); + + SparseAdagradFunctor functor; + functor(ctx.device_context(), *ctx.Input("Grad"), + *ctx.Input("LearningRate"), epsilon, + moment_out_tensor, param_out_tensor); + } else { + PADDLE_THROW("Unsupported Variable Type of Grad"); + } } }; diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 2f41c7fc121950926f6e8d842eb629d59738f321..7b6c5ec30628b521b594ceaa3b7f1e0e03e497e4 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -20,11 +20,11 @@ namespace paddle { namespace operators { namespace { -template +template __global__ void SparseSGDFunctorKernel(const T* selected_rows, const int64_t* rows, const T* learning_rate, T* tensor_out, - int64_t row_numel, int block_size) { + int64_t row_numel) { const int ty = blockIdx.y; int tid = threadIdx.x; @@ -59,14 +59,15 @@ struct SparseSGDFunctor { auto* in_data = in_value.data(); auto* out_data = output->data(); - int block_size = 256; + const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in_rows.size()); SparseSGDFunctorKernel< - T><<(context) - .stream()>>>(in_data, in_rows.data(), learning_rate.data(), - out_data, in_row_numel, block_size); + T, 256><<(context) + .stream()>>>(in_data, in_rows.data(), + learning_rate.data(), out_data, + in_row_numel); } }; diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index 57b99bdb3a9359bbfdbe62a6fc9afca6c4d5df9e..9837f325e30f68ba927a540d395cc7d7e093a607 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -12,7 +12,6 @@ limitations under the License. */ #include "paddle/operators/sum_op.h" #include #include "paddle/framework/var_type_inference.h" -#include "paddle/operators/net_op.h" namespace paddle { namespace operators { diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py index 66bad349e59b608cb3cc965401c81ef4c716b318..903e84c32887100bbeef6ebf81f66f06f084fab5 100644 --- a/python/paddle/v2/fluid/tests/test_adagrad_op.py +++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py @@ -1,6 +1,9 @@ import unittest import numpy as np +import paddle.v2.fluid.core as core +from paddle.v2.fluid.op import Operator from op_test import OpTest +import math class TestAdagradOp1(OpTest): @@ -65,5 +68,110 @@ class TestAdagradOp2(OpTest): self.check_output() +class TestSparseAdagradOp(unittest.TestCase): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Grad Variable + height = 10 + rows = [0, 4, 7, 4] + row_numel = 12 + + grad_selected_rows = scope.var('Grad').get_selected_rows() + grad_selected_rows.set_height(height) + grad_selected_rows.set_rows(rows) + np_array = np.ones((len(rows), row_numel)).astype("float32") + np_array[0, 0] = 2.0 + np_array[2, 8] = 4.0 + + grad_tensor = grad_selected_rows.get_tensor() + grad_tensor.set(np_array, place) + + # create and initialize Param Variable + param = scope.var('Param').get_tensor() + param_array = np.full((height, row_numel), 5.0).astype("float32") + param.set(param_array, place) + + # create and initialize LeraningRate Variable + lr = scope.var('LearningRate').get_tensor() + lr_array = np.full((1), 2.0).astype("float32") + lr.set(lr_array, place) + + # create and initialize moment Variable + moment = scope.var('Moment').get_tensor() + moment_np_array = np.full((height, row_numel), 2.0).astype("float32") + moment.set(moment_np_array, place) + + # create and run sgd operator + adagrad_op = Operator( + "adagrad", + Param='Param', + Grad='Grad', + ParamOut='Param', + Moment='Moment', + MomentOut='Moment', + LearningRate='LearningRate', + epsilon=2.0) + + ctx = core.DeviceContext.create(place) + adagrad_op.run(scope, ctx) + + # get and compare moment result + moment_result_array = np.array(moment) + + self.assertAlmostEqual(6.0, moment_result_array[rows[0], 0]) + self.assertAlmostEqual(3.0, moment_result_array[rows[0], 2]) + self.assertAlmostEqual(2.0, moment_result_array[1, 0]) + # 2.0 + (1.0 + 1.0)^2 + self.assertAlmostEqual(6.0, moment_result_array[rows[1], 10]) + self.assertAlmostEqual(6.0, moment_result_array[rows[3], 4]) + + self.assertAlmostEqual(2.0, moment_result_array[5, 8]) + self.assertAlmostEqual(3.0, moment_result_array[rows[2], 1]) + self.assertAlmostEqual(18.0, moment_result_array[rows[2], 8]) + + # get and compare param result + result_array = np.array(param) + + def get_out(param, lr, grad, m, epsilon): + return param - lr * grad / (math.sqrt(m) + epsilon) + + self.assertAlmostEqual( + get_out(5.0, 2.0, 2.0, 6.0, 2.0), + result_array[rows[0], 0], + places=5) + self.assertAlmostEqual( + get_out(5.0, 2.0, 1.0, 3.0, 2.0), + result_array[rows[0], 2], + places=5) + self.assertAlmostEqual( + get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[1, 0], places=5) + + # grad_merge = 1.0 + 1.0 + # m = 6.0 + self.assertAlmostEqual( + get_out(5.0, 2.0, 2.0, 6.0, 2.0), + result_array[rows[1], 10], + places=5) + + self.assertAlmostEqual( + get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[5, 8], places=5) + self.assertAlmostEqual( + get_out(5.0, 2.0, 1.0, 3.0, 2.0), + result_array[rows[2], 1], + places=5) + self.assertAlmostEqual( + get_out(5.0, 2.0, 4.0, 18.0, 2.0), + result_array[rows[2], 8], + places=5) + + def test_sparse_adagrad(self): + places = [core.CPUPlace()] + if core.is_compile_gpu(): + places.append(core.GPUPlace(0)) + for place in places: + self.check_with_place(place) + + if __name__ == "__main__": unittest.main()