diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 757cac4e4ffce442677eac99bc932f08e6b1cac1..37dafa5c4908fa5434eb941dda991c8276409d31 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -301,23 +301,9 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, platform::bfloat16>;
 
 template <typename T>
-typename std::enable_if<std::is_same<T, platform::bfloat16>::value>::type
-elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
-                   const T* in, T* out) {
-#ifdef PADDLE_WITH_MKLDNN
-  onednn_handler_axpy(data_len, T(1.f), in, out);
-#else
-  blas->AXPY(data_len, T(1.f), in, out);
-#endif
-}
-
-template <typename T>
-typename std::enable_if<std::is_same<T, float>::value ||
-                        std::is_same<T, double>::value ||
-                        std::is_same<T, platform::complex<float>>::value ||
-                        std::is_same<T, platform::complex<double>>::value>::type
-elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
-                   const T* in, T* out) {
+typename std::enable_if<!std::is_integral<T>::value>::type elementwise_add_to(
+    BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len, const T* in,
+    T* out) {
   blas->AXPY(data_len, T(1.f), in, out);
 }
 
 template <typename T>
@@ -330,6 +316,64 @@ typename std::enable_if<std::is_integral<T>::value>::type elementwise_add_to(
   }
 }
 
+template <typename T>
+typename std::enable_if<std::is_same<T, platform::bfloat16>::value>::type
+add_sparse_inputs(const std::vector<const framework::SelectedRows*>& inputs,
+                  const std::unordered_map<int64_t, size_t>& rows_to_id,
+                  int64_t input_width,
+                  const platform::CPUDeviceContext& context, T* out_data) {
+#ifndef PADDLE_WITH_MKLDNN
+  auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+#endif
+  for (auto* input : inputs) {
+    if (input->rows().size() == 0) {
+      continue;
+    }
+    auto* input_data = input->value().data<T>();
+    auto& input_rows = input->rows();
+
+#ifdef PADDLE_WITH_MKLDNN
+    OneDNNAXPYHandler<T> axpy_handler(input_width, T(1.f));
+    for (size_t i = 0; i < input_rows.size(); i++) {
+      size_t out_i = rows_to_id.at(input_rows[i]);
+      axpy_handler(&input_data[i * input_width],
+                   &out_data[out_i * input_width]);
+    }
+#else
+    for (size_t i = 0; i < input_rows.size(); i++) {
+      size_t out_i = rows_to_id.at(input_rows[i]);
+      elementwise_add_to(&blas, static_cast<size_t>(input_width),
+                         &input_data[i * input_width],
+                         &out_data[out_i * input_width]);
+    }
+#endif
+  }
+}
+
+template <typename T>
+typename std::enable_if<!std::is_same<T, platform::bfloat16>::value>::type
+add_sparse_inputs(const std::vector<const framework::SelectedRows*>& inputs,
+                  const std::unordered_map<int64_t, size_t>& rows_to_id,
+                  int64_t input_width,
+                  const platform::CPUDeviceContext& context, T* out_data) {
+  VLOG(4) << "[CPU] add_sparse_inputs <" << typeid(T).name();
+  auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+  for (auto* input : inputs) {
+    if (input->rows().size() == 0) {
+      continue;
+    }
+    auto* input_data = input->value().data<T>();
+    auto& input_rows = input->rows();
+
+    for (size_t i = 0; i < input_rows.size(); i++) {
+      size_t out_i = rows_to_id.at(input_rows[i]);
+      elementwise_add_to(&blas, static_cast<size_t>(input_width),
+                         &input_data[i * input_width],
+                         &out_data[out_i * input_width]);
+    }
+  }
+}
+
 template <typename T>
 struct MergeAdd<platform::CPUDeviceContext, T> {
   framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
@@ -435,21 +479,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
         rows_to_id[merge_rows[i]] = i;
       }
 
-      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-      for (auto* input : inputs) {
-        if (input->rows().size() == 0) {
-          continue;
-        }
-        auto* input_data = input->value().data<T>();
-        auto& input_rows = input->rows();
-
-        for (size_t i = 0; i < input_rows.size(); i++) {
-          size_t out_i = rows_to_id[input_rows[i]];
-          elementwise_add_to(&blas, static_cast<size_t>(input_width),
-                             &input_data[i * input_width],
-                             &out_data[out_i * input_width]);
-        }
-      }
+      add_sparse_inputs<T>(inputs, rows_to_id, input_width, context, out_data);
     }
   }
 };
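The refactor above funnels the bfloat16 `SelectedRows` accumulation through `OneDNNAXPYHandler`, selecting the implementation at compile time with `std::enable_if`. For readers unfamiliar with that SFINAE dispatch, here is a minimal, self-contained sketch of the same pattern; the `bfloat16` stub and `accumulate` function are illustrative stand-ins, not code from the patch:

```cpp
#include <cstdio>
#include <type_traits>

// Stand-in for platform::bfloat16; only the type identity matters here.
struct bfloat16 { unsigned short x; };

// Overload chosen only when T is bfloat16 (routed to oneDNN in the patch).
template <typename T>
typename std::enable_if<std::is_same<T, bfloat16>::value>::type
accumulate(const T*, T*, size_t) {
  std::puts("bfloat16 path (oneDNN AXPY in the real code)");
}

// Overload for every other type (plain BLAS AXPY / loop in the patch).
template <typename T>
typename std::enable_if<!std::is_same<T, bfloat16>::value>::type
accumulate(const T* in, T* out, size_t n) {
  for (size_t i = 0; i < n; ++i) out[i] += in[i];
}

int main() {
  float a[2] = {1.f, 2.f}, b[2] = {3.f, 4.f};
  accumulate(a, b, 2);   // generic path
  bfloat16 c[1]{}, d[1]{};
  accumulate(c, d, 1);   // bfloat16 path
}
```

Because the two `enable_if` conditions are mutually exclusive, exactly one overload survives overload resolution for any given T, so the dispatch costs nothing at runtime.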
diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc
index 76101f19ab618c8474ee5f1210a51f39c8f4955e..ed265edf003e01261286b95bd101f98b6bd71477 100644
--- a/paddle/fluid/operators/mkldnn/axpy_handler.cc
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc
@@ -22,7 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -34,76 +33,46 @@ namespace plat = paddle::platform;
 namespace {
 
 template <typename T>
-class AXPYMKLDNNHandler : public plat::MKLDNNHandlerT<T, dnnl::reorder> {
+class AXPYHandler {
  public:
-  AXPYMKLDNNHandler(const plat::MKLDNNDeviceContext &dev_ctx,
-                    const dnnl::engine mkldnn_engine, plat::Place cpu_place,
-                    int n, float alpha)
-      : plat::MKLDNNHandlerT<T, dnnl::reorder>(
-            dev_ctx, mkldnn_engine, cpu_place,
-            plat::CreateKey(dev_ctx, static_cast<int64_t>(n),
-                            plat::MKLDNNGetDataType<T>(), alpha, "-axpy")),
-        alpha_(alpha),
-        n_(n) {}
-
-  std::shared_ptr<dnnl::memory> AcquireMemory(void *ptr,
-                                              const std::string &suffix) {
-    /*Generate key*/
-    auto local_key = this->key_ + suffix;
-    auto mem_p = std::static_pointer_cast<dnnl::memory>(
-        this->dev_ctx_.GetBlob(local_key));
-    if (mem_p == nullptr) {
-      auto md = dnnl::memory::desc({n_}, plat::MKLDNNGetDataType<T>(),
-                                   dnnl::memory::format_tag::x);
-      mem_p = std::make_shared<dnnl::memory>(md, this->engine_, ptr);
-      this->dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
+  AXPYHandler(const dnnl::engine mkldnn_engine, int n, float alpha) {
+    platform::MKLDNNDeviceContext::tls().log_lib_version();
+    auto md = dnnl::memory::desc({n}, plat::MKLDNNGetDataType<T>(),
+                                 dnnl::memory::format_tag::x);
+    src_mem_ = dnnl::memory(md, mkldnn_engine, DNNL_MEMORY_NONE);
+    dst_mem_ = dnnl::memory(md, mkldnn_engine, DNNL_MEMORY_NONE);
+    dnnl::primitive_attr reorder_attr;
+    dnnl::post_ops post_operations;
+    if (alpha != 1.f) {
+      std::vector<float> scales(1, alpha);
+      reorder_attr.set_output_scales(0, scales);
     }
-    return mem_p;
-  }
+    post_operations.append_sum(1.0f);
 
-  std::shared_ptr<dnnl::memory> AcquireSrcMemory(const T *x) {
-    return this->AcquireMemory(plat::to_void_cast<T>(x), "@user_src_mem_p");
+    reorder_attr.set_post_ops(post_operations);
+    reorder_p_ = dnnl::reorder(src_mem_, dst_mem_, reorder_attr);
   }
 
-  std::shared_ptr<dnnl::memory> AcquireDstMemory(T *y) {
-    return this->AcquireMemory(y, "@user_dst_mem_p");
+  dnnl::memory &AcquireSrcMemory(const T *x) {
+    src_mem_.set_data_handle(plat::to_void_cast<T>(x));
+    return src_mem_;
   }
 
-  std::shared_ptr<dnnl::reorder> AcquireReorder(
-      std::shared_ptr<dnnl::memory> dst_memory_p,
-      std::shared_ptr<dnnl::memory> src_memory_p) {
-    auto prim_key = this->key_ + "@reorder_p";
-    auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
-        this->dev_ctx_.GetBlob(prim_key));
-    if (reorder_p == nullptr) {
-      // Here we pass Postops to mimick y -> a*X + y
-      dnnl::primitive_attr reorder_attr;
-      dnnl::post_ops post_operations;
-      if (this->alpha_ != 1.f) {
-        std::vector<float> scales(1, this->alpha_);
-        reorder_attr.set_output_scales(0, scales);
-      }
-      post_operations.append_sum(1.0f);
-
-      reorder_attr.set_post_ops(post_operations);
-      reorder_p = std::make_shared<dnnl::reorder>(
-          *(src_memory_p), *(dst_memory_p), reorder_attr);
-      this->dev_ctx_.SetBlob(prim_key, reorder_p);
-    }
-    return reorder_p;
+  dnnl::memory &AcquireDstMemory(T *y) {
+    dst_mem_.set_data_handle(y);
+    return dst_mem_;
   }
 
+  const dnnl::reorder &AcquireReorder() { return reorder_p_; }
+
  private:
-  float alpha_;
-  int n_;
+  dnnl::memory src_mem_;
+  dnnl::memory dst_mem_;
+  dnnl::reorder reorder_p_;
 };
 
-template class AXPYMKLDNNHandler<float>;
-template class AXPYMKLDNNHandler<plat::bfloat16>;
-
-}  // anonnymouse namespace
+template class AXPYHandler<float>;
+template class AXPYHandler<plat::bfloat16>;
 
 template <typename T>
 static void naive_axpy(int n, T alpha, const T *x, T *y) {
@@ -114,39 +83,60 @@ static void naive_axpy(int n, T alpha, const T *x, T *y) {
   }
 }
 
+}  // anonymous namespace
+
 template <typename T>
-void onednn_handler_axpy(int n, T alpha, const T *x, T *y) {
-  // fallback to naive version
-  if (n < 100) {
-    naive_axpy(n, alpha, x, y);
-    return;
-  }
+class OneDNNAXPYHandler<T>::Impl {
+ public:
+  Impl(int64_t n, T alpha);
+  void operator()(const T *x, T *y);
+
+ private:
+  std::unique_ptr<AXPYHandler<T>> handler_;
+  int64_t n_;
+  T alpha_;
+};
 
+template <typename T>
+OneDNNAXPYHandler<T>::Impl::Impl(int64_t n, T alpha) : n_{n}, alpha_{alpha} {
   auto &pool = plat::DeviceContextPool::Instance();
   auto cpu_place = plat::CPUPlace();
   auto *dev_ctx =
       dynamic_cast<plat::MKLDNNDeviceContext *>(pool.Get(cpu_place));
   auto &cpu_engine = dev_ctx->GetEngine();
+  handler_ = std::make_unique<AXPYHandler<T>>(cpu_engine, n,
+                                              static_cast<float>(alpha));
+}
 
-  AXPYMKLDNNHandler<T> handler(*dev_ctx, cpu_engine, cpu_place, n,
-                               static_cast<float>(alpha));
-
-  auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
-  auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
-  auto reorder_p =
-      handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
+template <typename T>
+void OneDNNAXPYHandler<T>::Impl::operator()(const T *x, T *y) {
+  if (this->n_ < 100) {
+    naive_axpy(this->n_, this->alpha_, x, y);
+    return;
+  }
+  auto &reorder_src_mem_p = handler_->AcquireSrcMemory(x);
+  auto &reorder_dst_mem_p = handler_->AcquireDstMemory(y);
+  auto reorder_p = handler_->AcquireReorder();
   auto &astream = plat::MKLDNNDeviceContext::tls().get_stream();
-  plat::RecordEvent record_reorder("axpy_int_reorder",
-                                   plat::EventRole::kUniqueOp);
-  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+  reorder_p.execute(astream, reorder_src_mem_p, reorder_dst_mem_p);
   astream.wait();
 }
 
-template void onednn_handler_axpy<float>(int, float, const float *, float *);
-template void onednn_handler_axpy<plat::bfloat16>(int, plat::bfloat16,
-                                                  const plat::bfloat16 *,
-                                                  plat::bfloat16 *);
+template <typename T>
+OneDNNAXPYHandler<T>::OneDNNAXPYHandler(int64_t n, T alpha)
+    : pimpl_{new Impl{n, alpha}, [](Impl *impl) { delete impl; }} {
+  VLOG(4) << "[OneDNN] OneDNNAXPYHandler<" << typeid(T).name() << ">, "
+          << "n: " << n << ", alpha: " << alpha;
+}
+
+template <typename T>
+void OneDNNAXPYHandler<T>::operator()(const T *x, T *y) {
+  pimpl_->operator()(x, y);
+}
+
+template class OneDNNAXPYHandler<float>;
+template class OneDNNAXPYHandler<plat::bfloat16>;
 
 }  // namespace operators
 }  // namespace paddle
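The replacement `AXPYHandler` caches two zero-copy `dnnl::memory` objects and one `dnnl::reorder` whose attributes implement `y = alpha * x + y`: the output scale supplies `alpha`, and the `sum` post-op adds the previous contents of the destination back in. A standalone sketch of that trick against the oneDNN 2.x C++ API (engine and stream setup here are illustrative; the patch reuses Paddle's cached engine and thread-local stream instead):

```cpp
#include <vector>
#include "dnnl.hpp"  // oneDNN 2.x C++ API

// Computes y = alpha * x + y for n contiguous floats via a reorder primitive.
void reorder_axpy(float alpha, const float *x, float *y, int n) {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream strm(eng);

  dnnl::memory::desc md({n}, dnnl::memory::data_type::f32,
                        dnnl::memory::format_tag::x);
  dnnl::memory src(md, eng, const_cast<float *>(x));
  dnnl::memory dst(md, eng, y);

  dnnl::primitive_attr attr;
  attr.set_output_scales(0, {alpha});  // scale the source by alpha
  dnnl::post_ops ops;
  ops.append_sum(1.0f);                // add 1.0 * (previous dst) back in
  attr.set_post_ops(ops);

  dnnl::reorder(src, dst, attr).execute(strm, src, dst);
  strm.wait();
}
```

Compared with the deleted `AXPYMKLDNNHandler`, this drops the blob-cache key machinery entirely: the primitive lives in the handler object, and `set_data_handle` repoints the cached memory objects at new buffers on every call.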
diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.h b/paddle/fluid/operators/mkldnn/axpy_handler.h
index 8f0fdeb5c02b439e7e531af07728f8d047e32b7c..677fe3b010c2431bc05097f29d96a89513997866 100644
--- a/paddle/fluid/operators/mkldnn/axpy_handler.h
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.h
@@ -13,21 +13,47 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include <memory>
+
 namespace paddle {
 namespace operators {
 
 ///
-/// @brief Helper function to execute AXPY using oneDNN.
-///
-/// @param[in] n     The number of elements in tensor (assumed 1D)
-/// @param[in] alpha The alpha coefficient.
-/// @param[in] x     The pointer to input X tensor.
-/// @param     y     The pointer to output Y tensor.
+/// @brief Helper class for AXPY execution using oneDNN library.
 ///
-/// @tparam    T     Data type.
+/// @tparam T Data type.
 ///
 template <typename T>
-void onednn_handler_axpy(int n, T alpha, const T *x, T *y);
+class OneDNNAXPYHandler {
+ public:
+  OneDNNAXPYHandler(OneDNNAXPYHandler&) = delete;
+  OneDNNAXPYHandler(OneDNNAXPYHandler&&) = delete;
+  OneDNNAXPYHandler& operator=(OneDNNAXPYHandler&) = delete;
+  OneDNNAXPYHandler& operator=(OneDNNAXPYHandler&&) = delete;
+  ///
+  /// @brief Constructor.
+  ///
+  /// @param[in] n     The number of elements in tensor (assumed 1D tensor)
+  /// @param[in] alpha The alpha coefficient.
+  ///
+  OneDNNAXPYHandler(int64_t n, T alpha);
+  ///
+  /// @brief Executes AXPY.
+  ///
+  /// @param[in]  x The pointer to input X tensor data.
+  /// @param[out] y The pointer to output Y tensor data.
+  ///
+  void operator()(const T* x, T* y);
+
+ private:
+  OneDNNAXPYHandler() = delete;
+  // (arogowie-intel) Private implementation idiom to hide dependency
+  // on oneDNN headers.
+  class Impl;
+  // We need a custom deleter, since the compiler is unable to parameterize
+  // an allocator's default deleter due to incomplete type.
+  std::unique_ptr<Impl, void (*)(Impl*)> pimpl_;
+};
 
 }  // namespace operators
 }  // namespace paddle
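The intended usage is construct once, apply many times: the reorder primitive is built for a fixed length and alpha, then invoked per row. A hedged sketch of calling the handler (assuming a Paddle build with `PADDLE_WITH_MKLDNN`; the function and variable names are illustrative, not from the patch):

```cpp
#include <cstdint>
#include "paddle/fluid/operators/mkldnn/axpy_handler.h"

// Applies out[rows[i]] -= lr * grad[i] for each of row_count contiguous
// segments of length `width`, reusing one cached oneDNN primitive throughout.
void sparse_axpy_update(const float *grad, float *out, const int64_t *rows,
                        size_t row_count, int64_t width, float lr) {
  paddle::operators::OneDNNAXPYHandler<float> axpy(width, -lr);
  for (size_t i = 0; i < row_count; ++i) {
    axpy(grad + i * width, out + rows[i] * width);
  }
}
```

Note the class is deliberately non-copyable and non-movable: the pimpl owns a primitive bound to the cached memory objects, so sharing it implicitly would be unsafe.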
diff --git a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..450ef376799d3d383f3fa55f65850ca73e6c51a3
--- /dev/null
+++ b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc
@@ -0,0 +1,85 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cstring>
+
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#include "paddle/fluid/operators/optimizers/sgd_op.h"
+
+namespace pplat = paddle::platform;
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class SGDOneDNNKernel : public SGDOpKernel<pplat::CPUDeviceContext, T> {
+ protected:
+  void dense_param_and_grad_kernel(
+      const framework::ExecutionContext &ctx) const override {
+    VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel";
+    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+    const auto *param = ctx.Input<framework::Tensor>("Param");
+    auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
+    const auto *grad = ctx.Input<framework::Tensor>("Grad");
+
+    auto *out_data = param_out->mutable_data<T>(ctx.GetPlace());
+    const T *param_data = param->data<T>();
+    const auto *grad_data = grad->data<T>();
+    const auto *lr = learning_rate->data<T>();
+    // Since dense SGD is not an in-place operation, first copy params to the
+    // output tensor and then update it.
+    std::memcpy(out_data, param_data, param->memory_size());
+    OneDNNAXPYHandler<T>(param_out->numel(), -lr[0])(grad_data, out_data);
+  }
+
+  void dense_param_sparse_grad_kernel(
+      const framework::ExecutionContext &ctx) const override {
+    VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel";
+    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+    auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
+    const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
+
+    const auto &grad_value = grad->value();
+    const auto &grad_rows = grad->rows();
+    const auto grad_height = grad->height();
+    const int64_t grad_val_height = static_cast<int64_t>(grad_rows.size());
+    const auto grad_width = grad_value.numel() / grad_val_height;
+
+    const auto *grad_data = grad_value.data<T>();
+    auto *out_data = param_out->data<T>();
+    const auto *lr = learning_rate->data<T>();
+
+    OneDNNAXPYHandler<T> axpy_handler(grad_width, -lr[0]);
+
+    for (size_t i = 0; i < grad_rows.size(); ++i) {
+      PADDLE_ENFORCE_LT(
+          grad_rows[i], grad_height,
+          pplat::errors::OutOfRange(
+              "Grad rows index value should be less than grad height. "
+              "Got [%s], but expected less than [%s]",
+              grad_rows[i], grad_height));
+      const int64_t row = grad_rows[i];
+      const auto *src = grad_data + i * grad_width;
+      auto *dst = out_data + row * grad_width;
+      axpy_handler(src, dst);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(sgd, MKLDNN, pplat::CPUPlace, ops::SGDOneDNNKernel<float>,
+                   ops::SGDOneDNNKernel<pplat::bfloat16>);
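Numerically, the sparse-gradient kernel above performs the plain SGD row update `param[rows[i]] -= lr * grad[i]`, just batched through one cached AXPY primitive. A scalar reference version for comparison (illustrative, not part of the patch):

```cpp
#include <cstddef>
#include <cstdint>

// Scalar reference for the dense-param / sparse-grad SGD update:
// for each selected row r = rows[i]:  param[r][0..width) -= lr * grad[i][0..width)
void sgd_sparse_grad_reference(float *param, const float *grad,
                               const int64_t *rows, size_t row_count,
                               int64_t width, float lr) {
  for (size_t i = 0; i < row_count; ++i) {
    float *dst = param + rows[i] * width;
    const float *src = grad + i * width;
    for (int64_t j = 0; j < width; ++j) {
      dst[j] -= lr * src[j];
    }
  }
}
```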
diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc
index 9603411ec4513a585f19d72be2902effe74d360c..b2e258f815d7256501de462d15b2ac2668fd6dc3 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op.cc
@@ -15,6 +15,9 @@ limitations under the License. */
 #include <string>
 
 #include "paddle/fluid/operators/optimizers/sgd_op.h"
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
 
 namespace paddle {
 namespace operators {
@@ -67,6 +70,26 @@ class SGDOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Param");
+
+#ifdef PADDLE_WITH_MKLDNN
+    using mkldnn::memory;
+    if (this->CanMKLDNNBeUsed(ctx, data_type)) {
+      const auto *param_var = ctx.InputVar("Param");
+      const auto *grad_var = ctx.InputVar("Grad");
+
+      // supported cases
+      bool dense_param_sparse_grad =
+          param_var->IsType<framework::LoDTensor>() &&
+          grad_var->IsType<framework::SelectedRows>();
+      bool dense_param_and_grad = param_var->IsType<framework::LoDTensor>() &&
+                                  grad_var->IsType<framework::LoDTensor>();
+
+      if (dense_param_sparse_grad || dense_param_and_grad)
+        return framework::OpKernelType(data_type, ctx.GetPlace(),
+                                       framework::DataLayout::kMKLDNN,
+                                       framework::LibraryType::kMKLDNN);
+    }
+#endif
     return framework::OpKernelType(data_type, ctx.device_context());
   }
 
@@ -106,6 +129,10 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("ParamOut",
               "(Tensor or SelectedRows, same with Param) "
              "Output parameter, should share the same memory with Param");
+    AddAttr<bool>(
+        "use_mkldnn",
+        "(bool, default false) Indicates if MKL-DNN kernel will be used")
+        .SetDefault(false);
     AddComment(R"DOC(
 
 SGD operator
diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index 076afdc655386c080e3fde99fbba42d3acf59651..a1fb3debb48e6ebe59eb662778dab701eaeaaf92 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -19,9 +19,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/jit/kernels.h"
-#ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
-#endif
 #include "paddle/fluid/platform/bfloat16.h"
 
 namespace paddle {
@@ -142,98 +139,13 @@ struct sgd_dense_param_kernel<
                             "Got [%s], but expected less than [%s]",
                             grad_rows[i], grad_height));
       const int64_t row = grad_rows[i];
-#ifdef PADDLE_WITH_MKLDNN
-      operators::onednn_handler_axpy(grad_width, -lr[0],
-                                     grad_data + i * grad_width,
-                                     out_data + row * grad_width);
-#else
       for (int64_t j = 0; j < grad_width; ++j) {
         out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j];
       }
-#endif
     }
   }
 };
 
-template <typename T>
-void sgd_op_invoke_dense_param_kernel(const framework::ExecutionContext &ctx) {
-  const auto *param = ctx.Input<framework::Tensor>("Param");
-  auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
-  const auto *grad_var = ctx.InputVar("Grad");
-
-  if (grad_var->IsType<framework::LoDTensor>()) {
-    const auto *grad = ctx.Input<framework::Tensor>("Grad");
-    const auto sz = param_out->numel();
-    PADDLE_ENFORCE_EQ(param->numel(), sz,
-                      platform::errors::InvalidArgument(
-                          "The input tensor Param's numel of SgdOp "
-                          "should be equal with ParamOut's numel. "
-                          "But received Param's "
-                          "numel = [%s], ParamOut's numel = [%s]",
-                          param->numel(), sz));
-    PADDLE_ENFORCE_EQ(grad->numel(), sz,
-                      platform::errors::InvalidArgument(
-                          "The input tensor Grad's numel of SgdOp "
-                          "should be equal with ParamOut's numel. "
-                          "But received Grad's "
-                          "numel = [%s], ParamOut's numel = [%s]",
-                          grad->numel(), sz));
-
-    sgd_dense_param_kernel<
-        T, framework::VarTypeTrait<framework::LoDTensor>::kId>()(ctx);
-  } else if (grad_var->IsType<framework::SelectedRows>()) {
-    // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
-    // This manual optimization brings difficulty to track data dependency.
-    // It's better to find a more elegant solution.
-    PADDLE_ENFORCE_EQ(param, param_out,
-                      platform::errors::InvalidArgument(
-                          "The input tensor Param of SgdOp "
-                          "should be equal with ParamOut if variable's "
-                          "type is SelectedRows. "));
-    const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
-
-    // for distributed training, a sparse var may be empty,
-    // just skip updating.
-    if (grad->rows().size() == 0) {
-      return;
-    }
-
-    auto out_dims = param_out->dims();
-    PADDLE_ENFORCE_EQ(
-        grad->height(), out_dims[0],
-        platform::errors::InvalidArgument(
-            "The input tensor Grad's height of SgdOp "
-            "should be equal with ParamOut's dims. But received Grad's "
-            "height [%s] and ParamOut's dims [%s]",
-            grad->height(), out_dims[0]));
-
-    auto &grad_value = grad->value();
-    auto &grad_rows = grad->rows();
-    const auto param_height = param_out->dims()[0];
-    const auto param_width = param_out->numel() / param_height;
-    // note: it is not grad->height()
-    const auto grad_height = static_cast<int64_t>(grad_rows.size());
-    const auto grad_width = grad_value.numel() / grad_height;
-
-    PADDLE_ENFORCE_EQ(
-        grad_width, param_width,
-        platform::errors::InvalidArgument(
-            "The grad_value's numel of SgdOp "
-            "should be equal with param_out's numel. But received "
-            "grad_value's numel [%s] and param_out's numel [%s]",
-            grad_width, param_width));
-
-    sgd_dense_param_kernel<
-        T, framework::VarTypeTrait<framework::SelectedRows>::kId>()(ctx);
-  } else {
-    PADDLE_ENFORCE_EQ(
-        false, true, platform::errors::PermissionDenied(
-                         "Unsupported Variable Type of Grad in SgdOp. Excepted "
-                         "LodTensor or SelectedRows, But received [%s]",
-                         paddle::framework::ToTypeName(grad_var->Type())));
-  }
-}
-
 }  // namespace detail
 
 template <typename DeviceContext, typename T>
@@ -247,61 +159,157 @@ class SGDOpKernel<platform::CPUDeviceContext, T>
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
-
     const auto *param_var = ctx.InputVar("Param");
-    const auto *grad_var = ctx.InputVar("Grad");
 
     if (param_var->IsType<framework::LoDTensor>()) {
-      detail::sgd_op_invoke_dense_param_kernel<T>(ctx);
+      invoke_dense_param_kernel(ctx);
     } else if (param_var->IsType<framework::SelectedRows>()) {
-      PADDLE_ENFORCE_EQ(grad_var->IsType<framework::SelectedRows>(), true,
+      sparse_param_and_grad_kernel(ctx);
+    } else {
+      PADDLE_ENFORCE_EQ(
+          false, true,
+          platform::errors::PermissionDenied(
+              "Unsupported Variable Type of Parameter in SgdOp. Expected "
+              "LoDTensor or SelectedRows, But received [%s]",
+              paddle::framework::ToTypeName(param_var->Type())));
+    }
+  }
+
+ protected:
+  void invoke_dense_param_kernel(const framework::ExecutionContext &ctx) const {
+    const auto *param = ctx.Input<framework::Tensor>("Param");
+    auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
+    const auto *grad_var = ctx.InputVar("Grad");
+
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      const auto *grad = ctx.Input<framework::Tensor>("Grad");
+      const auto sz = param_out->numel();
+      PADDLE_ENFORCE_EQ(param->numel(), sz,
                         platform::errors::InvalidArgument(
-                            "When param is SelectedRows, gradient should also "
-                            "be SelectedRows"));
-      const auto &param = param_var->Get<framework::SelectedRows>();
-      auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
-      const auto &grad = grad_var->Get<framework::SelectedRows>();
+                            "The input tensor Param's numel of SgdOp "
+                            "should be equal with ParamOut's numel. "
+                            "But received Param's "
+                            "numel = [%s], ParamOut's numel = [%s]",
+                            param->numel(), sz));
+      PADDLE_ENFORCE_EQ(grad->numel(), sz,
+                        platform::errors::InvalidArgument(
+                            "The input tensor Grad's numel of SgdOp "
+                            "should be equal with ParamOut's numel. "
+                            "But received Grad's "
+                            "numel = [%s], ParamOut's numel = [%s]",
+                            grad->numel(), sz));
+
+      dense_param_and_grad_kernel(ctx);
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
+      // This manual optimization brings difficulty to track data dependency.
+      // It's better to find a more elegant solution.
+      PADDLE_ENFORCE_EQ(param, param_out,
+                        platform::errors::InvalidArgument(
+                            "The input tensor Param of SgdOp "
+                            "should be equal with ParamOut if variable's "
+                            "type is SelectedRows. "));
+      const auto *grad = ctx.Input<framework::SelectedRows>("Grad");
 
       // for distributed training, a sparse var may be empty,
       // just skip updating.
-      if (grad.rows().size() == 0) {
+      if (grad->rows().size() == 0) {
         return;
       }
 
-      auto param_row_width = param.value().dims()[1];
-      auto grad_row_width = grad.value().dims()[1];
+      auto out_dims = param_out->dims();
       PADDLE_ENFORCE_EQ(
-          param_row_width, grad_row_width,
+          grad->height(), out_dims[0],
           platform::errors::InvalidArgument(
-              "The param_row in SgdOP should have the same size with grad_row. "
-              "But received param_row's width is [%s], and grad_row's width is "
-              "[%s]",
-              param_row_width, grad_row_width));
-
-      const auto *lr = learning_rate->data<T>();
-      const auto *grad_data = grad.value().data<T>();
-      auto *out_data = param_out->mutable_value()->data<T>();
-      for (size_t i = 0; i < grad.rows().size(); i++) {
-        int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
-        PADDLE_ENFORCE_GE(
-            id_index, static_cast<int64_t>(0),
-            platform::errors::InvalidArgument(
-                "The id in SgdOp should be >= 0. But recevied id_index is [%s]",
-                id_index));
-        for (int64_t j = 0; j < grad_row_width; j++) {
-          out_data[id_index * grad_row_width + j] -=
-              lr[0] * grad_data[i * grad_row_width + j];
-        }
-      }
+              "The input tensor Grad's height of SgdOp "
+              "should be equal with ParamOut's dims. But received Grad's "
+              "height [%s] and ParamOut's dims [%s]",
+              grad->height(), out_dims[0]));
+
+      auto &grad_value = grad->value();
+      auto &grad_rows = grad->rows();
+      const auto param_height = param_out->dims()[0];
+      const auto param_width = param_out->numel() / param_height;
+      // note: it is not grad->height()
+      const auto grad_height = static_cast<int64_t>(grad_rows.size());
+      const auto grad_width = grad_value.numel() / grad_height;
+
+      PADDLE_ENFORCE_EQ(
+          grad_width, param_width,
+          platform::errors::InvalidArgument(
+              "The grad_value's numel of SgdOp "
+              "should be equal with param_out's numel. But received "
+              "grad_value's numel [%s] and param_out's numel [%s]",
+              grad_width, param_width));
+
+      dense_param_sparse_grad_kernel(ctx);
     } else {
       PADDLE_ENFORCE_EQ(
           false, true,
           platform::errors::PermissionDenied(
-              "Unsupported Variable Type of Parameter in SgdOp. Excepted "
+              "Unsupported Variable Type of Grad in SgdOp. Expected "
               "LoDTensor or SelectedRows, But received [%s]",
-              paddle::framework::ToTypeName(param_var->Type())));
+              paddle::framework::ToTypeName(grad_var->Type())));
+    }
+  }
+
+  void sparse_param_and_grad_kernel(
+      const framework::ExecutionContext &ctx) const {
+    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+    const auto *param_var = ctx.InputVar("Param");
+    const auto *grad_var = ctx.InputVar("Grad");
+
+    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::SelectedRows>(), true,
+                      platform::errors::InvalidArgument(
+                          "When param is SelectedRows, gradient should also "
+                          "be SelectedRows"));
+    const auto &param = param_var->Get<framework::SelectedRows>();
+    auto *param_out = ctx.Output<framework::SelectedRows>("ParamOut");
+    const auto &grad = grad_var->Get<framework::SelectedRows>();
+
+    // for distributed training, a sparse var may be empty,
+    // just skip updating.
+    if (grad.rows().size() == 0) {
+      return;
     }
+
+    auto param_row_width = param.value().dims()[1];
+    auto grad_row_width = grad.value().dims()[1];
+    PADDLE_ENFORCE_EQ(
+        param_row_width, grad_row_width,
+        platform::errors::InvalidArgument(
+            "The param_row in SgdOP should have the same size with grad_row. "
+            "But received param_row's width is [%s], and grad_row's width is "
+            "[%s]",
+            param_row_width, grad_row_width));
+
+    const auto *lr = learning_rate->data<T>();
+    const auto *grad_data = grad.value().data<T>();
+    auto *out_data = param_out->mutable_value()->data<T>();
+    for (size_t i = 0; i < grad.rows().size(); i++) {
+      int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
+      PADDLE_ENFORCE_GE(
+          id_index, static_cast<int64_t>(0),
+          platform::errors::InvalidArgument(
+              "The id in SgdOp should be >= 0. But received id_index is [%s]",
+              id_index));
+      for (int64_t j = 0; j < grad_row_width; j++) {
+        out_data[id_index * grad_row_width + j] -=
+            lr[0] * grad_data[i * grad_row_width + j];
+      }
+    }
+  }
+
+  virtual void dense_param_and_grad_kernel(
+      const framework::ExecutionContext &ctx) const {
+    detail::sgd_dense_param_kernel<
+        T, framework::VarTypeTrait<framework::LoDTensor>::kId>()(ctx);
+  }
+
+  virtual void dense_param_sparse_grad_kernel(
+      const framework::ExecutionContext &ctx) const {
+    detail::sgd_dense_param_kernel<
+        T, framework::VarTypeTrait<framework::SelectedRows>::kId>()(ctx);
   }
 };
diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py b/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py
index 32c8a1c3544c229b8f41ec5803a39168816e789a..4189abda0588f3e40a0513f8f39387fc34dabcf6 100644
--- a/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py
+++ b/python/paddle/fluid/contrib/mixed_precision/bf16/decorator.py
@@ -42,6 +42,8 @@ class OptimizerWithMixedPrecision(object):
 
     def __init__(self, optimizer, amp_lists, use_pure_bf16, use_bf16_guard):
         self._optimizer = optimizer
+        if optimizer.type == 'sgd':
+            optimizer._use_mkldnn = True
         self._amp_lists = amp_lists
         self._param_grads = None
         self._train_program = None
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 00188168727f964ae2f7234ef6391464d64af019..24076e82b0365d21e7222a16cbc3d3462699f119 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -1305,6 +1305,7 @@ class SGDOptimizer(Optimizer):
             grad_clip=grad_clip,
             name=name)
         self.type = "sgd"
+        self._use_mkldnn = False
 
     @no_grad
     def _append_optimize_op(self, block, param_and_grad):
@@ -1323,6 +1324,7 @@ class SGDOptimizer(Optimizer):
                 "Grad": param_and_grad[1],
                 "LearningRate": lr
             },
+            attrs={"use_mkldnn": self._use_mkldnn},
             outputs={"ParamOut": param_and_grad[0]},
             stop_gradient=True)
 
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
index 8b65fc4e431f916be6beadb44d6f29edacf76982..a468d6e828ce14b391e35fad8c8ed220d90e4178 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -32,6 +32,7 @@ class TestSGDOpBF16(OpTest):
     def setUp(self):
         self.op_type = 'sgd'
         self.dtype = np.uint16
+        self.use_mkldnn = True
         self.conf()
         w = np.random.random((self.h, self.w)).astype('float32')
         w_bf16 = convert_float_to_uint16(w)
@@ -42,6 +43,7 @@ class TestSGDOpBF16(OpTest):
 
         self.inputs = {'Param': w_bf16, 'Grad': g_bf16, 'LearningRate': lr_bf16}
         self.outputs = {'ParamOut': w - lr * g}
+        self.attrs = {'use_mkldnn': self.use_mkldnn}
 
     def conf(self):
         self.h = 102
@@ -53,7 +55,7 @@ class TestSGDOpBF16(OpTest):
 
 @unittest.skipIf(not core.supports_bfloat16(),
                  'place does not support BF16 evaluation')
-class TestSGDOpCase8XBF16(TestSGDOpBF16):
+class TestSGDOpBF16Case2(TestSGDOpBF16):
     def conf(self):
         self.h = 10
         self.w = 64
@@ -142,7 +144,8 @@ class TestSparseGradSGDOpBF16(TestSparseSGDOpBF16):
             Param='Param',
             Grad='Grad',
             ParamOut='Param',
-            LearningRate='LearningRate')
+            LearningRate='LearningRate',
+            use_mkldnn=True)
         sgd_op.run(scope, place)
 
         reference = self.ref_optimize(param_array, self.grad_rows, grad_array,
@@ -194,7 +197,8 @@ class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16):
             Param='Param',
             Grad='Grad',
             ParamOut='Param',
-            LearningRate='LearningRate')
+            LearningRate='LearningRate',
+            use_mkldnn=True)
         sgd_op.run(scope, place)
 
         reference = self.ref_optimize(param_array, self.grad_rows, grad_array,
@@ -213,6 +217,11 @@ class TestSparseGradParamSGDOpBF16Case2(TestSparseGradParamSGDOpBF16):
 
 @OpTestTool.skip_if_not_cpu_bf16()
 class TestSGDOpBF16API(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        np.random.seed(12345)
+        fluid.set_flags({'FLAGS_use_mkldnn': True})
+
     def setUp(self):
         self.sample_count = 20
         self.value = np.random.random()
@@ -222,9 +231,7 @@ class TestSGDOpBF16API(unittest.TestCase):
         self.y_shape = (32, 16)
         self.learning_rate = 0.1
 
-        np.random.seed(12345)
         self._set_initializer()
-        fluid.set_flags({'FLAGS_use_mkldnn': True})
 
     def _fp322bf16(self, val: np.float32):
         return np.uint16(struct.unpack('<I', struct.pack('<f', val))[0] >> 16)
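End to end, the oneDNN SGD kernel is reached from Python by turning on the global `FLAGS_use_mkldnn` flag; for BF16 training the decorator patched above additionally sets `optimizer._use_mkldnn = True` for SGD. A hedged usage sketch against the fluid static-graph API of that era (the toy network is illustrative, not from the patch):

```python
import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
fluid.set_flags({'FLAGS_use_mkldnn': True})  # global oneDNN switch

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.data(name='x', shape=[None, 16], dtype='float32')
    y = fluid.data(name='y', shape=[None, 1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square(pred - y))
    sgd = fluid.optimizer.SGDOptimizer(learning_rate=0.1)
    sgd._use_mkldnn = True  # what the bf16 decorator does for 'sgd' optimizers
    sgd.minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup)
exe.run(main, feed={'x': np.random.rand(4, 16).astype('float32'),
                    'y': np.random.rand(4, 1).astype('float32')})
```

With the flag set and a dense-param/dense-grad or dense-param/sparse-grad combination, `SGDOp::GetExpectedKernelType` selects the `kMKLDNN` kernel registered in `sgd_mkldnn_op.cc`; otherwise it falls back to the plain CPU kernel.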