From 773aabc7710f030b7b2e83d30637cd94c3379194 Mon Sep 17 00:00:00 2001
From: lidanqing
Date: Mon, 21 Jun 2021 18:28:25 +0800
Subject: [PATCH] Add AXPY oneDNN handler (#33632)

* Add oneDNN AXPY handler.
* Add fallback for small tensors.
* Fix ifdefs
* Remove unnecessary namespace prefixes and add missing headers.
* Guard handler_axpy with proper ifdefs.
* Compilation of this function is possible only when Paddle is not built with CUDA nor HIP.
* Move AXPY handler code to separate files.
* Use oneDNN AXPY handler in SGD op.
* Use axpy handler only when Paddle is built with oneDNN.
* Add test for SUM BF16 with big rows.
* Fix SFINAE rules for elementwise_add_to.
* Add test case for SGD with big rows.
* update
* update

Co-authored-by: Adam Osewski
---
 paddle/fluid/operators/CMakeLists.txt         |   3 +
 paddle/fluid/operators/math/CMakeLists.txt    |   8 +-
 .../operators/math/selected_rows_functor.cc   |  23 +++
 paddle/fluid/operators/mkldnn/CMakeLists.txt  |   1 +
 paddle/fluid/operators/mkldnn/axpy_handler.cc | 152 ++++++++++++++++++
 paddle/fluid/operators/mkldnn/axpy_handler.h  |  33 ++++
 paddle/fluid/operators/optimizers/sgd_op.h    |   9 ++
 .../fluid/tests/unittests/test_sgd_op_bf16.py |   9 +-
 .../fluid/tests/unittests/test_sum_op.py      |   5 +
 9 files changed, 240 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/operators/mkldnn/CMakeLists.txt
 create mode 100644 paddle/fluid/operators/mkldnn/axpy_handler.cc
 create mode 100644 paddle/fluid/operators/mkldnn/axpy_handler.h

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 14912ac3a7d..0956410041b 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -18,6 +18,9 @@ add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)
 add_subdirectory(sequence_ops)
 add_subdirectory(jit)
+if(WITH_MKLDNN)
+  add_subdirectory(mkldnn)
+endif()
 
 if(WITH_DISTRIBUTE)
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index fdbc0c68525..a13fffe15cf 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -56,7 +56,13 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
 math_library(math_function DEPS blas)
 math_library(maxouting)
 math_library(pooling)
-math_library(selected_rows_functor DEPS selected_rows math_function blas)
+
+if(WITH_MKLDNN)
+  math_library(selected_rows_functor DEPS selected_rows math_function blas mkldnn_axpy_handler)
+else()
+  math_library(selected_rows_functor DEPS selected_rows math_function blas)
+endif()
+
 math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index ee405be5ae9..a72bdec05d7 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -14,6 +14,10 @@ limitations under the License.
*/ #include "paddle/fluid/operators/math/selected_rows_functor.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/operators/mkldnn/axpy_handler.h" +#endif + namespace paddle { namespace operators { namespace math { @@ -296,6 +300,24 @@ template struct SelectedRowsAddToTensor +typename std::enable_if::value || + std::is_same::value>::type +elementwise_add_to(BlasT* blas, size_t data_len, + const T* in, T* out) { + onednn_handler_axpy(data_len, T(1.f), in, out); +} + +template +typename std::enable_if::value || + std::is_same>::value || + std::is_same>::value>::type +elementwise_add_to(BlasT* blas, size_t data_len, + const T* in, T* out) { + blas->AXPY(data_len, T(1.f), in, out); +} +#else template typename std::enable_if::value || std::is_same>::value || @@ -304,6 +326,7 @@ elementwise_add_to(BlasT* blas, size_t data_len, const T* in, T* out) { blas->AXPY(data_len, T(1.f), in, out); } +#endif template typename std::enable_if::value>::type elementwise_add_to( diff --git a/paddle/fluid/operators/mkldnn/CMakeLists.txt b/paddle/fluid/operators/mkldnn/CMakeLists.txt new file mode 100644 index 00000000000..ce95ec560c2 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(mkldnn_axpy_handler SRCS axpy_handler.cc DEPS place device_context enforce) diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc new file mode 100644 index 00000000000..76101f19ab6 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+
+#include <cinttypes>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+namespace plat = paddle::platform;
+
+namespace {
+
+template <typename T>
+class AXPYMKLDNNHandler : public plat::MKLDNNHandlerT<T, dnnl::reorder> {
+ public:
+  AXPYMKLDNNHandler(const plat::MKLDNNDeviceContext &dev_ctx,
+                    const dnnl::engine mkldnn_engine, plat::Place cpu_place,
+                    int n, float alpha)
+      : plat::MKLDNNHandlerT<T, dnnl::reorder>(
+            dev_ctx, mkldnn_engine, cpu_place,
+            plat::CreateKey(dev_ctx, static_cast<int64_t>(n),
+                            plat::MKLDNNGetDataType<T>(), alpha, "-axpy")),
+        alpha_(alpha),
+        n_(n) {}
+
+  std::shared_ptr<dnnl::memory> AcquireMemory(void *ptr,
+                                              const std::string &suffix) {
+    /*Generate key*/
+    auto local_key = this->key_ + suffix;
+    auto mem_p = std::static_pointer_cast<dnnl::memory>(
+        this->dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      auto md = dnnl::memory::desc({n_}, plat::MKLDNNGetDataType<T>(),
+                                   dnnl::memory::format_tag::x);
+      mem_p = std::make_shared<dnnl::memory>(md, this->engine_, ptr);
+      this->dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireSrcMemory(const T *x) {
+    return this->AcquireMemory(plat::to_void_cast(x), "@user_src_mem_p");
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireDstMemory(T *y) {
+    return this->AcquireMemory(y, "@user_dst_mem_p");
+  }
+
+  std::shared_ptr<dnnl::reorder> AcquireReorder(
+      std::shared_ptr<dnnl::memory> dst_memory_p,
+      std::shared_ptr<dnnl::memory> src_memory_p) {
+    auto prim_key = this->key_ + "@reorder_p";
+    auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
+        this->dev_ctx_.GetBlob(prim_key));
+    if (reorder_p == nullptr) {
+      // Here we pass post-ops to mimic y -> a*X + y
+      dnnl::primitive_attr reorder_attr;
+      dnnl::post_ops post_operations;
+      if (this->alpha_ != 1.f) {
+        std::vector<float> scales(1, this->alpha_);
+        reorder_attr.set_output_scales(0, scales);
+      }
+      post_operations.append_sum(1.0f);
+
+      reorder_attr.set_post_ops(post_operations);
+      reorder_p = std::make_shared<dnnl::reorder>(
+          *(src_memory_p), *(dst_memory_p), reorder_attr);
+      this->dev_ctx_.SetBlob(prim_key, reorder_p);
+    }
+    return reorder_p;
+  }
+
+ private:
+  float alpha_;
+  int n_;
+};
+
+template class AXPYMKLDNNHandler<float>;
+template class AXPYMKLDNNHandler<plat::bfloat16>;
+
+}  // anonymous namespace
+
+template <typename T>
+static void naive_axpy(int n, T alpha, const T *x, T *y) {
+  while (n-- > 0) {
+    *y += alpha * *x;
+    ++y;
+    ++x;
+  }
+}
+
+template <typename T>
+void onednn_handler_axpy(int n, T alpha, const T *x, T *y) {
+  // fallback to naive version
+  if (n < 100) {
+    naive_axpy(n, alpha, x, y);
+    return;
+  }
+
+  auto &pool = plat::DeviceContextPool::Instance();
+  auto cpu_place = plat::CPUPlace();
+  auto *dev_ctx =
+      dynamic_cast<plat::MKLDNNDeviceContext *>(pool.Get(cpu_place));
+  auto &cpu_engine = dev_ctx->GetEngine();
+
+  AXPYMKLDNNHandler<T> handler(*dev_ctx, cpu_engine, cpu_place, n,
+                               static_cast<float>(alpha));
+
+  auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
+  auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
+  auto reorder_p =
+      handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
+
+  auto &astream = plat::MKLDNNDeviceContext::tls().get_stream();
+  plat::RecordEvent record_reorder("axpy_int_reorder",
+                                   plat::EventRole::kUniqueOp);
+  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+  astream.wait();
+}
+
+template void onednn_handler_axpy<float>(int, float, const float *, float *);
+template void onednn_handler_axpy<plat::bfloat16>(int, plat::bfloat16,
+                                                  const plat::bfloat16 *,
+                                                  plat::bfloat16 *);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.h b/paddle/fluid/operators/mkldnn/axpy_handler.h
new file mode 100644
index 00000000000..8f0fdeb5c02
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+namespace paddle {
+namespace operators {
+
+///
+/// @brief Helper function to execute AXPY using oneDNN.
+///
+/// @param[in] n     The number of elements in the tensor (assumed 1-D).
+/// @param[in] alpha The alpha coefficient.
+/// @param[in] x     The pointer to the input X tensor.
+/// @param     y     The pointer to the output Y tensor.
+///
+/// @tparam T Data type.
+///
+template <typename T>
+void onednn_handler_axpy(int n, T alpha, const T *x, T *y);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index 076121c0e27..076afdc6553 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -19,6 +19,9 @@ limitations under the License.
*/ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/jit/kernels.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/operators/mkldnn/axpy_handler.h" +#endif #include "paddle/fluid/platform/bfloat16.h" namespace paddle { @@ -139,9 +142,15 @@ struct sgd_dense_param_kernel< "Got [%s], but expected less than [%s]", grad_rows[i], grad_height)); const int64_t row = grad_rows[i]; +#ifdef PADDLE_WITH_MKLDNN + operators::onednn_handler_axpy(grad_width, -lr[0], + grad_data + i * grad_width, + out_data + row * grad_width); +#else for (int64_t j = 0; j < grad_width; ++j) { out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j]; } +#endif } } }; diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py index fa8ff4effcf..e60b04257db 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py @@ -158,6 +158,13 @@ class TestSparseGradSGDOpBF16Case2(TestSparseGradSGDOpBF16): self.grad_row_numel = 16 +class TestSparseGradSGDOpBF16Case3(TestSparseGradSGDOpBF16): + def setup_params(self): + self.grad_height = 10 + self.grad_rows = [0, 4, 7] + self.grad_row_numel = 120 + + @unittest.skipIf(not core.supports_bfloat16(), 'place does not support BF16 evaluation') class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16): @@ -194,8 +201,6 @@ class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16): self.check_output(output, reference, atol=5e-3, rtol=1e-1) -@unittest.skipIf(not core.supports_bfloat16(), - 'place does not support BF16 evaluation') class TestSparseGradParamSGDOpBF16Case2(TestSparseGradParamSGDOpBF16): def setup_params(self): self.grad_height = 14 diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index f9e40cf8133..f0fbd143c5a 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -211,6 +211,11 @@ class TestSelectedRowsSumBF16Op(TestSelectedRowsSumOp): self.check_with_place(core.CPUPlace(), inplace) +class TestSelectedRowsSumBF16OpBigRow(TestSelectedRowsSumBF16Op): + def init_kernel_type(self): + self.row_numel = 102 + + class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp): def setUp(self): self.height = 10 -- GitLab