// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { using framework::DataLayout; using framework::Tensor; using dnnl::memory; using dnnl::primitive; using dnnl::stream; template class EltwiseMKLDNNKernel : public framework::OpKernel { private: dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const { dnnl::post_ops post_operations; if (ctx.HasAttr("activation_type")) { const float scale = ctx.HasAttr("activation_scale") ? ctx.Attr("activation_scale") : 1.0f; const float alpha = ctx.HasAttr("activation_alpha") ? ctx.Attr("activation_alpha") : 0.0f; const float beta = ctx.HasAttr("activation_beta") ? ctx.Attr("activation_beta") : 0.0f; static std::unordered_map algo_map = { {"relu", dnnl::algorithm::eltwise_relu}, {"tanh", dnnl::algorithm::eltwise_tanh}, {"leaky_relu", dnnl::algorithm::eltwise_relu}, {"swish", dnnl::algorithm::eltwise_swish}, {"hardswish", dnnl::algorithm::eltwise_hardswish}, {"sqrt", dnnl::algorithm::eltwise_sqrt}, {"abs", dnnl::algorithm::eltwise_abs}, {"clip", dnnl::algorithm::eltwise_clip}, {"gelu", dnnl::algorithm::eltwise_gelu_erf}, {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, {"relu6", dnnl::algorithm::eltwise_bounded_relu}, {"sigmoid", dnnl::algorithm::eltwise_logistic}}; const auto& activation_type = algo_map.find(ctx.Attr("activation_type")); if (activation_type != algo_map.end()) { post_operations.append_eltwise(scale, activation_type->second, alpha, beta); } } return post_operations; } public: void Compute(const framework::ExecutionContext& ctx) const override { const auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); const auto* x = ctx.Input("X"); const auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); float scale_x = ctx.Attr("Scale_x"); float scale_y = ctx.Attr("Scale_y"); float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); platform::BinaryMKLDNNHandler handler( BINARY_OP, axis, mkldnn_engine, ctx.GetPlace(), x, y, z, scale_x, scale_y, scale_o, get_post_ops(ctx)); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); // (jczaja) For Inplace src and dst should be the same memory object. // So x should share buffer with z. But UT mechanics is testing inplace // execution for this op not checking that x can be bradcasted to match in // shape y tensor. // This is wrong as when x is to be broadcasted then z(out) will match the // shape of y which is bigger than x. Hence if x is smaller in shape than z // and they share a buffer (of // shape x) then this buffer is not big enough to hold result of elementwise // operation. const bool reuse_x_memopry = x->numel() == z->numel() && x->IsSharedBufferWith(*z); std::shared_ptr dst_memory = nullptr; if (reuse_x_memopry) { dst_memory = src_x_memory; // NOTE(chenfeiyu): when the output reuses memory from other tensor rather // than allocate its own, it's still need to take care of its data type. // Unfortunately, paddle's operator only infers the output' shape, but not // the data type. mutable_data takes care of allocation and data type // normally, but if the memory is already allocated and there is no need // to re-allocate, it just set the data type. So this it added there to // get the right data type. z->mutable_data(ctx.GetPlace()); } else { dst_memory = handler.AcquireDstMemory(z); } const auto binary_prim = handler.AcquireForwardPrimitive(); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); const std::unordered_map args = { {DNNL_ARG_SRC_0, *src_x_memory}, {DNNL_ARG_SRC_1, *src_y_memory}, {DNNL_ARG_DST, *dst_memory}}; binary_prim->execute(astream, args); astream.wait(); z->set_layout(DataLayout::kMKLDNN); z->set_format(platform::GetMKLDNNFormat(*dst_memory)); } }; inline std::vector CalculateBroadcastedDims(const Tensor* x, const Tensor* y) { const auto src_tz = phi::vectorize(x->dims()); const auto dst_tz = phi::vectorize(y->dims()); size_t j = 0; std::vector dst_tz_ex(src_tz.size(), 1); for (size_t i = 0; i < src_tz.size(); ++i) { dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++]; if (j == dst_tz.size()) break; } return dst_tz_ex; } } // namespace operators } // namespace paddle