// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_MLU
#include <vector>

#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"

namespace paddle {
namespace operators {

// Collects into `axes` the dimensions of `src_ddims` that must be
// sum-reduced so a broadcasted result of shape `src_ddims` shrinks back to
// `target_ddims`. `target_ddims` is aligned to `src_ddims` starting at
// dimension `axis`.
inline void GetReduceAxes(const int axis,
                          const framework::DDim& src_ddims,
                          const framework::DDim& target_ddims,
                          std::vector<int>* axes) {
  int64_t src_dim_size = src_ddims.size();
  int64_t target_dim_size = target_ddims.size();
  for (int64_t i = 0; i < src_dim_size; ++i) {
    // Dimensions outside the aligned window do not exist in the target and
    // are always reduced.
    if (i < axis || i >= target_dim_size + axis) {
      axes->push_back(i);
      continue;
    }
    // Inside the window, reduce only where the source was broadcast
    // (source extent larger than target extent).
    if (src_ddims[i] > target_ddims[i - axis]) {
      axes->push_back(i);
    }
  }
}

// Like GetReduceAxes, but additionally builds `dst_dims_vec`, the shape that
// remains after reducing `src_ddims` down to `target_ddims`. A dimension is
// reduced when it lies outside the aligned window, or when the source extent
// is > 1 while the target extent is 1 (a broadcast dimension).
inline void GetReduceAxesAndDstDims(const int axis,
                                    const framework::DDim& src_ddims,
                                    const framework::DDim& target_ddims,
                                    std::vector<int>* reduce_axes,
                                    std::vector<int>* dst_dims_vec) {
  int64_t src_dim_size = src_ddims.size();
  int64_t target_dim_size = target_ddims.size();

  // When the target has fewer dims, its dim 0 aligns with src dim `axis`.
  int src_axis = (target_dim_size < src_dim_size ? axis : 0);
  for (int ax = 0; ax < src_dim_size; ++ax) {
    if ((ax < src_axis || ax >= src_axis + target_dim_size) ||
        (src_ddims[ax] > 1 && target_ddims[ax - src_axis] == 1)) {
      reduce_axes->push_back(ax);
    } else {
      dst_dims_vec->push_back(src_ddims[ax]);
    }
  }
  if (dst_dims_vec->size() == 0) {
    // target_var is scalar
    dst_dims_vec->push_back(1);
  }
}

// Generic forward kernel for broadcastable elementwise ADD / SUB / MUL on
// MLU, implemented with cnnlOpTensor. Reads inputs "X" and "Y", writes
// output "Out"; `axis` attribute follows Paddle's elementwise broadcasting
// convention (-1 means align trailing dimensions).
template <typename T>
void MLUOpTensorKernel(const framework::ExecutionContext& ctx,
                       const cnnlOpTensorDesc_t op_tensor_op) {
  PADDLE_ENFORCE_EQ(
      platform::is_mlu_place(ctx.GetPlace()),
      true,
      platform::errors::Unavailable("This kernel only runs on MLU."));
  PADDLE_ENFORCE_EQ((op_tensor_op == CNNL_OP_TENSOR_ADD) ||
                        (op_tensor_op == CNNL_OP_TENSOR_SUB) ||
                        (op_tensor_op == CNNL_OP_TENSOR_MUL),
                    true,
                    platform::errors::Unavailable(
                        "This kernel of MLU only support ADD, SUB, MUL."));

  auto* x = ctx.Input<Tensor>("X");
  auto* y = ctx.Input<Tensor>("Y");
  auto* out = ctx.Output<Tensor>("Out");
  out->mutable_data<T>(ctx.GetPlace());

  int axis = ctx.Attr<int>("axis");
  const auto& x_dims = x->dims();
  const auto& y_dims = y->dims();
  // Normalize a negative axis to the leading-dimension offset between the
  // two operands.
  axis =
      (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
  int max_dim = std::max(x_dims.size(), y_dims.size());
  std::vector<int> x_dims_array(max_dim);
  std::vector<int> y_dims_array(max_dim);
  std::vector<int> out_dims_array(max_dim);
  GetBroadcastDimsArrays(x_dims,
                         y_dims,
                         x_dims_array.data(),
                         y_dims_array.data(),
                         out_dims_array.data(),
                         max_dim,
                         axis);

  MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
  MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*out);
  MLUCnnlOpTensorDesc op_tensor_desc(
      op_tensor_op, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);

  MLUCnnl::OpTensor(ctx,
                    op_tensor_desc.get(),
                    x_desc.get(),
                    GetBasePtr(x),
                    y_desc.get(),
                    GetBasePtr(y),
                    out_desc.get(),
                    GetBasePtr(out),
                    ToCnnlDataType<T>());
}

// ------------------ BinaryOp -----------------
enum BINARY_FUNCTOR {
  DIV,
  DIVNONAN,
  MAXIMUM,
  MINIMUM,
  POW,
};

// Dispatch shim from a BINARY_FUNCTOR tag to the corresponding MLUCnnl
// binary call. Only the specializations below are defined.
template <BINARY_FUNCTOR func>
void MLUBinary(const framework::ExecutionContext& ctx,
               cnnlComputationPreference_t prefer,
               const cnnlTensorDescriptor_t x_desc,
               const void* x,
               const cnnlTensorDescriptor_t y_desc,
               const void* y,
               const cnnlTensorDescriptor_t out_desc,
               void* out);

template <>
inline void MLUBinary<DIV>(const framework::ExecutionContext& ctx,
                           cnnlComputationPreference_t prefer,
                           const cnnlTensorDescriptor_t x_desc,
                           const void* x,
                           const cnnlTensorDescriptor_t y_desc,
                           const void* y,
                           const cnnlTensorDescriptor_t out_desc,
                           void* out) {
  MLUCnnl::Div(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}

template <>
inline void MLUBinary<MAXIMUM>(
    const framework::ExecutionContext& ctx,
    cnnlComputationPreference_t prefer,  // useless, only for compatible
    const cnnlTensorDescriptor_t x_desc,
    const void* x,
    const cnnlTensorDescriptor_t y_desc,
    const void* y,
    const cnnlTensorDescriptor_t out_desc,
    void* out) {
  MLUCnnl::Maximum(ctx, x_desc, x, y_desc, y, out_desc, out);
}

template <>
inline void MLUBinary<MINIMUM>(const framework::ExecutionContext& ctx,
                               cnnlComputationPreference_t prefer,
                               const cnnlTensorDescriptor_t in1_desc,
                               const void* in1,
                               const cnnlTensorDescriptor_t in2_desc,
                               const void* in2,
                               const cnnlTensorDescriptor_t out_desc,
                               void* out) {
  MLUCnnl::Minimum(ctx, in1_desc, in1, in2_desc, in2, out_desc, out);
}

template <>
inline void MLUBinary<POW>(const framework::ExecutionContext& ctx,
                           cnnlComputationPreference_t prefer,
                           const cnnlTensorDescriptor_t x_desc,
                           const void* x,
                           const cnnlTensorDescriptor_t y_desc,
                           const void* y,
                           const cnnlTensorDescriptor_t out_desc,
                           void* out) {
  MLUCnnl::Pow(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}

// Generic forward kernel for broadcastable binary functors (DIV / MAXIMUM /
// MINIMUM / POW). Same I/O contract as MLUOpTensorKernel.
template <BINARY_FUNCTOR Functor, typename T>
void MLUBinaryOp(const framework::ExecutionContext& ctx) {
  auto* x = ctx.Input<Tensor>("X");
  auto* y = ctx.Input<Tensor>("Y");
  auto* out = ctx.Output<Tensor>("Out");
  out->mutable_data<T>(ctx.GetPlace());

  int axis = ctx.Attr<int>("axis");
  const auto& x_dims = x->dims();
  const auto& y_dims = y->dims();
  axis =
      (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
  int max_dim = std::max(x_dims.size(), y_dims.size());
  std::vector<int> x_dims_array(max_dim);
  std::vector<int> y_dims_array(max_dim);
  std::vector<int> out_dims_array(max_dim);
  GetBroadcastDimsArrays(x_dims,
                         y_dims,
                         x_dims_array.data(),
                         y_dims_array.data(),
                         out_dims_array.data(),
                         max_dim,
                         axis);

  MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
  MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
  MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());

  cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
  MLUBinary<Functor>(ctx,
                     prefer_type,
                     x_desc.get(),
                     GetBasePtr(x),
                     y_desc.get(),
                     GetBasePtr(y),
                     out_desc.get(),
                     GetBasePtr(out));
}

// ------------------ UnaryOp -----------------
enum UNARY_FUNCTOR {
  NEG,
  RECIPROCAL,
};

// Dispatch shim from a UNARY_FUNCTOR tag to the corresponding MLUCnnl unary
// call. Only the specializations below are defined.
template <UNARY_FUNCTOR func>
void MLUUnary(const framework::ExecutionContext& ctx,
              cnnlComputationPreference_t prefer,
              const cnnlTensorDescriptor_t input_desc,
              const void* input,
              const cnnlTensorDescriptor_t output_desc,
              void* output);

template <>
inline void MLUUnary<NEG>(const framework::ExecutionContext& ctx,
                          cnnlComputationPreference_t prefer,
                          const cnnlTensorDescriptor_t input_desc,
                          const void* input,
                          const cnnlTensorDescriptor_t output_desc,
                          void* output) {
  MLUCnnl::Neg(ctx, input_desc, input, output_desc, output);
}

template <>
inline void MLUUnary<RECIPROCAL>(const framework::ExecutionContext& ctx,
                                 cnnlComputationPreference_t prefer,
                                 const cnnlTensorDescriptor_t input_desc,
                                 const void* input,
                                 const cnnlTensorDescriptor_t output_desc,
                                 void* output) {
  MLUCnnl::Reciprocal(ctx, input_desc, input, output_desc, output);
}

// Generic forward kernel for unary functors; `Tin`/`Tout` allow the input
// and output element types to differ.
template <UNARY_FUNCTOR Functor, typename Tin, typename Tout>
void MLUUnaryOp(const framework::ExecutionContext& ctx) {
  auto* x = ctx.Input<Tensor>("X");
  auto* out = ctx.Output<Tensor>("Out");
  out->mutable_data<Tout>(ctx.GetPlace());

  // NOTE(review): the extracted source read `x_desc(x, ...)`, which does not
  // match MLUCnnlTensorDesc's reference constructor; `*x` restored here —
  // confirm against upstream.
  MLUCnnlTensorDesc x_desc(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tin>());
  MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tout>());

  cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
  MLUUnary<Functor>(ctx,
                    prefer_type,
                    x_desc.get(),
                    GetBasePtr(x),
                    out_desc.get(),
                    GetBasePtr(out));
}

// ------------------ MLUElementwiseGradOp -----------------
enum MINMAX_GRAD_FUNCTOR {
  MAXIMUM_GRAD,
  MINIMUM_GRAD,
};

// Gradient helper for elementwise maximum/minimum:
//   mask    = (x >= y) for MAXIMUM_GRAD, (x <= y) for MINIMUM_GRAD
//   dx_temp = dout * mask
//   dy_temp = dout - dx_temp
// then each gradient is sum-reduced along broadcast axes when its shape is
// smaller than dout's, otherwise shared directly.
template <MINMAX_GRAD_FUNCTOR Functor, typename Tin, typename Tout>
void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) {
  auto* x = ctx.Input<Tensor>("X");
  auto* y = ctx.Input<Tensor>("Y");
  auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
  auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
  auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
  int axis = ctx.Attr<int>("axis");

  const auto& x_dims = x->dims();
  const auto& y_dims = y->dims();
  axis =
      (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
  int max_dim = std::max(x_dims.size(), y_dims.size());
  std::vector<int> x_dims_array(max_dim);
  std::vector<int> y_dims_array(max_dim);
  std::vector<int> out_dims_array(max_dim);
  GetBroadcastDimsArrays(x_dims,
                         y_dims,
                         x_dims_array.data(),
                         y_dims_array.data(),
                         out_dims_array.data(),
                         max_dim,
                         axis);

  // mask = Logic(x, y) only support min & max
  cnnlLogicOp_t logic =
      Functor == MAXIMUM_GRAD ? CNNL_LOGIC_OP_GE : CNNL_LOGIC_OP_LE;
  Tensor mask(x->dtype());
  mask.Resize(phi::make_ddim(out_dims_array));
  mask.mutable_data<Tin>(ctx.GetPlace());

  cnnlDataType_t data_type = ToCnnlDataType<Tin>();
  MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type);
  MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type);
  MLUCnnlTensorDesc mask_desc(max_dim, out_dims_array.data(), data_type);
  MLUCnnl::Logic(ctx,
                 logic,
                 x_desc.get(),
                 GetBasePtr(x),
                 y_desc.get(),
                 GetBasePtr(y),
                 mask_desc.get(),
                 GetBasePtr(&mask));

  // dx = Mul(dz, mask)
  Tensor dx_temp(x->dtype());
  dx_temp.Resize(dout->dims());
  dx_temp.mutable_data<Tout>(ctx.GetPlace());
  MLUCnnlTensorDesc dout_desc(*dout);
  MLUCnnlOpTensorDesc mul_op_desc(
      CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
  MLUCnnl::OpTensor(ctx,
                    mul_op_desc.get(),
                    dout_desc.get(),
                    GetBasePtr(dout),
                    dout_desc.get(),
                    GetBasePtr(&mask),
                    dout_desc.get(),
                    GetBasePtr(&dx_temp),
                    data_type);

  // dy = Sub(dz, dx)
  Tensor dy_temp(y->dtype());
  dy_temp.Resize(dout->dims());
  dy_temp.mutable_data<Tout>(ctx.GetPlace());
  MLUCnnlOpTensorDesc sub_op_desc(
      CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
  MLUCnnl::OpTensor(ctx,
                    sub_op_desc.get(),
                    dout_desc.get(),
                    GetBasePtr(dout),
                    dout_desc.get(),
                    GetBasePtr(&dx_temp),
                    dout_desc.get(),
                    GetBasePtr(&dy_temp),
                    data_type);

  if (dx) {
    if (dx->dims() != dout->dims()) {
      // dx is broadcast-smaller than dout: sum-reduce along broadcast axes.
      dx->mutable_data<Tout>(ctx.GetPlace());
      std::vector<int> reduce_axes;
      GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes);
      MLUCnnlReduceDesc reduction_desc(reduce_axes,
                                       CNNL_REDUCE_ADD,
                                       data_type,
                                       CNNL_NOT_PROPAGATE_NAN,
                                       CNNL_REDUCE_NO_INDICES,
                                       CNNL_32BIT_INDICES);
      MLUCnnlTensorDesc dx_desc(*dx);
      MLUCnnl::Reduce(ctx,
                      true /*need_workspace*/,
                      reduction_desc.get(),
                      nullptr,
                      dout_desc.get(),
                      GetBasePtr(&dx_temp),
                      0,
                      nullptr,
                      nullptr,
                      dx_desc.get(),
                      GetBasePtr(dx));
    } else {
      dx->ShareDataWith(dx_temp);
    }
  }

  if (dy) {
    if (dy->dims() != dout->dims()) {
      // dy is broadcast-smaller than dout: sum-reduce along broadcast axes.
      dy->mutable_data<Tout>(ctx.GetPlace());
      std::vector<int> reduce_axes;
      GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
      MLUCnnlReduceDesc reduction_desc(reduce_axes,
                                       CNNL_REDUCE_ADD,
                                       data_type,
                                       CNNL_NOT_PROPAGATE_NAN,
                                       CNNL_REDUCE_NO_INDICES,
                                       CNNL_32BIT_INDICES);
      MLUCnnlTensorDesc dy_desc(*dy);
      MLUCnnl::Reduce(ctx,
                      true /*need_workspace*/,
                      reduction_desc.get(),
                      nullptr,
                      dout_desc.get(),
                      GetBasePtr(&dy_temp),
                      0,
                      nullptr,
                      nullptr,
                      dy_desc.get(),
                      GetBasePtr(dy));
    } else {
      dy->ShareDataWith(dy_temp);
    }
  }
}

}  // namespace operators
}  // namespace paddle
#endif