diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc
index 1ad581cf4ca2b84dcae30b60901bb6a72555ade9..caa498faddaa15a0ef45d00108cb97d5feed2ba4 100644
--- a/paddle/fluid/operators/activation_op_mlu.cc
+++ b/paddle/fluid/operators/activation_op_mlu.cc
@@ -27,40 +27,37 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <cnnlActivationMode_t act_mode, typename T>
+template <cnnlActivationMode_t act_mode, typename T>
 class ActivationMLUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
-    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
+    float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
 
     output->mutable_data<T>(ctx.GetPlace());
 
-    MLUCnnlActivationDesc act_desc(act_mode, alpha_);
+    MLUCnnlActivationDesc act_desc(act_mode, alpha);
     MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY,
                                  ToCnnlDataType(input->type()));
     MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
                                   ToCnnlDataType(output->type()));
 
-    MLUCnnl::Active(dev_ctx, act_desc.get(), input_desc.get(),
+    MLUCnnl::Active(ctx, act_desc.get(), input_desc.get(),
                     reinterpret_cast<const void*>(input->data<T>()),
                     output_desc.get(),
                     reinterpret_cast<void*>(output->data<T>()));
   }
-
- private:
-  float alpha_ = 1.0;
 };
 
-template <cnnlActivationMode_t act_mode, typename T>
+template <cnnlActivationMode_t act_mode, typename T>
 class ActivationGradMLUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* out = ctx.Input<Tensor>("Out");
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
+    float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
 
     dx->mutable_data<T>(ctx.GetPlace());
 
@@ -70,16 +67,13 @@ class ActivationGradMLUKernel : public framework::OpKernel<T> {
                            ToCnnlDataType(out->type()));
     MLUCnnlTensorDesc dx_desc(*dx, CNNL_LAYOUT_ARRAY,
                               ToCnnlDataType(dx->type()));
-    MLUCnnlActivationDesc act_desc(act_mode, alpha_);
+    MLUCnnlActivationDesc act_desc(act_mode, alpha);
     MLUCnnl::ActiveGrad(
-        dev_ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
+        ctx, act_desc.get(), nullptr, nullptr, nullptr, nullptr,
         dout_desc.get(), reinterpret_cast<const void*>(dout->data<T>()),
         out_desc.get(), reinterpret_cast<const void*>(out->data<T>()),
         dx_desc.get(), reinterpret_cast<void*>(dx->data<T>()));
   }
-
- private:
-  float alpha_ = 1.0;
 };
 
 }  // namespace operators
@@ -88,13 +82,9 @@ class ActivationGradMLUKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_MLU_KERNEL(
-    relu, ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, float>,
-    ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, paddle::platform::float16>);
+    relu, ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, float>,
+    ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, paddle::platform::float16>);
 
 REGISTER_OP_MLU_KERNEL(
-    relu_grad, ops::ActivationGradMLUKernel<CNNL_ACTIVATION_RELU, float>,
-    ops::ActivationGradMLUKernel<CNNL_ACTIVATION_RELU,
-                                 paddle::platform::float16>);
+    relu_grad, ops::ActivationGradMLUKernel<CNNL_ACTIVATION_RELU, float>,
+    ops::ActivationGradMLUKernel<CNNL_ACTIVATION_RELU,
+                                 paddle::platform::float16>);
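Note: the two kernels above now read alpha from the op attribute (falling back
to 1.0f) instead of a fixed private member, and they hand the ExecutionContext
straight to MLUCnnl instead of unpacking an MLUDeviceContext themselves. The
GetHandleFromCTX/GetDevCtxFromCTX helpers used throughout the next file are
declared in mlu_baseop.h; a minimal sketch of what they are assumed to look
like, inferred from the ctx.cnnl_handle() calls being replaced (names and
exact return types are assumptions, not part of this patch):

    // Sketch only -- assumed shape of the helpers declared in mlu_baseop.h.
    inline cnnlHandle_t GetHandleFromCTX(
        const framework::ExecutionContext& ctx) {
      // Same handle the removed code obtained via dev_ctx.cnnl_handle().
      return ctx.template device_context<platform::MLUDeviceContext>()
          .cnnl_handle();
    }
    inline const platform::MLUDeviceContext& GetDevCtxFromCTX(
        const framework::ExecutionContext& ctx) {
      return ctx.template device_context<platform::MLUDeviceContext>();
    }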
diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc
index 917692bfbd9d5366d19d09ec42458738cbceeb36..c877b7130c55c728a635a4170405447115551244 100644
--- a/paddle/fluid/operators/mlu/mlu_baseop.cc
+++ b/paddle/fluid/operators/mlu/mlu_baseop.cc
@@ -13,12 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
-#include
-#include
-#include
-#include
-#include
+#include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace operators {
@@ -57,7 +54,7 @@ class MLUCnnlTensorDescPool {
 
 static MLUCnnlTensorDescPool g_cnnl_tensor_desc_pool;
 
-MLUCnnlTensorDesc &MLUCnnlTensorDesc::operator=(MLUCnnlTensorDesc &&rhs) {
+MLUCnnlTensorDesc& MLUCnnlTensorDesc::operator=(MLUCnnlTensorDesc&& rhs) {
   if (raw_tensor_desc) {
     g_cnnl_tensor_desc_pool.Recycle(raw_tensor_desc);
   }
@@ -138,7 +135,7 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const int tensor_dim,
       cnnlSetTensorDescriptorPosition(raw_tensor_desc, position));
 }
 
-MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor,
+MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor,
                                      const cnnlTensorLayout_t layout,
                                      const cnnlDataType_t tensor_dtype) {
   auto dims = framework::vectorize<int>(tensor.dims());
@@ -156,7 +153,7 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor,
   }
 }
 
-MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor,
+MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor,
                                      cnnlTensorLayout_t layout,
                                      const cnnlDataType_t tensor_dtype,
                                      int position)
@@ -165,7 +162,7 @@ MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor,
       cnnlSetTensorDescriptorPosition(raw_tensor_desc, position));
 }
 
-MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor &tensor,
+MLUCnnlTensorDesc::MLUCnnlTensorDesc(const Tensor& tensor,
                                      cnnlTensorLayout_t layout,
                                      const cnnlDataType_t tensor_dtype,
                                      int position, float scale)
@@ -197,31 +194,2359 @@ MLUCnnlActivationDesc::~MLUCnnlActivationDesc() {
   }
 }
 
-/* static */ void MLUCnnl::Active(const platform::MLUDeviceContext &ctx,
+MLUCnnlPoolingDesc::MLUCnnlPoolingDesc(
+    const cnnlPoolingMode_t mode, const cnnlNanPropagation_t maxpooling_nan_opt,
+    int window_rows, int window_cols, int64_t pad_up, int64_t pad_down,
+    int64_t pad_left, int64_t pad_right, int row_stride, int col_stride) {
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreatePoolingDescriptor(&pooling_desc_));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetPooling2dDescriptor(
+      pooling_desc_, mode, maxpooling_nan_opt, window_rows, window_cols, pad_up,
+      pad_down, pad_left, pad_right, row_stride, col_stride));
+}
+
+MLUCnnlPoolingDesc::MLUCnnlPoolingDesc(
+    const cnnlPoolingMode_t mode, const cnnlNanPropagation_t maxpooling_nan_opt,
+    const int tensor_rank, const std::vector<int>& window,
+    const std::vector<int>& padding, const std::vector<int>& stride) {
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreatePoolingDescriptor(&pooling_desc_));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetPoolingNdDescriptor(
+      pooling_desc_, mode, maxpooling_nan_opt, tensor_rank, window.data(),
+      padding.data(), stride.data()));
+}
+
+const cnnlPoolingDescriptor_t MLUCnnlPoolingDesc::get() const {
+  return pooling_desc_;
+}
+
+MLUCnnlPoolingDesc::~MLUCnnlPoolingDesc() {
+  if (pooling_desc_) {
+    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyPoolingDescriptor(pooling_desc_));
+  }
+}
+
+MLUCnnlRandomGeneratorDesc::MLUCnnlRandomGeneratorDesc(const bool is_mlu200,
+                                                       const int seed) {
+  if (is_mlu200) {
+    PADDLE_ENFORCE_MLU_SUCCESS(
+        cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_FAST));
+  } else {
+    PADDLE_ENFORCE_MLU_SUCCESS(
+        cnnlRandCreateGenerator(&mlu_generator, CNNL_RAND_RNG_MTGP32));
+    PADDLE_ENFORCE_MLU_SUCCESS(
+        cnnlRandSetPseudoRandomGeneratorSeed(mlu_generator, seed));
+  }
+}
+
+const cnnlRandGenerator_t MLUCnnlRandomGeneratorDesc::get() const {
+  return mlu_generator;
+}
+
+MLUCnnlRandomGeneratorDesc::~MLUCnnlRandomGeneratorDesc() {
+  if (mlu_generator) {
+    PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandDestroyGenerator(mlu_generator));
+  }
+}
+
+MLUCnnlNMSDesc::MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode,
+                               const float iou_threshold,
+                               const int max_output_size,
+                               const float confidence_threshold,
+                               const int input_layout) {
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateNmsDescriptor(&nms_desc_));
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlSetNmsDescriptor_v2(nms_desc_, mode, iou_threshold, max_output_size,
+                              confidence_threshold, input_layout));
+}
+
+const cnnlNmsDescriptor_t MLUCnnlNMSDesc::get() const { return nms_desc_; }
+
+MLUCnnlNMSDesc::~MLUCnnlNMSDesc() {
+  if (nms_desc_) {
+    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyNmsDescriptor(nms_desc_));
+  }
+}
+
+MLUCnnlReduceDesc::MLUCnnlReduceDesc(const std::vector<int>& axis_vec,
+                                     const cnnlReduceOp_t reduce_op,
+                                     const cnnlDataType_t data_type,
+                                     const cnnlNanPropagation_t nan_propagation,
+                                     const cnnlReduceIndices_t reduce_indices,
+                                     const cnnlIndicesType_t indices_type) {
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateReduceDescriptor(&reduction_desc_));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetReduceDescriptor(
+      reduction_desc_, const_cast<int*>(axis_vec.data()), axis_vec.size(),
+      reduce_op, data_type, nan_propagation, reduce_indices, indices_type));
+}
+
+const cnnlReduceDescriptor_t MLUCnnlReduceDesc::get() const {
+  return reduction_desc_;
+}
+
+MLUCnnlReduceDesc::~MLUCnnlReduceDesc() {
+  if (reduction_desc_) {
+    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyReduceDescriptor(reduction_desc_));
+  }
+}
+
+MLUCnnlOpTensorDesc::MLUCnnlOpTensorDesc(
+    cnnlOpTensorDesc_t op_tensor_op, cnnlDataType_t op_tensor_comp_type,
+    cnnlNanPropagation_t op_tensor_nan_opt) {
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateOpTensorDescriptor(&op_tensor_desc_));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetOpTensorDescriptor(
+      op_tensor_desc_, op_tensor_op, op_tensor_comp_type, op_tensor_nan_opt));
+}
+
+const cnnlOpTensorDescriptor_t MLUCnnlOpTensorDesc::get() const {
+  return op_tensor_desc_;
+}
+
+MLUCnnlOpTensorDesc::~MLUCnnlOpTensorDesc() {
+  if (op_tensor_desc_) {
+    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyOpTensorDescriptor(op_tensor_desc_));
+  }
+}
+
+MLUCnnlConvolutionDesc::MLUCnnlConvolutionDesc(
+    const int dims, const int pad[], const int stride[], const int dilation[],
+    const int group_count, const cnnlDataType_t tensor_dtype) {
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateConvolutionDescriptor(&conv_desc_));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetConvolutionDescriptor(
+      conv_desc_, dims, pad, stride, dilation, group_count, tensor_dtype));
+}
+
+MLUCnnlConvolutionDesc::MLUCnnlConvolutionDesc(
+    const int dims, const int64_t pad[], const int64_t stride[],
+    const int64_t dilation[], const int group_count,
+    const cnnlDataType_t tensor_dtype) {
+  const int spatial_dims = dims - 2;
+  const int pad_dims = spatial_dims * 2;
+  std::vector<int> pad_int32(pad_dims);
+  std::vector<int> stride_int32(spatial_dims);
+  std::vector<int> dilation_int32(spatial_dims);
+  std::vector<int64_t>::const_iterator int64_pad_cbegin(pad);
+  std::vector<int64_t>::const_iterator int64_pad_cend(pad + pad_dims);
+  std::vector<int64_t>::const_iterator int64_stride_cbegin(stride);
+  std::vector<int64_t>::const_iterator int64_stride_cend(stride + spatial_dims);
+  std::vector<int64_t>::const_iterator int64_dilation_cbegin(dilation);
+  std::vector<int64_t>::const_iterator int64_dilation_cend(dilation +
+                                                           spatial_dims);
+  std::transform(int64_pad_cbegin, int64_pad_cend, pad_int32.begin(),
+                 &CheckedNarrowing<int64_t, int>);
+  std::transform(int64_stride_cbegin, int64_stride_cend, stride_int32.begin(),
+                 &CheckedNarrowing<int64_t, int>);
+  std::transform(int64_dilation_cbegin, int64_dilation_cend,
+                 dilation_int32.begin(), &CheckedNarrowing<int64_t, int>);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateConvolutionDescriptor(&conv_desc_));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetConvolutionDescriptor(
+      conv_desc_, dims, pad_int32.data(), stride_int32.data(),
+      dilation_int32.data(), group_count, tensor_dtype));
+}
+
+const cnnlConvolutionDescriptor_t MLUCnnlConvolutionDesc::get() const {
+  return conv_desc_;
+}
+
+MLUCnnlConvolutionDesc::~MLUCnnlConvolutionDesc() {
+  if (conv_desc_) {
+    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyConvolutionDescriptor(conv_desc_));
+  }
+}
+
+MLUCnnlBatchSpaceDesc::MLUCnnlBatchSpaceDesc(uint32_t block_shape[],
+                                             uint32_t paddings[],
+                                             const uint32_t block_shape_size,
+                                             const uint32_t paddings_size) {
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateSpaceBatchNdDescriptor(&op_desc_));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetSpaceBatchNdDescriptor(
+      op_desc_, block_shape, block_shape_size, paddings, paddings_size));
+}
+
+void MLUCnnlBatchSpaceDesc::getSpace2batchNdextraInputSize(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSpace2batchNdExtraInputSize(
+      handle, input_desc, op_desc_, &extra_input_size_));
+}
+
+void MLUCnnlBatchSpaceDesc::getBatch2spaceNdextraInputSize(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBatch2spaceNdExtraInputSize(
+      handle, input_desc, op_desc_, &extra_input_size_));
+}
+
+void MLUCnnlBatchSpaceDesc::initSpace2batchNdExtraInput(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
+    void* extra_host_input) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlInitSpace2batchNdExtraInput(
+      handle, input_desc, op_desc_, extra_host_input));
+}
+
+void MLUCnnlBatchSpaceDesc::initBatch2spaceNdExtraInput(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
+    void* extra_host_input) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlInitBatch2spaceNdExtraInput(
+      handle, input_desc, op_desc_, extra_host_input));
+}
+
+const cnnlSpaceBatchNdDescriptor_t MLUCnnlBatchSpaceDesc::get() const {
+  return op_desc_;
+}
+
+size_t MLUCnnlBatchSpaceDesc::getExtraInputSize() const {
+  return extra_input_size_;
+}
+
+MLUCnnlBatchSpaceDesc::~MLUCnnlBatchSpaceDesc() {
+  if (op_desc_) {
+    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroySpaceBatchNdDescriptor(op_desc_));
+  }
+}
+
+MLUCnnlTrigonDesc::MLUCnnlTrigonDesc(
+    const cnnlTrigonFunctionMode_t trigon_function_mode) {
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateTrigonDescriptor(&trigon_desc_));
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlSetTrigonDescriptor(trigon_desc_, trigon_function_mode));
+}
+
+const cnnlTrigonDescriptor_t MLUCnnlTrigonDesc::get() const {
+  return trigon_desc_;
+}
+
+MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
+  if (trigon_desc_) {
+    PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyTrigonDescriptor(trigon_desc_));
+  }
+}
+
+/* static */ void MLUCnnl::Active(const ExecutionContext& ctx,
                                   cnnlActivationDescriptor_t active_desc,
                                   const cnnlTensorDescriptor_t input_desc,
-                                  const void *input,
+                                  const void* input,
                                   const cnnlTensorDescriptor_t output_desc,
-                                  void *output) {
-  cnnlHandle_t handle = ctx.cnnl_handle();
+                                  void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
   PADDLE_ENFORCE_MLU_SUCCESS(cnnlActivationForward(
       handle, active_desc, NULL,
       input_desc, input, NULL, output_desc, output));
 }
 
 /* static */ void MLUCnnl::ActiveGrad(
-    const platform::MLUDeviceContext &ctx,
-    cnnlActivationDescriptor_t active_desc, const void *alpha, const void *beta,
-    const cnnlTensorDescriptor_t y_desc, const void *y,
-    const cnnlTensorDescriptor_t diff_y_desc, const void *diff_y,
-    const cnnlTensorDescriptor_t x_desc, const void *x,
-    const cnnlTensorDescriptor_t diff_x_desc, void *diff_x) {
-  cnnlHandle_t handle = ctx.cnnl_handle();
+    const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc,
+    const void* alpha, const void* beta, const cnnlTensorDescriptor_t y_desc,
+    const void* y, const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y,
+    const cnnlTensorDescriptor_t x_desc, const void* x,
+    const cnnlTensorDescriptor_t diff_x_desc, void* diff_x) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
   PADDLE_ENFORCE_MLU_SUCCESS(
       cnnlActivationBackward(handle, active_desc, alpha, y_desc, y, diff_y_desc,
                              diff_y, x_desc, x, beta, diff_x_desc, diff_x));
 }
 
+/* static */ void MLUCnnl::Concat(const ExecutionContext& ctx,
+                                  const int pack_num, const int axis,
+                                  const cnnlTensorDescriptor_t inputs_desc[],
+                                  const void* const inputs[],
+                                  const cnnlTensorDescriptor_t output_desc,
+                                  void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size = 0;
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlConcat(handle, pack_num, axis, inputs_desc,
+                                        inputs, workspace_ptr, workspace_size,
+                                        output_desc, output));
+}
+
+/* static */ void MLUCnnl::Div(
+    const ExecutionContext& ctx, cnnlComputationPreference_t prefer,
+    const cnnlTensorDescriptor_t in0_desc, const void* in0,
+    const cnnlTensorDescriptor_t in1_desc, const void* in1,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size = 0;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetDivWorkspaceSize(
+      handle, in0_desc, in1_desc, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlDiv_v2(handle, prefer, in0_desc, in0, in1_desc,
+                                        in1, workspace_ptr, workspace_size,
+                                        output_desc, output));
+}
+
+/* static */ void MLUCnnl::Fill(const ExecutionContext& ctx, float value,
+                                const cnnlTensorDescriptor_t output_desc,
+                                void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlFill(handle, value, output_desc, output));
+}
+
+/* static */ void MLUCnnl::QuantifyOffline(
+    const ExecutionContext& ctx, cnnlQuantizeMode_t mode,
+    const cnnlTensorDescriptor_t input_desc, const void* input,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlQuantizeV1(handle, mode, input_desc, input, output_desc, output));
+}
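Note: Concat and Div above establish the workspace idiom that most wrappers
below repeat: query the op-specific workspace size from CNNL, allocate a
temporary tensor through the ExecutionContext (so the allocation is visible to
the profiler, per the comment in QuantifyOnline below), then pass the buffer
to the compute call. A condensed sketch of the pattern; cnnlGetOpWorkspaceSize
is a placeholder for the per-op query (cnnlGetConcatWorkspaceSize, etc.):

    size_t workspace_size = 0;
    PADDLE_ENFORCE_MLU_SUCCESS(
        cnnlGetOpWorkspaceSize(handle, /*op args*/, &workspace_size));
    auto& dev_ctx = GetDevCtxFromCTX(ctx);
    Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
        {static_cast<int64_t>(workspace_size)}, dev_ctx);
    void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());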
+
+/* static */ void MLUCnnl::LRN(const ExecutionContext& ctx,
+                               const int local_size, const double alpha,
+                               const double beta, const double k,
+                               const cnnlTensorDescriptor_t input_quant_desc,
+                               const void* input_quant,
+                               const cnnlTensorDescriptor_t output_desc,
+                               void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetLrnWorkspaceSize(
+      handle, input_quant_desc, output_desc, local_size, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  const cnnlLrnMode_t mode = CNNL_LRN_CROSS_CHANNEL;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlLrn(
+      handle, mode, local_size, alpha, beta, k, workspace_ptr, workspace_size,
+      input_quant_desc, const_cast<void*>(input_quant), output_desc, output));
+}
+
+/* static */ void MLUCnnl::QuantifyOnline(
+    const ExecutionContext& ctx, const int bitwidth,
+    const cnnlTensorDescriptor_t input_desc, const void* input,
+    const bool compute_scale, void* position, void* scale,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlGetQuantizeParamWorkspaceSize(handle, input_desc, &workspace_size));
+
+  // use ctx allocate interface for profiling purposes
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  const cnnlQuantizeMode_t mode =
+      compute_scale ? CNNL_QUANTIZE_POSITION_SCALE : CNNL_QUANTIZE_POSITION;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeParam(
+      handle, mode, input_desc, input, bitwidth, workspace_ptr, workspace_size,
+      position, scale, nullptr));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeV2(handle, mode, input_desc, input,
+                                            position, scale, nullptr,
+                                            output_desc, output));
+}
+
+/* static */ void MLUCnnl::Range(const ExecutionContext& ctx,
+                                 const void* start, const void* end,
+                                 const void* step,
+                                 const cnnlDataType_t output_dtype,
+                                 void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlArange(handle, start, end, step, output_dtype, output));
+}
+
+/* static */ void MLUCnnl::Round(const ExecutionContext& ctx,
+                                 const cnnlTensorDescriptor_t input_desc,
+                                 const void* input,
+                                 const cnnlTensorDescriptor_t output_desc,
+                                 void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlRound(handle, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::SparseSoftmaxXentWithLogits(
+    const ExecutionContext& ctx, cnnlSoftmaxMode_t mode,
+    const cnnlTensorDescriptor_t x_desc, const void* input,
+    const cnnlTensorDescriptor_t label_desc, const void* label,
+    const cnnlTensorDescriptor_t y_desc, void* output,
+    const cnnlTensorDescriptor_t diff_y_desc, void* back_out) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSparseSoftmaxCrossEntropyWithLogits(
+      handle, mode, x_desc, input, label_desc, label, y_desc, output,
+      diff_y_desc, back_out));
+}
+
+/* static */ void MLUCnnl::Cumsum(const ExecutionContext& ctx, const int axis,
+                                  const bool exclusive, const bool reverse,
+                                  const cnnlTensorDescriptor_t input_desc,
+                                  const void* input,
+                                  const cnnlTensorDescriptor_t output_desc,
+                                  void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  // NAN propagation mode: Only support CNNL_NOT_PROPAGATE_NAN now.
+  cnnlNanPropagation_t mode = CNNL_NOT_PROPAGATE_NAN;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCumsum(handle, input_desc, input, axis,
+                                        exclusive, reverse, mode, output_desc,
+                                        output));
+}
+
+/* static */ void MLUCnnl::BroadcastTo(const ExecutionContext& ctx,
+                                       const cnnlTensorDescriptor_t input_desc,
+                                       const void* input,
+                                       const cnnlTensorDescriptor_t output_desc,
+                                       void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlExpand(handle, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::AssignAdd(const ExecutionContext& ctx,
+                                     const void* alpha, const void* beta,
+                                     const cnnlTensorDescriptor_t update_desc,
+                                     const void* update,
+                                     const cnnlTensorDescriptor_t param_desc,
+                                     void* param) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlAssignAdd(
+      handle, alpha, update_desc, update, nullptr, 0, beta, param_desc, param));
+}
+
+/* static */ void MLUCnnl::AssignSub(const ExecutionContext& ctx,
+                                     const void* alpha, const void* beta,
+                                     const cnnlTensorDescriptor_t update_desc,
+                                     const void* update,
+                                     const cnnlTensorDescriptor_t param_desc,
+                                     void* param) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlAssignSub(
+      handle, alpha, update_desc, update, nullptr, 0, beta, param_desc, param));
+}
+
+/* static */ void MLUCnnl::Assign(const ExecutionContext& ctx,
+                                  const cnnlTensorDescriptor_t update_desc,
+                                  const void* update,
+                                  const cnnlTensorDescriptor_t param_desc,
+                                  void* param) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlCopy(handle, update_desc, update, param_desc, param));
+}
+
+/* static */ void MLUCnnl::SGD(const ExecutionContext& ctx,
+                               const cnnlTensorDescriptor_t grad_desc,
+                               const void* grad, const void* lr,
+                               const cnnlTensorDescriptor_t var_desc,
+                               void* var) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlGradientDescent(handle, grad_desc, grad, lr, var_desc, var));
+}
+
+/* static */ void MLUCnnl::ApplyAdaGrad(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc,
+    const void* grad, const cnnlTensorDescriptor_t accum_desc, void* accum,
+    const cnnlTensorDescriptor_t var_desc, void* var, const void* lr,
+    const bool update_slots) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyAdaGrad(handle, grad_desc, grad,
+                                              accum_desc, accum, var_desc, var,
+                                              lr, update_slots));
+}
+
+/* static */ void MLUCnnl::ApplyRMSProp(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc,
+    const void* grad, const void* lr, const void* rho, const void* momentum,
+    const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var,
+    const cnnlTensorDescriptor_t ms_desc, void* ms,
+    const cnnlTensorDescriptor_t mom_desc, void* mom) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlRMSProp(handle, lr, rho, epsilon, momentum,
+                                         grad_desc, grad, var_desc, var,
+                                         ms_desc, ms, mom_desc, mom));
+}
+
+/* static */ void MLUCnnl::ApplyCenterRMSProp(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc,
+    const void* grad, const void* lr, const void* rho, const void* momentum,
+    const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var,
+    const cnnlTensorDescriptor_t mg_desc, void* mg,
+    const cnnlTensorDescriptor_t ms_desc, void* ms,
+    const cnnlTensorDescriptor_t mom_desc, void* mom) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyCenterRMSProp(
+      handle, var_desc, var, mg_desc, mg, ms_desc, ms, mom_desc, mom, grad_desc,
+      grad, lr, rho, momentum, epsilon));
+}
+
+/* static */ void MLUCnnl::ApplyAdam(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc,
+    const void* grad, const void* lr, const void* beta1, const void* beta2,
+    const void* beta1_power, const void* beta2_power, const void* epsilon,
+    const bool use_nesterov, const cnnlTensorDescriptor_t var_desc, void* var,
+    const cnnlTensorDescriptor_t m_desc, void* m,
+    const cnnlTensorDescriptor_t v_desc, void* v) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlApplyAdam(
+      handle, grad_desc, var, grad_desc, m, grad_desc, v, grad_desc, grad, lr,
+      beta1, beta2, beta1_power, beta2_power, epsilon, use_nesterov));
+}
+
+/* static */ void MLUCnnl::ApplyAdaMax(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc,
+    const cnnlTensorDescriptor_t var_desc, void* var,
+    const cnnlTensorDescriptor_t m_desc, void* m,
+    const cnnlTensorDescriptor_t v_desc, void* v, const void* diff,
+    const void* lr, const void* beta1, const void* beta2,
+    const void* beta1_power, const void* epsilon) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlApplyAdaMax(handle, var_desc, var, m_desc, m, v_desc, v, grad_desc,
+                      diff, lr, beta1, beta2, beta1_power, epsilon));
+}
+
+/* static */ void MLUCnnl::ApplyMomentum(const ExecutionContext& ctx,
+                                         const cnnlTensorDescriptor_t grad_desc,
+                                         const void* grad,
+                                         const bool use_nesterov,
+                                         const void* lr, const void* momentum,
+                                         void* var, void* accum) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMomentum(handle, grad_desc, var, grad_desc,
+                                          accum, grad_desc, grad, lr, momentum,
+                                          use_nesterov));
+}
+
+/* static */ void MLUCnnl::ApplyKerasMomentum(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc,
+    const void* grad, const bool use_nesterov, const void* lr,
+    const void* momentum, void* var, void* accum) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlKerasMomentum(handle, grad_desc, var, grad_desc, accum, grad_desc,
+                        grad, lr, momentum, use_nesterov));
+}
+
+/* static */ void MLUCnnl::ApplyAdadelta(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc,
+    const void* diff, const void* lr, const void* rho, const void* epsilon,
+    void* var, void* accum, void* accum_update) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlApplyAdadelta(handle, grad_desc, var, grad_desc, accum, grad_desc,
+                        accum_update, grad_desc, diff, lr, rho, epsilon));
+}
+
+/* static */ void MLUCnnl::Scale(
+    const ExecutionContext& ctx, const int axis,
+    const cnnlTensorDescriptor_t input_desc, const void* input,
+    const cnnlTensorDescriptor_t alpha_desc, const void* alpha,
+    const cnnlTensorDescriptor_t beta_desc, const void* beta,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlScale(handle, axis, input_desc, input,
+                                       alpha_desc, alpha, beta_desc, beta,
+                                       output_desc, output));
+}
+
+/* static */ void MLUCnnl::AddN(const ExecutionContext& ctx,
+                                uint32_t input_num,
+                                const cnnlTensorDescriptor_t inputs_desc[],
+                                const void* inputs[],
+                                const cnnlTensorDescriptor_t output_desc,
+                                void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlAddN(handle,
+               inputs_desc, inputs, input_num, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Log(const ExecutionContext& ctx,
+                               cnnlComputationPreference_t prefer,
+                               const cnnlTensorDescriptor_t input_desc,
+                               const void* input,
+                               const cnnlTensorDescriptor_t output_desc,
+                               void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  cnnlLogBase_t log_base = CNNL_LOG_E;
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlLog_v2(handle, prefer, log_base, input_desc,
+                                        input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Matmul(
+    const ExecutionContext& ctx, const bool transpose_a, const bool transpose_b,
+    const cnnlTensorDescriptor_t in0_desc, const void* in0,
+    const cnnlTensorDescriptor_t in1_desc, const void* in1,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  float alpha = 1.0f;
+  float beta = 0.0f;
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlMatMul(handle, transpose_a, transpose_b,
+                 reinterpret_cast<void*>(&alpha), in0_desc, in0, in1_desc, in1,
+                 reinterpret_cast<void*>(&beta), output_desc, output));
+}
+
+/* static */ void MLUCnnl::BatchMatmul(
+    const ExecutionContext& ctx, const bool transpose_a, const bool transpose_b,
+    const cnnlTensorDescriptor_t in0_desc, const void* in0,
+    const cnnlTensorDescriptor_t in1_desc, const void* in1,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBatchMatMulBCastWorkspaceSize(
+      handle, in0_desc, in1_desc, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCast(
+      handle, transpose_a, transpose_b, in0_desc, in0, in1_desc, in1,
+      workspace_ptr, workspace_size, output_desc, output));
+}
+
+/* static */ void MLUCnnl::OpTensor(
+    const ExecutionContext& ctx, const cnnlOpTensorDescriptor_t op_tensor_desc,
+    const cnnlTensorDescriptor_t a_desc, const void* a,
+    const cnnlTensorDescriptor_t b_desc, const void* b,
+    const cnnlTensorDescriptor_t output_desc, void* output,
+    const cnnlDataType_t dtype) {
+  static const int alpha1_int = 1, alpha2_int = 1, beta_int = 0;
+  static const float alpha1_float = 1.f, alpha2_float = 1.f, beta_float = 0.f;
+
+  const void* alpha1_ptr = static_cast<const void*>(&alpha1_float);
+  const void* alpha2_ptr = static_cast<const void*>(&alpha2_float);
+  const void* beta_ptr = static_cast<const void*>(&beta_float);
+
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  size_t workspace_size;
+
+  bool is_dt_float = (dtype == CNNL_DTYPE_FLOAT || dtype == CNNL_DTYPE_HALF);
+
+  // if datatype is not float, we set alpha and beta to be int
+  if (!is_dt_float) {
+    alpha1_ptr = static_cast<const void*>(&alpha1_int);
+    alpha2_ptr = static_cast<const void*>(&alpha2_int);
+    beta_ptr = static_cast<const void*>(&beta_int);
+  }
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetOpTensorWorkspaceSize_v2(
+      handle, op_tensor_desc, alpha1_ptr, a_desc, a, alpha2_ptr, b_desc, b,
+      beta_ptr, output_desc, output, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlOpTensor(
+      handle, op_tensor_desc, alpha1_ptr, a_desc, a, alpha2_ptr, b_desc, b,
+      workspace_ptr, workspace_size, beta_ptr, output_desc, output));
+}
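Note: OpTensor above points alpha1/alpha2/beta at int scalars for integer
dtypes and at float scalars for CNNL_DTYPE_FLOAT/CNNL_DTYPE_HALF, since
cnnlOpTensor interprets the scalar pointers according to the computation
dtype; a wrong-width scalar would silently misscale the result. A hypothetical
call site (descriptor and pointer names are illustrative only), computing
out = a + b on float tensors:

    MLUCnnlOpTensorDesc add_desc(CNNL_OP_TENSOR_ADD, CNNL_DTYPE_FLOAT,
                                 CNNL_NOT_PROPAGATE_NAN);
    MLUCnnl::OpTensor(ctx, add_desc.get(), a_desc.get(), a_ptr, b_desc.get(),
                      b_ptr, out_desc.get(), out_ptr, CNNL_DTYPE_FLOAT);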
+
+/* static */ void MLUCnnl::BiasAddGrad(
+    const ExecutionContext& ctx, const int axis,
+    const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBiasAddBackward(
+      handle, out_backprop_desc, out_backprop, axis, output_desc, output));
+}
+
+/* static */ void MLUCnnl::RandomUniform(
+    const ExecutionContext& ctx, const int num, const cnnlDataType_t data_type,
+    const cnnlRandGenerator_t mlu_generator, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlRandGenerateUniform(
+      handle, mlu_generator, data_type, nullptr, num, 0, 1, output));
+}
+
+/* static */ void MLUCnnl::TopK(
+    const ExecutionContext& ctx, const int k, const int dim, const bool largest,
+    const bool sorted, const cnnlTensorDescriptor_t input_desc,
+    const void* input, const cnnlTensorDescriptor_t values_output_desc,
+    void* values_out, const cnnlTensorDescriptor_t indices_output_desc,
+    void* indices_out) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlTopKTensor(
+      handle, input_desc, input, k, dim, largest, sorted, values_output_desc,
+      values_out, indices_output_desc, indices_out));
+}
+
+/* static */ void MLUCnnl::StridedSlice(
+    const ExecutionContext& ctx, const int begin[], const int end[],
+    const int strides[], const cnnlTensorDescriptor_t input_desc,
+    const void* input, const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlStridedSlice(
+      handle, input_desc, input, begin, end, strides, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Split(const ExecutionContext& ctx, int split_num,
+                                 int axis,
+                                 const cnnlTensorDescriptor_t input_desc,
+                                 const void* input_ptr,
+                                 const cnnlTensorDescriptor_t output_descs[],
+                                 void* output_ptrs[]) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc,
+                                       input_ptr, workspace_ptr, workspace_size,
+                                       output_descs, output_ptrs));
+}
+
+/* static */ void MLUCnnl::GatherFunctor(
+    const ExecutionContext& ctx, const int axis, const int batch_dims,
+    const cnnlTensorDescriptor_t params_desc, const void* params,
+    const cnnlTensorDescriptor_t indices_desc, const void* indices,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlBatchGatherV2(handle, axis, batch_dims, params_desc, params,
+                        indices_desc, indices, output_desc, output));
+}
+
+/* static */ void MLUCnnl::ScatterFunctor(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t params_desc,
+    const void* params, const cnnlTensorDescriptor_t updates_desc,
+    const void* updates, const cnnlTensorDescriptor_t indices_desc,
+    const void* indices, const cnnlScatterRefMode_t mode) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlScatterRef(handle, params_desc, params,
+                                            indices_desc, indices, updates_desc,
+                                            updates, 0, mode));
+}
+
+/* static */ void MLUCnnl::StridedSliceGrad(
+    const ExecutionContext& ctx, const int begin[], const int end[],
+    const int strides[], const cnnlTensorDescriptor_t input_desc,
+    const void* input, const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlStridedSliceBackward(
+      handle, begin, end, strides, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Logic(
+    const ExecutionContext& ctx, const MLULogicMethod log_method,
+    const cnnlTensorDescriptor_t input1_desc, const void* input1,
+    const cnnlTensorDescriptor_t input2_desc, const void* input2,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size = 0;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetLogicOpWorkspaceSize(
+      handle, input1_desc, input2_desc, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlLogicOp(
+      handle, cnnlLogicOp_t(log_method), input1_desc, input1, input2_desc,
+      input2, workspace_ptr, workspace_size, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Select(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t then_desc,
+    const void* p_then, const cnnlTensorDescriptor_t else_desc,
+    const void* p_else, const cnnlTensorDescriptor_t output_desc, void* output,
+    const bool* condition, const int condition_size) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSelect(handle, then_desc, p_then, else_desc,
+                                        p_else, output_desc, output, condition,
+                                        condition_size));
+}
+
+/* static */ void MLUCnnl::GatherNd(const ExecutionContext& ctx,
+                                    const cnnlTensorDescriptor_t params_desc,
+                                    const void* params,
+                                    const cnnlTensorDescriptor_t indices_desc,
+                                    const void* indices,
+                                    const cnnlTensorDescriptor_t output_desc,
+                                    void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGatherNd(
+      handle, params_desc, params, indices_desc, indices, output_desc, output));
+}
+
+/* static */ void MLUCnnl::BatchToSpace(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
+    const void* input, const cnnlTensorDescriptor_t output_desc, void* output,
+    const cnnlSpaceBatchParam_t param) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBatch2spaceWorkspaceSize(
+      handle, input_desc, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatch2space(handle, input_desc, input,
+                                             output_desc, output, param,
+                                             workspace_ptr, workspace_size));
+}
+
+/* static */ void MLUCnnl::BatchToSpaceNd(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
+    const void* input, cnnlSpaceBatchNdDescriptor_t param,
+    void* extra_device_input, size_t extra_input_size,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlBatch2spaceNd_v2(handle, input_desc, input, output_desc, output,
+                           param, extra_device_input, extra_input_size));
+}
+
+/* static */ void MLUCnnl::SoftmaxForward(
+    const ExecutionContext& ctx, cnnlSoftmaxAlgorithm_t algorithm,
+    cnnlSoftmaxMode_t mode, const void* alpha,
+    const cnnlTensorDescriptor_t input_desc, const void* input,
+    const void* beta, const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSoftmaxForward(handle, algorithm, mode, alpha,
+                                                input_desc, input, beta,
+                                                output_desc, output));
+}
+
+/* static */ void MLUCnnl::Softplus(const ExecutionContext& ctx,
+                                    const cnnlTensorDescriptor_t features_desc,
+                                    const void* features,
+                                    const cnnlTensorDescriptor_t output_desc,
+                                    void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  const int beta = 1;
+  const int threshold = 20;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSoftplusForward(
+      handle, features_desc, features, output_desc, output, beta, threshold));
+}
+
+/* static */ void MLUCnnl::SoftplusGrad(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t gradients_desc,
+    const void* gradients, const cnnlTensorDescriptor_t features_desc,
+    const void* features, const cnnlTensorDescriptor_t output_desc,
+    void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  int beta = 1;
+  int threshold = 20;
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlSoftplusBackward(handle, features_desc, features, gradients_desc,
+                           gradients, output_desc, output, beta, threshold));
+}
+
+/* static */ void MLUCnnl::PoolingForward(
+    const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode,
+    const std::vector<int64_t>& output_shape,
+    const cnnlPoolingDescriptor_t pooling_desc, const void* alpha,
+    const cnnlTensorDescriptor_t input_desc, const void* input,
+    const void* beta, const void* extra_input_ptr,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size = 0;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetPoolingWorkspaceSize(
+      handle, pool_mode, output_shape[2], output_shape[1], &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlPoolingForward_v2(
+      handle, pooling_desc, alpha, input_desc, input, beta, extra_input_ptr,
+      output_desc, output, workspace_ptr, workspace_size));
+}
+
+/* static */ void MLUCnnl::Pool3D(
+    const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode,
+    const std::vector<int64_t>& output_shape,
+    const cnnlPoolingDescriptor_t pooling_desc, const void* alpha,
+    const cnnlTensorDescriptor_t input_desc, const void* input,
+    const void* beta, const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size = 0;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetPoolingWorkspaceSize(
+      handle, pool_mode, output_shape[2], output_shape[1], &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlPoolingForward(handle, pooling_desc, alpha, input_desc, input, beta,
+                         output_desc, output, workspace_ptr, workspace_size));
+}
+
+/* static */ void MLUCnnl::RsqrtGrad(const ExecutionContext& ctx,
+                                     const cnnlTensorDescriptor_t data_desc,
+                                     const void* y, const void* diff_y,
+                                     void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlRsqrtBackward(handle, data_desc, y, diff_y, output));
+}
+
+/* static */ void MLUCnnl::SqrtGrad(const ExecutionContext& ctx,
+                                    const cnnlTensorDescriptor_t data_desc,
+                                    const void* y, const void* diff_y,
+                                    void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlSqrtBackward(handle, data_desc, y, diff_y, output));
+}
+
+/* static */ void MLUCnnl::UnsortedSegmentSum(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t data_desc,
+    const void* data, const cnnlTensorDescriptor_t ids_desc,
+    const int* segment_ids, const cnnlTensorDescriptor_t output_desc,
+    void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size = 0;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetUnsortedSegmentSumWorkspaceSize(
+      handle, data_desc, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlUnsortedSegmentSum(
+      handle, data_desc, data, ids_desc, segment_ids, workspace_ptr,
+      workspace_size, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Pad(const ExecutionContext& ctx,
+                               const cnnlTensorDescriptor_t input_desc,
+                               const void* input, const void* paddings,
+                               const void* padding_value,
+                               const cnnlTensorDescriptor_t output_desc,
+                               void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlPad(handle, input_desc, input, paddings,
+                                     padding_value, output_desc, output));
+}
+
+/* static */ void MLUCnnl::OneHot(const ExecutionContext& ctx,
+                                  const cnnlTensorDescriptor_t desc_indices,
+                                  const void* indices, const int depth,
+                                  const void* on_value, const void* off_value,
+                                  const int axis,
+                                  cnnlDataType_t output_data_type,
+                                  void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlOneHot(handle, desc_indices, indices, depth,
+                                        on_value, off_value, axis,
+                                        output_data_type, output));
+}
+
+/* static */ void MLUCnnl::ConvolutionForward(
+    const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc,
+    const void* alpha, const void* beta, const cnnlTensorDescriptor_t bias_desc,
+    const void* bias_ptr, const cnnlTensorDescriptor_t input_desc,
+    const void* input, const cnnlTensorDescriptor_t filter_desc,
+    const void* filter, const cnnlTensorDescriptor_t output_desc,
+    void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  // cnnl: select the best algorithm for the convolution computation.
+  cnnlConvolutionForwardAlgo_t algo;
+  cnnlConvolutionFwdPreference_t preference = CNNL_CONVOLUTION_FWD_FASTEST;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardAlgorithm(
+      handle, conv_desc, input_desc, filter_desc, output_desc, preference,
+      &algo));
+
+  // get workspace size
+  size_t workspace_size = 0;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardWorkspaceSize(
+      handle, input_desc, filter_desc, output_desc, bias_desc, conv_desc, algo,
+      &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlConvolutionForward(
+      handle, conv_desc, algo, alpha, input_desc, input, filter_desc, filter,
+      bias_desc, bias_ptr, workspace_ptr, workspace_size, beta, output_desc,
+      output));
+}
+
+/* static */ void MLUCnnl::Tile(const ExecutionContext& ctx,
+                                const cnnlTensorDescriptor_t input_desc,
+                                const void* input,
+                                const cnnlTensorDescriptor_t output_desc,
+                                void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlTile(handle, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::SoftmaxCrossEntropyWithLogits(
+    const ExecutionContext& ctx, cnnlSoftmaxMode_t mode,
+    cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc,
+    const void* logits_in, const cnnlTensorDescriptor_t label_desc,
+    const void* labels_in, const cnnlTensorDescriptor_t loss_out_desc,
+    void* loss_out, const cnnlTensorDescriptor_t back_out_desc,
+    void* back_out) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSoftmaxCrossEntropyWithLogits_v2(
+      handle, mode, prefer, input_desc, logits_in, label_desc, labels_in,
+      loss_out_desc, loss_out, back_out_desc, back_out));
+}
+
+/* static */ void MLUCnnl::Reduce(
+    const ExecutionContext& ctx, const bool need_workspace,
+    const cnnlReduceDescriptor_t reduction_desc, const void* alpha,
+    const cnnlTensorDescriptor_t input_desc, const void* input,
+    const size_t indices_size, void* indices, const void* beta,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size = 0;
+  void* workspace_ptr = nullptr;
+  Tensor workspace;
+  if (need_workspace) {
+    PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetReduceOpWorkspaceSize(
+        handle, input_desc, output_desc, reduction_desc, &workspace_size));
+
+    auto& dev_ctx = GetDevCtxFromCTX(ctx);
+    workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+        {static_cast<int64_t>(workspace_size)}, dev_ctx);
+
+    workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+  }
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlReduce(
+      handle, reduction_desc, workspace_ptr, workspace_size, alpha, input_desc,
+      input, indices_size, indices, beta, output_desc, output));
+}
+
+/* static */ void MLUCnnl::FloorDiv(
+    const ExecutionContext& ctx, cnnlComputationPreference_t prefer,
+    const cnnlTensorDescriptor_t input1_desc, const void* input1,
+    const cnnlTensorDescriptor_t input2_desc, const void* input2,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetFloorDivWorkspaceSize(
+      handle, input1_desc, input2_desc, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlFloorDiv_v2(handle, prefer, input1_desc, input1, input2_desc, input2,
+                      output_desc, output, workspace_ptr, workspace_size));
+}
+
+/* static */ void MLUCnnl::FloorMod(const ExecutionContext& ctx,
+                                    const cnnlTensorDescriptor_t input1_desc,
+                                    const void* input1,
+                                    const cnnlTensorDescriptor_t input2_desc,
+                                    const void* input2,
+                                    const cnnlTensorDescriptor_t output_desc,
+                                    void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetFloorModWorkspaceSize(
+      handle, input1_desc, input2_desc, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlFloorMod(handle, input1_desc, input1, input2_desc, input2,
+                   output_desc, output, workspace_ptr, workspace_size));
+}
+
+/* static */ void MLUCnnl::Maximum(const ExecutionContext& ctx,
+                                   const cnnlTensorDescriptor_t input1_desc,
+                                   const void* input1,
+                                   const cnnlTensorDescriptor_t input2_desc,
+                                   const void* input2,
+                                   const cnnlTensorDescriptor_t output_desc,
+                                   void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlGetMaximumWorkspaceSize(handle, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlMaximum(handle, input1_desc, input1, input2_desc, input2, output_desc,
+                  output, workspace_ptr, workspace_size));
+}
+
+/* static */ void MLUCnnl::Minimum(const ExecutionContext& ctx,
+                                   const cnnlTensorDescriptor_t input1_desc,
+                                   const void* input1,
+                                   const cnnlTensorDescriptor_t input2_desc,
+                                   const void* input2,
+                                   const cnnlTensorDescriptor_t output_desc,
+                                   void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlGetMinimumWorkspaceSize(handle, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlMinimum(handle, input1_desc, input1, input2_desc, input2, output_desc,
+                  output, workspace_ptr, workspace_size));
+}
+
+/* static */ void MLUCnnl::PowR(
+    const ExecutionContext& ctx, cnnlComputationPreference_t prefer,
+    const cnnlTensorDescriptor_t input1_desc, const void* input1,
+    const cnnlTensorDescriptor_t input2_desc, const void* input2,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetPowRWorkspaceSize(
+      handle, input1_desc, input2_desc, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlPowR_v2(handle, prefer, input1_desc, input1,
+                                         input2_desc, input2, workspace_ptr,
+                                         workspace_size, output_desc, output));
+}
+
+/* static */ void MLUCnnl::DivNoNan(
+    const ExecutionContext& ctx, cnnlComputationPreference_t prefer,
+    const cnnlTensorDescriptor_t input1_desc, const void* input1,
+    const cnnlTensorDescriptor_t input2_desc, const void* input2,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetDivNoNanWorkspaceSize(
+      handle, input1_desc, input2_desc, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlDivNoNan_v2(handle, prefer, input1_desc, input1, input2_desc, input2,
+                      workspace_ptr, workspace_size, output_desc, output));
+}
+
+/* static */ void MLUCnnl::SquaredDifference(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc,
+    const void* input1, const cnnlTensorDescriptor_t input2_desc,
+    const void* input2, const cnnlTensorDescriptor_t output_desc,
+    void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSquaredDifferenceWorkspaceSize(
+      handle, input1_desc, input2_desc, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSquaredDifference(
+      handle, input1_desc, input1, input2_desc, input2, output_desc, output,
+      workspace_ptr, workspace_size));
+}
+
+/* static */ void MLUCnnl::L2Loss(const ExecutionContext& ctx,
+                                  const cnnlTensorDescriptor_t input_desc,
+                                  const void* input, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlL2Loss(handle, input_desc, input, output));
+}
+
+/* static */ void MLUCnnl::Abs(const ExecutionContext& ctx,
+                               const cnnlTensorDescriptor_t input_desc,
+                               const void* input,
+                               const cnnlTensorDescriptor_t output_desc,
+                               void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlAbs(handle, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Neg(const ExecutionContext& ctx,
+                               const cnnlTensorDescriptor_t input_desc,
+                               const void* input,
+                               const cnnlTensorDescriptor_t output_desc,
+                               void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlNegTensor(handle, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Floor(const ExecutionContext& ctx,
+                                 const cnnlTensorDescriptor_t input_desc,
+                                 const void* input,
+                                 const cnnlTensorDescriptor_t output_desc,
+                                 void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlFloor(handle, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Ceil(const ExecutionContext& ctx,
+                                const cnnlTensorDescriptor_t input_desc,
+                                const void* input,
+                                const cnnlTensorDescriptor_t output_desc,
+                                void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlCeil(handle, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::IsNan(const ExecutionContext& ctx,
+                                 const cnnlTensorDescriptor_t input_desc,
+                                 const void* input,
+                                 const cnnlTensorDescriptor_t output_desc,
+                                 void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlIsNan(handle, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Square(const ExecutionContext& ctx,
+                                  const cnnlTensorDescriptor_t input_desc,
+                                  const void* input,
+                                  const cnnlTensorDescriptor_t output_desc,
+                                  void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlSquare(handle, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Sqrt(const ExecutionContext& ctx,
+                                cnnlComputationPreference_t prefer,
+                                const cnnlTensorDescriptor_t input_desc,
+                                const void* input,
+                                const cnnlTensorDescriptor_t output_desc,
+                                void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlSqrt_v2(handle, prefer, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Rsqrt(const ExecutionContext& ctx,
+                                 cnnlComputationPreference_t prefer,
+                                 const cnnlTensorDescriptor_t input_desc,
+                                 const void* input,
+                                 const cnnlTensorDescriptor_t output_desc,
+                                 void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlRsqrt_v2(handle, prefer, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Cos(const ExecutionContext& ctx,
+                               cnnlComputationPreference_t prefer,
+                               const cnnlTensorDescriptor_t input_desc,
+                               const void* input,
+                               const cnnlTensorDescriptor_t output_desc,
+                               void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlCos_v2(handle, prefer, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Sin(const ExecutionContext& ctx,
+                               cnnlComputationPreference_t prefer,
+                               const cnnlTensorDescriptor_t input_desc,
+                               const void* input,
+                               const cnnlTensorDescriptor_t output_desc,
+                               void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlSin_v2(handle, prefer, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::TrigonForward(
+    const ExecutionContext& ctx, const cnnlTrigonDescriptor_t trigon_desc,
+    const cnnlTensorDescriptor_t input_desc, const void* input,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlTrigonForward(handle, trigon_desc, input_desc,
+                                               input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Exp(const ExecutionContext& ctx,
+                               cnnlComputationPreference_t prefer,
+                               const cnnlTensorDescriptor_t input_desc,
+                               const void* input,
+                               const cnnlTensorDescriptor_t output_desc,
+                               void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlExp_v2(handle, prefer, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::Sign(const ExecutionContext& ctx,
+                                const cnnlTensorDescriptor_t input_desc,
+                                const void* input,
+                                const cnnlTensorDescriptor_t output_desc,
+                                void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlSign(handle, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::IsFinite(const ExecutionContext& ctx,
+                                    const cnnlTensorDescriptor_t input_desc,
+                                    const void* input,
+                                    const cnnlTensorDescriptor_t output_desc,
+                                    void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlIsFinite(handle, input_desc, input, output_desc, output));
+}
+
+/* static */ void MLUCnnl::IsNanInf(const ExecutionContext& ctx,
+                                    const cnnlTensorDescriptor_t input_desc,
+                                    const void* input, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  // TODO(CTR-3849): output type should be void*, but now bool*.
+ PADDLE_ENFORCE_MLU_SUCCESS( + cnnlNanInf(handle, input_desc, input, reinterpret_cast(output))); +} + +/* static */ void MLUCnnl::Erf(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlErf_v2(handle, prefer, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Log1p(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlLog1p(handle, prefer, input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::LogicalNot(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlLogicOp(handle, CNNL_LOGIC_OP_NOT, input_desc, + input, input_desc, input, nullptr, 0, + output_desc, output)); +} + +/* static */ void MLUCnnl::DynamicStitch( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t* indices_desc, + const int** indices, const cnnlTensorDescriptor_t* data_desc, + const void** data, const int size, int* indices_dims, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetDynamicStitchWorkspaceSize(handle, size, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDynamicStitch( + handle, indices_desc, indices, data_desc, data, size, indices_dims, + workspace_ptr, workspace_size, output_desc, output)); +} + +/* static */ void MLUCnnl::CropAndResize( + const ExecutionContext& ctx, const std::string method_name, + const float extrapolation_value, const cnnlTensorDescriptor_t image_desc, + const void* image, const cnnlTensorDescriptor_t boxes_desc, + const void* boxes, const cnnlTensorDescriptor_t box_index_desc, + const void* box_index, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + cnnlCropAndResizeMode_t mode = CNNL_CROP_AND_RESIZE_BILINEAR; + if (method_name == "nearest") { + mode = CNNL_CROP_AND_RESIZE_NEAREST; + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCropAndResize( + handle, image_desc, image, boxes_desc, boxes, box_index_desc, box_index, + mode, extrapolation_value, output_desc, output)); +} + +/* static */ void MLUCnnl::CropAndResizeBackwardImage( + const ExecutionContext& ctx, const std::string method_name, + const cnnlTensorDescriptor_t grads_desc, const void* grads, + const cnnlTensorDescriptor_t boxes_desc, const void* boxes, + const cnnlTensorDescriptor_t box_idx_desc, const void* box_idx, + const cnnlTensorDescriptor_t grads_image_desc, void* grads_image) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + cnnlCropAndResizeMode_t mode = CNNL_CROP_AND_RESIZE_BILINEAR; + if (method_name == "nearest") { + mode = CNNL_CROP_AND_RESIZE_NEAREST; + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCropAndResizeBackwardImage( + handle, grads_desc, grads, boxes_desc, boxes, 
box_idx_desc, box_idx, mode, + grads_image_desc, grads_image)); +} + +/* static */ void MLUCnnl::CropAndResizeBackwardBoxes( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t image_desc, + const void* image, const cnnlTensorDescriptor_t boxes_desc, + const void* boxes, const cnnlTensorDescriptor_t box_idx_desc, + const void* box_idx, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + cnnlCropAndResizeMode_t mode = CNNL_CROP_AND_RESIZE_BILINEAR; + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCropAndResizeBackwardBoxes( + handle, input_desc, input, image_desc, image, boxes_desc, boxes, + box_idx_desc, box_idx, output_desc, output, mode)); +} + +/* static */ void MLUCnnl::Interp( + const ExecutionContext& ctx, const cnnlInterpMode_t mode, + const bool align_corners, const bool half_pixel_centers, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlInterp_v2(handle, align_corners, half_pixel_centers, mode, NULL, true, + input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::InterpBackward( + const ExecutionContext& ctx, const cnnlInterpBackwardMode_t mode, + const bool align_corners, const bool half_pixel_centers, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlInterpBackward(handle, align_corners, half_pixel_centers, mode, + input_desc, input, output_desc, output)); +} + +/* static */ void MLUCnnl::Cast(const ExecutionContext& ctx, + cnnlCastDataType_t cast_type, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCastDataType(handle, input_desc, input, + cast_type, output_desc, output)); +} + +/* static */ void MLUCnnl::PoolingBackward( + const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, + const void* alpha, const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, + const cnnlTensorDescriptor_t x_desc, const void* x, const void* beta, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlPoolingBackward( + handle, const_cast(pooling_desc), alpha, y_desc, + y, diff_y_desc, diff_y, x_desc, x, beta, diff_x_desc, diff_x)); +} + +/* static */ void MLUCnnl::NonMaxSuppression( + const ExecutionContext& ctx, const cnnlNmsDescriptor_t nms_desc, + const cnnlTensorDescriptor_t boxes_desc, const void* boxes, + const cnnlTensorDescriptor_t confidence_desc, const void* confidence, + const cnnlTensorDescriptor_t output_desc, void* output, void* output_size) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetNmsWorkspaceSize_v2(handle, confidence_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlNms_v2( + handle, nms_desc, boxes_desc, boxes, confidence_desc, confidence, + 
workspace_ptr, workspace_size, output_desc, output, output_size));
+}
+
+/* static */ void MLUCnnl::PoolingIndex(
+    const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc,
+    const cnnlTensorDescriptor_t x_desc, const void* x,
+    const cnnlTensorDescriptor_t y_desc, void* y) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlPoolingIndex(
+      handle, const_cast<cnnlPoolingDescriptor_t>(pooling_desc), x_desc, x,
+      y_desc, y));
+}
+
+/* static */ void MLUCnnl::SpaceToBatch(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
+    const void* input, const cnnlTensorDescriptor_t output_desc, void* output,
+    const int64_t block_shape[]) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSpace2batchWorkspaceSize(
+      handle, input_desc, output_desc, &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  cnnlSpaceBatchParam_t param = {static_cast<uint32_t>(block_shape[0]),
+                                 static_cast<uint32_t>(block_shape[1])};
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSpace2batch(handle, input_desc, input,
+                                             output_desc, output, param,
+                                             workspace_ptr, workspace_size));
+}
+
+/* static */ void MLUCnnl::SpaceToBatchNd(
+    const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
+    const void* input, cnnlSpaceBatchNdDescriptor_t param,
+    void* extra_device_input, size_t extra_host_input,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlSpace2batchNd_v2(handle, input_desc, input, output_desc, output,
+                           param, extra_device_input, extra_host_input));
+}
+
+/* static */ void MLUCnnl::FusedBatchNorm(
+    const ExecutionContext& ctx, const bool is_training,
+    const cnnlTensorDescriptor_t x_desc, const void* x,
+    const cnnlTensorDescriptor_t scale_desc, const void* scale,
+    const void* offset, const void* running_mean_input,
+    const void* running_variance_input, float epsilon, float momentum,
+    const cnnlTensorDescriptor_t output_desc, void* output,
+    void* running_mean_output, void* running_var_output,
+    void* saved_batch_mean_output, void* saved_batch_var_output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  if (is_training) {
+    /*
+     * In Paddle, running_mean_output = momentum * running_mean_input +
+     * (1 - momentum) * batch_mean, whereas in CNNL,
+     * running_mean_output = (1 - momentum) * running_mean_input +
+     * momentum * batch_mean. So we pass (1.0 - momentum) as the momentum
+     * parameter.
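+     * For example, with momentum = 0.9 Paddle expects
+     * 0.9 * running_mean_input + 0.1 * batch_mean; passing 0.1 to CNNL
+     * yields (1 - 0.1) * running_mean_input + 0.1 * batch_mean, the same
+     * value.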
+ */ + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchNormForwardTraining( + handle, NULL, NULL, x_desc, x, scale_desc, scale, offset, + running_mean_output, running_var_output, epsilon, 1.0 - momentum, + output_desc, output, saved_batch_mean_output, saved_batch_var_output)); + } else { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchNormForwardInference( + handle, NULL, NULL, x_desc, x, scale_desc, scale, offset, + running_mean_input, running_variance_input, epsilon, output_desc, + output)); + } +} + +/* static */ void MLUCnnl::FusedBatchNormGrad( + const ExecutionContext& ctx, const bool is_training, + const cnnlTensorDescriptor_t y_backprop_desc, const void* y_backprop, + const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t scale_desc, const void* scale, + const void* saved_mean, const void* saved_var, float epsilon, + const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop, + void* scale_backprop, void* offset_backprop) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + if (is_training) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchNormBackward( + handle, NULL, NULL, NULL, NULL, x_desc, x, y_backprop_desc, y_backprop, + scale_desc, scale, saved_mean, saved_var, epsilon, x_backprop_desc, + x_backprop, scale_backprop, offset_backprop)); + } else { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlFrozenBatchNormBackward( + handle, x_desc, x, y_backprop_desc, y_backprop, scale_desc, scale, + saved_mean, saved_var, epsilon, x_backprop_desc, x_backprop, + scale_backprop, offset_backprop)); + } +} + +/* static */ void MLUCnnl::QuantizeParam( + const ExecutionContext& ctx, const cnnlQuantizeMode_t mode, + const int bitwidth, const cnnlTensorDescriptor_t input_desc, + const void* input, void* position, void* scale, void* offset) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetQuantizeParamWorkspaceSize(handle, input_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeParam( + handle, mode, input_desc, input, bitwidth, workspace_ptr, workspace_size, + position, scale, offset)); +} + +/* static */ void MLUCnnl::Conv2D( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, + const void* input_position, const void* input_scale, + const void* input_offset, const void* filter_position, + const void* filter_scale, const void* filter_offset, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t filter_desc, const void* filter, + const cnnlTensorDescriptor_t bias_desc, const void* bias, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(input_desc, dt_onchip)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(filter_desc, dt_onchip)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(output_desc, tensor_dtype)); + + cnnlConvolutionForwardAlgo_t algo; + const cnnlConvolutionFwdPreference_t preference = + CNNL_CONVOLUTION_FWD_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardAlgorithm( + handle, conv_desc, input_desc, filter_desc, output_desc, preference, + &algo)); + + size_t workspace_size = 0; + 
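+  // Standard CNNL workspace pattern: query the size the chosen algorithm
+  // needs, allocate a temporary device tensor of that size, then hand the
+  // buffer and its size to the compute call.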
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardWorkspaceSize( + handle, input_desc, filter_desc, output_desc, bias_desc, conv_desc, algo, + &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeConvolutionForward( + handle, conv_desc, algo, nullptr /*alpha*/, input_desc, input, + input_position, input_scale, input_offset, filter_desc, filter, + filter_position, filter_scale, filter_offset, bias_desc, bias, + workspace_ptr, workspace_size, nullptr /*beta*/, output_desc, output)); +} + +/* static */ void MLUCnnl::FusedConvBNQuantify( + const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc, + const void* epsilon_ptr, const int fused_ops_number, + const cnnlDataType_t tensor_dtype, const int input_position, + const float input_scale, const int filter_position, + const float filter_scale, const cnnlTensorDescriptor_t scale_desc, + const void* scale_ptr, const cnnlTensorDescriptor_t offset_desc, + const void* offset_ptr, const cnnlTensorDescriptor_t mean_desc, + const void* mean_ptr, const cnnlTensorDescriptor_t variance_desc, + const void* variance_ptr, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t filter_desc, + const void* filter, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(input_desc, CNNL_DTYPE_INT16)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(filter_desc, CNNL_DTYPE_INT16)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(output_desc, tensor_dtype)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetTensorDescriptorPositionAndScale( + input_desc, input_position, input_scale)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetTensorDescriptorPositionAndScale( + filter_desc, filter_position, filter_scale)); + + cnnlFusedOpsPlan_t fusion_plan = nullptr; + cnnlActivationDescriptor_t active_desc = nullptr; + cnnlFusedOpsConstParamPack_t cparam_pack = nullptr; + cnnlFusedOpsVariantParamPack_t vparam_pack = nullptr; + cnnlConvolutionForwardAlgo_t algo; + cnnlFusedOps_t fusion_type = CNNL_CONV_SCALE_BN_ACTIVATION; + cnnlConvolutionCastMode_t cast_mode = CNNL_OFFLINE_SYMMETRIC_QUANTIZE; + cnnlConvolutionFwdPreference_t preference = CNNL_CONVOLUTION_FWD_FASTEST; + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionForwardAlgorithm( + handle, conv_desc, input_desc, filter_desc, output_desc, preference, + &algo)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateFusedOpsPlan(&fusion_plan, fusion_type)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlCreateFusedOpsConstParamPack(&cparam_pack, fusion_type)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlCreateFusedOpsVariantParamPack(&vparam_pack, fusion_type)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_XDESC, input_desc)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetFusedOpsVariantParamPackAttribute(vparam_pack, CNNL_PTR_X, input)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_WDESC, filter_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_W, filter)); + + if (fused_ops_number > 1) { + cnnlCreateActivationDescriptor(&active_desc); + cnnlNanPropagation_t nan_opt = CNNL_NOT_PROPAGATE_NAN; + 
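+    // More than one fused op: configure a ReLU activation descriptor and
+    // attach it to the fusion's const param pack.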
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor( + active_desc, CNNL_ACTIVATION_RELU, nan_opt, 0.0)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_ACTIVATION_DESC, active_desc)); + } + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, scale_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_BN_WEIGHT, scale_ptr)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, offset_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_BN_BIAS, offset_ptr)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, mean_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_BN_MEAN, mean_ptr)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_BN_WEIGHT_BIAS_MEAN_VAR_DESC, variance_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_BN_VAR, variance_ptr)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_CONV_DESC, conv_desc)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_SCALAR_CONV_FWD_ALGO, &algo)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_SCALAR_CONV_FWD_CAST_MODE, &cast_mode)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_SCALAR_BN_EPSILON, epsilon_ptr)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsConstParamPackAttribute( + cparam_pack, CNNL_YDESC, output_desc)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_Y, output)); + + // get workspace size + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlMakeFusedOpsPlan(handle, fusion_plan, cparam_pack, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + if (workspace_size > 0) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_PTR_WORKSPACE, workspace_ptr)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetFusedOpsVariantParamPackAttribute( + vparam_pack, CNNL_SCALAR_WORKSPACE_SIZE, &workspace_size)); + } + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlFusedOpsExecute(handle, fusion_plan, vparam_pack)); + + if (active_desc) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyActivationDescriptor(active_desc)); + } + + if (cparam_pack) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyFusedOpsConstParamPack(cparam_pack)); + } + + if (vparam_pack) { + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlDestroyFusedOpsVariantParamPack(vparam_pack)); + } + + if (fusion_plan) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyFusedOpsPlan(fusion_plan)); + } +} + +/* static */ void MLUCnnl::ConvBackpropInput( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlTensorDescriptor_t filter_desc, const void* filter, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + cnnlConvolutionBwdDataAlgo_t algo; + const 
cnnlConvolutionBwdDataPreference_t preference = + CNNL_CONVOLUTION_BWD_DATA_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataAlgorithm( + handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc, + preference, &algo)); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataWorkspaceSize( + handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc, algo, + &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConvolutionBackwardData( + handle, nullptr /*alpha*/, filter_desc, filter, out_backprop_desc, + out_backprop, conv_desc, algo, workspace_ptr, workspace_size, + nullptr /*beta*/, in_backprop_desc, in_backprop)); +} + +/* static */ void MLUCnnl::QuantizeConvBackpropInput( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, + const void* filter_position, const void* filter_scale, + const void* filter_offset, const void* out_backprop_position, + const void* out_backprop_scale, const void* out_backprop_offset, + const cnnlTensorDescriptor_t filter_desc, const void* filter, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(filter_desc, dt_onchip)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(out_backprop_desc, dt_onchip)); + + cnnlConvolutionBwdDataAlgo_t algo; + const cnnlConvolutionBwdDataPreference_t preference = + CNNL_CONVOLUTION_BWD_DATA_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataAlgorithm( + handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc, + preference, &algo)); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardDataWorkspaceSize( + handle, filter_desc, out_backprop_desc, conv_desc, in_backprop_desc, algo, + &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeConvolutionBackwardData( + handle, nullptr /*alpha*/, filter_desc, filter, filter_position, + filter_scale, filter_offset, out_backprop_desc, out_backprop, + out_backprop_position, out_backprop_scale, out_backprop_offset, conv_desc, + algo, workspace_ptr, workspace_size, nullptr /*beta*/, in_backprop_desc, + in_backprop)); +} + +/* static */ void MLUCnnl::ConvBackpropFilter( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + cnnlConvolutionBwdFilterAlgo_t algo; + const cnnlConvolutionBwdFilterPreference_t preference = + CNNL_CONVOLUTION_BWD_FILTER_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterAlgorithm( + handle, conv_desc, input_desc, out_backprop_desc, filter_backprop_desc, + preference, &algo)); + + size_t workspace_size = 0; + 
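+  // Same selection-then-sizing flow as the backward-data path: the fastest
+  // backward-filter algorithm was chosen above, and its workspace is
+  // queried next.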
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterWorkspaceSize( + handle, input_desc, out_backprop_desc, filter_backprop_desc, conv_desc, + algo, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConvolutionBackwardFilter( + handle, nullptr /*alpha*/, input_desc, input, out_backprop_desc, + out_backprop, conv_desc, algo, workspace_ptr, workspace_size, + nullptr /*beta*/, filter_backprop_desc, filter_backprop)); +} + +/* static */ void MLUCnnl::QuantizeConvBackpropFilter( + const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, + const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, + const void* input_position, const void* input_scale, + const void* input_offset, const void* out_backprop_position, + const void* out_backprop_scale, const void* out_backprop_offset, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, + const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(input_desc, dt_onchip)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTensorDescriptorOnchipDataType(out_backprop_desc, dt_onchip)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetTensorDescriptorOnchipDataType( + filter_backprop_desc, tensor_dtype)); + + cnnlConvolutionBwdFilterAlgo_t algo; + const cnnlConvolutionBwdFilterPreference_t preference = + CNNL_CONVOLUTION_BWD_FILTER_FASTEST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterAlgorithm( + handle, conv_desc, input_desc, out_backprop_desc, filter_backprop_desc, + preference, &algo)); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetConvolutionBackwardFilterWorkspaceSize( + handle, input_desc, out_backprop_desc, filter_backprop_desc, conv_desc, + algo, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeConvolutionBackwardFilter( + handle, nullptr /*alpha*/, input_desc, input, input_position, input_scale, + input_offset, out_backprop_desc, out_backprop, out_backprop_position, + out_backprop_scale, out_backprop_offset, conv_desc, algo, workspace_ptr, + workspace_size, nullptr /*beta*/, filter_backprop_desc, filter_backprop)); +} + +/* static */ void MLUCnnl::QuantizeMatMul( + const ExecutionContext& ctx, const bool transpose_a, const bool transpose_b, + const cnnlTensorDescriptor_t a_desc, const void* a, const void* a_position, + const void* a_scale, const void* a_offset, + const cnnlTensorDescriptor_t b_desc, const void* b, const void* b_position, + const void* b_scale, const void* b_offset, const cnnlDataType_t quant_type, + const cnnlDataType_t data_type, const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + // Set onchip data type + cnnlSetTensorDescriptorOnchipDataType(a_desc, quant_type); + cnnlSetTensorDescriptorOnchipDataType(b_desc, quant_type); + cnnlSetTensorDescriptorOnchipDataType(output_desc, data_type); + + // Create and set matmul descriptor + cnnlMatMulDescriptor_t matmul_desc; + 
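+  // The descriptor carries the on-chip compute dtype and the transpose
+  // flags of both operands, each set via cnnlSetMatMulDescAttr below.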
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulDescCreate(&matmul_desc));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetMatMulDescAttr(
+      matmul_desc, CNNL_MATMUL_DESC_COMPUTE_TYPE, &data_type, sizeof(int)));
+  int transpose_a_int = static_cast<int>(transpose_a);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetMatMulDescAttr(
+      matmul_desc, CNNL_MATMUL_DESC_TRANSA, &(transpose_a_int), sizeof(int)));
+  int transpose_b_int = static_cast<int>(transpose_b);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetMatMulDescAttr(
+      matmul_desc, CNNL_MATMUL_DESC_TRANSB, &(transpose_b_int), sizeof(int)));
+
+  // Create and get matmul algorithm
+  cnnlMatMulAlgo_t algo;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulAlgoCreate(&algo));
+  const cnnlMatMulPreference_t preference = CNNL_MATMUL_FASTEST;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeMatMulAlgorithm(
+      handle, matmul_desc, a_desc, b_desc, output_desc, preference, &algo));
+
+  // Get workspace
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeMatMulWorkspaceSize(
+      handle, matmul_desc, a_desc, b_desc, output_desc, algo,
+      &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  // Compute
+  float alpha = 1.0;
+  float beta = 0.0;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeMatMul(
+      handle, matmul_desc, reinterpret_cast<void*>(&alpha), a_desc, a,
+      a_position, a_scale, a_offset, b_desc, b, b_position, b_scale, b_offset,
+      reinterpret_cast<void*>(&beta), output_desc, output, algo, workspace_ptr,
+      workspace_size));
+
+  // Destroy matmul descriptor and algorithm
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulDescDestroy(matmul_desc));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatMulAlgoDestroy(algo));
+}
+
+/* static */ void MLUCnnl::QuantizeBatchMatMul(
+    const ExecutionContext& ctx, const bool adj_x, const bool adj_y,
+    const cnnlTensorDescriptor_t in0_desc, const void* in0,
+    const void* in0_position, const void* in0_scale, const void* in0_offset,
+    const cnnlTensorDescriptor_t in1_desc, const void* in1,
+    const void* in1_position, const void* in1_scale, const void* in1_offset,
+    const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  // Set onchip data type
+  cnnlSetTensorDescriptorOnchipDataType(in0_desc, quant_type);
+  cnnlSetTensorDescriptorOnchipDataType(in1_desc, quant_type);
+  cnnlSetTensorDescriptorOnchipDataType(output_desc, data_type);
+
+  // Create and set batch matmul descriptor
+  cnnlBatchMatMulDescriptor_t bmm_desc;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulDescCreate(&bmm_desc));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulDescAttr(
+      bmm_desc, CNNL_BMM_DESC_COMPUTE_TYPE, &data_type, sizeof(int)));
+  int transpose_a_int = static_cast<int>(adj_x);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulDescAttr(
+      bmm_desc, CNNL_BMM_DESC_TRANSA, &(transpose_a_int), sizeof(int)));
+  int transpose_b_int = static_cast<int>(adj_y);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulDescAttr(
+      bmm_desc, CNNL_BMM_DESC_TRANSB, &(transpose_b_int), sizeof(int)));
+
+  // Create and get batch matmul algorithm
+  cnnlBatchMatMulAlgo_t algo;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulAlgoCreate(&algo));
+  const cnnlBatchMatMulPreference_t preference = CNNL_BMM_FASTEST;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulAlgorithm(
+      handle, bmm_desc, in0_desc, in1_desc, output_desc, preference, &algo));
+
+  // Get workspace
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulWorkspaceSize(
+      handle, bmm_desc, in0_desc, in1_desc, output_desc, algo,
+      &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  // Compute
+  float alpha = 1.0;
+  float beta = 0.0;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeBatchMatMul(
+      handle, bmm_desc, reinterpret_cast<void*>(&alpha), in0_desc, in0,
+      in0_position, in0_scale, in0_offset, in1_desc, in1, in1_position,
+      in1_scale, in1_offset, reinterpret_cast<void*>(&beta), output_desc,
+      output, algo, workspace_ptr, workspace_size));
+
+  // Destroy matmul descriptor and algorithm
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulDescDestroy(bmm_desc));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulAlgoDestroy(algo));
+}
+
+/* static */ void MLUCnnl::QuantizeBatchMatMulBCast(
+    const ExecutionContext& ctx, const bool adj_x, const bool adj_y,
+    const cnnlTensorDescriptor_t in0_desc, const void* in0,
+    const void* in0_position, const void* in0_scale, const void* in0_offset,
+    const cnnlTensorDescriptor_t in1_desc, const void* in1,
+    const void* in1_position, const void* in1_scale, const void* in1_offset,
+    const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
+    const cnnlTensorDescriptor_t output_desc, void* output) {
+  cnnlHandle_t handle = GetHandleFromCTX(ctx);
+
+  // Set onchip data type
+  cnnlSetTensorDescriptorOnchipDataType(in0_desc, quant_type);
+  cnnlSetTensorDescriptorOnchipDataType(in1_desc, quant_type);
+  cnnlSetTensorDescriptorOnchipDataType(output_desc, data_type);
+
+  // Create and set batch matmul descriptor
+  cnnlBatchMatMulBCastDescriptor_t bmm_bcast_desc;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastDescCreate(&bmm_bcast_desc));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulBCastDescAttr(
+      bmm_bcast_desc, CNNL_BMM_BCAST_DESC_COMPUTE_TYPE, &data_type,
+      sizeof(int)));
+  int transpose_a_int = static_cast<int>(adj_x);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulBCastDescAttr(
+      bmm_bcast_desc, CNNL_BMM_BCAST_DESC_TRANSA, &(transpose_a_int),
+      sizeof(int)));
+  int transpose_b_int = static_cast<int>(adj_y);
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetBatchMatMulBCastDescAttr(
+      bmm_bcast_desc, CNNL_BMM_BCAST_DESC_TRANSB, &(transpose_b_int),
+      sizeof(int)));
+
+  // Create and get batch matmul algorithm
+  cnnlBatchMatMulBCastAlgo_t algo;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastAlgoCreate(&algo));
+  const cnnlBatchMatMulBCastPreference_t preference = CNNL_BMM_BCAST_FASTEST;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulBCastAlgorithm(
+      handle, bmm_bcast_desc, in0_desc, in1_desc, output_desc, preference,
+      &algo));
+
+  // Get workspace
+  size_t workspace_size;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetQuantizeBatchMatMulBCastWorkspaceSize(
+      handle, bmm_bcast_desc, in0_desc, in1_desc, output_desc, algo,
+      &workspace_size));
+
+  auto& dev_ctx = GetDevCtxFromCTX(ctx);
+  Tensor workspace = ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
+      {static_cast<int64_t>(workspace_size)}, dev_ctx);
+  void* workspace_ptr = workspace.mutable_data(ctx.GetPlace());
+
+  // Compute
+  float alpha = 1.0;
+  float beta = 0.0;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlQuantizeBatchMatMulBCast(
+      handle, bmm_bcast_desc, reinterpret_cast<void*>(&alpha), in0_desc, in0,
+      in0_position, in0_scale, in0_offset, in1_desc, in1, in1_position,
+      in1_scale, in1_offset, reinterpret_cast<void*>(&beta), output_desc,
+      output, algo, workspace_ptr, workspace_size));
+
+  // Destroy matmul descriptor and algorithm
+
PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastDescDestroy(bmm_bcast_desc)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBatchMatMulBCastAlgoDestroy(algo)); +} + +/* static */ void MLUCnnl::Transpose( + const ExecutionContext& ctx, const std::vector perm, + const int input_dim, const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + cnnlTransposeDescriptor_t perm_desc; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateTransposeDescriptor(&perm_desc)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetTransposeDescriptor(perm_desc, input_dim, perm.data())); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetTransposeWorkspaceSize( + handle, input_desc, perm_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlTranspose_v2(handle, perm_desc, input_desc, + input, output_desc, output, + workspace_ptr, workspace_size)); + if (perm_desc) { + PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyTransposeDescriptor(perm_desc)); + } +} + +/* static */ void MLUCnnl::MatrixBandPart( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t data_desc, + const void* input, const int num_lower, const int num_upper, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlMatrixBandPart(handle, data_desc, input, + num_lower, num_upper, output)); +} + +/* static */ void MLUCnnl::NumTrue(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, Tensor index, + uint32_t* num_true) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetNumTrueWorkspaceSize(handle, x_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + index = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* index_ptr = index.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlNumTrue( + handle, x_desc, x, static_cast(index_ptr), num_true)); +} + +/* static */ void MLUCnnl::Where(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, const uint32_t* strides, + const uint32_t* index, + const cnnlTensorDescriptor_t y_desc, int* y, + const bool as_tuple) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlWhere(handle, x_desc, x, strides, index, y_desc, y, as_tuple)); +} + +/* static */ void MLUCnnl::InTopK( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t predictions_desc, + const void* predictions, const cnnlTensorDescriptor_t targets_desc, + const void* targets, const cnnlTensorDescriptor_t k_desc, const void* k, + const int k_int, const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlInTopK(handle, predictions_desc, predictions, + targets_desc, targets, k_desc, k, k_int, + output_desc, output)); +} + +/* static */ void MLUCnnl::ScatterNd(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t indices_desc, + const void* indices, + const cnnlTensorDescriptor_t updates_desc, + const void* updates, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlScatterNd(handle, indices_desc, indices, + updates_desc, updates, output_desc, + 
output)); +} + +/* static */ void MLUCnnl::BitWise( + const ExecutionContext& ctx, const cnnlBitComputeOp_t optype, + const cnnlTensorDescriptor_t input1_desc, const void* input1, + const cnnlTensorDescriptor_t input2_desc, const void* input2, + const cnnlTensorDescriptor_t output_desc, void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetBitComputeWorkspaceSize( + handle, input1_desc, input2_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlBitCompute_v2( + handle, optype, input1_desc, input1, input2_desc, input2, output_desc, + output, workspace_ptr, workspace_size)); +} + +/* static */ void MLUCnnl::QR(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t a_desc, + const void* a, + const cnnlTensorDescriptor_t q_desc, void* q, + const cnnlTensorDescriptor_t r_desc, void* r, + const bool some) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetQRWorkspaceSize(handle, a_desc, some, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlQR(handle, a_desc, a, q_desc, q, r_desc, r, + workspace_ptr, workspace_size, some)); +} + +/* static */ void MLUCnnl::Reciprocal(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlReciprocal(handle, input_desc, input, output_desc, output)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index e0a2735e0ea4dc855cb3cf3dab2917cd5d040685..ab398a92c2972986766f58d29f8c98cb27258655 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -30,7 +30,20 @@ namespace operators { using Tensor = framework::Tensor; using DataLayout = framework::DataLayout; +using ExecutionContext = framework::ExecutionContext; using DeviceContextPool = platform::DeviceContextPool; +using MLUDeviceContext = platform::MLUDeviceContext; + +enum MLULogicMethod { + CNNL_LOGIC_OP_EQ = 0, + CNNL_LOGIC_OP_NE = 1, + CNNL_LOGIC_OP_GT = 2, + CNNL_LOGIC_OP_GE = 3, + CNNL_LOGIC_OP_LT = 4, + CNNL_LOGIC_OP_LE = 5, + CNNL_LOGIC_OP_AND = 6, + CNNL_LOGIC_OP_OR = 7, +}; template inline cnnlDataType_t ToCnnlDataType(const T& t) { @@ -76,6 +89,14 @@ NarrowT CheckedNarrowing(const WideT& wide) { return narrow; } +static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) { + return ctx.template device_context().cnnl_handle(); +} + +static const MLUDeviceContext& GetDevCtxFromCTX(const ExecutionContext& ctx) { + return ctx.template device_context(); +} + cnnlDeviceType_t GetCnnlDev(int dev_ordinal); using CnnlTensorDesc = cnnlTensorDescriptor_t; @@ -146,22 +167,914 @@ class MLUCnnlActivationDesc { cnnlActivationDescriptor_t active_desc_ = nullptr; }; +class MLUCnnlPoolingDesc { + public: + MLUCnnlPoolingDesc(const MLUCnnlPoolingDesc& desc) = delete; + MLUCnnlPoolingDesc& operator=(const MLUCnnlPoolingDesc& desc) = delete; + + 
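+  // RAII wrapper over cnnlPoolingDescriptor_t: the descriptor is created in
+  // the constructors below and released in the destructor.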
MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode, + const cnnlNanPropagation_t maxpooling_nan_opt, + int window_rows, int window_cols, int64_t pad_up, + int64_t pad_down, int64_t pad_left, int64_t pad_right, + int row_stride, int col_stride); + + MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode, + const cnnlNanPropagation_t maxpooling_nan_opt, + const int tensor_rank, const std::vector& window, + const std::vector& padding, + const std::vector& stride); + + const cnnlPoolingDescriptor_t get() const; + + ~MLUCnnlPoolingDesc(); + + private: + cnnlPoolingDescriptor_t pooling_desc_ = nullptr; +}; + +class MLUCnnlRandomGeneratorDesc { + public: + MLUCnnlRandomGeneratorDesc(const bool is_mlu200, const int seed); + const cnnlRandGenerator_t get() const; + ~MLUCnnlRandomGeneratorDesc(); + + private: + cnnlRandGenerator_t mlu_generator = nullptr; +}; + +class MLUCnnlReduceDesc { + public: + MLUCnnlReduceDesc(const MLUCnnlReduceDesc& desc) = delete; + MLUCnnlReduceDesc& operator=(const MLUCnnlReduceDesc& desc) = delete; + + MLUCnnlReduceDesc(const std::vector& axis_vec, + const cnnlReduceOp_t reduce_op, + const cnnlDataType_t data_type, + const cnnlNanPropagation_t nan_propagation, + const cnnlReduceIndices_t reduce_indices, + const cnnlIndicesType_t indices_type); + + const cnnlReduceDescriptor_t get() const; + + ~MLUCnnlReduceDesc(); + + private: + cnnlReduceDescriptor_t reduction_desc_ = nullptr; +}; + +class MLUCnnlOpTensorDesc { + public: + MLUCnnlOpTensorDesc(const MLUCnnlOpTensorDesc& desc) = delete; + void operator=(const MLUCnnlOpTensorDesc&) = delete; + + MLUCnnlOpTensorDesc(cnnlOpTensorDesc_t op_tensor_op, + cnnlDataType_t op_tensor_comp_type, + cnnlNanPropagation_t op_tensor_nan_opt); + + const cnnlOpTensorDescriptor_t get() const; + + ~MLUCnnlOpTensorDesc(); + + private: + cnnlOpTensorDescriptor_t op_tensor_desc_ = nullptr; +}; + +class MLUCnnlNMSDesc { + public: + MLUCnnlNMSDesc(const MLUCnnlNMSDesc& desc) = delete; + MLUCnnlNMSDesc& operator=(const MLUCnnlNMSDesc& desc) = delete; + + MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode, const float iou_threshold, + const int max_output_size, const float confidence_threshold, + const int input_layout); + + const cnnlNmsDescriptor_t get() const; + + ~MLUCnnlNMSDesc(); + + private: + cnnlNmsDescriptor_t nms_desc_ = nullptr; +}; + +class MLUCnnlConvolutionDesc { + public: + MLUCnnlConvolutionDesc(const int dims, const int pad[], const int stride[], + const int dilation[], const int group_count, + const cnnlDataType_t tensor_dtype); + + MLUCnnlConvolutionDesc(const int dims, const int64_t pad[], + const int64_t stride[], const int64_t dilation[], + const int group_count, + const cnnlDataType_t tensor_dtype); + + MLUCnnlConvolutionDesc(const MLUCnnlConvolutionDesc& desc) = delete; + + MLUCnnlConvolutionDesc& operator=(const MLUCnnlConvolutionDesc& desc) = + delete; + + const cnnlConvolutionDescriptor_t get() const; + + ~MLUCnnlConvolutionDesc(); + + private: + cnnlConvolutionDescriptor_t conv_desc_ = nullptr; +}; + +class MLUCnnlBatchSpaceDesc { + public: + MLUCnnlBatchSpaceDesc(uint32_t block_shape[], uint32_t paddings[], + const uint32_t block_shape_size, + const uint32_t paddings_size); + + void getBatch2spaceNdextraInputSize(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc); + + void getSpace2batchNdextraInputSize(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc); + + void initSpace2batchNdExtraInput(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + void* extra_host_input); + + 
void initBatch2spaceNdExtraInput(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + void* extra_host_input); + + const cnnlSpaceBatchNdDescriptor_t get() const; + + size_t getExtraInputSize() const; + + ~MLUCnnlBatchSpaceDesc(); + + private: + cnnlSpaceBatchNdDescriptor_t op_desc_ = nullptr; + size_t extra_input_size_; +}; + +class MLUCnnlTrigonDesc { + public: + explicit MLUCnnlTrigonDesc( + const cnnlTrigonFunctionMode_t trigon_function_mode); + + const cnnlTrigonDescriptor_t get() const; + + ~MLUCnnlTrigonDesc(); + + private: + cnnlTrigonDescriptor_t trigon_desc_ = nullptr; +}; + class MLUCnnl { public: - static void Active(const platform::MLUDeviceContext& ctx, + static void Active(const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); - static void ActiveGrad(const platform::MLUDeviceContext& ctx, - cnnlActivationDescriptor_t active_desc, - const void* alpha, const void* beta, - const cnnlTensorDescriptor_t y_desc, const void* y, - const cnnlTensorDescriptor_t diff_y_desc, - const void* diff_y, - const cnnlTensorDescriptor_t x_desc, const void* x, - const cnnlTensorDescriptor_t diff_x_desc, - void* diff_x); + static void ActiveGrad( + const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc, + const void* alpha, const void* beta, const cnnlTensorDescriptor_t y_desc, + const void* y, const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, const cnnlTensorDescriptor_t x_desc, const void* x, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + + static void Concat(const ExecutionContext& ctx, const int pack_num, + const int axis, const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Div(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t in0_desc, const void* in0, + const cnnlTensorDescriptor_t in1_desc, const void* in1, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void Fill(const ExecutionContext& ctx, float value, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void LRN(const ExecutionContext& ctx, const int local_size, + const double alpha, const double beta, const double k, + const cnnlTensorDescriptor_t input_quant_desc, + const void* input_quant, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void QuantifyOffline(const ExecutionContext& context, + cnnlQuantizeMode_t mode, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t ouput_desc, + void* output); + + static void QuantifyOnline(const ExecutionContext& context, + const int bitwidth, + const cnnlTensorDescriptor_t input_desc, + const void* input, const bool compute_scale, + void* position, void* scale, + const cnnlTensorDescriptor_t ouput_desc, + void* output); + + static void SGD(const ExecutionContext& context, + const cnnlTensorDescriptor_t grad_desc, const void* grad, + const void* lr, const cnnlTensorDescriptor_t var_desc, + void* var); + + static void ApplyAdaGrad(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, + const cnnlTensorDescriptor_t accum_desc, 
void* accum, + const cnnlTensorDescriptor_t var_desc, void* var, + const void* lr, const bool update_slots); + + static void ApplyRMSProp(const ExecutionContext& context, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, const void* lr, const void* rho, + const void* momentum, const void* epsilon, + const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t ms_desc, void* ms, + const cnnlTensorDescriptor_t mom_desc, void* mom); + + static void ApplyCenterRMSProp( + const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, + const void* grad, const void* lr, const void* rho, const void* momentum, + const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t mg_desc, void* mg, + const cnnlTensorDescriptor_t ms_desc, void* ms, + const cnnlTensorDescriptor_t mom_desc, void* mom); + + static void ApplyAdam(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, const void* lr, const void* beta1, + const void* beta2, const void* beta1_power, + const void* beta2_power, const void* epsilon, + const bool use_nesterov, + const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v); + + static void ApplyAdaMax(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const cnnlTensorDescriptor_t var_desc, void* var, + const cnnlTensorDescriptor_t m_desc, void* m, + const cnnlTensorDescriptor_t v_desc, void* v, + const void* diff, const void* lr, const void* beta1, + const void* beta2, const void* beta1_power, + const void* epsilon); + + static void ApplyMomentum(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, const bool use_nesterov, + const void* lr, const void* momentum, void* var, + void* accum); + + static void ApplyKerasMomentum(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, const bool use_nesterov, + const void* lr, const void* momentum, + void* var, void* accum); + + static void ApplyAdadelta(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* diff, const void* lr, const void* rho, + const void* epsilon, void* var, void* accum, + void* accum_update); + + static void SparseSoftmaxXentWithLogits( + const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, + const cnnlTensorDescriptor_t x_desc, const void* input, + const cnnlTensorDescriptor_t label_desc, const void* label, + const cnnlTensorDescriptor_t y_desc, void* output, + const cnnlTensorDescriptor_t diff_y_desc, void* back_out); + + static void RandomUniform(const ExecutionContext& ctx, const int num, + const cnnlDataType_t data_type, + const cnnlRandGenerator_t mlu_generator, + void* output); + + static void Cumsum(const ExecutionContext& ctx, const int axis, + const bool exclusive, const bool reverse, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t ouput_desc, void* output); + + static void BroadcastTo(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void GatherFunctor( + const ExecutionContext& ctx, const int axis, const int batch_dims, + const cnnlTensorDescriptor_t params_desc, const void* params, + const cnnlTensorDescriptor_t indices_desc, const void* indices, + const cnnlTensorDescriptor_t output_desc, void* output); + + static void 
ScatterFunctor(
+      const ExecutionContext& ctx, const cnnlTensorDescriptor_t params_desc,
+      const void* params, const cnnlTensorDescriptor_t updates_desc,
+      const void* updates, const cnnlTensorDescriptor_t indices_desc,
+      const void* indices, const cnnlScatterRefMode_t mode);
+
+  static void Range(const ExecutionContext& ctx, const void* start,
+                    const void* end, const void* step,
+                    const cnnlDataType_t output_dtype, void* output);
+
+  static void Round(const ExecutionContext& ctx,
+                    const cnnlTensorDescriptor_t input_desc, const void* input,
+                    const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void TopK(const ExecutionContext& ctx, const int k, const int dim,
+                   const bool largest, const bool sorted,
+                   const cnnlTensorDescriptor_t input_desc, const void* input,
+                   const cnnlTensorDescriptor_t values_output_desc,
+                   void* values_out,
+                   const cnnlTensorDescriptor_t indices_output_desc,
+                   void* indices_out);
+
+  static void StridedSlice(const ExecutionContext& ctx, const int begin[],
+                           const int end[], const int strides[],
+                           const cnnlTensorDescriptor_t input_desc,
+                           const void* input,
+                           const cnnlTensorDescriptor_t output_desc,
+                           void* output);
+
+  static void Split(const ExecutionContext& ctx, int split_num, int axis,
+                    const cnnlTensorDescriptor_t input_desc,
+                    const void* input_ptr,
+                    const cnnlTensorDescriptor_t output_descs[],
+                    void* output_ptrs[]);
+
+  static void Scale(const ExecutionContext& ctx, const int axis,
+                    const cnnlTensorDescriptor_t input_desc, const void* input,
+                    const cnnlTensorDescriptor_t alpha_desc, const void* alpha,
+                    const cnnlTensorDescriptor_t beta_desc, const void* beta,
+                    const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void AddN(const ExecutionContext& ctx, uint32_t input_num,
+                   const cnnlTensorDescriptor_t inputs_desc[],
+                   const void* inputs[],
+                   const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Log(const ExecutionContext& ctx,
+                  cnnlComputationPreference_t prefer,
+                  const cnnlTensorDescriptor_t input_desc, const void* input,
+                  const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void StridedSliceGrad(const ExecutionContext& ctx, const int begin[],
+                               const int end[], const int strides[],
+                               const cnnlTensorDescriptor_t input_desc,
+                               const void* input,
+                               const cnnlTensorDescriptor_t output_desc,
+                               void* output);
+
+  static void Logic(const ExecutionContext& ctx,
+                    const MLULogicMethod log_method,
+                    const cnnlTensorDescriptor_t input1_desc,
+                    const void* input1,
+                    const cnnlTensorDescriptor_t input2_desc,
+                    const void* input2,
+                    const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Select(const ExecutionContext& ctx,
+                     const cnnlTensorDescriptor_t then_desc, const void* p_then,
+                     const cnnlTensorDescriptor_t else_desc, const void* p_else,
+                     const cnnlTensorDescriptor_t output_desc, void* output,
+                     const bool* condition, const int condition_size);
+
+  static void AssignAdd(const ExecutionContext& ctx, const void* alpha,
+                        const void* beta,
+                        const cnnlTensorDescriptor_t update_desc,
+                        const void* update,
+                        const cnnlTensorDescriptor_t param_desc, void* param);
+
+  static void AssignSub(const ExecutionContext& ctx, const void* alpha,
+                        const void* beta,
+                        const cnnlTensorDescriptor_t update_desc,
+                        const void* update,
+                        const cnnlTensorDescriptor_t param_desc, void* param);
+
+  static void Assign(const ExecutionContext& ctx,
+                     const cnnlTensorDescriptor_t update_desc,
+                     const void* update,
+                     const cnnlTensorDescriptor_t param_desc, void* param);
+
+  static void GatherNd(const ExecutionContext& ctx,
+                       const cnnlTensorDescriptor_t params_desc,
+                       const void* params,
+                       const cnnlTensorDescriptor_t indices_desc,
+                       const void* indices,
+                       const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void BatchToSpace(const ExecutionContext& ctx,
+                           const cnnlTensorDescriptor_t input_desc,
+                           const void* input,
+                           const cnnlTensorDescriptor_t output_desc,
+                           void* output, const cnnlSpaceBatchParam_t param);
+
+  static void BatchToSpaceNd(const ExecutionContext& ctx,
+                             const cnnlTensorDescriptor_t input_desc,
+                             const void* input,
+                             cnnlSpaceBatchNdDescriptor_t param,
+                             void* extra_device_input, size_t extra_input_size,
+                             const cnnlTensorDescriptor_t output_desc,
+                             void* output);
+
+  static void PoolingForward(
+      const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode,
+      const std::vector<int64_t>& output_shape,
+      cnnlPoolingDescriptor_t pooling_desc, const void* alpha,
+      const cnnlTensorDescriptor_t input_desc, const void* input,
+      const void* beta, const void* extra_input_ptr,
+      const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Pool3D(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode,
+                     const std::vector<int64_t>& output_shape,
+                     cnnlPoolingDescriptor_t pooling_desc, const void* alpha,
+                     const cnnlTensorDescriptor_t input_desc, const void* input,
+                     const void* beta,
+                     const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Pad(const ExecutionContext& ctx,
+                  const cnnlTensorDescriptor_t input_desc, const void* input,
+                  const void* paddings, const void* padding_value,
+                  const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Matmul(const ExecutionContext& ctx, const bool transpose_a,
+                     const bool transpose_b,
+                     const cnnlTensorDescriptor_t in0_desc, const void* in0,
+                     const cnnlTensorDescriptor_t in1_desc, const void* in1,
+                     const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void BatchMatmul(
+      const ExecutionContext& ctx, const bool transpose_a,
+      const bool transpose_b, const cnnlTensorDescriptor_t in0_desc,
+      const void* in0, const cnnlTensorDescriptor_t in1_desc, const void* in1,
+      const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void OpTensor(const ExecutionContext& ctx,
+                       const cnnlOpTensorDescriptor_t op_tensor_desc,
+                       const cnnlTensorDescriptor_t a_desc, const void* a,
+                       const cnnlTensorDescriptor_t b_desc, const void* b,
+                       const cnnlTensorDescriptor_t output_desc, void* output,
+                       const cnnlDataType_t dtype);
+
+  static void BiasAddGrad(const ExecutionContext& ctx, const int axis,
+                          const cnnlTensorDescriptor_t out_backprop_desc,
+                          const void* out_backprop,
+                          const cnnlTensorDescriptor_t output_desc,
+                          void* output);
+
+  static void OneHot(const ExecutionContext& ctx,
+                     const cnnlTensorDescriptor_t desc_indices,
+                     const void* indices, const int depth, const void* on_value,
+                     const void* off_value, const int axis,
+                     cnnlDataType_t output_data_type, void* output);
+
+  static void NonMaxSuppression(const ExecutionContext& ctx,
+                                const cnnlNmsDescriptor_t nms_desc,
+                                const cnnlTensorDescriptor_t boxes_desc,
+                                const void* boxes,
+                                const cnnlTensorDescriptor_t confidence_desc,
+                                const void* confidence,
+                                const cnnlTensorDescriptor_t output_desc,
+                                void* output, void* output_size);
+
+  static void SoftmaxCrossEntropyWithLogits(
+      const ExecutionContext& ctx, cnnlSoftmaxMode_t mode,
+      cnnlComputationPreference_t prefer,
+      const cnnlTensorDescriptor_t input_desc, const void* logits_in,
+      const cnnlTensorDescriptor_t label_desc, const void* labels_in,
+      const cnnlTensorDescriptor_t loss_out_desc, void* loss_out,
+      const cnnlTensorDescriptor_t back_out_desc, void* back_out);
+
+  static void SoftmaxForward(const ExecutionContext& ctx,
+                             cnnlSoftmaxAlgorithm_t algorithm,
+                             cnnlSoftmaxMode_t mode, const void* alpha,
+                             const cnnlTensorDescriptor_t input_desc,
+                             const void* input, const void* beta,
+                             const cnnlTensorDescriptor_t output_desc,
+                             void* output);
+
+  static void Softplus(const ExecutionContext& ctx,
+                       const cnnlTensorDescriptor_t features_desc,
+                       const void* features,
+                       const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void SoftplusGrad(const ExecutionContext& ctx,
+                           const cnnlTensorDescriptor_t gradients_desc,
+                           const void* gradients,
+                           const cnnlTensorDescriptor_t features_desc,
+                           const void* features,
+                           const cnnlTensorDescriptor_t output_desc,
+                           void* output);
+
+  static void RsqrtGrad(const ExecutionContext& ctx,
+                        const cnnlTensorDescriptor_t data_desc, const void* y,
+                        const void* diff_y, void* output);
+
+  static void SqrtGrad(const ExecutionContext& ctx,
+                       const cnnlTensorDescriptor_t data_desc, const void* y,
+                       const void* diff_y, void* output);
+
+  static void ConvolutionForward(
+      const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc_,
+      const void* alpha, const void* beta,
+      const cnnlTensorDescriptor_t bias_desc, const void* bias_ptr,
+      const cnnlTensorDescriptor_t input_desc, const void* input,
+      const cnnlTensorDescriptor_t filter_desc, const void* filter,
+      const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void FusedConvBNQuantify(
+      const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc,
+      const void* epsilon_ptr, const int fused_ops_number,
+      const cnnlDataType_t tensor_dtype, const int input_position,
+      const float input_scale, const int filter_position,
+      const float filter_scale, const cnnlTensorDescriptor_t scale_desc,
+      const void* scale_ptr, const cnnlTensorDescriptor_t offset_desc,
+      const void* offset_ptr, const cnnlTensorDescriptor_t mean_desc,
+      const void* mean_ptr, const cnnlTensorDescriptor_t variance_desc,
+      const void* variance_ptr, const cnnlTensorDescriptor_t input_desc,
+      const void* input, const cnnlTensorDescriptor_t filter_desc,
+      const void* filter, const cnnlTensorDescriptor_t output_desc,
+      void* output);
+
+  static void Tile(const ExecutionContext& ctx,
+                   const cnnlTensorDescriptor_t input_desc, const void* input,
+                   const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void UnsortedSegmentSum(const ExecutionContext& ctx,
+                                 const cnnlTensorDescriptor_t data_desc,
+                                 const void* data,
+                                 const cnnlTensorDescriptor_t ids_desc,
+                                 const int* segment_ids,
+                                 const cnnlTensorDescriptor_t output_desc,
+                                 void* output);
+
+  static void Reduce(const ExecutionContext& ctx, const bool need_workspace,
+                     const cnnlReduceDescriptor_t reduction_desc,
+                     const void* alpha, const cnnlTensorDescriptor_t input_desc,
+                     const void* input, const size_t indices_size,
+                     void* indices, const void* beta,
+                     const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void FloorDiv(const ExecutionContext& ctx,
+                       cnnlComputationPreference_t prefer,
+                       const cnnlTensorDescriptor_t input1_desc,
+                       const void* input1,
+                       const cnnlTensorDescriptor_t input2_desc,
+                       const void* input2,
+                       const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void FloorMod(const ExecutionContext& ctx,
+                       const cnnlTensorDescriptor_t input1_desc,
+                       const void* input1,
+                       const cnnlTensorDescriptor_t input2_desc,
+                       const void* input2,
+                       const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Maximum(const ExecutionContext& ctx,
+                      const cnnlTensorDescriptor_t input1_desc,
+                      const void* input1,
+                      const cnnlTensorDescriptor_t input2_desc,
+                      const void* input2,
+                      const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Minimum(const ExecutionContext& ctx,
+                      const cnnlTensorDescriptor_t input1_desc,
+                      const void* input1,
+                      const cnnlTensorDescriptor_t input2_desc,
+                      const void* input2,
+                      const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void PowR(const ExecutionContext& ctx,
+                   cnnlComputationPreference_t prefer,
+                   const cnnlTensorDescriptor_t input1_desc, const void* input1,
+                   const cnnlTensorDescriptor_t input2_desc, const void* input2,
+                   const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void DivNoNan(const ExecutionContext& ctx,
+                       cnnlComputationPreference_t prefer,
+                       const cnnlTensorDescriptor_t input1_desc,
+                       const void* input1,
+                       const cnnlTensorDescriptor_t input2_desc,
+                       const void* input2,
+                       const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void SquaredDifference(const ExecutionContext& ctx,
+                                const cnnlTensorDescriptor_t input1_desc,
+                                const void* input1,
+                                const cnnlTensorDescriptor_t input2_desc,
+                                const void* input2,
+                                const cnnlTensorDescriptor_t output_desc,
+                                void* output);
+
+  static void L2Loss(const ExecutionContext& ctx,
+                     const cnnlTensorDescriptor_t input_desc, const void* input,
+                     void* output);
+
+  static void Abs(const ExecutionContext& ctx,
+                  const cnnlTensorDescriptor_t input_desc, const void* input,
+                  const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Neg(const ExecutionContext& ctx,
+                  const cnnlTensorDescriptor_t input_desc, const void* input,
+                  const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Floor(const ExecutionContext& ctx,
+                    const cnnlTensorDescriptor_t input_desc, const void* input,
+                    const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Ceil(const ExecutionContext& ctx,
+                   const cnnlTensorDescriptor_t input_desc, const void* input,
+                   const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void IsNan(const ExecutionContext& ctx,
+                    const cnnlTensorDescriptor_t input_desc, const void* input,
+                    const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Square(const ExecutionContext& ctx,
+                     const cnnlTensorDescriptor_t input_desc, const void* input,
+                     const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Sqrt(const ExecutionContext& ctx,
+                   cnnlComputationPreference_t prefer,
+                   const cnnlTensorDescriptor_t input_desc, const void* input,
+                   const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Rsqrt(const ExecutionContext& ctx,
+                    cnnlComputationPreference_t prefer,
+                    const cnnlTensorDescriptor_t input_desc, const void* input,
+                    const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Cos(const ExecutionContext& ctx,
+                  cnnlComputationPreference_t prefer,
+                  const cnnlTensorDescriptor_t input_desc, const void* input,
+                  const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Sin(const ExecutionContext& ctx,
+                  cnnlComputationPreference_t prefer,
+                  const cnnlTensorDescriptor_t input_desc, const void* input,
+                  const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void TrigonForward(const ExecutionContext& ctx,
+                            const cnnlTrigonDescriptor_t trigon_desc,
+                            const cnnlTensorDescriptor_t input_desc,
+                            const void* input,
+                            const cnnlTensorDescriptor_t output_desc,
+                            void* output);
+
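+  // Illustrative usage sketch (editor-added, hypothetical; not part of the
+  // original patch): how a kernel would call one of the unary helpers above,
+  // using Sqrt as the example. `x` and `out` are placeholder framework::Tensor
+  // objects, with `out` already allocated via mutable_data, and
+  // CNNL_COMPUTATION_HIGH_PRECISION is one assumed value of
+  // cnnlComputationPreference_t.
+  //
+  //   MLUCnnlTensorDesc x_desc(x, CNNL_LAYOUT_ARRAY,
+  //                            ToCnnlDataType(x.type()));
+  //   MLUCnnlTensorDesc out_desc(out, CNNL_LAYOUT_ARRAY,
+  //                              ToCnnlDataType(out.type()));
+  //   MLUCnnl::Sqrt(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(),
+  //                 reinterpret_cast<const void*>(x.data<float>()),
+  //                 out_desc.get(),
+  //                 reinterpret_cast<void*>(out.data<float>()));
+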
+  static void Exp(const ExecutionContext& ctx,
+                  cnnlComputationPreference_t prefer,
+                  const cnnlTensorDescriptor_t input_desc, const void* input,
+                  const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Sign(const ExecutionContext& ctx,
+                   const cnnlTensorDescriptor_t input_desc, const void* input,
+                   const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void IsFinite(const ExecutionContext& ctx,
+                       const cnnlTensorDescriptor_t input_desc,
+                       const void* input,
+                       const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void IsNanInf(const ExecutionContext& ctx,
+                       const cnnlTensorDescriptor_t input_desc,
+                       const void* input, void* output);
+
+  static void Erf(const ExecutionContext& ctx,
+                  cnnlComputationPreference_t prefer,
+                  const cnnlTensorDescriptor_t input_desc, const void* input,
+                  const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void Log1p(const ExecutionContext& ctx,
+                    cnnlComputationPreference_t prefer,
+                    const cnnlTensorDescriptor_t input_desc, const void* input,
+                    const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void LogicalNot(const ExecutionContext& ctx,
+                         const cnnlTensorDescriptor_t input_desc,
+                         const void* input,
+                         const cnnlTensorDescriptor_t output_desc,
+                         void* output);
+
+  static void DynamicStitch(
+      const ExecutionContext& ctx, const cnnlTensorDescriptor_t* indices_desc,
+      const int** indices, const cnnlTensorDescriptor_t* data_desc,
+      const void** data, const int size, int* indices_dims,
+      const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void CropAndResize(
+      const ExecutionContext& ctx, const std::string method_name,
+      const float extrapolation_value, const cnnlTensorDescriptor_t image_desc,
+      const void* image, const cnnlTensorDescriptor_t boxes_desc,
+      const void* boxes, const cnnlTensorDescriptor_t box_index_desc,
+      const void* box_index, const cnnlTensorDescriptor_t output_desc,
+      void* output);
+
+  static void CropAndResizeBackwardImage(
+      const ExecutionContext& ctx, const std::string method_name,
+      const cnnlTensorDescriptor_t image_desc, const void* image,
+      const cnnlTensorDescriptor_t boxes_desc, const void* boxes,
+      const cnnlTensorDescriptor_t box_idx_desc, const void* box_idx,
+      const cnnlTensorDescriptor_t grads_image_desc, void* grads_image);
+
+  static void CropAndResizeBackwardBoxes(
+      const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
+      const void* input, const cnnlTensorDescriptor_t image_desc,
+      const void* image, const cnnlTensorDescriptor_t boxes_desc,
+      const void* boxes, const cnnlTensorDescriptor_t box_idx_desc,
+      const void* box_idx, const cnnlTensorDescriptor_t output_desc,
+      void* output);
+
+  static void PoolingBackward(
+      const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc,
+      const void* alpha, const cnnlTensorDescriptor_t y_desc, const void* y,
+      const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y,
+      const cnnlTensorDescriptor_t x_desc, const void* x, const void* beta,
+      const cnnlTensorDescriptor_t diff_x_desc, void* diff_x);
+
+  static void PoolingIndex(const ExecutionContext& ctx,
+                           const cnnlPoolingDescriptor_t pooling_desc,
+                           const cnnlTensorDescriptor_t x_desc, const void* x,
+                           const cnnlTensorDescriptor_t y_desc, void* y);
+
+  static void SpaceToBatch(const ExecutionContext& ctx,
+                           const cnnlTensorDescriptor_t input_desc,
+                           const void* input,
+                           const cnnlTensorDescriptor_t output_desc,
+                           void* output, const int64_t block_shape[]);
+
+  static void SpaceToBatchNd(const ExecutionContext& ctx,
+                             const cnnlTensorDescriptor_t input_desc,
+                             const void* input,
+                             cnnlSpaceBatchNdDescriptor_t param,
+                             void* extra_device_input, size_t extra_input_size,
+                             const cnnlTensorDescriptor_t output_desc,
+                             void* output);
+
+  static void Interp(const ExecutionContext& ctx, const cnnlInterpMode_t mode,
+                     const bool align_corners, const bool half_pixel_centers,
+                     const cnnlTensorDescriptor_t input_desc, const void* input,
+                     const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void InterpBackward(
+      const ExecutionContext& ctx, const cnnlInterpBackwardMode_t mode,
+      const bool align_corners, const bool half_pixel_centers,
+      const cnnlTensorDescriptor_t input_desc, const void* input,
+      const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void QuantizeParam(const ExecutionContext& ctx,
+                            const cnnlQuantizeMode_t mode, const int bitwidth,
+                            const cnnlTensorDescriptor_t input_desc,
+                            const void* input, void* position, void* scale,
+                            void* offset);
+
+  static void QuantizeMatMul(
+      const ExecutionContext& ctx, const bool transpose_a,
+      const bool transpose_b, const cnnlTensorDescriptor_t a_desc,
+      const void* a, const void* a_position, const void* a_scale,
+      const void* a_offset, const cnnlTensorDescriptor_t b_desc, const void* b,
+      const void* b_position, const void* b_scale, const void* b_offset,
+      const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
+      const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void QuantizeBatchMatMul(
+      const ExecutionContext& ctx, const bool adj_x, const bool adj_y,
+      const cnnlTensorDescriptor_t a_desc, const void* a,
+      const void* a_position, const void* a_scale, const void* a_offset,
+      const cnnlTensorDescriptor_t b_desc, const void* b,
+      const void* b_position, const void* b_scale, const void* b_offset,
+      const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
+      const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void QuantizeBatchMatMulBCast(
+      const ExecutionContext& ctx, const bool adj_x, const bool adj_y,
+      const cnnlTensorDescriptor_t a_desc, const void* a,
+      const void* a_position, const void* a_scale, const void* a_offset,
+      const cnnlTensorDescriptor_t b_desc, const void* b,
+      const void* b_position, const void* b_scale, const void* b_offset,
+      const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
+      const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void FusedBatchNorm(
+      const ExecutionContext& ctx, const bool is_training,
+      const cnnlTensorDescriptor_t x_desc, const void* x,
+      const cnnlTensorDescriptor_t scale_desc, const void* scale,
+      const void* offset, const void* estimated_mean,
+      const void* estimated_variance, float epsilon, float momentum,
+      const cnnlTensorDescriptor_t output_desc, void* output, void* batch_mean,
+      void* batch_var, void* saved_mean, void* saved_var);
+
+  static void FusedBatchNormGrad(
+      const ExecutionContext& ctx, const bool is_training,
+      const cnnlTensorDescriptor_t y_backprop_desc, const void* y_backprop,
+      const cnnlTensorDescriptor_t x_desc, const void* x,
+      const cnnlTensorDescriptor_t scale_desc, const void* scale,
+      const void* saved_mean, const void* saved_var, float epsilon,
+      const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop,
+      void* scale_backprop, void* offset_backprop);
+
+  static void Transpose(const ExecutionContext& ctx,
+                        const std::vector<int> perm, const int input_dim,
+                        const cnnlTensorDescriptor_t input_desc,
+                        const void* input,
+                        const cnnlTensorDescriptor_t output_desc,
+                        void* output);
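+
+  // Illustrative usage sketch (editor-added, hypothetical): an NCHW -> NHWC
+  // permutation through the Transpose helper above. `in` and `out` are
+  // placeholder framework::Tensor objects with compatible shapes, and `out`
+  // is assumed to be allocated already.
+  //
+  //   const std::vector<int> perm = {0, 2, 3, 1};
+  //   MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY,
+  //                             ToCnnlDataType(in.type()));
+  //   MLUCnnlTensorDesc out_desc(out, CNNL_LAYOUT_ARRAY,
+  //                              ToCnnlDataType(out.type()));
+  //   MLUCnnl::Transpose(ctx, perm, /*input_dim=*/4, in_desc.get(),
+  //                      reinterpret_cast<const void*>(in.data<float>()),
+  //                      out_desc.get(),
+  //                      reinterpret_cast<void*>(out.data<float>()));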
+
+  static void MatrixBandPart(const ExecutionContext& ctx,
+                             const cnnlTensorDescriptor_t data_desc,
+                             const void* input, const int num_lower,
+                             const int num_upper, void* output);
+
+  static void NumTrue(const ExecutionContext& ctx,
+                      const cnnlTensorDescriptor_t x_desc, const void* x,
+                      Tensor index, uint32_t* num_true);
+
+  static void Where(const ExecutionContext& ctx,
+                    const cnnlTensorDescriptor_t x_desc, const void* x,
+                    const uint32_t* strides, const uint32_t* index,
+                    const cnnlTensorDescriptor_t y_desc, int* y,
+                    const bool as_tuple);
+
+  static void Conv2D(const ExecutionContext& ctx,
+                     const cnnlConvolutionDescriptor_t conv_desc,
+                     const cnnlDataType_t tensor_dtype,
+                     const cnnlDataType_t dt_onchip, const void* input_position,
+                     const void* input_scale, const void* input_offset,
+                     const void* filter_position, const void* filter_scale,
+                     const void* filter_offset,
+                     const cnnlTensorDescriptor_t input_desc, const void* input,
+                     const cnnlTensorDescriptor_t filter_desc,
+                     const void* filter, const cnnlTensorDescriptor_t bias_desc,
+                     const void* bias, const cnnlTensorDescriptor_t output_desc,
+                     void* output);
+
+  static void ConvBackpropInput(
+      const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
+      const cnnlTensorDescriptor_t input_desc, const void* filter,
+      const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
+      const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop);
+
+  static void QuantizeConvBackpropInput(
+      const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
+      const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip,
+      const void* filter_position, const void* filter_scale,
+      const void* filter_offset, const void* out_backprop_position,
+      const void* out_backprop_scale, const void* out_backprop_offset,
+      const cnnlTensorDescriptor_t input_desc, const void* filter,
+      const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
+      const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop);
+
+  static void ConvBackpropFilter(
+      const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
+      const cnnlTensorDescriptor_t input_desc, const void* input,
+      const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
+      const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop);
+
+  static void QuantizeConvBackpropFilter(
+      const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
+      const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip,
+      const void* input_position, const void* input_scale,
+      const void* input_offset, const void* out_backprop_position,
+      const void* out_backprop_scale, const void* out_backprop_offset,
+      const cnnlTensorDescriptor_t input_desc, const void* input,
+      const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
+      const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop);
+
+  static void InTopK(const ExecutionContext& ctx,
+                     const cnnlTensorDescriptor_t predictions_desc,
+                     const void* predictions,
+                     const cnnlTensorDescriptor_t targets_desc,
+                     const void* targets, const cnnlTensorDescriptor_t k_desc,
+                     const void* k, const int k_int,
+                     const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void ScatterNd(const ExecutionContext& ctx,
+                        const cnnlTensorDescriptor_t indices_desc,
+                        const void* indices,
+                        const cnnlTensorDescriptor_t updates_desc,
+                        const void* updates,
+                        const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void BitWise(const ExecutionContext& ctx,
+                      const cnnlBitComputeOp_t optype,
+                      const cnnlTensorDescriptor_t input1_desc,
+                      const void* input1,
+                      const cnnlTensorDescriptor_t input2_desc,
+                      const void* input2,
+                      const cnnlTensorDescriptor_t output_desc, void* output);
+
+  static void QR(const ExecutionContext& ctx,
+                 const cnnlTensorDescriptor_t a_desc, const void* a,
+                 const cnnlTensorDescriptor_t q_desc, void* q,
+                 const cnnlTensorDescriptor_t r_desc, void* r, const bool some);
+
+  static void Reciprocal(const ExecutionContext& ctx,
+                         const cnnlTensorDescriptor_t input_desc,
+                         const void* input,
+                         const cnnlTensorDescriptor_t output_desc,
+                         void* output);
 };
 } // namespace operators
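+
+// Illustrative usage sketch (editor-added, hypothetical): driving the TopK
+// helper declared above from an op kernel. `input`, `values` and `indices`
+// are placeholder tensor pointers whose output buffers were allocated with
+// mutable_data beforehand; the k/dim values are examples only.
+//
+//   MLUCnnlTensorDesc in_desc(*input, CNNL_LAYOUT_ARRAY,
+//                             ToCnnlDataType(input->type()));
+//   MLUCnnlTensorDesc val_desc(*values, CNNL_LAYOUT_ARRAY,
+//                              ToCnnlDataType(values->type()));
+//   MLUCnnlTensorDesc idx_desc(*indices, CNNL_LAYOUT_ARRAY,
+//                              ToCnnlDataType(indices->type()));
+//   MLUCnnl::TopK(ctx, /*k=*/5, /*dim=*/1, /*largest=*/true, /*sorted=*/true,
+//                 in_desc.get(),
+//                 reinterpret_cast<const void*>(input->data<float>()),
+//                 val_desc.get(),
+//                 reinterpret_cast<void*>(values->data<float>()),
+//                 idx_desc.get(),
+//                 reinterpret_cast<void*>(indices->data<int>()));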