diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index d182cab9da5bd039dbbe6f6b83d9134de80c5e0b..1c5dd2ee448073dce42a1dd3a6e7ddcf2417b712 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -35,9 +35,12 @@ using DeviceContextPool = platform::DeviceContextPool; using MLUDeviceContext = platform::MLUDeviceContext; const std::map MLUReduceOpMap = { - {"reduce_all", CNNL_REDUCE_AND}, {"reduce_any", CNNL_REDUCE_OR}, - {"reduce_max", CNNL_REDUCE_MAX}, {"reduce_mean", CNNL_REDUCE_AVG}, - {"reduce_min", CNNL_REDUCE_MIN}, {"reduce_sum", CNNL_REDUCE_ADD}, + {"reduce_all", CNNL_REDUCE_AND}, + {"reduce_any", CNNL_REDUCE_OR}, + {"reduce_max", CNNL_REDUCE_MAX}, + {"reduce_mean", CNNL_REDUCE_AVG}, + {"reduce_min", CNNL_REDUCE_MIN}, + {"reduce_sum", CNNL_REDUCE_ADD}, {"reduce_prod", CNNL_REDUCE_MUL}, }; @@ -225,36 +228,49 @@ class MLUCnnlTensorDesc { MLUCnnlTensorDesc& operator=(MLUCnnlTensorDesc&& rhs); - MLUCnnlTensorDesc(const int tensor_dim, const int dim_sizes[], + MLUCnnlTensorDesc(const int tensor_dim, + const int dim_sizes[], const cnnlDataType_t tensor_dtype); - MLUCnnlTensorDesc(const int tensor_dim, const int dim_sizes[], + MLUCnnlTensorDesc(const int tensor_dim, + const int dim_sizes[], const cnnlDataType_t tensor_dtype, const cnnlTensorLayout_t layout); - MLUCnnlTensorDesc(const int tensor_dim, const int dim_sizes[], - const cnnlDataType_t tensor_dtype, int position); + MLUCnnlTensorDesc(const int tensor_dim, + const int dim_sizes[], + const cnnlDataType_t tensor_dtype, + int position); - MLUCnnlTensorDesc(const int tensor_dim, const int64_t dim_sizes[], + MLUCnnlTensorDesc(const int tensor_dim, + const int64_t dim_sizes[], const cnnlDataType_t tensor_dtype); - MLUCnnlTensorDesc(const int tensor_dim, const int64_t dim_sizes[], + MLUCnnlTensorDesc(const int tensor_dim, + const int64_t dim_sizes[], const cnnlDataType_t tensor_dtype, const cnnlTensorLayout_t layout); - MLUCnnlTensorDesc(const int tensor_dim, const int64_t dim_sizes[], - const cnnlDataType_t tensor_dtype, int position); + MLUCnnlTensorDesc(const int tensor_dim, + const int64_t dim_sizes[], + const cnnlDataType_t tensor_dtype, + int position); - MLUCnnlTensorDesc(const Tensor& tensor, const cnnlTensorLayout_t layout, + MLUCnnlTensorDesc(const Tensor& tensor, + const cnnlTensorLayout_t layout, const cnnlDataType_t tensor_dtype); explicit MLUCnnlTensorDesc(const Tensor& tensor); - MLUCnnlTensorDesc(const Tensor& tensor, cnnlTensorLayout_t layout, - const cnnlDataType_t tensor_dtype, int position); + MLUCnnlTensorDesc(const Tensor& tensor, + cnnlTensorLayout_t layout, + const cnnlDataType_t tensor_dtype, + int position); - MLUCnnlTensorDesc(const Tensor& tensor, cnnlTensorLayout_t layout, - const cnnlDataType_t tensor_dtype, int position, + MLUCnnlTensorDesc(const Tensor& tensor, + cnnlTensorLayout_t layout, + const cnnlDataType_t tensor_dtype, + int position, float scale); ~MLUCnnlTensorDesc(); @@ -270,8 +286,10 @@ class MLUCnnlActivationDesc { MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete; MLUCnnlActivationDesc& operator=(const MLUCnnlActivationDesc& desc) = delete; MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof); - MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof, - const float sliced_dim, const float selu_alpha, + MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, + const float ceof, + const float sliced_dim, + const float selu_alpha, const float selu_lambda); const cnnlActivationDescriptor_t get() const; @@ -288,14 +306,22 @@ class MLUCnnlPoolingDesc { MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode, const cnnlNanPropagation_t maxpooling_nan_opt, - int window_rows, int window_cols, int64_t pad_up, - int64_t pad_down, int64_t pad_left, int64_t pad_right, - int row_stride, int col_stride, int row_dilation, - int col_dilation, bool ceil_mode); + int window_rows, + int window_cols, + int64_t pad_up, + int64_t pad_down, + int64_t pad_left, + int64_t pad_right, + int row_stride, + int col_stride, + int row_dilation, + int col_dilation, + bool ceil_mode); MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode, const cnnlNanPropagation_t maxpooling_nan_opt, - const int tensor_rank, const std::vector& window, + const int tensor_rank, + const std::vector& window, const std::vector& padding, const std::vector& stride); @@ -364,8 +390,10 @@ class MLUCnnlNMSDesc { MLUCnnlNMSDesc(const MLUCnnlNMSDesc& desc) = delete; MLUCnnlNMSDesc& operator=(const MLUCnnlNMSDesc& desc) = delete; - MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode, const float iou_threshold, - const int max_output_size, const float confidence_threshold, + MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode, + const float iou_threshold, + const int max_output_size, + const float confidence_threshold, const int input_layout); const cnnlNmsDescriptor_t get() const; @@ -378,12 +406,17 @@ class MLUCnnlNMSDesc { class MLUCnnlConvolutionDesc { public: - MLUCnnlConvolutionDesc(const int dims, const int pad[], const int stride[], - const int dilation[], const int group_count, + MLUCnnlConvolutionDesc(const int dims, + const int pad[], + const int stride[], + const int dilation[], + const int group_count, const cnnlDataType_t tensor_dtype); - MLUCnnlConvolutionDesc(const int dims, const int64_t pad[], - const int64_t stride[], const int64_t dilation[], + MLUCnnlConvolutionDesc(const int dims, + const int64_t pad[], + const int64_t stride[], + const int64_t dilation[], const int group_count, const cnnlDataType_t tensor_dtype); @@ -402,7 +435,8 @@ class MLUCnnlConvolutionDesc { class MLUCnnlBatchSpaceDesc { public: - MLUCnnlBatchSpaceDesc(uint32_t block_shape[], uint32_t paddings[], + MLUCnnlBatchSpaceDesc(uint32_t block_shape[], + uint32_t paddings[], const uint32_t block_shape_size, const uint32_t paddings_size); @@ -446,8 +480,12 @@ class MLUCnnlTrigonDesc { class MLUCnnlDCNDesc { public: - MLUCnnlDCNDesc(int dimNb, const int* pad, const int* stride, - const int* dilation, int deformable_group, int conv_group, + MLUCnnlDCNDesc(int dimNb, + const int* pad, + const int* stride, + const int* dilation, + int deformable_group, + int conv_group, int im2col_step); const cnnlDCNDescriptor_t get() const; @@ -461,55 +499,88 @@ class MLUCnnl { public: static void Active(const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void ActiveGrad( - const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc, - const void* alpha, const void* beta, const cnnlTensorDescriptor_t y_desc, - const void* y, const cnnlTensorDescriptor_t diff_y_desc, - const void* diff_y, const cnnlTensorDescriptor_t x_desc, const void* x, - const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); - - static void Concat(const ExecutionContext& ctx, const int pack_num, - const int axis, const cnnlTensorDescriptor_t inputs_desc[], + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void ActiveGrad(const ExecutionContext& ctx, + cnnlActivationDescriptor_t active_desc, + const void* alpha, + const void* beta, + const cnnlTensorDescriptor_t y_desc, + const void* y, + const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x); + + static void Concat(const ExecutionContext& ctx, + const int pack_num, + const int axis, + const cnnlTensorDescriptor_t inputs_desc[], const void* const inputs[], - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); - static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num, - const int axis, const cnnlTensorDescriptor_t inputs_desc[], + static void Concat(const MLUDeviceContext& dev_ctx, + const int pack_num, + const int axis, + const cnnlTensorDescriptor_t inputs_desc[], const void* const inputs[], - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); - static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + static void Cast(const ExecutionContext& ctx, + cnnlCastDataType_t cast_type, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Clip(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, - const void* min, const void* max, void* y); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const void* min, + const void* max, + void* y); - static void HardtanhBackward( - const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, - const void* x, const cnnlTensorDescriptor_t diff_y_desc, - const void* diff_y, const float max_val, const float min_val, - const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + static void HardtanhBackward(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, + const float max_val, + const float min_val, + const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x); static void Div(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t in0_desc, const void* in0, - const cnnlTensorDescriptor_t in1_desc, const void* in1, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t in0_desc, + const void* in0, + const cnnlTensorDescriptor_t in1_desc, + const void* in1, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Fill(const ExecutionContext& ctx, - const cnnlPointerMode_t pointer_mode, const void* value_ptr, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void LRN(const ExecutionContext& ctx, const int local_size, - const double alpha, const double beta, const double k, + const cnnlPointerMode_t pointer_mode, + const void* value_ptr, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void LRN(const ExecutionContext& ctx, + const int local_size, + const double alpha, + const double beta, + const double k, const cnnlTensorDescriptor_t input_quant_desc, const void* input_quant, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void QuantifyOffline(const ExecutionContext& context, cnnlQuantizeMode_t mode, @@ -521,98 +592,158 @@ class MLUCnnl { static void QuantifyOnline(const ExecutionContext& context, const int bitwidth, const cnnlTensorDescriptor_t input_desc, - const void* input, const bool compute_scale, - void* position, void* scale, + const void* input, + const bool compute_scale, + void* position, + void* scale, const cnnlTensorDescriptor_t ouput_desc, void* output); static void SGD(const ExecutionContext& context, - const cnnlTensorDescriptor_t grad_desc, const void* grad, - const void* lr, const cnnlTensorDescriptor_t var_desc, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, + const void* lr, + const cnnlTensorDescriptor_t var_desc, void* var); static void ApplyAdaGrad(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, const void* grad, - const cnnlTensorDescriptor_t accum_desc, void* accum, - const cnnlTensorDescriptor_t var_desc, void* var, - const void* lr, const bool update_slots); + const cnnlTensorDescriptor_t accum_desc, + void* accum, + const cnnlTensorDescriptor_t var_desc, + void* var, + const void* lr, + const bool update_slots); static void ApplyRMSProp(const ExecutionContext& context, const cnnlTensorDescriptor_t grad_desc, - const void* grad, const void* lr, const void* rho, - const void* momentum, const void* epsilon, - const cnnlTensorDescriptor_t var_desc, void* var, - const cnnlTensorDescriptor_t ms_desc, void* ms, - const cnnlTensorDescriptor_t mom_desc, void* mom); - - static void ApplyCenterRMSProp( - const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, - const void* grad, const void* lr, const void* rho, const void* momentum, - const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var, - const cnnlTensorDescriptor_t mg_desc, void* mg, - const cnnlTensorDescriptor_t ms_desc, void* ms, - const cnnlTensorDescriptor_t mom_desc, void* mom); + const void* grad, + const void* lr, + const void* rho, + const void* momentum, + const void* epsilon, + const cnnlTensorDescriptor_t var_desc, + void* var, + const cnnlTensorDescriptor_t ms_desc, + void* ms, + const cnnlTensorDescriptor_t mom_desc, + void* mom); + + static void ApplyCenterRMSProp(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, + const void* lr, + const void* rho, + const void* momentum, + const void* epsilon, + const cnnlTensorDescriptor_t var_desc, + void* var, + const cnnlTensorDescriptor_t mg_desc, + void* mg, + const cnnlTensorDescriptor_t ms_desc, + void* ms, + const cnnlTensorDescriptor_t mom_desc, + void* mom); static void ApplyAdam(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t var_desc, void* var, - const cnnlTensorDescriptor_t m_desc, void* m, - const cnnlTensorDescriptor_t v_desc, void* v, + const cnnlTensorDescriptor_t var_desc, + void* var, + const cnnlTensorDescriptor_t m_desc, + void* m, + const cnnlTensorDescriptor_t v_desc, + void* v, const cnnlTensorDescriptor_t grad_desc, - const void* grad, const void* lr, const void* beta1, - const void* beta2, const void* beta1_power, - const void* beta2_power, const void* epsilon, + const void* grad, + const void* lr, + const void* beta1, + const void* beta2, + const void* beta1_power, + const void* beta2_power, + const void* epsilon, const bool use_nesterov); static void ApplyAdaMax(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, - const cnnlTensorDescriptor_t var_desc, void* var, - const cnnlTensorDescriptor_t m_desc, void* m, - const cnnlTensorDescriptor_t v_desc, void* v, - const void* diff, const void* lr, const void* beta1, - const void* beta2, const void* beta1_power, + const cnnlTensorDescriptor_t var_desc, + void* var, + const cnnlTensorDescriptor_t m_desc, + void* m, + const cnnlTensorDescriptor_t v_desc, + void* v, + const void* diff, + const void* lr, + const void* beta1, + const void* beta2, + const void* beta1_power, const void* epsilon); static void ApplyMomentum(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, - const void* grad, const bool use_nesterov, - const void* lr, const void* momentum, void* var, + const void* grad, + const bool use_nesterov, + const void* lr, + const void* momentum, + void* var, void* accum); static void ApplyKerasMomentum(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, - const void* grad, const bool use_nesterov, - const void* lr, const void* momentum, - void* var, void* accum); + const void* grad, + const bool use_nesterov, + const void* lr, + const void* momentum, + void* var, + void* accum); static void ApplyAdadelta(const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, - const void* diff, const void* lr, const void* rho, - const void* epsilon, void* var, void* accum, + const void* diff, + const void* lr, + const void* rho, + const void* epsilon, + void* var, + void* accum, void* accum_update); static void SparseSoftmaxXentWithLogits( - const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, - const cnnlTensorDescriptor_t x_desc, const void* input, - const cnnlTensorDescriptor_t label_desc, const void* label, - const cnnlTensorDescriptor_t y_desc, void* output, - const cnnlTensorDescriptor_t diff_y_desc, void* back_out); - - static void RandomUniform(const ExecutionContext& ctx, const int num, + const ExecutionContext& ctx, + cnnlSoftmaxMode_t mode, + const cnnlTensorDescriptor_t x_desc, + const void* input, + const cnnlTensorDescriptor_t label_desc, + const void* label, + const cnnlTensorDescriptor_t y_desc, + void* output, + const cnnlTensorDescriptor_t diff_y_desc, + void* back_out); + + static void RandomUniform(const ExecutionContext& ctx, + const int num, const cnnlDataType_t data_type, const cnnlRandGenerator_t mlu_generator, - void* mlu_state, void* output); + void* mlu_state, + void* output); - static void FusedDropout( - const ExecutionContext& ctx, const cnnlRandGenerator_t generator, - const cnnlTensorDescriptor_t input_desc, const void* input, const float p, - void* state, const cnnlTensorDescriptor_t mask_desc, const void* mask, - const cnnlTensorDescriptor_t output_desc, void* output); + static void FusedDropout(const ExecutionContext& ctx, + const cnnlRandGenerator_t generator, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const float p, + void* state, + const cnnlTensorDescriptor_t mask_desc, + const void* mask, + const cnnlTensorDescriptor_t output_desc, + void* output); - static void Cumsum(const ExecutionContext& ctx, const int axis, - const bool exclusive, const bool reverse, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t ouput_desc, void* output); + static void Cumsum(const ExecutionContext& ctx, + const int axis, + const bool exclusive, + const bool reverse, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t ouput_desc, + void* output); static void BroadcastTo(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, @@ -620,189 +751,267 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); - static void GatherFunctor( - const ExecutionContext& ctx, const int axis, const int batch_dims, - const cnnlTensorDescriptor_t params_desc, const void* params, - const cnnlTensorDescriptor_t indices_desc, const void* indices, - const cnnlTensorDescriptor_t output_desc, void* output); + static void GatherFunctor(const ExecutionContext& ctx, + const int axis, + const int batch_dims, + const cnnlTensorDescriptor_t params_desc, + const void* params, + const cnnlTensorDescriptor_t indices_desc, + const void* indices, + const cnnlTensorDescriptor_t output_desc, + void* output); - static void ScatterRefFunctor( - const ExecutionContext& ctx, const cnnlTensorDescriptor_t params_desc, - const void* params, const cnnlTensorDescriptor_t updates_desc, - const void* updates, const cnnlTensorDescriptor_t indices_desc, - const void* indices, const cnnlScatterRefMode_t mode); + static void ScatterRefFunctor(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t params_desc, + const void* params, + const cnnlTensorDescriptor_t updates_desc, + const void* updates, + const cnnlTensorDescriptor_t indices_desc, + const void* indices, + const cnnlScatterRefMode_t mode); static void ScatterFunctor(const ExecutionContext& ctx, const cnnlTensorDescriptor_t params_desc, - const void* params, + void* params, const cnnlTensorDescriptor_t updates_desc, const void* updates, const cnnlTensorDescriptor_t indices_desc, - const void* indices, const int dim, + const void* indices, + const int dim, const cnnlScatterMode_t mode = CNNL_SCATTER); - static void Range(const ExecutionContext& ctx, const void* start, - const void* end, const void* step, - const cnnlDataType_t output_dtype, void* output); + static void Range(const ExecutionContext& ctx, + const void* start, + const void* end, + const void* step, + const cnnlDataType_t output_dtype, + void* output); static void Round(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); - static void TopK(const ExecutionContext& ctx, const int k, const int dim, - const bool largest, const bool sorted, - const cnnlTensorDescriptor_t input_desc, const void* input, + static void TopK(const ExecutionContext& ctx, + const int k, + const int dim, + const bool largest, + const bool sorted, + const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t values_output_desc, void* values_out, const cnnlTensorDescriptor_t indices_output_desc, void* indices_out); - static void StridedSlice(const ExecutionContext& ctx, const int begin[], - const int end[], const int strides[], + static void StridedSlice(const ExecutionContext& ctx, + const int begin[], + const int end[], + const int strides[], const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); - static void Split(const ExecutionContext& ctx, int split_num, int axis, + static void Split(const ExecutionContext& ctx, + int split_num, + int axis, const cnnlTensorDescriptor_t input_desc, const void* input_ptr, const cnnlTensorDescriptor_t output_descs[], void* output_ptrs[]); - static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis, + static void Split(const MLUDeviceContext& dev_ctx, + int split_num, + int axis, const cnnlTensorDescriptor_t input_desc, const void* input_ptr, const cnnlTensorDescriptor_t output_descs[], void* output_ptrs[]); - static void Scale(const ExecutionContext& ctx, const int axis, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t alpha_desc, const void* alpha, - const cnnlTensorDescriptor_t beta_desc, const void* beta, - const cnnlTensorDescriptor_t output_desc, void* output); + static void Scale(const ExecutionContext& ctx, + const int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t alpha_desc, + const void* alpha, + const cnnlTensorDescriptor_t beta_desc, + const void* beta, + const cnnlTensorDescriptor_t output_desc, + void* output); - static void AddN(const ExecutionContext& ctx, uint32_t input_num, + static void AddN(const ExecutionContext& ctx, + uint32_t input_num, const cnnlTensorDescriptor_t inputs_desc[], const void* inputs[], - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void Log(const ExecutionContext& ctx, - cnnlComputationPreference_t prefer, cnnlLogBase_t log_base, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void StridedSliceGrad(const ExecutionContext& ctx, const int begin[], - const int end[], const int strides[], + cnnlComputationPreference_t prefer, + cnnlLogBase_t log_base, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void StridedSliceGrad(const ExecutionContext& ctx, + const int begin[], + const int end[], + const int strides[], const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); - static void Logic(const ExecutionContext& ctx, const cnnlLogicOp_t log_method, + static void Logic(const ExecutionContext& ctx, + const cnnlLogicOp_t log_method, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, - const void* input2, const cnnlTensorDescriptor_t ouput_desc, + const void* input2, + const cnnlTensorDescriptor_t ouput_desc, void* output); - static void Select( - const ExecutionContext& ctx, const cnnlTensorDescriptor_t condition_desc, - const void* condition_ptr, const cnnlTensorDescriptor_t then_desc, - const void* then_ptr, const cnnlTensorDescriptor_t else_desc, - const void* else_ptr, const cnnlTensorDescriptor_t output_desc, - void* output_ptr); - - static void AssignAdd(const ExecutionContext& ctx, const void* alpha, + static void Select(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t condition_desc, + const void* condition_ptr, + const cnnlTensorDescriptor_t then_desc, + const void* then_ptr, + const cnnlTensorDescriptor_t else_desc, + const void* else_ptr, + const cnnlTensorDescriptor_t output_desc, + void* output_ptr); + + static void AssignAdd(const ExecutionContext& ctx, + const void* alpha, const void* beta, const cnnlTensorDescriptor_t update_desc, const void* update, - const cnnlTensorDescriptor_t param_desc, void* param); + const cnnlTensorDescriptor_t param_desc, + void* param); - static void AssignSub(const ExecutionContext& ctx, const void* alpha, + static void AssignSub(const ExecutionContext& ctx, + const void* alpha, const void* beta, const cnnlTensorDescriptor_t update_desc, const void* update, - const cnnlTensorDescriptor_t param_desc, void* param); + const cnnlTensorDescriptor_t param_desc, + void* param); static void Assign(const ExecutionContext& ctx, const cnnlTensorDescriptor_t update_desc, const void* update, - const cnnlTensorDescriptor_t param_desc, void* param); + const cnnlTensorDescriptor_t param_desc, + void* param); static void GatherNd(const ExecutionContext& ctx, const cnnlTensorDescriptor_t params_desc, const void* params, const cnnlTensorDescriptor_t indices_desc, const void* indices, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void BatchToSpace(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, - void* output, const cnnlSpaceBatchParam_t param); + void* output, + const cnnlSpaceBatchParam_t param); static void BatchToSpaceNd(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, cnnlSpaceBatchNdDescriptor_t param, - void* extra_device_input, size_t extra_input_size, + void* extra_device_input, + size_t extra_input_size, const cnnlTensorDescriptor_t output_desc, void* output); - static void PoolingForward( - const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, - int64_t output_h, int64_t output_w, cnnlPoolingDescriptor_t pooling_desc, - const void* alpha, const cnnlTensorDescriptor_t input_desc, - const void* input, const void* beta, const void* extra_input_ptr, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void AdaptivePoolingForward( - const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output, - const cnnlTensorDescriptor_t index_desc, void* index); + static void PoolingForward(const ExecutionContext& ctx, + cnnlPoolingMode_t pool_mode, + int64_t output_h, + int64_t output_w, + cnnlPoolingDescriptor_t pooling_desc, + const void* alpha, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const void* beta, + const void* extra_input_ptr, + const cnnlTensorDescriptor_t output_desc, + void* output); - static void Pool3D(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, + static void AdaptivePoolingForward(const ExecutionContext& ctx, + cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output, + const cnnlTensorDescriptor_t index_desc, + void* index); + + static void Pool3D(const ExecutionContext& ctx, + cnnlPoolingMode_t pool_mode, const std::vector& output_shape, - cnnlPoolingDescriptor_t pooling_desc, const void* alpha, - const cnnlTensorDescriptor_t input_desc, const void* input, - const void* beta, const cnnlTensorDescriptor_t output_desc, + cnnlPoolingDescriptor_t pooling_desc, + const void* alpha, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const void* beta, + const cnnlTensorDescriptor_t output_desc, void* output); static void Pad(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, - const void* paddings, const void* padding_value, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void Matmul(const ExecutionContext& ctx, const bool transpose_a, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const void* paddings, + const void* padding_value, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void Matmul(const ExecutionContext& ctx, + const bool transpose_a, const bool transpose_b, - const cnnlTensorDescriptor_t in0_desc, const void* in0, - const cnnlTensorDescriptor_t in1_desc, const void* in1, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t in0_desc, + const void* in0, + const cnnlTensorDescriptor_t in1_desc, + const void* in1, + const cnnlTensorDescriptor_t output_desc, + void* output); - static void BatchMatmul( - const ExecutionContext& ctx, const bool transpose_a, - const bool transpose_b, const cnnlTensorDescriptor_t in0_desc, - const void* in0, const cnnlTensorDescriptor_t in1_desc, const void* in1, - const cnnlTensorDescriptor_t output_desc, void* output); + static void BatchMatmul(const ExecutionContext& ctx, + const bool transpose_a, + const bool transpose_b, + const cnnlTensorDescriptor_t in0_desc, + const void* in0, + const cnnlTensorDescriptor_t in1_desc, + const void* in1, + const cnnlTensorDescriptor_t output_desc, + void* output); static void MulAx(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t alpha_desc, const void* alpha, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t alpha_desc, + const void* alpha, + const cnnlTensorDescriptor_t output_desc, + void* output); static void OpTensor(const ExecutionContext& ctx, const cnnlOpTensorDescriptor_t op_tensor_desc, - const cnnlTensorDescriptor_t a_desc, const void* a, - const cnnlTensorDescriptor_t b_desc, const void* b, - const cnnlTensorDescriptor_t output_desc, void* output, + const cnnlTensorDescriptor_t a_desc, + const void* a, + const cnnlTensorDescriptor_t b_desc, + const void* b, + const cnnlTensorDescriptor_t output_desc, + void* output, const cnnlDataType_t dtype, const float alpha1_float = 1.f, const float alpha2_float = 1.f, const float beta_float = 0.f); - static void BiasAddGrad(const ExecutionContext& ctx, const int axis, + static void BiasAddGrad(const ExecutionContext& ctx, + const int axis, const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, const cnnlTensorDescriptor_t output_desc, @@ -810,9 +1019,13 @@ class MLUCnnl { static void OneHot(const ExecutionContext& ctx, const cnnlTensorDescriptor_t desc_indices, - const void* indices, const int depth, const void* on_value, - const void* off_value, const int axis, - cnnlDataType_t output_data_type, void* output); + const void* indices, + const int depth, + const void* on_value, + const void* off_value, + const int axis, + cnnlDataType_t output_data_type, + void* output); static void NonMaxSuppression(const ExecutionContext& ctx, const cnnlNmsDescriptor_t nms_desc, @@ -821,35 +1034,47 @@ class MLUCnnl { const cnnlTensorDescriptor_t confidence_desc, const void* confidence, const cnnlTensorDescriptor_t output_desc, - void* output, void* output_size); + void* output, + void* output_size); static void SoftmaxCrossEntropyWithLogits( - const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, + const ExecutionContext& ctx, + cnnlSoftmaxMode_t mode, cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t input_desc, const void* logits_in, - const cnnlTensorDescriptor_t label_desc, const void* labels_in, - const cnnlTensorDescriptor_t loss_out_desc, void* loss_out, - const cnnlTensorDescriptor_t back_out_desc, void* back_out); + const cnnlTensorDescriptor_t input_desc, + const void* logits_in, + const cnnlTensorDescriptor_t label_desc, + const void* labels_in, + const cnnlTensorDescriptor_t loss_out_desc, + void* loss_out, + const cnnlTensorDescriptor_t back_out_desc, + void* back_out); static void SoftmaxForward(const ExecutionContext& ctx, cnnlSoftmaxAlgorithm_t algorithm, - cnnlSoftmaxMode_t mode, const void* alpha, + cnnlSoftmaxMode_t mode, + const void* alpha, const cnnlTensorDescriptor_t input_desc, - const void* input, const void* beta, + const void* input, + const void* beta, const cnnlTensorDescriptor_t output_desc, void* output); - static void SoftmaxBackward( - const ExecutionContext& ctx, cnnlSoftmaxAlgorithm_t algorithm, - cnnlSoftmaxMode_t mode, const cnnlTensorDescriptor_t y_desc, - const void* y, const cnnlTensorDescriptor_t diff_y_desc, - const void* diff_y, const cnnlTensorDescriptor_t diff_x_desc, - void* diff_x); + static void SoftmaxBackward(const ExecutionContext& ctx, + cnnlSoftmaxAlgorithm_t algorithm, + cnnlSoftmaxMode_t mode, + const cnnlTensorDescriptor_t y_desc, + const void* y, + const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, + const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x); static void Softplus(const ExecutionContext& ctx, const cnnlTensorDescriptor_t features_desc, const void* features, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void SoftplusGrad(const ExecutionContext& ctx, const cnnlTensorDescriptor_t gradients_desc, @@ -860,38 +1085,59 @@ class MLUCnnl { void* output); static void RsqrtGrad(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t data_desc, const void* y, - const void* diff_y, void* output); + const cnnlTensorDescriptor_t data_desc, + const void* y, + const void* diff_y, + void* output); static void SqrtGrad(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t data_desc, const void* y, - const void* diff_y, void* output); - - static void ConvolutionForward( - const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc_, - const void* alpha, const void* beta, - const cnnlTensorDescriptor_t bias_desc, const void* bias_ptr, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t filtet_desc, const void* filter, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void FusedConvBNQuantify( - const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc, - const void* epsilon_ptr, const int fused_ops_number, - const cnnlDataType_t tensor_dtype, const int input_position, - const float input_scale, const int filter_position, - const float filter_scale, const cnnlTensorDescriptor_t scale_desc, - const void* scale_ptr, const cnnlTensorDescriptor_t offset_desc, - const void* offset_ptr, const cnnlTensorDescriptor_t mean_desc, - const void* mean_ptr, const cnnlTensorDescriptor_t variance_desc, - const void* variance_ptr, const cnnlTensorDescriptor_t input_desc, - const void* input, const cnnlTensorDescriptor_t filtet_desc, - const void* filter, const cnnlTensorDescriptor_t output_desc, - void* output); + const cnnlTensorDescriptor_t data_desc, + const void* y, + const void* diff_y, + void* output); + + static void ConvolutionForward(const ExecutionContext& ctx, + cnnlConvolutionDescriptor_t conv_desc_, + const void* alpha, + const void* beta, + const cnnlTensorDescriptor_t bias_desc, + const void* bias_ptr, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t filtet_desc, + const void* filter, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void FusedConvBNQuantify(const ExecutionContext& ctx, + cnnlConvolutionDescriptor_t conv_desc, + const void* epsilon_ptr, + const int fused_ops_number, + const cnnlDataType_t tensor_dtype, + const int input_position, + const float input_scale, + const int filter_position, + const float filter_scale, + const cnnlTensorDescriptor_t scale_desc, + const void* scale_ptr, + const cnnlTensorDescriptor_t offset_desc, + const void* offset_ptr, + const cnnlTensorDescriptor_t mean_desc, + const void* mean_ptr, + const cnnlTensorDescriptor_t variance_desc, + const void* variance_ptr, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t filtet_desc, + const void* filter, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Tile(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void UnsortedSegmentSum(const ExecutionContext& ctx, const cnnlTensorDescriptor_t data_desc, @@ -901,12 +1147,17 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); - static void Reduce(const ExecutionContext& ctx, const bool need_workspace, + static void Reduce(const ExecutionContext& ctx, + const bool need_workspace, const cnnlReduceDescriptor_t reduction_desc, - const void* alpha, const cnnlTensorDescriptor_t input_desc, - const void* input, const size_t indices_size, - void* indices, const void* beta, - const cnnlTensorDescriptor_t output_desc, void* output); + const void* alpha, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const size_t indices_size, + void* indices, + const void* beta, + const cnnlTensorDescriptor_t output_desc, + void* output); static void FloorDiv(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, @@ -914,34 +1165,41 @@ class MLUCnnl { const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void FloorMod(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void Maximum(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void Minimum(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void PowR(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t input1_desc, const void* input1, - const cnnlTensorDescriptor_t input2_desc, const void* input2, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, + void* output); static void DivNoNan(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, @@ -949,7 +1207,8 @@ class MLUCnnl { const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void SquaredDifference(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input1_desc, @@ -960,52 +1219,73 @@ class MLUCnnl { void* output); static void L2Loss(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t input_desc, + const void* input, void* output); static void Abs(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Neg(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Floor(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Ceil(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void IsNan(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Square(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Sqrt(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Rsqrt(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Cos(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Sin(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void TrigonForward(const ExecutionContext& ctx, const cnnlTrigonDescriptor_t trigon_desc, @@ -1016,31 +1296,41 @@ class MLUCnnl { static void Exp(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Sign(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void IsFinite(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void IsNanInf(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, - const void* input, void* output); + const void* input, + void* output); static void Erf(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void Log1p(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void LogicalNot(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, @@ -1048,152 +1338,250 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); - static void DynamicStitch( - const ExecutionContext& ctx, const cnnlTensorDescriptor_t* indices_desc, - const int** indices, const cnnlTensorDescriptor_t* data_desc, - const void** data, const int size, int* indices_dims, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void CropAndResize( - const ExecutionContext& ctx, const std::string method_name, - const float extrapolation_value, const cnnlTensorDescriptor_t image_desc, - const void* image, const cnnlTensorDescriptor_t boxes_desc, - const void* boxes, const cnnlTensorDescriptor_t box_index_desc, - const void* box_index, const cnnlTensorDescriptor_t output_desc, - void* output); + static void DynamicStitch(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t* indices_desc, + const int** indices, + const cnnlTensorDescriptor_t* data_desc, + const void** data, + const int size, + int* indices_dims, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void CropAndResize(const ExecutionContext& ctx, + const std::string method_name, + const float extrapolation_value, + const cnnlTensorDescriptor_t image_desc, + const void* image, + const cnnlTensorDescriptor_t boxes_desc, + const void* boxes, + const cnnlTensorDescriptor_t box_index_desc, + const void* box_index, + const cnnlTensorDescriptor_t output_desc, + void* output); static void CropAndResizeBackwardImage( - const ExecutionContext& ctx, const std::string method_name, - const cnnlTensorDescriptor_t image_desc, const void* image, - const cnnlTensorDescriptor_t boxes_desc, const void* boxes, - const cnnlTensorDescriptor_t box_idx_desc, const void* box_idx, - const cnnlTensorDescriptor_t grads_image_desc, void* grads_image); + const ExecutionContext& ctx, + const std::string method_name, + const cnnlTensorDescriptor_t image_desc, + const void* image, + const cnnlTensorDescriptor_t boxes_desc, + const void* boxes, + const cnnlTensorDescriptor_t box_idx_desc, + const void* box_idx, + const cnnlTensorDescriptor_t grads_image_desc, + void* grads_image); static void CropAndResizeBackwardBoxes( - const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, - const void* input, const cnnlTensorDescriptor_t image_desc, - const void* image, const cnnlTensorDescriptor_t boxes_desc, - const void* boxes, const cnnlTensorDescriptor_t box_idx_desc, - const void* box_idx, const cnnlTensorDescriptor_t output_desc, + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t image_desc, + const void* image, + const cnnlTensorDescriptor_t boxes_desc, + const void* boxes, + const cnnlTensorDescriptor_t box_idx_desc, + const void* box_idx, + const cnnlTensorDescriptor_t output_desc, void* output); - static void PoolingBackward( - const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, - const void* alpha, const cnnlTensorDescriptor_t y_desc, const void* y, - const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y, - const cnnlTensorDescriptor_t x_desc, const void* x, const void* beta, - const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); - - static void AdaptivePoolingBackward( - const ExecutionContext& ctx, const cnnlPoolingMode_t pool_mode, - const cnnlTensorDescriptor_t y_desc, const void* y, - const cnnlTensorDescriptor_t index_desc, const void* index, - const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + static void PoolingBackward(const ExecutionContext& ctx, + const cnnlPoolingDescriptor_t pooling_desc, + const void* alpha, + const cnnlTensorDescriptor_t y_desc, + const void* y, + const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const void* beta, + const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x); + + static void AdaptivePoolingBackward(const ExecutionContext& ctx, + const cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t y_desc, + const void* y, + const cnnlTensorDescriptor_t index_desc, + const void* index, + const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x); static void PoolingIndex(const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, - const cnnlTensorDescriptor_t x_desc, const void* x, - const cnnlTensorDescriptor_t y_desc, void* y); + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t y_desc, + void* y); static void SpaceToBatch(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, - void* output, const int64_t block_shape[]); + void* output, + const int64_t block_shape[]); static void SpaceToBatchNd(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, cnnlSpaceBatchNdDescriptor_t param, - void* extra_device_input, size_t extra_input_size, + void* extra_device_input, + size_t extra_input_size, const cnnlTensorDescriptor_t output_desc, void* output); - static void Interp(const ExecutionContext& ctx, const cnnlInterpMode_t mode, - const bool align_corners, const bool half_pixel_centers, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + static void Interp(const ExecutionContext& ctx, + const cnnlInterpMode_t mode, + const bool align_corners, + const bool half_pixel_centers, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); - static void InterpBackward( - const ExecutionContext& ctx, const cnnlInterpBackwardMode_t mode, - const bool align_corners, const bool half_pixel_centers, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + static void InterpBackward(const ExecutionContext& ctx, + const cnnlInterpBackwardMode_t mode, + const bool align_corners, + const bool half_pixel_centers, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output); static void QuantizeParam(const ExecutionContext& ctx, - const cnnlQuantizeMode_t mode, const int bitwidth, + const cnnlQuantizeMode_t mode, + const int bitwidth, const cnnlTensorDescriptor_t input_desc, - const void* input, void* position, void* scale, + const void* input, + void* position, + void* scale, void* offset); - static void QuantizeMatMul( - const ExecutionContext& ctx, const bool transpose_a, - const bool transpose_b, const cnnlTensorDescriptor_t a_desc, - const void* a, const void* a_position, const void* a_scale, - const void* a_offset, const cnnlTensorDescriptor_t b_desc, const void* b, - const void* b_position, const void* b_scale, const void* b_offset, - const cnnlDataType_t quant_type, const cnnlDataType_t data_type, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void QuantizeBatchMatMul( - const ExecutionContext& ctx, const bool adj_x, const bool adj_y, - const cnnlTensorDescriptor_t a_desc, const void* a, - const void* a_position, const void* a_scale, const void* a_offset, - const cnnlTensorDescriptor_t b_desc, const void* b, - const void* b_position, const void* b_scale, const void* b_offset, - const cnnlDataType_t quant_type, const cnnlDataType_t data_type, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void QuantizeBatchMatMulBCast( - const ExecutionContext& ctx, const bool adj_x, const bool adj_y, - const cnnlTensorDescriptor_t a_desc, const void* a, - const void* a_position, const void* a_scale, const void* a_offset, - const cnnlTensorDescriptor_t b_desc, const void* b, - const void* b_position, const void* b_scale, const void* b_offset, - const cnnlDataType_t quant_type, const cnnlDataType_t data_type, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void FusedBatchNorm( - const ExecutionContext& ctx, const bool is_training, - const cnnlTensorDescriptor_t x_desc, const void* x, - const cnnlTensorDescriptor_t scale_desc, const void* scale, - const void* offset, const void* estimated_mean, - const void* estimated_variance, float epsilon, float momentum, - const cnnlTensorDescriptor_t output_desc, void* output, void* batch_mean, - void* batch_var, void* saved_mean, void* saved_var); - - static void FusedBatchNormGrad( - const ExecutionContext& ctx, const bool is_training, - const cnnlTensorDescriptor_t y_backprop_desc, const void* y_backprop, - const cnnlTensorDescriptor_t x_desc, const void* x, - const cnnlTensorDescriptor_t scale_desc, const void* scale, - const void* saved_mean, const void* saved_var, float epsilon, - const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop, - void* scale_backprop, void* offset_backprop); - - static void LayerNormForward(const ExecutionContext& ctx, int axis, + static void QuantizeMatMul(const ExecutionContext& ctx, + const bool transpose_a, + const bool transpose_b, + const cnnlTensorDescriptor_t a_desc, + const void* a, + const void* a_position, + const void* a_scale, + const void* a_offset, + const cnnlTensorDescriptor_t b_desc, + const void* b, + const void* b_position, + const void* b_scale, + const void* b_offset, + const cnnlDataType_t quant_type, + const cnnlDataType_t data_type, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void QuantizeBatchMatMul(const ExecutionContext& ctx, + const bool adj_x, + const bool adj_y, + const cnnlTensorDescriptor_t a_desc, + const void* a, + const void* a_position, + const void* a_scale, + const void* a_offset, + const cnnlTensorDescriptor_t b_desc, + const void* b, + const void* b_position, + const void* b_scale, + const void* b_offset, + const cnnlDataType_t quant_type, + const cnnlDataType_t data_type, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void QuantizeBatchMatMulBCast(const ExecutionContext& ctx, + const bool adj_x, + const bool adj_y, + const cnnlTensorDescriptor_t a_desc, + const void* a, + const void* a_position, + const void* a_scale, + const void* a_offset, + const cnnlTensorDescriptor_t b_desc, + const void* b, + const void* b_position, + const void* b_scale, + const void* b_offset, + const cnnlDataType_t quant_type, + const cnnlDataType_t data_type, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void FusedBatchNorm(const ExecutionContext& ctx, + const bool is_training, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t scale_desc, + const void* scale, + const void* offset, + const void* estimated_mean, + const void* estimated_variance, + float epsilon, + float momentum, + const cnnlTensorDescriptor_t output_desc, + void* output, + void* batch_mean, + void* batch_var, + void* saved_mean, + void* saved_var); + + static void FusedBatchNormGrad(const ExecutionContext& ctx, + const bool is_training, + const cnnlTensorDescriptor_t y_backprop_desc, + const void* y_backprop, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t scale_desc, + const void* scale, + const void* saved_mean, + const void* saved_var, + float epsilon, + const cnnlTensorDescriptor_t x_backprop_desc, + void* x_backprop, + void* scale_backprop, + void* offset_backprop); + + static void LayerNormForward(const ExecutionContext& ctx, + int axis, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t weight_bias_desc, - const void* weight, const void* bias, float eps, - const cnnlTensorDescriptor_t y_desc, void* y, + const void* weight, + const void* bias, + float eps, + const cnnlTensorDescriptor_t y_desc, + void* y, const cnnlTensorDescriptor_t mean_rstd_desc, - void* saved_mean, void* saved_rstd); - - static void LayerNormBackward( - const ExecutionContext& ctx, int axis, - const cnnlTensorDescriptor_t x_desc, const void* x, - const cnnlTensorDescriptor_t diff_z_desc, const void* diff_z, - const cnnlTensorDescriptor_t weight_bias_desc, const void* weight, - const cnnlTensorDescriptor_t mean_rstd_desc, const void* saved_mean, - const void* saved_rstd, const cnnlTensorDescriptor_t diff_x_desc, - void* diff_x, void* diff_weight, void* diff_bias); + void* saved_mean, + void* saved_rstd); + + static void LayerNormBackward(const ExecutionContext& ctx, + int axis, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t diff_z_desc, + const void* diff_z, + const cnnlTensorDescriptor_t weight_bias_desc, + const void* weight, + const cnnlTensorDescriptor_t mean_rstd_desc, + const void* saved_mean, + const void* saved_rstd, + const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x, + void* diff_weight, + void* diff_bias); static void Transpose(const ExecutionContext& ctx, - const std::vector perm, const int input_dim, + const std::vector perm, + const int input_dim, const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void TrilTriu(const ExecutionContext& ctx, const int diagonal_k, const bool tri_up_mode, @@ -1203,109 +1591,170 @@ class MLUCnnl { static void MatrixBandPart(const ExecutionContext& ctx, const cnnlTensorDescriptor_t data_desc, - const void* input, const int num_lower, - const int num_upper, void* output); + const void* input, + const int num_lower, + const int num_upper, + void* output); static void NumTrue(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t x_desc, const void* x, - Tensor index, uint32_t* num_true); + const cnnlTensorDescriptor_t x_desc, + const void* x, + Tensor index, + uint32_t* num_true); static void Where(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t x_desc, const void* x, - const uint32_t* strides, const uint32_t* index, - const cnnlTensorDescriptor_t y_desc, int* y, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const uint32_t* strides, + const uint32_t* index, + const cnnlTensorDescriptor_t y_desc, + int* y, const bool as_tuple); static void Conv2D(const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, const cnnlDataType_t tensor_dtype, - const cnnlDataType_t dt_onchip, const void* input_position, - const void* input_scale, const void* input_offset, - const void* filter_position, const void* filter_scale, + const cnnlDataType_t dt_onchip, + const void* input_position, + const void* input_scale, + const void* input_offset, + const void* filter_position, + const void* filter_scale, const void* filter_offset, - const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t input_desc, + const void* input, const cnnlTensorDescriptor_t filter_desc, - const void* filter, const cnnlTensorDescriptor_t bias_desc, - const void* bias, const cnnlTensorDescriptor_t output_desc, + const void* filter, + const cnnlTensorDescriptor_t bias_desc, + const void* bias, + const cnnlTensorDescriptor_t output_desc, void* output); - static void ConvBackpropInput( - const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, - const cnnlTensorDescriptor_t filter_desc, const void* filter, - const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, - const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop); + static void ConvBackpropInput(const ExecutionContext& ctx, + const cnnlConvolutionDescriptor_t conv_desc, + const cnnlTensorDescriptor_t filter_desc, + const void* filter, + const cnnlTensorDescriptor_t out_backprop_desc, + const void* out_backprop, + const cnnlTensorDescriptor_t in_backprop_desc, + void* in_backprop); static void QuantizeConvBackpropInput( - const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, - const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, - const void* filter_position, const void* filter_scale, - const void* filter_offset, const void* out_backprop_position, - const void* out_backprop_scale, const void* out_backprop_offset, - const cnnlTensorDescriptor_t input_desc, const void* filter, - const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, - const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop); + const ExecutionContext& ctx, + const cnnlConvolutionDescriptor_t conv_desc, + const cnnlDataType_t tensor_dtype, + const cnnlDataType_t dt_onchip, + const void* filter_position, + const void* filter_scale, + const void* filter_offset, + const void* out_backprop_position, + const void* out_backprop_scale, + const void* out_backprop_offset, + const cnnlTensorDescriptor_t input_desc, + const void* filter, + const cnnlTensorDescriptor_t out_backprop_desc, + const void* out_backprop, + const cnnlTensorDescriptor_t in_backprop_desc, + void* in_backprop); static void ConvBackpropFilter( - const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, - const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop); + const ExecutionContext& ctx, + const cnnlConvolutionDescriptor_t conv_desc, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t out_backprop_desc, + const void* out_backprop, + const cnnlTensorDescriptor_t filter_backprop_desc, + void* filter_backprop); static void QuantizeConvBackpropFilter( - const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc, - const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip, - const void* input_position, const void* input_scale, - const void* input_offset, const void* out_backprop_position, - const void* out_backprop_scale, const void* out_backprop_offset, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop, - const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop); - - static void DCNForward( - const ExecutionContext& ctx, const cnnlDCNDescriptor_t dcn_desc, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t offset_desc, const void* offset, - const cnnlTensorDescriptor_t mask_desc, const void* mask, - const cnnlTensorDescriptor_t weight_desc, const void* weight, - const cnnlTensorDescriptor_t bias_desc, const void* bias, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void DCNBackwardData( - const ExecutionContext& ctx, const cnnlDCNDescriptor_t dcn_desc, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t offset_desc, const void* offset, - const cnnlTensorDescriptor_t mask_desc, const void* mask, - const cnnlTensorDescriptor_t weight_desc, const void* weight, - const cnnlTensorDescriptor_t grad_output_desc, const void* grad_output, - const cnnlTensorDescriptor_t grad_input_desc, void* grad_input, - const cnnlTensorDescriptor_t grad_offset_desc, void* grad_offset, - const cnnlTensorDescriptor_t grad_mask_desc, void* grad_mask); - - static void DCNBackwardWeight( - const ExecutionContext& ctx, const cnnlDCNDescriptor_t dcn_desc, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t offset_desc, const void* offset, - const cnnlTensorDescriptor_t mask_desc, const void* mask, - const cnnlTensorDescriptor_t grad_output_desc, const void* grad_output, - const cnnlTensorDescriptor_t grad_weight_desc, void* grad_weight, - const cnnlTensorDescriptor_t grad_bias_desc, void* grad_bias); + const ExecutionContext& ctx, + const cnnlConvolutionDescriptor_t conv_desc, + const cnnlDataType_t tensor_dtype, + const cnnlDataType_t dt_onchip, + const void* input_position, + const void* input_scale, + const void* input_offset, + const void* out_backprop_position, + const void* out_backprop_scale, + const void* out_backprop_offset, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t out_backprop_desc, + const void* out_backprop, + const cnnlTensorDescriptor_t filter_backprop_desc, + void* filter_backprop); + + static void DCNForward(const ExecutionContext& ctx, + const cnnlDCNDescriptor_t dcn_desc, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t offset_desc, + const void* offset, + const cnnlTensorDescriptor_t mask_desc, + const void* mask, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t bias_desc, + const void* bias, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void DCNBackwardData(const ExecutionContext& ctx, + const cnnlDCNDescriptor_t dcn_desc, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t offset_desc, + const void* offset, + const cnnlTensorDescriptor_t mask_desc, + const void* mask, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t grad_output_desc, + const void* grad_output, + const cnnlTensorDescriptor_t grad_input_desc, + void* grad_input, + const cnnlTensorDescriptor_t grad_offset_desc, + void* grad_offset, + const cnnlTensorDescriptor_t grad_mask_desc, + void* grad_mask); + + static void DCNBackwardWeight(const ExecutionContext& ctx, + const cnnlDCNDescriptor_t dcn_desc, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t offset_desc, + const void* offset, + const cnnlTensorDescriptor_t mask_desc, + const void* mask, + const cnnlTensorDescriptor_t grad_output_desc, + const void* grad_output, + const cnnlTensorDescriptor_t grad_weight_desc, + void* grad_weight, + const cnnlTensorDescriptor_t grad_bias_desc, + void* grad_bias); static void InTopK(const ExecutionContext& ctx, const cnnlTensorDescriptor_t predictions_desc, const void* predictions, const cnnlTensorDescriptor_t targets_desc, - const void* targets, const cnnlTensorDescriptor_t k_desc, - const void* k, const int k_int, - const cnnlTensorDescriptor_t output_desc, void* output); + const void* targets, + const cnnlTensorDescriptor_t k_desc, + const void* k, + const int k_int, + const cnnlTensorDescriptor_t output_desc, + void* output); - static void ScatterNd(const ExecutionContext& ctx, cnnlScatterNdMode_t mode, + static void ScatterNd(const ExecutionContext& ctx, + cnnlScatterNdMode_t mode, const cnnlTensorDescriptor_t indices_desc, const void* indices, const cnnlTensorDescriptor_t updates_desc, const void* updates, const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void BitWise(const ExecutionContext& ctx, const cnnlBitComputeOp_t optype, @@ -1313,12 +1762,17 @@ class MLUCnnl { const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); static void QR(const ExecutionContext& ctx, - const cnnlTensorDescriptor_t a_desc, const void* a, - const cnnlTensorDescriptor_t q_desc, void* q, - const cnnlTensorDescriptor_t r_desc, void* r, const bool some); + const cnnlTensorDescriptor_t a_desc, + const void* a, + const cnnlTensorDescriptor_t q_desc, + void* q, + const cnnlTensorDescriptor_t r_desc, + void* r, + const bool some); static void Reciprocal(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, @@ -1326,55 +1780,85 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); - static void BceLoss( - const ExecutionContext& ctx, const cnnlBceLossReduction_t reduction, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t target_desc, const void* target, - const cnnlTensorDescriptor_t weight_desc, const void* weight, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void BceLossBackward( - const ExecutionContext& ctx, const cnnlBceLossReduction_t reduction, - const cnnlTensorDescriptor_t grad_desc, const void* grad, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t target_desc, const void* target, - const cnnlTensorDescriptor_t weight_desc, const void* weight, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void EmbeddingForward( - const ExecutionContext& ctx, const int padding_idx, - const cnnlTensorDescriptor_t weight_desc, const void* weight, - const cnnlTensorDescriptor_t indices_desc, const int* indices, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void Transform(const ExecutionContext& ctx, const void* alpha, + static void BceLoss(const ExecutionContext& ctx, + const cnnlBceLossReduction_t reduction, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t target_desc, + const void* target, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void BceLossBackward(const ExecutionContext& ctx, + const cnnlBceLossReduction_t reduction, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t target_desc, + const void* target, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void EmbeddingForward(const ExecutionContext& ctx, + const int padding_idx, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t indices_desc, + const int* indices, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void Transform(const ExecutionContext& ctx, + const void* alpha, const void* beta, const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void EmbeddingBackward( - const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, - const cnnlTensorDescriptor_t indices_desc, const void* indices, - const cnnlTensorDescriptor_t diff_desc, const void* diff, - const cnnlTensorDescriptor_t output_desc, void* output); - - static void BceWithLogits( - const ExecutionContext& ctx, cnnlBceWithLogitsReduction_t reduction, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t target_desc, const void* target, - const cnnlTensorDescriptor_t weight_desc, const void* weight, - const cnnlTensorDescriptor_t pos_weight_desc, const void* pos_weight, - const cnnlTensorDescriptor_t output_desc, void* output); + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void EmbeddingBackward(const ExecutionContext& ctx, + int padding_idx, + bool scale_grad_by_freq, + const cnnlTensorDescriptor_t indices_desc, + const void* indices, + const cnnlTensorDescriptor_t diff_desc, + const void* diff, + const cnnlTensorDescriptor_t output_desc, + void* output); + + static void BceWithLogits(const ExecutionContext& ctx, + cnnlBceWithLogitsReduction_t reduction, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t target_desc, + const void* target, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t pos_weight_desc, + const void* pos_weight, + const cnnlTensorDescriptor_t output_desc, + void* output); static void BceWithLogitsBackward( - const ExecutionContext& ctx, cnnlBceWithLogitsReduction_t reduction, - const cnnlTensorDescriptor_t grad_desc, const void* grad, - const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t target_desc, const void* target, - const cnnlTensorDescriptor_t weight_desc, const void* weight, - const cnnlTensorDescriptor_t pos_weight_desc, const void* pos_weight, - const cnnlTensorDescriptor_t diff_input_desc, void* diff_input); + const ExecutionContext& ctx, + cnnlBceWithLogitsReduction_t reduction, + const cnnlTensorDescriptor_t grad_desc, + const void* grad, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t target_desc, + const void* target, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t pos_weight_desc, + const void* pos_weight, + const cnnlTensorDescriptor_t diff_input_desc, + void* diff_input); }; template @@ -1393,22 +1877,27 @@ inline void TransposeFromMLUTensor(const ExecutionContext& ctx, transformed_output->mutable_data( framework::DDim(output_shape.data(), dim_size), ctx.GetPlace()); } - MLUCnnlTensorDesc trans_in_desc(*transformed_input, CNNL_LAYOUT_ARRAY, - ToCnnlDataType()); - MLUCnnlTensorDesc trans_out_desc(*transformed_output, CNNL_LAYOUT_ARRAY, - ToCnnlDataType()); - - MLUCnnl::Transpose(ctx, perm, dim_size, trans_in_desc.get(), - GetBasePtr(transformed_input), trans_out_desc.get(), + MLUCnnlTensorDesc trans_in_desc( + *transformed_input, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc trans_out_desc( + *transformed_output, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::Transpose(ctx, + perm, + dim_size, + trans_in_desc.get(), + GetBasePtr(transformed_input), + trans_out_desc.get(), GetBasePtr(transformed_output)); } template -inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx, T value, +inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx, + T value, Tensor* out) { MLUCnnlTensorDesc out_desc(*out); - MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), - GetBasePtr(out)); + MLUCnnl::Fill( + ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), GetBasePtr(out)); } } // namespace operators diff --git a/paddle/fluid/operators/scatter_op_mlu.cc b/paddle/fluid/operators/scatter_op_mlu.cc index 057ba3f4a4f2fc0051d983288c2f3d3392412a1d..952da0edb8f34d7049a9c0ee59ecc7baee93c242 100644 --- a/paddle/fluid/operators/scatter_op_mlu.cc +++ b/paddle/fluid/operators/scatter_op_mlu.cc @@ -33,26 +33,43 @@ class ScatterMLUKernel : public framework::OpKernel { cnnlScatterRefMode_t mode; if (overwrite) { mode = CNNL_SCATTERREF_UPDATE; - MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x), - updates_desc.get(), GetBasePtr(updates), - indices_desc.get(), GetBasePtr(indices), mode); + MLUCnnl::ScatterRefFunctor(ctx, + x_desc.get(), + GetBasePtr(x), + updates_desc.get(), + GetBasePtr(updates), + indices_desc.get(), + GetBasePtr(indices), + mode); } else { Tensor tensor_zeros(updates->type()); tensor_zeros.mutable_data(updates->dims(), ctx.GetPlace()); MLUCnnlTensorDesc tensor_zeros_desc(tensor_zeros); float value = 0.0; auto value_t = static_cast(value); - MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t, - tensor_zeros_desc.get(), GetBasePtr(&tensor_zeros)); + MLUCnnl::Fill(ctx, + CNNL_POINTER_MODE_HOST, + &value_t, + tensor_zeros_desc.get(), + GetBasePtr(&tensor_zeros)); mode = CNNL_SCATTERREF_UPDATE; - MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x), - tensor_zeros_desc.get(), - GetBasePtr(&tensor_zeros), indices_desc.get(), - GetBasePtr(indices), mode); + MLUCnnl::ScatterRefFunctor(ctx, + x_desc.get(), + GetBasePtr(x), + tensor_zeros_desc.get(), + GetBasePtr(&tensor_zeros), + indices_desc.get(), + GetBasePtr(indices), + mode); mode = CNNL_SCATTERREF_ADD; - MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x), - updates_desc.get(), GetBasePtr(updates), - indices_desc.get(), GetBasePtr(indices), mode); + MLUCnnl::ScatterRefFunctor(ctx, + x_desc.get(), + GetBasePtr(x), + updates_desc.get(), + GetBasePtr(updates), + indices_desc.get(), + GetBasePtr(indices), + mode); } paddle::framework::TensorCopy(*x, place, out); } @@ -62,5 +79,6 @@ class ScatterMLUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_MLU_KERNEL(scatter, ops::ScatterMLUKernel, +REGISTER_OP_MLU_KERNEL(scatter, + ops::ScatterMLUKernel, ops::ScatterMLUKernel);