Unverified commit 0d17c047, authored by zhaoying9105, committed by GitHub

[MLU](bugfix): fix MLUCnnl::ScatterFunctor function declaration bug (#43778)

Parent commit: 03972d5a
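The substantive change in this commit is the type of ScatterFunctor's first data argument in the MLU CNNL wrapper header shown below: the params pointer loses its const qualifier, presumably so the declaration matches a definition that writes the scatter result back into that buffer. The remaining hunks only re-wrap existing declarations to one parameter per line and do not change any signature. A condensed before/after of the fix (the full declaration appears later in the diff):

    // Before:
    static void ScatterFunctor(const ExecutionContext& ctx,
                               const cnnlTensorDescriptor_t params_desc,
                               const void* params,
                               /* updates_desc, updates, indices_desc, indices, dim, mode unchanged */);

    // After: params is now a mutable destination buffer.
    static void ScatterFunctor(const ExecutionContext& ctx,
                               const cnnlTensorDescriptor_t params_desc,
                               void* params,
                               /* updates_desc, updates, indices_desc, indices, dim, mode unchanged */);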
@@ -35,9 +35,12 @@ using DeviceContextPool = platform::DeviceContextPool;
using MLUDeviceContext = platform::MLUDeviceContext;

const std::map<std::string, cnnlReduceOp_t> MLUReduceOpMap = {
    {"reduce_all", CNNL_REDUCE_AND},
    {"reduce_any", CNNL_REDUCE_OR},
    {"reduce_max", CNNL_REDUCE_MAX},
    {"reduce_mean", CNNL_REDUCE_AVG},
    {"reduce_min", CNNL_REDUCE_MIN},
    {"reduce_sum", CNNL_REDUCE_ADD},
    {"reduce_prod", CNNL_REDUCE_MUL},
};
@@ -225,36 +228,49 @@ class MLUCnnlTensorDesc {
  MLUCnnlTensorDesc& operator=(MLUCnnlTensorDesc&& rhs);

  MLUCnnlTensorDesc(const int tensor_dim,
                    const int dim_sizes[],
                    const cnnlDataType_t tensor_dtype);

  MLUCnnlTensorDesc(const int tensor_dim,
                    const int dim_sizes[],
                    const cnnlDataType_t tensor_dtype,
                    const cnnlTensorLayout_t layout);

  MLUCnnlTensorDesc(const int tensor_dim,
                    const int dim_sizes[],
                    const cnnlDataType_t tensor_dtype,
                    int position);

  MLUCnnlTensorDesc(const int tensor_dim,
                    const int64_t dim_sizes[],
                    const cnnlDataType_t tensor_dtype);

  MLUCnnlTensorDesc(const int tensor_dim,
                    const int64_t dim_sizes[],
                    const cnnlDataType_t tensor_dtype,
                    const cnnlTensorLayout_t layout);

  MLUCnnlTensorDesc(const int tensor_dim,
                    const int64_t dim_sizes[],
                    const cnnlDataType_t tensor_dtype,
                    int position);

  MLUCnnlTensorDesc(const Tensor& tensor,
                    const cnnlTensorLayout_t layout,
                    const cnnlDataType_t tensor_dtype);

  explicit MLUCnnlTensorDesc(const Tensor& tensor);

  MLUCnnlTensorDesc(const Tensor& tensor,
                    cnnlTensorLayout_t layout,
                    const cnnlDataType_t tensor_dtype,
                    int position);

  MLUCnnlTensorDesc(const Tensor& tensor,
                    cnnlTensorLayout_t layout,
                    const cnnlDataType_t tensor_dtype,
                    int position,
                    float scale);

  ~MLUCnnlTensorDesc();
@@ -270,8 +286,10 @@ class MLUCnnlActivationDesc {
  MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete;
  MLUCnnlActivationDesc& operator=(const MLUCnnlActivationDesc& desc) = delete;
  MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof);
  MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode,
                        const float ceof,
                        const float sliced_dim,
                        const float selu_alpha,
                        const float selu_lambda);

  const cnnlActivationDescriptor_t get() const;
@@ -288,14 +306,22 @@ class MLUCnnlPoolingDesc {
  MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode,
                     const cnnlNanPropagation_t maxpooling_nan_opt,
                     int window_rows,
                     int window_cols,
                     int64_t pad_up,
                     int64_t pad_down,
                     int64_t pad_left,
                     int64_t pad_right,
                     int row_stride,
                     int col_stride,
                     int row_dilation,
                     int col_dilation,
                     bool ceil_mode);

  MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode,
                     const cnnlNanPropagation_t maxpooling_nan_opt,
                     const int tensor_rank,
                     const std::vector<int>& window,
                     const std::vector<int>& padding,
                     const std::vector<int>& stride);
@@ -364,8 +390,10 @@ class MLUCnnlNMSDesc {
  MLUCnnlNMSDesc(const MLUCnnlNMSDesc& desc) = delete;
  MLUCnnlNMSDesc& operator=(const MLUCnnlNMSDesc& desc) = delete;

  MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode,
                 const float iou_threshold,
                 const int max_output_size,
                 const float confidence_threshold,
                 const int input_layout);

  const cnnlNmsDescriptor_t get() const;
@@ -378,12 +406,17 @@ class MLUCnnlNMSDesc {
class MLUCnnlConvolutionDesc {
 public:
  MLUCnnlConvolutionDesc(const int dims,
                         const int pad[],
                         const int stride[],
                         const int dilation[],
                         const int group_count,
                         const cnnlDataType_t tensor_dtype);

  MLUCnnlConvolutionDesc(const int dims,
                         const int64_t pad[],
                         const int64_t stride[],
                         const int64_t dilation[],
                         const int group_count,
                         const cnnlDataType_t tensor_dtype);
@@ -402,7 +435,8 @@ class MLUCnnlConvolutionDesc {
class MLUCnnlBatchSpaceDesc {
 public:
  MLUCnnlBatchSpaceDesc(uint32_t block_shape[],
                        uint32_t paddings[],
                        const uint32_t block_shape_size,
                        const uint32_t paddings_size);
@@ -446,8 +480,12 @@ class MLUCnnlTrigonDesc {
class MLUCnnlDCNDesc {
 public:
  MLUCnnlDCNDesc(int dimNb,
                 const int* pad,
                 const int* stride,
                 const int* dilation,
                 int deformable_group,
                 int conv_group,
                 int im2col_step);

  const cnnlDCNDescriptor_t get() const;
...@@ -461,55 +499,88 @@ class MLUCnnl { ...@@ -461,55 +499,88 @@ class MLUCnnl {
public: public:
static void Active(const ExecutionContext& ctx, static void Active(const ExecutionContext& ctx,
cnnlActivationDescriptor_t active_desc, cnnlActivationDescriptor_t active_desc,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void ActiveGrad( static void ActiveGrad(const ExecutionContext& ctx,
const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc, cnnlActivationDescriptor_t active_desc,
const void* alpha, const void* beta, const cnnlTensorDescriptor_t y_desc, const void* alpha,
const void* y, const cnnlTensorDescriptor_t diff_y_desc, const void* beta,
const void* diff_y, const cnnlTensorDescriptor_t x_desc, const void* x, const cnnlTensorDescriptor_t y_desc,
const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); const void* y,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void Concat(const ExecutionContext& ctx, const int pack_num, static void Concat(const ExecutionContext& ctx,
const int axis, const cnnlTensorDescriptor_t inputs_desc[], const int pack_num,
const int axis,
const cnnlTensorDescriptor_t inputs_desc[],
const void* const inputs[], const void* const inputs[],
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num, static void Concat(const MLUDeviceContext& dev_ctx,
const int axis, const cnnlTensorDescriptor_t inputs_desc[], const int pack_num,
const int axis,
const cnnlTensorDescriptor_t inputs_desc[],
const void* const inputs[], const void* const inputs[],
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type, static void Cast(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, cnnlCastDataType_t cast_type,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Clip(const ExecutionContext& ctx, static void Clip(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const void* min, const void* max, void* y); const void* input,
const void* min,
const void* max,
void* y);
static void HardtanhBackward( static void HardtanhBackward(const ExecutionContext& ctx,
const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const cnnlTensorDescriptor_t x_desc,
const void* x, const cnnlTensorDescriptor_t diff_y_desc, const void* x,
const void* diff_y, const float max_val, const float min_val, const cnnlTensorDescriptor_t diff_y_desc,
const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); const void* diff_y,
const float max_val,
const float min_val,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void Div(const ExecutionContext& ctx, static void Div(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t in0_desc, const void* in0, const cnnlTensorDescriptor_t in0_desc,
const cnnlTensorDescriptor_t in1_desc, const void* in1, const void* in0,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Fill(const ExecutionContext& ctx, static void Fill(const ExecutionContext& ctx,
const cnnlPointerMode_t pointer_mode, const void* value_ptr, const cnnlPointerMode_t pointer_mode,
const cnnlTensorDescriptor_t output_desc, void* output); const void* value_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void LRN(const ExecutionContext& ctx, const int local_size, static void LRN(const ExecutionContext& ctx,
const double alpha, const double beta, const double k, const int local_size,
const double alpha,
const double beta,
const double k,
const cnnlTensorDescriptor_t input_quant_desc, const cnnlTensorDescriptor_t input_quant_desc,
const void* input_quant, const void* input_quant,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantifyOffline(const ExecutionContext& context, static void QuantifyOffline(const ExecutionContext& context,
cnnlQuantizeMode_t mode, cnnlQuantizeMode_t mode,
...@@ -521,98 +592,158 @@ class MLUCnnl { ...@@ -521,98 +592,158 @@ class MLUCnnl {
static void QuantifyOnline(const ExecutionContext& context, static void QuantifyOnline(const ExecutionContext& context,
const int bitwidth, const int bitwidth,
const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t input_desc,
const void* input, const bool compute_scale, const void* input,
void* position, void* scale, const bool compute_scale,
void* position,
void* scale,
const cnnlTensorDescriptor_t ouput_desc, const cnnlTensorDescriptor_t ouput_desc,
void* output); void* output);
static void SGD(const ExecutionContext& context, static void SGD(const ExecutionContext& context,
const cnnlTensorDescriptor_t grad_desc, const void* grad, const cnnlTensorDescriptor_t grad_desc,
const void* lr, const cnnlTensorDescriptor_t var_desc, const void* grad,
const void* lr,
const cnnlTensorDescriptor_t var_desc,
void* var); void* var);
static void ApplyAdaGrad(const ExecutionContext& ctx, static void ApplyAdaGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc, const cnnlTensorDescriptor_t grad_desc,
const void* grad, const void* grad,
const cnnlTensorDescriptor_t accum_desc, void* accum, const cnnlTensorDescriptor_t accum_desc,
const cnnlTensorDescriptor_t var_desc, void* var, void* accum,
const void* lr, const bool update_slots); const cnnlTensorDescriptor_t var_desc,
void* var,
const void* lr,
const bool update_slots);
static void ApplyRMSProp(const ExecutionContext& context, static void ApplyRMSProp(const ExecutionContext& context,
const cnnlTensorDescriptor_t grad_desc, const cnnlTensorDescriptor_t grad_desc,
const void* grad, const void* lr, const void* rho, const void* grad,
const void* momentum, const void* epsilon, const void* lr,
const cnnlTensorDescriptor_t var_desc, void* var, const void* rho,
const cnnlTensorDescriptor_t ms_desc, void* ms, const void* momentum,
const cnnlTensorDescriptor_t mom_desc, void* mom); const void* epsilon,
const cnnlTensorDescriptor_t var_desc,
static void ApplyCenterRMSProp( void* var,
const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc, const cnnlTensorDescriptor_t ms_desc,
const void* grad, const void* lr, const void* rho, const void* momentum, void* ms,
const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var, const cnnlTensorDescriptor_t mom_desc,
const cnnlTensorDescriptor_t mg_desc, void* mg, void* mom);
const cnnlTensorDescriptor_t ms_desc, void* ms,
const cnnlTensorDescriptor_t mom_desc, void* mom); static void ApplyCenterRMSProp(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const void* lr,
const void* rho,
const void* momentum,
const void* epsilon,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t mg_desc,
void* mg,
const cnnlTensorDescriptor_t ms_desc,
void* ms,
const cnnlTensorDescriptor_t mom_desc,
void* mom);
static void ApplyAdam(const ExecutionContext& ctx, static void ApplyAdam(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t var_desc, void* var, const cnnlTensorDescriptor_t var_desc,
const cnnlTensorDescriptor_t m_desc, void* m, void* var,
const cnnlTensorDescriptor_t v_desc, void* v, const cnnlTensorDescriptor_t m_desc,
void* m,
const cnnlTensorDescriptor_t v_desc,
void* v,
const cnnlTensorDescriptor_t grad_desc, const cnnlTensorDescriptor_t grad_desc,
const void* grad, const void* lr, const void* beta1, const void* grad,
const void* beta2, const void* beta1_power, const void* lr,
const void* beta2_power, const void* epsilon, const void* beta1,
const void* beta2,
const void* beta1_power,
const void* beta2_power,
const void* epsilon,
const bool use_nesterov); const bool use_nesterov);
static void ApplyAdaMax(const ExecutionContext& ctx, static void ApplyAdaMax(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc, const cnnlTensorDescriptor_t grad_desc,
const cnnlTensorDescriptor_t var_desc, void* var, const cnnlTensorDescriptor_t var_desc,
const cnnlTensorDescriptor_t m_desc, void* m, void* var,
const cnnlTensorDescriptor_t v_desc, void* v, const cnnlTensorDescriptor_t m_desc,
const void* diff, const void* lr, const void* beta1, void* m,
const void* beta2, const void* beta1_power, const cnnlTensorDescriptor_t v_desc,
void* v,
const void* diff,
const void* lr,
const void* beta1,
const void* beta2,
const void* beta1_power,
const void* epsilon); const void* epsilon);
static void ApplyMomentum(const ExecutionContext& ctx, static void ApplyMomentum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc, const cnnlTensorDescriptor_t grad_desc,
const void* grad, const bool use_nesterov, const void* grad,
const void* lr, const void* momentum, void* var, const bool use_nesterov,
const void* lr,
const void* momentum,
void* var,
void* accum); void* accum);
static void ApplyKerasMomentum(const ExecutionContext& ctx, static void ApplyKerasMomentum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc, const cnnlTensorDescriptor_t grad_desc,
const void* grad, const bool use_nesterov, const void* grad,
const void* lr, const void* momentum, const bool use_nesterov,
void* var, void* accum); const void* lr,
const void* momentum,
void* var,
void* accum);
static void ApplyAdadelta(const ExecutionContext& ctx, static void ApplyAdadelta(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc, const cnnlTensorDescriptor_t grad_desc,
const void* diff, const void* lr, const void* rho, const void* diff,
const void* epsilon, void* var, void* accum, const void* lr,
const void* rho,
const void* epsilon,
void* var,
void* accum,
void* accum_update); void* accum_update);
static void SparseSoftmaxXentWithLogits( static void SparseSoftmaxXentWithLogits(
const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc, const void* input, cnnlSoftmaxMode_t mode,
const cnnlTensorDescriptor_t label_desc, const void* label, const cnnlTensorDescriptor_t x_desc,
const cnnlTensorDescriptor_t y_desc, void* output, const void* input,
const cnnlTensorDescriptor_t diff_y_desc, void* back_out); const cnnlTensorDescriptor_t label_desc,
const void* label,
static void RandomUniform(const ExecutionContext& ctx, const int num, const cnnlTensorDescriptor_t y_desc,
void* output,
const cnnlTensorDescriptor_t diff_y_desc,
void* back_out);
static void RandomUniform(const ExecutionContext& ctx,
const int num,
const cnnlDataType_t data_type, const cnnlDataType_t data_type,
const cnnlRandGenerator_t mlu_generator, const cnnlRandGenerator_t mlu_generator,
void* mlu_state, void* output); void* mlu_state,
void* output);
static void FusedDropout( static void FusedDropout(const ExecutionContext& ctx,
const ExecutionContext& ctx, const cnnlRandGenerator_t generator, const cnnlRandGenerator_t generator,
const cnnlTensorDescriptor_t input_desc, const void* input, const float p, const cnnlTensorDescriptor_t input_desc,
void* state, const cnnlTensorDescriptor_t mask_desc, const void* mask, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output); const float p,
void* state,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Cumsum(const ExecutionContext& ctx, const int axis, static void Cumsum(const ExecutionContext& ctx,
const bool exclusive, const bool reverse, const int axis,
const cnnlTensorDescriptor_t input_desc, const void* input, const bool exclusive,
const cnnlTensorDescriptor_t ouput_desc, void* output); const bool reverse,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t ouput_desc,
void* output);
  static void BroadcastTo(const ExecutionContext& ctx,
                          const cnnlTensorDescriptor_t input_desc,
@@ -620,189 +751,267 @@ class MLUCnnl {
                          const cnnlTensorDescriptor_t output_desc,
                          void* output);

  static void GatherFunctor(const ExecutionContext& ctx,
                            const int axis,
                            const int batch_dims,
                            const cnnlTensorDescriptor_t params_desc,
                            const void* params,
                            const cnnlTensorDescriptor_t indices_desc,
                            const void* indices,
                            const cnnlTensorDescriptor_t output_desc,
                            void* output);

  static void ScatterRefFunctor(const ExecutionContext& ctx,
                                const cnnlTensorDescriptor_t params_desc,
                                const void* params,
                                const cnnlTensorDescriptor_t updates_desc,
                                const void* updates,
                                const cnnlTensorDescriptor_t indices_desc,
                                const void* indices,
                                const cnnlScatterRefMode_t mode);

  static void ScatterFunctor(const ExecutionContext& ctx,
                             const cnnlTensorDescriptor_t params_desc,
                             void* params,
                             const cnnlTensorDescriptor_t updates_desc,
                             const void* updates,
                             const cnnlTensorDescriptor_t indices_desc,
                             const void* indices,
                             const int dim,
                             const cnnlScatterMode_t mode = CNNL_SCATTER);
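A minimal sketch of a caller under the fixed declaration, assuming an MLUCnnlTensorDesc::get() accessor and a GetBasePtr(Tensor*) helper (both assumptions here); the tensor names are hypothetical and not taken from a specific kernel:

    // out already holds the params data and is updated in place, so its base
    // pointer is passed as the mutable void* params argument (no const_cast needed).
    MLUCnnlTensorDesc params_desc(*out);
    MLUCnnlTensorDesc updates_desc(updates);
    MLUCnnlTensorDesc indices_desc(indices);
    MLUCnnl::ScatterFunctor(ctx,
                            params_desc.get(),
                            GetBasePtr(out),
                            updates_desc.get(),
                            GetBasePtr(&updates),
                            indices_desc.get(),
                            GetBasePtr(&indices),
                            /*dim=*/0);  // mode defaults to CNNL_SCATTER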
  static void Range(const ExecutionContext& ctx,
                    const void* start,
                    const void* end,
                    const void* step,
                    const cnnlDataType_t output_dtype,
                    void* output);

  static void Round(const ExecutionContext& ctx,
                    const cnnlTensorDescriptor_t input_desc,
                    const void* input,
                    const cnnlTensorDescriptor_t output_desc,
                    void* output);
static void TopK(const ExecutionContext& ctx, const int k, const int dim, static void TopK(const ExecutionContext& ctx,
const bool largest, const bool sorted, const int k,
const cnnlTensorDescriptor_t input_desc, const void* input, const int dim,
const bool largest,
const bool sorted,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t values_output_desc, const cnnlTensorDescriptor_t values_output_desc,
void* values_out, void* values_out,
const cnnlTensorDescriptor_t indices_output_desc, const cnnlTensorDescriptor_t indices_output_desc,
void* indices_out); void* indices_out);
static void StridedSlice(const ExecutionContext& ctx, const int begin[], static void StridedSlice(const ExecutionContext& ctx,
const int end[], const int strides[], const int begin[],
const int end[],
const int strides[],
const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t input_desc,
const void* input, const void* input,
const cnnlTensorDescriptor_t output_desc, const cnnlTensorDescriptor_t output_desc,
void* output); void* output);
static void Split(const ExecutionContext& ctx, int split_num, int axis, static void Split(const ExecutionContext& ctx,
int split_num,
int axis,
const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t input_desc,
const void* input_ptr, const void* input_ptr,
const cnnlTensorDescriptor_t output_descs[], const cnnlTensorDescriptor_t output_descs[],
void* output_ptrs[]); void* output_ptrs[]);
static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis, static void Split(const MLUDeviceContext& dev_ctx,
int split_num,
int axis,
const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t input_desc,
const void* input_ptr, const void* input_ptr,
const cnnlTensorDescriptor_t output_descs[], const cnnlTensorDescriptor_t output_descs[],
void* output_ptrs[]); void* output_ptrs[]);
static void Scale(const ExecutionContext& ctx, const int axis, static void Scale(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const int axis,
const cnnlTensorDescriptor_t alpha_desc, const void* alpha, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t beta_desc, const void* beta, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t alpha_desc,
const void* alpha,
const cnnlTensorDescriptor_t beta_desc,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void AddN(const ExecutionContext& ctx, uint32_t input_num, static void AddN(const ExecutionContext& ctx,
uint32_t input_num,
const cnnlTensorDescriptor_t inputs_desc[], const cnnlTensorDescriptor_t inputs_desc[],
const void* inputs[], const void* inputs[],
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void Log(const ExecutionContext& ctx, static void Log(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlLogBase_t log_base, cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input, cnnlLogBase_t log_base,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void StridedSliceGrad(const ExecutionContext& ctx, const int begin[], static void StridedSliceGrad(const ExecutionContext& ctx,
const int end[], const int strides[], const int begin[],
const int end[],
const int strides[],
const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t input_desc,
const void* input, const void* input,
const cnnlTensorDescriptor_t output_desc, const cnnlTensorDescriptor_t output_desc,
void* output); void* output);
static void Logic(const ExecutionContext& ctx, const cnnlLogicOp_t log_method, static void Logic(const ExecutionContext& ctx,
const cnnlLogicOp_t log_method,
const cnnlTensorDescriptor_t input1_desc, const cnnlTensorDescriptor_t input1_desc,
const void* input1, const void* input1,
const cnnlTensorDescriptor_t input2_desc, const cnnlTensorDescriptor_t input2_desc,
const void* input2, const cnnlTensorDescriptor_t ouput_desc, const void* input2,
const cnnlTensorDescriptor_t ouput_desc,
void* output); void* output);
static void Select( static void Select(const ExecutionContext& ctx,
const ExecutionContext& ctx, const cnnlTensorDescriptor_t condition_desc, const cnnlTensorDescriptor_t condition_desc,
const void* condition_ptr, const cnnlTensorDescriptor_t then_desc, const void* condition_ptr,
const void* then_ptr, const cnnlTensorDescriptor_t else_desc, const cnnlTensorDescriptor_t then_desc,
const void* else_ptr, const cnnlTensorDescriptor_t output_desc, const void* then_ptr,
const cnnlTensorDescriptor_t else_desc,
const void* else_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output_ptr); void* output_ptr);
static void AssignAdd(const ExecutionContext& ctx, const void* alpha, static void AssignAdd(const ExecutionContext& ctx,
const void* alpha,
const void* beta, const void* beta,
const cnnlTensorDescriptor_t update_desc, const cnnlTensorDescriptor_t update_desc,
const void* update, const void* update,
const cnnlTensorDescriptor_t param_desc, void* param); const cnnlTensorDescriptor_t param_desc,
void* param);
static void AssignSub(const ExecutionContext& ctx, const void* alpha, static void AssignSub(const ExecutionContext& ctx,
const void* alpha,
const void* beta, const void* beta,
const cnnlTensorDescriptor_t update_desc, const cnnlTensorDescriptor_t update_desc,
const void* update, const void* update,
const cnnlTensorDescriptor_t param_desc, void* param); const cnnlTensorDescriptor_t param_desc,
void* param);
static void Assign(const ExecutionContext& ctx, static void Assign(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t update_desc, const cnnlTensorDescriptor_t update_desc,
const void* update, const void* update,
const cnnlTensorDescriptor_t param_desc, void* param); const cnnlTensorDescriptor_t param_desc,
void* param);
static void GatherNd(const ExecutionContext& ctx, static void GatherNd(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t params_desc, const cnnlTensorDescriptor_t params_desc,
const void* params, const void* params,
const cnnlTensorDescriptor_t indices_desc, const cnnlTensorDescriptor_t indices_desc,
const void* indices, const void* indices,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void BatchToSpace(const ExecutionContext& ctx, static void BatchToSpace(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t input_desc,
const void* input, const void* input,
const cnnlTensorDescriptor_t output_desc, const cnnlTensorDescriptor_t output_desc,
void* output, const cnnlSpaceBatchParam_t param); void* output,
const cnnlSpaceBatchParam_t param);
static void BatchToSpaceNd(const ExecutionContext& ctx, static void BatchToSpaceNd(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t input_desc,
const void* input, const void* input,
cnnlSpaceBatchNdDescriptor_t param, cnnlSpaceBatchNdDescriptor_t param,
void* extra_device_input, size_t extra_input_size, void* extra_device_input,
size_t extra_input_size,
const cnnlTensorDescriptor_t output_desc, const cnnlTensorDescriptor_t output_desc,
void* output); void* output);
static void PoolingForward( static void PoolingForward(const ExecutionContext& ctx,
const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, cnnlPoolingMode_t pool_mode,
int64_t output_h, int64_t output_w, cnnlPoolingDescriptor_t pooling_desc, int64_t output_h,
const void* alpha, const cnnlTensorDescriptor_t input_desc, int64_t output_w,
const void* input, const void* beta, const void* extra_input_ptr, cnnlPoolingDescriptor_t pooling_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* beta,
const void* extra_input_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void AdaptivePoolingForward( static void AdaptivePoolingForward(const ExecutionContext& ctx,
const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, cnnlPoolingMode_t pool_mode,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output, const void* input,
const cnnlTensorDescriptor_t index_desc, void* index); const cnnlTensorDescriptor_t output_desc,
void* output,
const cnnlTensorDescriptor_t index_desc,
void* index);
static void Pool3D(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, static void Pool3D(const ExecutionContext& ctx,
cnnlPoolingMode_t pool_mode,
const std::vector<int64_t>& output_shape, const std::vector<int64_t>& output_shape,
cnnlPoolingDescriptor_t pooling_desc, const void* alpha, cnnlPoolingDescriptor_t pooling_desc,
const cnnlTensorDescriptor_t input_desc, const void* input, const void* alpha,
const void* beta, const cnnlTensorDescriptor_t output_desc, const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output); void* output);
static void Pad(const ExecutionContext& ctx, static void Pad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const void* paddings, const void* padding_value, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output); const void* paddings,
const void* padding_value,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Matmul(const ExecutionContext& ctx, const bool transpose_a, static void Matmul(const ExecutionContext& ctx,
const bool transpose_a,
const bool transpose_b, const bool transpose_b,
const cnnlTensorDescriptor_t in0_desc, const void* in0, const cnnlTensorDescriptor_t in0_desc,
const cnnlTensorDescriptor_t in1_desc, const void* in1, const void* in0,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BatchMatmul( static void BatchMatmul(const ExecutionContext& ctx,
const ExecutionContext& ctx, const bool transpose_a, const bool transpose_a,
const bool transpose_b, const cnnlTensorDescriptor_t in0_desc, const bool transpose_b,
const void* in0, const cnnlTensorDescriptor_t in1_desc, const void* in1, const cnnlTensorDescriptor_t in0_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* in0,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void MulAx(const ExecutionContext& ctx, static void MulAx(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t alpha_desc, const void* alpha, const cnnlTensorDescriptor_t alpha_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* alpha,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void OpTensor(const ExecutionContext& ctx, static void OpTensor(const ExecutionContext& ctx,
const cnnlOpTensorDescriptor_t op_tensor_desc, const cnnlOpTensorDescriptor_t op_tensor_desc,
const cnnlTensorDescriptor_t a_desc, const void* a, const cnnlTensorDescriptor_t a_desc,
const cnnlTensorDescriptor_t b_desc, const void* b, const void* a,
const cnnlTensorDescriptor_t output_desc, void* output, const cnnlTensorDescriptor_t b_desc,
const void* b,
const cnnlTensorDescriptor_t output_desc,
void* output,
const cnnlDataType_t dtype, const cnnlDataType_t dtype,
const float alpha1_float = 1.f, const float alpha1_float = 1.f,
const float alpha2_float = 1.f, const float alpha2_float = 1.f,
const float beta_float = 0.f); const float beta_float = 0.f);
static void BiasAddGrad(const ExecutionContext& ctx, const int axis, static void BiasAddGrad(const ExecutionContext& ctx,
const int axis,
const cnnlTensorDescriptor_t out_backprop_desc, const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop, const void* out_backprop,
const cnnlTensorDescriptor_t output_desc, const cnnlTensorDescriptor_t output_desc,
@@ -810,9 +1019,13 @@ class MLUCnnl {
  static void OneHot(const ExecutionContext& ctx,
                     const cnnlTensorDescriptor_t desc_indices,
                     const void* indices,
                     const int depth,
                     const void* on_value,
                     const void* off_value,
                     const int axis,
                     cnnlDataType_t output_data_type,
                     void* output);
static void NonMaxSuppression(const ExecutionContext& ctx, static void NonMaxSuppression(const ExecutionContext& ctx,
const cnnlNmsDescriptor_t nms_desc, const cnnlNmsDescriptor_t nms_desc,
...@@ -821,35 +1034,47 @@ class MLUCnnl { ...@@ -821,35 +1034,47 @@ class MLUCnnl {
const cnnlTensorDescriptor_t confidence_desc, const cnnlTensorDescriptor_t confidence_desc,
const void* confidence, const void* confidence,
const cnnlTensorDescriptor_t output_desc, const cnnlTensorDescriptor_t output_desc,
void* output, void* output_size); void* output,
void* output_size);
static void SoftmaxCrossEntropyWithLogits( static void SoftmaxCrossEntropyWithLogits(
const ExecutionContext& ctx, cnnlSoftmaxMode_t mode, const ExecutionContext& ctx,
cnnlSoftmaxMode_t mode,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* logits_in, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t label_desc, const void* labels_in, const void* logits_in,
const cnnlTensorDescriptor_t loss_out_desc, void* loss_out, const cnnlTensorDescriptor_t label_desc,
const cnnlTensorDescriptor_t back_out_desc, void* back_out); const void* labels_in,
const cnnlTensorDescriptor_t loss_out_desc,
void* loss_out,
const cnnlTensorDescriptor_t back_out_desc,
void* back_out);
static void SoftmaxForward(const ExecutionContext& ctx, static void SoftmaxForward(const ExecutionContext& ctx,
cnnlSoftmaxAlgorithm_t algorithm, cnnlSoftmaxAlgorithm_t algorithm,
cnnlSoftmaxMode_t mode, const void* alpha, cnnlSoftmaxMode_t mode,
const void* alpha,
const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t input_desc,
const void* input, const void* beta, const void* input,
const void* beta,
const cnnlTensorDescriptor_t output_desc, const cnnlTensorDescriptor_t output_desc,
void* output); void* output);
static void SoftmaxBackward( static void SoftmaxBackward(const ExecutionContext& ctx,
const ExecutionContext& ctx, cnnlSoftmaxAlgorithm_t algorithm, cnnlSoftmaxAlgorithm_t algorithm,
cnnlSoftmaxMode_t mode, const cnnlTensorDescriptor_t y_desc, cnnlSoftmaxMode_t mode,
const void* y, const cnnlTensorDescriptor_t diff_y_desc, const cnnlTensorDescriptor_t y_desc,
const void* diff_y, const cnnlTensorDescriptor_t diff_x_desc, const void* y,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x); void* diff_x);
static void Softplus(const ExecutionContext& ctx, static void Softplus(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t features_desc, const cnnlTensorDescriptor_t features_desc,
const void* features, const void* features,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void SoftplusGrad(const ExecutionContext& ctx, static void SoftplusGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t gradients_desc, const cnnlTensorDescriptor_t gradients_desc,
...@@ -860,38 +1085,59 @@ class MLUCnnl { ...@@ -860,38 +1085,59 @@ class MLUCnnl {
void* output); void* output);
static void RsqrtGrad(const ExecutionContext& ctx, static void RsqrtGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc, const void* y, const cnnlTensorDescriptor_t data_desc,
const void* diff_y, void* output); const void* y,
const void* diff_y,
void* output);
static void SqrtGrad(const ExecutionContext& ctx, static void SqrtGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc, const void* y, const cnnlTensorDescriptor_t data_desc,
const void* diff_y, void* output); const void* y,
const void* diff_y,
static void ConvolutionForward( void* output);
const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc_,
const void* alpha, const void* beta, static void ConvolutionForward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t bias_desc, const void* bias_ptr, cnnlConvolutionDescriptor_t conv_desc_,
const cnnlTensorDescriptor_t input_desc, const void* input, const void* alpha,
const cnnlTensorDescriptor_t filtet_desc, const void* filter, const void* beta,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t bias_desc,
const void* bias_ptr,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t filtet_desc,
const void* filter,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void FusedConvBNQuantify( static void FusedConvBNQuantify(const ExecutionContext& ctx,
const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc, cnnlConvolutionDescriptor_t conv_desc,
const void* epsilon_ptr, const int fused_ops_number, const void* epsilon_ptr,
const cnnlDataType_t tensor_dtype, const int input_position, const int fused_ops_number,
const float input_scale, const int filter_position, const cnnlDataType_t tensor_dtype,
const float filter_scale, const cnnlTensorDescriptor_t scale_desc, const int input_position,
const void* scale_ptr, const cnnlTensorDescriptor_t offset_desc, const float input_scale,
const void* offset_ptr, const cnnlTensorDescriptor_t mean_desc, const int filter_position,
const void* mean_ptr, const cnnlTensorDescriptor_t variance_desc, const float filter_scale,
const void* variance_ptr, const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t scale_desc,
const void* input, const cnnlTensorDescriptor_t filtet_desc, const void* scale_ptr,
const void* filter, const cnnlTensorDescriptor_t output_desc, const cnnlTensorDescriptor_t offset_desc,
const void* offset_ptr,
const cnnlTensorDescriptor_t mean_desc,
const void* mean_ptr,
const cnnlTensorDescriptor_t variance_desc,
const void* variance_ptr,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t filtet_desc,
const void* filter,
const cnnlTensorDescriptor_t output_desc,
void* output); void* output);
static void Tile(const ExecutionContext& ctx, static void Tile(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void UnsortedSegmentSum(const ExecutionContext& ctx, static void UnsortedSegmentSum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc, const cnnlTensorDescriptor_t data_desc,
@@ -901,12 +1147,17 @@ class MLUCnnl {
                                 const cnnlTensorDescriptor_t output_desc,
                                 void* output);

  static void Reduce(const ExecutionContext& ctx,
                     const bool need_workspace,
                     const cnnlReduceDescriptor_t reduction_desc,
                     const void* alpha,
                     const cnnlTensorDescriptor_t input_desc,
                     const void* input,
                     const size_t indices_size,
                     void* indices,
                     const void* beta,
                     const cnnlTensorDescriptor_t output_desc,
                     void* output);
static void FloorDiv(const ExecutionContext& ctx, static void FloorDiv(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
...@@ -914,34 +1165,41 @@ class MLUCnnl { ...@@ -914,34 +1165,41 @@ class MLUCnnl {
const void* input1, const void* input1,
const cnnlTensorDescriptor_t input2_desc, const cnnlTensorDescriptor_t input2_desc,
const void* input2, const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void FloorMod(const ExecutionContext& ctx, static void FloorMod(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc, const cnnlTensorDescriptor_t input1_desc,
const void* input1, const void* input1,
const cnnlTensorDescriptor_t input2_desc, const cnnlTensorDescriptor_t input2_desc,
const void* input2, const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void Maximum(const ExecutionContext& ctx, static void Maximum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc, const cnnlTensorDescriptor_t input1_desc,
const void* input1, const void* input1,
const cnnlTensorDescriptor_t input2_desc, const cnnlTensorDescriptor_t input2_desc,
const void* input2, const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void Minimum(const ExecutionContext& ctx, static void Minimum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc, const cnnlTensorDescriptor_t input1_desc,
const void* input1, const void* input1,
const cnnlTensorDescriptor_t input2_desc, const cnnlTensorDescriptor_t input2_desc,
const void* input2, const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void PowR(const ExecutionContext& ctx, static void PowR(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input1_desc,
const cnnlTensorDescriptor_t input2_desc, const void* input2, const void* input1,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void DivNoNan(const ExecutionContext& ctx, static void DivNoNan(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
...@@ -949,7 +1207,8 @@ class MLUCnnl { ...@@ -949,7 +1207,8 @@ class MLUCnnl {
const void* input1, const void* input1,
const cnnlTensorDescriptor_t input2_desc, const cnnlTensorDescriptor_t input2_desc,
const void* input2, const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void SquaredDifference(const ExecutionContext& ctx, static void SquaredDifference(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc, const cnnlTensorDescriptor_t input1_desc,
...@@ -960,52 +1219,73 @@ class MLUCnnl { ...@@ -960,52 +1219,73 @@ class MLUCnnl {
void* output); void* output);
static void L2Loss(const ExecutionContext& ctx, static void L2Loss(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const void* input,
void* output); void* output);
static void Abs(const ExecutionContext& ctx, static void Abs(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Neg(const ExecutionContext& ctx, static void Neg(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Floor(const ExecutionContext& ctx, static void Floor(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Ceil(const ExecutionContext& ctx, static void Ceil(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void IsNan(const ExecutionContext& ctx, static void IsNan(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Square(const ExecutionContext& ctx, static void Square(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Sqrt(const ExecutionContext& ctx, static void Sqrt(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Rsqrt(const ExecutionContext& ctx, static void Rsqrt(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Cos(const ExecutionContext& ctx, static void Cos(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Sin(const ExecutionContext& ctx, static void Sin(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void TrigonForward(const ExecutionContext& ctx, static void TrigonForward(const ExecutionContext& ctx,
const cnnlTrigonDescriptor_t trigon_desc, const cnnlTrigonDescriptor_t trigon_desc,
...@@ -1016,31 +1296,41 @@ class MLUCnnl { ...@@ -1016,31 +1296,41 @@ class MLUCnnl {
static void Exp(const ExecutionContext& ctx, static void Exp(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Sign(const ExecutionContext& ctx, static void Sign(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void IsFinite(const ExecutionContext& ctx, static void IsFinite(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t input_desc,
const void* input, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output); const cnnlTensorDescriptor_t output_desc,
void* output);
static void IsNanInf(const ExecutionContext& ctx, static void IsNanInf(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t input_desc,
const void* input, void* output); const void* input,
void* output);
static void Erf(const ExecutionContext& ctx, static void Erf(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Log1p(const ExecutionContext& ctx, static void Log1p(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t input_desc,
const cnnlTensorDescriptor_t output_desc, void* output); const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void LogicalNot(const ExecutionContext& ctx, static void LogicalNot(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const cnnlTensorDescriptor_t input_desc,
...@@ -1048,152 +1338,250 @@ class MLUCnnl { ...@@ -1048,152 +1338,250 @@ class MLUCnnl {
const cnnlTensorDescriptor_t output_desc, const cnnlTensorDescriptor_t output_desc,
void* output); void* output);
static void DynamicStitch( static void DynamicStitch(const ExecutionContext& ctx,
const ExecutionContext& ctx, const cnnlTensorDescriptor_t* indices_desc, const cnnlTensorDescriptor_t* indices_desc,
      const int** indices,
      const cnnlTensorDescriptor_t* data_desc,
      const void** data,
      const int size,
      int* indices_dims,
      const cnnlTensorDescriptor_t output_desc,
      void* output);

  static void CropAndResize(const ExecutionContext& ctx,
                            const std::string method_name,
                            const float extrapolation_value,
                            const cnnlTensorDescriptor_t image_desc,
                            const void* image,
                            const cnnlTensorDescriptor_t boxes_desc,
                            const void* boxes,
                            const cnnlTensorDescriptor_t box_index_desc,
                            const void* box_index,
                            const cnnlTensorDescriptor_t output_desc,
                            void* output);

  static void CropAndResizeBackwardImage(
      const ExecutionContext& ctx,
      const std::string method_name,
      const cnnlTensorDescriptor_t image_desc,
      const void* image,
      const cnnlTensorDescriptor_t boxes_desc,
      const void* boxes,
      const cnnlTensorDescriptor_t box_idx_desc,
      const void* box_idx,
      const cnnlTensorDescriptor_t grads_image_desc,
      void* grads_image);

  static void CropAndResizeBackwardBoxes(
      const ExecutionContext& ctx,
      const cnnlTensorDescriptor_t input_desc,
      const void* input,
      const cnnlTensorDescriptor_t image_desc,
      const void* image,
      const cnnlTensorDescriptor_t boxes_desc,
      const void* boxes,
      const cnnlTensorDescriptor_t box_idx_desc,
      const void* box_idx,
      const cnnlTensorDescriptor_t output_desc,
      void* output);

  static void PoolingBackward(const ExecutionContext& ctx,
                              const cnnlPoolingDescriptor_t pooling_desc,
                              const void* alpha,
                              const cnnlTensorDescriptor_t y_desc,
                              const void* y,
                              const cnnlTensorDescriptor_t diff_y_desc,
                              const void* diff_y,
                              const cnnlTensorDescriptor_t x_desc,
                              const void* x,
                              const void* beta,
                              const cnnlTensorDescriptor_t diff_x_desc,
                              void* diff_x);

  static void AdaptivePoolingBackward(const ExecutionContext& ctx,
                                      const cnnlPoolingMode_t pool_mode,
                                      const cnnlTensorDescriptor_t y_desc,
                                      const void* y,
                                      const cnnlTensorDescriptor_t index_desc,
                                      const void* index,
                                      const cnnlTensorDescriptor_t diff_x_desc,
                                      void* diff_x);

  static void PoolingIndex(const ExecutionContext& ctx,
                           const cnnlPoolingDescriptor_t pooling_desc,
                           const cnnlTensorDescriptor_t x_desc,
                           const void* x,
                           const cnnlTensorDescriptor_t y_desc,
                           void* y);

  static void SpaceToBatch(const ExecutionContext& ctx,
                           const cnnlTensorDescriptor_t input_desc,
                           const void* input,
                           const cnnlTensorDescriptor_t output_desc,
                           void* output,
                           const int64_t block_shape[]);

  static void SpaceToBatchNd(const ExecutionContext& ctx,
                             const cnnlTensorDescriptor_t input_desc,
                             const void* input,
                             cnnlSpaceBatchNdDescriptor_t param,
                             void* extra_device_input,
                             size_t extra_input_size,
                             const cnnlTensorDescriptor_t output_desc,
                             void* output);

  static void Interp(const ExecutionContext& ctx,
                     const cnnlInterpMode_t mode,
                     const bool align_corners,
                     const bool half_pixel_centers,
                     const cnnlTensorDescriptor_t input_desc,
                     const void* input,
                     const cnnlTensorDescriptor_t output_desc,
                     void* output);

  static void InterpBackward(const ExecutionContext& ctx,
                             const cnnlInterpBackwardMode_t mode,
                             const bool align_corners,
                             const bool half_pixel_centers,
                             const cnnlTensorDescriptor_t input_desc,
                             const void* input,
                             const cnnlTensorDescriptor_t output_desc,
                             void* output);

  static void QuantizeParam(const ExecutionContext& ctx,
                            const cnnlQuantizeMode_t mode,
                            const int bitwidth,
                            const cnnlTensorDescriptor_t input_desc,
                            const void* input,
                            void* position,
                            void* scale,
                            void* offset);

  static void QuantizeMatMul(const ExecutionContext& ctx,
                             const bool transpose_a,
                             const bool transpose_b,
                             const cnnlTensorDescriptor_t a_desc,
                             const void* a,
                             const void* a_position,
                             const void* a_scale,
                             const void* a_offset,
                             const cnnlTensorDescriptor_t b_desc,
                             const void* b,
                             const void* b_position,
                             const void* b_scale,
                             const void* b_offset,
                             const cnnlDataType_t quant_type,
                             const cnnlDataType_t data_type,
                             const cnnlTensorDescriptor_t output_desc,
                             void* output);

  static void QuantizeBatchMatMul(const ExecutionContext& ctx,
                                  const bool adj_x,
                                  const bool adj_y,
                                  const cnnlTensorDescriptor_t a_desc,
                                  const void* a,
                                  const void* a_position,
                                  const void* a_scale,
                                  const void* a_offset,
                                  const cnnlTensorDescriptor_t b_desc,
                                  const void* b,
                                  const void* b_position,
                                  const void* b_scale,
                                  const void* b_offset,
                                  const cnnlDataType_t quant_type,
                                  const cnnlDataType_t data_type,
                                  const cnnlTensorDescriptor_t output_desc,
                                  void* output);

  static void QuantizeBatchMatMulBCast(const ExecutionContext& ctx,
                                       const bool adj_x,
                                       const bool adj_y,
                                       const cnnlTensorDescriptor_t a_desc,
                                       const void* a,
                                       const void* a_position,
                                       const void* a_scale,
                                       const void* a_offset,
                                       const cnnlTensorDescriptor_t b_desc,
                                       const void* b,
                                       const void* b_position,
                                       const void* b_scale,
                                       const void* b_offset,
                                       const cnnlDataType_t quant_type,
                                       const cnnlDataType_t data_type,
                                       const cnnlTensorDescriptor_t output_desc,
                                       void* output);

  static void FusedBatchNorm(const ExecutionContext& ctx,
                             const bool is_training,
                             const cnnlTensorDescriptor_t x_desc,
                             const void* x,
                             const cnnlTensorDescriptor_t scale_desc,
                             const void* scale,
                             const void* offset,
                             const void* estimated_mean,
                             const void* estimated_variance,
                             float epsilon,
                             float momentum,
                             const cnnlTensorDescriptor_t output_desc,
                             void* output,
                             void* batch_mean,
                             void* batch_var,
                             void* saved_mean,
                             void* saved_var);

  static void FusedBatchNormGrad(const ExecutionContext& ctx,
                                 const bool is_training,
                                 const cnnlTensorDescriptor_t y_backprop_desc,
                                 const void* y_backprop,
                                 const cnnlTensorDescriptor_t x_desc,
                                 const void* x,
                                 const cnnlTensorDescriptor_t scale_desc,
                                 const void* scale,
                                 const void* saved_mean,
                                 const void* saved_var,
                                 float epsilon,
                                 const cnnlTensorDescriptor_t x_backprop_desc,
                                 void* x_backprop,
                                 void* scale_backprop,
                                 void* offset_backprop);

  static void LayerNormForward(const ExecutionContext& ctx,
                               int axis,
                               const cnnlTensorDescriptor_t x_desc,
                               const void* x,
                               const cnnlTensorDescriptor_t weight_bias_desc,
                               const void* weight,
                               const void* bias,
                               float eps,
                               const cnnlTensorDescriptor_t y_desc,
                               void* y,
                               const cnnlTensorDescriptor_t mean_rstd_desc,
                               void* saved_mean,
                               void* saved_rstd);

  static void LayerNormBackward(const ExecutionContext& ctx,
                                int axis,
                                const cnnlTensorDescriptor_t x_desc,
                                const void* x,
                                const cnnlTensorDescriptor_t diff_z_desc,
                                const void* diff_z,
                                const cnnlTensorDescriptor_t weight_bias_desc,
                                const void* weight,
                                const cnnlTensorDescriptor_t mean_rstd_desc,
                                const void* saved_mean,
                                const void* saved_rstd,
                                const cnnlTensorDescriptor_t diff_x_desc,
                                void* diff_x,
                                void* diff_weight,
                                void* diff_bias);

  static void Transpose(const ExecutionContext& ctx,
                        const std::vector<int> perm,
                        const int input_dim,
                        const cnnlTensorDescriptor_t input_desc,
                        const void* input,
                        const cnnlTensorDescriptor_t output_desc,
                        void* output);

  static void TrilTriu(const ExecutionContext& ctx, const int diagonal_k,
                       const bool tri_up_mode,
@@ -1203,109 +1591,170 @@ class MLUCnnl {
  static void MatrixBandPart(const ExecutionContext& ctx,
                             const cnnlTensorDescriptor_t data_desc,
                             const void* input,
                             const int num_lower,
                             const int num_upper,
                             void* output);

  static void NumTrue(const ExecutionContext& ctx,
                      const cnnlTensorDescriptor_t x_desc,
                      const void* x,
                      Tensor index,
                      uint32_t* num_true);

  static void Where(const ExecutionContext& ctx,
                    const cnnlTensorDescriptor_t x_desc,
                    const void* x,
                    const uint32_t* strides,
                    const uint32_t* index,
                    const cnnlTensorDescriptor_t y_desc,
                    int* y,
                    const bool as_tuple);

  static void Conv2D(const ExecutionContext& ctx,
                     const cnnlConvolutionDescriptor_t conv_desc,
                     const cnnlDataType_t tensor_dtype,
                     const cnnlDataType_t dt_onchip,
                     const void* input_position,
                     const void* input_scale,
                     const void* input_offset,
                     const void* filter_position,
                     const void* filter_scale,
                     const void* filter_offset,
                     const cnnlTensorDescriptor_t input_desc,
                     const void* input,
                     const cnnlTensorDescriptor_t filter_desc,
                     const void* filter,
                     const cnnlTensorDescriptor_t bias_desc,
                     const void* bias,
                     const cnnlTensorDescriptor_t output_desc,
                     void* output);

  static void ConvBackpropInput(const ExecutionContext& ctx,
                                const cnnlConvolutionDescriptor_t conv_desc,
                                const cnnlTensorDescriptor_t filter_desc,
                                const void* filter,
                                const cnnlTensorDescriptor_t out_backprop_desc,
                                const void* out_backprop,
                                const cnnlTensorDescriptor_t in_backprop_desc,
                                void* in_backprop);

  static void QuantizeConvBackpropInput(
      const ExecutionContext& ctx,
      const cnnlConvolutionDescriptor_t conv_desc,
      const cnnlDataType_t tensor_dtype,
      const cnnlDataType_t dt_onchip,
      const void* filter_position,
      const void* filter_scale,
      const void* filter_offset,
      const void* out_backprop_position,
      const void* out_backprop_scale,
      const void* out_backprop_offset,
      const cnnlTensorDescriptor_t input_desc,
      const void* filter,
      const cnnlTensorDescriptor_t out_backprop_desc,
      const void* out_backprop,
      const cnnlTensorDescriptor_t in_backprop_desc,
      void* in_backprop);

  static void ConvBackpropFilter(
      const ExecutionContext& ctx,
      const cnnlConvolutionDescriptor_t conv_desc,
      const cnnlTensorDescriptor_t input_desc,
      const void* input,
      const cnnlTensorDescriptor_t out_backprop_desc,
      const void* out_backprop,
      const cnnlTensorDescriptor_t filter_backprop_desc,
      void* filter_backprop);

  static void QuantizeConvBackpropFilter(
      const ExecutionContext& ctx,
      const cnnlConvolutionDescriptor_t conv_desc,
      const cnnlDataType_t tensor_dtype,
      const cnnlDataType_t dt_onchip,
      const void* input_position,
      const void* input_scale,
      const void* input_offset,
      const void* out_backprop_position,
      const void* out_backprop_scale,
      const void* out_backprop_offset,
      const cnnlTensorDescriptor_t input_desc,
      const void* input,
      const cnnlTensorDescriptor_t out_backprop_desc,
      const void* out_backprop,
      const cnnlTensorDescriptor_t filter_backprop_desc,
      void* filter_backprop);

  static void DCNForward(const ExecutionContext& ctx,
                         const cnnlDCNDescriptor_t dcn_desc,
                         const cnnlTensorDescriptor_t input_desc,
                         const void* input,
                         const cnnlTensorDescriptor_t offset_desc,
                         const void* offset,
                         const cnnlTensorDescriptor_t mask_desc,
                         const void* mask,
                         const cnnlTensorDescriptor_t weight_desc,
                         const void* weight,
                         const cnnlTensorDescriptor_t bias_desc,
                         const void* bias,
                         const cnnlTensorDescriptor_t output_desc,
                         void* output);

  static void DCNBackwardData(const ExecutionContext& ctx,
                              const cnnlDCNDescriptor_t dcn_desc,
                              const cnnlTensorDescriptor_t input_desc,
                              const void* input,
                              const cnnlTensorDescriptor_t offset_desc,
                              const void* offset,
                              const cnnlTensorDescriptor_t mask_desc,
                              const void* mask,
                              const cnnlTensorDescriptor_t weight_desc,
                              const void* weight,
                              const cnnlTensorDescriptor_t grad_output_desc,
                              const void* grad_output,
                              const cnnlTensorDescriptor_t grad_input_desc,
                              void* grad_input,
                              const cnnlTensorDescriptor_t grad_offset_desc,
                              void* grad_offset,
                              const cnnlTensorDescriptor_t grad_mask_desc,
                              void* grad_mask);

  static void DCNBackwardWeight(const ExecutionContext& ctx,
                                const cnnlDCNDescriptor_t dcn_desc,
                                const cnnlTensorDescriptor_t input_desc,
                                const void* input,
                                const cnnlTensorDescriptor_t offset_desc,
                                const void* offset,
                                const cnnlTensorDescriptor_t mask_desc,
                                const void* mask,
                                const cnnlTensorDescriptor_t grad_output_desc,
                                const void* grad_output,
                                const cnnlTensorDescriptor_t grad_weight_desc,
                                void* grad_weight,
                                const cnnlTensorDescriptor_t grad_bias_desc,
                                void* grad_bias);

  static void InTopK(const ExecutionContext& ctx,
                     const cnnlTensorDescriptor_t predictions_desc,
                     const void* predictions,
                     const cnnlTensorDescriptor_t targets_desc,
                     const void* targets,
                     const cnnlTensorDescriptor_t k_desc,
                     const void* k,
                     const int k_int,
                     const cnnlTensorDescriptor_t output_desc,
                     void* output);

  static void ScatterNd(const ExecutionContext& ctx,
                        cnnlScatterNdMode_t mode,
                        const cnnlTensorDescriptor_t indices_desc,
                        const void* indices,
                        const cnnlTensorDescriptor_t updates_desc,
                        const void* updates,
                        const cnnlTensorDescriptor_t input_desc,
                        const void* input,
                        const cnnlTensorDescriptor_t output_desc,
                        void* output);

  static void BitWise(const ExecutionContext& ctx,
                      const cnnlBitComputeOp_t optype,
@@ -1313,12 +1762,17 @@ class MLUCnnl {
                      const void* input1,
                      const cnnlTensorDescriptor_t input2_desc,
                      const void* input2,
                      const cnnlTensorDescriptor_t output_desc,
                      void* output);

  static void QR(const ExecutionContext& ctx,
                 const cnnlTensorDescriptor_t a_desc,
                 const void* a,
                 const cnnlTensorDescriptor_t q_desc,
                 void* q,
                 const cnnlTensorDescriptor_t r_desc,
                 void* r,
                 const bool some);

  static void Reciprocal(const ExecutionContext& ctx,
                         const cnnlTensorDescriptor_t input_desc,
@@ -1326,55 +1780,85 @@ class MLUCnnl {
                         const cnnlTensorDescriptor_t output_desc,
                         void* output);

  static void BceLoss(const ExecutionContext& ctx,
                      const cnnlBceLossReduction_t reduction,
                      const cnnlTensorDescriptor_t input_desc,
                      const void* input,
                      const cnnlTensorDescriptor_t target_desc,
                      const void* target,
                      const cnnlTensorDescriptor_t weight_desc,
                      const void* weight,
                      const cnnlTensorDescriptor_t output_desc,
                      void* output);

  static void BceLossBackward(const ExecutionContext& ctx,
                              const cnnlBceLossReduction_t reduction,
                              const cnnlTensorDescriptor_t grad_desc,
                              const void* grad,
                              const cnnlTensorDescriptor_t input_desc,
                              const void* input,
                              const cnnlTensorDescriptor_t target_desc,
                              const void* target,
                              const cnnlTensorDescriptor_t weight_desc,
                              const void* weight,
                              const cnnlTensorDescriptor_t output_desc,
                              void* output);

  static void EmbeddingForward(const ExecutionContext& ctx,
                               const int padding_idx,
                               const cnnlTensorDescriptor_t weight_desc,
                               const void* weight,
                               const cnnlTensorDescriptor_t indices_desc,
                               const int* indices,
                               const cnnlTensorDescriptor_t output_desc,
                               void* output);

  static void Transform(const ExecutionContext& ctx,
                        const void* alpha,
                        const void* beta,
                        const cnnlTensorDescriptor_t input_desc,
                        const void* input,
                        const cnnlTensorDescriptor_t output_desc,
                        void* output);

  static void EmbeddingBackward(const ExecutionContext& ctx,
                                int padding_idx,
                                bool scale_grad_by_freq,
                                const cnnlTensorDescriptor_t indices_desc,
                                const void* indices,
                                const cnnlTensorDescriptor_t diff_desc,
                                const void* diff,
                                const cnnlTensorDescriptor_t output_desc,
                                void* output);

  static void BceWithLogits(const ExecutionContext& ctx,
                            cnnlBceWithLogitsReduction_t reduction,
                            const cnnlTensorDescriptor_t input_desc,
                            const void* input,
                            const cnnlTensorDescriptor_t target_desc,
                            const void* target,
                            const cnnlTensorDescriptor_t weight_desc,
                            const void* weight,
                            const cnnlTensorDescriptor_t pos_weight_desc,
                            const void* pos_weight,
                            const cnnlTensorDescriptor_t output_desc,
                            void* output);

  static void BceWithLogitsBackward(
      const ExecutionContext& ctx,
      cnnlBceWithLogitsReduction_t reduction,
      const cnnlTensorDescriptor_t grad_desc,
      const void* grad,
      const cnnlTensorDescriptor_t input_desc,
      const void* input,
      const cnnlTensorDescriptor_t target_desc,
      const void* target,
      const cnnlTensorDescriptor_t weight_desc,
      const void* weight,
      const cnnlTensorDescriptor_t pos_weight_desc,
      const void* pos_weight,
      const cnnlTensorDescriptor_t diff_input_desc,
      void* diff_input);
};
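All of the wrappers declared above follow one calling convention: build an MLUCnnlTensorDesc per operand, then pass the raw descriptor from get() together with the device pointer from GetBasePtr to the static method. A minimal sketch of that pattern using Transpose; the tensors x and y, the permutation, and the float element type are assumptions for illustration, not part of this commit:

  // Inside an MLU kernel body that already has an ExecutionContext `ctx` and
  // two MLU-resident Tensors `x` (input) and `y` (pre-allocated output).
  std::vector<int> perm = {0, 2, 1};  // assumed 3-D permutation
  MLUCnnlTensorDesc x_desc(x, CNNL_LAYOUT_ARRAY, ToCnnlDataType<float>());
  MLUCnnlTensorDesc y_desc(y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<float>());
  MLUCnnl::Transpose(ctx,
                     perm,
                     static_cast<int>(perm.size()),
                     x_desc.get(),
                     GetBasePtr(&x),
                     y_desc.get(),
                     GetBasePtr(&y));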
template <typename T>
@@ -1393,22 +1877,27 @@ inline void TransposeFromMLUTensor(const ExecutionContext& ctx,
    transformed_output->mutable_data<T>(
        framework::DDim(output_shape.data(), dim_size), ctx.GetPlace());
  }
  MLUCnnlTensorDesc trans_in_desc(
      *transformed_input, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnlTensorDesc trans_out_desc(
      *transformed_output, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
  MLUCnnl::Transpose(ctx,
                     perm,
                     dim_size,
                     trans_in_desc.get(),
                     GetBasePtr(transformed_input),
                     trans_out_desc.get(),
                     GetBasePtr(transformed_output));
}

template <typename T>
inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx,
                                       T value,
                                       Tensor* out) {
  MLUCnnlTensorDesc out_desc(*out);
  MLUCnnl::Fill(
      ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), GetBasePtr(out));
}

} // namespace operators
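The two inline helpers above hide the descriptor boilerplate for the most common cases. A usage sketch for the fill helper, assuming a kernel that needs a tensor of ones shaped like an existing input x; the names and the constant are illustrative, not taken from this commit:

  // `x` is an assumed Tensor* already resident on the MLU place.
  Tensor ones(x->type());
  ones.mutable_data<float>(x->dims(), ctx.GetPlace());
  FillMLUTensorWithHostValue<float>(ctx, static_cast<float>(1.0), &ones);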
...
@@ -33,26 +33,43 @@ class ScatterMLUKernel : public framework::OpKernel<T> {
     cnnlScatterRefMode_t mode;
     if (overwrite) {
       mode = CNNL_SCATTERREF_UPDATE;
-      MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x),
-                              updates_desc.get(), GetBasePtr(updates),
-                              indices_desc.get(), GetBasePtr(indices), mode);
+      MLUCnnl::ScatterRefFunctor(ctx,
+                                 x_desc.get(),
+                                 GetBasePtr(x),
+                                 updates_desc.get(),
+                                 GetBasePtr(updates),
+                                 indices_desc.get(),
+                                 GetBasePtr(indices),
+                                 mode);
     } else {
       Tensor tensor_zeros(updates->type());
       tensor_zeros.mutable_data<T>(updates->dims(), ctx.GetPlace());
       MLUCnnlTensorDesc tensor_zeros_desc(tensor_zeros);
       float value = 0.0;
       auto value_t = static_cast<T>(value);
-      MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t,
-                    tensor_zeros_desc.get(), GetBasePtr(&tensor_zeros));
+      MLUCnnl::Fill(ctx,
+                    CNNL_POINTER_MODE_HOST,
+                    &value_t,
+                    tensor_zeros_desc.get(),
+                    GetBasePtr(&tensor_zeros));
       mode = CNNL_SCATTERREF_UPDATE;
-      MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x),
-                              tensor_zeros_desc.get(),
-                              GetBasePtr(&tensor_zeros), indices_desc.get(),
-                              GetBasePtr(indices), mode);
+      MLUCnnl::ScatterRefFunctor(ctx,
+                                 x_desc.get(),
+                                 GetBasePtr(x),
+                                 tensor_zeros_desc.get(),
+                                 GetBasePtr(&tensor_zeros),
+                                 indices_desc.get(),
+                                 GetBasePtr(indices),
+                                 mode);
       mode = CNNL_SCATTERREF_ADD;
-      MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x),
-                              updates_desc.get(), GetBasePtr(updates),
-                              indices_desc.get(), GetBasePtr(indices), mode);
+      MLUCnnl::ScatterRefFunctor(ctx,
+                                 x_desc.get(),
+                                 GetBasePtr(x),
+                                 updates_desc.get(),
+                                 GetBasePtr(updates),
+                                 indices_desc.get(),
+                                 GetBasePtr(indices),
+                                 mode);
     }
     paddle::framework::TensorCopy(*x, place, out);
   }
@@ -62,5 +79,6 @@ class ScatterMLUKernel : public framework::OpKernel<T> {
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(scatter,
                       ops::ScatterMLUKernel<float>,
                       ops::ScatterMLUKernel<paddle::platform::float16>);
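The fixed kernel above maps Paddle's scatter onto CNNL's ScatterRef primitive: with overwrite it issues a single CNNL_SCATTERREF_UPDATE; otherwise it first overwrites the addressed rows with zeros and then accumulates with CNNL_SCATTERREF_ADD, so duplicate indices sum their updates. A host-side reference of the intended result, as a sketch for clarity only; this is plain C++, not MLU code, and the row-major [n, d] layout plus all names are assumptions:

#include <algorithm>
#include <cstdint>
#include <vector>

// Reference semantics for scatter on row-major [n, d] data.
// overwrite == true : out row index[i] is replaced by updates row i.
// overwrite == false: every addressed row is zeroed first, then updates are
//                     accumulated, so duplicate indices sum their rows.
void ScatterReference(const std::vector<int64_t>& index,
                      const std::vector<float>& updates,  // index.size() x d
                      bool overwrite,
                      int64_t d,
                      std::vector<float>* out) {  // n x d, pre-filled with x
  if (!overwrite) {
    for (int64_t i = 0; i < static_cast<int64_t>(index.size()); ++i) {
      std::fill_n(out->begin() + index[i] * d, d, 0.0f);
    }
  }
  for (int64_t i = 0; i < static_cast<int64_t>(index.size()); ++i) {
    for (int64_t j = 0; j < d; ++j) {
      float u = updates[i * d + j];
      float& o = (*out)[index[i] * d + j];
      o = overwrite ? u : o + u;
    }
  }
}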