未验证 提交 0d17c047 编写于 作者: Z zhaoying9105 提交者: GitHub

[MLU](bugfix): fix MLUCnnl::ScatterFunctor function declare bug (#43778)

上级 03972d5a
......@@ -35,9 +35,12 @@ using DeviceContextPool = platform::DeviceContextPool;
using MLUDeviceContext = platform::MLUDeviceContext;
// Lookup table from reduce-operator name strings to cnnlReduceOp_t enumerators.
// NOTE(review): this text is a rendered diff without +/- markers, so the first
// six pairs appear twice: packed two per line (presumably the pre-change
// layout), then one per line (presumably the post-change layout). The pairs
// themselves are identical in both renderings.
const std::map<std::string, cnnlReduceOp_t> MLUReduceOpMap = {
{"reduce_all", CNNL_REDUCE_AND}, {"reduce_any", CNNL_REDUCE_OR},
{"reduce_max", CNNL_REDUCE_MAX}, {"reduce_mean", CNNL_REDUCE_AVG},
{"reduce_min", CNNL_REDUCE_MIN}, {"reduce_sum", CNNL_REDUCE_ADD},
// One-pair-per-line rendering of the same six entries starts here.
{"reduce_all", CNNL_REDUCE_AND},
{"reduce_any", CNNL_REDUCE_OR},
{"reduce_max", CNNL_REDUCE_MAX},
{"reduce_mean", CNNL_REDUCE_AVG},
{"reduce_min", CNNL_REDUCE_MIN},
{"reduce_sum", CNNL_REDUCE_ADD},
// "reduce_prod" is shown once only (an unchanged context line in this view).
{"reduce_prod", CNNL_REDUCE_MUL},
};
......@@ -225,36 +228,49 @@ class MLUCnnlTensorDesc {
MLUCnnlTensorDesc& operator=(MLUCnnlTensorDesc&& rhs);
MLUCnnlTensorDesc(const int tensor_dim, const int dim_sizes[],
MLUCnnlTensorDesc(const int tensor_dim,
const int dim_sizes[],
const cnnlDataType_t tensor_dtype);
MLUCnnlTensorDesc(const int tensor_dim, const int dim_sizes[],
MLUCnnlTensorDesc(const int tensor_dim,
const int dim_sizes[],
const cnnlDataType_t tensor_dtype,
const cnnlTensorLayout_t layout);
MLUCnnlTensorDesc(const int tensor_dim, const int dim_sizes[],
const cnnlDataType_t tensor_dtype, int position);
MLUCnnlTensorDesc(const int tensor_dim,
const int dim_sizes[],
const cnnlDataType_t tensor_dtype,
int position);
MLUCnnlTensorDesc(const int tensor_dim, const int64_t dim_sizes[],
MLUCnnlTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const cnnlDataType_t tensor_dtype);
MLUCnnlTensorDesc(const int tensor_dim, const int64_t dim_sizes[],
MLUCnnlTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const cnnlDataType_t tensor_dtype,
const cnnlTensorLayout_t layout);
MLUCnnlTensorDesc(const int tensor_dim, const int64_t dim_sizes[],
const cnnlDataType_t tensor_dtype, int position);
MLUCnnlTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const cnnlDataType_t tensor_dtype,
int position);
MLUCnnlTensorDesc(const Tensor& tensor, const cnnlTensorLayout_t layout,
MLUCnnlTensorDesc(const Tensor& tensor,
const cnnlTensorLayout_t layout,
const cnnlDataType_t tensor_dtype);
explicit MLUCnnlTensorDesc(const Tensor& tensor);
MLUCnnlTensorDesc(const Tensor& tensor, cnnlTensorLayout_t layout,
const cnnlDataType_t tensor_dtype, int position);
MLUCnnlTensorDesc(const Tensor& tensor,
cnnlTensorLayout_t layout,
const cnnlDataType_t tensor_dtype,
int position);
MLUCnnlTensorDesc(const Tensor& tensor, cnnlTensorLayout_t layout,
const cnnlDataType_t tensor_dtype, int position,
MLUCnnlTensorDesc(const Tensor& tensor,
cnnlTensorLayout_t layout,
const cnnlDataType_t tensor_dtype,
int position,
float scale);
~MLUCnnlTensorDesc();
......@@ -270,8 +286,10 @@ class MLUCnnlActivationDesc {
MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete;
MLUCnnlActivationDesc& operator=(const MLUCnnlActivationDesc& desc) = delete;
MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof);
// Five-argument constructor declaration: an activation mode followed by four
// float settings (ceof, sliced_dim, selu_alpha, selu_lambda).
// NOTE(review): `ceof` is presumably a misspelling of `coef`; it is left
// untouched because this is a documentation-only edit.
// NOTE(review): diff view - the next two lines are the packed rendering of the
// signature (presumably pre-change); the one-parameter-per-line rendering
// follows and both share the closing `selu_lambda` line.
MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float ceof,
const float sliced_dim, const float selu_alpha,
MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode,
const float ceof,
const float sliced_dim,
const float selu_alpha,
const float selu_lambda);
const cnnlActivationDescriptor_t get() const;
......@@ -288,14 +306,22 @@ class MLUCnnlPoolingDesc {
// Constructor declaration that takes every pooling setting as a separate
// scalar: pooling mode, NaN-propagation option, window rows/cols, four padding
// values, row/col strides, row/col dilations and a ceil_mode flag.
// Note the mixed widths: the four pad_* values are int64_t, while window,
// stride and dilation values are plain int.
MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode,
const cnnlNanPropagation_t maxpooling_nan_opt,
// NOTE(review): diff view - the next four lines are the packed rendering of the
// remaining parameters (presumably pre-change).
int window_rows, int window_cols, int64_t pad_up,
int64_t pad_down, int64_t pad_left, int64_t pad_right,
int row_stride, int col_stride, int row_dilation,
int col_dilation, bool ceil_mode);
// Same parameters again, one per line (presumably the post-change rendering).
int window_rows,
int window_cols,
int64_t pad_up,
int64_t pad_down,
int64_t pad_left,
int64_t pad_right,
int row_stride,
int col_stride,
int row_dilation,
int col_dilation,
bool ceil_mode);
// Constructor declaration that takes the pooling settings as vectors: a tensor
// rank plus window, padding and stride passed as const std::vector<int>&.
// NOTE(review): presumably the N-dimensional counterpart of the scalar
// overload; the vector lengths expected for a given tensor_rank are not
// visible here - confirm against the definition.
MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode,
const cnnlNanPropagation_t maxpooling_nan_opt,
// NOTE(review): diff view - the next line is the packed rendering of
// tensor_rank/window; the two lines after it repeat them one per line.
const int tensor_rank, const std::vector<int>& window,
const int tensor_rank,
const std::vector<int>& window,
const std::vector<int>& padding,
const std::vector<int>& stride);
......@@ -364,8 +390,10 @@ class MLUCnnlNMSDesc {
MLUCnnlNMSDesc(const MLUCnnlNMSDesc& desc) = delete;
MLUCnnlNMSDesc& operator=(const MLUCnnlNMSDesc& desc) = delete;
// Constructor declaration: output mode, IoU threshold, maximum output size,
// confidence threshold and an integer input layout.
// NOTE(review): the meaning of the integer `input_layout` values is not
// visible here - confirm against the definition.
// NOTE(review): diff view - the first two lines are the packed rendering of
// the signature (presumably pre-change); the one-parameter-per-line rendering
// follows and both share the closing `input_layout` line.
MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode, const float iou_threshold,
const int max_output_size, const float confidence_threshold,
MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode,
const float iou_threshold,
const int max_output_size,
const float confidence_threshold,
const int input_layout);
const cnnlNmsDescriptor_t get() const;
......@@ -378,12 +406,17 @@ class MLUCnnlNMSDesc {
class MLUCnnlConvolutionDesc {
public:
// Constructor declaration for the `int` array overload: dims, then pad, stride
// and dilation arrays, a group count and the tensor data type.
// NOTE(review): the array lengths are not part of the signature; how they
// relate to `dims` is not visible here - confirm against the definition.
// NOTE(review): diff view - the first two lines are the packed rendering
// (presumably pre-change); the one-parameter-per-line rendering follows and
// both share the closing `tensor_dtype` line.
MLUCnnlConvolutionDesc(const int dims, const int pad[], const int stride[],
const int dilation[], const int group_count,
MLUCnnlConvolutionDesc(const int dims,
const int pad[],
const int stride[],
const int dilation[],
const int group_count,
const cnnlDataType_t tensor_dtype);
// Constructor declaration for the `int64_t` array overload: same parameter
// order as the `int` overload, but pad, stride and dilation are int64_t arrays.
// `dims` and `group_count` remain plain int.
// NOTE(review): diff view - the first two lines are the packed rendering
// (presumably pre-change); the one-parameter-per-line rendering follows and
// both share the trailing `group_count` / `tensor_dtype` lines.
MLUCnnlConvolutionDesc(const int dims, const int64_t pad[],
const int64_t stride[], const int64_t dilation[],
MLUCnnlConvolutionDesc(const int dims,
const int64_t pad[],
const int64_t stride[],
const int64_t dilation[],
const int group_count,
const cnnlDataType_t tensor_dtype);
......@@ -402,7 +435,8 @@ class MLUCnnlConvolutionDesc {
class MLUCnnlBatchSpaceDesc {
public:
// Constructor declaration: block_shape and paddings arrays followed by their
// element counts. Both arrays are non-const uint32_t, unlike the sizes.
// NOTE(review): diff view - the first line is the packed rendering
// (presumably pre-change); the next two lines repeat the same two parameters
// one per line.
MLUCnnlBatchSpaceDesc(uint32_t block_shape[], uint32_t paddings[],
MLUCnnlBatchSpaceDesc(uint32_t block_shape[],
uint32_t paddings[],
const uint32_t block_shape_size,
const uint32_t paddings_size);
......@@ -446,8 +480,12 @@ class MLUCnnlTrigonDesc {
class MLUCnnlDCNDesc {
public:
// Constructor declaration: dimension count, pad/stride/dilation passed as
// const int pointers, then deformable_group, conv_group and im2col_step.
// NOTE(review): the number of elements expected behind each pointer is not
// visible here - confirm against the definition.
// NOTE(review): diff view - the first two lines are the packed rendering
// (presumably pre-change); the one-parameter-per-line rendering follows and
// both share the closing `im2col_step` line.
MLUCnnlDCNDesc(int dimNb, const int* pad, const int* stride,
const int* dilation, int deformable_group, int conv_group,
MLUCnnlDCNDesc(int dimNb,
const int* pad,
const int* stride,
const int* dilation,
int deformable_group,
int conv_group,
int im2col_step);
const cnnlDCNDescriptor_t get() const;
......@@ -461,55 +499,88 @@ class MLUCnnl {
public:
// Static declaration (no body visible here): takes an activation descriptor
// plus an input and an output tensor, each passed as descriptor + raw pointer.
// `input` is read-only (const void*); `output` is writable (void*).
static void Active(const ExecutionContext& ctx,
cnnlActivationDescriptor_t active_desc,
// NOTE(review): diff view - the next two lines are the packed rendering of the
// remaining parameters (presumably pre-change); the four lines after them
// repeat the same parameters one per line.
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void ActiveGrad(
const ExecutionContext& ctx, cnnlActivationDescriptor_t active_desc,
const void* alpha, const void* beta, const cnnlTensorDescriptor_t y_desc,
const void* y, const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y, const cnnlTensorDescriptor_t x_desc, const void* x,
const cnnlTensorDescriptor_t diff_x_desc, void* diff_x);
static void ActiveGrad(const ExecutionContext& ctx,
cnnlActivationDescriptor_t active_desc,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void Concat(const ExecutionContext& ctx, const int pack_num,
const int axis, const cnnlTensorDescriptor_t inputs_desc[],
static void Concat(const ExecutionContext& ctx,
const int pack_num,
const int axis,
const cnnlTensorDescriptor_t inputs_desc[],
const void* const inputs[],
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num,
const int axis, const cnnlTensorDescriptor_t inputs_desc[],
static void Concat(const MLUDeviceContext& dev_ctx,
const int pack_num,
const int axis,
const cnnlTensorDescriptor_t inputs_desc[],
const void* const inputs[],
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
static void Cast(const ExecutionContext& ctx,
cnnlCastDataType_t cast_type,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Static declaration (no body visible here): one described input tensor, two
// read-only bound pointers (`min`, `max`) and a writable result pointer `y`.
// Only the input has a tensor descriptor in this signature; `y` has none.
// NOTE(review): whether min/max point to host or device memory is not visible
// here - confirm against the definition.
static void Clip(const ExecutionContext& ctx,
// NOTE(review): diff view - the next two lines are the packed rendering
// (presumably pre-change); the five lines after them repeat the same
// parameters one per line.
const cnnlTensorDescriptor_t input_desc, const void* input,
const void* min, const void* max, void* y);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* min,
const void* max,
void* y);
static void HardtanhBackward(
const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc,
const void* x, const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y, const float max_val, const float min_val,
const cnnlTensorDescriptor_t diff_x_desc, void* diff_x);
static void HardtanhBackward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const float max_val,
const float min_val,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void Div(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t in0_desc, const void* in0,
const cnnlTensorDescriptor_t in1_desc, const void* in1,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t in0_desc,
const void* in0,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Static declaration (no body visible here): a pointer mode, a read-only value
// pointer, and a described writable output tensor.
// NOTE(review): `pointer_mode` presumably tells the callee how to interpret
// `value_ptr` - confirm against the definition.
static void Fill(const ExecutionContext& ctx,
// NOTE(review): diff view - the next two lines are the packed rendering
// (presumably pre-change); the four lines after them repeat the same
// parameters one per line.
const cnnlPointerMode_t pointer_mode, const void* value_ptr,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlPointerMode_t pointer_mode,
const void* value_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void LRN(const ExecutionContext& ctx, const int local_size,
const double alpha, const double beta, const double k,
static void LRN(const ExecutionContext& ctx,
const int local_size,
const double alpha,
const double beta,
const double k,
const cnnlTensorDescriptor_t input_quant_desc,
const void* input_quant,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantifyOffline(const ExecutionContext& context,
cnnlQuantizeMode_t mode,
......@@ -521,98 +592,158 @@ class MLUCnnl {
// Static declaration (no body visible here): bit width, a described read-only
// input, a compute_scale flag, writable `position` and `scale` pointers, and a
// described writable output.
// NOTE(review): `ouput_desc` is presumably a misspelling of `output_desc`; it
// is left untouched because this is a documentation-only edit.
static void QuantifyOnline(const ExecutionContext& context,
const int bitwidth,
const cnnlTensorDescriptor_t input_desc,
// NOTE(review): diff view - the next two lines are the packed rendering
// (presumably pre-change); the four lines after them repeat the same
// parameters one per line.
const void* input, const bool compute_scale,
void* position, void* scale,
const void* input,
const bool compute_scale,
void* position,
void* scale,
const cnnlTensorDescriptor_t ouput_desc,
void* output);
// Static declaration (no body visible here): a described read-only gradient, a
// read-only `lr` pointer, and a described variable tensor.
// `var` is the only non-const data pointer in the signature.
// NOTE(review): presumably `var` is updated in place - confirm against the
// definition.
static void SGD(const ExecutionContext& context,
// NOTE(review): diff view - the next two lines are the packed rendering
// (presumably pre-change); the four lines after them repeat the same
// parameters one per line.
const cnnlTensorDescriptor_t grad_desc, const void* grad,
const void* lr, const cnnlTensorDescriptor_t var_desc,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const void* lr,
const cnnlTensorDescriptor_t var_desc,
void* var);
static void ApplyAdaGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const cnnlTensorDescriptor_t accum_desc, void* accum,
const cnnlTensorDescriptor_t var_desc, void* var,
const void* lr, const bool update_slots);
const cnnlTensorDescriptor_t accum_desc,
void* accum,
const cnnlTensorDescriptor_t var_desc,
void* var,
const void* lr,
const bool update_slots);
static void ApplyRMSProp(const ExecutionContext& context,
const cnnlTensorDescriptor_t grad_desc,
const void* grad, const void* lr, const void* rho,
const void* momentum, const void* epsilon,
const cnnlTensorDescriptor_t var_desc, void* var,
const cnnlTensorDescriptor_t ms_desc, void* ms,
const cnnlTensorDescriptor_t mom_desc, void* mom);
static void ApplyCenterRMSProp(
const ExecutionContext& ctx, const cnnlTensorDescriptor_t grad_desc,
const void* grad, const void* lr, const void* rho, const void* momentum,
const void* epsilon, const cnnlTensorDescriptor_t var_desc, void* var,
const cnnlTensorDescriptor_t mg_desc, void* mg,
const cnnlTensorDescriptor_t ms_desc, void* ms,
const cnnlTensorDescriptor_t mom_desc, void* mom);
const void* grad,
const void* lr,
const void* rho,
const void* momentum,
const void* epsilon,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t ms_desc,
void* ms,
const cnnlTensorDescriptor_t mom_desc,
void* mom);
static void ApplyCenterRMSProp(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const void* lr,
const void* rho,
const void* momentum,
const void* epsilon,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t mg_desc,
void* mg,
const cnnlTensorDescriptor_t ms_desc,
void* ms,
const cnnlTensorDescriptor_t mom_desc,
void* mom);
static void ApplyAdam(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t var_desc, void* var,
const cnnlTensorDescriptor_t m_desc, void* m,
const cnnlTensorDescriptor_t v_desc, void* v,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t m_desc,
void* m,
const cnnlTensorDescriptor_t v_desc,
void* v,
const cnnlTensorDescriptor_t grad_desc,
const void* grad, const void* lr, const void* beta1,
const void* beta2, const void* beta1_power,
const void* beta2_power, const void* epsilon,
const void* grad,
const void* lr,
const void* beta1,
const void* beta2,
const void* beta1_power,
const void* beta2_power,
const void* epsilon,
const bool use_nesterov);
static void ApplyAdaMax(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const cnnlTensorDescriptor_t var_desc, void* var,
const cnnlTensorDescriptor_t m_desc, void* m,
const cnnlTensorDescriptor_t v_desc, void* v,
const void* diff, const void* lr, const void* beta1,
const void* beta2, const void* beta1_power,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t m_desc,
void* m,
const cnnlTensorDescriptor_t v_desc,
void* v,
const void* diff,
const void* lr,
const void* beta1,
const void* beta2,
const void* beta1_power,
const void* epsilon);
static void ApplyMomentum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad, const bool use_nesterov,
const void* lr, const void* momentum, void* var,
const void* grad,
const bool use_nesterov,
const void* lr,
const void* momentum,
void* var,
void* accum);
static void ApplyKerasMomentum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad, const bool use_nesterov,
const void* lr, const void* momentum,
void* var, void* accum);
const void* grad,
const bool use_nesterov,
const void* lr,
const void* momentum,
void* var,
void* accum);
static void ApplyAdadelta(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* diff, const void* lr, const void* rho,
const void* epsilon, void* var, void* accum,
const void* diff,
const void* lr,
const void* rho,
const void* epsilon,
void* var,
void* accum,
void* accum_update);
static void SparseSoftmaxXentWithLogits(
const ExecutionContext& ctx, cnnlSoftmaxMode_t mode,
const cnnlTensorDescriptor_t x_desc, const void* input,
const cnnlTensorDescriptor_t label_desc, const void* label,
const cnnlTensorDescriptor_t y_desc, void* output,
const cnnlTensorDescriptor_t diff_y_desc, void* back_out);
static void RandomUniform(const ExecutionContext& ctx, const int num,
const ExecutionContext& ctx,
cnnlSoftmaxMode_t mode,
const cnnlTensorDescriptor_t x_desc,
const void* input,
const cnnlTensorDescriptor_t label_desc,
const void* label,
const cnnlTensorDescriptor_t y_desc,
void* output,
const cnnlTensorDescriptor_t diff_y_desc,
void* back_out);
static void RandomUniform(const ExecutionContext& ctx,
const int num,
const cnnlDataType_t data_type,
const cnnlRandGenerator_t mlu_generator,
void* mlu_state, void* output);
void* mlu_state,
void* output);
static void FusedDropout(
const ExecutionContext& ctx, const cnnlRandGenerator_t generator,
const cnnlTensorDescriptor_t input_desc, const void* input, const float p,
void* state, const cnnlTensorDescriptor_t mask_desc, const void* mask,
const cnnlTensorDescriptor_t output_desc, void* output);
static void FusedDropout(const ExecutionContext& ctx,
const cnnlRandGenerator_t generator,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const float p,
void* state,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Cumsum(const ExecutionContext& ctx, const int axis,
const bool exclusive, const bool reverse,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t ouput_desc, void* output);
static void Cumsum(const ExecutionContext& ctx,
const int axis,
const bool exclusive,
const bool reverse,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t ouput_desc,
void* output);
static void BroadcastTo(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
......@@ -620,189 +751,267 @@ class MLUCnnl {
const cnnlTensorDescriptor_t output_desc,
void* output);
// Static declaration (no body visible here): an axis, a batch_dims count,
// described read-only `params` and `indices` tensors, and a described writable
// output tensor.
// NOTE(review): diff view - the first five lines are the packed rendering of
// the whole declaration (presumably pre-change); the complete declaration is
// then repeated with one parameter per line.
static void GatherFunctor(
const ExecutionContext& ctx, const int axis, const int batch_dims,
const cnnlTensorDescriptor_t params_desc, const void* params,
const cnnlTensorDescriptor_t indices_desc, const void* indices,
const cnnlTensorDescriptor_t output_desc, void* output);
static void GatherFunctor(const ExecutionContext& ctx,
const int axis,
const int batch_dims,
const cnnlTensorDescriptor_t params_desc,
const void* params,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Static declaration (no body visible here): described `params`, `updates` and
// `indices` tensors plus a cnnlScatterRefMode_t mode (no default value).
// Every data pointer, including `params`, is const void* in both renderings
// below; there is no separate output parameter.
// NOTE(review): diff view - the first five lines are the packed rendering of
// the whole declaration (presumably pre-change); the complete declaration is
// then repeated with one parameter per line.
static void ScatterRefFunctor(
const ExecutionContext& ctx, const cnnlTensorDescriptor_t params_desc,
const void* params, const cnnlTensorDescriptor_t updates_desc,
const void* updates, const cnnlTensorDescriptor_t indices_desc,
const void* indices, const cnnlScatterRefMode_t mode);
static void ScatterRefFunctor(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t params_desc,
const void* params,
const cnnlTensorDescriptor_t updates_desc,
const void* updates,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlScatterRefMode_t mode);
// Static declaration (no body visible here): described `params`, `updates` and
// `indices` tensors, a scatter dimension `dim`, and a mode that defaults to
// CNNL_SCATTER.
// This is the declaration named in the commit title, and it carries the only
// change in this view that is not a pure re-wrap of parameter lists.
static void ScatterFunctor(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t params_desc,
// NOTE(review): diff view - two renderings of the `params` parameter follow.
// The first (`const void*`) is presumably the pre-change line and the second
// (`void*`) the post-change line, i.e. the fix drops `const`. Presumably this
// is because the operation writes into the params buffer - confirm against
// the definition.
const void* params,
void* params,
const cnnlTensorDescriptor_t updates_desc,
const void* updates,
const cnnlTensorDescriptor_t indices_desc,
// NOTE(review): diff view - the next line is the packed rendering of
// `indices` and `dim`; the two lines after it repeat them one per line.
const void* indices, const int dim,
const void* indices,
const int dim,
const cnnlScatterMode_t mode = CNNL_SCATTER);
// Static declaration (no body visible here): read-only `start`, `end` and
// `step` pointers, an output data type, and a writable output pointer.
// No tensor descriptors appear in this signature.
// NOTE(review): the element type behind start/end/step is not visible here;
// presumably it corresponds to `output_dtype` - confirm against the definition.
// NOTE(review): diff view - the first three lines are the packed rendering of
// the whole declaration (presumably pre-change); the complete declaration is
// then repeated with one parameter per line.
static void Range(const ExecutionContext& ctx, const void* start,
const void* end, const void* step,
const cnnlDataType_t output_dtype, void* output);
static void Range(const ExecutionContext& ctx,
const void* start,
const void* end,
const void* step,
const cnnlDataType_t output_dtype,
void* output);
static void Round(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Static declaration (no body visible here): `k`, a dimension `dim`, `largest`
// and `sorted` flags, a described read-only input, and two described writable
// outputs (`values_out` and `indices_out`).
// NOTE(review): diff view - the first three lines are the packed rendering of
// the leading parameters (presumably pre-change); the declaration then
// restarts with one parameter per line, and both renderings share the four
// trailing output lines.
static void TopK(const ExecutionContext& ctx, const int k, const int dim,
const bool largest, const bool sorted,
const cnnlTensorDescriptor_t input_desc, const void* input,
static void TopK(const ExecutionContext& ctx,
const int k,
const int dim,
const bool largest,
const bool sorted,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t values_output_desc,
void* values_out,
const cnnlTensorDescriptor_t indices_output_desc,
void* indices_out);
static void StridedSlice(const ExecutionContext& ctx, const int begin[],
const int end[], const int strides[],
static void StridedSlice(const ExecutionContext& ctx,
const int begin[],
const int end[],
const int strides[],
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Split(const ExecutionContext& ctx, int split_num, int axis,
static void Split(const ExecutionContext& ctx,
int split_num,
int axis,
const cnnlTensorDescriptor_t input_desc,
const void* input_ptr,
const cnnlTensorDescriptor_t output_descs[],
void* output_ptrs[]);
static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis,
static void Split(const MLUDeviceContext& dev_ctx,
int split_num,
int axis,
const cnnlTensorDescriptor_t input_desc,
const void* input_ptr,
const cnnlTensorDescriptor_t output_descs[],
void* output_ptrs[]);
static void Scale(const ExecutionContext& ctx, const int axis,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t alpha_desc, const void* alpha,
const cnnlTensorDescriptor_t beta_desc, const void* beta,
const cnnlTensorDescriptor_t output_desc, void* output);
static void Scale(const ExecutionContext& ctx,
const int axis,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t alpha_desc,
const void* alpha,
const cnnlTensorDescriptor_t beta_desc,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void AddN(const ExecutionContext& ctx, uint32_t input_num,
static void AddN(const ExecutionContext& ctx,
uint32_t input_num,
const cnnlTensorDescriptor_t inputs_desc[],
const void* inputs[],
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Log(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer, cnnlLogBase_t log_base,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
cnnlComputationPreference_t prefer,
cnnlLogBase_t log_base,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void StridedSliceGrad(const ExecutionContext& ctx, const int begin[],
const int end[], const int strides[],
static void StridedSliceGrad(const ExecutionContext& ctx,
const int begin[],
const int end[],
const int strides[],
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Logic(const ExecutionContext& ctx, const cnnlLogicOp_t log_method,
static void Logic(const ExecutionContext& ctx,
const cnnlLogicOp_t log_method,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2, const cnnlTensorDescriptor_t ouput_desc,
const void* input2,
const cnnlTensorDescriptor_t ouput_desc,
void* output);
static void Select(
const ExecutionContext& ctx, const cnnlTensorDescriptor_t condition_desc,
const void* condition_ptr, const cnnlTensorDescriptor_t then_desc,
const void* then_ptr, const cnnlTensorDescriptor_t else_desc,
const void* else_ptr, const cnnlTensorDescriptor_t output_desc,
static void Select(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t condition_desc,
const void* condition_ptr,
const cnnlTensorDescriptor_t then_desc,
const void* then_ptr,
const cnnlTensorDescriptor_t else_desc,
const void* else_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output_ptr);
static void AssignAdd(const ExecutionContext& ctx, const void* alpha,
static void AssignAdd(const ExecutionContext& ctx,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t update_desc,
const void* update,
const cnnlTensorDescriptor_t param_desc, void* param);
const cnnlTensorDescriptor_t param_desc,
void* param);
static void AssignSub(const ExecutionContext& ctx, const void* alpha,
static void AssignSub(const ExecutionContext& ctx,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t update_desc,
const void* update,
const cnnlTensorDescriptor_t param_desc, void* param);
const cnnlTensorDescriptor_t param_desc,
void* param);
static void Assign(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t update_desc,
const void* update,
const cnnlTensorDescriptor_t param_desc, void* param);
const cnnlTensorDescriptor_t param_desc,
void* param);
static void GatherNd(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t params_desc,
const void* params,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BatchToSpace(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output, const cnnlSpaceBatchParam_t param);
void* output,
const cnnlSpaceBatchParam_t param);
static void BatchToSpaceNd(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
cnnlSpaceBatchNdDescriptor_t param,
void* extra_device_input, size_t extra_input_size,
void* extra_device_input,
size_t extra_input_size,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void PoolingForward(
const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode,
int64_t output_h, int64_t output_w, cnnlPoolingDescriptor_t pooling_desc,
const void* alpha, const cnnlTensorDescriptor_t input_desc,
const void* input, const void* beta, const void* extra_input_ptr,
const cnnlTensorDescriptor_t output_desc, void* output);
static void PoolingForward(const ExecutionContext& ctx,
cnnlPoolingMode_t pool_mode,
int64_t output_h,
int64_t output_w,
cnnlPoolingDescriptor_t pooling_desc,
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* beta,
const void* extra_input_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void AdaptivePoolingForward(
const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output,
const cnnlTensorDescriptor_t index_desc, void* index);
static void AdaptivePoolingForward(const ExecutionContext& ctx,
cnnlPoolingMode_t pool_mode,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output,
const cnnlTensorDescriptor_t index_desc,
void* index);
static void Pool3D(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode,
static void Pool3D(const ExecutionContext& ctx,
cnnlPoolingMode_t pool_mode,
const std::vector<int64_t>& output_shape,
cnnlPoolingDescriptor_t pooling_desc, const void* alpha,
const cnnlTensorDescriptor_t input_desc, const void* input,
const void* beta, const cnnlTensorDescriptor_t output_desc,
cnnlPoolingDescriptor_t pooling_desc,
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Pad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input,
const void* paddings, const void* padding_value,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* paddings,
const void* padding_value,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Matmul(const ExecutionContext& ctx, const bool transpose_a,
static void Matmul(const ExecutionContext& ctx,
const bool transpose_a,
const bool transpose_b,
const cnnlTensorDescriptor_t in0_desc, const void* in0,
const cnnlTensorDescriptor_t in1_desc, const void* in1,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t in0_desc,
const void* in0,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BatchMatmul(
const ExecutionContext& ctx, const bool transpose_a,
const bool transpose_b, const cnnlTensorDescriptor_t in0_desc,
const void* in0, const cnnlTensorDescriptor_t in1_desc, const void* in1,
const cnnlTensorDescriptor_t output_desc, void* output);
static void BatchMatmul(const ExecutionContext& ctx,
const bool transpose_a,
const bool transpose_b,
const cnnlTensorDescriptor_t in0_desc,
const void* in0,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void MulAx(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t alpha_desc, const void* alpha,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t alpha_desc,
const void* alpha,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Static declaration (no body visible here): an op-tensor descriptor, two
// described read-only operands `a` and `b`, a described writable output, a data
// type, and three float coefficients with defaults (alpha1_float = 1.f,
// alpha2_float = 1.f, beta_float = 0.f).
// NOTE(review): how the three coefficients combine the operands is not visible
// here - confirm against the definition.
static void OpTensor(const ExecutionContext& ctx,
const cnnlOpTensorDescriptor_t op_tensor_desc,
// NOTE(review): diff view - the next three lines are the packed rendering of
// the a/b/output parameters (presumably pre-change); the six lines after them
// repeat the same parameters one per line.
const cnnlTensorDescriptor_t a_desc, const void* a,
const cnnlTensorDescriptor_t b_desc, const void* b,
const cnnlTensorDescriptor_t output_desc, void* output,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const cnnlTensorDescriptor_t output_desc,
void* output,
const cnnlDataType_t dtype,
const float alpha1_float = 1.f,
const float alpha2_float = 1.f,
const float beta_float = 0.f);
static void BiasAddGrad(const ExecutionContext& ctx, const int axis,
static void BiasAddGrad(const ExecutionContext& ctx,
const int axis,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t output_desc,
......@@ -810,9 +1019,13 @@ class MLUCnnl {
// Static declaration (no body visible here): a described read-only `indices`
// tensor, a depth, read-only `on_value` / `off_value` pointers, an axis, the
// output data type, and a writable output pointer.
// The output has a data type but no tensor descriptor in this signature.
static void OneHot(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t desc_indices,
// NOTE(review): diff view - the next three lines are the packed rendering of
// the remaining parameters (presumably pre-change); the seven lines after them
// repeat the same parameters one per line.
const void* indices, const int depth, const void* on_value,
const void* off_value, const int axis,
cnnlDataType_t output_data_type, void* output);
const void* indices,
const int depth,
const void* on_value,
const void* off_value,
const int axis,
cnnlDataType_t output_data_type,
void* output);
static void NonMaxSuppression(const ExecutionContext& ctx,
const cnnlNmsDescriptor_t nms_desc,
......@@ -821,35 +1034,47 @@ class MLUCnnl {
const cnnlTensorDescriptor_t confidence_desc,
const void* confidence,
const cnnlTensorDescriptor_t output_desc,
void* output, void* output_size);
void* output,
void* output_size);
static void SoftmaxCrossEntropyWithLogits(
const ExecutionContext& ctx, cnnlSoftmaxMode_t mode,
const ExecutionContext& ctx,
cnnlSoftmaxMode_t mode,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* logits_in,
const cnnlTensorDescriptor_t label_desc, const void* labels_in,
const cnnlTensorDescriptor_t loss_out_desc, void* loss_out,
const cnnlTensorDescriptor_t back_out_desc, void* back_out);
const cnnlTensorDescriptor_t input_desc,
const void* logits_in,
const cnnlTensorDescriptor_t label_desc,
const void* labels_in,
const cnnlTensorDescriptor_t loss_out_desc,
void* loss_out,
const cnnlTensorDescriptor_t back_out_desc,
void* back_out);
static void SoftmaxForward(const ExecutionContext& ctx,
cnnlSoftmaxAlgorithm_t algorithm,
cnnlSoftmaxMode_t mode, const void* alpha,
cnnlSoftmaxMode_t mode,
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input, const void* beta,
const void* input,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SoftmaxBackward(
const ExecutionContext& ctx, cnnlSoftmaxAlgorithm_t algorithm,
cnnlSoftmaxMode_t mode, const cnnlTensorDescriptor_t y_desc,
const void* y, const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y, const cnnlTensorDescriptor_t diff_x_desc,
static void SoftmaxBackward(const ExecutionContext& ctx,
cnnlSoftmaxAlgorithm_t algorithm,
cnnlSoftmaxMode_t mode,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void Softplus(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t features_desc,
const void* features,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SoftplusGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t gradients_desc,
......@@ -860,38 +1085,59 @@ class MLUCnnl {
void* output);
// RsqrtGrad: static declaration returning void.
// Params: execution context; a single descriptor `data_desc`; read-only `y`
// and `diff_y` buffers; writable `output` buffer.
// NOTE(review): only one descriptor is declared for three buffers --
// presumably y, diff_y and output share it; confirm against the definition.
// NOTE(review): rendered diff -- old packed lines precede the new
// one-per-line parameters.
static void RsqrtGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc, const void* y,
const void* diff_y, void* output);
const cnnlTensorDescriptor_t data_desc,
const void* y,
const void* diff_y,
void* output);
// SqrtGrad and ConvolutionForward: two static declarations returning void.
// NOTE(review): rendered diff -- one hunk covers both declarations, so the
// order below is: old SqrtGrad parameters, old ConvolutionForward, new
// SqrtGrad parameters, new ConvolutionForward.
//
// SqrtGrad params: execution context; a single descriptor `data_desc`;
// read-only `y` and `diff_y` buffers; writable `output` buffer.
static void SqrtGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc, const void* y,
const void* diff_y, void* output);
// ConvolutionForward (old wrapping). Params: execution context; convolution
// descriptor; read-only `alpha`/`beta` pointers; descriptor + read-only
// buffer pairs for bias, input and filter; output descriptor + writable
// buffer.
// NOTE(review): `filtet_desc` looks like a typo for `filter_desc`; it is
// present in both the old and new lines.
static void ConvolutionForward(
const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc_,
const void* alpha, const void* beta,
const cnnlTensorDescriptor_t bias_desc, const void* bias_ptr,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t filtet_desc, const void* filter,
const cnnlTensorDescriptor_t output_desc, void* output);
// SqrtGrad parameters, new one-per-line wrapping.
const cnnlTensorDescriptor_t data_desc,
const void* y,
const void* diff_y,
void* output);
// ConvolutionForward, new one-per-line wrapping (same parameters as above).
static void ConvolutionForward(const ExecutionContext& ctx,
cnnlConvolutionDescriptor_t conv_desc_,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t bias_desc,
const void* bias_ptr,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t filtet_desc,
const void* filter,
const cnnlTensorDescriptor_t output_desc,
void* output);
// FusedConvBNQuantify: static declaration returning void.
// Params: execution context; convolution descriptor; read-only `epsilon_ptr`;
// `fused_ops_number`; tensor data type; integer position and float scale for
// both input and filter; descriptor + read-only pointer pairs for scale,
// offset, mean, variance, input and filter; output descriptor + writable
// buffer.
// NOTE(review): `filtet_desc` looks like a typo for `filter_desc`.
// NOTE(review): rendered diff -- the old packed signature comes first, then
// the new one-per-line signature; the final `void* output);` is shared.
static void FusedConvBNQuantify(
const ExecutionContext& ctx, cnnlConvolutionDescriptor_t conv_desc,
const void* epsilon_ptr, const int fused_ops_number,
const cnnlDataType_t tensor_dtype, const int input_position,
const float input_scale, const int filter_position,
const float filter_scale, const cnnlTensorDescriptor_t scale_desc,
const void* scale_ptr, const cnnlTensorDescriptor_t offset_desc,
const void* offset_ptr, const cnnlTensorDescriptor_t mean_desc,
const void* mean_ptr, const cnnlTensorDescriptor_t variance_desc,
const void* variance_ptr, const cnnlTensorDescriptor_t input_desc,
const void* input, const cnnlTensorDescriptor_t filtet_desc,
const void* filter, const cnnlTensorDescriptor_t output_desc,
static void FusedConvBNQuantify(const ExecutionContext& ctx,
cnnlConvolutionDescriptor_t conv_desc,
const void* epsilon_ptr,
const int fused_ops_number,
const cnnlDataType_t tensor_dtype,
const int input_position,
const float input_scale,
const int filter_position,
const float filter_scale,
const cnnlTensorDescriptor_t scale_desc,
const void* scale_ptr,
const cnnlTensorDescriptor_t offset_desc,
const void* offset_ptr,
const cnnlTensorDescriptor_t mean_desc,
const void* mean_ptr,
const cnnlTensorDescriptor_t variance_desc,
const void* variance_ptr,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t filtet_desc,
const void* filter,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Tile: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; output
// descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Tile(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void UnsortedSegmentSum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc,
......@@ -901,12 +1147,17 @@ class MLUCnnl {
const cnnlTensorDescriptor_t output_desc,
void* output);
// Reduce: static declaration returning void.
// Params: execution context; `need_workspace` flag; reduction descriptor;
// read-only `alpha`; input descriptor + read-only buffer; `indices_size`
// with a writable `indices` buffer; read-only `beta`; output descriptor +
// writable buffer.
// NOTE(review): rendered diff -- the first line and several parameter lines
// appear in both the old packed and the new one-per-line wrapping.
static void Reduce(const ExecutionContext& ctx, const bool need_workspace,
static void Reduce(const ExecutionContext& ctx,
const bool need_workspace,
const cnnlReduceDescriptor_t reduction_desc,
const void* alpha, const cnnlTensorDescriptor_t input_desc,
const void* input, const size_t indices_size,
void* indices, const void* beta,
const cnnlTensorDescriptor_t output_desc, void* output);
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const size_t indices_size,
void* indices,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void FloorDiv(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
......@@ -914,34 +1165,41 @@ class MLUCnnl {
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
// FloorMod: static declaration returning void.
// Params: execution context; descriptor + read-only buffer for `input1` and
// for `input2`; output descriptor + writable buffer.
// NOTE(review): rendered diff -- the output parameters appear in both the old
// packed and the new one-per-line form.
static void FloorMod(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
// Maximum: static declaration returning void.
// Params: execution context; descriptor + read-only buffer for `input1` and
// for `input2`; output descriptor + writable buffer.
// NOTE(review): rendered diff -- the output parameters appear in both the old
// packed and the new one-per-line form.
static void Maximum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
// Minimum: static declaration returning void.
// Params: execution context; descriptor + read-only buffer for `input1` and
// for `input2`; output descriptor + writable buffer.
// NOTE(review): rendered diff -- the output parameters appear in both the old
// packed and the new one-per-line form.
static void Minimum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
// PowR: static declaration returning void.
// Params: execution context; computation preference `prefer`; descriptor +
// read-only buffer for `input1` and for `input2`; output descriptor +
// writable buffer.
// NOTE(review): which input is base vs. exponent is not visible here --
// confirm against the definition.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void PowR(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input1_desc, const void* input1,
const cnnlTensorDescriptor_t input2_desc, const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void DivNoNan(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
......@@ -949,7 +1207,8 @@ class MLUCnnl {
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SquaredDifference(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
......@@ -960,52 +1219,73 @@ class MLUCnnl {
void* output);
// L2Loss: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; writable
// `output` buffer (no output descriptor is declared).
// NOTE(review): rendered diff -- the input parameters appear in both the old
// packed and the new one-per-line form.
static void L2Loss(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t input_desc,
const void* input,
void* output);
// Abs: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; output
// descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Abs(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Neg: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; output
// descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Neg(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Floor: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; output
// descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Floor(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Ceil: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; output
// descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Ceil(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// IsNan: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; output
// descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void IsNan(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Square: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; output
// descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Square(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Sqrt: static declaration returning void.
// Params: execution context; computation preference `prefer`; input
// descriptor + read-only buffer; output descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Sqrt(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Rsqrt: static declaration returning void.
// Params: execution context; computation preference `prefer`; input
// descriptor + read-only buffer; output descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Rsqrt(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Cos: static declaration returning void.
// Params: execution context; computation preference `prefer`; input
// descriptor + read-only buffer; output descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Cos(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Sin: static declaration returning void.
// Params: execution context; computation preference `prefer`; input
// descriptor + read-only buffer; output descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Sin(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void TrigonForward(const ExecutionContext& ctx,
const cnnlTrigonDescriptor_t trigon_desc,
......@@ -1016,31 +1296,41 @@ class MLUCnnl {
// Exp: static declaration returning void.
// Params: execution context; computation preference `prefer`; input
// descriptor + read-only buffer; output descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Exp(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Sign: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; output
// descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Sign(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// IsFinite: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; output
// descriptor + writable buffer.
// NOTE(review): rendered diff -- the output parameters appear in both the old
// packed and the new one-per-line form.
static void IsFinite(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
// IsNanInf: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; writable
// `output` buffer (no output descriptor is declared -- the expected
// size/type of `output` is not visible here; confirm against the definition).
// NOTE(review): rendered diff -- `input`/`output` appear in both the old
// packed and the new one-per-line form.
static void IsNanInf(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input, void* output);
const void* input,
void* output);
// Erf: static declaration returning void.
// Params: execution context; computation preference `prefer`; input
// descriptor + read-only buffer; output descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Erf(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Log1p: static declaration returning void.
// Params: execution context; computation preference `prefer`; input
// descriptor + read-only buffer; output descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void Log1p(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void LogicalNot(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
......@@ -1048,152 +1338,250 @@ class MLUCnnl {
const cnnlTensorDescriptor_t output_desc,
void* output);
// DynamicStitch: static declaration returning void.
// Params: execution context; pointer to index descriptors with a
// pointer-to-pointer `indices`; pointer to data descriptors with a
// pointer-to-pointer `data`; `size`; writable `indices_dims`; output
// descriptor + writable buffer.
// NOTE(review): presumably `size` is the number of index/data entries in
// those arrays -- confirm against the definition.
// NOTE(review): rendered diff -- the old packed signature comes first, then
// the new one-per-line signature.
static void DynamicStitch(
const ExecutionContext& ctx, const cnnlTensorDescriptor_t* indices_desc,
const int** indices, const cnnlTensorDescriptor_t* data_desc,
const void** data, const int size, int* indices_dims,
const cnnlTensorDescriptor_t output_desc, void* output);
static void DynamicStitch(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t* indices_desc,
const int** indices,
const cnnlTensorDescriptor_t* data_desc,
const void** data,
const int size,
int* indices_dims,
const cnnlTensorDescriptor_t output_desc,
void* output);
// CropAndResize: static declaration returning void.
// Params: execution context; `method_name` string; `extrapolation_value`;
// descriptor + read-only buffer pairs for image, boxes and box_index; output
// descriptor + writable buffer.
// NOTE(review): `method_name` is taken as `const std::string` by value, so
// each call copies the string; a const reference would avoid that but
// requires changing the definition too.
// NOTE(review): rendered diff -- the old packed signature comes first, then
// the new one-per-line signature; the final `void* output);` is shared.
static void CropAndResize(
const ExecutionContext& ctx, const std::string method_name,
const float extrapolation_value, const cnnlTensorDescriptor_t image_desc,
const void* image, const cnnlTensorDescriptor_t boxes_desc,
const void* boxes, const cnnlTensorDescriptor_t box_index_desc,
const void* box_index, const cnnlTensorDescriptor_t output_desc,
static void CropAndResize(const ExecutionContext& ctx,
const std::string method_name,
const float extrapolation_value,
const cnnlTensorDescriptor_t image_desc,
const void* image,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t box_index_desc,
const void* box_index,
const cnnlTensorDescriptor_t output_desc,
void* output);
// CropAndResizeBackwardImage: static declaration returning void.
// Params: execution context; `method_name` string; descriptor + read-only
// buffer pairs for image, boxes and box_idx; `grads_image` descriptor +
// writable buffer.
// NOTE(review): `method_name` is taken as `const std::string` by value
// (copied per call).
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void CropAndResizeBackwardImage(
const ExecutionContext& ctx, const std::string method_name,
const cnnlTensorDescriptor_t image_desc, const void* image,
const cnnlTensorDescriptor_t boxes_desc, const void* boxes,
const cnnlTensorDescriptor_t box_idx_desc, const void* box_idx,
const cnnlTensorDescriptor_t grads_image_desc, void* grads_image);
const ExecutionContext& ctx,
const std::string method_name,
const cnnlTensorDescriptor_t image_desc,
const void* image,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t box_idx_desc,
const void* box_idx,
const cnnlTensorDescriptor_t grads_image_desc,
void* grads_image);
// CropAndResizeBackwardBoxes, PoolingBackward and AdaptivePoolingBackward:
// three static declarations returning void.
// NOTE(review): rendered diff -- one hunk covers all three, so the old
// (packed) text of each comes first, followed by the new (one-per-line) text
// of each.
//
// CropAndResizeBackwardBoxes params: execution context; descriptor +
// read-only buffer pairs for input, image, boxes and box_idx; output
// descriptor + writable buffer.
static void CropAndResizeBackwardBoxes(
const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc,
const void* input, const cnnlTensorDescriptor_t image_desc,
const void* image, const cnnlTensorDescriptor_t boxes_desc,
const void* boxes, const cnnlTensorDescriptor_t box_idx_desc,
const void* box_idx, const cnnlTensorDescriptor_t output_desc,
void* output);
// PoolingBackward (old wrapping). Params: execution context; pooling
// descriptor; read-only `alpha`; descriptor + read-only buffer pairs for y,
// diff_y and x; read-only `beta`; `diff_x` descriptor + writable buffer.
static void PoolingBackward(
const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc,
const void* alpha, const cnnlTensorDescriptor_t y_desc, const void* y,
const cnnlTensorDescriptor_t diff_y_desc, const void* diff_y,
const cnnlTensorDescriptor_t x_desc, const void* x, const void* beta,
const cnnlTensorDescriptor_t diff_x_desc, void* diff_x);
// AdaptivePoolingBackward (old wrapping). Params: execution context; pooling
// mode; descriptor + read-only buffer pairs for y and index; `diff_x`
// descriptor + writable buffer.
static void AdaptivePoolingBackward(
const ExecutionContext& ctx, const cnnlPoolingMode_t pool_mode,
const cnnlTensorDescriptor_t y_desc, const void* y,
const cnnlTensorDescriptor_t index_desc, const void* index,
const cnnlTensorDescriptor_t diff_x_desc, void* diff_x);
// CropAndResizeBackwardBoxes parameters, new one-per-line wrapping.
const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t image_desc,
const void* image,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t box_idx_desc,
const void* box_idx,
const cnnlTensorDescriptor_t output_desc,
void* output);
// PoolingBackward, new one-per-line wrapping (same parameters as above).
static void PoolingBackward(const ExecutionContext& ctx,
const cnnlPoolingDescriptor_t pooling_desc,
const void* alpha,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const void* beta,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
// AdaptivePoolingBackward, new one-per-line wrapping (same parameters as
// above).
static void AdaptivePoolingBackward(const ExecutionContext& ctx,
const cnnlPoolingMode_t pool_mode,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t index_desc,
const void* index,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
// PoolingIndex: static declaration returning void.
// Params: execution context; pooling descriptor; `x` descriptor + read-only
// buffer; `y` descriptor + writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void PoolingIndex(const ExecutionContext& ctx,
const cnnlPoolingDescriptor_t pooling_desc,
const cnnlTensorDescriptor_t x_desc, const void* x,
const cnnlTensorDescriptor_t y_desc, void* y);
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
void* y);
// SpaceToBatch: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer; output
// descriptor + writable buffer; `block_shape` array of int64_t (its length
// is not part of the signature -- confirm against the definition).
// NOTE(review): rendered diff -- `output`/`block_shape` appear in both the
// old packed and the new one-per-line form.
static void SpaceToBatch(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output, const int64_t block_shape[]);
void* output,
const int64_t block_shape[]);
// SpaceToBatchNd: static declaration returning void.
// Params: execution context; input descriptor + read-only buffer;
// space/batch-ND descriptor `param`; writable `extra_device_input` with its
// `extra_input_size`; output descriptor + writable buffer.
// NOTE(review): rendered diff -- the extra-input parameters appear in both
// the old packed and the new one-per-line form.
static void SpaceToBatchNd(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
cnnlSpaceBatchNdDescriptor_t param,
void* extra_device_input, size_t extra_input_size,
void* extra_device_input,
size_t extra_input_size,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Interp: static declaration returning void.
// Params: execution context; interpolation mode; `align_corners` and
// `half_pixel_centers` flags; input descriptor + read-only buffer; output
// descriptor + writable buffer.
// NOTE(review): rendered diff -- the old packed signature comes first, then
// the new one-per-line signature.
static void Interp(const ExecutionContext& ctx, const cnnlInterpMode_t mode,
const bool align_corners, const bool half_pixel_centers,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
static void Interp(const ExecutionContext& ctx,
const cnnlInterpMode_t mode,
const bool align_corners,
const bool half_pixel_centers,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// InterpBackward: static declaration returning void.
// Params: execution context; backward interpolation mode; `align_corners`
// and `half_pixel_centers` flags; input descriptor + read-only buffer;
// output descriptor + writable buffer.
// NOTE(review): rendered diff -- the old packed signature comes first, then
// the new one-per-line signature.
static void InterpBackward(
const ExecutionContext& ctx, const cnnlInterpBackwardMode_t mode,
const bool align_corners, const bool half_pixel_centers,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
static void InterpBackward(const ExecutionContext& ctx,
const cnnlInterpBackwardMode_t mode,
const bool align_corners,
const bool half_pixel_centers,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
// QuantizeParam: static declaration returning void.
// Params: execution context; quantize mode; `bitwidth`; input descriptor +
// read-only buffer; writable `position`, `scale` and `offset` pointers.
// NOTE(review): rendered diff -- several parameters appear in both the old
// packed and the new one-per-line form.
static void QuantizeParam(const ExecutionContext& ctx,
const cnnlQuantizeMode_t mode, const int bitwidth,
const cnnlQuantizeMode_t mode,
const int bitwidth,
const cnnlTensorDescriptor_t input_desc,
const void* input, void* position, void* scale,
const void* input,
void* position,
void* scale,
void* offset);
// QuantizeMatMul: static declaration returning void.
// Params: execution context; `transpose_a`/`transpose_b` flags; for each of
// `a` and `b`: descriptor, read-only buffer, and read-only position/scale/
// offset pointers; `quant_type` and `data_type`; output descriptor +
// writable buffer.
// NOTE(review): rendered diff -- the old packed signature comes first, then
// the new one-per-line signature.
static void QuantizeMatMul(
const ExecutionContext& ctx, const bool transpose_a,
const bool transpose_b, const cnnlTensorDescriptor_t a_desc,
const void* a, const void* a_position, const void* a_scale,
const void* a_offset, const cnnlTensorDescriptor_t b_desc, const void* b,
const void* b_position, const void* b_scale, const void* b_offset,
const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc, void* output);
static void QuantizeMatMul(const ExecutionContext& ctx,
const bool transpose_a,
const bool transpose_b,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const void* a_position,
const void* a_scale,
const void* a_offset,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const void* b_position,
const void* b_scale,
const void* b_offset,
const cnnlDataType_t quant_type,
const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc,
void* output);
// QuantizeBatchMatMul: static declaration returning void.
// Params: execution context; `adj_x`/`adj_y` flags; for each of `a` and `b`:
// descriptor, read-only buffer, and read-only position/scale/offset
// pointers; `quant_type` and `data_type`; output descriptor + writable
// buffer.
// NOTE(review): rendered diff -- the old packed signature comes first, then
// the new one-per-line signature.
static void QuantizeBatchMatMul(
const ExecutionContext& ctx, const bool adj_x, const bool adj_y,
const cnnlTensorDescriptor_t a_desc, const void* a,
const void* a_position, const void* a_scale, const void* a_offset,
const cnnlTensorDescriptor_t b_desc, const void* b,
const void* b_position, const void* b_scale, const void* b_offset,
const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc, void* output);
static void QuantizeBatchMatMul(const ExecutionContext& ctx,
const bool adj_x,
const bool adj_y,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const void* a_position,
const void* a_scale,
const void* a_offset,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const void* b_position,
const void* b_scale,
const void* b_offset,
const cnnlDataType_t quant_type,
const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc,
void* output);
// QuantizeBatchMatMulBCast: static declaration returning void; its parameter
// list is identical to QuantizeBatchMatMul's as shown in this file.
// Params: execution context; `adj_x`/`adj_y` flags; for each of `a` and `b`:
// descriptor, read-only buffer, and read-only position/scale/offset
// pointers; `quant_type` and `data_type`; output descriptor + writable
// buffer.
// NOTE(review): rendered diff -- the old packed signature comes first, then
// the new one-per-line signature.
static void QuantizeBatchMatMulBCast(
const ExecutionContext& ctx, const bool adj_x, const bool adj_y,
const cnnlTensorDescriptor_t a_desc, const void* a,
const void* a_position, const void* a_scale, const void* a_offset,
const cnnlTensorDescriptor_t b_desc, const void* b,
const void* b_position, const void* b_scale, const void* b_offset,
const cnnlDataType_t quant_type, const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc, void* output);
static void QuantizeBatchMatMulBCast(const ExecutionContext& ctx,
const bool adj_x,
const bool adj_y,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const void* a_position,
const void* a_scale,
const void* a_offset,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const void* b_position,
const void* b_scale,
const void* b_offset,
const cnnlDataType_t quant_type,
const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc,
void* output);
// FusedBatchNorm, FusedBatchNormGrad and LayerNormForward: three static
// declarations returning void.
// NOTE(review): rendered diff -- one hunk covers all three, so the old
// (packed) text comes first and the new (one-per-line) text follows;
// LayerNormForward additionally mixes old and new lines parameter by
// parameter.
//
// FusedBatchNorm (old wrapping). Params: execution context; `is_training`
// flag; `x` descriptor + read-only buffer; `scale_desc` with read-only
// scale, offset, estimated_mean and estimated_variance pointers; `epsilon`
// and `momentum`; output descriptor + writable buffer; writable batch_mean,
// batch_var, saved_mean and saved_var pointers.
static void FusedBatchNorm(
const ExecutionContext& ctx, const bool is_training,
const cnnlTensorDescriptor_t x_desc, const void* x,
const cnnlTensorDescriptor_t scale_desc, const void* scale,
const void* offset, const void* estimated_mean,
const void* estimated_variance, float epsilon, float momentum,
const cnnlTensorDescriptor_t output_desc, void* output, void* batch_mean,
void* batch_var, void* saved_mean, void* saved_var);
// FusedBatchNormGrad (old wrapping). Params: execution context;
// `is_training` flag; descriptor + read-only buffer pairs for y_backprop and
// x; `scale_desc` with read-only scale, saved_mean and saved_var pointers;
// `epsilon`; x_backprop descriptor + writable buffer; writable
// scale_backprop and offset_backprop pointers.
static void FusedBatchNormGrad(
const ExecutionContext& ctx, const bool is_training,
const cnnlTensorDescriptor_t y_backprop_desc, const void* y_backprop,
const cnnlTensorDescriptor_t x_desc, const void* x,
const cnnlTensorDescriptor_t scale_desc, const void* scale,
const void* saved_mean, const void* saved_var, float epsilon,
const cnnlTensorDescriptor_t x_backprop_desc, void* x_backprop,
void* scale_backprop, void* offset_backprop);
// Old first line of LayerNormForward (its remaining lines follow the new
// FusedBatchNorm/FusedBatchNormGrad text below).
static void LayerNormForward(const ExecutionContext& ctx, int axis,
// FusedBatchNorm, new one-per-line wrapping (same parameters as above).
static void FusedBatchNorm(const ExecutionContext& ctx,
const bool is_training,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t scale_desc,
const void* scale,
const void* offset,
const void* estimated_mean,
const void* estimated_variance,
float epsilon,
float momentum,
const cnnlTensorDescriptor_t output_desc,
void* output,
void* batch_mean,
void* batch_var,
void* saved_mean,
void* saved_var);
// FusedBatchNormGrad, new one-per-line wrapping (same parameters as above).
static void FusedBatchNormGrad(const ExecutionContext& ctx,
const bool is_training,
const cnnlTensorDescriptor_t y_backprop_desc,
const void* y_backprop,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t scale_desc,
const void* scale,
const void* saved_mean,
const void* saved_var,
float epsilon,
const cnnlTensorDescriptor_t x_backprop_desc,
void* x_backprop,
void* scale_backprop,
void* offset_backprop);
// LayerNormForward. Params: execution context; `axis`; `x` descriptor +
// read-only buffer; `weight_bias_desc` with read-only weight and bias
// pointers; `eps`; `y` descriptor + writable buffer; `mean_rstd_desc` with
// writable saved_mean and saved_rstd pointers.
static void LayerNormForward(const ExecutionContext& ctx,
int axis,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t weight_bias_desc,
const void* weight, const void* bias, float eps,
const cnnlTensorDescriptor_t y_desc, void* y,
const void* weight,
const void* bias,
float eps,
const cnnlTensorDescriptor_t y_desc,
void* y,
const cnnlTensorDescriptor_t mean_rstd_desc,
void* saved_mean, void* saved_rstd);
void* saved_mean,
void* saved_rstd);
// LayerNormBackward: static declaration returning void.
// Params: execution context; `axis`; descriptor + read-only buffer pairs for
// x and diff_z; `weight_bias_desc` with read-only weight; `mean_rstd_desc`
// with read-only saved_mean and saved_rstd; diff_x descriptor + writable
// buffer; writable diff_weight and diff_bias pointers.
// NOTE(review): rendered diff -- the old packed signature comes first, then
// the new one-per-line signature.
static void LayerNormBackward(
const ExecutionContext& ctx, int axis,
const cnnlTensorDescriptor_t x_desc, const void* x,
const cnnlTensorDescriptor_t diff_z_desc, const void* diff_z,
const cnnlTensorDescriptor_t weight_bias_desc, const void* weight,
const cnnlTensorDescriptor_t mean_rstd_desc, const void* saved_mean,
const void* saved_rstd, const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x, void* diff_weight, void* diff_bias);
static void LayerNormBackward(const ExecutionContext& ctx,
int axis,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t diff_z_desc,
const void* diff_z,
const cnnlTensorDescriptor_t weight_bias_desc,
const void* weight,
const cnnlTensorDescriptor_t mean_rstd_desc,
const void* saved_mean,
const void* saved_rstd,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x,
void* diff_weight,
void* diff_bias);
// Transpose: static declaration returning void.
// Params: execution context; `perm` vector; `input_dim`; input descriptor +
// read-only buffer; output descriptor + writable buffer.
// NOTE(review): `perm` is taken as `const std::vector<int>` by value, so each
// call copies the vector; a const reference would avoid that but requires
// changing the definition too.
// NOTE(review): rendered diff -- `perm`/`input_dim` and the output
// parameters appear in both the old packed and the new one-per-line form.
static void Transpose(const ExecutionContext& ctx,
const std::vector<int> perm, const int input_dim,
const std::vector<int> perm,
const int input_dim,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
static void TrilTriu(const ExecutionContext& ctx, const int diagonal_k,
const bool tri_up_mode,
......@@ -1203,109 +1591,170 @@ class MLUCnnl {
// MatrixBandPart: static declaration returning void.
// Params: execution context; a single descriptor `data_desc`; read-only
// `input` buffer; `num_lower` and `num_upper`; writable `output` buffer.
// NOTE(review): only one descriptor is declared -- presumably input and
// output share it; confirm against the definition.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void MatrixBandPart(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc,
const void* input, const int num_lower,
const int num_upper, void* output);
const void* input,
const int num_lower,
const int num_upper,
void* output);
// NumTrue: static declaration returning void.
// Params: execution context; `x` descriptor + read-only buffer; `index`
// Tensor passed by value; writable `num_true` pointer.
// NOTE(review): `index` is a by-value `Tensor`; whether the callee writes
// through it (shared storage) is not visible here -- confirm against the
// definition.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void NumTrue(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc, const void* x,
Tensor index, uint32_t* num_true);
const cnnlTensorDescriptor_t x_desc,
const void* x,
Tensor index,
uint32_t* num_true);
// Where: static declaration returning void.
// Params: execution context; `x` descriptor + read-only buffer; read-only
// `strides` and `index` arrays of uint32_t; `y` descriptor + writable int
// buffer; `as_tuple` flag.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form; the final `const bool as_tuple);` is shared.
static void Where(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc, const void* x,
const uint32_t* strides, const uint32_t* index,
const cnnlTensorDescriptor_t y_desc, int* y,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const uint32_t* strides,
const uint32_t* index,
const cnnlTensorDescriptor_t y_desc,
int* y,
const bool as_tuple);
// Conv2D: static declaration returning void.
// Params: execution context; convolution descriptor; `tensor_dtype` and
// `dt_onchip` data types; read-only position/scale/offset pointers for both
// input and filter; descriptor + read-only buffer pairs for input, filter
// and bias; output descriptor + writable buffer.
// NOTE(review): rendered diff -- many parameters appear in both the old
// packed and the new one-per-line form; lines that appear only once are
// unchanged by the diff.
static void Conv2D(const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlDataType_t tensor_dtype,
const cnnlDataType_t dt_onchip, const void* input_position,
const void* input_scale, const void* input_offset,
const void* filter_position, const void* filter_scale,
const cnnlDataType_t dt_onchip,
const void* input_position,
const void* input_scale,
const void* input_offset,
const void* filter_position,
const void* filter_scale,
const void* filter_offset,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t filter_desc,
const void* filter, const cnnlTensorDescriptor_t bias_desc,
const void* bias, const cnnlTensorDescriptor_t output_desc,
const void* filter,
const cnnlTensorDescriptor_t bias_desc,
const void* bias,
const cnnlTensorDescriptor_t output_desc,
void* output);
// ConvBackpropInput: static declaration returning void.
// Params: execution context; convolution descriptor; descriptor + read-only
// buffer pairs for filter and out_backprop; in_backprop descriptor +
// writable buffer.
// NOTE(review): rendered diff -- the old packed signature comes first, then
// the new one-per-line signature.
static void ConvBackpropInput(
const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
const cnnlTensorDescriptor_t filter_desc, const void* filter,
const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop);
static void ConvBackpropInput(const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlTensorDescriptor_t filter_desc,
const void* filter,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t in_backprop_desc,
void* in_backprop);
// QuantizeConvBackpropInput: static declaration returning void.
// Params: execution context; convolution descriptor; `tensor_dtype` and
// `dt_onchip` data types; read-only position/scale/offset pointers for both
// filter and out_backprop; a descriptor named `input_desc` followed by the
// read-only `filter` buffer; out_backprop descriptor + read-only buffer;
// in_backprop descriptor + writable buffer.
// NOTE(review): the descriptor paired with `filter` is named `input_desc`,
// whereas ConvBackpropInput names it `filter_desc` -- looks like a naming
// slip; confirm against the definition before relying on it.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void QuantizeConvBackpropInput(
const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip,
const void* filter_position, const void* filter_scale,
const void* filter_offset, const void* out_backprop_position,
const void* out_backprop_scale, const void* out_backprop_offset,
const cnnlTensorDescriptor_t input_desc, const void* filter,
const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
const cnnlTensorDescriptor_t in_backprop_desc, void* in_backprop);
const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlDataType_t tensor_dtype,
const cnnlDataType_t dt_onchip,
const void* filter_position,
const void* filter_scale,
const void* filter_offset,
const void* out_backprop_position,
const void* out_backprop_scale,
const void* out_backprop_offset,
const cnnlTensorDescriptor_t input_desc,
const void* filter,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t in_backprop_desc,
void* in_backprop);
// ConvBackpropFilter: static declaration returning void.
// Params: execution context; convolution descriptor; descriptor + read-only
// buffer pairs for input and out_backprop; filter_backprop descriptor +
// writable buffer.
// NOTE(review): rendered diff -- old packed parameter lines precede the new
// one-per-line form.
static void ConvBackpropFilter(
const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop);
const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t filter_backprop_desc,
void* filter_backprop);
// QuantizeConvBackpropFilter, DCNForward, DCNBackwardData and
// DCNBackwardWeight: four static declarations returning void.
// NOTE(review): rendered diff -- the hunks interleave these declarations, so
// old (packed) text of one declaration can be followed by old text of the
// next before any new (one-per-line) text appears.
//
// QuantizeConvBackpropFilter params: execution context; convolution
// descriptor; `tensor_dtype` and `dt_onchip` data types; read-only
// position/scale/offset pointers for both input and out_backprop; descriptor
// + read-only buffer pairs for input and out_backprop; filter_backprop
// descriptor + writable buffer.
static void QuantizeConvBackpropFilter(
const ExecutionContext& ctx, const cnnlConvolutionDescriptor_t conv_desc,
const cnnlDataType_t tensor_dtype, const cnnlDataType_t dt_onchip,
const void* input_position, const void* input_scale,
const void* input_offset, const void* out_backprop_position,
const void* out_backprop_scale, const void* out_backprop_offset,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t out_backprop_desc, const void* out_backprop,
const cnnlTensorDescriptor_t filter_backprop_desc, void* filter_backprop);
// DCNForward (old wrapping). Params: execution context; DCN descriptor;
// descriptor + read-only buffer pairs for input, offset, mask, weight and
// bias; output descriptor + writable buffer.
static void DCNForward(
const ExecutionContext& ctx, const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t offset_desc, const void* offset,
const cnnlTensorDescriptor_t mask_desc, const void* mask,
const cnnlTensorDescriptor_t weight_desc, const void* weight,
const cnnlTensorDescriptor_t bias_desc, const void* bias,
const cnnlTensorDescriptor_t output_desc, void* output);
// QuantizeConvBackpropFilter parameters, new one-per-line wrapping.
const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlDataType_t tensor_dtype,
const cnnlDataType_t dt_onchip,
const void* input_position,
const void* input_scale,
const void* input_offset,
const void* out_backprop_position,
const void* out_backprop_scale,
const void* out_backprop_offset,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t filter_backprop_desc,
void* filter_backprop);
// DCNBackwardData (old wrapping). Params: execution context; DCN descriptor;
// descriptor + read-only buffer pairs for input, offset, mask, weight and
// grad_output; descriptor + writable buffer pairs for grad_input,
// grad_offset and grad_mask.
static void DCNBackwardData(
const ExecutionContext& ctx, const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t offset_desc, const void* offset,
const cnnlTensorDescriptor_t mask_desc, const void* mask,
const cnnlTensorDescriptor_t weight_desc, const void* weight,
const cnnlTensorDescriptor_t grad_output_desc, const void* grad_output,
const cnnlTensorDescriptor_t grad_input_desc, void* grad_input,
const cnnlTensorDescriptor_t grad_offset_desc, void* grad_offset,
const cnnlTensorDescriptor_t grad_mask_desc, void* grad_mask);
// DCNBackwardWeight (old wrapping). Params: execution context; DCN
// descriptor; descriptor + read-only buffer pairs for input, offset, mask
// and grad_output; descriptor + writable buffer pairs for grad_weight and
// grad_bias.
static void DCNBackwardWeight(
const ExecutionContext& ctx, const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t offset_desc, const void* offset,
const cnnlTensorDescriptor_t mask_desc, const void* mask,
const cnnlTensorDescriptor_t grad_output_desc, const void* grad_output,
const cnnlTensorDescriptor_t grad_weight_desc, void* grad_weight,
const cnnlTensorDescriptor_t grad_bias_desc, void* grad_bias);
// DCNForward, new one-per-line wrapping (same parameters as above).
static void DCNForward(const ExecutionContext& ctx,
const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t offset_desc,
const void* offset,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t bias_desc,
const void* bias,
const cnnlTensorDescriptor_t output_desc,
void* output);
// DCNBackwardData, new one-per-line wrapping (same parameters as above).
static void DCNBackwardData(const ExecutionContext& ctx,
const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t offset_desc,
const void* offset,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t grad_output_desc,
const void* grad_output,
const cnnlTensorDescriptor_t grad_input_desc,
void* grad_input,
const cnnlTensorDescriptor_t grad_offset_desc,
void* grad_offset,
const cnnlTensorDescriptor_t grad_mask_desc,
void* grad_mask);
// DCNBackwardWeight, new one-per-line wrapping (same parameters as above).
static void DCNBackwardWeight(const ExecutionContext& ctx,
const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t offset_desc,
const void* offset,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t grad_output_desc,
const void* grad_output,
const cnnlTensorDescriptor_t grad_weight_desc,
void* grad_weight,
const cnnlTensorDescriptor_t grad_bias_desc,
void* grad_bias);
// InTopK: static declaration returning void.
// Params: execution context; descriptor + read-only buffer pairs for
// predictions, targets and `k`; an additional integer `k_int`; output
// descriptor + writable buffer.
// NOTE(review): both a `k` buffer and a `k_int` value are declared; how the
// two relate is not visible here -- confirm against the definition.
// NOTE(review): rendered diff -- the trailing parameters appear in both the
// old packed and the new one-per-line form.
static void InTopK(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t predictions_desc,
const void* predictions,
const cnnlTensorDescriptor_t targets_desc,
const void* targets, const cnnlTensorDescriptor_t k_desc,
const void* k, const int k_int,
const cnnlTensorDescriptor_t output_desc, void* output);
const void* targets,
const cnnlTensorDescriptor_t k_desc,
const void* k,
const int k_int,
const cnnlTensorDescriptor_t output_desc,
void* output);
// ScatterNd: static declaration returning void.
// Params: execution context; scatter-ND mode; descriptor + read-only buffer
// pairs for indices, updates and input; output descriptor + writable buffer.
// NOTE(review): rendered diff -- the first line and the output parameters
// appear in both the old packed and the new one-per-line form.
static void ScatterNd(const ExecutionContext& ctx, cnnlScatterNdMode_t mode,
static void ScatterNd(const ExecutionContext& ctx,
cnnlScatterNdMode_t mode,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t updates_desc,
const void* updates,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BitWise(const ExecutionContext& ctx,
const cnnlBitComputeOp_t optype,
......@@ -1313,12 +1762,17 @@ class MLUCnnl {
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
// Declaration of the static helper QR.
//
// Parameters: the execution context; a read-only buffer `a` with its tensor
// descriptor; non-const buffers `q` and `r` with their descriptors; and a
// boolean flag `some`.
// NOTE(review): presumably a QR factorisation of `a` into `q` and `r`, with
// `some` selecting a reduced versus complete form - confirm against the
// definition.
// NOTE(review): the parameter list appears twice below (two-per-line layout
// and one-per-line layout). This looks like interleaved before/after lines of
// a formatting-only diff; only one form should survive in the real header.
static void QR(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t a_desc, const void* a,
const cnnlTensorDescriptor_t q_desc, void* q,
const cnnlTensorDescriptor_t r_desc, void* r, const bool some);
const cnnlTensorDescriptor_t a_desc,
const void* a,
const cnnlTensorDescriptor_t q_desc,
void* q,
const cnnlTensorDescriptor_t r_desc,
void* r,
const bool some);
static void Reciprocal(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
......@@ -1326,55 +1780,85 @@ class MLUCnnl {
const cnnlTensorDescriptor_t output_desc,
void* output);
// Declaration of the static helper BceLoss.
//
// Parameters: the execution context; a cnnlBceLossReduction_t `reduction`;
// read-only `input`, `target` and `weight` buffers, each paired with a tensor
// descriptor; and a non-const `output` buffer with its descriptor.
// NOTE(review): the whole declaration appears twice below, first in a packed
// layout and then one-parameter-per-line. This looks like interleaved
// before/after lines of a formatting-only diff; only one form should survive
// in the real header.
static void BceLoss(
const ExecutionContext& ctx, const cnnlBceLossReduction_t reduction,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t target_desc, const void* target,
const cnnlTensorDescriptor_t weight_desc, const void* weight,
const cnnlTensorDescriptor_t output_desc, void* output);
static void BceLoss(const ExecutionContext& ctx,
const cnnlBceLossReduction_t reduction,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Declaration of the static helper BceLossBackward.
//
// Parameters: the execution context; a cnnlBceLossReduction_t `reduction`;
// read-only `grad`, `input`, `target` and `weight` buffers, each paired with
// a tensor descriptor; and a non-const `output` buffer with its descriptor.
// NOTE(review): `output` presumably receives the gradient with respect to
// `input` - confirm against the definition.
// NOTE(review): the whole declaration appears twice below, first in a packed
// layout and then one-parameter-per-line. This looks like interleaved
// before/after lines of a formatting-only diff; only one form should survive
// in the real header.
static void BceLossBackward(
const ExecutionContext& ctx, const cnnlBceLossReduction_t reduction,
const cnnlTensorDescriptor_t grad_desc, const void* grad,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t target_desc, const void* target,
const cnnlTensorDescriptor_t weight_desc, const void* weight,
const cnnlTensorDescriptor_t output_desc, void* output);
static void BceLossBackward(const ExecutionContext& ctx,
const cnnlBceLossReduction_t reduction,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Declaration of the static helper EmbeddingForward.
//
// Parameters: the execution context; an integer `padding_idx`; a read-only
// `weight` buffer with its tensor descriptor; a read-only `indices` buffer
// typed as `const int*` with its descriptor; and a non-const `output` buffer
// with its descriptor.
// NOTE(review): `indices` is typed `const int*` here, whereas the sibling
// EmbeddingBackward declaration takes `const void* indices` - confirm that
// the difference is intentional.
// NOTE(review): the whole declaration appears twice below, first in a packed
// layout and then one-parameter-per-line. This looks like interleaved
// before/after lines of a formatting-only diff; only one form should survive
// in the real header.
static void EmbeddingForward(
const ExecutionContext& ctx, const int padding_idx,
const cnnlTensorDescriptor_t weight_desc, const void* weight,
const cnnlTensorDescriptor_t indices_desc, const int* indices,
const cnnlTensorDescriptor_t output_desc, void* output);
static void EmbeddingForward(const ExecutionContext& ctx,
const int padding_idx,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t indices_desc,
const int* indices,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Declaration of the static helper Transform.
//
// Parameters: the execution context; two read-only untyped pointers `alpha`
// and `beta`; a read-only `input` buffer with its tensor descriptor; and a
// non-const `output` buffer with its descriptor.
// NOTE(review): `alpha`/`beta` presumably point at scale and shift scalars
// (output = alpha * input + beta) - confirm against the definition, including
// whether they are host or device pointers.
// NOTE(review): the opening line and the final `output_desc`/`output`
// parameters appear twice below (packed layout and one-per-line layout). This
// looks like interleaved before/after lines of a formatting-only diff; only
// one form should survive in the real header.
static void Transform(const ExecutionContext& ctx, const void* alpha,
static void Transform(const ExecutionContext& ctx,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc, void* output);
const cnnlTensorDescriptor_t output_desc,
void* output);
// Declaration of the static helper EmbeddingBackward.
//
// Parameters: the execution context; an integer `padding_idx`; a boolean
// `scale_grad_by_freq`; read-only `indices` and `diff` buffers, each paired
// with a tensor descriptor; and a non-const `output` buffer with its
// descriptor.
// NOTE(review): `diff` is presumably the incoming gradient and `output` the
// gradient for the embedding table - confirm against the definition.
// NOTE(review): the whole declaration appears twice below, first in a packed
// layout and then one-parameter-per-line. This looks like interleaved
// before/after lines of a formatting-only diff; only one form should survive
// in the real header.
static void EmbeddingBackward(
const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq,
const cnnlTensorDescriptor_t indices_desc, const void* indices,
const cnnlTensorDescriptor_t diff_desc, const void* diff,
const cnnlTensorDescriptor_t output_desc, void* output);
static void EmbeddingBackward(const ExecutionContext& ctx,
int padding_idx,
bool scale_grad_by_freq,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t diff_desc,
const void* diff,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Declaration of the static helper BceWithLogits.
//
// Parameters: the execution context; a cnnlBceWithLogitsReduction_t
// `reduction`; read-only `input`, `target`, `weight` and `pos_weight`
// buffers, each paired with a tensor descriptor; and a non-const `output`
// buffer with its descriptor.
// NOTE(review): whether `weight` / `pos_weight` may be null cannot be told
// from the declaration - confirm against the definition.
// NOTE(review): the whole declaration appears twice below, first in a packed
// layout and then one-parameter-per-line. This looks like interleaved
// before/after lines of a formatting-only diff; only one form should survive
// in the real header.
static void BceWithLogits(
const ExecutionContext& ctx, cnnlBceWithLogitsReduction_t reduction,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t target_desc, const void* target,
const cnnlTensorDescriptor_t weight_desc, const void* weight,
const cnnlTensorDescriptor_t pos_weight_desc, const void* pos_weight,
const cnnlTensorDescriptor_t output_desc, void* output);
static void BceWithLogits(const ExecutionContext& ctx,
cnnlBceWithLogitsReduction_t reduction,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t pos_weight_desc,
const void* pos_weight,
const cnnlTensorDescriptor_t output_desc,
void* output);
// Declaration of the static helper BceWithLogitsBackward.
//
// Parameters: the execution context; a cnnlBceWithLogitsReduction_t
// `reduction`; read-only `grad`, `input`, `target`, `weight` and `pos_weight`
// buffers, each paired with a tensor descriptor; and a non-const `diff_input`
// buffer with its descriptor.
// NOTE(review): `diff_input` presumably receives the gradient with respect to
// `input` - confirm against the definition.
// NOTE(review): the parameter list appears twice below after the shared
// opening line, first in a packed layout and then one-parameter-per-line.
// This looks like interleaved before/after lines of a formatting-only diff;
// only one form should survive in the real header.
static void BceWithLogitsBackward(
const ExecutionContext& ctx, cnnlBceWithLogitsReduction_t reduction,
const cnnlTensorDescriptor_t grad_desc, const void* grad,
const cnnlTensorDescriptor_t input_desc, const void* input,
const cnnlTensorDescriptor_t target_desc, const void* target,
const cnnlTensorDescriptor_t weight_desc, const void* weight,
const cnnlTensorDescriptor_t pos_weight_desc, const void* pos_weight,
const cnnlTensorDescriptor_t diff_input_desc, void* diff_input);
const ExecutionContext& ctx,
cnnlBceWithLogitsReduction_t reduction,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t pos_weight_desc,
const void* pos_weight,
const cnnlTensorDescriptor_t diff_input_desc,
void* diff_input);
};
template <typename T>
......@@ -1393,22 +1877,27 @@ inline void TransposeFromMLUTensor(const ExecutionContext& ctx,
transformed_output->mutable_data<T>(
framework::DDim(output_shape.data(), dim_size), ctx.GetPlace());
}
MLUCnnlTensorDesc trans_in_desc(*transformed_input, CNNL_LAYOUT_ARRAY,
ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_desc(*transformed_output, CNNL_LAYOUT_ARRAY,
ToCnnlDataType<T>());
MLUCnnl::Transpose(ctx, perm, dim_size, trans_in_desc.get(),
GetBasePtr(transformed_input), trans_out_desc.get(),
MLUCnnlTensorDesc trans_in_desc(
*transformed_input, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_desc(
*transformed_output, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Transpose(ctx,
perm,
dim_size,
trans_in_desc.get(),
GetBasePtr(transformed_input),
trans_out_desc.get(),
GetBasePtr(transformed_output));
}
// Fills the tensor `out` with a single value supplied by the caller.
//
// Builds an MLUCnnlTensorDesc from `*out`, then calls MLUCnnl::Fill with
// CNNL_POINTER_MODE_HOST, the address of the by-value parameter `value`, the
// descriptor handle and the tensor's base pointer.
//
// Template parameter T is the type of `value`.
// NOTE(review): presumably T must match the element type of `out`; nothing in
// this function checks that - confirm against callers.
// NOTE(review): the signature line and the MLUCnnl::Fill call each appear
// twice below (packed layout and re-wrapped layout). This looks like
// interleaved before/after lines of a formatting-only diff; only one form of
// each should survive in the real header.
template <typename T>
inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx, T value,
inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx,
T value,
Tensor* out) {
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(),
GetBasePtr(out));
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), GetBasePtr(out));
}
} // namespace operators
......
......@@ -33,26 +33,43 @@ class ScatterMLUKernel : public framework::OpKernel<T> {
cnnlScatterRefMode_t mode;
if (overwrite) {
mode = CNNL_SCATTERREF_UPDATE;
MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x),
updates_desc.get(), GetBasePtr(updates),
indices_desc.get(), GetBasePtr(indices), mode);
MLUCnnl::ScatterRefFunctor(ctx,
x_desc.get(),
GetBasePtr(x),
updates_desc.get(),
GetBasePtr(updates),
indices_desc.get(),
GetBasePtr(indices),
mode);
} else {
Tensor tensor_zeros(updates->type());
tensor_zeros.mutable_data<T>(updates->dims(), ctx.GetPlace());
MLUCnnlTensorDesc tensor_zeros_desc(tensor_zeros);
float value = 0.0;
auto value_t = static_cast<T>(value);
MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value_t,
tensor_zeros_desc.get(), GetBasePtr(&tensor_zeros));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
tensor_zeros_desc.get(),
GetBasePtr(&tensor_zeros));
mode = CNNL_SCATTERREF_UPDATE;
MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x),
MLUCnnl::ScatterRefFunctor(ctx,
x_desc.get(),
GetBasePtr(x),
tensor_zeros_desc.get(),
GetBasePtr(&tensor_zeros), indices_desc.get(),
GetBasePtr(indices), mode);
GetBasePtr(&tensor_zeros),
indices_desc.get(),
GetBasePtr(indices),
mode);
mode = CNNL_SCATTERREF_ADD;
MLUCnnl::ScatterFunctor(ctx, x_desc.get(), GetBasePtr(x),
updates_desc.get(), GetBasePtr(updates),
indices_desc.get(), GetBasePtr(indices), mode);
MLUCnnl::ScatterRefFunctor(ctx,
x_desc.get(),
GetBasePtr(x),
updates_desc.get(),
GetBasePtr(updates),
indices_desc.get(),
GetBasePtr(indices),
mode);
}
paddle::framework::TensorCopy(*x, place, out);
}
......@@ -62,5 +79,6 @@ class ScatterMLUKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
// Registers the MLU kernel for the `scatter` op with two instantiations of
// ScatterMLUKernel: float and paddle::platform::float16.
// NOTE(review): the macro's opening line appears twice below (packed layout
// and one-argument-per-line layout). This looks like interleaved before/after
// lines of a formatting-only diff; only one form should survive in the real
// source file.
REGISTER_OP_MLU_KERNEL(scatter, ops::ScatterMLUKernel<float>,
REGISTER_OP_MLU_KERNEL(scatter,
ops::ScatterMLUKernel<float>,
ops::ScatterMLUKernel<paddle::platform::float16>);
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册