Unverified commit bb48b596, authored by: Y Young-Flash, committed by: GitHub

delete paddle/fluid/operators/*_mlu.* files (#52435)

Parent 0e3f7ab1
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class AbsMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnl::Abs(ctx,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class AbsGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*x);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
phi::DenseTensor sign_x;
sign_x.mutable_data<T>(x->dims(), ctx.GetPlace());
MLUCnnl::Sign(ctx,
input_desc.get(),
GetBasePtr(x),
input_desc.get(),
GetBasePtr(&sign_x));
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
input_desc.get(),
GetBasePtr(&sign_x),
input_desc.get(),
GetBasePtr(dout),
input_desc.get(),
GetBasePtr(dx),
ToCnnlDataType<T>());
}
};
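// Note: the backward pass above follows d|x|/dx = sign(x), i.e. dx = sign(x) * dout,
// implemented as MLUCnnl::Sign followed by an elementwise CNNL_OP_TENSOR_MUL of
// sign(x) and dout.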
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(abs,
ops::AbsMLUKernel<float>,
ops::AbsMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(abs_grad,
ops::AbsGradMLUKernel<float>,
ops::AbsGradMLUKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <cnnlActivationMode_t act_mode, typename T>
class ActivationMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnl::Active(ctx,
act_desc.get(),
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
// For gelu, leaky_relu
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV1 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(ctx,
act_desc.get(),
nullptr,
nullptr,
nullptr,
nullptr,
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
dx_desc.get(),
GetBasePtr(dx));
}
};
// For tanh, sigmoid
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV2 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(ctx,
act_desc.get(),
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out),
dout_desc.get(),
GetBasePtr(dout),
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
}
};
// For relu, relu6
template <cnnlActivationMode_t act_mode, typename T>
class ActivationGradMLUKernelV3 : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(act_mode, alpha);
MLUCnnl::ActiveGrad(ctx,
act_desc.get(),
nullptr,
nullptr,
nullptr,
nullptr,
dout_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(out),
dx_desc.get(),
GetBasePtr(dx));
}
};
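// Note: the three ActivationGradMLUKernel variants above differ only in which forward
// tensor they forward to MLUCnnl::ActiveGrad. V1 (gelu, leaky_relu) passes the input X
// after dout; V2 (tanh, sigmoid) passes the output Out before dout, since those gradients
// can be written in terms of the output (e.g. tanh' = 1 - out^2, sigmoid' = out * (1 - out));
// V3 (relu, relu6) passes Out in the slot V1 uses for X.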
// For sqrt
template <typename T>
class SqrtMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc input_desc(*x);
MLUCnnlTensorDesc output_desc(*out);
cnnlComputationPreference_t prefer = CNNL_COMPUTATION_FAST;
MLUCnnl::Sqrt(ctx,
prefer,
input_desc.get(),
GetBasePtr(x),
output_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class SqrtGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto place = ctx.GetPlace();
dx->mutable_data<T>(place);
MLUCnnlTensorDesc data_desc(*out);
MLUCnnl::SqrtGrad(ctx,
data_desc.get(),
GetBasePtr(out),
GetBasePtr(dout),
GetBasePtr(dx));
}
};
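// Note: since out = sqrt(x), the backward pass is dx = dout / (2 * out), so
// MLUCnnl::SqrtGrad only needs the forward output and the output gradient.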
// CNNL_LOG_E = 0,
// CNNL_LOG_2 = 1,
// CNNL_LOG_10 = 2,
template <cnnlLogBase_t Log_base, typename T>
class LogMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION;
MLUCnnl::Log(ctx,
prefer,
Log_base,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class ExpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION;
MLUCnnl::Exp(ctx,
prefer,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class ExpGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(out),
dx_desc.get(),
GetBasePtr(dx),
ToCnnlDataType<T>());
}
};
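// Note: since d(exp(x))/dx = exp(x) = out, the backward pass is simply dx = dout * out,
// implemented as an elementwise CNNL_OP_TENSOR_MUL of dout and out.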
template <typename T>
class HardSwishMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
float threshold = ctx.Attr<float>("threshold");
float scale = ctx.Attr<float>("scale");
float offset = ctx.Attr<float>("offset");
PADDLE_ENFORCE_EQ(threshold,
6.0f,
platform::errors::External(
"Not support threshold [%f] in MLU", threshold));
PADDLE_ENFORCE_EQ(
scale,
6.0f,
platform::errors::External("Not support scale [%f] in MLU", scale));
PADDLE_ENFORCE_EQ(
offset,
3.0f,
platform::errors::External("Not support offset [%f] in MLU", offset));
MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSWISH,
1.0f /*coef useless*/);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnl::Active(ctx,
act_desc.get(),
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class HardSwishGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
float threshold = ctx.Attr<float>("threshold");
float scale = ctx.Attr<float>("scale");
float offset = ctx.Attr<float>("offset");
PADDLE_ENFORCE_EQ(threshold,
6.0f,
platform::errors::External(
"Not support threshold [%f] in MLU", threshold));
PADDLE_ENFORCE_EQ(
scale,
6.0f,
platform::errors::External("Not support scale [%f] in MLU", scale));
PADDLE_ENFORCE_EQ(
offset,
3.0f,
platform::errors::External("Not support offset [%f] in MLU", offset));
auto* out = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSWISH,
1.0f /*coef useless*/);
MLUCnnl::ActiveGrad(ctx,
act_desc.get(),
nullptr,
nullptr,
nullptr,
nullptr,
dout_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(out),
dx_desc.get(),
GetBasePtr(dx));
}
};
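// Note: the enforced values match Paddle's hard_swish defaults
// (x * min(max(x + offset, 0), threshold) / scale with threshold = scale = 6, offset = 3);
// the CNNL hard-swish activation presumably supports only this fixed form, hence the
// checks in both the forward and backward kernels above.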
template <typename T>
class HardSigmoidMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
float slope = ctx.Attr<float>("slope");
float offset = ctx.Attr<float>("offset");
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSIGMOID,
1.0f /*coef useless*/,
1.0f /*sliced_dim useless*/,
slope,
offset);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnl::Active(ctx,
act_desc.get(),
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class HardSigmoidGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
float slope = ctx.Attr<float>("slope");
float offset = ctx.Attr<float>("offset");
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSIGMOID,
1.0f /*coef useless*/,
1.0f /*sliced_dim useless*/,
slope,
offset);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::ActiveGrad(ctx,
act_desc.get(),
nullptr,
nullptr,
nullptr,
nullptr,
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
dx_desc.get(),
GetBasePtr(dx));
}
};
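// Note: Paddle defines hard_sigmoid(x) = max(0, min(1, slope * x + offset)), so the
// gradient is slope * dout inside the linear region and 0 elsewhere; slope and offset
// are forwarded to the CNNL activation descriptor in both kernels above.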
template <typename T>
class FloorMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnl::Floor(ctx,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename DeviceContext, typename T>
class ReciprocalMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Reciprocal(
ctx, x_desc.get(), GetBasePtr(x), out_desc.get(), GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class ReciprocalGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto place = ctx.GetPlace();
dx->mutable_data<T>(place);
phi::DenseTensor square_out;
square_out.Resize(out->dims());
square_out.mutable_data<T>(place);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlTensorDesc square_out_desc(square_out);
MLUCnnl::Square(ctx,
out_desc.get(),
GetBasePtr(out),
square_out_desc.get(),
GetBasePtr(&square_out));
cnnlOpTensorDesc_t op_tensor_op = CNNL_OP_TENSOR_MUL;
cnnlDataType_t op_tensor_comp_type = CNNL_DTYPE_FLOAT;
cnnlNanPropagation_t op_tensor_nan_opt = CNNL_NOT_PROPAGATE_NAN;
MLUCnnlOpTensorDesc op_tensor_desc(
op_tensor_op, op_tensor_comp_type, op_tensor_nan_opt);
float alpha1_float = -1;
float alpha2_float = 1;
float beta_float = 0;
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
square_out_desc.get(),
GetBasePtr(&square_out),
dx_desc.get(),
GetBasePtr(dx),
op_tensor_comp_type,
alpha1_float,
alpha2_float,
beta_float);
}
};
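// Note: with out = 1 / x, the gradient is dx = -dout / x^2 = -dout * out^2. The kernel
// squares out and then calls OpTensor with alpha1 = -1, alpha2 = 1, beta = 0, assuming
// the usual op-tensor semantics c = op(alpha1 * a, alpha2 * b) + beta * c.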
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// reciprocal
REGISTER_OP_MLU_KERNEL(
reciprocal,
ops::ReciprocalMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::ReciprocalMLUKernel<paddle::platform::MLUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
reciprocal_grad,
ops::ReciprocalGradMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::ReciprocalGradMLUKernel<paddle::platform::MLUDeviceContext,
paddle::platform::float16>);
// relu
REGISTER_OP_MLU_KERNEL(
relu,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
relu_grad,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU, float>,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU,
paddle::platform::float16>);
// relu6
REGISTER_OP_MLU_KERNEL(
relu6,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU6, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_RELU6, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
relu6_grad,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU6, float>,
ops::ActivationGradMLUKernelV3<CNNL_ACTIVATION_RELU6,
paddle::platform::float16>);
// sigmoid
REGISTER_OP_MLU_KERNEL(sigmoid,
ops::ActivationMLUKernel<CNNL_ACTIVATION_SIGMOID, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_SIGMOID,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
sigmoid_grad,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_SIGMOID, float>,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_SIGMOID,
paddle::platform::float16>);
// tanh
REGISTER_OP_MLU_KERNEL(
tanh,
ops::ActivationMLUKernel<CNNL_ACTIVATION_TANH, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_TANH, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
tanh_grad,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_TANH, float>,
ops::ActivationGradMLUKernelV2<CNNL_ACTIVATION_TANH,
paddle::platform::float16>);
// gelu
REGISTER_OP_MLU_KERNEL(
gelu,
ops::ActivationMLUKernel<CNNL_ACTIVATION_GELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_GELU, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
gelu_grad,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_GELU, float>,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_GELU,
paddle::platform::float16>);
// leaky_relu
REGISTER_OP_MLU_KERNEL(
leaky_relu,
ops::ActivationMLUKernel<CNNL_ACTIVATION_LEAKYRELU, float>,
ops::ActivationMLUKernel<CNNL_ACTIVATION_LEAKYRELU,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
leaky_relu_grad,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_LEAKYRELU, float>,
ops::ActivationGradMLUKernelV1<CNNL_ACTIVATION_LEAKYRELU,
paddle::platform::float16>);
// sqrt
REGISTER_OP_MLU_KERNEL(sqrt,
ops::SqrtMLUKernel<float>,
ops::SqrtMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(sqrt_grad,
ops::SqrtGradMLUKernel<float>,
ops::SqrtGradMLUKernel<paddle::platform::float16>);
// log, log2, log10
REGISTER_OP_MLU_KERNEL(
log,
ops::LogMLUKernel<CNNL_LOG_E, float>,
ops::LogMLUKernel<CNNL_LOG_E, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
log2,
ops::LogMLUKernel<CNNL_LOG_2, float>,
ops::LogMLUKernel<CNNL_LOG_2, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
log10,
ops::LogMLUKernel<CNNL_LOG_10, float>,
ops::LogMLUKernel<CNNL_LOG_10, paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(exp,
ops::ExpMLUKernel<float>,
ops::ExpMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(exp_grad,
ops::ExpGradMLUKernel<float>,
ops::ExpGradMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(hard_swish,
ops::HardSwishMLUKernel<float>,
ops::HardSwishMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(hard_swish_grad,
ops::HardSwishGradMLUKernel<float>,
ops::HardSwishGradMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(hard_sigmoid,
ops::HardSigmoidMLUKernel<float>,
ops::HardSigmoidMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
hard_sigmoid_grad,
ops::HardSigmoidGradMLUKernel<float>,
ops::HardSigmoidGradMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(floor,
ops::FloorMLUKernel<float>,
ops::FloorMLUKernel<paddle::platform::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ArgMaxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto axis = static_cast<int>(ctx.Attr<int64_t>("axis"));
auto dtype = ctx.Attr<int>("dtype");
const bool& flatten = ctx.Attr<bool>("flatten");
if (x->numel() == 0) return;
PADDLE_ENFORCE_EQ(
(dtype == 2 || dtype == 3),
true,
platform::errors::InvalidArgument(
"The attribute of dtype in argmax op must be [%s] or [%s], "
"but "
"received [%s]",
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
static_cast<framework::proto::VarType::Type>(dtype))));
if (axis < 0) {
framework::DDim x_dims;
x_dims = x->dims();
axis += x_dims.size();
}
phi::DenseTensor flatten_x(x->type());
flatten_x.ShareDataWith(*x);
if (flatten) {
flatten_x.Resize(phi::make_ddim({x->numel()}));
// if flatten, the axis is treated as 0
axis = 0;
}
std::vector<int> reduce_dims;
reduce_dims.push_back(axis);
auto out_dims = out->dims();
int out_count = out_dims[0];
for (int i = 1; i < out_dims.size(); i++) {
out_count = out_count * out_dims[i];
}
size_t indices_size_inbytes = out_count * sizeof(int32_t);
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor value_out =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(out->dims(), dev_ctx);
MLUCnnlTensorDesc value_out_desc(value_out);
MLUCnnlTensorDesc input_desc(
flatten_x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(flatten_x.dtype()));
MLUCnnlReduceDesc reduction_desc(reduce_dims,
CNNL_REDUCE_MAX,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_ONLY_INDICES,
CNNL_32BIT_INDICES);
if (dtype == 2) {
out->template mutable_data<int32_t>(ctx.GetPlace());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(&flatten_x),
indices_size_inbytes /*indices_size*/,
GetBasePtr(out),
nullptr,
value_out_desc.get(),
GetBasePtr(&value_out));
} else {
out->template mutable_data<int64_t>(ctx.GetPlace());
phi::DenseTensor out_int32 =
ctx.AllocateTmpTensor<int32_t, MLUDeviceContext>(out->dims(),
dev_ctx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(&flatten_x),
indices_size_inbytes /*indices_size*/,
GetBasePtr(&out_int32),
nullptr,
value_out_desc.get(),
GetBasePtr(&value_out));
// cast indices type to int64
MLUCnnlTensorDesc out_int32_desc(out_int32);
MLUCnnlTensorDesc cast_output_desc(*out);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
out_int32_desc.get(),
GetBasePtr(&out_int32),
cast_output_desc.get(),
GetBasePtr(out));
}
}
};
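// Note: the "dtype" attribute selects the index type of Out (2 -> int32, 3 -> int64).
// The CNNL reduce emits int32 indices, so the int64 case reduces into a temporary int32
// tensor and casts it to int64 afterwards.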
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(arg_max,
ops::ArgMaxMLUKernel<int>,
ops::ArgMaxMLUKernel<float>,
ops::ArgMaxMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ArgsortMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto* indices = ctx.Output<phi::DenseTensor>("Indices");
const auto& place = ctx.GetPlace();
const auto& sorted = true;
const bool descending = ctx.Attr<bool>("descending");
// axis < 0, calculate the real axis
int axis = static_cast<int>(ctx.Attr<int>("axis"));
if (axis < 0) {
const auto& in_dims = input->dims();
axis += in_dims.size();
}
auto in_dims = input->dims();
size_t k = in_dims[axis];
output->mutable_data<T>(place);
indices->mutable_data<int64_t>(place);
// cnnl only supports int32/int16 indices
phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
indices_int32.Resize(indices->dims());
indices_int32.mutable_data<int32_t>(place);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc values_output_desc(*output);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnl::TopK(ctx,
k,
axis,
descending,
sorted,
input_desc.get(),
GetBasePtr(input),
values_output_desc.get(),
GetBasePtr(output),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
// cast indices type to int64
MLUCnnlTensorDesc cast_output_desc(*indices);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
indices_int32_desc.get(),
GetBasePtr(&indices_int32),
cast_output_desc.get(),
GetBasePtr(indices));
}
};
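// Note: argsort is expressed as a full TopK along `axis` with k = in_dims[axis],
// sorted = true, and the direction taken from the "descending" attribute; the int32
// indices returned by CNNL are then cast into the int64 Indices output.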
template <typename T>
class ArgsortGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* indices = ctx.Input<phi::DenseTensor>("Indices");
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
int axis = ctx.Attr<int>("axis");
dx->mutable_data<T>(ctx.GetPlace());
auto in_dims = indices->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
if (dout->numel() == 0) return;
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlTensorDesc indices_desc(*indices);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::ScatterFunctor(ctx,
dx_desc.get(),
GetBasePtr(dx),
dout_desc.get(),
GetBasePtr(dout),
indices_desc.get(),
GetBasePtr(indices),
axis);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(argsort,
ops::ArgsortMLUKernel<paddle::platform::float16>,
ops::ArgsortMLUKernel<float>,
ops::ArgsortMLUKernel<int8_t>,
ops::ArgsortMLUKernel<uint8_t>,
ops::ArgsortMLUKernel<int16_t>,
ops::ArgsortMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(argsort_grad,
ops::ArgsortGradMLUKernel<paddle::platform::float16>,
ops::ArgsortGradMLUKernel<float>,
ops::ArgsortGradMLUKernel<int8_t>,
ops::ArgsortGradMLUKernel<uint8_t>,
ops::ArgsortGradMLUKernel<int16_t>,
ops::ArgsortGradMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/operators/assign_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
class AssignMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Assign(
ctx, x_desc.get(), GetBasePtr(x), out_desc.get(), GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(assign,
ops::AssignMLUKernel<int>,
ops::AssignMLUKernel<float>,
ops::AssignMLUKernel<plat::float16>,
ops::AssignMLUKernel<bool>)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/assign_value_op.h"
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(assign_value,
ops::AssignValueKernel<bool>,
ops::AssignValueKernel<int>,
ops::AssignValueKernel<int64_t>,
ops::AssignValueKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUBatchNormOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto &place = ctx.GetPlace();
const float epsilon = ctx.Attr<float>("epsilon");
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
bool test_mode = is_test && (!trainable_stats);
bool global_stats = test_mode || use_global_stats;
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / N / C;
const auto *running_mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *running_var = ctx.Input<phi::DenseTensor>("Variance");
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
auto *y = ctx.Output<phi::DenseTensor>("Y");
auto *mean_out = ctx.Output<phi::DenseTensor>("MeanOut");
auto *variance_out = ctx.Output<phi::DenseTensor>("VarianceOut");
auto *saved_mean = ctx.Output<phi::DenseTensor>("SavedMean");
auto *saved_variance = ctx.Output<phi::DenseTensor>("SavedVariance");
// alloc memory
y->mutable_data<T>(place);
mean_out->mutable_data<MPDType>(place);
variance_out->mutable_data<MPDType>(place);
saved_mean->mutable_data<MPDType>(place);
saved_variance->mutable_data<MPDType>(place);
phi::DenseTensor transformed_x;
phi::DenseTensor transformed_y;
const int transformed_dim_size = 4;
const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C};
MLUCnnlTensorDesc transformed_desc(transformed_dim_size,
transformed_shape,
ToCnnlDataType<T>(),
CNNL_LAYOUT_NHWC);
MLUCnnlTensorDesc others_input_desc(*scale);
// If the input is 2-D and the format is NCHW, it can be regarded as NHWC,
// so no transpose is needed.
bool need_transpose =
(data_layout == DataLayout::kNCHW && x_dims.size() != 2);
if (need_transpose) {
auto &dev_ctx = ctx.template device_context<MLUDeviceContext>();
transformed_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
transformed_y = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
const int x_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc x_reshaped_desc(
transformed_dim_size, x_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 2, 3, 1};
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
x_reshaped_desc.get(),
GetBasePtr(x),
transformed_desc.get(),
GetBasePtr(&transformed_x));
} else {
transformed_x = *x;
transformed_y = *y;
}
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<phi::DenseTensor>("MomentumTensor");
phi::DenseTensor mom_cpu;
framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
MLUCnnl::FusedBatchNorm(ctx,
!global_stats,
transformed_desc.get(),
GetBasePtr(&transformed_x),
others_input_desc.get(),
GetBasePtr(scale),
GetBasePtr(bias),
GetBasePtr(running_mean),
GetBasePtr(running_var),
epsilon,
momentum,
transformed_desc.get(),
GetBasePtr(&transformed_y),
GetBasePtr(mean_out),
GetBasePtr(variance_out),
GetBasePtr(saved_mean),
GetBasePtr(saved_variance));
if (need_transpose) {
const int y_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc y_reshaped_desc(
transformed_dim_size, y_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 3, 1, 2};
MLUCnnl::Transpose(ctx,
perm,
transformed_y.dims().size(),
transformed_desc.get(),
GetBasePtr(&transformed_y),
y_reshaped_desc.get(),
GetBasePtr(y));
}
}
};
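// Note: the CNNL fused batch norm here is given NHWC descriptors. For NCHW inputs with
// rank > 2 the kernel views x as [N, C, H*W, 1] and transposes it with perm {0, 2, 3, 1}
// into the NHWC-shaped buffer [N, H*W, 1, C]; the result is transposed back with
// perm {0, 3, 1, 2} at the end.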
template <typename T>
class MLUBatchNormGradOpKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto *d_y = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
const auto *saved_mean = ctx.Input<phi::DenseTensor>("SavedMean");
// SavedVariance has been inverted in the forward operator (it holds the inverse variance)
const auto *saved_inv_variance =
ctx.Input<phi::DenseTensor>("SavedVariance");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
const float epsilon = ctx.Attr<float>("epsilon");
DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_scale =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
auto &dev_ctx = ctx.template device_context<MLUDeviceContext>();
auto d_x_tmp =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(x->dims(), dev_ctx);
auto scale_grad_tmp = ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(
scale->dims(), dev_ctx);
auto bias_grad_tmp =
ctx.AllocateTmpTensor<MPDType, MLUDeviceContext>(bias->dims(), dev_ctx);
if (d_x == nullptr) {
d_x = &d_x_tmp;
}
if (d_scale == nullptr) {
d_scale = &scale_grad_tmp;
}
if (d_bias == nullptr) {
d_bias = &bias_grad_tmp;
}
const auto &place = ctx.GetPlace();
d_x->mutable_data<T>(place);
d_scale->mutable_data<MPDType>(place);
d_bias->mutable_data<MPDType>(place);
use_global_stats = is_test || use_global_stats;
const auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(
x_dims.size(),
2,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be larger than 1."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
PADDLE_ENFORCE_LE(
x_dims.size(),
5,
platform::errors::InvalidArgument(
"The size of input X's dimensions should be less than 6."
"But received: the size of input X's dimensions is [%d]",
x_dims.size()));
const int N = x_dims[0];
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / N / C;
phi::DenseTensor transformed_d_y;
phi::DenseTensor transformed_x;
phi::DenseTensor transformed_d_x;
const int transformed_dim_size = 4;
const int transformed_shape[transformed_dim_size] = {N, sample_size, 1, C};
MLUCnnlTensorDesc transformed_desc(transformed_dim_size,
transformed_shape,
ToCnnlDataType<T>(),
CNNL_LAYOUT_NHWC);
MLUCnnlTensorDesc others_input_desc(*scale);
bool need_transpose =
(data_layout == DataLayout::kNCHW && x_dims.size() != 2);
if (need_transpose) {
transformed_d_y = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
transformed_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
transformed_d_x = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
framework::DDim(transformed_shape, transformed_dim_size), dev_ctx);
const int org_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc org_reshaped_desc(
transformed_dim_size, org_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 2, 3, 1};
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
org_reshaped_desc.get(),
GetBasePtr(d_y),
transformed_desc.get(),
GetBasePtr(&transformed_d_y));
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
org_reshaped_desc.get(),
GetBasePtr(x),
transformed_desc.get(),
GetBasePtr(&transformed_x));
} else {
transformed_d_y = *d_y;
transformed_x = *x;
transformed_d_x = *d_x;
}
if (use_global_stats) {
const auto *running_mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *running_variance = ctx.Input<phi::DenseTensor>("Variance");
MLUCnnl::FusedBatchNormGrad(ctx,
false /*is_training*/,
transformed_desc.get(),
GetBasePtr(&transformed_d_y),
transformed_desc.get(),
GetBasePtr(&transformed_x),
others_input_desc.get(),
GetBasePtr(scale),
GetBasePtr(running_mean),
GetBasePtr(running_variance),
epsilon,
transformed_desc.get(),
GetBasePtr(&transformed_d_x),
GetBasePtr(d_scale),
GetBasePtr(d_bias));
} else {
MLUCnnl::FusedBatchNormGrad(ctx,
true /*is_training*/,
transformed_desc.get(),
GetBasePtr(&transformed_d_y),
transformed_desc.get(),
GetBasePtr(&transformed_x),
others_input_desc.get(),
GetBasePtr(scale),
GetBasePtr(saved_mean),
GetBasePtr(saved_inv_variance),
epsilon,
transformed_desc.get(),
GetBasePtr(&transformed_d_x),
GetBasePtr(d_scale),
GetBasePtr(d_bias));
}
if (need_transpose) {
const int d_x_reshaped[] = {N, C, sample_size, 1};
MLUCnnlTensorDesc d_x_reshaped_desc(
transformed_dim_size, d_x_reshaped, ToCnnlDataType<T>());
const std::vector<int> perm = {0, 3, 1, 2};
MLUCnnl::Transpose(ctx,
perm,
transformed_dim_size,
transformed_desc.get(),
GetBasePtr(&transformed_d_x),
d_x_reshaped_desc.get(),
GetBasePtr(d_x));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(batch_norm,
ops::MLUBatchNormOpKernel<float>,
ops::MLUBatchNormOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(batch_norm_grad,
ops::MLUBatchNormGradOpKernel<float>,
ops::MLUBatchNormGradOpKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class BCELossMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc label_desc(*labels);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::BceLoss(ctx,
CNNL_BCE_LOSS_NONE,
x_desc.get(),
GetBasePtr(x),
label_desc.get(),
GetBasePtr(labels),
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class BCELossGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc label_desc(*labels);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnl::BceLossBackward(ctx,
CNNL_BCE_LOSS_NONE,
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
label_desc.get(),
GetBasePtr(labels),
nullptr,
nullptr,
x_desc.get(),
GetBasePtr(dx));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(bce_loss,
ops::BCELossMLUKernel<float>,
ops::BCELossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(bce_loss_grad,
ops::BCELossGradMLUKernel<float>,
ops::BCELossGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
namespace paddle {
namespace operators {
template <typename T>
class CastMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto src_type = static_cast<VT::Type>(ctx.Attr<int>("in_dtype"));
auto dst_type = static_cast<VT::Type>(ctx.Attr<int>("out_dtype"));
auto place = ctx.GetPlace();
if (src_type == dst_type) {
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
output->mutable_data<T>(place);
framework::TensorCopy(*input, place, dev_ctx, output);
return;
}
PADDLE_ENFORCE_EQ(MLUSupportsCast(src_type, dst_type),
true,
platform::errors::InvalidArgument(
"MLU not support cast [%d] to [%d]",
framework::DataTypeToString(src_type),
framework::DataTypeToString(dst_type)));
output->mutable_data(place, framework::TransToPhiDataType(dst_type));
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc output_desc(*output);
cnnlCastDataType_t cast_type = GetCastDataType(src_type, dst_type);
MLUCnnl::Cast(ctx,
cast_type,
input_desc.get(),
GetBasePtr(input),
output_desc.get(),
GetBasePtr(output));
}
};
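// Note: when in_dtype == out_dtype the kernel degenerates to a plain TensorCopy on the
// device; otherwise it validates the (src, dst) pair with MLUSupportsCast before calling
// MLUCnnl::Cast with the cast type returned by GetCastDataType.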
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(cast,
ops::CastMLUKernel<float>,
ops::CastMLUKernel<int>,
ops::CastMLUKernel<int16_t>,
ops::CastMLUKernel<uint8_t>,
ops::CastMLUKernel<bool>,
ops::CastMLUKernel<int64_t>,
ops::CastMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ClipMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto min = static_cast<T>(ctx.Attr<float>("min"));
auto max = static_cast<T>(ctx.Attr<float>("max"));
if (ctx.HasInput("Min")) {
phi::DenseTensor min_cpu;
auto* min_tensor = ctx.Input<phi::DenseTensor>("Min");
auto* min_data = min_tensor->data<T>();
if (platform::is_mlu_place(min_tensor->place())) {
paddle::framework::TensorCopySync(
*min_tensor, platform::CPUPlace(), &min_cpu);
min_data = min_cpu.data<T>();
}
min = min_data[0];
}
if (ctx.HasInput("Max")) {
phi::DenseTensor max_cpu;
auto* max_tensor = ctx.Input<phi::DenseTensor>("Max");
auto* max_data = max_tensor->data<T>();
if (platform::is_mlu_place(max_tensor->place())) {
paddle::framework::TensorCopySync(
*max_tensor, platform::CPUPlace(), &max_cpu);
max_data = max_cpu.data<T>();
}
max = max_data[0];
}
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Clip(ctx,
x_desc.get(),
GetBasePtr(x),
static_cast<const void*>(&min),
static_cast<const void*>(&max),
GetBasePtr(out));
}
};
template <typename T>
class ClipGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
auto* min_tensor =
ctx.HasInput("Min") ? ctx.Input<phi::DenseTensor>("Min") : nullptr;
auto* max_tensor =
ctx.HasInput("Max") ? ctx.Input<phi::DenseTensor>("Max") : nullptr;
auto min_val = ctx.Attr<float>("min");
if (min_tensor) {
phi::DenseTensor min_data;
framework::TensorCopy(
*min_tensor,
platform::CPUPlace(),
ctx.template device_context<platform::DeviceContext>(),
&min_data);
ctx.template device_context<paddle::platform::MLUDeviceContext>().Wait();
min_val = static_cast<float>(min_data.data<T>()[0]);
}
auto max_val = ctx.Attr<float>("max");
if (max_tensor) {
phi::DenseTensor max_data;
framework::TensorCopy(
*max_tensor,
platform::CPUPlace(),
ctx.template device_context<platform::DeviceContext>(),
&max_data);
ctx.template device_context<paddle::platform::MLUDeviceContext>().Wait();
max_val = static_cast<float>(max_data.data<T>()[0]);
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnl::HardtanhBackward(ctx,
x_desc.get(),
GetBasePtr(x),
dout_desc.get(),
GetBasePtr(dout),
max_val,
min_val,
dx_desc.get(),
GetBasePtr(dx));
}
};
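// Note: clip and hardtanh share the same gradient (pass dout through where
// min < x < max, zero elsewhere), so the backward kernel reuses
// MLUCnnl::HardtanhBackward with min_val/max_val taken from the attributes or the
// optional Min/Max input tensors.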
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(clip,
ops::ClipMLUKernel<float>,
ops::ClipMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(clip_grad,
ops::ClipGradMLUKernel<float>,
ops::ClipGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class ConcatMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
phi::DenseTensor* out = ctx.Output<phi::DenseTensor>("Out");
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
auto axis = ctx.Attr<int>("axis");
auto ins_size = ins.size();
bool need_resize_out_dims = false;
if (ctx.HasInput("AxisTensor")) {
auto* axis_tensor = ctx.Input<phi::DenseTensor>("AxisTensor");
axis = phi::GetVectorFromTensor<int>(axis_tensor)[0];
need_resize_out_dims = true;
}
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
if (need_resize_out_dims) {
const size_t n = ins.size();
std::vector<framework::DDim> ins_dims(n);
for (size_t i = 0; i < n; i++) {
ins_dims[i] = ins[i]->dims();
}
framework::DDim out_dims =
phi::funcs::ComputeAndCheckShape(true, ins_dims, axis);
out->Resize(out_dims);
}
const int axis_t = axis;
const int ins_size_t = ins_size;
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
// collect input descriptors and data pointers
std::vector<const void*> inputs;
std::vector<MLUCnnlTensorDesc> input_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < ins_size; i++) {
input_descs.emplace_back(MLUCnnlTensorDesc(
*ins[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(ins[i]->dtype())));
desc_vector.push_back(input_descs.back().get());
inputs.push_back(GetBasePtr(ins[i]));
}
// init out tensors
MLUCnnlTensorDesc output_desc(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
// launch the cnnl concat on the collected inputs
MLUCnnl::Concat(ctx,
ins_size_t,
axis_t,
desc_vector.data(),
inputs.data(),
output_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class ConcatGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
auto out_var_names = ctx.OutputNames(framework::GradVarName("X"));
auto outs = ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("X"));
auto axis = ctx.Attr<int>("axis");
int split_num = ins.size();
PADDLE_ENFORCE_NOT_NULL(ins[0],
platform::errors::NotFound(
"The first input tensor is not initalized."));
if (ctx.HasInput("AxisTensor")) {
auto* axis_tensor = ctx.Input<phi::DenseTensor>("AxisTensor");
axis = phi::GetVectorFromTensor<int>(axis_tensor)[0];
}
axis = ComputeAxis(static_cast<int64_t>(axis),
static_cast<int64_t>(ins[0]->dims().size()));
PADDLE_ENFORCE_GE(axis,
0,
platform::errors::InvalidArgument(
"concat_grad: axis should be larger than or "
"equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LT(
axis,
out_grad->dims().size(),
platform::errors::InvalidArgument(
"concat_grad: axis should be less than ins[0]->dims()!"
"But received axis is %d, while ins[0]->dims()"
"size is %d.",
axis,
out_grad->dims().size()));
// get output tensors whose names are not kEmptyVarName
std::vector<void*> outputs_vec;
std::vector<phi::DenseTensor> tmp_outputs_vec;
std::vector<MLUCnnlTensorDesc> output_descs;
std::vector<cnnlTensorDescriptor_t> descs_vec;
for (size_t j = 0; j < outs.size(); ++j) {
if (out_var_names[j] != framework::kEmptyVarName &&
outs[j]->numel() != 0UL) {
outs[j]->mutable_data<T>(ctx.GetPlace());
output_descs.emplace_back(MLUCnnlTensorDesc(*outs[j]));
outputs_vec.push_back(GetBasePtr(outs[j]));
} else {
phi::DenseTensor tmp_tensor;
tmp_tensor.mutable_data<T>(ins[j]->dims(), ctx.GetPlace());
tmp_outputs_vec.push_back(tmp_tensor);
output_descs.emplace_back(MLUCnnlTensorDesc(*ins[j]));
outputs_vec.push_back(GetBasePtr(&(tmp_outputs_vec.back())));
}
descs_vec.push_back(output_descs.back().get());
}
MLUCnnlTensorDesc out_grad_desc(*out_grad);
MLUCnnl::Split(ctx,
static_cast<int>(split_num),
static_cast<int>(axis),
out_grad_desc.get(),
GetBasePtr(out_grad),
descs_vec.data(),
outputs_vec.data());
}
};
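// Note: concat_grad is implemented as MLUCnnl::Split of dout along `axis`. Outputs whose
// gradient variable is empty (kEmptyVarName) or zero-sized still need a destination, so
// throw-away temporaries shaped like the corresponding inputs are allocated for them.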
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(concat,
ops::ConcatMLUKernel<float>,
ops::ConcatMLUKernel<paddle::platform::float16>,
ops::ConcatMLUKernel<int64_t>,
ops::ConcatMLUKernel<bool>,
ops::ConcatMLUKernel<int>,
ops::ConcatMLUKernel<uint8_t>);
REGISTER_OP_MLU_KERNEL(concat_grad,
ops::ConcatGradMLUKernel<float>,
ops::ConcatGradMLUKernel<paddle::platform::float16>,
ops::ConcatGradMLUKernel<int64_t>,
ops::ConcatGradMLUKernel<bool>,
ops::ConcatGradMLUKernel<int>,
ops::ConcatGradMLUKernel<uint8_t>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
template <typename T>
class MLUConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
auto* filter = ctx.Input<phi::DenseTensor>("Filter");
auto* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = data_format == "NHWC";
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_tensor(output->type());
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_tensor.ShareDataWith(*output);
} else {
// transpose input from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
auto output_dims = output->dims();
output_tensor.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
}
input_tensor.set_layout(DataLayout::kNHWC);
output_tensor.set_layout(DataLayout::kNHWC);
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
input_tensor, data_layout, ToCnnlDataType(input_tensor.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.type()));
MLUCnnlTensorDesc output_desc(
output_tensor, data_layout, ToCnnlDataType(output_tensor.dtype()));
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
ToCnnlDataType<T>());
MLUCnnl::ConvolutionForward(ctx,
conv_desc.get(),
nullptr /*alpha*/,
nullptr /*beta*/,
nullptr /*bias_desc*/,
nullptr /*bias_ptr*/,
input_desc.get(),
GetBasePtr(&input_tensor),
filter_desc.get(),
GetBasePtr(&trans_filter),
output_desc.get(),
GetBasePtr(&output_tensor));
if (!channel_last) {
// transpose output from NHWC to NCHW
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&output_tensor,
output,
false /*need_reshape_or_alloc*/);
}
}
};
template <typename T>
class MLUConvGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto input = ctx.Input<phi::DenseTensor>("Input");
auto filter = ctx.Input<phi::DenseTensor>("Filter");
auto output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
auto input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
auto filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = data_format == "NHWC";
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
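// Gradients are computed in NHWC as well: input and output_grad are shared or
// transposed into NHWC tensors, and the results are transposed back at the end.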
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_grad_tensor(output_grad->type());
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_grad_tensor.ShareDataWith(*output_grad);
} else {
// transpose input and output_grad from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
output_grad,
&output_grad_tensor,
true /*need_reshape_or_alloc*/);
}
input_tensor.set_layout(DataLayout::kNHWC);
output_grad_tensor.set_layout(DataLayout::kNHWC);
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
auto filter_grad_dims = filter_grad->dims();
phi::DenseTensor temp_filter_grad(filter_grad->type());
temp_filter_grad.mutable_data<T>({filter_grad_dims[0],
filter_grad_dims[2],
filter_grad_dims[3],
filter_grad_dims[1]},
ctx.GetPlace());
cnnlDataType_t tensor_dtype = ToCnnlDataType<T>();
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(input_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc out_grad_desc(
output_grad_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc temp_filter_grad_desc(
temp_filter_grad, data_layout, tensor_dtype);
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
tensor_dtype);
MLUCnnl::ConvBackpropFilter(ctx,
conv_desc.get(),
input_desc.get(),
GetBasePtr(&input_tensor),
out_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
temp_filter_grad_desc.get(),
GetBasePtr(&temp_filter_grad));
// transpose filter_grad from MHWC to MCHW
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&temp_filter_grad,
filter_grad,
false /*need_reshape_or_alloc*/);
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor input_grad_tensor(input_grad->type());
if (channel_last) {
input_grad_tensor.ShareDataWith(*input_grad);
} else {
auto input_grad_dims = input_grad->dims();
input_grad_tensor.mutable_data<T>({input_grad_dims[0],
input_grad_dims[2],
input_grad_dims[3],
input_grad_dims[1]},
ctx.GetPlace());
}
input_grad_tensor.set_layout(DataLayout::kNHWC);
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
cnnlDataType_t tensor_dtype = ToCnnlDataType<T>();
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, tensor_dtype);
MLUCnnlTensorDesc out_grad_desc(
output_grad_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc in_grad_desc(
input_grad_tensor, data_layout, tensor_dtype);
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
tensor_dtype);
MLUCnnl::ConvBackpropInput(ctx,
conv_desc.get(),
filter_desc.get(),
GetBasePtr(&trans_filter),
out_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
in_grad_desc.get(),
GetBasePtr(&input_grad_tensor));
if (!channel_last) {
// transpose input_grad from NHWC to NCHW
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&input_grad_tensor,
input_grad,
false /*need_reshape_or_alloc*/);
}
}
}
};
template <typename T>
class MLUDepthwiseConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
auto* filter = ctx.Input<phi::DenseTensor>("Filter");
auto* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = data_format == "NHWC";
int groups;
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_tensor(output->type());
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
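// For depthwise convolution, groups equals the number of input channels,
// taken from the channel dimension of the input below.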
if (channel_last) {
groups = in_dims[3];
input_tensor.ShareDataWith(*input);
output_tensor.ShareDataWith(*output);
} else {
// transpose input from NCHW to NHWC
groups = in_dims[1];
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
auto output_dims = output->dims();
output_tensor.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
}
input_tensor.set_layout(DataLayout::kNHWC);
output_tensor.set_layout(DataLayout::kNHWC);
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
input_tensor, data_layout, ToCnnlDataType(input_tensor.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.type()));
MLUCnnlTensorDesc output_desc(
output_tensor, data_layout, ToCnnlDataType(output_tensor.dtype()));
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
ToCnnlDataType<T>());
MLUCnnl::ConvolutionForward(ctx,
conv_desc.get(),
nullptr /*alpha*/,
nullptr /*beta*/,
nullptr /*bias_desc*/,
nullptr /*bias_ptr*/,
input_desc.get(),
GetBasePtr(&input_tensor),
filter_desc.get(),
GetBasePtr(&trans_filter),
output_desc.get(),
GetBasePtr(&output_tensor));
if (!channel_last) {
// transpose output from NHWC to NCHW
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&output_tensor,
output,
false /*need_reshape_or_alloc*/);
}
}
};
template <typename T>
class MLUDepthwiseConvGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto input = ctx.Input<phi::DenseTensor>("Input");
auto filter = ctx.Input<phi::DenseTensor>("Filter");
auto output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
auto input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
auto filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const bool channel_last = data_format == "NHWC";
// update padding and dilation
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
int groups;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_grad_tensor(output_grad->type());
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
const std::vector<int> perm_hwcm_to_mchw = {3, 2, 0, 1};
const std::vector<int> perm_mchw_to_hwcm = {2, 3, 1, 0};
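// The filter gradient is produced directly in HWCN (HWCM) layout, which the
// cnnl kernel handles more efficiently, and transposed back to MCHW afterwards.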
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_grad_tensor.ShareDataWith(*output_grad);
groups = in_dims[3];
} else {
groups = in_dims[1];
// transpose input and output_grad from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
output_grad,
&output_grad_tensor,
true /*need_reshape_or_alloc*/);
}
input_tensor.set_layout(DataLayout::kNHWC);
output_grad_tensor.set_layout(DataLayout::kNHWC);
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
auto filter_grad_dims = filter_grad->dims();
phi::DenseTensor temp_filter_grad(filter_grad->type());
// diff_w is set up in HWCN layout for better performance; see the CNNL
// documentation for details.
temp_filter_grad.mutable_data<T>({filter_grad_dims[perm_mchw_to_hwcm[0]],
filter_grad_dims[perm_mchw_to_hwcm[1]],
filter_grad_dims[perm_mchw_to_hwcm[2]],
filter_grad_dims[perm_mchw_to_hwcm[3]]},
ctx.GetPlace());
cnnlDataType_t tensor_dtype = ToCnnlDataType<T>();
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(input_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc out_grad_desc(
output_grad_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc temp_filter_grad_desc(
temp_filter_grad, CNNL_LAYOUT_HWCN, tensor_dtype);
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
tensor_dtype);
MLUCnnl::ConvBackpropFilter(ctx,
conv_desc.get(),
input_desc.get(),
GetBasePtr(&input_tensor),
out_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
temp_filter_grad_desc.get(),
GetBasePtr(&temp_filter_grad));
// transpose filter_grad from HWCM to MCHW
TransposeFromMLUTensor<T>(ctx,
perm_hwcm_to_mchw,
&temp_filter_grad,
filter_grad,
false /*need_reshape_or_alloc*/);
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor input_grad_tensor(input_grad->type());
if (channel_last) {
input_grad_tensor.ShareDataWith(*input_grad);
} else {
auto input_grad_dims = input_grad->dims();
input_grad_tensor.mutable_data<T>({input_grad_dims[0],
input_grad_dims[2],
input_grad_dims[3],
input_grad_dims[1]},
ctx.GetPlace());
}
input_grad_tensor.set_layout(DataLayout::kNHWC);
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
cnnlDataType_t tensor_dtype = ToCnnlDataType<T>();
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc filter_desc(trans_filter, data_layout, tensor_dtype);
MLUCnnlTensorDesc out_grad_desc(
output_grad_tensor, data_layout, tensor_dtype);
MLUCnnlTensorDesc in_grad_desc(
input_grad_tensor, data_layout, tensor_dtype);
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
tensor_dtype);
MLUCnnl::ConvBackpropInput(ctx,
conv_desc.get(),
filter_desc.get(),
GetBasePtr(&trans_filter),
out_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
in_grad_desc.get(),
GetBasePtr(&input_grad_tensor));
if (!channel_last) {
// transpose input_grad from NHWC to NCHW
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&input_grad_tensor,
input_grad,
false /*need_reshape_or_alloc*/);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(conv2d,
ops::MLUConvOpKernel<float>,
ops::MLUConvOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(conv2d_grad,
ops::MLUConvGradOpKernel<float>,
ops::MLUConvGradOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(depthwise_conv2d,
ops::MLUDepthwiseConvOpKernel<float>,
ops::MLUDepthwiseConvOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(depthwise_conv2d_grad,
ops::MLUDepthwiseConvGradOpKernel<float>,
ops::MLUDepthwiseConvGradOpKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/conv_transpose_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
template <typename T>
class Conv2DTransposeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
std::vector<int> output_padding =
ctx.Attr<std::vector<int>>("output_padding");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const std::string data_format = ctx.Attr<std::string>("data_format");
int groups = ctx.Attr<int>("groups");
const std::string padding_algorithm =
ctx.Attr<std::string>("padding_algorithm");
// check dimension
const bool channel_last = data_format == "NHWC";
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
framework::DDim in_data_dims;
framework::DDim filter_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_tensor(output->type());
input_tensor.set_layout(DataLayout::kNHWC);
output_tensor.set_layout(DataLayout::kNHWC);
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_tensor.ShareDataWith(*output);
} else {
// transpose input from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
auto output_dims = output->dims();
output_tensor.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
}
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
// construct MLU attr
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
input_tensor, data_layout, ToCnnlDataType(input_tensor.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.type()));
MLUCnnlTensorDesc output_desc(
output_tensor, data_layout, ToCnnlDataType(output_tensor.dtype()));
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
ToCnnlDataType<T>());
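// The forward pass of conv2d_transpose is computed as the data gradient of a
// regular convolution, hence the ConvBackpropInput call below.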
MLUCnnl::ConvBackpropInput(ctx,
conv_desc.get(),
filter_desc.get(),
GetBasePtr(&trans_filter),
input_desc.get(),
GetBasePtr(&input_tensor),
output_desc.get(),
GetBasePtr(&output_tensor));
if (!channel_last) {
// transpose output from NHWC to NCHW
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&output_tensor,
output,
false /*need_reshape_or_alloc*/);
}
}
};
template <typename T>
class Conv2DTransposeGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
const phi::DenseTensor* output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
phi::DenseTensor* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
phi::DenseTensor* filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
if ((!input_grad) && (!filter_grad)) return;
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
const int groups = ctx.Attr<int>("groups");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const std::string data_format = ctx.Attr<std::string>("data_format");
const phi::DataLayout data_layout = phi::StringToDataLayout(data_format);
auto in_dims = input->dims();
auto filter_dims = filter->dims();
auto in_dims_size = in_dims.size();
const bool channel_last = (data_layout == phi::DataLayout::kNHWC);
framework::DDim in_data_dims;
if (channel_last) {
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
} else {
in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
}
framework::DDim filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
phi::UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize);
phi::DenseTensor input_tensor(input->type());
phi::DenseTensor output_grad_tensor(output_grad->type());
output_grad_tensor.set_layout(DataLayout::kNHWC);
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
if (channel_last) {
input_tensor.ShareDataWith(*input);
output_grad_tensor.ShareDataWith(*output_grad);
} else {
// transpose input from NCHW to NHWC
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
input,
&input_tensor,
true /*need_reshape_or_alloc*/);
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
output_grad,
&output_grad_tensor,
true /*need_reshape_or_alloc*/);
}
// transpose filter from MCHW to MHWC
phi::DenseTensor trans_filter(filter->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
// MLU descs
cnnlTensorLayout_t data_layout_mlu = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
input_tensor, data_layout_mlu, ToCnnlDataType(input_tensor.dtype()));
MLUCnnlTensorDesc trans_filter_desc(
trans_filter, data_layout_mlu, ToCnnlDataType(trans_filter.type()));
MLUCnnlTensorDesc output_grad_desc(
output_grad_tensor,
data_layout_mlu,
ToCnnlDataType(output_grad_tensor.dtype()));
MLUCnnlConvolutionDesc conv_desc(in_dims_size,
paddings.data(),
strides.data(),
dilations.data(),
groups,
ToCnnlDataType<T>());
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor filter_grad_tensor(filter_grad->type());
// filter_grad is always MCHW, while filter_grad_tensor is always MHWC
auto filter_grad_dims = filter_grad->dims();
filter_grad_tensor.mutable_data<T>({filter_grad_dims[0],
filter_grad_dims[2],
filter_grad_dims[3],
filter_grad_dims[1]},
ctx.GetPlace());
filter_grad_tensor.set_layout(DataLayout::kNHWC);
MLUCnnlTensorDesc filter_grad_desc(
filter_grad_tensor,
data_layout_mlu,
ToCnnlDataType(filter_grad_tensor.dtype()));
MLUCnnl::ConvBackpropFilter(ctx,
conv_desc.get(),
output_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
input_desc.get(),
GetBasePtr(&input_tensor),
filter_grad_desc.get(),
GetBasePtr(&filter_grad_tensor));
// transpose filter_grad from MHWC to MCHW
const std::vector<int> perm_to_mchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_mchw,
&filter_grad_tensor,
filter_grad,
false /*need_reshape_or_alloc*/);
}
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor input_grad_tensor(input_grad->type());
input_grad_tensor.set_layout(DataLayout::kNHWC);
if (channel_last) {
input_grad_tensor.ShareDataWith(*input_grad);
} else {
auto input_grad_dims = input_grad->dims();
input_grad_tensor.mutable_data<T>({input_grad_dims[0],
input_grad_dims[2],
input_grad_dims[3],
input_grad_dims[1]},
ctx.GetPlace());
}
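// The input gradient of conv2d_transpose is a regular forward convolution of
// output_grad with the filter, computed via ConvolutionForward below.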
MLUCnnlTensorDesc input_grad_desc(
input_grad_tensor,
data_layout_mlu,
ToCnnlDataType(input_grad_tensor.dtype()));
MLUCnnl::ConvolutionForward(ctx,
conv_desc.get(),
nullptr /*alpha*/,
nullptr /*beta*/,
nullptr /*bias_desc*/,
nullptr /*bias_ptr*/,
output_grad_desc.get(),
GetBasePtr(&output_grad_tensor),
trans_filter_desc.get(),
GetBasePtr(&trans_filter),
input_grad_desc.get(),
GetBasePtr(&input_grad_tensor));
if (!channel_last) {
// transpose input_grad from NHWC to NCHW
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&input_grad_tensor,
input_grad,
false /*need_reshape_or_alloc*/);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(conv2d_transpose,
ops::Conv2DTransposeMLUKernel<float>,
ops::Conv2DTransposeMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(conv2d_transpose_grad,
ops::Conv2DTransposeGradMLUKernel<float>,
ops::Conv2DTransposeGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class CumSumMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
int axis = ctx.Attr<int>("axis");
bool exclusive = ctx.Attr<bool>("exclusive");
bool reverse = ctx.Attr<bool>("reverse");
bool flatten = ctx.Attr<bool>("flatten");
out->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor* input_ptr = const_cast<phi::DenseTensor*>(x);
phi::DenseTensor flat_x(x->type());
if (flatten) {
PADDLE_ENFORCE_EQ(
axis,
-1,
platform::errors::InvalidArgument(
"when flatten is true, attr axis must be default %d, but got %d",
-1,
axis));
flat_x.ShareDataWith(*x);
flat_x.Resize(phi::make_ddim({x->numel()}));
input_ptr = &flat_x;
}
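// Normalize a negative axis to its non-negative counterpart before calling cnnl.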
const int true_axis = (axis < 0) ? input_ptr->dims().size() + axis : axis;
MLUCnnlTensorDesc input_desc(*input_ptr);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Cumsum(ctx,
true_axis,
exclusive,
reverse,
input_desc.get(),
GetBasePtr(input_ptr),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(cumsum,
ops::CumSumMLUKernel<int>,
ops::CumSumMLUKernel<float>,
ops::CumSumMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class DeformableConvMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* offset = ctx.Input<phi::DenseTensor>("Offset");
auto* mask = ctx.Input<phi::DenseTensor>("Mask");
auto* filter = ctx.Input<phi::DenseTensor>("Filter");
auto* output = ctx.Output<phi::DenseTensor>("Output");
output->mutable_data<T>(ctx.GetPlace());
const int groups = ctx.Attr<int>("groups");
const int deformable_groups = ctx.Attr<int>("deformable_groups");
const int im2col_step = ctx.Attr<int>("im2col_step");
const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
// TODO(fwg): Remove this check when cnnl fix the bug that groups > 1.
PADDLE_ENFORCE_EQ(
groups == 1,
true,
platform::errors::InvalidArgument(
"MLU deformable_conv kernel only support groups == 1, but get %d.",
groups));
// transform paddings from {h, w} to {top, bottom, left, right}.
const std::vector<int> trans_paddings{
paddings[0], paddings[0], paddings[1], paddings[1]};
MLUCnnlDCNDesc dcn_desc(input->dims().size(),
trans_paddings.data(),
strides.data(),
dilations.data(),
deformable_groups,
groups,
im2col_step);
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
phi::DenseTensor trans_input(input->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_offset(offset->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
offset,
&trans_offset,
true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_mask(mask->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_filter(filter->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
phi::DenseTensor tmp_output(output->dtype());
auto output_dims = output->dims();
tmp_output.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc input_desc(
trans_input, data_layout, ToCnnlDataType(trans_input.dtype()));
MLUCnnlTensorDesc offset_desc(
trans_offset, data_layout, ToCnnlDataType(trans_offset.dtype()));
MLUCnnlTensorDesc mask_desc(
trans_mask, data_layout, ToCnnlDataType(trans_mask.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.dtype()));
MLUCnnlTensorDesc output_desc(
tmp_output, data_layout, ToCnnlDataType(tmp_output.dtype()));
MLUCnnl::DCNForward(ctx,
dcn_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
offset_desc.get(),
GetBasePtr(&trans_offset),
mask_desc.get(),
GetBasePtr(&trans_mask),
filter_desc.get(),
GetBasePtr(&trans_filter),
nullptr,
nullptr,
output_desc.get(),
GetBasePtr(&tmp_output));
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_output,
output,
false /*need_reshape_or_alloc*/);
}
};
template <typename T>
class DeformableConvGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
auto* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
auto* filter_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
auto* offset_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Offset"));
auto* mask_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Mask"));
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
auto* offset = ctx.Input<phi::DenseTensor>("Offset");
auto* mask = ctx.Input<phi::DenseTensor>("Mask");
auto* filter = ctx.Input<phi::DenseTensor>("Filter");
int groups = ctx.Attr<int>("groups");
int deformable_groups = ctx.Attr<int>("deformable_groups");
int im2col_step = ctx.Attr<int>("im2col_step");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
// TODO(fwg): Remove this check when cnnl fix the bug that groups > 1.
PADDLE_ENFORCE_EQ(groups == 1,
true,
platform::errors::InvalidArgument(
"MLU deformable_conv_grad kernel only support groups "
"== 1, but get %d.",
groups));
// transform paddings from {h, w} to {top, bottom, left, right}.
const std::vector<int> trans_paddings{
paddings[0], paddings[0], paddings[1], paddings[1]};
MLUCnnlDCNDesc dcn_desc(input->dims().size(),
trans_paddings.data(),
strides.data(),
dilations.data(),
deformable_groups,
groups,
im2col_step);
phi::DenseTensor tmp_input_grad;
auto input_dims = input->dims();
tmp_input_grad.mutable_data<T>(
{input_dims[0], input_dims[2], input_dims[3], input_dims[1]},
ctx.GetPlace());
phi::DenseTensor tmp_filter_grad;
auto filter_dims = filter->dims();
tmp_filter_grad.mutable_data<T>(
{filter_dims[0], filter_dims[2], filter_dims[3], filter_dims[1]},
ctx.GetPlace());
phi::DenseTensor tmp_offset_grad;
auto offset_dims = offset->dims();
tmp_offset_grad.mutable_data<T>(
{offset_dims[0], offset_dims[2], offset_dims[3], offset_dims[1]},
ctx.GetPlace());
phi::DenseTensor tmp_mask_grad;
auto mask_dims = mask->dims();
tmp_mask_grad.mutable_data<T>(
{mask_dims[0], mask_dims[2], mask_dims[3], mask_dims[1]},
ctx.GetPlace());
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
phi::DenseTensor trans_output_grad(output_grad->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
output_grad,
&trans_output_grad,
true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_input(input->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_offset(offset->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
offset,
&trans_offset,
true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_mask(mask->dtype());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, mask, &trans_mask, true /*need_reshape_or_alloc*/);
phi::DenseTensor trans_filter(filter->dtype());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
filter,
&trans_filter,
true /*need_reshape_or_alloc*/);
cnnlTensorLayout_t data_layout = CNNL_LAYOUT_NHWC;
MLUCnnlTensorDesc output_grad_desc(
trans_output_grad,
data_layout,
ToCnnlDataType(trans_output_grad.dtype()));
MLUCnnlTensorDesc input_desc(
trans_input, data_layout, ToCnnlDataType(trans_input.dtype()));
MLUCnnlTensorDesc offset_desc(
trans_offset, data_layout, ToCnnlDataType(trans_offset.dtype()));
MLUCnnlTensorDesc mask_desc(
trans_mask, data_layout, ToCnnlDataType(trans_mask.dtype()));
MLUCnnlTensorDesc filter_desc(
trans_filter, data_layout, ToCnnlDataType(trans_filter.dtype()));
MLUCnnl::DCNBackwardData(ctx,
dcn_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
offset_desc.get(),
GetBasePtr(&trans_offset),
mask_desc.get(),
GetBasePtr(&trans_mask),
filter_desc.get(),
GetBasePtr(&trans_filter),
output_grad_desc.get(),
GetBasePtr(&trans_output_grad),
input_desc.get(),
GetBasePtr(&tmp_input_grad),
offset_desc.get(),
GetBasePtr(&tmp_offset_grad),
mask_desc.get(),
GetBasePtr(&tmp_mask_grad));
MLUCnnl::DCNBackwardWeight(ctx,
dcn_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
offset_desc.get(),
GetBasePtr(&trans_offset),
mask_desc.get(),
GetBasePtr(&trans_mask),
output_grad_desc.get(),
GetBasePtr(&trans_output_grad),
filter_desc.get(),
GetBasePtr(&tmp_filter_grad),
nullptr,
nullptr);
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
if (input_grad) {
input_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_input_grad,
input_grad,
false /*need_reshape_or_alloc*/);
}
if (filter_grad) {
filter_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_filter_grad,
filter_grad,
false /*need_reshape_or_alloc*/);
}
if (offset_grad) {
offset_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_offset_grad,
offset_grad,
false /*need_reshape_or_alloc*/);
}
if (mask_grad) {
mask_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_mask_grad,
mask_grad,
false /*need_reshape_or_alloc*/);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(deformable_conv, ops::DeformableConvMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(deformable_conv_grad,
ops::DeformableConvGradMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class DropoutMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto dropout_prob = ctx.Attr<float>("dropout_prob");
auto is_test = ctx.Attr<bool>("is_test");
auto* seed_tensor =
ctx.HasInput("Seed") ? ctx.Input<phi::DenseTensor>("Seed") : nullptr;
auto dropout_implementation =
ctx.Attr<std::string>("dropout_implementation");
const bool is_upscale = (dropout_implementation == "upscale_in_train");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
if (is_test && is_upscale) {
// dropout op for inference: out = input.
framework::TensorCopy(
*x,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
out);
return;
} else if (!is_test) {
// dropout op for training: out = input * mask / ( 1.0 - dropout_prob ) or
// out = input * mask.
int seed_data = 0;
if (seed_tensor) {
if (platform::is_mlu_place(seed_tensor->place())) {
memory::Copy(platform::CPUPlace(),
&seed_data,
seed_tensor->place(),
seed_tensor->data<int>(),
sizeof(int));
} else {
seed_data = *(seed_tensor->data<int>());
}
} else {
seed_data = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : 0;
}
auto* mask = ctx.Output<phi::DenseTensor>("Mask");
mask->mutable_data<uint8_t>(ctx.GetPlace());
MLUCnnlTensorDesc mask_desc(*mask);
// Special case when dropout_prob is 1.0
if (dropout_prob == 1.0f) {
auto value_t = static_cast<T>(0.0f);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
out_desc.get(),
GetBasePtr(out));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
mask_desc.get(),
GetBasePtr(mask));
return;
}
// create mlu random generator
const int device_id = ctx.GetPlace().GetDeviceId();
auto mlu_gen_random = GetMLURandomGenerator(ctx, device_id, seed_data);
// compute out = input * mask / ( 1.0 - dropout_prob )
MLUCnnl::FusedDropout(ctx,
mlu_gen_random->get(),
x_desc.get(),
GetBasePtr(x),
dropout_prob,
GetBasePtr(&(mlu_gen_random->get_state())),
mask_desc.get(),
GetBasePtr(mask),
out_desc.get(),
GetBasePtr(out));
if (is_upscale) {
return;
}
}
// In downgrade_in_infer mode, need to multiply (1.0f - dropout_prob).
phi::DenseTensor scale_tensor(x->dtype());
phi::DenseTensor bias_tensor(x->dtype());
scale_tensor.mutable_data<T>({1}, ctx.GetPlace());
bias_tensor.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
FillMLUTensorWithHostValue(
ctx, static_cast<T>(1.0f - dropout_prob), &scale_tensor);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0f), &bias_tensor);
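// Multiply by (1 - dropout_prob): Scale is applied with scale = (1 - dropout_prob)
// and bias = 0.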
MLUCnnl::Scale(ctx,
0,
is_test ? x_desc.get() : out_desc.get(),
is_test ? GetBasePtr(x) : GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class DropoutGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(!ctx.Attr<bool>("is_test"),
true,
platform::errors::InvalidArgument(
"GradOp is only callable when is_test is false"));
auto* grad_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* grad_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* mask = ctx.Input<phi::DenseTensor>("Mask");
auto dropout_prob = ctx.Attr<float>("dropout_prob");
auto dropout_impl = ctx.Attr<std::string>("dropout_implementation");
grad_x->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc grad_x_desc(*grad_x);
if (dropout_prob == 1.) {
auto value_t = static_cast<T>(0.0f);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
grad_x_desc.get(),
GetBasePtr(grad_x));
return;
}
// cast mask from uint8 to float32/float16
phi::DenseTensor cast_mask(grad_x->dtype());
cast_mask.Resize(mask->dims());
cast_mask.mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc mask_desc(*mask);
MLUCnnlTensorDesc cast_mask_desc(cast_mask);
cnnlCastDataType_t cast_type =
GetCastDataType(framework::TransToProtoVarType(mask->dtype()),
framework::TransToProtoVarType(cast_mask.dtype()));
MLUCnnl::Cast(ctx,
cast_type,
mask_desc.get(),
GetBasePtr(mask),
cast_mask_desc.get(),
GetBasePtr(&cast_mask));
const bool is_upscale = (dropout_impl == "upscale_in_train");
const float scale = is_upscale ? (1.0f / (1.0f - dropout_prob)) : (1.0f);
auto data_type = ToCnnlDataType<T>();
MLUCnnlTensorDesc grad_out_desc(*grad_out);
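// grad_x = grad_out * cast_mask, with the result scaled by 1 / (1 - dropout_prob)
// in upscale_in_train mode and left unscaled otherwise.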
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
cast_mask_desc.get(),
GetBasePtr(&cast_mask),
grad_out_desc.get(),
GetBasePtr(grad_out),
grad_x_desc.get(),
GetBasePtr(grad_x),
data_type,
scale);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(dropout,
ops::DropoutMLUKernel<float>,
ops::DropoutMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(dropout_grad,
ops::DropoutGradMLUKernel<float>,
ops::DropoutGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/expand_as_v2_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ExpandAsV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
auto target_shape = context.Attr<std::vector<int>>("target_shape");
auto target_rank = target_shape.size();
PADDLE_ENFORCE_GE(target_rank,
rank,
platform::errors::InvalidArgument(
"The rank (%d) of the input 'target_tensor' for "
"expand_as_v2 op must be greater than or equal to "
"the rank (%d) of the input 'x'.",
target_rank,
rank));
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument("The rank (%d) of the input 'x' for "
"expand_as_v2 op must be positive.",
rank));
PADDLE_ENFORCE_LE(target_rank,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The rank (%d) of the input 'target_tensor' for "
"expand_as_v2 op must be less than or equal to %d.",
target_rank,
MAX_RANK_SUPPORTED));
ExpandAs(context);
}
protected:
void ExpandAs(const framework::ExecutionContext& context) const {
auto* in0 = context.Input<phi::DenseTensor>("X");
auto in_dims = in0->dims();
auto target_shape = context.Attr<std::vector<int>>("target_shape");
auto vec_in_dims = phi::vectorize<int>(in_dims);
auto diff = target_shape.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
for (size_t i = 0; i < vec_in_dims.size(); ++i) {
PADDLE_ENFORCE_NE(target_shape[i],
0,
platform::errors::InvalidArgument(
"The value of target shape cannot be zero."));
if (vec_in_dims[i] != 1) {
PADDLE_ENFORCE_EQ(
vec_in_dims[i],
target_shape[i],
platform::errors::InvalidArgument(
"The value (%d) of the non-singleton dimension does not match"
" the corresponding value (%d) in "
"target tensor for expand_as_v2 op.",
vec_in_dims[i],
target_shape[i]));
}
}
auto* out0 = context.Output<phi::DenseTensor>("Out");
framework::DDim out_dims = phi::make_ddim(target_shape);
out0->Resize(out_dims);
out0->mutable_data<T>(context.GetPlace());
MLUCnnlTensorDesc x_desc(*in0);
MLUCnnlTensorDesc out_desc(*out0);
MLUCnnl::BroadcastTo(context,
x_desc.get(),
GetBasePtr(in0),
out_desc.get(),
GetBasePtr(out0));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(expand_as_v2,
ops::ExpandAsV2MLUKernel<float>,
ops::ExpandAsV2MLUKernel<int>,
ops::ExpandAsV2MLUKernel<int64_t>,
ops::ExpandAsV2MLUKernel<int8_t>,
ops::ExpandAsV2MLUKernel<uint8_t>,
ops::ExpandAsV2MLUKernel<bool>,
ops::ExpandAsV2MLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/expand_v2_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ExpandV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Out = ctx.Output<phi::DenseTensor>("Out");
auto in_dims = X->dims();
auto expand_shape = get_expand_shape(ctx);
auto vec_in_dims = phi::vectorize<int>(in_dims);
auto diff = expand_shape.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
std::vector<int> final_expand_shape(vec_in_dims.size());
for (size_t i = 0; i < vec_in_dims.size(); ++i) {
PADDLE_ENFORCE_NE(expand_shape[i],
0,
platform::errors::InvalidArgument(
"The expanded size cannot be zero."));
if (i < diff) { // expand_shape = [3,4,-1,-1], X = [10,2] -->
// final_expand_shape = [3,4,10,2]
PADDLE_ENFORCE_GT(
expand_shape[i],
0,
platform::errors::InvalidArgument(
"The expanded size (%d) for non-existing dimensions must be "
"positive for expand_v2 op.",
expand_shape[i]));
final_expand_shape[i] = expand_shape[i];
} else if (expand_shape[i] > 0) { // expand_shape = [3,4,10,4], X =
// [10,1] --> final_expand_shape =
// [3,4,10,4]
if (vec_in_dims[i] != 1) {
PADDLE_ENFORCE_EQ(
vec_in_dims[i],
expand_shape[i],
platform::errors::InvalidArgument(
"The value (%d) of the non-singleton dimension does not match"
" the corresponding value (%d) in shape for expand_v2 op.",
vec_in_dims[i],
expand_shape[i]));
final_expand_shape[i] = expand_shape[i];
} else {
final_expand_shape[i] = expand_shape[i];
}
} else { // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape
// = [3,4,10,2]
PADDLE_ENFORCE_EQ(
expand_shape[i],
-1,
platform::errors::InvalidArgument(
"When the value in shape is negative for expand_v2 op, "
"only -1 is supported, but the value received is %d.",
expand_shape[i]));
final_expand_shape[i] = vec_in_dims[i];
}
}
auto rank = X->dims().size();
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument(
"The rank of the input 'X' for expand_v2_mlu op must be positive, "
"but the value received is %d.",
rank));
auto shape_size = final_expand_shape.size();
PADDLE_ENFORCE_GE(
shape_size,
rank,
platform::errors::InvalidArgument(
"The number (%d) of elements of 'shape' for expand_v2_mlu op must "
"be "
"greater than or equal to the rank (%d) of the input 'X'.",
shape_size,
rank));
framework::DDim out_dims = phi::make_ddim(final_expand_shape);
Out->Resize(out_dims);
auto place = ctx.GetPlace();
Out->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*X);
MLUCnnlTensorDesc out_desc(*Out);
MLUCnnl::BroadcastTo(
ctx, x_desc.get(), GetBasePtr(X), out_desc.get(), GetBasePtr(Out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(expand_v2,
ops::ExpandV2MLUKernel<float>,
ops::ExpandV2MLUKernel<paddle::platform::float16>,
ops::ExpandV2MLUKernel<bool>,
ops::ExpandV2MLUKernel<int>,
ops::ExpandV2MLUKernel<int64_t>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class FillAnyLikeMLUKernel : public framework::OpKernel<T> {
public:
using CommonType = typename std::common_type<
float,
typename std::conditional<std::is_same<T, platform::float16>::value,
float,
T>::type>::type;
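// CommonType promotes float16 to float so that the range check in Compute
// compares the filled value against the limits of a common type.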
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
float value = ctx.Attr<float>("value");
auto common_type_value = static_cast<CommonType>(value);
PADDLE_ENFORCE_EQ(
(common_type_value >=
static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
(common_type_value <=
static_cast<CommonType>(std::numeric_limits<T>::max())),
true,
platform::errors::InvalidArgument(
"The filled value is out of range for target type, "
"current kernel type is %s, the range should between %f "
"and %f, but now value is %f.",
typeid(T).name(),
static_cast<CommonType>(std::numeric_limits<T>::lowest()),
static_cast<CommonType>(std::numeric_limits<T>::max()),
value));
PADDLE_ENFORCE_EQ(
std::isnan(value),
false,
platform::errors::InvalidArgument("The filled value is NaN."));
auto value_t = static_cast<T>(value);
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value_t, out_desc.get(), GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(fill_any_like,
ops::FillAnyLikeMLUKernel<int>,
ops::FillAnyLikeMLUKernel<int64_t>,
ops::FillAnyLikeMLUKernel<float>,
ops::FillAnyLikeMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
template <typename T>
class FillConstantBatchSizeLikeOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto float_value = ctx.Attr<float>("value");
auto str_value = ctx.Attr<std::string>("str_value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
auto *out = ctx.Output<phi::DenseTensor>("Out");
auto *in = ctx.Input<phi::DenseTensor>("Input");
if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
// set the correct batch size for the phi::DenseTensor.
auto odims = out->dims();
int output_dim_idx = ctx.Attr<int>("output_dim_idx");
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->mutable_data<T>(odims, ctx.GetPlace());
}
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
// handle NaN/Inf first, which cannot be read from stream.
if (str_value == "inf") {
value = static_cast<T>(std::numeric_limits<double>::infinity());
} else if (str_value == "-inf") {
value = static_cast<T>(-std::numeric_limits<double>::infinity());
} else if (str_value == "nan") {
value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
if (cpu_place) {
auto &dev_ctx = *pool.Get(platform::CPUPlace());
phi::funcs::SetConstant<phi::CPUContext, T> functor;
out->mutable_data(platform::CPUPlace(),
framework::TransToPhiDataType(data_type));
functor(reinterpret_cast<const phi::CPUContext &>(dev_ctx),
out,
static_cast<T>(value));
} else {
out->mutable_data(ctx.GetPlace(),
framework::TransToPhiDataType(data_type));
const T *value_data = &value;
cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST;
MLUCnnlTensorDesc output_desc(*out);
MLUCnnl::Fill(
ctx, pointer_mode, value_data, output_desc.get(), GetBasePtr(out));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
fill_constant_batch_size_like,
ops::FillConstantBatchSizeLikeOpMLUKernel<int>,
ops::FillConstantBatchSizeLikeOpMLUKernel<float>,
ops::FillConstantBatchSizeLikeOpMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
namespace paddle {
namespace operators {
template <typename T>
class FillConstantMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto str_value = ctx.Attr<std::string>("str_value");
auto float_value = ctx.Attr<float>("value");
auto *out_var = ctx.Output<phi::DenseTensor>("Out");
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
// handle NaN/Inf first, which cannot be read from stream.
if (str_value == "inf") {
value = static_cast<T>(std::numeric_limits<double>::infinity());
} else if (str_value == "-inf") {
value = static_cast<T>(-std::numeric_limits<double>::infinity());
} else if (str_value == "nan") {
value = static_cast<T>(std::numeric_limits<double>::quiet_NaN());
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
}
const T *value_data = &value;
cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST;
if (ctx.HasInput("ValueTensor")) {
auto *value_tensor = ctx.Input<phi::DenseTensor>("ValueTensor");
PADDLE_ENFORCE_EQ(
value_tensor->numel(),
1,
platform::errors::InvalidArgument(
"When use phi::DenseTensor as value to set phi::DenseTensor "
"value in fill_cosntant, "
"value input(ValueTensor) size must be 1, but get %d",
value_tensor->numel()));
value_data = value_tensor->data<T>();
auto tmp_place = value_tensor->place();
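// If the value tensor already lives on the MLU, hand cnnl its device pointer
// instead of a host pointer.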
if (platform::is_mlu_place(tmp_place)) {
pointer_mode = CNNL_POINTER_MODE_DEVICE;
}
}
auto shape = GetShape(ctx);
out_var->mutable_data<T>(shape, ctx.GetPlace());
MLUCnnlTensorDesc output_desc(*out_var);
MLUCnnl::Fill(
ctx, pointer_mode, value_data, output_desc.get(), GetBasePtr(out_var));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
fill_constant,
paddle::operators::FillConstantMLUKernel<float>,
paddle::operators::FillConstantMLUKernel<bool>,
paddle::operators::FillConstantMLUKernel<int>,
paddle::operators::FillConstantMLUKernel<uint8_t>,
paddle::operators::FillConstantMLUKernel<int16_t>,
paddle::operators::FillConstantMLUKernel<int64_t>,
paddle::operators::FillConstantMLUKernel<paddle::platform::float16>);
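// A minimal, standalone sketch of the string-to-value conversion performed by
// FillConstantMLUKernel above; it assumes only the C++ standard library and is
// illustrative, not part of the operator. "inf", "-inf" and "nan" are handled
// explicitly, mirroring the comment in the kernel that they cannot be read
// back from the stream.
#include <cstdint>
#include <limits>
#include <sstream>
#include <string>
#include <type_traits>

template <typename T>
T ParseFillValue(const std::string &str_value, float float_value) {
  // illustrative helper, not a Paddle API
  if (str_value.empty()) return static_cast<T>(float_value);
  if (str_value == "inf")
    return static_cast<T>(std::numeric_limits<double>::infinity());
  if (str_value == "-inf")
    return static_cast<T>(-std::numeric_limits<double>::infinity());
  if (str_value == "nan")
    return static_cast<T>(std::numeric_limits<double>::quiet_NaN());
  std::stringstream convert_stream(str_value);
  if (std::is_same<int64_t, T>::value) {
    int64_t tmp_value;
    convert_stream >> tmp_value;
    return static_cast<T>(tmp_value);
  }
  double tmp_value;
  convert_stream >> tmp_value;
  return static_cast<T>(tmp_value);
}
// e.g. ParseFillValue<float>("3.5", 0.0f) == 3.5f; ParseFillValue<int>("", 2.0f) == 2.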
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/flatten_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FlattenMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<phi::DenseTensor>("X");
auto *out = context.Output<phi::DenseTensor>("Out");
auto &axes = context.Attr<int>("axis");
auto x_dims = in->dims();
auto out_dims = phi::make_ddim(GetOutputShape(axes, x_dims));
out->mutable_data(context.GetPlace(), in->type());
framework::TensorCopy(
*in,
context.GetPlace(),
context.template device_context<platform::DeviceContext>(),
out);
out->Resize(out_dims);
}
static std::vector<int32_t> GetOutputShape(const int axis,
const framework::DDim &in_dims) {
int64_t outer = 1, inner = 1;
for (int i = 0; i < in_dims.size(); ++i) {
if (i < axis) {
outer *= in_dims[i];
} else {
inner *= in_dims[i];
}
}
std::vector<int32_t> out_shape(2);
out_shape[0] = outer;
out_shape[1] = inner;
return out_shape;
}
};
template <typename DeviceContext, typename T>
class FlattenGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto in_dims = ctx.Input<phi::DenseTensor>("X")->dims();
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopy(
*d_out,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
d_x);
d_x->Resize(in_dims);
}
};
template <typename DeviceContext, typename T>
class Flatten2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto &axes = context.Attr<int>("axis");
auto *in = context.Input<phi::DenseTensor>("X");
auto x_dims = in->dims();
auto *out = context.Output<phi::DenseTensor>("Out");
auto out_dims = phi::make_ddim(
FlattenMLUKernel<DeviceContext, T>::GetOutputShape(axes, x_dims));
out->mutable_data(context.GetPlace(), in->type());
framework::TensorCopy(
*in,
context.GetPlace(),
context.template device_context<platform::DeviceContext>(),
out);
out->Resize(out_dims);
}
};
template <typename DeviceContext, typename T>
class Flatten2GradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto xshape_dims = ctx.Input<phi::DenseTensor>("XShape")->dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopy(
*d_out,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
d_x);
d_x->Resize(x_dims);
}
};
template <typename DeviceContext, typename T>
class FlattenContiguousRangeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *in = context.Input<phi::DenseTensor>("X");
auto *out = context.Output<phi::DenseTensor>("Out");
out->mutable_data(context.GetPlace(), in->type());
auto &start_axis = context.Attr<int>("start_axis");
auto &stop_axis = context.Attr<int>("stop_axis");
// make out dims
auto in_dims = in->dims();
auto out_dims =
phi::make_ddim(GetOutputShape(start_axis, stop_axis, in_dims));
framework::TensorCopy(
*in,
context.GetPlace(),
context.template device_context<platform::DeviceContext>(),
out);
out->Resize(out_dims);
}
static std::vector<int32_t> GetOutputShape(const int start_axis,
const int stop_axis,
const framework::DDim &in_dims) {
int64_t outer = 1;
std::vector<int32_t> out_shape;
int in_dims_size = in_dims.size();
out_shape.reserve(in_dims_size - stop_axis + start_axis);
int real_start_axis = start_axis, real_stop_axis = stop_axis;
if (start_axis < 0) {
real_start_axis = start_axis + in_dims_size;
}
if (stop_axis < 0) {
real_stop_axis = stop_axis + in_dims_size;
}
for (int i = 0; i < real_start_axis; ++i) {
out_shape.push_back(in_dims[i]);
}
for (int i = real_start_axis; i <= real_stop_axis; i++) {
if (in_dims[i] == -1 || outer == -1) {
outer = -1;
} else {
outer *= in_dims[i];
}
}
out_shape.push_back(outer);
for (int i = real_stop_axis + 1; i < in_dims_size; i++) {
out_shape.push_back(in_dims[i]);
}
return out_shape;
}
};
template <typename DeviceContext, typename T>
class FlattenContiguousRangeGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto xshape_dims = ctx.Input<phi::DenseTensor>("XShape")->dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopy(
*d_out,
ctx.GetPlace(),
ctx.template device_context<paddle::platform::MLUDeviceContext>(),
d_x);
d_x->Resize(x_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
flatten,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::FlattenMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten_grad,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::FlattenGradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten2,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::Flatten2MLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten2_grad,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int8_t>,
ops::Flatten2GradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten_contiguous_range,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
float>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
double>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
uint8_t>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
int>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeMLUKernel<paddle::platform::MLUDeviceContext,
int64_t>);
REGISTER_OP_MLU_KERNEL(
flatten_contiguous_range_grad,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
float>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
double>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
uint8_t>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
int>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
int8_t>,
ops::FlattenContiguousRangeGradMLUKernel<paddle::platform::MLUDeviceContext,
int64_t>);
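// A small self-contained sketch of the shape rule implemented by
// FlattenContiguousRangeMLUKernel::GetOutputShape above: the dimensions in
// [start_axis, stop_axis] collapse into a single dimension, with -1 propagated
// for unknown extents. Standard library only; for illustration.
#include <cstdint>
#include <vector>

inline std::vector<int32_t> FlattenRangeShape(int start_axis, int stop_axis,
                                              const std::vector<int64_t> &in) {
  // illustrative re-statement of the kernel's shape logic, not a Paddle API
  const int rank = static_cast<int>(in.size());
  if (start_axis < 0) start_axis += rank;
  if (stop_axis < 0) stop_axis += rank;
  std::vector<int32_t> out;
  for (int i = 0; i < start_axis; ++i)
    out.push_back(static_cast<int32_t>(in[i]));
  int64_t merged = 1;
  for (int i = start_axis; i <= stop_axis; ++i)
    merged = (in[i] == -1 || merged == -1) ? -1 : merged * in[i];
  out.push_back(static_cast<int32_t>(merged));
  for (int i = stop_axis + 1; i < rank; ++i)
    out.push_back(static_cast<int32_t>(in[i]));
  return out;
}
// e.g. FlattenRangeShape(1, 2, {2, 3, 4, 5}) -> {2, 12, 5}.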
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
template <typename T>
class GatherNdMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<phi::DenseTensor>("X");
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->template mutable_data<T>(place);
if (x->numel() == 0) return;
if (index->numel() == 0) {
auto &dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(*x, place, dev_ctx, out);
return;
}
const auto &index_type = framework::TransToProtoVarType(index->dtype());
bool index_type_match = index_type == framework::proto::VarType::INT32 ||
index_type == framework::proto::VarType::INT64;
PADDLE_ENFORCE_EQ(index_type_match,
true,
platform::errors::InvalidArgument(
"Index holds the wrong type, it holds [%s],"
"but desires to be [%s] or [%s]",
paddle::framework::DataTypeToString(index_type),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT32),
paddle::framework::DataTypeToString(
framework::proto::VarType::INT64)));
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc index_desc(*index);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::GatherNd(ctx,
x_desc.get(),
GetBasePtr(x),
index_desc.get(),
GetBasePtr(index),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class GatherNdGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *x = ctx.Input<phi::DenseTensor>("X");
if (dx->numel() == 0) return;
if (index->numel() == 0) {
auto &dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
return;
}
phi::DenseTensor tmp_tensor(index->type());
phi::DenseTensor tmp_tensor2(dout->type());
const auto index_dims = index->dims();
if (index_dims.size() == 1) {
tmp_tensor.ShareDataWith(*index);
std::vector<int64_t> new_dim = {1, index_dims[0]};
tmp_tensor.Resize(phi::make_ddim(new_dim));
index = &tmp_tensor;
tmp_tensor2.ShareDataWith(*dout);
std::vector<int64_t> new_dim2{1};
for (int i = index->numel(); i < x->dims().size(); i++) {
new_dim2.push_back(x->dims()[i]);
}
tmp_tensor2.Resize(phi::make_ddim(new_dim2));
dout = &tmp_tensor2;
}
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc dx_desc(*dx);
auto value = static_cast<T>(0);
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), GetBasePtr(dx));
MLUCnnlTensorDesc index_desc(*index);
MLUCnnlTensorDesc dout_desc(*dout);
const cnnlScatterNdMode_t mode = CNNL_SCATTERND_ADD;
MLUCnnl::ScatterNd(ctx,
mode,
index_desc.get(),
GetBasePtr(index),
dout_desc.get(),
GetBasePtr(dout),
dx_desc.get(),
GetBasePtr(dx),
dx_desc.get(),
GetBasePtr(dx));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gather_nd,
ops::GatherNdMLUKernel<float>,
ops::GatherNdMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(gather_nd_grad,
ops::GatherNdGradMLUKernel<paddle::platform::float16>,
ops::GatherNdGradMLUKernel<float>);
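// Reference semantics of the forward gather_nd above, written against plain
// std::vector for clarity; the kernel delegates the same contract to CNNL. The
// last dimension of Index addresses a prefix of X's dimensions, and this
// sketch covers the simplest case where one index fully addresses an element
// of a row-major buffer.
#include <cstdint>
#include <vector>

inline float GatherNdOneRef(const std::vector<float> &x,
                            const std::vector<int64_t> &x_dims,
                            const std::vector<int64_t> &index) {
  // illustrative only: index.size() is assumed to equal x_dims.size()
  int64_t offset = 0;
  for (size_t i = 0; i < x_dims.size(); ++i) {
    offset = offset * x_dims[i] + index[i];
  }
  return x[offset];
}
// e.g. for X of shape {2, 3} stored row-major, index {1, 2} picks x[1 * 3 + 2].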
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class GatherOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<phi::DenseTensor>("X");
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto axis = ctx.Attr<int>("axis");
const auto index_dims = index->dims();
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1],
1,
platform::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(),
1,
platform::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
auto *out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
int index_shape_1d[1] = {static_cast<int>(index_dims[0])};
MLUCnnlTensorDesc index_desc(
1, index_shape_1d, ToCnnlDataType(index->dtype()));
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::GatherFunctor(ctx,
axis,
0 /*batch_dims*/,
x_desc.get(),
GetBasePtr(x),
index_desc.get(),
GetBasePtr(index),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class GatherGradOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *index = ctx.Input<phi::DenseTensor>("Index");
auto *dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto *dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
const auto index_dims = index->dims();
if (index_dims.size() == 2) {
PADDLE_ENFORCE_EQ(
index_dims[1],
1,
platform::errors::InvalidArgument(
"The last dim of index should be 1 when it is 2D, but we get %d",
index_dims[1]));
} else {
PADDLE_ENFORCE_EQ(
index_dims.size(),
1,
platform::errors::InvalidArgument(
"The index should be 1D, when it is not 2D, but we get %d",
index_dims.size()));
}
dx->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc dx_desc(*dx);
auto value = static_cast<T>(0);
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(), GetBasePtr(dx));
int index_shape_1d[1] = {static_cast<int>(index_dims[0])};
MLUCnnlTensorDesc index_desc(
1, index_shape_1d, ToCnnlDataType(index->dtype()));
MLUCnnlTensorDesc dout_desc(*dout);
const cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE;
MLUCnnl::ScatterRefFunctor(ctx,
dx_desc.get(),
GetBasePtr(dx),
dout_desc.get(),
GetBasePtr(dout),
index_desc.get(),
GetBasePtr(index),
mode);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gather,
ops::GatherOpMLUKernel<float>,
ops::GatherOpMLUKernel<paddle::platform::float16>,
ops::GatherOpMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(gather_grad,
ops::GatherGradOpMLUKernel<float>,
ops::GatherGradOpMLUKernel<paddle::platform::float16>,
ops::GatherGradOpMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <random>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/generator.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUGaussianRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<phi::DenseTensor>("Out");
tensor->mutable_data<T>(context.GetPlace());
phi::DenseTensor cpu_tensor(tensor->type());
cpu_tensor.Resize(tensor->dims());
T* cpu_data = cpu_tensor.mutable_data<T>(platform::CPUPlace());
std::normal_distribution<T> dist(mean, std);
int64_t size = tensor->numel();
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
auto engine = phi::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
cpu_data[i] = dist(*engine);
}
auto& dev_ctx =
context.template device_context<paddle::platform::MLUDeviceContext>();
framework::TensorCopy(cpu_tensor, context.GetPlace(), dev_ctx, tensor);
dev_ctx.Wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(gaussian_random, ops::MLUGaussianRandomKernel<float>);
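// A minimal sketch of the host-side sampling path used by the kernel above:
// values are drawn on the CPU with std::normal_distribution and then copied to
// the device tensor. The seeded mt19937_64 below stands in for
// phi::GetCPURandomEngine and is an assumption for illustration only.
#include <cstdint>
#include <random>
#include <vector>

inline std::vector<float> SampleGaussianRef(float mean, float std_dev,
                                            int64_t size, unsigned int seed) {
  std::mt19937_64 engine(seed);  // placeholder for phi::GetCPURandomEngine
  std::normal_distribution<float> dist(mean, std_dev);
  std::vector<float> cpu_data(size);
  for (int64_t i = 0; i < size; ++i) {
    cpu_data[i] = dist(engine);
  }
  return cpu_data;  // the kernel copies a buffer like this to the MLU tensor
}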
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class GridSamplerMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_mlu_place(ctx.GetPlace()),
true,
platform::errors::Unavailable("This kernel only runs on MLU."));
// input and output data
const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("X");
const phi::DenseTensor* grid = ctx.Input<phi::DenseTensor>("Grid");
phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");
int n = input->dims()[0];
int c = input->dims()[1];
int out_h = grid->dims()[1];
int out_w = grid->dims()[2];
output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
// attrs
// paddle.nn.functional.grid_sample(x, grid, mode='bilinear',
// padding_mode='zeros', align_corners=True, name=None)
const std::string mode = ctx.Attr<std::string>("mode");
const std::string padding_mode = ctx.Attr<std::string>("padding_mode");
bool align_corners = ctx.Attr<bool>("align_corners");
const std::string data_format = phi::DataLayoutToString(input->layout());
PADDLE_ENFORCE_EQ(
mode == "bilinear",
true,
platform::errors::Unavailable(
"Only support bilinear mode in mlu grid_sample kernel."));
PADDLE_ENFORCE_EQ(
padding_mode == "zeros",
true,
platform::errors::Unavailable(
"Only support zeros padding_mode in mlu grid_sample kernel."));
phi::DenseTensor trans_input(input->dtype());
// transpose input from NCHW to NHWC
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, input, &trans_input, true /*need_reshape_or_alloc*/);
phi::DenseTensor tmp_output(output->dtype());
tmp_output.mutable_data<T>({n, out_h, out_w, c}, ctx.GetPlace());
MLUCnnlGridSampleDesc grid_sample_desc(mode, padding_mode, align_corners);
MLUCnnlTensorDesc input_desc(
trans_input, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc grid_desc(*grid, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc tmp_output_desc(
tmp_output, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnl::GridSample(ctx,
grid_sample_desc.get(),
input_desc.get(),
GetBasePtr(&trans_input),
grid_desc.get(),
GetBasePtr(grid),
tmp_output_desc.get(),
GetBasePtr(&tmp_output));
// transpose output from NHWC to NCHW
const std::vector<int> perm_to_nchw = {
0,
3,
1,
2,
};
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&tmp_output,
output,
false /*need_reshape_or_alloc*/);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(grid_sampler,
ops::GridSamplerMLUKernel<float>,
ops::GridSamplerMLUKernel<plat::float16>);
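// The kernel above runs the grid sample in NHWC, so it permutes the NCHW input
// with perm = {0, 2, 3, 1} and permutes the result back with {0, 3, 1, 2}. A
// tiny sketch of what such a permutation does to a shape (illustrative helper,
// not a Paddle or CNNL API):
#include <vector>

inline std::vector<int> PermuteShape(const std::vector<int> &dims,
                                     const std::vector<int> &perm) {
  std::vector<int> out(perm.size());
  for (size_t i = 0; i < perm.size(); ++i) out[i] = dims[perm[i]];
  return out;
}
// e.g. PermuteShape({n, c, h, w}, {0, 2, 3, 1}) -> {n, h, w, c}.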
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class HuberLossMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* residual = ctx.Output<phi::DenseTensor>("Residual");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
// compute y-x
cnnlDataType_t data_type = ToCnnlDataType<T>();
residual->mutable_data<T>(x->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
x_desc.get(),
GetBasePtr(y),
x_desc.get(),
GetBasePtr(x),
x_desc.get(),
GetBasePtr(residual),
data_type);
// compute smoothl1loss
out->mutable_data<T>(x->dims(), place);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossForward(ctx,
x_desc.get(),
GetBasePtr(x),
x_desc.get(), /* target has same shape as x */
GetBasePtr(y),
static_cast<float>(delta),
smoothl1_algo,
x_desc.get(), /* out has same shape as x */
GetBasePtr(out));
// compute multiply by delta
phi::DenseTensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(out->dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Scale(ctx,
axis,
out_desc.get(),
GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class HuberLossGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* residual = ctx.Input<phi::DenseTensor>("Residual");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
auto delta = ctx.Attr<float>("delta");
auto place = ctx.GetPlace();
phi::DenseTensor t_grad_rd;
t_grad_rd =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd);
if (dx || dy) {
phi::DenseTensor t_zero;
t_zero =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(residual->dims(), dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &t_zero);
MLUCnnlTensorDesc residual_desc(*residual);
MLUCnnlTensorDesc dout_desc(*dout);
cnnlSmoothL1LossAlgorithm_t smoothl1_algo =
CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction
// here
MLUCnnl::SmoothL1LossBackward(ctx,
residual_desc.get(),
GetBasePtr(residual),
residual_desc.get(),
GetBasePtr(&t_zero),
dout_desc.get(),
GetBasePtr(dout),
static_cast<float>(delta),
smoothl1_algo,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd));
}
// compute multiply by delta
phi::DenseTensor scale_tensor, bias_tensor;
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
bias_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.f), &bias_tensor);
const int axis = std::max(t_grad_rd.dims().size() - 1, 0);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
if (dx) {
dx->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(-delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dx);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dx));
}
if (dy) {
dy->mutable_data<T>(place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(delta), &scale_tensor);
MLUCnnlTensorDesc out_desc(*dy);
MLUCnnl::Scale(ctx,
axis,
t_grad_rd_desc.get(),
GetBasePtr(&t_grad_rd),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(dy));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(huber_loss,
ops::HuberLossMLUKernel<float>,
ops::HuberLossMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(huber_loss_grad,
ops::HuberLossGradMLUKernel<float>,
ops::HuberLossGradMLUKernel<plat::float16>);
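// For reference, the per-element value the forward kernel above produces,
// assuming the usual smooth-L1 definition with threshold delta (which the
// kernel then scales by delta to obtain the Huber loss). A scalar sketch,
// standard library only:
#include <cmath>

inline float HuberLossRef(float x, float y, float delta) {
  // illustrative reference, not the CNNL computation itself
  const float r = std::fabs(y - x);
  const float smooth_l1 =
      r < delta ? 0.5f * r * r / delta : r - 0.5f * delta;
  // delta * smooth_l1 == 0.5*r*r for r < delta, delta*r - 0.5*delta*delta otherwise
  return delta * smooth_l1;
}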
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/interpolate_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
inline std::vector<int> get_new_shape_mlu(
const std::vector<const phi::DenseTensor*>& list_new_shape_tensor) {
// get tensor from
std::vector<int> vec_new_shape;
for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
auto tensor = list_new_shape_tensor[i];
PADDLE_ENFORCE_EQ(
tensor->dims(),
phi::make_ddim({1}),
platform::errors::InvalidArgument("shape of dim tensor should be [1]"));
phi::DenseTensor temp;
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_new_shape.push_back(static_cast<int32_t>(*temp.data<int32_t>()));
}
return vec_new_shape;
}
template <typename T>
class InterpolateV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto input_dims = input->dims();
PADDLE_ENFORCE_GE(
input_dims.size(),
4,
platform::errors::External("MLU Interpolate kernel supports input "
"range greater or equal than 4."));
PADDLE_ENFORCE_LE(
input_dims.size(),
5,
platform::errors::External("MLU Interpolate kernel supports input "
"range less or equal than 5. "));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
int align_mode = ctx.Attr<int>("align_mode");
int align_center = align_corners ? 0 : (align_mode == 1 ? 0 : 1);
int out_d = ctx.Attr<int>("out_d");
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
float scale_d = -1;
float scale_h = -1;
float scale_w = -1;
auto list_new_size_tensor = ctx.MultiInput<phi::DenseTensor>("SizeTensor");
if (list_new_size_tensor.size() > 0) {
// have size tensor
auto new_size = get_new_shape_mlu(list_new_size_tensor);
if (new_size.size() <= 2) {
// default NCHW
out_h = new_size[0];
out_w = new_size[1];
} else {
        // rank of input is 5, NCDHW
out_d = new_size[0];
out_h = new_size[1];
out_w = new_size[2];
}
} else {
auto scale_tensor = ctx.Input<phi::DenseTensor>("Scale");
auto scale = ctx.Attr<std::vector<float>>("scale");
if (scale_tensor != nullptr) {
std::vector<float> scale_data;
scale_data = phi::GetVectorFromTensor<float>(scale_tensor);
if (scale_data.size() > 1 && scale_data.size() <= 2) {
scale_h = scale_data[0];
scale_w = scale_data[1];
} else if (scale_data.size() > 2) {
scale_d = scale_data[0];
scale_h = scale_data[1];
scale_w = scale_data[2];
} else {
scale_d = scale_data[0];
scale_h = scale_data[0];
scale_w = scale_data[0];
}
PADDLE_ENFORCE_EQ(
scale_w > 0 && scale_h > 0,
true,
platform::errors::InvalidArgument("scale of Op(interpolate) "
"should be greater than 0."));
} else {
if (scale.size() > 1 && scale.size() <= 2) {
scale_h = scale[0];
scale_w = scale[1];
PADDLE_ENFORCE_EQ(
scale_w > 0 && scale_h > 0,
true,
platform::errors::InvalidArgument("scale of Op(interpolate) "
"should be greater than 0."));
} else if (scale.size() > 2) {
scale_d = scale[0];
scale_h = scale[1];
scale_w = scale[2];
PADDLE_ENFORCE_EQ(
scale_d > 0 && scale_w > 0 && scale_h > 0,
true,
platform::errors::InvalidArgument("scale of Op(interpolate) "
"should be greater than 0."));
}
}
if (scale_h > 0. && scale_w > 0.) {
out_h = static_cast<int>(in_h * scale_h);
out_w = static_cast<int>(in_w * scale_w);
}
if (scale_d > 0.) {
out_d = static_cast<int>(in_d * scale_d);
}
auto out_size = ctx.Input<phi::DenseTensor>("OutSize");
if (out_size != nullptr) {
std::vector<int32_t> out_size_data;
out_size_data = phi::GetVectorFromTensor<int>(out_size);
if (out_size_data.size() <= 2) {
out_h = out_size_data[0];
out_w = out_size_data[1];
} else {
out_d = out_size_data[0];
out_h = out_size_data[1];
out_w = out_size_data[2];
}
}
}
PADDLE_ENFORCE_GT(
out_h,
0,
platform::errors::InvalidArgument("out_h in Attr(out_shape) of "
"Op(interpolate) "
"should be greater than 0."));
PADDLE_ENFORCE_GT(
out_w,
0,
platform::errors::InvalidArgument("out_w in Attr(out_shape) of "
"Op(interpolate) "
"should be greater than 0."));
// do transpose according to cnnl's constraints
// cnnlInterp_v2 only accepts NHWC when mode is CNNL_INTERP_BILINEAR and
// CNNL_INTERP_NEAREST,
framework::DDim dim_in, dim_in_trans, dim_out, dim_out_trans;
phi::DenseTensor transformed_input, transformed_output;
bool need_transpose = input_dims.size() != 2;
if (input_dims.size() == 4) {
// need to do transpose if layout is kNCHW
need_transpose &= data_layout == DataLayout::kNCHW;
if (need_transpose) {
// if need_transpose, do the following
// 1. transpose input NCHW -> NHWC
// 2. interpolation in(NHWC) -> out(NHWC)
        // 3. transpose output NHWC -> NCHW
// dim_in = {n, c, in_h, in_w};
dim_in_trans = {n, in_h, in_w, c};
dim_out = {n, c, out_h, out_w};
dim_out_trans = {n, out_h, out_w, c};
output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*input, ctx.GetPlace(), output);
return;
}
// do transpose on input tensor, then do interpolation
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_NCHW, ToCnnlDataType(input->dtype()));
transformed_input =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_in_trans, dev_ctx);
transformed_output =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_out_trans, dev_ctx);
MLUCnnlTensorDesc input_reshaped_desc(
transformed_input,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_input.dtype()));
const std::vector<int> perm = {0, 2, 3, 1};
MLUCnnl::Transpose(ctx,
perm,
input_dims.size(),
input_desc.get(),
GetBasePtr(input),
input_reshaped_desc.get(),
GetBasePtr(&transformed_input));
} else {
// if no need_transpose, do the following
// 1. interpolation in(NHWC) -> out(NHWC)
// dim_in = {n, in_h, in_w, c};
dim_out = {n, out_h, out_w, c};
output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*input, ctx.GetPlace(), output);
return;
}
transformed_input = *input;
transformed_output = *output;
}
MLUCnnlTensorDesc input_desc(transformed_input,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_input.dtype()));
MLUCnnlTensorDesc output_desc(transformed_output,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_output.dtype()));
MLUCnnl::Interp(ctx,
GetMLUCnnlInterpMode(interp_method),
align_corners,
align_center,
input_desc.get(),
GetBasePtr(&transformed_input),
output_desc.get(),
GetBasePtr(&transformed_output));
if (need_transpose) {
// if need_transpose, reshape output back to NCHW
const std::vector<int> perm = {0, 3, 1, 2};
MLUCnnlTensorDesc output_reshape_desc(
*output, CNNL_LAYOUT_NCHW, ToCnnlDataType(output->dtype()));
MLUCnnl::Transpose(ctx,
perm,
dim_out_trans.size(),
output_desc.get(),
GetBasePtr(&transformed_output),
output_reshape_desc.get(),
GetBasePtr(output));
}
} else {
PADDLE_ENFORCE_EQ(
interp_method,
"trilinear",
platform::errors::External("MLU Interpolate kernel only supports 5D "
"data in trilinear mode."));
// need to do transpose if layout is kNCDHW
need_transpose &= data_layout == DataLayout::kNCHW;
if (need_transpose) {
// if need_transpose, do the following
// 1. transpose input NCDHW -> NDHWC
// 2. interpolation in(NDHWC) -> out(NDHWC)
        // 3. transpose output NDHWC -> NCDHW
// dim_in = {n, c, in_d, in_h, in_w};
dim_in_trans = {n, in_d, in_h, in_w, c};
dim_out = {n, c, out_d, out_h, out_w};
dim_out_trans = {n, out_d, out_h, out_w, c};
output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_h == out_h && in_w == out_w && in_d == out_d) {
framework::TensorCopy(*input, ctx.GetPlace(), output);
return;
}
        // do transpose on input tensor (NCDHW -> NDHWC), then do interpolation
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_NCDHW, ToCnnlDataType(input->dtype()));
transformed_input =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_in_trans, dev_ctx);
transformed_output =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(dim_out_trans, dev_ctx);
MLUCnnlTensorDesc input_reshaped_desc(
transformed_input,
CNNL_LAYOUT_NDHWC,
ToCnnlDataType(transformed_input.dtype()));
const std::vector<int> perm = {0, 2, 3, 4, 1};
MLUCnnl::Transpose(ctx,
perm,
input_dims.size(),
input_desc.get(),
GetBasePtr(input),
input_reshaped_desc.get(),
GetBasePtr(&transformed_input));
} else {
// if no need_transpose, do the following
// 1. interpolation in(NDHWC) -> out(NDHWC)
// dim_in = {n, in_d, in_h, in_w, c};
dim_out = {n, out_d, out_h, out_w, c};
output->mutable_data<T>(dim_out, ctx.GetPlace());
if (in_h == out_h && in_w == out_w && in_d == out_d) {
framework::TensorCopy(*input, ctx.GetPlace(), output);
return;
}
transformed_input = *input;
transformed_output = *output;
}
MLUCnnlTensorDesc input_desc(transformed_input,
CNNL_LAYOUT_NDHWC,
ToCnnlDataType(transformed_input.dtype()));
MLUCnnlTensorDesc output_desc(transformed_output,
CNNL_LAYOUT_NDHWC,
ToCnnlDataType(transformed_output.dtype()));
      // use trilinear mode for 5-D (NCDHW) input
MLUCnnl::Interp(ctx,
GetMLUCnnlInterpMode(interp_method),
align_corners,
align_center,
input_desc.get(),
GetBasePtr(&transformed_input),
output_desc.get(),
GetBasePtr(&transformed_output));
if (need_transpose) {
// if need_transpose, reshape output back (NDHWC -> NCDHW)
const std::vector<int> perm = {0, 4, 1, 2, 3};
MLUCnnlTensorDesc output_reshape_desc(
*output, CNNL_LAYOUT_NCDHW, ToCnnlDataType(output->dtype()));
MLUCnnl::Transpose(ctx,
perm,
dim_out_trans.size(),
output_desc.get(),
GetBasePtr(&transformed_output),
output_reshape_desc.get(),
GetBasePtr(output));
}
}
}
};
template <typename T>
class InterpolateV2GradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
auto* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto output_grad_dims = output_grad->dims();
PADDLE_ENFORCE_EQ(output_grad_dims.size(),
4,
                      platform::errors::External(
                          "MLU InterpolateGrad kernel only supports 4-D "
                          "(2-D spatial) input."));
auto* input = ctx.Input<phi::DenseTensor>("X");
auto input_dims = input->dims();
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout = phi::StringToDataLayout(data_layout_str);
int n, c, in_d, in_h, in_w;
ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
auto interp_method = ctx.Attr<std::string>("interp_method");
bool align_corners = ctx.Attr<bool>("align_corners");
int align_mode = ctx.Attr<int>("align_mode");
int align_center = align_corners ? 0 : (align_mode == 0 ? 0 : 1);
align_center = 0;
int out_h = ctx.Attr<int>("out_h");
int out_w = ctx.Attr<int>("out_w");
float scale_h = -1;
float scale_w = -1;
auto list_new_size_tensor = ctx.MultiInput<phi::DenseTensor>("SizeTensor");
if (list_new_size_tensor.size() > 0) {
// have size tensor
auto new_size = get_new_shape_mlu(list_new_size_tensor);
out_h = new_size[0];
out_w = new_size[1];
} else {
auto scale_tensor = ctx.Input<phi::DenseTensor>("Scale");
auto scale = ctx.Attr<std::vector<float>>("scale");
if (scale_tensor != nullptr) {
std::vector<float> scale_data;
scale_data = phi::GetVectorFromTensor<float>(scale_tensor);
if (scale_data.size() > 1) {
scale_h = scale_data[0];
scale_w = scale_data[1];
} else {
scale_h = scale_data[0];
scale_w = scale_data[0];
}
PADDLE_ENFORCE_EQ(
scale_w > 0 && scale_h > 0,
true,
platform::errors::InvalidArgument("scale of Op(interpolate) "
"should be greater than 0."));
} else {
if (scale.size() > 1) {
scale_h = scale[0];
scale_w = scale[1];
PADDLE_ENFORCE_EQ(
scale_w > 0 && scale_h > 0,
true,
platform::errors::InvalidArgument("scale of Op(interpolate) "
"should be greater than 0."));
}
}
if (scale_h > 0. && scale_w > 0.) {
out_h = static_cast<int>(in_h * scale_h);
out_w = static_cast<int>(in_w * scale_w);
}
auto out_size = ctx.Input<phi::DenseTensor>("OutSize");
if (out_size != nullptr) {
std::vector<int32_t> out_size_data;
out_size_data = phi::GetVectorFromTensor<int>(out_size);
out_h = out_size_data[0];
out_w = out_size_data[1];
}
}
framework::DDim dim_grad;
framework::DDim dim_out_grad, dim_out_trans_grad, dim_in_grad,
dim_in_trans_grad;
phi::DenseTensor transformed_output_grad, transformed_input_grad;
bool need_transpose =
input_dims.size() != 2 && data_layout == DataLayout::kNCHW;
if (need_transpose) {
// if need_transpose, do the following
// 1. transpose output_grad NCHW -> NHWC
// 2. InterpBackward output_grad(NHWC) -> input_grad(NHWC)
      // 3. transpose input_grad NHWC -> NCHW
// dim_out_grad = {n, c, out_h, out_w};
dim_out_trans_grad = {n, out_h, out_w, c};
dim_in_grad = {n, c, in_h, in_w};
dim_in_trans_grad = {n, in_h, in_w, c};
input_grad->mutable_data<T>(dim_in_grad, ctx.GetPlace());
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
return;
}
// do transpose on input tensor, then do interpolation
MLUCnnlTensorDesc input_desc(
*output_grad, CNNL_LAYOUT_NCHW, ToCnnlDataType(output_grad->dtype()));
transformed_output_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
dim_out_trans_grad, dev_ctx);
transformed_input_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
dim_in_trans_grad, dev_ctx);
MLUCnnlTensorDesc input_reshaped_desc(
transformed_output_grad,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_output_grad.dtype()));
const std::vector<int> perm = {0, 2, 3, 1};
MLUCnnl::Transpose(ctx,
perm,
input_dims.size(),
input_desc.get(),
GetBasePtr(output_grad),
input_reshaped_desc.get(),
GetBasePtr(&transformed_output_grad));
} else {
// if no need_transpose, do the following
// 1. InterpBackward output_grad(NHWC) -> input_grad(NHWC)
dim_in_grad = {n, in_h, in_w, c};
input_grad->mutable_data<T>(dim_in_grad, ctx.GetPlace());
if (in_h == out_h && in_w == out_w) {
framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad);
return;
}
transformed_output_grad = *output_grad;
transformed_input_grad = *input_grad;
}
MLUCnnlTensorDesc input_desc(
transformed_output_grad,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_output_grad.dtype()));
MLUCnnlTensorDesc output_desc(
transformed_input_grad,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(transformed_input_grad.dtype()));
MLUCnnl::InterpBackward(ctx,
GetMLUCnnlInterpBackwardMode(interp_method),
align_corners,
align_center,
input_desc.get(),
GetBasePtr(&transformed_output_grad),
output_desc.get(),
GetBasePtr(&transformed_input_grad));
if (need_transpose) {
const std::vector<int> perm = {0, 3, 1, 2};
MLUCnnlTensorDesc output_reshape_desc(
*input_grad, CNNL_LAYOUT_NCHW, ToCnnlDataType(input_grad->dtype()));
MLUCnnl::Transpose(ctx,
perm,
dim_in_trans_grad.size(),
output_desc.get(),
GetBasePtr(&transformed_input_grad),
output_reshape_desc.get(),
GetBasePtr(input_grad));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(bilinear_interp_v2,
ops::InterpolateV2MLUKernel<float>,
ops::InterpolateV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(nearest_interp_v2,
ops::InterpolateV2MLUKernel<float>,
ops::InterpolateV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(nearest_interp_v2_grad,
ops::InterpolateV2GradMLUKernel<float>,
ops::InterpolateV2GradMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(bilinear_interp_v2_grad,
ops::InterpolateV2GradMLUKernel<float>,
ops::InterpolateV2GradMLUKernel<plat::float16>);
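// A compact sketch of how the forward kernel above resolves the output height
// and width for the 4-D case: an explicit SizeTensor wins, otherwise the scale
// (tensor or attribute) rescales the input size, and OutSize, when present,
// overrides the result. Names below are illustrative only.
#include <vector>

struct InterpOutHW {
  int out_h;
  int out_w;
};

inline InterpOutHW ResolveOutHW(int in_h, int in_w, int attr_out_h,
                                int attr_out_w, float scale_h, float scale_w,
                                const std::vector<int> *size_tensor,
                                const std::vector<int> *out_size) {
  // illustrative helper, not a Paddle API
  if (size_tensor != nullptr) return {(*size_tensor)[0], (*size_tensor)[1]};
  int out_h = attr_out_h, out_w = attr_out_w;
  if (scale_h > 0.f && scale_w > 0.f) {
    out_h = static_cast<int>(in_h * scale_h);
    out_w = static_cast<int>(in_w * scale_w);
  }
  if (out_size != nullptr) {
    out_h = (*out_size)[0];
    out_w = (*out_size)[1];
  }
  return {out_h, out_w};
}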
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class LabelSmoothMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_t = ctx.Input<phi::DenseTensor>("X");
auto* dist_t = ctx.Input<phi::DenseTensor>("PriorDist");
auto* out_t = ctx.Output<phi::DenseTensor>("Out");
auto epsilon = ctx.Attr<float>("epsilon");
auto epsilon_gt = 1.0f - epsilon;
if (in_t->numel() == 0) return;
out_t->mutable_data<T>(ctx.GetPlace());
auto label_dim = in_t->dims()[in_t->dims().size() - 1];
MLUCnnlTensorDesc x_desc(*in_t);
MLUCnnlTensorDesc out_desc(*out_t);
auto data_type = ToCnnlDataType<T>();
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_ADD, data_type, CNNL_NOT_PROPAGATE_NAN);
if (ctx.HasInput("PriorDist")) {
MLUCnnlTensorDesc dist_desc(*dist_t);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
x_desc.get(),
GetBasePtr(in_t),
dist_desc.get(),
GetBasePtr(dist_t),
out_desc.get(),
GetBasePtr(out_t),
data_type,
epsilon_gt,
epsilon);
} else {
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor dist_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1, label_dim}, dev_ctx);
MLUCnnlTensorDesc dist_desc(dist_tensor);
auto value = static_cast<T>(1.0f / label_dim);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value,
dist_desc.get(),
GetBasePtr(&dist_tensor));
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
x_desc.get(),
GetBasePtr(in_t),
dist_desc.get(),
GetBasePtr(&dist_tensor),
out_desc.get(),
GetBasePtr(out_t),
data_type,
epsilon_gt,
epsilon);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(label_smooth,
ops::LabelSmoothMLUKernel<float>,
ops::LabelSmoothMLUKernel<plat::float16>);
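// Element-wise reference for the kernel above: with a prior distribution the
// output is (1 - epsilon) * x + epsilon * dist, and without one the prior
// defaults to the uniform value 1 / label_dim. A scalar sketch for
// illustration only:
inline float LabelSmoothRef(float x, float epsilon, float prior) {
  return (1.0f - epsilon) * x + epsilon * prior;  // prior = 1/label_dim if no PriorDist
}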
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
using DDim = framework::DDim;
template <typename T>
class LayerNormMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto epsilon = ctx.Attr<float>("epsilon");
const auto* x = ctx.Input<phi::DenseTensor>("X");
const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
const auto* bias = ctx.Input<phi::DenseTensor>("Bias");
auto* y = ctx.Output<phi::DenseTensor>("Y");
auto* mean = ctx.Output<phi::DenseTensor>("Mean");
auto* variance = ctx.Output<phi::DenseTensor>("Variance");
auto place = ctx.GetPlace();
y->mutable_data<T>(place);
mean->mutable_data<T>(place);
variance->mutable_data<T>(place);
const auto& x_dims = x->dims();
std::vector<int> scale_bias_axes;
std::vector<int> mean_var_axes;
for (auto i = 0; i < x_dims.size(); ++i) {
if (i >= begin_norm_axis) {
scale_bias_axes.push_back(x_dims[i]);
} else {
mean_var_axes.push_back(x_dims[i]);
}
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc mean_var_desc(
mean_var_axes.size(), mean_var_axes.data(), ToCnnlDataType<T>());
// cnnl only support both of scale and bias is NULL or not.
if (!scale && !bias) {
MLUCnnl::LayerNormForward(ctx,
begin_norm_axis,
x_desc.get(),
GetBasePtr(x),
nullptr /*scale_bias_desc*/,
nullptr /*scale*/,
nullptr /*bias*/,
epsilon,
y_desc.get(),
GetBasePtr(y),
mean_var_desc.get(),
GetBasePtr(mean),
GetBasePtr(variance));
} else {
phi::DenseTensor tmp_scale(x->dtype());
if (!scale) {
tmp_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(1), &tmp_scale);
} else {
tmp_scale = *scale;
}
phi::DenseTensor tmp_bias(x->dtype());
if (!bias) {
tmp_bias.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), &tmp_bias);
} else {
tmp_bias = *bias;
}
// scale and bias should have same type with x/y
MLUCnnlTensorDesc float32_desc(
scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_FLOAT);
MLUCnnlTensorDesc float16_desc(
scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_HALF);
cnnlCastDataType_t cast_type = GetCastDataType(VT::FP32, VT::FP16);
phi::DenseTensor final_scale(x->dtype());
if (final_scale.dtype() == DataType::FLOAT16 &&
tmp_scale.dtype() == DataType::FLOAT32) {
final_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
// cast scale to fp16
MLUCnnl::Cast(ctx,
cast_type,
float32_desc.get(),
GetBasePtr(&tmp_scale),
float16_desc.get(),
GetBasePtr(&final_scale));
} else {
final_scale = tmp_scale;
}
phi::DenseTensor final_bias(x->dtype());
if (final_bias.dtype() == DataType::FLOAT16 &&
tmp_bias.dtype() == DataType::FLOAT32) {
final_bias.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
// cast bias to fp16
MLUCnnl::Cast(ctx,
cast_type,
float32_desc.get(),
GetBasePtr(&tmp_bias),
float16_desc.get(),
GetBasePtr(&final_bias));
} else {
final_bias = tmp_bias;
}
MLUCnnlTensorDesc scale_bias_desc(
scale_bias_axes.size(), scale_bias_axes.data(), ToCnnlDataType<T>());
MLUCnnl::LayerNormForward(ctx,
begin_norm_axis,
x_desc.get(),
GetBasePtr(x),
scale_bias_desc.get(),
GetBasePtr(&final_scale),
GetBasePtr(&final_bias),
epsilon,
y_desc.get(),
GetBasePtr(y),
mean_var_desc.get(),
GetBasePtr(mean),
GetBasePtr(variance));
}
}
};
template <typename T>
class LayerNormGradMLUKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto* x = ctx.Input<phi::DenseTensor>("X");
const auto* mean = ctx.Input<phi::DenseTensor>("Mean");
const auto* variance = ctx.Input<phi::DenseTensor>("Variance");
const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
const auto* dy = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dscale =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
auto* dbias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
auto place = ctx.GetPlace();
dx->mutable_data<T>(place);
const auto& x_dims = x->dims();
std::vector<int> scale_bias_axes;
std::vector<int> mean_var_axes;
for (auto i = 0; i < x_dims.size(); ++i) {
if (i >= begin_norm_axis) {
scale_bias_axes.push_back(x_dims[i]);
} else {
mean_var_axes.push_back(x_dims[i]);
}
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnlTensorDesc mean_var_desc(
mean_var_axes.size(), mean_var_axes.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dx_desc(*dx);
phi::DenseTensor tmp_scale(x->dtype());
if (!scale) {
tmp_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(1), &tmp_scale);
} else {
tmp_scale = *scale;
}
MLUCnnlTensorDesc float32_desc(
scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_FLOAT);
MLUCnnlTensorDesc float16_desc(
scale_bias_axes.size(), scale_bias_axes.data(), CNNL_DTYPE_HALF);
cnnlCastDataType_t cast_fp32_to_fp16 = GetCastDataType(VT::FP32, VT::FP16);
cnnlCastDataType_t cast_fp16_to_fp32 = GetCastDataType(VT::FP16, VT::FP32);
phi::DenseTensor final_scale(x->dtype());
if (final_scale.dtype() == DataType::FLOAT16 &&
tmp_scale.dtype() == DataType::FLOAT32) {
final_scale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
// cast scale to fp16
MLUCnnl::Cast(ctx,
cast_fp32_to_fp16,
float32_desc.get(),
GetBasePtr(&tmp_scale),
float16_desc.get(),
GetBasePtr(&final_scale));
} else {
final_scale = tmp_scale;
}
phi::DenseTensor tmp_dscale(x->dtype());
if (dscale && (tmp_dscale.dtype() == dscale->dtype())) {
dscale->mutable_data<T>(place);
tmp_dscale = *dscale;
} else {
tmp_dscale.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
}
phi::DenseTensor tmp_dbias(x->dtype());
if (dbias && (tmp_dbias.dtype() == dbias->dtype())) {
dbias->mutable_data<T>(place);
tmp_dbias = *dbias;
} else {
tmp_dbias.mutable_data<T>(phi::make_ddim(scale_bias_axes), place);
}
MLUCnnlTensorDesc scale_desc(
scale_bias_axes.size(), scale_bias_axes.data(), ToCnnlDataType<T>());
MLUCnnl::LayerNormBackward(ctx,
begin_norm_axis,
x_desc.get(),
GetBasePtr(x),
dy_desc.get(),
GetBasePtr(dy),
scale_desc.get(),
GetBasePtr(&final_scale),
mean_var_desc.get(),
GetBasePtr(mean),
GetBasePtr(variance),
dx_desc.get(),
GetBasePtr(dx),
GetBasePtr(&tmp_dscale),
GetBasePtr(&tmp_dbias));
if (dscale && (tmp_dscale.dtype() == DataType::FLOAT16 &&
dscale->dtype() == DataType::FLOAT32)) {
dscale->mutable_data<MPDType>(place);
MLUCnnl::Cast(ctx,
cast_fp16_to_fp32,
float16_desc.get(),
GetBasePtr(&tmp_dscale),
float32_desc.get(),
GetBasePtr(dscale));
}
if (dbias && (tmp_dbias.dtype() == DataType::FLOAT16 &&
dbias->dtype() == DataType::FLOAT32)) {
dbias->mutable_data<MPDType>(place);
MLUCnnl::Cast(ctx,
cast_fp16_to_fp32,
float16_desc.get(),
GetBasePtr(&tmp_dbias),
float32_desc.get(),
GetBasePtr(dbias));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(layer_norm,
ops::LayerNormMLUKernel<float>,
ops::LayerNormMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(layer_norm_grad,
ops::LayerNormGradMLUKernel<float>,
ops::LayerNormGradMLUKernel<plat::float16>);
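// Reference math for one normalized row (the dimensions at and after
// begin_norm_axis), assuming the conventional layer-norm formula with biased
// variance; standard library only, for illustration.
#include <cmath>
#include <cstddef>
#include <vector>

inline std::vector<float> LayerNormRowRef(const std::vector<float> &x,
                                          const std::vector<float> &scale,
                                          const std::vector<float> &bias,
                                          float epsilon) {
  // illustrative reference, not the CNNL computation itself
  const size_t n = x.size();
  float mean = 0.f, var = 0.f;
  for (float v : x) mean += v;
  mean /= static_cast<float>(n);
  for (float v : x) var += (v - mean) * (v - mean);
  var /= static_cast<float>(n);
  const float inv_std = 1.f / std::sqrt(var + epsilon);
  std::vector<float> y(n);
  for (size_t i = 0; i < n; ++i) {
    y[i] = scale[i] * (x[i] - mean) * inv_std + bias[i];
  }
  return y;
}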
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class LookupTableV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *ids_t = ctx.Input<phi::DenseTensor>("Ids"); // int tensor
auto *output_t = ctx.Output<phi::DenseTensor>("Out"); // float tensor
auto *table_t = ctx.Input<phi::DenseTensor>("W");
int padding_idx = static_cast<int>(ctx.Attr<int64_t>("padding_idx"));
auto *table_var = ctx.InputVar("W");
PADDLE_ENFORCE_EQ(
table_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument("mlu only accept phi::DenseTensor"));
output_t->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc ids_desc(*ids_t);
MLUCnnlTensorDesc table_desc(*table_t);
MLUCnnlTensorDesc output_desc(*output_t);
MLUCnnl::EmbeddingForward(ctx,
padding_idx,
table_desc.get(),
GetBasePtr(table_t),
ids_desc.get(),
static_cast<const int *>(GetBasePtr(ids_t)),
output_desc.get(),
GetBasePtr(output_t));
}
};
template <typename T>
class LookupTableV2GradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *table_var = ctx.InputVar("W");
PADDLE_ENFORCE_EQ(
table_var->IsType<phi::DenseTensor>(),
true,
platform::errors::PermissionDenied(
"Unsupported Variable Type , idx in "
"LookupTableV2GradMLUKernel should be phi::DenseTensor."));
bool is_sparse = ctx.Attr<bool>("is_sparse");
PADDLE_ENFORCE_EQ(
is_sparse,
false,
platform::errors::InvalidArgument(
"LookupTableV2GradMLUKernel dose NOT support is_sparse = True."));
auto *ids_t = ctx.Input<phi::DenseTensor>("Ids");
auto *output_grad_t =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto *table_grad_t =
ctx.Output<phi::DenseTensor>(framework::GradVarName("W"));
table_grad_t->mutable_data<T>(ctx.GetPlace());
int padding_idx = static_cast<int>(ctx.Attr<int64_t>("padding_idx"));
int64_t ids_numel = ids_t->numel();
PADDLE_ENFORCE_EQ(
ids_numel <= std::numeric_limits<int32_t>::max(),
true,
platform::errors::OutOfRange(
"Number of ids greater than int32_t::max , please check "
"number of ids in LookupTableV2GradMLUKernel."));
phi::DenseTensor ids_int32(ids_t->dtype());
if (ids_t->dtype() != DataType::INT32) {
ids_int32.mutable_data<int>(ids_t->dims(), ctx.GetPlace());
MLUCnnlTensorDesc ids_desc(*ids_t);
MLUCnnlTensorDesc ids_int32_desc(ids_int32);
auto cast_type = GetCastDataType(ids_t->dtype(), DataType::INT32);
MLUCnnl::Cast(ctx,
cast_type,
ids_desc.get(),
GetBasePtr(ids_t),
ids_int32_desc.get(),
GetBasePtr(&ids_int32));
} else {
ids_int32 = *ids_t;
}
MLUCnnlTensorDesc ids_int32_desc(ids_int32);
MLUCnnlTensorDesc output_grad_desc(*output_grad_t);
MLUCnnlTensorDesc table_grad_desc(*table_grad_t);
MLUCnnl::EmbeddingBackward(ctx,
padding_idx,
false,
ids_int32_desc.get(),
GetBasePtr(&ids_int32),
output_grad_desc.get(),
GetBasePtr(output_grad_t),
table_grad_desc.get(),
GetBasePtr(table_grad_t));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(lookup_table_v2,
ops::LookupTableV2MLUKernel<float>,
ops::LookupTableV2MLUKernel<int>,
ops::LookupTableV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(lookup_table_v2_grad,
ops::LookupTableV2GradMLUKernel<float>,
ops::LookupTableV2GradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class MaskedSelectedMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto input = ctx.Input<phi::DenseTensor>("X");
auto mask = ctx.Input<phi::DenseTensor>("Mask");
auto out = ctx.Output<phi::DenseTensor>("Y");
auto input_dim = input->dims();
auto mask_dim = mask->dims();
PADDLE_ENFORCE_EQ(
input_dim,
mask_dim,
platform::errors::InvalidArgument(
"The dim size of input and mask in OP(masked_selected) "
"must be equal, but got input dim:(%ld), mask dim: "
"(%ld). Please check input "
"value.",
input_dim,
mask_dim));
phi::DenseTensor number(framework::TransToPhiDataType(VT::INT32));
void* number_ptr = number.mutable_data<int32_t>({1}, ctx.GetPlace());
out->Resize(mask->dims());
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc mask_desc(*mask);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Mask(ctx,
CNNL_MASKED_SELECT,
input_desc.get(),
GetBasePtr(input),
mask_desc.get(),
GetBasePtr(mask),
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out),
static_cast<uint32_t*>(number_ptr));
}
};
template <typename T>
class MaskedSelectedGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto mask = ctx.Input<phi::DenseTensor>("Mask");
auto y_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
auto x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
phi::DenseTensor mask_int32, out_size;
std::vector<int32_t> out_size_vec;
mask_int32.mutable_data<int32_t>(mask->dims(), ctx.GetPlace());
out_size.mutable_data<int32_t>({1}, ctx.GetPlace());
MLUCnnlTensorDesc mask_desc(*mask);
MLUCnnlTensorDesc mask_int32_desc(mask_int32);
MLUCnnlTensorDesc out_size_desc(out_size);
auto cast_type = GetCastDataType(mask->dtype(), DataType::INT32);
MLUCnnl::Cast(ctx,
cast_type,
mask_desc.get(),
GetBasePtr(mask),
mask_int32_desc.get(),
GetBasePtr(&mask_int32));
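// Sum the int32 mask over all dims to count how many elements were selected.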
auto mask_int32_dim = phi::vectorize(mask_int32.dims());
std::vector<int32_t> reduce_dims;
for (size_t i = 0; i < mask_int32_dim.size(); i++) {
reduce_dims.push_back(static_cast<int>(i));
}
std::string reduce_name = "reduce_sum";
cnnlReduceOp_t reduce_op = GetMLUCnnlReduceOp(reduce_name);
MLUCnnlReduceDesc reduce_desc(reduce_dims,
reduce_op,
ToCnnlDataType<int32_t>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true,
reduce_desc.get(),
nullptr,
mask_int32_desc.get(),
GetBasePtr(&mask_int32),
0,
nullptr,
nullptr,
out_size_desc.get(),
GetBasePtr(&out_size));
paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec);
dev_ctx.Wait();
phi::DenseTensor mask_int32_tmp;
mask_int32_tmp.ShareDataWith(mask_int32);
mask_int32_tmp.Resize({mask_int32.numel()});
phi::DenseTensor topk_v2_out(framework::TransToPhiDataType(VT::INT32)),
indices_int32(framework::TransToPhiDataType(VT::INT32));
topk_v2_out.mutable_data<int32_t>({mask_int32.numel()}, ctx.GetPlace());
indices_int32.mutable_data<int32_t>({mask_int32.numel()}, ctx.GetPlace());
MLUCnnlTensorDesc topk_v2_out_desc(topk_v2_out);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnlTensorDesc mask_int32_tmp_desc(mask_int32_tmp);
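// Use TopK over the flattened 0/1 mask to collect the indices of the
// selected positions.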
const int dim = 0;
MLUCnnl::TopK(ctx,
mask_int32.numel(),
dim,
true,
false,
mask_int32_tmp_desc.get(),
GetBasePtr(&mask_int32_tmp),
topk_v2_out_desc.get(),
GetBasePtr(&topk_v2_out),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
auto stream = ctx.template device_context<MLUDeviceContext>().stream();
phi::DenseTensor indices_int32_out;
indices_int32_out.mutable_data<int32_t>({out_size_vec[0]}, ctx.GetPlace());
memory::Copy(ctx.GetPlace(),
GetBasePtr(&indices_int32_out),
ctx.GetPlace(),
GetBasePtr(&indices_int32),
out_size_vec[0] * sizeof(int32_t),
stream);
phi::DenseTensor y_grad_tmp_out;
y_grad_tmp_out.mutable_data<T>({out_size_vec[0]}, ctx.GetPlace());
MLUCnnlTensorDesc y_grad_tmp_out_desc(y_grad_tmp_out);
memory::Copy(ctx.GetPlace(),
GetBasePtr(&y_grad_tmp_out),
ctx.GetPlace(),
GetBasePtr(y_grad),
out_size_vec[0] * sizeof(T),
stream);
phi::DenseTensor indices_int32_tmp;
indices_int32_tmp.ShareDataWith(indices_int32_out);
indices_int32_tmp.Resize({out_size_vec[0], 1});
MLUCnnlTensorDesc indices_int32_tmp_desc(indices_int32_tmp);
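// Scatter the selected gradients back to their original flattened
// positions in x_grad.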
const cnnlScatterNdMode_t mode = CNNL_SCATTERND_UPDATE;
x_grad->Resize({x_grad->numel()});
x_grad->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_grad_desc(*x_grad);
MLUCnnl::ScatterNd(ctx,
mode,
indices_int32_tmp_desc.get(),
GetBasePtr(&indices_int32_tmp),
y_grad_tmp_out_desc.get(),
GetBasePtr(&y_grad_tmp_out),
nullptr,
nullptr,
x_grad_desc.get(),
GetBasePtr(x_grad));
x_grad->Resize(mask->dims());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(masked_select,
ops::MaskedSelectedMLUKernel<float>,
ops::MaskedSelectedMLUKernel<int>,
ops::MaskedSelectedMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(masked_select_grad,
ops::MaskedSelectedGradMLUKernel<float>,
ops::MaskedSelectedGradMLUKernel<int>,
ops::MaskedSelectedGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
static void Mul(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const float alpha) {
Out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out),
ToCnnlDataType<T>(),
alpha);
}
template <typename T>
static void MatMul2D(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const bool trans_x,
const bool trans_y,
const float alpha) {
Out->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE_LT(fabs(alpha - 1.0),
std::numeric_limits<float>::epsilon(),
platform::errors::InvalidArgument(
"MLU(matmul): alpha should be equal to 1.0! "
"Other values are not supported yet."
"But received alpha is %d.",
alpha));
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Matmul(ctx,
trans_x,
trans_y,
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out));
}
template <typename T>
static void MatMulND(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const bool trans_x,
const bool trans_y,
const float alpha) {
if (!Out->initialized()) {
Out->mutable_data<T>(ctx.GetPlace());
}
PADDLE_ENFORCE_LT(fabs(alpha - 1.0),
std::numeric_limits<float>::epsilon(),
platform::errors::InvalidArgument(
"MLU(matmul): alpha should be equal to 1.0! "
"Other values are not supported yet."
"But received alpha is %d.",
alpha));
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::BatchMatmul(ctx,
trans_x,
trans_y,
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out));
}
template <typename T>
static void ReduceDims(const framework::ExecutionContext& ctx,
const std::vector<int64_t>& dims,
const std::vector<int64_t>& bcast_dims,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
std::vector<int64_t> axes;
int64_t size = bcast_dims.size();
int64_t diff = bcast_dims.size() - dims.size();
for (int64_t i = 0; i < size; ++i) {
if (i < diff) {
axes.push_back(i);
continue;
}
if (bcast_dims[i] > dims[i - diff]) {
axes.push_back(i);
}
}
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
std::vector<int> reduce_dims(axes.begin(), axes.end());
MLUCnnlReduceDesc reduce_desc(reduce_dims,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_desc.get(),
nullptr,
in_desc.get(),
GetBasePtr(&in),
0 /*indices_size*/,
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out));
}
template <typename T>
class MatMulMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Y = ctx.Input<phi::DenseTensor>("Y");
auto* Out = ctx.Output<phi::DenseTensor>("Out");
bool transpose_x = ctx.Attr<bool>("transpose_X");
bool transpose_y = ctx.Attr<bool>("transpose_Y");
float alpha = ctx.Attr<float>("alpha");
std::vector<int64_t> x_dims = phi::vectorize(X->dims());
std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
std::vector<int64_t> out_dims = phi::vectorize(Out->dims());
int x_ndim = x_dims.size();
int y_ndim = y_dims.size();
// Case 1: [K] x [K] = [1]
// Equal: [1, K] x [K, 1] = [1, 1] => [1]
const bool all_one_dim = (x_ndim == 1 && y_ndim == 1);
if (all_one_dim) {
Out->Resize({1, 1});
}
// Resize dim 1 to 2
phi::DenseTensor x_temp, y_temp;
x_temp.ShareDataWith(*X);
y_temp.ShareDataWith(*Y);
if (x_ndim == 1) {
x_dims.insert(x_dims.begin(), 1);
x_temp.Resize(phi::make_ddim(x_dims));
x_ndim = 2;
// matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
if (out_dims.size() < y_dims.size()) {
std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
temp_out_dims.insert(temp_out_dims.end() - 1, 1);
Out->Resize(phi::make_ddim(temp_out_dims));
}
}
if (y_ndim == 1) {
y_dims.push_back(1);
y_temp.Resize(phi::make_ddim(y_dims));
y_ndim = 2;
// matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
if (out_dims.size() < x_dims.size()) {
std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
temp_out_dims.push_back(1);
Out->Resize(phi::make_ddim(temp_out_dims));
}
}
const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
if (transpose_y) {
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 1],
K,
platform::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 1,
K,
y_ndim - 1,
y_dims[y_ndim - 1]));
} else {
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 2],
K,
platform::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 2,
K,
y_ndim - 2,
y_dims[y_ndim - 2]));
}
if (x_ndim == 2 && y_ndim == 2) {
// Case 2: [M, K] x [K, N] = [M, N]
MatMul2D<T>(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha);
} else {
// Case 3: [B, M, K] x [K, N] = [B, M, N]
// Case 4: [B, M, K] x [B, K, N] = [B, M, N]
MatMulND<T>(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha);
}
if (phi::vectorize(Out->dims()) != out_dims) {
Out->Resize(phi::make_ddim(out_dims));
}
}
};
template <typename T>
class MatMulGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Y = ctx.Input<phi::DenseTensor>("Y");
auto* dOut = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dY = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
bool transpose_x = ctx.Attr<bool>("transpose_X");
bool transpose_y = ctx.Attr<bool>("transpose_Y");
float alpha = ctx.Attr<float>("alpha");
std::vector<int64_t> x_dims = phi::vectorize(X->dims());
std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
std::vector<int64_t> out_dims = phi::vectorize(dOut->dims());
int x_ndim = x_dims.size();
int y_ndim = y_dims.size();
int out_ndim = out_dims.size();
// Case 1: [K] x [K] = [1]
if (x_ndim == 1 && y_ndim == 1) {
if (dX) {
Mul<T>(ctx, *dOut, *Y, dX, alpha);
}
if (dY) {
Mul<T>(ctx, *dOut, *X, dY, alpha);
}
return;
}
// Resize dim 1 to 2
phi::DenseTensor x_temp, y_temp, dout_temp;
x_temp.ShareDataWith(*X);
y_temp.ShareDataWith(*Y);
dout_temp.ShareDataWith(*dOut);
if (x_ndim == 1) {
x_dims.insert(x_dims.begin(), 1);
out_dims.insert(out_dims.end() - 1, 1);
x_temp.Resize(phi::make_ddim(x_dims));
dout_temp.Resize(phi::make_ddim(out_dims));
x_ndim = 2;
out_ndim += 1;
}
if (y_ndim == 1) {
y_dims.push_back(1);
out_dims.push_back(1);
y_temp.Resize(phi::make_ddim(y_dims));
dout_temp.Resize(phi::make_ddim(out_dims));
y_ndim = 2;
out_ndim += 1;
}
// Case 2: [M, K] x [K, N] = [M, N]
if (out_ndim == 2) {
if (dX) {
dX->Resize(phi::make_ddim(x_dims));
if (transpose_x) {
MatMul2D<T>(ctx, y_temp, dout_temp, dX, transpose_y, true, alpha);
} else {
MatMul2D<T>(ctx, dout_temp, y_temp, dX, false, !transpose_y, alpha);
}
dX->Resize(X->dims());
}
if (dY) {
dY->Resize(phi::make_ddim(y_dims));
if (transpose_y) {
MatMul2D<T>(ctx, dout_temp, x_temp, dY, true, transpose_x, alpha);
} else {
MatMul2D<T>(ctx, x_temp, dout_temp, dY, !transpose_x, false, alpha);
}
dY->Resize(Y->dims());
}
return;
}
// Case 3: [B, M, K] x [K, N] = [B, M, N]
// Case 4: [B, M, K] x [B, K, N] = [B, M, N]
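// Broadcast the batch dims of x and y to match dout, run batched matmul,
// then reduce-sum the broadcast dims back into dX / dY.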
std::vector<int64_t> x_bcast_dims(out_ndim, 1);
std::vector<int64_t> y_bcast_dims(out_ndim, 1);
std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin());
std::copy(out_dims.begin(), out_dims.end() - 2, y_bcast_dims.begin());
std::copy(x_dims.end() - 2, x_dims.end(), x_bcast_dims.end() - 2);
std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2);
if (dX) {
phi::DenseTensor dx_temp(X->type());
if (x_dims != x_bcast_dims) {
dx_temp.Resize(phi::make_ddim(x_bcast_dims));
} else {
dX->mutable_data<T>(ctx.GetPlace());
dx_temp.ShareDataWith(*dX);
}
if (transpose_x) {
MatMulND<T>(ctx, y_temp, dout_temp, &dx_temp, transpose_y, true, alpha);
} else {
MatMulND<T>(
ctx, dout_temp, y_temp, &dx_temp, false, !transpose_y, alpha);
}
if (x_dims != x_bcast_dims) {
ReduceDims<T>(ctx, x_dims, x_bcast_dims, dx_temp, dX);
}
}
if (dY) {
phi::DenseTensor dy_temp(Y->type());
if (y_dims != y_bcast_dims) {
dy_temp.Resize(phi::make_ddim(y_bcast_dims));
} else {
dY->mutable_data<T>(ctx.GetPlace());
dy_temp.ShareDataWith(*dY);
}
if (transpose_y) {
MatMulND<T>(ctx, dout_temp, x_temp, &dy_temp, true, transpose_x, alpha);
} else {
MatMulND<T>(
ctx, x_temp, dout_temp, &dy_temp, !transpose_x, false, alpha);
}
if (y_dims != y_bcast_dims) {
ReduceDims<T>(ctx, y_dims, y_bcast_dims, dy_temp, dY);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(matmul,
ops::MatMulMLUKernel<float>,
ops::MatMulMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(matmul_grad,
ops::MatMulGradMLUKernel<float>,
ops::MatMulGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
static void Mul(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out) {
Out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out),
ToCnnlDataType<T>());
}
template <typename T>
static void MatMul2D(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const bool trans_x,
const bool trans_y) {
Out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Matmul(ctx,
trans_x,
trans_y,
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out));
}
template <typename T>
static void MatMul2DwithReduceBatch(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const bool trans_x,
const bool trans_y) {
if (!Out->initialized()) {
Out->mutable_data<T>(ctx.GetPlace());
}
// Fold the batch dim into the row dim ([B, M, K] -> [B*M, K]) so that a
// single 2-D matmul also reduces over the batch dimension.
std::vector<int64_t> x_dims = phi::vectorize(X.dims());
std::vector<int64_t> y_dims = phi::vectorize(Y.dims());
std::vector<int> realx_dims(
{static_cast<int>(x_dims[0] * x_dims[1]), static_cast<int>(x_dims[2])});
std::vector<int> realy_dims(
{static_cast<int>(y_dims[0] * y_dims[1]), static_cast<int>(y_dims[2])});
MLUCnnlTensorDesc x_desc(2, realx_dims.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(2, realy_dims.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Matmul(ctx,
trans_x,
trans_y,
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out));
}
template <typename T>
static void MatMulND(const framework::ExecutionContext& ctx,
const phi::DenseTensor& X,
const phi::DenseTensor& Y,
phi::DenseTensor* Out,
const bool trans_x,
const bool trans_y) {
if (!Out->initialized()) {
Out->mutable_data<T>(ctx.GetPlace());
}
MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::BatchMatmul(ctx,
trans_x,
trans_y,
x_desc.get(),
GetBasePtr(&X),
y_desc.get(),
GetBasePtr(&Y),
out_desc.get(),
GetBasePtr(Out));
}
template <typename T>
static void ReduceDims(const framework::ExecutionContext& ctx,
const std::vector<int64_t>& dims,
const std::vector<int64_t>& bcast_dims,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
std::vector<int64_t> axes;
int64_t size = bcast_dims.size();
int64_t diff = bcast_dims.size() - dims.size();
for (int64_t i = 0; i < size; ++i) {
if (i < diff) {
axes.push_back(i);
continue;
}
if (bcast_dims[i] > dims[i - diff]) {
axes.push_back(i);
}
}
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
std::vector<int> reduce_dims(axes.begin(), axes.end());
MLUCnnlReduceDesc reduce_desc(reduce_dims,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_desc.get(),
nullptr,
in_desc.get(),
GetBasePtr(&in),
0 /*indices_size*/,
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out));
}
template <typename T>
class MatMulV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Y = ctx.Input<phi::DenseTensor>("Y");
auto* Out = ctx.Output<phi::DenseTensor>("Out");
const bool trans_x = ctx.Attr<bool>("trans_x");
const bool trans_y = ctx.Attr<bool>("trans_y");
std::vector<int64_t> x_dims = phi::vectorize(X->dims());
std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
std::vector<int64_t> out_dims = phi::vectorize(Out->dims());
int x_ndim = x_dims.size();
int y_ndim = y_dims.size();
// Case 1: [K] x [K] = [1]
// Equal: [1, K] x [K, 1] = [1, 1] => [1]
const bool all_one_dim = (x_ndim == 1 && y_ndim == 1);
if (all_one_dim) {
Out->Resize({1, 1});
}
// Resize dim 1 to 2
phi::DenseTensor x_temp, y_temp;
x_temp.ShareDataWith(*X);
y_temp.ShareDataWith(*Y);
if (x_ndim == 1) {
x_dims.insert(x_dims.begin(), 1);
x_temp.Resize(phi::make_ddim(x_dims));
x_ndim = 2;
// matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
if (out_dims.size() < y_dims.size()) {
std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
temp_out_dims.insert(temp_out_dims.end() - 1, 1);
Out->Resize(phi::make_ddim(temp_out_dims));
}
}
if (y_ndim == 1) {
y_dims.push_back(1);
y_temp.Resize(phi::make_ddim(y_dims));
y_ndim = 2;
// matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim`
if (out_dims.size() < x_dims.size()) {
std::vector<int64_t> temp_out_dims(out_dims.begin(), out_dims.end());
temp_out_dims.push_back(1);
Out->Resize(phi::make_ddim(temp_out_dims));
}
}
const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1];
if (trans_y) {
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 1],
K,
platform::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 1,
K,
y_ndim - 1,
y_dims[y_ndim - 1]));
} else {
PADDLE_ENFORCE_EQ(
y_dims[y_ndim - 2],
K,
platform::errors::InvalidArgument("Input(Y) has error dim."
"Y'dims[%d] must be equal to %d"
"But received Y'dims[%d] is %d",
y_ndim - 2,
K,
y_ndim - 2,
y_dims[y_ndim - 2]));
}
if (x_ndim == 2 && y_ndim == 2) {
// Case 2: [M, K] x [K, N] = [M, N]
MatMul2D<T>(ctx, x_temp, y_temp, Out, trans_x, trans_y);
} else {
// Case 3: [B, M, K] x [K, N] = [B, M, N]
// Case 4: [B, M, K] x [B, K, N] = [B, M, N]
MatMulND<T>(ctx, x_temp, y_temp, Out, trans_x, trans_y);
}
if (phi::vectorize(Out->dims()) != out_dims) {
Out->Resize(phi::make_ddim(out_dims));
}
}
};
template <typename T>
class MatMulGradV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* X = ctx.Input<phi::DenseTensor>("X");
auto* Y = ctx.Input<phi::DenseTensor>("Y");
auto* dOut = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dY = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
const bool trans_x = ctx.Attr<bool>("trans_x");
const bool trans_y = ctx.Attr<bool>("trans_y");
std::vector<int64_t> x_dims = phi::vectorize(X->dims());
std::vector<int64_t> y_dims = phi::vectorize(Y->dims());
std::vector<int64_t> out_dims = phi::vectorize(dOut->dims());
int x_ndim = x_dims.size();
int y_ndim = y_dims.size();
int out_ndim = out_dims.size();
// Case 1: [K] x [K] = [1]
if (x_ndim == 1 && y_ndim == 1) {
if (dX) {
Mul<T>(ctx, *dOut, *Y, dX);
}
if (dY) {
Mul<T>(ctx, *dOut, *X, dY);
}
return;
}
// Resize dim 1 to 2
phi::DenseTensor x_temp, y_temp, dout_temp;
x_temp.ShareDataWith(*X);
y_temp.ShareDataWith(*Y);
dout_temp.ShareDataWith(*dOut);
if (x_ndim == 1) {
x_dims.insert(x_dims.begin(), 1);
out_dims.insert(out_dims.end() - 1, 1);
x_temp.Resize(phi::make_ddim(x_dims));
dout_temp.Resize(phi::make_ddim(out_dims));
x_ndim = 2;
out_ndim += 1;
}
if (y_ndim == 1) {
y_dims.push_back(1);
out_dims.push_back(1);
y_temp.Resize(phi::make_ddim(y_dims));
dout_temp.Resize(phi::make_ddim(out_dims));
y_ndim = 2;
out_ndim += 1;
}
// Case 2: [M, K] x [K, N] = [M, N]
if (out_ndim == 2) {
if (dX) {
dX->Resize(phi::make_ddim(x_dims));
if (trans_x) {
MatMul2D<T>(ctx, y_temp, dout_temp, dX, trans_y, true);
} else {
MatMul2D<T>(ctx, dout_temp, y_temp, dX, false, !trans_y);
}
dX->Resize(X->dims());
}
if (dY) {
dY->Resize(phi::make_ddim(y_dims));
if (trans_y) {
MatMul2D<T>(ctx, dout_temp, x_temp, dY, true, trans_x);
} else {
MatMul2D<T>(ctx, x_temp, dout_temp, dY, !trans_x, false);
}
dY->Resize(Y->dims());
}
return;
}
// Case 3: [B, M, K] x [K, N] = [B, M, N]
// Case 4: [B, M, K] x [B, K, N] = [B, M, N]
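// Broadcast the batch dims of x and y to dout's batch dims, run batched
// matmul, and reduce-sum the broadcast dims back into the grads.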
std::vector<int64_t> x_bcast_dims(out_ndim, 1);
std::vector<int64_t> y_bcast_dims(out_ndim, 1);
std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin());
std::copy(out_dims.begin(), out_dims.end() - 2, y_bcast_dims.begin());
std::copy(x_dims.end() - 2, x_dims.end(), x_bcast_dims.end() - 2);
std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2);
if (dX) {
phi::DenseTensor dx_temp(X->type());
if (x_dims != x_bcast_dims) {
dx_temp.Resize(phi::make_ddim(x_bcast_dims));
} else {
dX->mutable_data<T>(ctx.GetPlace());
dx_temp.ShareDataWith(*dX);
}
if (trans_x) {
MatMulND<T>(ctx, y_temp, dout_temp, &dx_temp, trans_y, true);
} else {
MatMulND<T>(ctx, dout_temp, y_temp, &dx_temp, false, !trans_y);
}
if (x_dims != x_bcast_dims) {
ReduceDims<T>(ctx, x_dims, x_bcast_dims, dx_temp, dX);
}
}
if (dY) {
// Case 3: [B, M, K] x [K, N] = [B, M, N]. Take the reduce-batch path for
// better performance; otherwise the dy_temp tensor in the else branch
// might overflow its numel due to the cnnlTensorDescriptor limitation.
if (x_dims.size() == 3 && phi::vectorize(Y->dims()).size() == 2) {
if (trans_y) {
MatMul2DwithReduceBatch<T>(ctx, dout_temp, x_temp, dY, true, trans_x);
} else {
MatMul2DwithReduceBatch<T>(
ctx, x_temp, dout_temp, dY, !trans_x, false);
}
} else {
phi::DenseTensor dy_temp(Y->type());
if (y_dims != y_bcast_dims) {
dy_temp.Resize(phi::make_ddim(y_bcast_dims));
} else {
dY->mutable_data<T>(ctx.GetPlace());
dy_temp.ShareDataWith(*dY);
}
if (trans_y) {
MatMulND<T>(ctx, dout_temp, x_temp, &dy_temp, true, trans_x);
} else {
MatMulND<T>(ctx, x_temp, dout_temp, &dy_temp, !trans_x, false);
}
if (y_dims != y_bcast_dims) {
ReduceDims<T>(ctx, y_dims, y_bcast_dims, dy_temp, dY);
}
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(matmul_v2,
ops::MatMulV2MLUKernel<float>,
ops::MatMulV2MLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(matmul_v2_grad,
ops::MatMulGradV2MLUKernel<float>,
ops::MatMulGradV2MLUKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
class MeanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<phi::DenseTensor>("X");
auto* output = context.Output<phi::DenseTensor>("Out");
const T* in_data = input->data<T>();
T* out_data = output->mutable_data<T>(context.GetPlace());
auto numel = input->numel();
auto rank = input->dims().size();
auto place = context.GetPlace();
auto stream = context.template device_context<MLUDeviceContext>().stream();
if (rank == 0) { // scalar
memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream);
return;
}
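// Reduce over all dims with CNNL_REDUCE_AVG to get the mean of the whole
// tensor.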
std::vector<int> reduce_dims;
reduce_dims.reserve(rank);
for (decltype(rank) i = 0; i < rank; ++i) {
reduce_dims.push_back(i);
}
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input->dtype()));
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
MLUCnnlReduceDesc reduction_desc(reduce_dims,
CNNL_REDUCE_AVG,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(context,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
reinterpret_cast<const void*>(in_data),
0 /*indices_size*/,
nullptr,
nullptr,
output_desc.get(),
reinterpret_cast<void*>(out_data));
}
};
template <typename T>
class MeanMLUGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto output_grad =
context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(
output_grad->numel(),
1,
platform::errors::InvalidArgument(
"Mean Gradient Input phi::DenseTensor len should be 1. But "
"received Out@Grad's elements num is %d.",
output_grad->numel()));
auto input_grad =
context.Output<phi::DenseTensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(context.GetPlace());
auto in_data = output_grad->data<T>();
auto numel = input_grad->numel();
auto rank = input_grad->dims().size();
auto out_data = input_grad->data<T>();
auto place = context.GetPlace();
auto stream = context.template device_context<MLUDeviceContext>().stream();
if (rank == 0) { // scalar
memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream);
return;
}
// Build a tensor filled with 1 / numel(dX).
phi::DenseTensor mean_var(output_grad->dtype());
mean_var.mutable_data<T>(input_grad->dims(), context.GetPlace());
MLUCnnlTensorDesc mean_var_desc(
mean_var, CNNL_LAYOUT_ARRAY, ToCnnlDataType(mean_var.dtype()));
auto value = static_cast<T>(1.0 / static_cast<float>(input_grad->numel()));
MLUCnnl::Fill(context,
CNNL_POINTER_MODE_HOST,
&value,
mean_var_desc.get(),
GetBasePtr(&mean_var));
// dX = (1 / numel) * dOut, broadcast to dX's shape.
MLUCnnlTensorDesc in_desc(
*output_grad, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output_grad->dtype()));
MLUCnnlTensorDesc out_desc(
*input_grad, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input_grad->dtype()));
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(context,
op_tensor_desc.get(),
in_desc.get(),
reinterpret_cast<const void*>(in_data),
mean_var_desc.get(),
GetBasePtr(&mean_var),
out_desc.get(),
reinterpret_cast<void*>(out_data),
ToCnnlDataType<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(mean,
ops::MeanMLUKernel<float>,
ops::MeanMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(mean_grad,
ops::MeanMLUGradKernel<float>,
ops::MeanMLUGradKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class MeshgridMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
PADDLE_ENFORCE_EQ((ins.size() > 1) && (ins.size() < 7),
true,
platform::errors::InvalidArgument(
"Excepted phi::DenseTensor numbers between 2 and 6, "
"but only received d% .",
ins.size()));
int64_t size = ins.size();
std::vector<int64_t> shape(size);
for (int64_t i = 0; i < size; i++) {
switch (ins[i]->dims().size()) {
case 0:
shape[i] = 1;
break;
case 1:
shape[i] = ins[i]->dims()[0];
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected scalar or 1D tensor in the tensor list but got tensor "
"%d: ",
i));
}
}
MLUCnnlTensorDesc out_desc(size, shape.data(), ToCnnlDataType<T>());
framework::DDim out_dims = phi::make_ddim(shape);
for (int64_t i = 0; i < size; i++) {
std::vector<int64_t> view_shape(size, 1);
view_shape[i] = shape[i];
outs[i]->Resize(out_dims);
outs[i]->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc in_desc(size, view_shape.data(), ToCnnlDataType<T>());
MLUCnnl::BroadcastTo(ctx,
in_desc.get(),
GetBasePtr(ins[i]),
out_desc.get(),
GetBasePtr(outs[i]));
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_MLU_KERNEL(
meshgrid,
paddle::operators::MeshgridMLUKernel<int>,
paddle::operators::MeshgridMLUKernel<float>,
paddle::operators::MeshgridMLUKernel<int64_t>,
paddle::operators::MeshgridMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class OneHotV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto* in = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
int depth = ctx.Attr<int>("depth");
if (ctx.HasInput("depth_tensor")) {
std::vector<int32_t> depth_data;
depth_data = phi::GetVectorFromTensor<int>(
ctx.Input<phi::DenseTensor>("depth_tensor"));
depth = depth_data[0];
auto out_dims = out->dims();
out_dims[out_dims.size() - 1] = depth;
out->Resize(out_dims);
}
out->mutable_data<float>(ctx.GetPlace());
float on_value = 1.0f, off_value = 0.0f;
const int in_off_dim[1] = {1};
phi::DenseTensor on_value_tensor =
ctx.AllocateTmpTensor<float, MLUDeviceContext>(
framework::DDim(in_off_dim, 1), dev_ctx);
phi::DenseTensor off_value_tensor =
ctx.AllocateTmpTensor<float, MLUDeviceContext>(
framework::DDim(in_off_dim, 1), dev_ctx);
FillMLUTensorWithHostValue(ctx, on_value, &on_value_tensor);
FillMLUTensorWithHostValue(ctx, off_value, &off_value_tensor);
if (framework::TransToProtoVarType(in->dtype()) ==
framework::proto::VarType::INT32) {
MLUCnnlTensorDesc desc_indices(*in);
MLUCnnl::OneHot(ctx,
desc_indices.get(),
GetBasePtr(in),
depth,
GetBasePtr(&on_value_tensor),
GetBasePtr(&off_value_tensor),
-1,
ToCnnlDataType(out->dtype()),
GetBasePtr(out));
} else {
phi::DenseTensor transformed_in;
transformed_in.mutable_data<int32_t>(in->dims(), dev_ctx.GetPlace());
// use cnnlCast to cast int64_t to int32_t then do one_hot
MLUCnnlTensorDesc in_desc(*in);
MLUCnnlTensorDesc transformed_in_desc(transformed_in);
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(in->dtype()),
framework::TransToProtoVarType(transformed_in.dtype()));
MLUCnnl::Cast(ctx,
cast_type,
in_desc.get(),
GetBasePtr(in),
transformed_in_desc.get(),
GetBasePtr(&transformed_in));
MLUCnnl::OneHot(ctx,
transformed_in_desc.get(),
GetBasePtr(&transformed_in),
depth,
GetBasePtr(&on_value_tensor),
GetBasePtr(&off_value_tensor),
-1,
ToCnnlDataType(out->dtype()),
GetBasePtr(out));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(one_hot_v2,
ops::OneHotV2MLUKernel<int32_t>,
ops::OneHotV2MLUKernel<int64_t>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/funcs/pooling.h"
namespace paddle {
namespace operators {
namespace {
cnnlPoolingMode_t ToCnnlPoolingMode(const std::string &pooling_type,
bool exclusive,
bool adaptive) {
cnnlPoolingMode_t pooling_mode;
if (pooling_type == "max") {
pooling_mode = CNNL_POOLING_MAX;
} else if (pooling_type == "avg") {
if (exclusive && !adaptive) {
pooling_mode = CNNL_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
} else {
pooling_mode = CNNL_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument("Unknown pooling_type: %s",
pooling_type));
}
return pooling_mode;
}
} // namespace
template <typename T>
class MLUPoolOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto &dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
const phi::DenseTensor *in_x = ctx.Input<phi::DenseTensor>("X");
phi::DenseTensor *out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::string data_format = ctx.Attr<std::string>("data_format");
bool global_pooling = ctx.Attr<bool>("global_pooling");
bool ceil_mode = ctx.Attr<bool>("ceil_mode");
bool exclusive = ctx.Attr<bool>("exclusive");
bool adaptive = ctx.Attr<bool>("adaptive");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
PADDLE_ENFORCE_EQ(in_x->dims().size(),
4,
platform::errors::InvalidArgument(
"Only support 4-dims for mlu pool2d kernel."));
const bool channel_last = data_format == "NHWC";
// Defaults assume the NCHW layout.
cnnlTensorLayout_t cnnl_layout = CNNL_LAYOUT_NCHW;
auto out_dims = out->dims();
int64_t out_h = out_dims[2];
int64_t out_w = out_dims[3];
auto in_x_dims = in_x->dims();
framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());
if (channel_last) {
cnnl_layout = CNNL_LAYOUT_NHWC;
out_h = out_dims[1];
out_w = out_dims[2];
data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
}
phi::funcs::UpdatePadding(&paddings,
global_pooling,
adaptive,
padding_algorithm,
data_dims,
strides,
ksize);
if (global_pooling) {
phi::funcs::UpdateKernelSize(&ksize, data_dims);
}
MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out, cnnl_layout, ToCnnlDataType<T>());
cnnlPoolingMode_t pool_mode =
ToCnnlPoolingMode(pooling_type, exclusive, adaptive);
// Transpose NCHW inputs to NHWC, since cnnl pool2d performs worse in the
// NCHW layout.
phi::DenseTensor trans_in_x;
phi::DenseTensor trans_out;
if (channel_last) {
trans_in_x = *in_x;
trans_out = *out;
} else {
std::vector<int> perm{0, 2, 3, 1};
TransposeFromMLUTensor<T>(
ctx, perm, in_x, &trans_in_x, true /*need_reshape_or_alloc*/);
trans_out = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
{out_dims[0], out_dims[2], out_dims[3], out_dims[1]}, dev_ctx);
}
MLUCnnlTensorDesc trans_in_x_desc(
trans_in_x, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_desc(
trans_out, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
if (!adaptive) {
MLUCnnlPoolingDesc pool_desc(pool_mode,
CNNL_NOT_PROPAGATE_NAN,
ksize[0],
ksize[1],
paddings[0],
paddings[1],
paddings[2],
paddings[3],
strides[0],
strides[1],
1 /*row_dilation*/,
1 /*col_dilation*/,
ceil_mode);
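// Some cnnl pooling setups need an extra input initialized on the host;
// query its size, initialize it on the CPU, then copy it to the device
// for PoolingForward.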
size_t extra_input_size = 0;
cnnlHandle_t handle =
ctx.template device_context<MLUDeviceContext>().cnnl_handle();
cnnlGetPoolingExtraInputSize(
handle, pool_mode, out_w, out_h, &extra_input_size);
if (extra_input_size > 0) {
phi::DenseTensor extra_host_tensor;
extra_host_tensor.mutable_data<int8_t>(
{static_cast<int64_t>(extra_input_size)}, platform::CPUPlace());
cnnlInitPoolingExtraInput(handle,
pool_desc.get(),
trans_in_x_desc.get(),
trans_out_desc.get(),
GetBasePtr(&extra_host_tensor));
phi::DenseTensor extra_device_tensor =
ctx.AllocateTmpTensor<int8_t, MLUDeviceContext>(
{static_cast<int64_t>(extra_input_size)}, dev_ctx);
framework::TensorCopy(
extra_host_tensor, ctx.GetPlace(), &extra_device_tensor);
// Capture extra_host_tensor by value so its holder_ stays alive until
// the asynchronous copy to the device completes.
auto increase_ref_count = [extra_host_tensor]() {
VLOG(4) << "Finished copying extra_host_tensor["
<< GetBasePtr(&extra_host_tensor)
<< "] in mlu pooling kernel.";
};
dev_ctx.AddStreamCallback(increase_ref_count);
MLUCnnl::PoolingForward(
ctx,
pool_mode,
out_h,
out_w,
pool_desc.get(),
nullptr /*alpha*/,
trans_in_x_desc.get(),
GetBasePtr(&trans_in_x),
nullptr /*beta*/,
GetBasePtr(&extra_device_tensor) /*params_shape_ptr*/,
trans_out_desc.get(),
GetBasePtr(&trans_out));
} else {
MLUCnnl::PoolingForward(ctx,
pool_mode,
out_h,
out_w,
pool_desc.get(),
nullptr /*alpha*/,
trans_in_x_desc.get(),
GetBasePtr(&trans_in_x),
nullptr /*beta*/,
nullptr /*params_shape_ptr*/,
trans_out_desc.get(),
GetBasePtr(&trans_out));
}
} else {
MLUCnnl::AdaptivePoolingForward(ctx,
pool_mode,
trans_in_x_desc.get(),
GetBasePtr(&trans_in_x),
trans_out_desc.get(),
GetBasePtr(&trans_out),
nullptr,
nullptr);
}
if (!channel_last) {
std::vector<int> perm{0, 3, 1, 2};
TransposeFromMLUTensor<T>(
ctx, perm, &trans_out, out, false /*need_reshape_or_alloc*/);
}
}
};
template <typename T, typename IDX_T>
class MLUPoolGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto &dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
const phi::DenseTensor *in_x = ctx.Input<phi::DenseTensor>("X");
const phi::DenseTensor *out = ctx.Input<phi::DenseTensor>("Out");
const phi::DenseTensor *out_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
phi::DenseTensor *in_x_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
in_x_grad->mutable_data<T>(ctx.GetPlace());
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
bool ceil_mode = ctx.Attr<bool>("ceil_mode");
bool exclusive = ctx.Attr<bool>("exclusive");
bool adaptive = ctx.Attr<bool>("adaptive");
std::string data_format = ctx.Attr<std::string>("data_format");
bool global_pooling = ctx.Attr<bool>("global_pooling");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
const bool channel_last = data_format == "NHWC";
auto in_x_dims = in_x->dims();
framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size());
if (channel_last) {
data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1);
}
phi::funcs::UpdatePadding(&paddings,
global_pooling,
adaptive,
padding_algorithm,
data_dims,
strides,
ksize);
if (global_pooling) {
phi::funcs::UpdateKernelSize(&ksize, data_dims);
}
// Inputs need to be in NHWC layout.
phi::DenseTensor trans_in_x;
phi::DenseTensor trans_out;
phi::DenseTensor trans_out_grad;
phi::DenseTensor trans_in_x_grad;
if (channel_last) {
trans_in_x = *in_x;
trans_out = *out;
trans_out_grad = *out_grad;
trans_in_x_grad = *in_x_grad;
} else {
std::vector<int> perm{0, 2, 3, 1};
TransposeFromMLUTensor<T>(
ctx, perm, in_x, &trans_in_x, true /*need_reshape_or_alloc*/);
TransposeFromMLUTensor<T>(
ctx, perm, out, &trans_out, true /*need_reshape_or_alloc*/);
TransposeFromMLUTensor<T>(
ctx, perm, out_grad, &trans_out_grad, true /*need_reshape_or_alloc*/);
auto in_x_grad_dims = in_x_grad->dims();
trans_in_x_grad =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({in_x_grad_dims[0],
in_x_grad_dims[2],
in_x_grad_dims[3],
in_x_grad_dims[1]},
dev_ctx);
}
MLUCnnlTensorDesc trans_in_x_desc(
trans_in_x, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_desc(
trans_out, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_grad_desc(
trans_out_grad, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_in_x_grad_desc(
trans_in_x_grad, CNNL_LAYOUT_NHWC, ToCnnlDataType<T>());
cnnlPoolingMode_t pool_mode =
ToCnnlPoolingMode(pooling_type, exclusive, adaptive);
MLUCnnlPoolingDesc pool_desc(pool_mode,
CNNL_NOT_PROPAGATE_NAN,
ksize[0],
ksize[1],
paddings[0],
paddings[1],
paddings[2],
paddings[3],
strides[0],
strides[1],
1 /*row_dilation*/,
1 /*col_dilation*/,
ceil_mode);
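// Max pooling backward needs the argmax indices, so compute them first
// via PoolingIndex.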
if (pooling_type == "max") {
phi::DenseTensor index_tensor =
ctx.AllocateTmpTensor<IDX_T, MLUDeviceContext>(trans_out_grad.dims(),
dev_ctx);
MLUCnnlTensorDesc index_tensor_desc(
index_tensor, CNNL_LAYOUT_NHWC, ToCnnlDataType<IDX_T>());
MLUCnnl::PoolingIndex(ctx,
pool_desc.get(),
trans_in_x_desc.get(),
GetBasePtr(&trans_in_x),
index_tensor_desc.get(),
GetBasePtr(&index_tensor));
if (adaptive) {
MLUCnnl::AdaptivePoolingBackward(ctx,
pool_mode,
trans_out_grad_desc.get(),
GetBasePtr(&trans_out_grad),
index_tensor_desc.get(),
GetBasePtr(&index_tensor),
trans_in_x_grad_desc.get(),
GetBasePtr(&trans_in_x_grad));
} else {
MLUCnnl::PoolingBackward(ctx,
pool_desc.get(),
nullptr /*alpha*/,
index_tensor_desc.get(),
GetBasePtr(&index_tensor),
trans_out_grad_desc.get(),
GetBasePtr(&trans_out_grad),
trans_in_x_desc.get(),
GetBasePtr(&trans_in_x),
nullptr /*beta*/,
trans_in_x_grad_desc.get(),
GetBasePtr(&trans_in_x_grad));
}
} else {
if (adaptive) {
MLUCnnl::AdaptivePoolingBackward(ctx,
pool_mode,
trans_out_grad_desc.get(),
GetBasePtr(&trans_out_grad),
nullptr /*index_tensor_desc.get()*/,
nullptr /*GetBasePtr(&index_tensor)*/,
trans_in_x_grad_desc.get(),
GetBasePtr(&trans_in_x_grad));
} else {
MLUCnnl::PoolingBackward(ctx,
pool_desc.get(),
nullptr /*alpha*/,
nullptr,
nullptr,
trans_out_grad_desc.get(),
GetBasePtr(&trans_out_grad),
nullptr,
nullptr,
nullptr /*beta*/,
trans_in_x_grad_desc.get(),
GetBasePtr(&trans_in_x_grad));
}
}
if (!channel_last) {
std::vector<int> perm{0, 3, 1, 2};
TransposeFromMLUTensor<T>(ctx,
perm,
&trans_in_x_grad,
in_x_grad,
false /*need_reshape_or_alloc*/);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(pool2d,
ops::MLUPoolOpKernel<float>,
ops::MLUPoolOpKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(pool2d_grad,
ops::MLUPoolGradOpKernel<float, int>,
ops::MLUPoolGradOpKernel<plat::float16, int16_t>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/randperm_op.h"
namespace paddle {
namespace operators {
template <typename T>
class RandpermMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
int n = ctx.Attr<int>("n");
unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
framework::Variable* out_var = ctx.OutputVar("Out");
phi::DenseTensor* out_tensor =
framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
phi::DenseTensor tmp_tensor;
tmp_tensor.Resize(phi::make_ddim({n}));
T* tmp_data = tmp_tensor.mutable_data<T>(platform::CPUPlace());
random_permate<T>(tmp_data, n, seed);
framework::TensorCopySync(tmp_tensor, ctx.GetPlace(), out_tensor);
}
};
} // namespace operators
} // namespace paddle
template <typename T>
using kernel = paddle::operators::RandpermMLUKernel<T>;
REGISTER_OP_MLU_KERNEL(
randperm, kernel<int64_t>, kernel<int>, kernel<float>, kernel<double>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/range_op.h"
namespace paddle {
namespace operators {
template <typename T>
class RangeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* start_t = context.Input<phi::DenseTensor>("Start");
auto* end_t = context.Input<phi::DenseTensor>("End");
auto* step_t = context.Input<phi::DenseTensor>("Step");
auto* out = context.Output<phi::DenseTensor>("Out");
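// Start/End/Step live on the device; copy each to the CPU and wait
// before reading their scalar values.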
phi::DenseTensor n;
framework::TensorCopy(
*start_t,
platform::CPUPlace(),
context.template device_context<platform::MLUDeviceContext>(),
&n);
context.template device_context<paddle::platform::MLUDeviceContext>()
.Wait();
T start = n.data<T>()[0];
framework::TensorCopy(
*end_t,
platform::CPUPlace(),
context.template device_context<platform::MLUDeviceContext>(),
&n);
context.template device_context<paddle::platform::MLUDeviceContext>()
.Wait();
T end = n.data<T>()[0];
framework::TensorCopy(
*step_t,
platform::CPUPlace(),
context.template device_context<platform::MLUDeviceContext>(),
&n);
context.template device_context<paddle::platform::MLUDeviceContext>()
.Wait();
T step = n.data<T>()[0];
int64_t size = 0;
GetSize(start, end, step, &size);
out->Resize(phi::make_ddim({size}));
out->mutable_data<T>(context.GetPlace());
std::vector<T> odata;
T value = start;
for (int64_t i = 0; i < size; ++i) {
odata.push_back(value);
value += step;
}
framework::TensorFromVector(odata, context.device_context(), out);
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_MLU_KERNEL(range,
paddle::operators::RangeMLUKernel<int>,
paddle::operators::RangeMLUKernel<int64_t>,
paddle::operators::RangeMLUKernel<float>,
paddle::operators::RangeMLUKernel<double>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class Reshape2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
std::vector<int32_t> target_shape_vector;
auto shape_tensor_vector = ctx.MultiInput<phi::DenseTensor>("ShapeTensor");
if (shape_tensor_vector.size() > 0) {
for (auto* shape_tensor : shape_tensor_vector) {
PADDLE_ENFORCE_EQ(
shape_tensor->dims().size(),
1,
platform::errors::InvalidArgument(
"If the element type of 'shape' in Reshape Op is Tensor, "
"the element's shape must be [1]. But received the element's "
"shape is [%d]",
shape_tensor->dims().size()));
target_shape_vector.push_back(
phi::GetVectorFromTensor<int>(shape_tensor)[0]);
}
} else {
auto* shape_tensor = ctx.HasInput("Shape")
? ctx.Input<phi::DenseTensor>("Shape")
: nullptr;
if (shape_tensor) {
target_shape_vector = phi::GetVectorFromTensor<int>(shape_tensor);
} else {
target_shape_vector = ctx.Attr<std::vector<int>>("shape");
PADDLE_ENFORCE_GT(
target_shape_vector.size(),
0,
platform::errors::InvalidArgument(
"The length of shape attribute should be larger than 0 when "
"input ShapeTensor and Shape are empty!"));
}
}
int num_negative =
std::count(target_shape_vector.begin(), target_shape_vector.end(), -1);
PADDLE_ENFORCE_LE(
num_negative,
1,
platform::errors::InvalidArgument(
"The max number of -1 in shape attribute or shape tensor is 1 "
"but received %d.",
num_negative));
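// A 0 in the target shape keeps the corresponding dim of the input.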
auto it_zero =
std::find(target_shape_vector.begin(), target_shape_vector.end(), 0);
if (it_zero != target_shape_vector.end()) {
int x_rank = x->dims().size();
for (size_t i = 0; i < target_shape_vector.size(); i++) {
if (target_shape_vector[i] == 0) {
PADDLE_ENFORCE_LT(
i,
x_rank,
platform::errors::InvalidArgument(
"The index of 0 in shape attribute or shape tensor",
"should be less than input dim size, ",
"but the index is %d and input dim size is %d",
i,
x_rank));
target_shape_vector[i] = x->dims().at(i);
}
}
}
auto it =
std::find(target_shape_vector.begin(), target_shape_vector.end(), -1);
if (it != target_shape_vector.end()) {
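      // Seeding std::accumulate with -1 folds the single -1 placeholder into
      // the product, so dividing the total element count by it below yields
      // the inferred dimension directly.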
auto ddim_out_vec = phi::vectorize(x->dims());
int ddim_out_product = std::accumulate(
ddim_out_vec.begin(), ddim_out_vec.end(), 1, std::multiplies<int>());
int reshape_out_product = std::accumulate(target_shape_vector.begin(),
target_shape_vector.end(),
-1,
std::multiplies<int>());
int index = std::distance(target_shape_vector.begin(), it);
target_shape_vector[index] = ddim_out_product / reshape_out_product;
}
auto out_dims = phi::make_ddim(target_shape_vector);
out->mutable_data<T>(out_dims, ctx.GetPlace());
    // reshape does not change data: copy x into out on the MLU, then resize to
    // the new shape.
framework::TensorCopy(
*x,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
out);
out->Resize(out_dims);
}
};
template <typename DeviceContext, typename T>
class Reshape2GradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* d_out = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto in_dims = d_x->dims();
d_x->mutable_data(ctx.GetPlace(), d_out->type());
framework::TensorCopy(
*d_out,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
d_x);
d_x->Resize(in_dims);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
reshape2,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, int64_t>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, bool>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::Reshape2MLUKernel<paddle::platform::MLUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
reshape2_grad,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, int>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, int64_t>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, bool>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, double>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext, uint8_t>,
ops::Reshape2GradMLUKernel<paddle::platform::MLUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
using DDim = framework::DDim;
using TensorList = std::vector<phi::DenseTensor>;
template <typename TensorType, typename T>
void reset_parameter_vector(
const std::vector<TensorType>& raw_params_vec,
const int& num_layers,
const bool& is_bidirec,
std::vector<std::vector<std::pair<T*, size_t>>>* params_vec) {
  // the parameter raw sequence is [FWhi, FWhh, BWhi, BWhh] * num_layers
// + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to
// ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers
const int& direction_num = is_bidirec ? 2 : 1;
const int& layer_weight_size = 4 * direction_num;
const int& all_weight_size = num_layers * layer_weight_size;
const int& bias_start_idx = all_weight_size / 2;
for (int i = 0; i < num_layers; i++) {
params_vec->at(i).resize(layer_weight_size);
for (int j = 0; j < layer_weight_size; j++) {
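      // j enumerates [Wi, Wh, Bi, Bh] per direction: k % 2 selects the
      // input/hidden slot, section (= j / 4) selects the direction, and
      // k >= 2 jumps to the bias half of the raw list (bias_start_idx).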
int k = j % 4;
const int& section = j / 4;
int tensor_idx = i * 2 * direction_num + section * 2 + k % 2;
if (k >= 2) {
tensor_idx += bias_start_idx;
}
using remove_cv_t = typename std::remove_cv<T>::type;
params_vec->at(i)[j] = std::make_pair(
const_cast<T*>(
raw_params_vec[tensor_idx]->template data<remove_cv_t>()),
raw_params_vec[tensor_idx]->numel() * sizeof(T));
}
}
}
template <typename DeviceContext, typename T>
class RNNMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// Input
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto pre_state = ctx.MultiInput<phi::DenseTensor>("PreState");
auto weight_list = ctx.MultiInput<phi::DenseTensor>("WeightList");
bool has_seq_length = ctx.HasInput("SequenceLength");
// Output
auto state = ctx.MultiOutput<phi::DenseTensor>("State");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto* reserve_data = ctx.Output<phi::DenseTensor>("Reserve");
// Attributes
const int& num_layers = ctx.Attr<int>("num_layers");
const bool& is_bidirec = ctx.Attr<bool>("is_bidirec");
const int& hidden_size = ctx.Attr<int>("hidden_size");
const std::string& mode = ctx.Attr<std::string>("mode");
const phi::DenseTensor* sequence_length = nullptr;
if (has_seq_length) {
sequence_length = ctx.Input<phi::DenseTensor>("SequenceLength");
}
auto init_h = pre_state[0]; // -> hx
auto init_c = pre_state[1]; // -> cx
auto last_h = state[0];
auto last_c = state[1];
// check shape
const int in_out_dim_num = input->dims().size();
const int& seq_len = input->dims()[0]; // time_step
const int& batch_size = input->dims()[1];
const int& input_dim = input->dims()[2];
const int& direction_num = is_bidirec ? 2 : 1;
int in_dim_arr[in_out_dim_num] = {seq_len, batch_size, input_dim};
int out_dim_arr[in_out_dim_num] = {
seq_len, batch_size, direction_num * hidden_size};
int proj_size = hidden_size;
std::vector<int> seq_len_vec(batch_size, seq_len);
    if (has_seq_length) {
      // SequenceLength provides the valid length of each sample; without it,
      // every sample is assumed to use the full seq_len (no padding).
seq_len_vec = phi::GetVectorFromTensor(sequence_length);
}
cnnlDirectionMode_t direction =
is_bidirec ? CNNL_RNN_BIDIRECTIONAL : CNNL_RNN_UNIDIRECTIONAL;
PADDLE_ENFORCE_EQ(
mode,
"LSTM",
platform::errors::InvalidArgument(
"MLU only support LSTM mode now, current mode is %s", mode));
PADDLE_ENFORCE_EQ(
num_layers,
1,
platform::errors::InvalidArgument(
"MLU only support 1 num_layers, current num_layers is %s",
num_layers));
PADDLE_ENFORCE_EQ(
init_h->dims()[0],
num_layers * direction_num,
platform::errors::InvalidArgument("The num_layers of in RNN layer must"
" be the same as first dim of init "
"hidden, but received num_layers:%d,"
" dim:%d",
num_layers,
init_h->dims()[0]));
PADDLE_ENFORCE_EQ(
init_c->dims()[0],
num_layers * direction_num,
platform::errors::InvalidArgument(
"The num_layers of in RNN layer must"
" be the same as first dim of cell state hidden, but received"
" num_layers:%d, dim:%d",
num_layers,
init_c->dims()[0]));
// weightlist
std::vector<std::vector<std::pair<T*, size_t>>> parameter_lists;
parameter_lists.resize(num_layers);
reset_parameter_vector(
weight_list, num_layers, is_bidirec, &parameter_lists);
// init the output and allocate the memory
output->mutable_data<T>(ctx.GetPlace()); // -> y in cnnl
last_h->mutable_data<T>(ctx.GetPlace()); // -> hy in cnnl
last_c->mutable_data<T>(ctx.GetPlace()); // -> cy in cnnl
MLUSeqDataDesc input_seq_data_desc(CNNL_SEQDATA_TNC,
ToCnnlDataType(input->dtype()),
in_out_dim_num,
in_dim_arr,
static_cast<int>(seq_len_vec.size()),
seq_len_vec.data(),
nullptr);
MLUSeqDataDesc out_seq_data_desc(CNNL_SEQDATA_TNC,
ToCnnlDataType(input->dtype()),
in_out_dim_num,
out_dim_arr,
static_cast<int>(seq_len_vec.size()),
seq_len_vec.data(),
nullptr);
MLUCnnlTensorDesc hx_desc(*init_h);
MLUCnnlTensorDesc cx_desc(*init_c);
MLURNNDesc rnn_desc(CNNL_LSTM,
CNNL_RNN_DOUBLE_BIAS,
direction,
CNNL_RNN_LINEAR_INPUT,
ToCnnlDataType(input->dtype()),
ToCnnlDataType(input->dtype()),
input_dim,
hidden_size,
/*projection*/ proj_size,
num_layers,
nullptr,
CNNL_RNN_PADDED_IO_DISABLED);
rnn_desc.SetRNNMaskMode(CNNL_LSTM_MASK_ENABLED);
// copy weight params
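    // Pack the per-gate parameters into one contiguous weight space in the
    // order [W_x | W_h | B_x | B_h], appending the same four blocks for the
    // backward direction when is_bidirec is true; cnnlGetRNNWeightSpaceSize
    // reports the total size this buffer must have.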
size_t weightspace_size;
phi::DenseTensor weightspace;
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNWeightSpaceSize(
GetHandleFromCTX(ctx), rnn_desc.get(), &weightspace_size));
weightspace = ctx.AllocateTmpTensor<T, DeviceContext>(
{static_cast<int64_t>(weightspace_size)}, dev_ctx);
void* weightspace_ptr = weightspace.mutable_data(ctx.GetPlace());
auto w_x = parameter_lists[0][0];
auto w_h = parameter_lists[0][1];
auto b_x = parameter_lists[0][2];
auto b_h = parameter_lists[0][3];
auto actual_total_w_size =
w_x.second + w_h.second + b_x.second + b_h.second;
void* w_x_ptr = weightspace_ptr;
void* w_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second;
void* b_x_ptr =
static_cast<char*>(weightspace_ptr) + w_x.second + w_h.second;
void* b_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second +
w_h.second + b_x.second;
memory::Copy(weightspace.place(),
w_x_ptr,
weightspace.place(),
w_x.first,
w_x.second,
nullptr);
memory::Copy(weightspace.place(),
w_h_ptr,
weightspace.place(),
w_h.first,
w_h.second,
nullptr);
memory::Copy(weightspace.place(),
b_x_ptr,
weightspace.place(),
b_x.first,
b_x.second,
nullptr);
memory::Copy(weightspace.place(),
b_h_ptr,
weightspace.place(),
b_h.first,
b_h.second,
nullptr);
if (is_bidirec) {
auto bw_x = parameter_lists[0][4];
auto bw_h = parameter_lists[0][5];
auto bb_x = parameter_lists[0][6];
auto bb_h = parameter_lists[0][7];
void* bw_x_ptr =
static_cast<char*>(weightspace_ptr) + actual_total_w_size;
void* bw_h_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second;
void* bb_x_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second + bw_h.second;
void* bb_h_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second + bw_h.second +
bb_x.second;
actual_total_w_size +=
bw_x.second + bw_h.second + bb_x.second + bb_h.second;
memory::Copy(weightspace.place(),
bw_x_ptr,
weightspace.place(),
bw_x.first,
bw_x.second,
nullptr);
memory::Copy(weightspace.place(),
bw_h_ptr,
weightspace.place(),
bw_h.first,
bw_h.second,
nullptr);
memory::Copy(weightspace.place(),
bb_x_ptr,
weightspace.place(),
bb_x.first,
bb_x.second,
nullptr);
memory::Copy(weightspace.place(),
bb_h_ptr,
weightspace.place(),
bb_h.first,
bb_h.second,
nullptr);
}
PADDLE_ENFORCE_EQ(weightspace_size,
actual_total_w_size,
platform::errors::InvalidArgument(
"The weightsize doesn't match"
" weightspace_size:%d, actual_total_w_size:%d",
weightspace_size,
actual_total_w_size));
// get reservespace_ptr
int gate_num = 4;
int hidden_data_idx = (num_layers - 1);
hidden_data_idx += (gate_num + 1) * num_layers;
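    // Reserve (gate_num + 1) blocks per layer plus (num_layers - 1)
    // inter-layer blocks for the backward pass, each block holding
    // direction_num * seq_len * batch_size * hidden_size elements.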
const int& block_size = direction_num * seq_len * batch_size * hidden_size;
reserve_data->Resize({hidden_data_idx, block_size});
reserve_data->mutable_data<T>(ctx.GetPlace());
MLUCnnl::RNNForward(ctx,
rnn_desc.get(),
seq_len_vec.data(),
weightspace_ptr,
weightspace_size,
input_seq_data_desc.get(),
GetBasePtr(input),
out_seq_data_desc.get(),
GetBasePtr(output),
hx_desc.get(),
GetBasePtr(init_h),
GetBasePtr(last_h),
cx_desc.get(),
GetBasePtr(init_c),
GetBasePtr(last_c),
GetBasePtr(reserve_data));
if (has_seq_length) {
      // if has_seq_length, mask out the padded time steps in the output of
      // cnnlRNNForwardTraining
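      // Build an int8 mask on the host: positions with t >= seq_len_vec[n] are
      // marked 1, then MLUCnnl::Mask (CNNL_MASKED_FILL) overwrites them with
      // 0.0f so padded time steps contribute nothing to the output.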
auto masked_mode = CNNL_MASKED_FILL;
float off_value = 0.0f;
phi::DenseTensor on_value_tensor(input->dtype());
phi::DenseTensor masked_tensor(framework::TransToPhiDataType(VT::INT8));
phi::DenseTensor h_masked_tensor(framework::TransToPhiDataType(VT::INT8));
on_value_tensor.Resize({1});
masked_tensor.Resize({seq_len, batch_size, direction_num * hidden_size});
h_masked_tensor.Resize(
{seq_len, batch_size, direction_num * hidden_size});
on_value_tensor.mutable_data<T>(ctx.GetPlace());
masked_tensor.mutable_data<int8_t>(ctx.GetPlace());
int8_t* h_masked_ptr =
h_masked_tensor.mutable_data<int8_t>(platform::CPUPlace());
for (int t = 0; t < seq_len; ++t) {
for (int n = 0; n < batch_size; ++n) {
for (int c = 0; c < direction_num * hidden_size; ++c) {
auto tmp_seq_len = seq_len_vec[n];
auto offset = t * batch_size * direction_num * hidden_size +
n * direction_num * hidden_size + c;
*(h_masked_ptr + offset) = t >= tmp_seq_len ? 1 : 0;
}
}
}
framework::TensorCopy(
h_masked_tensor, ctx.GetPlace(), dev_ctx, &masked_tensor);
dev_ctx.Wait();
FillMLUTensorWithHostValue(ctx, off_value, &on_value_tensor);
MLUCnnlTensorDesc on_value_desc(on_value_tensor);
MLUCnnlTensorDesc output_desc(*output);
MLUCnnlTensorDesc masked_desc(masked_tensor);
MLUCnnl::Mask(ctx,
masked_mode,
output_desc.get(),
GetBasePtr(output),
masked_desc.get(),
GetBasePtr(&masked_tensor),
on_value_desc.get(),
GetBasePtr(&on_value_tensor),
output_desc.get(),
GetBasePtr(output),
nullptr);
}
}
};
template <typename DeviceContext, typename T>
class RNNMLUGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto stream = ctx.template device_context<MLUDeviceContext>().stream();
// get the tensor pointer for the input
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto pre_state = ctx.MultiInput<phi::DenseTensor>("PreState");
auto weight_list = ctx.MultiInput<phi::DenseTensor>("WeightList");
auto* output = ctx.Input<phi::DenseTensor>("Out");
auto* reserve_data = ctx.Input<phi::DenseTensor>("Reserve");
const int& num_layers = ctx.Attr<int>("num_layers");
const bool& is_bidirec = ctx.Attr<bool>("is_bidirec");
const int& hidden_size = ctx.Attr<int>("hidden_size");
const std::string& mode = ctx.Attr<std::string>("mode");
bool has_seq_length = ctx.HasInput("SequenceLength");
const phi::DenseTensor* sequence_length = nullptr;
if (has_seq_length) {
sequence_length = ctx.Input<phi::DenseTensor>("SequenceLength");
}
PADDLE_ENFORCE_EQ(
mode,
"LSTM",
platform::errors::InvalidArgument(
"XPU only support LSTM mode now, current mode is %s", mode));
auto init_h = pre_state[0]; // -> hx
auto init_c = pre_state[1]; // -> cx
auto output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto state_grad =
ctx.MultiInput<phi::DenseTensor>(framework::GradVarName("State"));
auto last_h_grad = state_grad[0]; // -> dhy
auto last_c_grad = state_grad[1]; // -> dcy
// get the tensor pointer for the output
auto* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
auto weight_grad_list =
ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("WeightList"));
auto pre_state_grad =
ctx.MultiOutput<phi::DenseTensor>(framework::GradVarName("PreState"));
phi::DenseTensor* init_h_grad = nullptr;
phi::DenseTensor* init_c_grad = nullptr;
if (pre_state_grad.size() > 0) { // has gradient
init_h_grad = pre_state_grad[0]; // -> dhx
init_c_grad = pre_state_grad[1]; // -> dcx
}
// check shape
const int in_out_dim_num = input->dims().size();
const int& seq_len = input->dims()[0];
const int& batch_size = input->dims()[1];
const int& input_dim = input->dims()[2];
const int& direction_num = is_bidirec ? 2 : 1;
int in_dim_arr[in_out_dim_num] = {seq_len, batch_size, input_dim};
int out_dim_arr[in_out_dim_num] = {
seq_len, batch_size, direction_num * hidden_size};
int proj_size = hidden_size;
PADDLE_ENFORCE_EQ(
num_layers,
1,
platform::errors::InvalidArgument(
"MLU only support 1 num_layers, current num_layers is %s",
num_layers));
PADDLE_ENFORCE_EQ(
init_h->dims()[0],
num_layers * direction_num,
platform::errors::InvalidArgument("The num_layers of in RNN layer must"
" be the same as first dim of init"
"hidden, but received num_layers:%d,"
" dim:%d",
num_layers,
init_h->dims()[0]));
PADDLE_ENFORCE_EQ(
init_c->dims()[0],
num_layers * direction_num,
platform::errors::InvalidArgument(
"The num_layers of in RNN layer must"
" be the same as first dim of cell state hidden, but received"
" num_layers:%d, dim:%d",
num_layers,
init_c->dims()[0]));
std::vector<std::vector<std::pair<T*, size_t>>> parameter_lists;
parameter_lists.resize(num_layers);
reset_parameter_vector(
weight_list, num_layers, is_bidirec, &parameter_lists);
for (unsigned int i = 0; i < weight_grad_list.size(); ++i) {
weight_grad_list[i]->mutable_data<T>(ctx.GetPlace());
}
std::vector<std::vector<std::pair<T*, size_t>>> parameter_lists_grad;
parameter_lists_grad.resize(num_layers);
reset_parameter_vector(
weight_grad_list, num_layers, is_bidirec, &parameter_lists_grad);
    // allocate the memory and initialize the input_grad
input_grad->mutable_data<T>(input->dims(), ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0), input_grad);
phi::DenseTensor a, b;
phi::DenseTensor* dynamic_grad_pre_h = &a;
phi::DenseTensor* dynamic_grad_pre_c = &b;
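    // When gradients of PreState are not requested, back dhx/dcx with
    // temporary tensors so MLUCnnl::RNNBackward still has valid destinations.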
if (init_h_grad) {
init_h_grad->mutable_data<T>(last_h_grad->dims(), ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0), init_h_grad);
} else {
dynamic_grad_pre_h->Resize(last_h_grad->dims());
dynamic_grad_pre_h->mutable_data<T>(ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0.0), dynamic_grad_pre_h);
init_h_grad = dynamic_grad_pre_h;
}
if (init_c_grad) {
init_c_grad->mutable_data<T>(last_c_grad->dims(), ctx.GetPlace());
} else {
dynamic_grad_pre_c->Resize(last_h_grad->dims());
dynamic_grad_pre_c->mutable_data<T>(ctx.GetPlace());
init_c_grad = dynamic_grad_pre_c;
}
std::vector<int> seq_len_vec(batch_size, seq_len);
if (has_seq_length) {
seq_len_vec = phi::GetVectorFromTensor(sequence_length);
}
cnnlDirectionMode_t direction =
is_bidirec ? CNNL_RNN_BIDIRECTIONAL : CNNL_RNN_UNIDIRECTIONAL;
MLUSeqDataDesc input_seq_data_desc(CNNL_SEQDATA_TNC,
ToCnnlDataType(input->dtype()),
in_out_dim_num,
in_dim_arr,
static_cast<int>(seq_len_vec.size()),
seq_len_vec.data(),
nullptr);
MLUSeqDataDesc out_seq_data_desc(CNNL_SEQDATA_TNC,
ToCnnlDataType(input->dtype()),
in_out_dim_num,
out_dim_arr,
static_cast<int>(seq_len_vec.size()),
seq_len_vec.data(),
nullptr);
MLUCnnlTensorDesc hx_desc(*init_h);
MLUCnnlTensorDesc cx_desc(*init_c);
MLURNNDesc rnn_desc(CNNL_LSTM,
CNNL_RNN_DOUBLE_BIAS,
direction,
CNNL_RNN_LINEAR_INPUT,
ToCnnlDataType(input->dtype()),
ToCnnlDataType(input->dtype()),
input_dim,
hidden_size,
/*projection*/ proj_size,
num_layers,
nullptr,
CNNL_RNN_PADDED_IO_DISABLED);
rnn_desc.SetRNNMaskMode(CNNL_LSTM_MASK_ENABLED);
// copy weight
size_t weightspace_size;
phi::DenseTensor weightspace, dweightspace;
PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNWeightSpaceSize(
GetHandleFromCTX(ctx), rnn_desc.get(), &weightspace_size));
weightspace = ctx.AllocateTmpTensor<T, DeviceContext>(
{static_cast<int64_t>(weightspace_size)}, dev_ctx);
dweightspace = ctx.AllocateTmpTensor<T, DeviceContext>(
{static_cast<int64_t>(weightspace_size)}, dev_ctx);
void* weightspace_ptr = weightspace.mutable_data(ctx.GetPlace());
auto w_x = parameter_lists[0][0];
auto w_h = parameter_lists[0][1];
auto b_x = parameter_lists[0][2];
auto b_h = parameter_lists[0][3];
auto actual_total_w_size =
w_x.second + w_h.second + b_x.second + b_h.second;
void* w_x_ptr = weightspace_ptr;
void* w_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second;
void* b_x_ptr =
static_cast<char*>(weightspace_ptr) + w_x.second + w_h.second;
void* b_h_ptr = static_cast<char*>(weightspace_ptr) + w_x.second +
w_h.second + b_x.second;
memory::Copy(weightspace.place(),
w_x_ptr,
weightspace.place(),
w_x.first,
w_x.second,
stream);
memory::Copy(weightspace.place(),
w_h_ptr,
weightspace.place(),
w_h.first,
w_h.second,
stream);
memory::Copy(weightspace.place(),
b_x_ptr,
weightspace.place(),
b_x.first,
b_x.second,
stream);
memory::Copy(weightspace.place(),
b_h_ptr,
weightspace.place(),
b_h.first,
b_h.second,
stream);
if (is_bidirec) {
auto bw_x = parameter_lists[0][4];
auto bw_h = parameter_lists[0][5];
auto bb_x = parameter_lists[0][6];
auto bb_h = parameter_lists[0][7];
void* bw_x_ptr =
static_cast<char*>(weightspace_ptr) + actual_total_w_size;
void* bw_h_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second;
void* bb_x_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second + bw_h.second;
void* bb_h_ptr = static_cast<char*>(weightspace_ptr) +
actual_total_w_size + bw_x.second + bw_h.second +
bb_x.second;
actual_total_w_size +=
bw_x.second + bw_h.second + bb_x.second + bb_h.second;
memory::Copy(weightspace.place(),
bw_x_ptr,
weightspace.place(),
bw_x.first,
bw_x.second,
stream);
memory::Copy(weightspace.place(),
bw_h_ptr,
weightspace.place(),
bw_h.first,
bw_h.second,
stream);
memory::Copy(weightspace.place(),
bb_x_ptr,
weightspace.place(),
bb_x.first,
bb_x.second,
stream);
memory::Copy(weightspace.place(),
bb_h_ptr,
weightspace.place(),
bb_h.first,
bb_h.second,
stream);
}
dev_ctx.Wait();
PADDLE_ENFORCE_EQ(weightspace_size,
actual_total_w_size,
platform::errors::InvalidArgument(
"The weightsize doesn't match"
" weightspace_size:%d, actual_total_w_size:%d",
weightspace_size,
actual_total_w_size));
MLUCnnl::RNNBackward(ctx,
rnn_desc.get(),
CNNL_WGRAD_MODE_SET,
seq_len_vec.data(),
GetBasePtr(&weightspace),
GetBasePtr(&dweightspace),
weightspace.numel() * sizeof(T),
input_seq_data_desc.get(),
GetBasePtr(input),
GetBasePtr(input_grad),
out_seq_data_desc.get(),
GetBasePtr(output),
GetBasePtr(output_grad),
hx_desc.get(),
GetBasePtr(init_h),
GetBasePtr(last_h_grad),
GetBasePtr(init_h_grad),
cx_desc.get(),
GetBasePtr(init_c),
GetBasePtr(last_c_grad),
GetBasePtr(init_c_grad),
const_cast<void*>(GetBasePtr(reserve_data)),
reserve_data->numel() * sizeof(T));
void* dweightspace_ptr = dweightspace.mutable_data(ctx.GetPlace());
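    // Scatter the packed gradient weight space back into the individual
    // WeightList gradient tensors, mirroring the packing order used above.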
auto dw_x = parameter_lists_grad[0][0];
auto dw_h = parameter_lists_grad[0][1];
auto db_x = parameter_lists_grad[0][2];
auto db_h = parameter_lists_grad[0][3];
auto dactual_total_w_size =
dw_x.second + dw_h.second + db_x.second + db_h.second;
void* dw_x_ptr = dweightspace_ptr;
void* dw_h_ptr = static_cast<char*>(dweightspace_ptr) + dw_x.second;
void* db_x_ptr =
static_cast<char*>(dweightspace_ptr) + dw_x.second + dw_h.second;
void* db_h_ptr = static_cast<char*>(dweightspace_ptr) + dw_x.second +
dw_h.second + db_x.second;
memory::Copy(weightspace.place(),
dw_x.first,
weightspace.place(),
dw_x_ptr,
dw_x.second,
stream);
memory::Copy(weightspace.place(),
dw_h.first,
weightspace.place(),
dw_h_ptr,
dw_h.second,
stream);
memory::Copy(weightspace.place(),
db_x.first,
weightspace.place(),
db_x_ptr,
db_x.second,
stream);
memory::Copy(weightspace.place(),
db_h.first,
weightspace.place(),
db_h_ptr,
db_h.second,
stream);
if (is_bidirec) {
auto dbw_x = parameter_lists_grad[0][4];
auto dbw_h = parameter_lists_grad[0][5];
auto dbb_x = parameter_lists_grad[0][6];
auto dbb_h = parameter_lists_grad[0][7];
void* dbw_x_ptr =
static_cast<char*>(dweightspace_ptr) + dactual_total_w_size;
void* dbw_h_ptr = static_cast<char*>(dweightspace_ptr) +
dactual_total_w_size + dbw_x.second;
void* dbb_x_ptr = static_cast<char*>(dweightspace_ptr) +
dactual_total_w_size + dbw_x.second + dbw_h.second;
void* dbb_h_ptr = static_cast<char*>(dweightspace_ptr) +
dactual_total_w_size + dbw_x.second + dbw_h.second +
dbb_x.second;
dactual_total_w_size +=
dbw_x.second + dbw_h.second + dbb_x.second + dbb_h.second;
memory::Copy(weightspace.place(),
dbw_x.first,
weightspace.place(),
dbw_x_ptr,
dbw_x.second,
stream);
memory::Copy(weightspace.place(),
dbw_h.first,
weightspace.place(),
dbw_h_ptr,
dbw_h.second,
stream);
memory::Copy(weightspace.place(),
dbb_x.first,
weightspace.place(),
dbb_x_ptr,
dbb_x.second,
stream);
memory::Copy(weightspace.place(),
dbb_h.first,
weightspace.place(),
dbb_h_ptr,
dbb_h.second,
stream);
}
dev_ctx.Wait();
PADDLE_ENFORCE_EQ(weightspace_size,
dactual_total_w_size,
platform::errors::InvalidArgument(
"The weightsize doesn't match"
" weightspace_size:%d, dactual_total_w_size:%d",
weightspace_size,
dactual_total_w_size));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
rnn, ops::RNNMLUKernel<paddle::platform::MLUDeviceContext, float>);
REGISTER_OP_MLU_KERNEL(
rnn_grad, ops::RNNMLUGradKernel<paddle::platform::MLUDeviceContext, float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ROIAlignOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<phi::DenseTensor>("X");
auto* rois = ctx.Input<phi::DenseTensor>("ROIs");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
out->set_layout(phi::DataLayout::kNHWC);
auto pooled_height = ctx.Attr<int>("pooled_height");
auto pooled_width = ctx.Attr<int>("pooled_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto aligned = ctx.Attr<bool>("aligned");
const auto& in_dims = in->dims();
int batch_size = in_dims[0];
int rois_num = rois->dims()[0];
if (rois_num == 0) return;
auto cplace = platform::CPUPlace();
std::vector<int> roi_batch_id_list(rois_num);
int rois_batch_size = 0;
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<phi::DenseTensor>("RoisNum");
rois_batch_size = rois_num_t->numel();
PADDLE_ENFORCE_EQ(
rois_batch_size,
batch_size,
platform::errors::InvalidArgument(
"The batch size of rois and the batch size of images "
" must be the same. But received the batch size of rois is %d, "
"and the batch size of images is %d",
rois_batch_size,
batch_size));
std::vector<int> rois_num_list(rois_batch_size);
memory::Copy(cplace,
rois_num_list.data(),
ctx.GetPlace(),
rois_num_t->data<int>(),
sizeof(int) * rois_batch_size,
nullptr /*stream*/);
int last_idx = 0;
for (int i = 0; i < rois_batch_size; i++) {
int end_idx = last_idx + rois_num_list[i];
for (int j = last_idx; j < end_idx; j++) {
roi_batch_id_list[j] = i;
}
last_idx = end_idx;
}
} else {
auto lod = rois->lod();
PADDLE_ENFORCE_EQ(lod.empty(),
false,
platform::errors::InvalidArgument(
"Input(ROIs) phi::DenseTensor of ROIAlignOp "
"does not contain LoD information."));
auto rois_lod = lod.back();
rois_batch_size = rois_lod.size() - 1;
PADDLE_ENFORCE_EQ(rois_batch_size,
batch_size,
platform::errors::InvalidArgument(
"The rois_batch_size and imgs "
"batch_size must be the same. But received "
"rois_batch_size = %d, "
"batch_size = %d",
rois_batch_size,
batch_size));
int rois_num_with_lod = rois_lod[rois_batch_size];
PADDLE_ENFORCE_EQ(
rois_num,
rois_num_with_lod,
platform::errors::InvalidArgument(
"The actual number of rois and the number of rois "
"provided from Input(RoIsLoD) in RoIAlign must be the same."
" But received actual number of rois is %d, and the number "
"of rois from RoIsLoD is %d",
rois_num,
rois_num_with_lod));
for (int i = 0; i < rois_batch_size; i++) {
int start_idx = rois_lod[i];
int end_idx = rois_lod[i + 1];
for (int j = start_idx; j < end_idx; j++) {
roi_batch_id_list[j] = i;
}
}
}
// only support float32 for now
phi::DenseTensor rois_cpu(framework::TransToPhiDataType(VT::FP32));
rois_cpu.Resize({rois_num, 4});
rois_cpu.mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(*rois, cplace, dev_ctx, &rois_cpu);
dev_ctx.Wait();
T* rois_cpu_ptr = rois_cpu.mutable_data<T>(platform::CPUPlace());
// boxes; [batch_idx, x1, y1, x2, y2]
phi::DenseTensor boxes_cpu(framework::TransToPhiDataType(VT::FP32));
phi::DenseTensor boxes_mlu(framework::TransToPhiDataType(VT::FP32));
boxes_cpu.Resize({rois_num, 5});
boxes_mlu.Resize({rois_num, 5});
T* boxes_cpu_ptr = boxes_cpu.mutable_data<T>(platform::CPUPlace());
boxes_mlu.mutable_data<T>(ctx.GetPlace());
for (int i = 0; i < rois_num; ++i) {
boxes_cpu_ptr[i * 5 + 0] = static_cast<T>(roi_batch_id_list[i]);
boxes_cpu_ptr[i * 5 + 1] = rois_cpu_ptr[i * 4 + 0];
boxes_cpu_ptr[i * 5 + 2] = rois_cpu_ptr[i * 4 + 1];
boxes_cpu_ptr[i * 5 + 3] = rois_cpu_ptr[i * 4 + 2];
boxes_cpu_ptr[i * 5 + 4] = rois_cpu_ptr[i * 4 + 3];
}
// copy boxes_cpu to boxes_mlu
framework::TensorCopy(boxes_cpu, ctx.GetPlace(), dev_ctx, &boxes_mlu);
dev_ctx.Wait();
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
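    // Run RoiAlign in NHWC layout: transpose the input to NHWC, compute, then
    // transpose the result back to NCHW.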
phi::DenseTensor input_nhwc(in->type());
phi::DenseTensor output_nhwc(out->type());
TransposeFromMLUTensor<T>(
ctx, perm_to_nhwc, in, &input_nhwc, true /*need_reshape_or_alloc*/);
auto output_dims = out->dims();
output_nhwc.mutable_data<T>(
{output_dims[0], output_dims[2], output_dims[3], output_dims[1]},
ctx.GetPlace());
MLUCnnlTensorDesc input_desc(
input_nhwc, CNNL_LAYOUT_NHWC, ToCnnlDataType(input_nhwc.dtype()));
MLUCnnlTensorDesc boxes_desc(boxes_mlu);
MLUCnnlTensorDesc out_desc(
output_nhwc, CNNL_LAYOUT_NHWC, ToCnnlDataType(output_nhwc.dtype()));
MLUCnnl::RoiAlign(ctx,
pooled_height,
pooled_width,
sampling_ratio,
spatial_scale,
aligned,
input_desc.get(),
GetBasePtr(&input_nhwc),
boxes_desc.get(),
GetBasePtr(&boxes_mlu),
out_desc.get(),
GetBasePtr(&output_nhwc));
TransposeFromMLUTensor<T>(
ctx, perm_to_nchw, &output_nhwc, out, false /*need_reshape_or_alloc*/);
  }
};
template <typename T>
class ROIAlignGradOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* rois = ctx.Input<phi::DenseTensor>("ROIs");
auto* out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto spatial_scale = ctx.Attr<T>("spatial_scale");
auto sampling_ratio = ctx.Attr<int>("sampling_ratio");
auto aligned = ctx.Attr<bool>("aligned");
int rois_num = rois->dims()[0];
if (!in_grad) {
return;
}
in_grad->mutable_data<T>(ctx.GetPlace());
std::vector<int> roi_batch_id_list(rois_num);
auto cplace = platform::CPUPlace();
int rois_batch_size = 0;
if (ctx.HasInput("RoisNum")) {
auto* rois_num_t = ctx.Input<phi::DenseTensor>("RoisNum");
rois_batch_size = rois_num_t->numel();
std::vector<int> rois_num_list(rois_batch_size);
memory::Copy(cplace,
rois_num_list.data(),
ctx.GetPlace(),
rois_num_t->data<int>(),
sizeof(int) * rois_batch_size,
nullptr /*stream*/);
int last_idx = 0;
for (int i = 0; i < rois_batch_size; i++) {
int end_idx = last_idx + rois_num_list[i];
for (int j = last_idx; j < end_idx; j++) {
roi_batch_id_list[j] = i;
}
last_idx = end_idx;
}
} else {
auto rois_lod = rois->lod().back();
rois_batch_size = rois_lod.size() - 1;
for (int i = 0; i < rois_batch_size; i++) {
int start_idx = rois_lod[i];
int end_idx = rois_lod[i + 1];
for (int j = start_idx; j < end_idx; j++) {
roi_batch_id_list[j] = i;
}
}
}
phi::DenseTensor rois_cpu(framework::TransToPhiDataType(VT::FP32));
rois_cpu.Resize({rois_num, 4});
rois_cpu.mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(*rois, cplace, dev_ctx, &rois_cpu);
dev_ctx.Wait();
T* rois_cpu_ptr = rois_cpu.mutable_data<T>(platform::CPUPlace());
// boxes; [batch_idx, x1, y1, x2, y2]
phi::DenseTensor boxes_cpu(framework::TransToPhiDataType(VT::FP32));
phi::DenseTensor boxes_mlu(framework::TransToPhiDataType(VT::FP32));
boxes_cpu.Resize({rois_num, 5});
boxes_mlu.Resize({rois_num, 5});
T* boxes_cpu_ptr = boxes_cpu.mutable_data<T>(platform::CPUPlace());
boxes_mlu.mutable_data<T>(ctx.GetPlace());
for (int i = 0; i < rois_num; ++i) {
boxes_cpu_ptr[i * 5 + 0] = static_cast<T>(roi_batch_id_list[i]);
boxes_cpu_ptr[i * 5 + 1] = rois_cpu_ptr[i * 4 + 0];
boxes_cpu_ptr[i * 5 + 2] = rois_cpu_ptr[i * 4 + 1];
boxes_cpu_ptr[i * 5 + 3] = rois_cpu_ptr[i * 4 + 2];
boxes_cpu_ptr[i * 5 + 4] = rois_cpu_ptr[i * 4 + 3];
}
// copy boxes_cpu to boxes_mlu
framework::TensorCopy(boxes_cpu, ctx.GetPlace(), dev_ctx, &boxes_mlu);
dev_ctx.Wait();
const std::vector<int> perm_to_nhwc = {0, 2, 3, 1};
const std::vector<int> perm_to_nchw = {0, 3, 1, 2};
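    // Same NHWC round trip as the forward pass: transpose the incoming
    // gradient to NHWC, run RoiAlignBackward, and transpose the image gradient
    // back to NCHW.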
phi::DenseTensor grads_nhwc(out_grad->type());
phi::DenseTensor grads_image_nhwc(in_grad->type());
TransposeFromMLUTensor<T>(ctx,
perm_to_nhwc,
out_grad,
&grads_nhwc,
true /*need_reshape_or_alloc*/);
auto grads_image_dims = in_grad->dims();
grads_image_nhwc.mutable_data<T>({grads_image_dims[0],
grads_image_dims[2],
grads_image_dims[3],
grads_image_dims[1]},
ctx.GetPlace());
MLUCnnlTensorDesc grads_desc(
grads_nhwc, CNNL_LAYOUT_NHWC, ToCnnlDataType(grads_nhwc.dtype()));
MLUCnnlTensorDesc boxes_desc(boxes_mlu);
MLUCnnlTensorDesc grads_image_desc(
grads_image_nhwc,
CNNL_LAYOUT_NHWC,
ToCnnlDataType(grads_image_nhwc.dtype()));
MLUCnnl::RoiAlignBackward(ctx,
sampling_ratio,
spatial_scale,
aligned,
grads_desc.get(),
GetBasePtr(&grads_nhwc),
boxes_desc.get(),
GetBasePtr(&boxes_mlu),
grads_image_desc.get(),
GetBasePtr(&grads_image_nhwc));
TransposeFromMLUTensor<T>(ctx,
perm_to_nchw,
&grads_image_nhwc,
in_grad,
false /*need_reshape_or_alloc*/);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(roi_align, ops::ROIAlignOpMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(roi_align_grad, ops::ROIAlignGradOpMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ScaleMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = GetDevCtxFromCTX(ctx);
auto* in_var = ctx.InputVar("X");
auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
    // cnnl requires input, scale and bias to have the same type and to reside
    // on the device.
auto scale = static_cast<T>(ctx.Attr<float>("scale"));
phi::DenseTensor scale_tensor;
if (ctx.HasInput("ScaleTensor")) {
phi::DenseTensor float_scale_tensor =
*ctx.Input<phi::DenseTensor>("ScaleTensor");
if (framework::TransToProtoVarType(float_scale_tensor.dtype()) !=
framework::TransToProtoVarType(in->dtype())) {
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc float_scale_desc(float_scale_tensor);
MLUCnnlTensorDesc final_scale_desc(scale_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(float_scale_tensor.dtype()),
framework::TransToProtoVarType(scale_tensor.dtype()));
MLUCnnl::Cast(ctx,
cast_type,
float_scale_desc.get(),
GetBasePtr(&float_scale_tensor),
final_scale_desc.get(),
GetBasePtr(&scale_tensor));
} else {
scale_tensor = float_scale_tensor;
}
} else {
scale_tensor = ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&scale,
scale_desc.get(),
GetBasePtr(&scale_tensor));
}
auto bias = static_cast<T>(ctx.Attr<float>("bias"));
phi::DenseTensor bias_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc bias_desc(bias_tensor);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&bias,
bias_desc.get(),
GetBasePtr(&bias_tensor));
auto* out_var = ctx.OutputVar("Out");
if (in_var->IsType<phi::SelectedRows>() && in_var != out_var) {
auto& in_slr = in_var->Get<phi::SelectedRows>();
auto* out_slr = out_var->GetMutable<phi::SelectedRows>();
out_slr->set_rows(in_slr.rows());
out_slr->set_height(in_slr.height());
}
auto* out =
framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
out->mutable_data<T>(in->place());
MLUCnnlTensorDesc input_desc(*in);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc output_desc(*out);
const int axis = std::max(in->dims().size() - 1, 0);
auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
if (bias_after_scale) {
MLUCnnl::Scale(ctx,
axis,
input_desc.get(),
GetBasePtr(in),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
output_desc.get(),
GetBasePtr(out));
} else {
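      // bias_after_scale == false means Out = scale * (X + bias); fold the
      // bias into new_bias = scale * bias so the same Scale call
      // (Out = scale * X + new_bias) produces the result.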
phi::DenseTensor new_bias_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc new_bias_desc(new_bias_tensor);
MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL,
ToCnnlDataType(in->dtype()),
CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
new_bias_desc.get(),
GetBasePtr(&new_bias_tensor),
ToCnnlDataType(in->dtype()));
MLUCnnl::Scale(ctx,
axis,
input_desc.get(),
GetBasePtr(in),
scale_desc.get(),
GetBasePtr(&scale_tensor),
new_bias_desc.get(),
GetBasePtr(&new_bias_tensor),
output_desc.get(),
GetBasePtr(out));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(scale,
ops::ScaleMLUKernel<float>,
ops::ScaleMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class ScatterMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* indices = ctx.Input<phi::DenseTensor>("Ids");
auto* updates = ctx.Input<phi::DenseTensor>("Updates");
bool overwrite = ctx.Attr<bool>("overwrite");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc indices_desc(*indices);
MLUCnnlTensorDesc updates_desc(*updates);
MLUCnnlTensorDesc out_desc(*out);
cnnlScatterRefMode_t mode;
if (overwrite) {
mode = CNNL_SCATTERREF_UPDATE;
MLUCnnl::ScatterRefFunctor(ctx,
x_desc.get(),
GetBasePtr(x),
updates_desc.get(),
GetBasePtr(updates),
indices_desc.get(),
GetBasePtr(indices),
mode);
} else {
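      // overwrite == false: emulate accumulation by first zeroing the selected
      // rows of X and then scatter-adding the updates, so repeated indices sum
      // instead of overwriting each other.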
phi::DenseTensor tensor_zeros(updates->type());
tensor_zeros.mutable_data<T>(updates->dims(), ctx.GetPlace());
MLUCnnlTensorDesc tensor_zeros_desc(tensor_zeros);
float value = 0.0;
auto value_t = static_cast<T>(value);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&value_t,
tensor_zeros_desc.get(),
GetBasePtr(&tensor_zeros));
mode = CNNL_SCATTERREF_UPDATE;
MLUCnnl::ScatterRefFunctor(ctx,
x_desc.get(),
GetBasePtr(x),
tensor_zeros_desc.get(),
GetBasePtr(&tensor_zeros),
indices_desc.get(),
GetBasePtr(indices),
mode);
mode = CNNL_SCATTERREF_ADD;
MLUCnnl::ScatterRefFunctor(ctx,
x_desc.get(),
GetBasePtr(x),
updates_desc.get(),
GetBasePtr(updates),
indices_desc.get(),
GetBasePtr(indices),
mode);
}
paddle::framework::TensorCopy(*x, place, out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(scatter,
ops::ScatterMLUKernel<float>,
ops::ScatterMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <numeric>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/set_value_op.h"
namespace paddle {
namespace operators {
using MLUDeviceContext = platform::MLUDeviceContext;
template <typename T>
class SetValueMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* in = ctx.Input<phi::DenseTensor>("Input");
auto* value_tensor = ctx.Input<phi::DenseTensor>("ValueTensor");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto starts_tensor_list =
ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
auto ends_tensor_list = ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
auto steps_tensor_list =
ctx.MultiInput<phi::DenseTensor>("StepsTensorList");
auto axes = ctx.Attr<std::vector<int64_t>>("axes");
auto starts = ctx.Attr<std::vector<int64_t>>("starts");
auto ends = ctx.Attr<std::vector<int64_t>>("ends");
auto steps = ctx.Attr<std::vector<int64_t>>("steps");
auto shape = ctx.Attr<std::vector<int64_t>>("shape");
auto decrease_axes = ctx.Attr<std::vector<int64_t>>("decrease_axes");
auto none_axes = ctx.Attr<std::vector<int64_t>>("none_axes");
if (!starts_tensor_list.empty()) {
starts = GetDataFromTensorList<int64_t>(starts_tensor_list);
}
if (!ends_tensor_list.empty()) {
ends = GetDataFromTensorList<int64_t>(ends_tensor_list);
}
if (!steps_tensor_list.empty()) {
steps = GetDataFromTensorList<int64_t>(steps_tensor_list);
}
auto in_dims = in->dims();
phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps);
auto slice_dims =
phi::funcs::GetSliceDims(in_dims, axes, starts, ends, &steps);
auto decrease_slice_dims =
phi::funcs::GetDecreasedDims(slice_dims, decrease_axes);
auto slice_dims_for_assign = decrease_slice_dims;
if (!none_axes.empty()) {
std::vector<int64_t> slice_dims_with_none;
size_t none_axes_cur = 0, decrease_axes_cur = 0;
for (int i = 0; i < slice_dims.size(); ++i) {
while (none_axes_cur < none_axes.size() &&
none_axes[none_axes_cur] <= i) {
slice_dims_with_none.push_back(1);
none_axes_cur++;
}
if (decrease_axes_cur < decrease_axes.size() &&
decrease_axes[decrease_axes_cur] == i) {
decrease_axes_cur++;
} else {
slice_dims_with_none.push_back(slice_dims[i]);
}
}
while (none_axes_cur < none_axes.size()) {
slice_dims_with_none.push_back(1);
none_axes_cur++;
}
slice_dims_for_assign = phi::make_ddim(slice_dims_with_none);
}
int in_size = in_dims.size();
int starts_indices[in_size] = {0};
int ends_indices[in_size] = {0};
int strides_indices[in_size] = {0};
for (int i = 0; i < in_dims.size(); ++i) {
starts_indices[i] = 0;
ends_indices[i] = static_cast<int>(slice_dims[i]);
strides_indices[i] = 1;
}
for (size_t i = 0; i < axes.size(); i++) {
int axis_index = axes[i];
starts_indices[axis_index] = static_cast<int>(starts[i]);
ends_indices[axis_index] = static_cast<int>(ends[i]);
strides_indices[axis_index] = static_cast<int>(steps[i]);
}
phi::DenseTensor value_t(in->type());
if (value_tensor != nullptr) {
value_t.ShareDataWith(*value_tensor);
} else {
auto value_dims = phi::make_ddim(shape);
CheckIsDimsMatch(slice_dims_for_assign, value_dims);
value_t.mutable_data<T>(value_dims, ctx.GetPlace());
auto value_name =
GetValueName(framework::TransToProtoVarType(in->dtype()));
CopyVectorToTensor<T>(value_name.c_str(), &value_t, ctx);
value_t.Resize(value_dims);
}
phi::DenseTensor value_temp(in->type());
if (slice_dims_for_assign == value_t.dims()) {
value_temp.ShareDataWith(value_t);
} else {
value_temp.Resize(slice_dims_for_assign);
value_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc value_t_desc(value_t);
MLUCnnlTensorDesc value_temp_desc(value_temp);
MLUCnnl::BroadcastTo(ctx,
value_t_desc.get(),
GetBasePtr(&value_t),
value_temp_desc.get(),
GetBasePtr(&value_temp));
}
int64_t input_numel = phi::product(in_dims);
int64_t value_numel = phi::product(value_temp.dims());
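    // Strategy: enumerate the flat indices 0..numel-1 of the input, run
    // StridedSlice over that index tensor to collect the linear positions the
    // slice selects, then ScatterRef the broadcast value into the flattened
    // input at those positions.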
phi::DenseTensor in_temp, out_temp, val_temp, index_out;
int64_t stride_step = phi::product(in_dims);
std::vector<int64_t> index_indices(stride_step);
std::iota(index_indices.begin(), index_indices.end(), 0);
phi::DenseTensor index_temp;
in_temp.ShareDataWith(*in);
val_temp.ShareDataWith(value_temp);
paddle::framework::TensorFromVector(
index_indices, ctx.device_context(), &index_temp);
index_temp.Resize(in_dims);
auto index_dims = in_dims;
for (int i = 0; i < in_dims.size(); ++i) {
if (starts_indices[i] < 0 || ends_indices[i] < 0) {
starts_indices[i] -= in_dims[i];
ends_indices[i] -= in_dims[i];
}
if (strides_indices[i] > 0)
index_dims[i] =
static_cast<int>((ends_indices[i] - starts_indices[i] - 1) /
strides_indices[i]) +
1;
else
index_dims[i] =
static_cast<int>((ends_indices[i] - starts_indices[i] + 1) /
strides_indices[i]) +
1;
}
auto new_in_dims = phi::make_ddim({input_numel});
auto new_val_dims = phi::make_ddim({value_numel});
in_temp.Resize(new_in_dims);
val_temp.Resize(new_val_dims);
index_out.Resize(index_dims);
index_out.mutable_data<int64_t>(ctx.GetPlace());
cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE;
MLUCnnlTensorDesc x_desc(in_temp);
MLUCnnlTensorDesc indices_desc(index_temp);
MLUCnnlTensorDesc indices_out_desc(index_out);
MLUCnnlTensorDesc updates_desc(val_temp);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::StridedSlice(ctx,
starts_indices,
ends_indices,
strides_indices,
indices_desc.get(),
GetBasePtr(&index_temp),
indices_out_desc.get(),
GetBasePtr(&index_out));
PADDLE_ENFORCE_EQ(
static_cast<int64_t>(phi::product(index_out.dims())),
phi::product(slice_dims_for_assign),
platform::errors::InvalidArgument(
"OP(set_value) error index indices and value update not match "));
phi::DenseTensor index_final;
index_final.ShareDataWith(index_out);
int64_t indices_numel = phi::product(index_dims);
auto new_index_dims = phi::make_ddim({indices_numel});
index_final.Resize(new_index_dims);
MLUCnnlTensorDesc indices_final_desc(index_final);
MLUCnnl::ScatterRefFunctor(ctx,
x_desc.get(),
GetBasePtr(&in_temp),
updates_desc.get(),
GetBasePtr(&val_temp),
indices_final_desc.get(),
GetBasePtr(&index_final),
mode);
in_temp.Resize(in_dims);
paddle::framework::TensorCopy(in_temp, ctx.GetPlace(), out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(set_value,
ops::SetValueMLUKernel<int>,
ops::SetValueMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using SelectedRows = phi::SelectedRows;
template <typename T>
class ShapeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in_var = ctx.InputVar("Input");
framework::DDim in_dims;
if (in_var->IsType<phi::SelectedRows>()) {
in_dims = in_var->Get<phi::SelectedRows>().value().dims();
} else {
in_dims = in_var->Get<phi::DenseTensor>().dims();
}
auto* out_t = ctx.Output<phi::DenseTensor>("Out");
out_t->Resize({in_dims.size()});
out_t->mutable_data<int32_t>(ctx.GetPlace());
    // compute the shape on the CPU first
phi::DenseTensor shape_on_cpu(
framework::TransToPhiDataType(framework::proto::VarType::INT32));
shape_on_cpu.Resize({in_dims.size()});
auto cpu_data = shape_on_cpu.mutable_data<int32_t>(platform::CPUPlace());
for (int i = 0; i < in_dims.size(); ++i) {
cpu_data[i] = in_dims[i];
}
// cpu to mlu
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(shape_on_cpu, ctx.GetPlace(), dev_ctx, out_t);
dev_ctx.Wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(shape,
ops::ShapeMLUKernel<bool>,
ops::ShapeMLUKernel<uint8_t>,
ops::ShapeMLUKernel<int8_t>,
ops::ShapeMLUKernel<int>,
ops::ShapeMLUKernel<int64_t>,
ops::ShapeMLUKernel<paddle::platform::float16>,
ops::ShapeMLUKernel<float>,
ops::ShapeMLUKernel<double>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
const int kIgnoreIndex = -100;
void CheckAttrs(const framework::ExecutionContext& ctx) {
  // cnnl does not support normalize or ignore_index
bool normalize = ctx.Attr<bool>("normalize");
int ignore_index = ctx.Attr<int>("ignore_index");
PADDLE_ENFORCE_EQ(normalize,
false,
platform::errors::InvalidArgument(
"attr normalize must be false, but got true"));
PADDLE_ENFORCE_EQ(ignore_index,
kIgnoreIndex,
platform::errors::InvalidArgument(
"attr ignore_index must be default %d, but got %d",
kIgnoreIndex,
ignore_index));
}
template <typename T>
class SigmoidCrossEntropyWithLogitsMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
CheckAttrs(ctx);
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* label = ctx.Input<phi::DenseTensor>("Label");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc label_desc(*label);
MLUCnnlTensorDesc out_desc(*out);
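    // The forward pass maps onto cnnl's BCE-with-logits primitive with no
    // weight/pos_weight tensors and no reduction (CNNL_BCE_WITH_LOGITS_NONE).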
MLUCnnl::BceWithLogits(ctx,
CNNL_BCE_WITH_LOGITS_NONE,
x_desc.get(),
GetBasePtr(x),
label_desc.get(),
GetBasePtr(label),
nullptr,
nullptr,
nullptr,
nullptr,
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class SigmoidCrossEntropyWithLogitsMLUGradKernel
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
CheckAttrs(ctx);
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* label = ctx.Input<phi::DenseTensor>("Label");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto place = ctx.GetPlace();
dx->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc label_desc(*label);
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnl::BceWithLogitsBackward(ctx,
CNNL_BCE_WITH_LOGITS_NONE,
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
label_desc.get(),
GetBasePtr(label),
nullptr,
nullptr,
nullptr,
nullptr,
x_desc.get(),
GetBasePtr(dx));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
sigmoid_cross_entropy_with_logits,
ops::SigmoidCrossEntropyWithLogitsMLUKernel<float>,
ops::SigmoidCrossEntropyWithLogitsMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(
sigmoid_cross_entropy_with_logits_grad,
ops::SigmoidCrossEntropyWithLogitsMLUGradKernel<float>,
ops::SigmoidCrossEntropyWithLogitsMLUGradKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class SizeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("Input");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<int64_t>(ctx.GetPlace());
int64_t size = x->numel();
FillMLUTensorWithHostValue<int64_t>(ctx, size, out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(size,
ops::SizeMLUKernel<int>,
ops::SizeMLUKernel<int64_t>,
ops::SizeMLUKernel<paddle::platform::float16>,
ops::SizeMLUKernel<float>,
ops::SizeMLUKernel<double>,
ops::SizeMLUKernel<bool>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/slice_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class SliceMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto axes = ctx.Attr<std::vector<int>>("axes");
auto starts = ctx.Attr<std::vector<int>>("starts");
auto ends = ctx.Attr<std::vector<int>>("ends");
auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");
auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");
// Get the accurate attribute value of starts and ends
auto starts_tensor_list =
ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
if (ctx.HasInput("StartsTensor")) {
starts = phi::GetVectorFromTensor<int>(
ctx.Input<phi::DenseTensor>("StartsTensor"));
} else if (starts_tensor_list.size() > 0) {
starts = GetDataFromTensorList<int>(starts_tensor_list);
}
auto ends_tensor_list = ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
if (ctx.HasInput("EndsTensor")) {
ends = phi::GetVectorFromTensor<int>(
ctx.Input<phi::DenseTensor>("EndsTensor"));
} else if (ends_tensor_list.size() > 0) {
ends = GetDataFromTensorList<int>(ends_tensor_list);
}
PADDLE_ENFORCE_EQ(
starts.size(),
axes.size(),
platform::errors::InvalidArgument(
"The size of starts must be equal to the size of axes."));
PADDLE_ENFORCE_EQ(
ends.size(),
axes.size(),
platform::errors::InvalidArgument(
"The size of ends must be equal to the size of axes."));
const auto& in_dims = input->dims();
auto slice_dims = out->dims();
bool reset_slice_dims = false;
if (ctx.HasInput("StartsTensor") || ctx.HasInput("EndsTensor") ||
starts_tensor_list.size() > 0 || ends_tensor_list.size() > 0) {
// Infer output dims
for (size_t i = 0; i < axes.size(); ++i) {
        // When start == -1 and end == 0 (i.e. end == start + 1) on an axis
        // that is to be decreased, reset end to the full dimension size.
if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) {
auto ret =
std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]);
if (ret != decrease_axis.end()) {
ends[i] = in_dims[axes[i]];
}
}
}
phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
slice_dims = phi::funcs::GetSliceDims<int>(
in_dims, axes, starts, ends, nullptr, nullptr);
reset_slice_dims = true;
auto out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis);
out->Resize(out_dims);
}
if (slice_dims.size() != in_dims.size() && !reset_slice_dims) {
phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
slice_dims = phi::funcs::GetSliceDims<int>(
in_dims, axes, starts, ends, nullptr, nullptr);
}
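    // cnnlStridedSlice takes per-dimension parameters, so when `axes` does
    // not cover every input dimension, expand starts/ends to the full rank
    // (untouched axes keep their whole extent and a stride of 1).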
int in_dim_size = input->dims().size();
if (static_cast<int>(axes.size()) != in_dim_size) {
std::vector<int> tmp_starts(in_dim_size, 0);
const auto& in_dims_vec = phi::vectorize(input->dims());
std::vector<int> tmp_ends(in_dims_vec.begin(), in_dims_vec.end());
for (size_t i = 0; i < axes.size(); ++i) {
tmp_starts[axes[i]] = starts[i];
tmp_ends[axes[i]] = ends[i];
}
starts.swap(tmp_starts);
ends.swap(tmp_ends);
}
std::vector<int> strides(in_dim_size, 1);
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc out_desc(slice_dims.size(),
phi::vectorize(slice_dims).data(),
ToCnnlDataType<T>());
MLUCnnl::StridedSlice(ctx,
starts.data(),
ends.data(),
strides.data(),
input_desc.get(),
GetBasePtr(input),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class SliceGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dinput =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
auto axes = ctx.Attr<std::vector<int>>("axes");
auto starts = ctx.Attr<std::vector<int>>("starts");
auto ends = ctx.Attr<std::vector<int>>("ends");
// Get the accurate attribute value of starts and ends
auto starts_tensor_list =
ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
if (ctx.HasInput("StartsTensor")) {
starts = phi::GetVectorFromTensor<int>(
ctx.Input<phi::DenseTensor>("StartsTensor"));
} else if (starts_tensor_list.size() > 0) {
starts = GetDataFromTensorList<int>(starts_tensor_list);
}
auto ends_tensor_list = ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
if (ctx.HasInput("EndsTensor")) {
ends = phi::GetVectorFromTensor<int>(
ctx.Input<phi::DenseTensor>("EndsTensor"));
} else if (ends_tensor_list.size() > 0) {
ends = GetDataFromTensorList<int>(ends_tensor_list);
}
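    // Recover the (non-decreased) slice shape of dout, expand starts/ends to
    // the full input rank, and let MLUCnnl::StridedSliceGrad map dout back
    // into the corresponding region of dinput.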
const auto& in_dims = input->dims();
auto slice_dims = dout->dims();
if (slice_dims.size() != in_dims.size()) {
phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
slice_dims = phi::funcs::GetSliceDims<int>(
in_dims, axes, starts, ends, nullptr, nullptr);
}
int in_dim_size = input->dims().size();
if (static_cast<int>(axes.size()) != in_dim_size) {
std::vector<int> tmp_starts(in_dim_size, 0);
const auto& in_dims_vec = phi::vectorize(input->dims());
std::vector<int> tmp_ends(in_dims_vec.begin(), in_dims_vec.end());
for (size_t i = 0; i < axes.size(); ++i) {
tmp_starts[axes[i]] = starts[i];
tmp_ends[axes[i]] = ends[i];
}
starts.swap(tmp_starts);
ends.swap(tmp_ends);
}
std::vector<int> strides(in_dim_size, 1);
dinput->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc dout_desc(slice_dims.size(),
phi::vectorize(slice_dims).data(),
ToCnnlDataType<T>());
MLUCnnlTensorDesc dinput_desc(*dinput);
MLUCnnl::StridedSliceGrad(ctx,
starts.data(),
ends.data(),
strides.data(),
dout_desc.get(),
GetBasePtr(dout),
dinput_desc.get(),
GetBasePtr(dinput));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(slice,
ops::SliceMLUKernel<float>,
ops::SliceMLUKernel<int>,
ops::SliceMLUKernel<bool>,
ops::SliceMLUKernel<int64_t>,
ops::SliceMLUKernel<double>,
ops::SliceMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(slice_grad,
ops::SliceGradMLUKernel<float>,
ops::SliceGradMLUKernel<int>,
ops::SliceGradMLUKernel<bool>,
ops::SliceGradMLUKernel<int64_t>,
ops::SliceGradMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
namespace paddle {
namespace operators {
template <cnnlSoftmaxAlgorithm_t softmax_algo, typename T>
class SoftmaxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const int rank = in->dims().size();
const int axis = phi::funcs::CanonicalAxis(ctx.Attr<int>("axis"), rank);
    // cnnl softmax only supports 3 dims, so regard every shape as [d1, d2, d3]
const int cnnl_softmax_dims = 3;
const int d1 = phi::funcs::SizeToAxis(axis, in->dims());
const int d2 = in->dims()[axis];
const int d3 = phi::funcs::SizeOutAxis(axis, in->dims());
    // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better performance, so use it
    // whenever possible.
cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
std::vector<int> regard_in_shape{d1, 1, d2};
if (d3 != 1) {
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
regard_in_shape = {d1, d2, d3};
}
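    // When d3 == 1 the softmax axis is the last dimension of the 3-D view
    // {d1, 1, d2}, which matches CNNL_SOFTMAX_MODE_LOW_DIMENSION; otherwise
    // the axis is the middle dimension of {d1, d2, d3}, so the medium mode
    // is used.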
static const cnnlSoftmaxAlgorithm_t algo = softmax_algo;
MLUCnnlTensorDesc in_desc(
cnnl_softmax_dims, regard_in_shape.data(), ToCnnlDataType<T>());
MLUCnnl::SoftmaxForward(ctx,
algo,
mode,
NULL,
in_desc.get(),
GetBasePtr(in),
NULL,
in_desc.get(),
GetBasePtr(out));
}
};
template <cnnlSoftmaxAlgorithm_t softmax_algo, typename T>
class SoftmaxGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dOut = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dX = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
dX->mutable_data<T>(ctx.GetPlace());
const int rank = out->dims().size();
const int axis = phi::funcs::CanonicalAxis(ctx.Attr<int>("axis"), rank);
    // cnnl softmax only supports 3 dims, so regard every shape as [d1, d2, d3]
const int cnnl_softmax_dims = 3;
const int d1 = phi::funcs::SizeToAxis(axis, out->dims());
const int d2 = out->dims()[axis];
const int d3 = phi::funcs::SizeOutAxis(axis, out->dims());
    // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better performance, so use it
    // whenever possible.
cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
std::vector<int> regard_out_shape{d1, 1, d2};
if (d3 != 1) {
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
regard_out_shape = {d1, d2, d3};
}
static const cnnlSoftmaxAlgorithm_t algo = softmax_algo;
MLUCnnlTensorDesc out_desc(
cnnl_softmax_dims, regard_out_shape.data(), ToCnnlDataType<T>());
MLUCnnl::SoftmaxBackward(ctx,
algo,
mode,
out_desc.get(),
GetBasePtr(out),
out_desc.get(),
GetBasePtr(dOut),
out_desc.get(),
GetBasePtr(dX));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
softmax,
ops::SoftmaxMLUKernel<CNNL_SOFTMAX_ACCURATE, float>,
ops::SoftmaxMLUKernel<CNNL_SOFTMAX_ACCURATE, plat::float16>);
REGISTER_OP_MLU_KERNEL(softmax_grad,
ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_ACCURATE, float>,
ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_ACCURATE,
paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(log_softmax,
ops::SoftmaxMLUKernel<CNNL_SOFTMAX_LOG, float>,
ops::SoftmaxMLUKernel<CNNL_SOFTMAX_LOG, plat::float16>);
REGISTER_OP_MLU_KERNEL(
log_softmax_grad,
ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_LOG, float>,
ops::SoftmaxGradMLUKernel<CNNL_SOFTMAX_LOG, paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* logits = ctx.Input<phi::DenseTensor>("Logits");
auto* labels = ctx.Input<phi::DenseTensor>("Label");
auto* softmax = ctx.Output<phi::DenseTensor>("Softmax");
auto* loss = ctx.Output<phi::DenseTensor>("Loss");
auto* backprop = ctx.Output<phi::DenseTensor>("Backprop");
auto soft_label = ctx.Attr<bool>("soft_label");
PADDLE_ENFORCE_EQ(ctx.Attr<bool>("use_softmax"),
true,
platform::errors::InvalidArgument(
"use_softmax=False is not supported in "
"the mlu kernel of softmax_with_cross_entropy."));
const int rank = logits->dims().size();
const int axis = phi::funcs::CanonicalAxis(ctx.Attr<int>("axis"), rank);
loss->mutable_data<T>(ctx.GetPlace());
backprop->mutable_data<T>(ctx.GetPlace());
softmax->mutable_data<T>(ctx.GetPlace());
    // cnnl softmax only supports 3 dims, so regard every shape as [d1, d2, d3]
const int cnnl_softmax_dims = 3;
const int d1 = phi::funcs::SizeToAxis(axis, logits->dims());
const int d2_logits = logits->dims()[axis];
const int d2_labels = labels->dims()[axis];
const int d3 = phi::funcs::SizeOutAxis(axis, logits->dims());
    // CNNL_SOFTMAX_MODE_LOW_DIMENSION has better performance, so use it
    // whenever possible.
cnnlSoftmaxMode_t mode = CNNL_SOFTMAX_MODE_LOW_DIMENSION;
std::vector<int> regard_logits_shape{d1, 1, d2_logits};
std::vector<int> regard_labels_shape{d1, 1, d2_labels};
std::vector<int> regard_loss_shape{d1, 1, 1};
if (d3 != 1) {
mode = CNNL_SOFTMAX_MODE_MEDIUM_DIMENSION;
regard_logits_shape = {d1, d2_logits, d3};
regard_labels_shape = {d1, d2_labels, d3};
regard_loss_shape = {d1, 1, d3};
}
MLUCnnlTensorDesc logits_desc(
cnnl_softmax_dims, regard_logits_shape.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc labels_desc(
cnnl_softmax_dims, regard_labels_shape.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc loss_desc(
cnnl_softmax_dims, regard_loss_shape.data(), ToCnnlDataType<T>());
const cnnlSoftmaxAlgorithm_t algo = CNNL_SOFTMAX_ACCURATE;
MLUCnnl::SoftmaxForward(ctx,
algo,
mode,
NULL,
logits_desc.get(),
GetBasePtr(logits),
NULL,
logits_desc.get(),
GetBasePtr(softmax));
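    // The softmax result is written to the "Softmax" output; the cross
    // entropy below also fills "Backprop", which the grad kernel multiplies
    // with the loss gradient.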
if (soft_label) {
const cnnlComputationPreference_t prefer =
CNNL_COMPUTATION_HIGH_PRECISION;
MLUCnnl::SoftmaxCrossEntropyWithLogits(ctx,
mode,
prefer,
logits_desc.get(),
GetBasePtr(logits),
labels_desc.get(),
GetBasePtr(labels),
loss_desc.get(),
GetBasePtr(loss),
logits_desc.get(),
GetBasePtr(backprop));
} else {
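      // Hard-label path: cast the int64 labels to int32 and regard them as a
      // [d1, 1] tensor before calling SparseSoftmaxXentWithLogits.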
PADDLE_ENFORCE_EQ(d3,
1,
platform::errors::InvalidArgument(
"If soft_label=False, axis must be -1 or"
" can be regard as last dimention in mlu kernel."));
phi::DenseTensor labels_int32(framework::TransToPhiDataType(VT::INT32));
labels_int32.Resize(labels->dims());
labels_int32.mutable_data<int32_t>(ctx.GetPlace());
MLUCnnlTensorDesc labels_int64_desc(*labels);
MLUCnnlTensorDesc labels_int32_desc(labels_int32);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
labels_int64_desc.get(),
GetBasePtr(labels),
labels_int32_desc.get(),
GetBasePtr(&labels_int32));
const int regard_sparse_shape[cnnl_softmax_dims - 1] = {d1, 1};
MLUCnnlTensorDesc sparse_labels_desc(cnnl_softmax_dims - 1,
regard_sparse_shape,
ToCnnlDataType<int32_t>());
MLUCnnlTensorDesc sparse_loss_desc(
cnnl_softmax_dims - 1, regard_sparse_shape, ToCnnlDataType<T>());
MLUCnnl::SparseSoftmaxXentWithLogits(ctx,
mode,
logits_desc.get(),
GetBasePtr(logits),
sparse_labels_desc.get(),
GetBasePtr(&labels_int32),
sparse_loss_desc.get(),
GetBasePtr(loss),
logits_desc.get(),
GetBasePtr(backprop));
}
}
};
template <typename T>
class SoftmaxWithCrossEntropyGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* backprop = ctx.Input<phi::DenseTensor>("Backprop");
auto* loss_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Loss"));
auto* logits_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Logits"));
PADDLE_ENFORCE_NOT_NULL(backprop,
platform::errors::PreconditionNotMet(
"backprop should not be null in MLU kernel of "
"softmax_with_cross_entropy_grad."));
logits_grad->mutable_data<T>(ctx.GetPlace());
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnlTensorDesc backprop_desc(*backprop);
MLUCnnlTensorDesc loss_grad_desc(*loss_grad);
MLUCnnlTensorDesc logits_grad_desc(*logits_grad);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
backprop_desc.get(),
GetBasePtr(backprop),
loss_grad_desc.get(),
GetBasePtr(loss_grad),
logits_grad_desc.get(),
GetBasePtr(logits_grad),
ToCnnlDataType<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
softmax_with_cross_entropy,
ops::SoftmaxWithCrossEntropyMLUKernel<float>,
ops::SoftmaxWithCrossEntropyMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradMLUKernel<float>,
ops::SoftmaxWithCrossEntropyGradMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/split_op.h"
#include "paddle/phi/core/tensor_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class SplitMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// init parameter
auto* in = ctx.Input<phi::DenseTensor>("X");
auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
int num = ctx.Attr<int>("num");
std::vector<int> sections = ctx.Attr<std::vector<int>>("sections");
int axis = ctx.Attr<int>("axis");
auto in_dims = in->dims();
auto out_size = outs.size();
auto num_tensor = num == 0 ? out_size : num;
bool need_resize_outs_dims = false;
if (ctx.HasInput("AxisTensor")) {
auto* axis_tensor = ctx.Input<phi::DenseTensor>("AxisTensor");
axis = phi::GetVectorFromTensor(axis_tensor)[0];
need_resize_outs_dims = true;
}
auto sections_tensor_list =
ctx.MultiInput<phi::DenseTensor>("SectionsTensorList");
if (sections_tensor_list.size() > 0) {
sections = GetDataFromTensorList(sections_tensor_list);
need_resize_outs_dims = true;
}
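    // If axis or sections came in as tensors, the compile-time output shapes
    // are stale, so recompute and resize every output before splitting.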
if (need_resize_outs_dims) {
std::vector<framework::DDim> outs_dims =
UpdateOutsDims(true, true, in_dims, num, sections, axis, out_size);
for (size_t j = 0; j < outs.size(); ++j) {
outs[j]->Resize(outs_dims[j]);
}
}
// init out tensors
std::vector<void*> vct_tensor;
std::vector<MLUCnnlTensorDesc> output_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < outs.size(); i++) {
outs[i]->mutable_data<T>(ctx.GetPlace());
output_descs.emplace_back(MLUCnnlTensorDesc(
*outs[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(outs[i]->dtype())));
desc_vector.push_back(output_descs.back().get());
vct_tensor.push_back(GetBasePtr(outs[i]));
}
// init in tensors
MLUCnnlTensorDesc input_desc(
*in, CNNL_LAYOUT_ARRAY, ToCnnlDataType(in->dtype()));
    // Split the input along axis into num_tensor output tensors.
MLUCnnl::Split(ctx,
num_tensor,
axis,
input_desc.get(),
GetBasePtr(in),
desc_vector.data(),
vct_tensor.data());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(split,
ops::SplitMLUKernel<float>,
ops::SplitMLUKernel<int64_t>,
ops::SplitMLUKernel<int>,
ops::SplitMLUKernel<bool>,
ops::SplitMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class SquaredL2NormMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto &dev_ctx = context.template device_context<MLUDeviceContext>();
auto *x = context.Input<phi::DenseTensor>("X");
auto *out = context.Output<phi::DenseTensor>("Out");
auto place = context.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc input_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
// L2Loss
MLUCnnl::L2Loss(context, input_desc.get(), GetBasePtr(x), GetBasePtr(out));
    // Scale the L2Loss result by 2.0 so Out equals sum(x^2).
phi::DenseTensor scale_tensor =
context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
phi::DenseTensor bias_tensor =
context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
FillMLUTensorWithHostValue(context, static_cast<T>(2.0f), &scale_tensor);
FillMLUTensorWithHostValue(context, static_cast<T>(0.0f), &bias_tensor);
MLUCnnl::Scale(context,
0,
out_desc.get(),
GetBasePtr(out),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class SquaredL2NormGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto &dev_ctx = context.template device_context<MLUDeviceContext>();
auto *x = context.Input<phi::DenseTensor>("X");
auto *x_grad =
context.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *out_grad =
context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(
out_grad->numel(),
1,
platform::errors::InvalidArgument(
"Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar."));
auto place = context.GetPlace();
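    // d(sum(x^2))/dx = 2 * x * dOut, with the scalar dOut broadcast to the
    // shape of X.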
// broadcast out_grad
phi::DenseTensor broadcasted_out_grad;
broadcasted_out_grad.mutable_data<T>(x_grad->dims(), place);
MLUCnnlTensorDesc broadcasted_out_grad_desc(broadcasted_out_grad);
MLUCnnlTensorDesc out_grad_desc(*out_grad);
MLUCnnl::BroadcastTo(context,
out_grad_desc.get(),
GetBasePtr(out_grad),
broadcasted_out_grad_desc.get(),
GetBasePtr(&broadcasted_out_grad));
    // tmp_x_grad = x * broadcasted_out_grad
phi::DenseTensor tmp_x_grad;
tmp_x_grad.mutable_data<T>(x_grad->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc tmp_x_grad_desc(tmp_x_grad);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType(x->dtype()), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(context,
mul_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
broadcasted_out_grad_desc.get(),
GetBasePtr(&broadcasted_out_grad),
tmp_x_grad_desc.get(),
GetBasePtr(&tmp_x_grad),
ToCnnlDataType(x->dtype()));
    // x_grad = 2.0 * tmp_x_grad
phi::DenseTensor scale_tensor =
context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
phi::DenseTensor bias_tensor =
context.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc scale_desc(scale_tensor);
MLUCnnlTensorDesc bias_desc(bias_tensor);
FillMLUTensorWithHostValue(context, static_cast<T>(2.0f), &scale_tensor);
FillMLUTensorWithHostValue(context, static_cast<T>(0.0f), &bias_tensor);
x_grad->mutable_data<T>(place);
MLUCnnlTensorDesc x_grad_desc(*x_grad);
MLUCnnl::Scale(context,
0,
tmp_x_grad_desc.get(),
GetBasePtr(&tmp_x_grad),
scale_desc.get(),
GetBasePtr(&scale_tensor),
bias_desc.get(),
GetBasePtr(&bias_tensor),
x_grad_desc.get(),
GetBasePtr(x_grad));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(squared_l2_norm,
ops::SquaredL2NormMLUKernel<float>,
ops::SquaredL2NormMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(squared_l2_norm_grad,
ops::SquaredL2NormGradMLUKernel<float>,
ops::SquaredL2NormGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include <memory>
#include <string>
#include "paddle/fluid/operators/squeeze_op.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
squeeze,
ops::SqueezeKernel<plat::MLUDeviceContext, float>,
ops::SqueezeKernel<plat::MLUDeviceContext, double>,
ops::SqueezeKernel<plat::MLUDeviceContext, plat::float16>,
ops::SqueezeKernel<plat::MLUDeviceContext, bool>,
ops::SqueezeKernel<plat::MLUDeviceContext, int>,
ops::SqueezeKernel<plat::MLUDeviceContext, uint8_t>,
ops::SqueezeKernel<plat::MLUDeviceContext, int8_t>,
ops::SqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
squeeze_grad,
ops::SqueezeGradKernel<plat::MLUDeviceContext, float>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, double>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, plat::float16>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, bool>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, int>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, uint8_t>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, int8_t>,
ops::SqueezeGradKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
squeeze2,
ops::SqueezeKernel<plat::MLUDeviceContext, float>,
ops::SqueezeKernel<plat::MLUDeviceContext, double>,
ops::SqueezeKernel<plat::MLUDeviceContext, plat::float16>,
ops::SqueezeKernel<plat::MLUDeviceContext, bool>,
ops::SqueezeKernel<plat::MLUDeviceContext, int>,
ops::SqueezeKernel<plat::MLUDeviceContext, uint8_t>,
ops::SqueezeKernel<plat::MLUDeviceContext, int8_t>,
ops::SqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
squeeze2_grad,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, float>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, double>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, plat::float16>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, bool>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, int>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, uint8_t>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, int8_t>,
ops::Squeeze2GradKernel<plat::MLUDeviceContext, int64_t>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class StackMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto x = ctx.MultiInput<phi::DenseTensor>("X");
auto* y = ctx.Output<phi::DenseTensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += (x[0]->dims().size() + 1);
int num = static_cast<int>(x.size());
PADDLE_ENFORCE_GT(num,
0,
platform::errors::InvalidArgument(
"number of input phi::DenseTensor <= 0"));
std::vector<MLUCnnlTensorDesc> x_descs;
std::vector<cnnlTensorDescriptor_t> x_raw_descs;
std::vector<const void*> x_ptrs;
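    // Stack is implemented as a concat: insert a size-1 dimension at `axis`
    // into every input's shape and concatenate along that axis.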
for (int i = 0; i < num; i++) {
if (x[i]->dims().size() != 0) {
std::vector<int64_t> in_dims = phi::vectorize(x[i]->dims());
in_dims.insert(in_dims.begin() + axis, 1);
x_descs.emplace_back(MLUCnnlTensorDesc(
in_dims.size(), in_dims.data(), ToCnnlDataType<T>()));
} else {
int input_dims = 1;
x_descs.emplace_back(
MLUCnnlTensorDesc(1, &input_dims, ToCnnlDataType<T>()));
}
x_raw_descs.push_back(x_descs.back().get());
x_ptrs.push_back(GetBasePtr(x[i]));
}
y->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc y_desc(*y);
MLUCnnl::Concat(ctx,
num,
axis,
x_raw_descs.data(),
x_ptrs.data(),
y_desc.get(),
GetBasePtr(y));
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_MLU_KERNEL(
stack,
paddle::operators::StackMLUKernel<int64_t>,
paddle::operators::StackMLUKernel<int>,
paddle::operators::StackMLUKernel<float>,
paddle::operators::StackMLUKernel<paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/strided_slice.h"
namespace paddle {
namespace operators {
using Variable = framework::Variable;
using LoDTensorArray = framework::LoDTensorArray;
using DDim = framework::DDim;
static void ProcessStridedSliceParams(
const std::vector<int>& axes,
const DDim& input_dims,
const std::vector<int64_t>& starts,
const std::vector<int64_t>& ends,
const std::vector<int64_t>& strides,
const std::vector<int>& infer_flags,
const std::vector<int>& decrease_axis,
std::vector<int>* starts_indices_vector,
std::vector<int>* ends_indices_vector,
std::vector<int>* strides_indices_vector) {
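  // Clamp each start/end into the valid range of its axis; for negative
  // strides keep them as negative (end-relative) indices, and force a
  // length-1 slice on axes that will be decreased.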
for (size_t axis = 0; axis < axes.size(); axis++) {
int64_t start = starts[axis];
int64_t end = ends[axis];
int64_t stride = strides[axis];
int axis_index = axes[axis];
int64_t dim_size = input_dims[axis_index];
bool decrease_axis_affect = false;
if (start == -1 && end == 0 && infer_flags[axis] == -1) {
auto ret =
std::find(decrease_axis.begin(), decrease_axis.end(), axis_index);
if (ret != decrease_axis.end()) {
decrease_axis_affect = true;
}
}
if (stride < 0) {
if (start < 0) {
start = std::max(start, -dim_size);
} else {
start = std::min(start, dim_size - 1) - dim_size;
}
if (end < 0) {
end = std::max(end, -dim_size - 1);
} else {
end = end - dim_size;
}
} else {
if (start < 0) {
start = std::max(start, -dim_size) + dim_size;
} else {
start = std::min(start, dim_size - 1);
}
if (end < 0) {
end = end + dim_size;
} else {
end = std::min(end, dim_size);
}
}
if (decrease_axis_affect) {
if (stride < 0) {
end = start - 1;
} else {
end = start + 1;
}
}
(*starts_indices_vector)[axis_index] = static_cast<int>(start);
(*ends_indices_vector)[axis_index] = static_cast<int>(end);
(*strides_indices_vector)[axis_index] = static_cast<int>(stride);
}
}
template <typename T>
class StridedSliceMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Variable* input_var = ctx.InputVar("Input");
bool is_tensor_array = input_var->IsType<LoDTensorArray>();
PADDLE_ENFORCE_EQ(is_tensor_array,
false,
platform::errors::InvalidArgument(
"phi::DenseTensor array as input is not supported."));
int rank = ctx.Input<phi::DenseTensor>("Input")->dims().size();
switch (rank) {
case 1:
StridedSliceCompute<1>(ctx);
break;
case 2:
StridedSliceCompute<2>(ctx);
break;
case 3:
StridedSliceCompute<3>(ctx);
break;
case 4:
StridedSliceCompute<4>(ctx);
break;
case 5:
StridedSliceCompute<5>(ctx);
break;
case 6:
StridedSliceCompute<6>(ctx);
break;
case 7:
StridedSliceCompute<7>(ctx);
break;
case 8:
StridedSliceCompute<8>(ctx);
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"The rank of input is supported up to 8."));
break;
}
}
private:
template <size_t D>
void StridedSliceCompute(const framework::ExecutionContext& ctx) const {
auto place = ctx.GetPlace();
auto in = ctx.Input<phi::DenseTensor>("Input");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto in_dims = in->dims();
// list<int>
auto starts_int = ctx.Attr<std::vector<int>>("starts");
auto ends_int = ctx.Attr<std::vector<int>>("ends");
auto strides_int = ctx.Attr<std::vector<int>>("strides");
std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
std::vector<int64_t> ends(ends_int.begin(), ends_int.end());
std::vector<int64_t> strides(strides_int.begin(), strides_int.end());
auto axes = ctx.Attr<std::vector<int>>("axes");
auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");
auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");
// vector<phi::DenseTensor<int32>>
auto list_new_starts_tensor =
ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
auto list_new_ends_tensor =
ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
auto list_new_strides_tensor =
ctx.MultiInput<phi::DenseTensor>("StridesTensorList");
// phi::DenseTensor<int32>
if (list_new_starts_tensor.size() > 0) {
starts = GetDataFromTensorList<int64_t>(list_new_starts_tensor);
} else if (ctx.HasInput("StartsTensor")) {
auto* starts_tensor = ctx.Input<phi::DenseTensor>("StartsTensor");
starts = phi::GetVectorFromTensor<int64_t>(starts_tensor);
}
if (list_new_ends_tensor.size() > 0) {
ends = GetDataFromTensorList<int64_t>(list_new_ends_tensor);
} else if (ctx.HasInput("EndsTensor")) {
auto* ends_tensor = ctx.Input<phi::DenseTensor>("EndsTensor");
ends = phi::GetVectorFromTensor<int64_t>(ends_tensor);
}
if (list_new_strides_tensor.size() > 0) {
strides = GetDataFromTensorList<int64_t>(list_new_strides_tensor);
} else if (ctx.HasInput("StridesTensor")) {
auto* strides_tensor = ctx.Input<phi::DenseTensor>("StridesTensor");
strides = phi::GetVectorFromTensor<int64_t>(strides_tensor);
}
// out dims calculation
std::vector<int64_t> out_dims_vector(in_dims.size(), -1);
phi::funcs::StridedSliceOutDims(starts,
ends,
strides,
axes,
infer_flags,
in_dims,
decrease_axis,
out_dims_vector.data(),
axes.size(),
false);
framework::DDim out_dims(phi::make_ddim(out_dims_vector));
    // construct the starts_indices, ends_indices and strides_indices vectors
    // for calling the StridedSlice op
std::vector<int> starts_indices_vector(D, 0);
std::vector<int> ends_indices_vector(out_dims_vector.begin(),
out_dims_vector.end());
std::vector<int> strides_indices_vector(D, 1);
ProcessStridedSliceParams(axes,
in_dims,
starts,
ends,
strides,
infer_flags,
decrease_axis,
&starts_indices_vector,
&ends_indices_vector,
&strides_indices_vector);
auto out_dims_origin = out_dims;
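    // Squeeze out the decreased axes (each must have extent 1) to obtain the
    // final output shape.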
if (decrease_axis.size() > 0) {
std::vector<int64_t> new_out_shape;
for (size_t i = 0; i < decrease_axis.size(); ++i) {
PADDLE_ENFORCE_EQ(
out_dims[decrease_axis[i]],
1,
platform::errors::InvalidArgument(
"the size of decrease dimension should be 1, but received %d.",
out_dims[decrease_axis[i]]));
out_dims_origin[decrease_axis[i]] = 0;
}
for (int i = 0; i < out_dims_origin.size(); ++i) {
if (out_dims_origin[i] != 0) {
new_out_shape.push_back(out_dims_origin[i]);
}
}
if (new_out_shape.size() == 0) {
new_out_shape.push_back(1);
}
out_dims_origin = phi::make_ddim(new_out_shape);
}
out->Resize(out_dims_origin);
out->mutable_data<T>(place);
MLUCnnlTensorDesc in_desc(*in);
MLUCnnlTensorDesc out_desc(
out_dims_vector.size(), out_dims_vector.data(), ToCnnlDataType<T>());
MLUCnnl::StridedSlice(ctx,
starts_indices_vector.data(),
ends_indices_vector.data(),
strides_indices_vector.data(),
in_desc.get(),
GetBasePtr(in),
out_desc.get(),
GetBasePtr(out));
}
};
template <typename T>
class StridedSliceGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Variable* input_var = ctx.InputVar("Input");
bool is_tensor_array = input_var->IsType<LoDTensorArray>();
PADDLE_ENFORCE_EQ(is_tensor_array,
false,
platform::errors::InvalidArgument(
"phi::DenseTensor array as input is not supported."));
int rank = ctx.Input<phi::DenseTensor>("Input")->dims().size();
switch (rank) {
case 1:
StridedSliceGradCompute<1>(ctx);
break;
case 2:
StridedSliceGradCompute<2>(ctx);
break;
case 3:
StridedSliceGradCompute<3>(ctx);
break;
case 4:
StridedSliceGradCompute<4>(ctx);
break;
case 5:
StridedSliceGradCompute<5>(ctx);
break;
case 6:
StridedSliceGradCompute<6>(ctx);
break;
case 7:
StridedSliceGradCompute<7>(ctx);
break;
case 8:
StridedSliceGradCompute<8>(ctx);
break;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"The rank of input is supported up to 8."));
break;
}
}
private:
template <size_t D>
void StridedSliceGradCompute(const framework::ExecutionContext& ctx) const {
auto place = ctx.GetPlace();
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto input_dims = input->dims();
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
dx->mutable_data<T>(input_dims, place);
auto starts_int = ctx.Attr<std::vector<int>>("starts");
auto ends_int = ctx.Attr<std::vector<int>>("ends");
auto strides_int = ctx.Attr<std::vector<int>>("strides");
std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
std::vector<int64_t> ends(ends_int.begin(), ends_int.end());
std::vector<int64_t> strides(strides_int.begin(), strides_int.end());
auto axes = ctx.Attr<std::vector<int>>("axes");
auto infer_flags = ctx.Attr<std::vector<int>>("infer_flags");
auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");
auto list_new_ends_tensor =
ctx.MultiInput<phi::DenseTensor>("EndsTensorList");
auto list_new_starts_tensor =
ctx.MultiInput<phi::DenseTensor>("StartsTensorList");
auto list_new_strides_tensor =
ctx.MultiInput<phi::DenseTensor>("StridesTensorList");
if (list_new_starts_tensor.size() > 0) {
starts = GetDataFromTensorList<int64_t>(list_new_starts_tensor);
} else if (ctx.HasInput("StartsTensor")) {
auto* starts_tensor = ctx.Input<phi::DenseTensor>("StartsTensor");
starts = phi::GetVectorFromTensor<int64_t>(starts_tensor);
}
if (list_new_ends_tensor.size() > 0) {
ends = GetDataFromTensorList<int64_t>(list_new_ends_tensor);
} else if (ctx.HasInput("EndsTensor")) {
auto* ends_tensor = ctx.Input<phi::DenseTensor>("EndsTensor");
ends = phi::GetVectorFromTensor<int64_t>(ends_tensor);
}
if (list_new_strides_tensor.size() > 0) {
strides = GetDataFromTensorList<int64_t>(list_new_strides_tensor);
} else if (ctx.HasInput("StridesTensor")) {
auto* strides_tensor = ctx.Input<phi::DenseTensor>("StridesTensor");
strides = phi::GetVectorFromTensor<int64_t>(strides_tensor);
}
std::vector<int64_t> out_dims_vector(input_dims.size(), -1);
phi::funcs::StridedSliceOutDims(starts,
ends,
strides,
axes,
infer_flags,
input_dims,
decrease_axis,
out_dims_vector.data(),
axes.size(),
false);
std::vector<int> starts_indices_vector(D, 0);
std::vector<int> ends_indices_vector(out_dims_vector.begin(),
out_dims_vector.end());
std::vector<int> strides_indices_vector(D, 1);
ProcessStridedSliceParams(axes,
input_dims,
starts,
ends,
strides,
infer_flags,
decrease_axis,
&starts_indices_vector,
&ends_indices_vector,
&strides_indices_vector);
MLUCnnlTensorDesc dout_desc(
out_dims_vector.size(), out_dims_vector.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dx_desc(*input);
MLUCnnl::StridedSliceGrad(ctx,
starts_indices_vector.data(),
ends_indices_vector.data(),
strides_indices_vector.data(),
dout_desc.get(),
GetBasePtr(dout),
dx_desc.get(),
GetBasePtr(dx));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(strided_slice,
ops::StridedSliceMLUKernel<plat::float16>,
ops::StridedSliceMLUKernel<bool>,
ops::StridedSliceMLUKernel<int>,
ops::StridedSliceMLUKernel<int64_t>,
ops::StridedSliceMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(strided_slice_grad,
ops::StridedSliceGradMLUKernel<plat::float16>,
ops::StridedSliceGradMLUKernel<float>,
ops::StridedSliceGradMLUKernel<bool>,
ops::StridedSliceGradMLUKernel<int>,
ops::StridedSliceGradMLUKernel<int64_t>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
using SelectedRows = phi::SelectedRows;
template <typename DeviceContext, typename T>
class SumMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto out_var = ctx.OutputVar("Out");
if (out_var->IsType<phi::DenseTensor>()) {
// init
auto *out = out_var->GetMutable<phi::DenseTensor>();
auto ins = ctx.MultiInput<phi::DenseTensor>("X");
out->mutable_data<T>(ctx.GetPlace());
auto place = ctx.GetPlace();
int ins_size = static_cast<int>(ins.size());
if (ins_size == 1) {
framework::TensorCopy(*ins[0], place, out);
return;
}
      // Sum all inputs into out with MLUCnnl::AddN.
std::vector<const void *> inputs;
std::vector<MLUCnnlTensorDesc> input_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (int i = 0; i < ins_size; i++) {
input_descs.emplace_back(MLUCnnlTensorDesc(
*ins[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(ins[i]->dtype())));
desc_vector.push_back(input_descs.back().get());
inputs.push_back(GetBasePtr(ins[i]));
}
// init out tensors
MLUCnnlTensorDesc output_desc(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
uint32_t ins_size_t = static_cast<uint32_t>(ins_size);
MLUCnnl::AddN(ctx,
ins_size_t,
desc_vector.data(),
inputs.data(),
output_desc.get(),
GetBasePtr(out));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(out) must be phi::DenseTensor or But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
sum,
ops::SumMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::SumMLUKernel<paddle::platform::MLUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/collective_helper.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
#define NO_USE_CNCL 0
#define GET_LAYOUT_OFFSET 2
static std::vector<cnnlTensorLayout_t> supported_input_layout = {
CNNL_LAYOUT_NC, CNNL_LAYOUT_NLC, CNNL_LAYOUT_NHWC, CNNL_LAYOUT_NDHWC};
template <typename T>
class SyncBatchNormMLUKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
float epsilon = ctx.Attr<float>("epsilon");
float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
const std::string layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout layout = phi::StringToDataLayout(layout_str);
PADDLE_ENFORCE_EQ(use_global_stats,
false,
platform::errors::InvalidArgument(
"sync_batch_norm doesn't support "
"to set use_global_stats True. Please use batch_norm "
"in this case."));
const auto *x = ctx.Input<phi::DenseTensor>("X");
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
const auto *mean = ctx.Input<phi::DenseTensor>("Mean");
const auto *variance = ctx.Input<phi::DenseTensor>("Variance");
auto *mean_out = ctx.Output<phi::DenseTensor>("MeanOut");
auto *variance_out = ctx.Output<phi::DenseTensor>("VarianceOut");
auto *saved_mean = ctx.Output<phi::DenseTensor>("SavedMean");
auto *saved_variance = ctx.Output<phi::DenseTensor>("SavedVariance");
auto *y = ctx.Output<phi::DenseTensor>("Y");
const auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(x_dims.size(),
2,
platform::errors::InvalidArgument(
"The Input dim size should be larger than 1."));
PADDLE_ENFORCE_LE(x_dims.size(),
5,
platform::errors::InvalidArgument(
"The Input dim size should be less than 6."));
int N, C, H, W, D;
phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
y->mutable_data<T>(ctx.GetPlace());
mean_out->mutable_data<MPDType>(ctx.GetPlace());
variance_out->mutable_data<MPDType>(ctx.GetPlace());
saved_mean->mutable_data<MPDType>(ctx.GetPlace());
saved_variance->mutable_data<MPDType>(ctx.GetPlace());
phi::DenseTensor trans_x;
phi::DenseTensor trans_y;
std::vector<int> forward_perm;
std::vector<int> backward_perm;
std::vector<int> trans_shape;
const bool need_transpose =
((layout == DataLayout::kNCHW && x_dims.size() != 2) ||
x_dims.size() == 5);
if (need_transpose) {
SetMLUTransposePerm(
x_dims, layout, &forward_perm, &backward_perm, &trans_shape);
trans_x.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
trans_y.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
MLUCnnlTensorDesc desc_x(*x);
MLUCnnlTensorDesc desc_trans_x(
trans_shape.size(), trans_shape.data(), ToCnnlDataType(x->dtype()));
MLUCnnl::Transpose(ctx,
forward_perm,
x_dims.size(),
desc_x.get(),
GetBasePtr(x),
desc_trans_x.get(),
GetBasePtr(&trans_x));
} else {
trans_x = *x;
trans_y = *y;
}
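    // Rank 2/3/4/5 inputs map to the NC/NLC/NHWC/NDHWC layouts respectively
    // (GET_LAYOUT_OFFSET == 2).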
MLUCnnlTensorDesc desc_trans(
trans_x,
supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET],
ToCnnlDataType<T>());
bool test_mode = is_test && (!trainable_stats);
if (test_mode) { // inference
MLUCnnlTensorDesc desc_weight_bias_mean_var(*bias);
MLUCnnl::FusedBatchNorm(ctx,
false /*is_training*/,
desc_trans.get(),
GetBasePtr(&trans_x),
desc_weight_bias_mean_var.get(),
GetBasePtr(scale),
GetBasePtr(bias),
GetBasePtr(mean),
GetBasePtr(variance),
epsilon,
momentum,
desc_trans.get(),
GetBasePtr(&trans_y),
nullptr,
nullptr,
nullptr,
nullptr);
} else { // training
if (ctx.HasInput("MomentumTensor")) {
const auto *mom_tensor = ctx.Input<phi::DenseTensor>("MomentumTensor");
phi::DenseTensor mom_cpu;
paddle::framework::TensorCopySync(
*mom_tensor, platform::CPUPlace(), &mom_cpu);
momentum = mom_cpu.data<float>()[0];
}
phi::DenseTensor local_mean, local_var;
local_mean.mutable_data<MPDType>(mean->dims(), ctx.GetPlace());
local_var.mutable_data<MPDType>(variance->dims(), ctx.GetPlace());
MLUCnnlTensorDesc desc_mean_var(*mean_out);
      // calc local_mean and local_var on the current card
MLUCnnl::SyncBatchNormStats(ctx,
desc_trans.get(),
GetBasePtr(&trans_x),
epsilon,
desc_mean_var.get(),
GetBasePtr(&local_mean),
desc_mean_var.get(),
GetBasePtr(&local_var));
phi::DenseTensor input_count;
input_count.mutable_data<MPDType>(phi::make_ddim({1}), ctx.GetPlace());
FillMLUTensorWithHostValue<MPDType>(
ctx, static_cast<MPDType>(x->numel() / C), &input_count);
phi::DenseTensor count_all;
phi::DenseTensor mean_all(mean->dtype());
phi::DenseTensor invstd_all(variance->dtype());
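      // With CNCL available, all-gather each card's element count and local
      // statistics, then reduce them into the global moving/saved statistics.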
#ifdef PADDLE_WITH_CNCL
auto &dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto *comm = dev_ctx.cncl_comm();
if (comm) {
auto cncl_comm = paddle::platform::CNCLCommContext::Instance().Get(
0, ctx.GetPlace());
auto *comm = cncl_comm->comm();
auto comm_stream = cncl_comm->stream();
int count;
PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCommCount(&count, comm));
count_all.mutable_data<MPDType>(phi::make_ddim({count}),
ctx.GetPlace());
mean_all.mutable_data<MPDType>(phi::make_ddim({count, mean->numel()}),
ctx.GetPlace());
invstd_all.mutable_data<MPDType>(
phi::make_ddim({count, variance->numel()}), ctx.GetPlace());
        // Before the comm stream executes, the compute stream must be synced.
dev_ctx.Wait();
cnclDataType_t dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(count_all.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&input_count),
GetBasePtr(&count_all),
1,
dtype,
comm,
comm_stream));
auto cncl_dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(mean_all.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&local_mean),
GetBasePtr(&mean_all),
local_mean.numel(),
cncl_dtype,
comm,
comm_stream));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&local_var),
GetBasePtr(&invstd_all),
local_var.numel(),
cncl_dtype,
comm,
comm_stream));
        // After the comm stream executes, sync the queue so the compute
        // stream can be used correctly.
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
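    // When CNCL is not compiled in, this always-false branch keeps the
    // trailing "} else {" well-formed so the single-card path below is taken.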
if (NO_USE_CNCL) {
#endif
} else {
count_all = input_count;
mean_all.ShareDataWith(local_mean);
invstd_all.ShareDataWith(local_var);
mean_all.Resize(phi::make_ddim({1, local_mean.numel()}));
invstd_all.Resize(phi::make_ddim({1, local_var.numel()}));
}
MLUCnnlTensorDesc desc_all_mean_invstd(
invstd_all, CNNL_LAYOUT_NC, ToCnnlDataType<MPDType>());
MLUCnnlTensorDesc desc_moving_mean_var(*mean_out);
MLUCnnlTensorDesc desc_saved_mean_var(*saved_mean);
MLUCnnlTensorDesc desc_count_all(count_all);
MLUCnnl::SyncBatchNormGatherStatsWithCounts(ctx,
momentum,
epsilon,
desc_all_mean_invstd.get(),
GetBasePtr(&mean_all),
desc_all_mean_invstd.get(),
GetBasePtr(&invstd_all),
desc_moving_mean_var.get(),
GetBasePtr(mean_out),
desc_moving_mean_var.get(),
GetBasePtr(variance_out),
desc_count_all.get(),
GetBasePtr(&count_all),
desc_saved_mean_var.get(),
GetBasePtr(saved_mean),
desc_saved_mean_var.get(),
GetBasePtr(saved_variance));
MLUCnnlTensorDesc desc_other_param(*saved_mean);
MLUCnnl::SyncBatchNormElemt(ctx,
desc_trans.get(),
GetBasePtr(&trans_x),
desc_other_param.get(),
GetBasePtr(saved_mean),
desc_other_param.get(),
GetBasePtr(saved_variance),
desc_other_param.get(),
GetBasePtr(scale),
desc_other_param.get(),
GetBasePtr(bias),
desc_trans.get(),
GetBasePtr(&trans_y));
}
if (need_transpose) {
MLUCnnlTensorDesc desc_y(*y);
MLUCnnlTensorDesc desc_trans_y(trans_y);
MLUCnnl::Transpose(ctx,
backward_perm,
trans_y.dims().size(),
desc_trans_y.get(),
GetBasePtr(&trans_y),
desc_y.get(),
GetBasePtr(y));
}
}
};
template <typename T>
class SyncBatchNormMLUGradKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const std::string layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout layout = phi::StringToDataLayout(layout_str);
const auto *d_y = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<phi::DenseTensor>("Scale");
const auto *bias = ctx.Input<phi::DenseTensor>("Bias");
// init output
auto *d_x = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto *d_scale =
ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
const auto *saved_mean = ctx.Input<phi::DenseTensor>("SavedMean");
const auto *saved_inv_var = ctx.Input<phi::DenseTensor>("SavedVariance");
const phi::DenseTensor *x;
if (ctx.HasInput("Y")) {
PADDLE_ENFORCE_EQ(true,
false,
platform::errors::InvalidArgument(
"sync_batch_norm_grad doesn't support input Y"));
} else {
x = ctx.Input<phi::DenseTensor>("X");
}
const auto &x_dims = x->dims();
PADDLE_ENFORCE_GE(x_dims.size(),
2,
platform::errors::InvalidArgument(
"The Input X dim size should be larger than 1."));
PADDLE_ENFORCE_LE(x_dims.size(),
5,
platform::errors::InvalidArgument(
"The Input X dim size should be less than 6."));
int N, C, H, W, D;
phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D);
PADDLE_ENFORCE_EQ(scale->dims()[0],
C,
platform::errors::InvalidArgument(
"Expected first dim for input parameter(scale) of "
"OP(sync_batch_norm) be (%d), but given (%d).",
C,
scale->dims()[0]));
d_x->mutable_data<T>(ctx.GetPlace());
if (d_scale && d_bias) {
d_scale->mutable_data<MPDType>(ctx.GetPlace());
d_bias->mutable_data<MPDType>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(scale->dims().size(),
1UL,
platform::errors::InvalidArgument(
"Expected rank for input parameter(scale) of "
"OP(sync_batch_norm) be (1), but given (%d).",
scale->dims().size()));
phi::DenseTensor trans_x;
phi::DenseTensor trans_dy;
phi::DenseTensor trans_dx;
std::vector<int> forward_perm;
std::vector<int> backward_perm;
std::vector<int> trans_shape;
const bool need_transpose =
((layout == DataLayout::kNCHW && x_dims.size() != 2) ||
x_dims.size() == 5);
if (need_transpose) {
SetMLUTransposePerm(
x_dims, layout, &forward_perm, &backward_perm, &trans_shape);
trans_x.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
trans_dy.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
trans_dx.mutable_data<T>(phi::make_ddim(trans_shape), ctx.GetPlace());
MLUCnnlTensorDesc desc_x(*x);
MLUCnnlTensorDesc desc_trans_x(
trans_shape.size(), trans_shape.data(), ToCnnlDataType(x->dtype()));
MLUCnnl::Transpose(ctx,
forward_perm,
x_dims.size(),
desc_x.get(),
GetBasePtr(x),
desc_trans_x.get(),
GetBasePtr(&trans_x));
MLUCnnl::Transpose(ctx,
forward_perm,
x_dims.size(),
desc_x.get(),
GetBasePtr(d_y),
desc_trans_x.get(),
GetBasePtr(&trans_dy));
} else {
trans_x = *x;
trans_dy = *d_y;
trans_dx = *d_x;
}
MLUCnnlTensorDesc desc_trans(
trans_x,
supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET],
ToCnnlDataType<T>());
phi::DenseTensor sum_dy, sum_dy_xmu;
sum_dy.mutable_data<MPDType>(bias->dims(), ctx.GetPlace());
sum_dy_xmu.mutable_data<MPDType>(bias->dims(), ctx.GetPlace());
MLUCnnlTensorDesc desc_other_param(*bias);
MLUCnnl::SyncBatchnormBackwardReduce(
ctx,
desc_trans.get(),
GetBasePtr(&trans_dy),
desc_trans.get(),
GetBasePtr(&trans_x),
desc_other_param.get(),
GetBasePtr(saved_mean),
desc_other_param.get(),
GetBasePtr(saved_inv_var),
d_scale ? desc_other_param.get() : nullptr,
d_scale ? GetBasePtr(d_scale) : nullptr,
d_bias ? desc_other_param.get() : nullptr,
d_bias ? GetBasePtr(d_bias) : nullptr,
desc_other_param.get(),
GetBasePtr(&sum_dy),
desc_other_param.get(),
GetBasePtr(&sum_dy_xmu),
true /*compute sum_dy, sum_dy_xmu*/,
d_scale ? true : false /*compute d_scale*/,
d_bias ? true : false /*compute d_bias*/);
phi::DenseTensor numel_count;
numel_count.mutable_data<int32_t>(phi::make_ddim({1}), ctx.GetPlace());
FillMLUTensorWithHostValue<int32_t>(
ctx, static_cast<int32_t>(x->numel() / C), &numel_count);
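    // With CNCL available, all-reduce the element count and the partial sums
    // (sum_dy, sum_dy_xmu) so the gradient below uses global statistics.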
#ifdef PADDLE_WITH_CNCL
auto &dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto *comm = dev_ctx.cncl_comm();
if (comm) {
auto cncl_comm =
paddle::platform::CNCLCommContext::Instance().Get(0, ctx.GetPlace());
auto *comm = cncl_comm->comm();
auto comm_stream = cncl_comm->stream();
      // Before the comm stream executes, the compute stream must be synced.
dev_ctx.Wait();
cnclDataType_t dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(numel_count.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&numel_count),
GetBasePtr(&numel_count),
1,
dtype,
cnclSum,
comm,
comm_stream));
auto cncl_dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(sum_dy.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&sum_dy),
GetBasePtr(&sum_dy),
sum_dy.numel(),
cncl_dtype,
cnclSum,
comm,
comm_stream));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&sum_dy_xmu),
GetBasePtr(&sum_dy_xmu),
sum_dy_xmu.numel(),
cncl_dtype,
cnclSum,
comm,
comm_stream));
      // After the comm stream executes, sync the queue so the compute
      // stream can be used correctly.
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
}
#endif
if (d_x) {
MLUCnnlTensorDesc desc_count(numel_count);
MLUCnnl::SyncBatchNormBackwardElemt(ctx,
desc_trans.get(),
GetBasePtr(&trans_dy),
desc_trans.get(),
GetBasePtr(&trans_x),
desc_other_param.get(),
GetBasePtr(saved_mean),
desc_other_param.get(),
GetBasePtr(saved_inv_var),
desc_other_param.get(),
GetBasePtr(scale),
desc_other_param.get(),
GetBasePtr(&sum_dy),
desc_other_param.get(),
GetBasePtr(&sum_dy_xmu),
desc_count.get(),
GetBasePtr(&numel_count),
desc_trans.get(),
GetBasePtr(&trans_dx));
if (need_transpose) {
MLUCnnlTensorDesc desc_dx(*d_x);
MLUCnnlTensorDesc desc_trans_dx(trans_dx);
MLUCnnl::Transpose(ctx,
backward_perm,
trans_dx.dims().size(),
desc_trans_dx.get(),
GetBasePtr(&trans_dx),
desc_dx.get(),
GetBasePtr(d_x));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(sync_batch_norm,
ops::SyncBatchNormMLUKernel<float>,
ops::SyncBatchNormMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(sync_batch_norm_grad,
ops::SyncBatchNormMLUGradKernel<float>,
ops::SyncBatchNormMLUGradKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/tile_op_functor.h"
namespace paddle {
namespace operators {
template <typename T>
class TileMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto rank = context.Input<phi::DenseTensor>("X")->dims().size();
PADDLE_ENFORCE_GE(
rank,
1,
platform::errors::InvalidArgument(
"The rank of the input 'x' for tile op must be a positive "
"integer, but the value received is %d.",
rank));
PADDLE_ENFORCE_LE(
rank,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The rank of the input 'x' for tile op "
"must be less than or equal to %d, but the value received is %d.",
MAX_RANK_SUPPORTED,
rank));
auto repeat_times = get_repeat_times(context);
int repeat_times_size = repeat_times.size();
PADDLE_ENFORCE_GE(
repeat_times_size,
1,
platform::errors::InvalidArgument(
"The number of elements of the input 'repeat_times' for tile "
"op must be positive, but the value received is %d.",
repeat_times_size));
PADDLE_ENFORCE_LE(
repeat_times_size,
MAX_RANK_SUPPORTED,
platform::errors::InvalidArgument(
"The number of elements of the input 'repeat_times' for tile op "
"must be less than or equal to %d, but the value received is %d.",
MAX_RANK_SUPPORTED,
repeat_times_size));
auto* in0 = context.Input<phi::DenseTensor>("X");
auto in_dims = in0->dims();
for (size_t i = 0; i < repeat_times.size(); ++i) {
PADDLE_ENFORCE_GT(
repeat_times[i],
0,
platform::errors::InvalidArgument(
"All elements of the input 'repeat_times' for tile op must "
"be positive integers, but the value received is %d.",
repeat_times[i]));
}
auto vec_in_dims = phi::vectorize<int>(in_dims);
if (repeat_times.size() < vec_in_dims.size()) {
int diff = vec_in_dims.size() - repeat_times.size();
repeat_times.insert(repeat_times.begin(), diff, 1);
} else {
int diff = repeat_times.size() - vec_in_dims.size();
vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
}
PADDLE_ENFORCE_EQ(
repeat_times.size(),
vec_in_dims.size(),
platform::errors::InvalidArgument(
"The rank (%d) of the input 'x' and the rank (%d) of the input "
"'repeat_times' for tile op must match after promotion.",
vec_in_dims.size(),
repeat_times.size()));
auto* out0 = context.Output<phi::DenseTensor>("Out");
bool repeat_one_times = true;
for (size_t i = 0; i < repeat_times.size(); ++i) {
if (repeat_times[i] != 1) {
repeat_one_times = false;
}
}
if (repeat_one_times) {
paddle::framework::TensorCopy(*in0, context.GetPlace(), out0);
} else {
framework::DDim new_in_dims = phi::make_ddim(vec_in_dims);
framework::DDim out_dims(new_in_dims);
for (size_t i = 0; i < repeat_times.size(); ++i) {
out_dims[i] *= repeat_times[i];
}
out0->Resize(out_dims);
out0->mutable_data<T>(context.GetPlace());
MLUCnnlTensorDesc x_desc(*in0);
MLUCnnlTensorDesc out_desc(*out0);
MLUCnnl::BroadcastTo(context,
x_desc.get(),
GetBasePtr(in0),
out_desc.get(),
GetBasePtr(out0));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(tile,
ops::TileMLUKernel<bool>,
ops::TileMLUKernel<int>,
ops::TileMLUKernel<int64_t>,
ops::TileMLUKernel<float>);
#endif
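// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the shape promotion done in TileMLUKernel: repeat_times
// and the input dims are left-padded with 1s to the same rank, then
// multiplied element-wise to form the output dims handed to BroadcastTo.
#include <cstddef>
#include <vector>

std::vector<int> TileOutputDimsSketch(std::vector<int> in_dims,
                                      std::vector<int> repeat_times) {
  if (repeat_times.size() < in_dims.size()) {
    repeat_times.insert(repeat_times.begin(),
                        in_dims.size() - repeat_times.size(), 1);
  } else {
    in_dims.insert(in_dims.begin(), repeat_times.size() - in_dims.size(), 1);
  }
  std::vector<int> out_dims(in_dims);
  for (std::size_t i = 0; i < out_dims.size(); ++i) {
    out_dims[i] *= repeat_times[i];
  }
  // e.g. in_dims = {2, 3}, repeat_times = {2, 1, 2} -> out_dims = {2, 2, 6}
  return out_dims;
}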
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/top_k_op.h"
namespace paddle {
namespace operators {
template <typename T>
class TopkMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto* indices = ctx.Output<phi::DenseTensor>("Indices");
const auto& place = ctx.GetPlace();
size_t k = static_cast<int>(ctx.Attr<int>("k"));
auto* k_t = ctx.Input<phi::DenseTensor>("K");
if (k_t) {
auto k_t_ptr = static_cast<const void*>(k_t->data<int>());
auto size = k_t->numel() * sizeof(int);
memory::Copy(platform::CPUPlace(),
reinterpret_cast<void*>(&k),
k_t->place(),
k_t_ptr,
size,
nullptr);
framework::DDim output_dims = output->dims();
output_dims[output_dims.size() - 1] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
output->mutable_data<T>(place);
indices->mutable_data<int64_t>(place);
const bool largest = true;
const bool sorted = true;
const int axis = -1;
    // cnnl only supports int32/int16 indices
phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
indices_int32.Resize(indices->dims());
indices_int32.mutable_data<int32_t>(place);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc values_output_desc(*output);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnl::TopK(ctx,
k,
axis,
largest,
sorted,
input_desc.get(),
GetBasePtr(input),
values_output_desc.get(),
GetBasePtr(output),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
// cast indices type to int64
MLUCnnlTensorDesc cast_output_desc(*indices);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
indices_int32_desc.get(),
GetBasePtr(&indices_int32),
cast_output_desc.get(),
GetBasePtr(indices));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(top_k,
ops::TopkMLUKernel<float>,
ops::TopkMLUKernel<paddle::platform::float16>);
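// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the top-k semantics used by TopkMLUKernel on one row of
// the last axis: the k largest values, sorted in descending order, with
// int64 indices (the MLU kernel produces int32 indices and casts afterwards).
// Precondition: k <= row.size().
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

void TopKRowSketch(const std::vector<float>& row, std::size_t k,
                   std::vector<float>* values, std::vector<int64_t>* indices) {
  std::vector<int64_t> idx(row.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                    [&row](int64_t a, int64_t b) { return row[a] > row[b]; });
  values->clear();
  indices->clear();
  for (std::size_t i = 0; i < k; ++i) {
    indices->push_back(idx[i]);
    values->push_back(row[idx[i]]);
  }
}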
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class TopkV2MLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
auto* indices = ctx.Output<phi::DenseTensor>("Indices");
const auto& place = ctx.GetPlace();
const auto& sorted = static_cast<bool>(ctx.Attr<bool>("sorted"));
const auto& largest = static_cast<bool>(ctx.Attr<bool>("largest"));
    // if axis < 0, calculate the real axis
int axis = static_cast<int>(ctx.Attr<int>("axis"));
if (axis < 0) {
const auto& in_dims = input->dims();
axis += in_dims.size();
}
size_t k = static_cast<int>(ctx.Attr<int>("k"));
auto* k_t = ctx.Input<phi::DenseTensor>("K");
if (k_t) {
auto k_t_ptr = static_cast<const void*>(k_t->data<int>());
auto size = k_t->numel() * sizeof(int);
memory::Copy(platform::CPUPlace(),
reinterpret_cast<void*>(&k),
k_t->place(),
k_t_ptr,
size,
nullptr);
framework::DDim output_dims = output->dims();
      // according to axis, set the K value in that dim
output_dims[axis] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
output->mutable_data<T>(place);
indices->mutable_data<int64_t>(place);
    // cnnl only supports int32/int16 indices
phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
indices_int32.Resize(indices->dims());
indices_int32.mutable_data<int32_t>(place);
MLUCnnlTensorDesc input_desc(*input);
MLUCnnlTensorDesc values_output_desc(*output);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnl::TopK(ctx,
k,
axis,
largest,
sorted,
input_desc.get(),
GetBasePtr(input),
values_output_desc.get(),
GetBasePtr(output),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
// cast indices type to int64
MLUCnnlTensorDesc cast_output_desc(*indices);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
indices_int32_desc.get(),
GetBasePtr(&indices_int32),
cast_output_desc.get(),
GetBasePtr(indices));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(top_k_v2,
ops::TopkV2MLUKernel<float>,
ops::TopkV2MLUKernel<paddle::platform::float16>);
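// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the axis handling in TopkV2MLUKernel: a negative axis is
// normalized by adding the rank, and only dims[axis] is replaced by k when
// computing the output shape.
#include <cstdint>
#include <vector>

std::vector<int64_t> TopKV2OutputDimsSketch(std::vector<int64_t> dims,
                                            int axis, int64_t k) {
  if (axis < 0) {
    axis += static_cast<int>(dims.size());
  }
  dims[axis] = k;  // e.g. dims = {4, 5, 6}, axis = -1, k = 2 -> {4, 5, 2}
  return dims;
}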
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class TransposeMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
out->mutable_data<T>(ctx.device_context().GetPlace());
TransposeFromMLUTensor<T>(
ctx, axis, x, out, false /*need_reshape_or_alloc*/);
}
};
template <typename T>
class TransposeGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
std::vector<int> reversed_axis(axis);
for (size_t i = 0; i < axis.size(); i++) {
reversed_axis[axis[i]] = i;
}
x_grad->mutable_data<T>(ctx.GetPlace());
TransposeFromMLUTensor<T>(
ctx, reversed_axis, out_grad, x_grad, false /*need_reshape_or_alloc*/);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(transpose2,
ops::TransposeMLUKernel<float>,
ops::TransposeMLUKernel<paddle::platform::float16>,
ops::TransposeMLUKernel<int>,
ops::TransposeMLUKernel<int16_t>,
ops::TransposeMLUKernel<uint8_t>,
ops::TransposeMLUKernel<int8_t>,
ops::TransposeMLUKernel<bool>);
REGISTER_OP_MLU_KERNEL(transpose2_grad,
ops::TransposeGradMLUKernel<float>,
ops::TransposeGradMLUKernel<paddle::platform::float16>,
ops::TransposeGradMLUKernel<int>,
ops::TransposeGradMLUKernel<int16_t>,
ops::TransposeGradMLUKernel<uint8_t>,
ops::TransposeGradMLUKernel<int8_t>,
ops::TransposeGradMLUKernel<bool>);
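// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the inverse permutation built in TransposeGradMLUKernel:
// if the forward pass transposed with `axis`, the backward pass transposes
// the output gradient with reversed_axis, where reversed_axis[axis[i]] = i.
#include <cstddef>
#include <vector>

std::vector<int> InversePermutationSketch(const std::vector<int>& axis) {
  std::vector<int> reversed_axis(axis.size());
  for (std::size_t i = 0; i < axis.size(); ++i) {
    reversed_axis[axis[i]] = static_cast<int>(i);
  }
  return reversed_axis;  // e.g. axis = {1, 2, 0} -> reversed_axis = {2, 0, 1}
}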
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class TrilTriuMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
int diagonal = ctx.Attr<int>("diagonal");
bool lower = ctx.Attr<bool>("lower");
    bool upper = !lower;
out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::TrilTriu(ctx,
diagonal,
upper,
x_desc.get(),
GetBasePtr(x),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(tril_triu,
ops::TrilTriuMLUKernel<float>,
ops::TrilTriuMLUKernel<int32_t>,
ops::TrilTriuMLUKernel<plat::float16>);
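// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the usual tril/triu rule that the MLUCnnl::TrilTriu call
// above is assumed to follow on a row-major 2-D matrix: element (i, j) is
// kept when j - i <= diagonal for the lower triangle, or j - i >= diagonal
// for the upper triangle, and zeroed otherwise.
#include <vector>

void TrilTriuSketch(const std::vector<float>& in, int rows, int cols,
                    int diagonal, bool upper, std::vector<float>* out) {
  out->assign(in.size(), 0.0f);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      const bool keep = upper ? (j - i >= diagonal) : (j - i <= diagonal);
      if (keep) {
        (*out)[i * cols + j] = in[i * cols + j];
      }
    }
  }
}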
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <limits>
#include <random>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
#include "paddle/phi/core/generator.h"
namespace paddle {
namespace operators {
template <typename T>
class TruncatedGaussianRandomMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
float mean = context.Attr<float>("mean");
float std = context.Attr<float>("std");
auto* tensor = context.Output<phi::DenseTensor>("Out");
tensor->mutable_data<T>(context.GetPlace());
phi::DenseTensor cpu_tensor(tensor->dtype());
cpu_tensor.Resize(tensor->dims());
T* data_cpu = cpu_tensor.mutable_data<T>(platform::CPUPlace());
std::uniform_real_distribution<T> dist(std::numeric_limits<float>::min(),
1.0);
TruncatedNormal<T> truncated_normal(mean, std);
int64_t size = tensor->numel();
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
auto engine = phi::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data_cpu[i] = truncated_normal(dist(*engine));
}
auto& dev_ctx =
context.template device_context<platform::MLUDeviceContext>();
framework::TensorCopy(cpu_tensor, context.GetPlace(), dev_ctx, tensor);
dev_ctx.Wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(truncated_gaussian_random,
ops::TruncatedGaussianRandomMLUKernel<float>);
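// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the host-side pattern used above: fill a CPU buffer with
// truncated-normal samples, then copy the buffer to the device tensor. The
// original kernel maps uniform draws through the framework's TruncatedNormal
// functor from truncated_gaussian_random_op.h; the rejection-sampling
// transform below, truncating at two standard deviations, is only an
// illustrative stand-in for that functor.
#include <cstddef>
#include <random>
#include <vector>

std::vector<float> TruncatedGaussianCpuSketch(float mean, float stddev,
                                              std::size_t n, unsigned seed) {
  std::mt19937 engine(seed);
  std::normal_distribution<float> dist(mean, stddev);
  std::vector<float> data(n);
  for (std::size_t i = 0; i < n; ++i) {
    float v = dist(engine);
    while (v < mean - 2.0f * stddev || v > mean + 2.0f * stddev) {
      v = dist(engine);  // reject samples outside [mean - 2*std, mean + 2*std]
    }
    data[i] = v;
  }
  return data;  // the real kernel TensorCopy()s such a buffer to the MLU
}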
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/uniform_random_op.h"
#include "paddle/phi/core/generator.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUUniformRandomKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
phi::DenseTensor *tensor = nullptr;
auto out_var = ctx.OutputVar("Out");
std::vector<int64_t> new_shape;
auto list_new_shape_tensor =
ctx.MultiInput<phi::DenseTensor>("ShapeTensorList");
if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) {
if (ctx.HasInput("ShapeTensor")) {
auto *shape_tensor = ctx.Input<phi::DenseTensor>("ShapeTensor");
new_shape = GetNewDataFromShapeTensor(shape_tensor);
} else if (list_new_shape_tensor.size() > 0) {
new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor);
}
}
if (out_var->IsType<phi::SelectedRows>()) {
auto *selected_rows = out_var->GetMutable<phi::SelectedRows>();
tensor = selected_rows->mutable_value();
auto shape = ctx.Attr<std::vector<int64_t>>("shape");
if (!new_shape.empty()) shape = new_shape;
tensor->Resize(phi::make_ddim(shape));
selected_rows->mutable_rows()->reserve(shape[0]);
} else if (out_var->IsType<phi::DenseTensor>()) {
tensor = out_var->GetMutable<phi::DenseTensor>();
if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type of Output(out) in uniform_random_op must be "
"phi::DenseTensor, "
"SelectedRows. But got "
"unsupport type: %s.",
framework::ToTypeName(out_var->Type())));
}
tensor->mutable_data<T>(ctx.GetPlace());
int64_t size = tensor->numel();
phi::DenseTensor cpu_tensor(tensor->dtype());
cpu_tensor.Resize(tensor->dims());
T *data_cpu = cpu_tensor.mutable_data<T>(platform::CPUPlace());
std::uniform_real_distribution<T> dist(
static_cast<T>(ctx.Attr<float>("min")),
static_cast<T>(ctx.Attr<float>("max")));
unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
auto engine = phi::GetCPURandomEngine(seed);
for (int64_t i = 0; i < size; ++i) {
data_cpu[i] = dist(*engine);
}
unsigned int diag_num =
static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
unsigned int diag_step =
static_cast<unsigned int>(ctx.Attr<int>("diag_step"));
auto diag_val = static_cast<T>(ctx.Attr<float>("diag_val"));
if (diag_num > 0) {
PADDLE_ENFORCE_GT(
size,
(diag_num - 1) * (diag_step + 1),
platform::errors::InvalidArgument(
"ShapeInvalid: the diagonal's elements is equal (num-1) "
"* (step-1) with num %d, step %d,"
"It should be smaller than %d, but received %d",
diag_num,
diag_step,
(diag_num - 1) * (diag_step + 1),
size));
for (int64_t i = 0; i < diag_num; ++i) {
int64_t pos = i * diag_step + i;
data_cpu[pos] = diag_val;
}
}
// copy to MLU
framework::TensorCopy(
cpu_tensor,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
tensor);
ctx.template device_context<paddle::platform::MLUDeviceContext>().Wait();
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_MLU_KERNEL(uniform_random,
paddle::operators::MLUUniformRandomKernel<float>);
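// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the CPU-side fill used above: uniform samples in
// [min, max), with diag_num positions then overwritten by diag_val at a
// stride of diag_step + 1 (position i * diag_step + i for the i-th one).
// Precondition (enforced by the kernel): size > (diag_num - 1) * (diag_step + 1).
#include <cstddef>
#include <random>
#include <vector>

std::vector<float> UniformWithDiagSketch(std::size_t size, float min, float max,
                                         unsigned seed, int diag_num,
                                         int diag_step, float diag_val) {
  std::mt19937 engine(seed);
  std::uniform_real_distribution<float> dist(min, max);
  std::vector<float> data(size);
  for (std::size_t i = 0; i < size; ++i) {
    data[i] = dist(engine);
  }
  for (int i = 0; i < diag_num; ++i) {
    data[static_cast<std::size_t>(i) * (diag_step + 1)] = diag_val;
  }
  return data;  // the real kernel then TensorCopy()s this buffer to the MLU
}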
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include <memory>
#include <string>
#include "paddle/fluid/operators/unsqueeze_op.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
unsqueeze,
ops::UnsqueezeKernel<plat::MLUDeviceContext, float>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, double>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, plat::float16>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, bool>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int8_t>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
unsqueeze2,
ops::UnsqueezeKernel<plat::MLUDeviceContext, float>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, double>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, plat::float16>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, bool>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int8_t>,
ops::UnsqueezeKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
unsqueeze_grad,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, float>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, double>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, plat::float16>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, bool>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, int>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, int8_t>,
ops::UnsqueezeGradKernel<plat::MLUDeviceContext, int64_t>);
REGISTER_OP_MLU_KERNEL(
unsqueeze2_grad,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, float>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, double>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, plat::float16>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, bool>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, int>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<plat::MLUDeviceContext, int64_t>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class UnStackMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *x = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.MultiOutput<phi::DenseTensor>("Y");
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += x->dims().size();
int num = x->dims()[axis];
std::vector<MLUCnnlTensorDesc> out_descs;
std::vector<cnnlTensorDescriptor_t> out_raw_descs;
std::vector<void *> out_ptrs;
std::vector<int64_t> new_dims = phi::vectorize(x->dims());
new_dims[axis] = 1;
for (int i = 0; i < num; i++) {
out[i]->mutable_data<T>(ctx.GetPlace());
out_descs.emplace_back(MLUCnnlTensorDesc(
new_dims.size(), new_dims.data(), ToCnnlDataType<T>()));
out_raw_descs.push_back(out_descs.back().get());
out_ptrs.push_back(GetBasePtr(out[i]));
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnl::Split(ctx,
num,
axis,
x_desc.get(),
GetBasePtr(x),
out_raw_descs.data(),
out_ptrs.data());
}
};
template <typename T>
class UnStackGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto x = ctx.MultiInput<phi::DenseTensor>(framework::GradVarName("Y"));
auto *y = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
int axis = ctx.Attr<int>("axis");
if (axis < 0) axis += (x[0]->dims().size() + 1);
int num = static_cast<int>(x.size());
std::vector<MLUCnnlTensorDesc> x_descs;
std::vector<cnnlTensorDescriptor_t> x_raw_descs;
std::vector<const void *> x_ptrs;
for (int i = 0; i < num; i++) {
if (x[i]->dims().size() != 0) {
std::vector<int64_t> in_dims = phi::vectorize(x[i]->dims());
in_dims.insert(in_dims.begin() + axis, 1);
x_descs.emplace_back(MLUCnnlTensorDesc(
in_dims.size(), in_dims.data(), ToCnnlDataType<T>()));
} else {
int input_dims = 1;
x_descs.emplace_back(
MLUCnnlTensorDesc(1, &input_dims, ToCnnlDataType<T>()));
}
x_raw_descs.push_back(x_descs.back().get());
x_ptrs.push_back(GetBasePtr(x[i]));
}
y->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc y_desc(*y);
MLUCnnl::Concat(ctx,
num,
axis,
x_raw_descs.data(),
x_ptrs.data(),
y_desc.get(),
GetBasePtr(y));
}
};
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(unstack,
ops::UnStackMLUKernel<float>,
ops::UnStackMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(unstack_grad,
ops::UnStackGradMLUKernel<float>,
ops::UnStackGradMLUKernel<plat::float16>);
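// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the shape bookkeeping above: unstack describes each of
// dims[axis] outputs with that dimension set to 1 before splitting, and the
// grad kernel re-inserts a size-1 axis into each incoming gradient before
// concatenating them back along `axis`.
#include <cstdint>
#include <vector>

std::vector<int64_t> UnStackSliceDimsSketch(std::vector<int64_t> x_dims,
                                            int axis) {
  if (axis < 0) axis += static_cast<int>(x_dims.size());
  x_dims[axis] = 1;  // e.g. x_dims = {2, 3, 4}, axis = 1 -> each slice {2, 1, 4}
  return x_dims;
}

std::vector<int64_t> UnStackGradSliceDimsSketch(std::vector<int64_t> dy_dims,
                                                int axis) {
  if (axis < 0) axis += static_cast<int>(dy_dims.size()) + 1;
  dy_dims.insert(dy_dims.begin() + axis, int64_t{1});  // {2, 4}, axis = 1 -> {2, 1, 4}
  return dy_dims;
}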
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUWhereIndexKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<phi::DenseTensor>("Condition");
auto* out = context.Output<phi::DenseTensor>("Out");
auto dims = condition->dims();
const int rank = dims.size();
phi::DenseTensor num_true;
num_true.mutable_data<int>({1}, context.GetPlace());
MLUCnnlTensorDesc con_desc(*condition);
MLUCnnlTensorDesc num_true_desc(num_true);
MLUCnnl::NumTrue(context,
con_desc.get(),
GetBasePtr(condition),
num_true_desc.get(),
GetBasePtr(&num_true));
phi::DenseTensor local_true_num;
paddle::framework::TensorCopySync(
num_true, platform::CPUPlace(), &local_true_num);
auto true_num = *local_true_num.data<int>();
out->Resize(phi::make_ddim({true_num, rank}));
out->mutable_data<int64_t>(context.GetPlace());
if (true_num == 0) {
return;
}
auto& dev_ctx = context.template device_context<MLUDeviceContext>();
phi::DenseTensor out_int32 =
context.AllocateTmpTensor<int32_t, MLUDeviceContext>(out->dims(),
dev_ctx);
MLUCnnlTensorDesc out_int32_desc(out_int32);
MLUCnnlTensorDesc out_desc(*out);
bool as_tuple = false;
MLUCnnl::Where(context,
con_desc.get(),
GetBasePtr(condition),
num_true_desc.get(),
GetBasePtr(&num_true),
as_tuple,
out_int32_desc.get(),
GetBasePtr(&out_int32));
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(context,
cast_type,
out_int32_desc.get(),
GetBasePtr(&out_int32),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(where_index,
ops::MLUWhereIndexKernel<int>,
ops::MLUWhereIndexKernel<bool>,
ops::MLUWhereIndexKernel<float>);
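// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the output layout produced above: for every true element
// of a row-major condition tensor, one row of `rank` coordinates, giving an
// Out tensor of shape {true_num, rank} (written as int32 on the MLU and then
// cast to int64). Precondition: condition.size() equals the product of dims.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> WhereIndexSketch(const std::vector<bool>& condition,
                                      const std::vector<int64_t>& dims) {
  std::vector<int64_t> coords;  // flattened {true_num, rank}, row-major
  const int rank = static_cast<int>(dims.size());
  for (std::size_t flat = 0; flat < condition.size(); ++flat) {
    if (!condition[flat]) continue;
    std::vector<int64_t> idx(rank);
    int64_t rem = static_cast<int64_t>(flat);
    for (int d = rank - 1; d >= 0; --d) {
      idx[d] = rem % dims[d];
      rem /= dims[d];
    }
    coords.insert(coords.end(), idx.begin(), idx.end());
  }
  return coords;
}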
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class WhereMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* condition = context.Input<phi::DenseTensor>("Condition");
auto* X = context.Input<phi::DenseTensor>("X");
auto* Y = context.Input<phi::DenseTensor>("Y");
auto* out = context.Output<phi::DenseTensor>("Out");
auto place = context.GetPlace();
out->mutable_data<T>(place);
MLUCnnlTensorDesc x_desc(*X);
MLUCnnlTensorDesc y_desc(*Y);
MLUCnnlTensorDesc condition_desc(*condition);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Select(context,
condition_desc.get(),
GetBasePtr(condition),
x_desc.get(),
GetBasePtr(X),
y_desc.get(),
GetBasePtr(Y),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(
where,
ops::WhereMLUKernel<paddle::platform::MLUDeviceContext, float>,
ops::WhereMLUKernel<paddle::platform::MLUDeviceContext, int>);
#endif
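// A minimal standalone sketch (not part of the original file, names are
// illustrative) of the elementwise select computed by MLUCnnl::Select above,
// assuming condition, x and y share one shape (no broadcasting here):
//   out[i] = condition[i] ? x[i] : y[i]
#include <cstddef>
#include <vector>

std::vector<float> WhereSketch(const std::vector<bool>& condition,
                               const std::vector<float>& x,
                               const std::vector<float>& y) {
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    out[i] = condition[i] ? x[i] : y[i];
  }
  return out;
}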