Unverified commit a6aa701e authored by jjyaoao, committed by GitHub

delete paddle/fluid/operators/math,metrics,optimizers,reduce_ops/*_npu.* (#52674)

Parent b451aff8
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AccuracyNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* inference = ctx.Input<phi::DenseTensor>("Out");
auto* label = ctx.Input<phi::DenseTensor>("Label");
auto* indices = ctx.Input<phi::DenseTensor>("Indices");
auto* accuracy = ctx.Output<phi::DenseTensor>("Accuracy");
auto* correct = ctx.Output<phi::DenseTensor>("Correct");
auto* total = ctx.Output<phi::DenseTensor>("Total");
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
int num_samples = inference->dims()[0];
if (num_samples == 0) {
return;
}
// cast `indices` or `label` if their type is not consistent
Tensor cast_indices(phi::DataType::INT32);
Tensor cast_label(phi::DataType::INT32);
if (indices->dtype() != label->dtype()) {
auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32);
if (framework::TransToProtoVarType(indices->dtype()) !=
framework::proto::VarType::INT32) {
cast_indices.Resize(indices->dims());
cast_indices.mutable_data<int>(ctx.GetPlace());
const auto& runner_cast_indices =
NpuOpRunner("Cast",
{*indices},
{cast_indices},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_indices.Run(stream);
} else {
cast_indices.ShareDataWith(*indices);
}
if (framework::TransToProtoVarType(label->dtype()) !=
framework::proto::VarType::INT32) {
cast_label.Resize(label->dims());
cast_label.mutable_data<int>(ctx.GetPlace());
const auto& runner_cast_label =
NpuOpRunner("Cast",
{*label},
{cast_label},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast_label.Run(stream);
} else {
cast_label.ShareDataWith(*label);
}
} else {
cast_indices.ShareDataWith(*indices);
cast_label.ShareDataWith(*label);
}
// equal
Tensor tmp_equal(phi::DataType::BOOL);
tmp_equal.Resize(inference->dims());
tmp_equal.mutable_data<bool>(ctx.GetPlace());
const auto& runner_equal =
NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {});
runner_equal.Run(stream);
// cast equal
Tensor tmp_equal_cast(phi::DataType::FLOAT32);
tmp_equal_cast.Resize(inference->dims());
tmp_equal_cast.mutable_data<float>(ctx.GetPlace());
const auto& runner_cast_equal = NpuOpRunner(
"Cast",
{tmp_equal},
{tmp_equal_cast},
{{"dst_type",
static_cast<int>(ConvertToNpuDtype(
framework::TransToProtoVarType(tmp_equal_cast.dtype())))}});
runner_cast_equal.Run(stream);
// [correct]
// reduce_max
Tensor tmp_correct_max(phi::DataType::FLOAT32);
tmp_correct_max.Resize(phi::make_ddim({num_samples}));
tmp_correct_max.mutable_data<float>(ctx.GetPlace());
const auto& runner_reduce_max =
NpuOpRunner("ReduceMaxD",
{tmp_equal_cast},
{tmp_correct_max},
{{"axes", std::vector<int>{1}}, {"keep_dims", false}});
runner_reduce_max.Run(stream);
// reduce_sum
Tensor tmp_correct(phi::DataType::FLOAT32);
tmp_correct.Resize(correct->dims());
tmp_correct.mutable_data<float>(ctx.GetPlace());
const auto& runner_reduce_sum =
NpuOpRunner("ReduceSumD",
{tmp_correct_max},
{tmp_correct},
{{"axes", std::vector<int>{0}}, {"keep_dims", false}});
runner_reduce_sum.Run(stream);
// cast to int
correct->mutable_data<int>(ctx.GetPlace());
const auto& runner_cast_correct =
NpuOpRunner("Cast",
{tmp_correct},
{*correct},
{{"dst_type",
static_cast<int>(ConvertToNpuDtype(
framework::TransToProtoVarType(correct->dtype())))}});
runner_cast_correct.Run(stream);
// [total]
total->mutable_data<int>(ctx.GetPlace());
FillNpuTensorWithConstant<int>(total, static_cast<int>(num_samples));
// use `total` of type `float32` for calculating accuracy
Tensor tmp_total(phi::DataType::FLOAT32);
tmp_total.Resize(total->dims());
tmp_total.mutable_data<float>(ctx.GetPlace());
FillNpuTensorWithConstant<float>(&tmp_total,
static_cast<float>(num_samples));
// [accuracy]
accuracy->mutable_data<float>(ctx.GetPlace());
const auto& runner_accuracy =
NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {});
runner_accuracy.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
accuracy,
ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>,
ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::AccuracyNPUKernel<paddle::platform::NPUDeviceContext, int64_t>);
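For reference, the Equal / ReduceMaxD / ReduceSumD / Div sequence above amounts to the usual top-k accuracy metric. A hedged restatement in math, using the kernel's own input names (N = num_samples, the first dimension of Out):

\text{correct} = \sum_{i=1}^{N} \max_{k} \mathbb{1}\left[\text{indices}_{i,k} = \text{label}_i\right], \qquad \text{total} = N, \qquad \text{accuracy} = \frac{\text{correct}}{\text{total}}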
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class AdamNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be phi::DenseTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
auto* param = ctx.Input<phi::DenseTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument(
"The Grad(%s)'s type should be phi::DenseTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(param_var->Type())));
auto* grad = ctx.Input<phi::DenseTensor>("Grad");
auto* mom1 = ctx.Input<phi::DenseTensor>("Moment1");
auto* mom2 = ctx.Input<phi::DenseTensor>("Moment2");
auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");
auto* beta1_pow = ctx.Input<phi::DenseTensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<phi::DenseTensor>("Beta2Pow");
auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
auto* mom1_out = ctx.Output<phi::DenseTensor>("Moment1Out");
auto* mom2_out = ctx.Output<phi::DenseTensor>("Moment2Out");
auto* beta1_pow_out = ctx.Output<phi::DenseTensor>("Beta1PowOut");
auto* beta2_pow_out = ctx.Output<phi::DenseTensor>("Beta2PowOut");
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(
*skip_update_tensor, ctx.device_context(), &skip_update_vec);
skip_update = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update) {
VLOG(4) << "Adam skip update";
framework::TensorCopy(
*param,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
param_out);
framework::TensorCopy(
*mom1,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
mom1_out);
framework::TensorCopy(
*mom2,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
mom2_out);
framework::TensorCopy(
*beta1_pow,
beta1_pow->place(),
ctx.template device_context<platform::DeviceContext>(),
beta1_pow_out);
framework::TensorCopy(
*beta2_pow,
beta2_pow->place(),
ctx.template device_context<platform::DeviceContext>(),
beta2_pow_out);
return;
}
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
param_out->mutable_data<T>(ctx.GetPlace());
mom1_out->mutable_data<T>(ctx.GetPlace());
mom2_out->mutable_data<T>(ctx.GetPlace());
// NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform
// place.
phi::DenseTensor beta1_pow_tmp;
phi::DenseTensor beta2_pow_tmp;
if (beta1_pow->place() == platform::CPUPlace()) {
T beta1 = *beta1_pow->data<T>();
beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta1_pow_tmp, beta1);
beta1_pow = &beta1_pow_tmp;
}
if (beta2_pow->place() == platform::CPUPlace()) {
T beta2 = *beta2_pow->data<T>();
beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta2_pow_tmp, beta2);
beta2_pow = &beta2_pow_tmp;
}
const phi::DenseTensor* beta1_tensor = nullptr;
const phi::DenseTensor* beta2_tensor = nullptr;
const phi::DenseTensor* epsilon_tensor = nullptr;
phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32);
phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32);
phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);
if (ctx.HasInput("Beta1Tensor")) {
beta1_tensor = ctx.Input<phi::DenseTensor>("Beta1Tensor");
PADDLE_ENFORCE_EQ(beta1_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(Beta1Tensor) size must be 1, but get %d",
beta1_tensor->numel()));
} else {
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta1_tmp, beta1);
beta1_tensor = &beta1_tmp;
}
if (ctx.HasInput("Beta2Tensor")) {
beta2_tensor = ctx.Input<phi::DenseTensor>("Beta2Tensor");
PADDLE_ENFORCE_EQ(beta2_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(Beta2Tensor) size must be 1, but get %d",
beta2_tensor->numel()));
} else {
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&beta2_tmp, beta2);
beta2_tensor = &beta2_tmp;
}
if (ctx.HasInput("EpsilonTensor")) {
epsilon_tensor = ctx.Input<phi::DenseTensor>("EpsilonTensor");
PADDLE_ENFORCE_EQ(epsilon_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(EpsilonTensor) size must be 1, but get %d",
epsilon_tensor->numel()));
} else {
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&epsilon_tmp, epsilon);
epsilon_tensor = &epsilon_tmp;
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
<< "beta2_pow.numel() : " << beta2_pow->numel();
VLOG(3) << "param.numel(): " << param->numel();
PADDLE_ENFORCE_EQ(beta1_pow_out->numel(),
1,
platform::errors::InvalidArgument(
"beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(beta2_pow_out->numel(),
1,
platform::errors::InvalidArgument(
"beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner = NpuOpRunner("ApplyAdamD",
{
*param,
*mom1,
*mom2,
*beta1_pow,
*beta2_pow,
*lr,
*beta1_tensor,
*beta2_tensor,
*epsilon_tensor,
*grad,
},
{
*param_out,
*mom1_out,
*mom2_out,
},
{});
runner.Run(stream);
// NOTE(zhiqiu): ApplyAdamD updates params inplace, so
// if param and param_out is not same, we need to do copy.
if (param_out->data<T>() != param->data<T>()) {
framework::TensorCopy(
*param,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
param_out);
}
if (mom1_out->data<T>() != mom1->data<T>()) {
framework::TensorCopy(
*mom1,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
mom1_out);
}
if (mom2_out->data<T>() != mom2->data<T>()) {
framework::TensorCopy(
*mom2,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
mom2_out);
}
if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
const auto& runner_m1 =
NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {});
runner_m1.Run(stream);
const auto& runner_m2 =
NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {});
runner_m2.Run(stream);
}
}
};
template <typename T>
class AdamWNPUKernel : public AdamNPUKernel<platform::NPUDeviceContext, T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
VLOG(3) << "NPU AdamW Kernel";
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
VLOG(3) << "Has SkipUpdate";
auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(
*skip_update_tensor, ctx.device_context(), &skip_update_vec);
skip_update = skip_update_vec[0];
}
VLOG(3) << "Skip update" << skip_update;
bool with_decay = ctx.Attr<bool>("with_decay");
if (!skip_update && with_decay) {
float coeff = ctx.Attr<float>("coeff");
auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");
auto place = ctx.GetPlace();
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
phi::DenseTensor one(phi::DataType::FLOAT32);
phi::DenseTensor decay(phi::DataType::FLOAT32);
phi::DenseTensor tmp(phi::DataType::FLOAT32);
tmp.mutable_data<float>({1}, place);
one.mutable_data<float>({1}, place);
decay.mutable_data<float>({1}, place);
FillNpuTensorWithConstant<float>(&one, 1.0f);
framework::NPUAttributeMap attr_input = {{"value", coeff}};
const auto& runner1 = NpuOpRunner("Muls", {*lr}, {tmp}, attr_input);
runner1.Run(stream);
const auto& runner2 = NpuOpRunner("Sub", {one, tmp}, {decay}, {});
runner2.Run(stream);
if (ctx.HasInput("MasterParam")) {
PADDLE_THROW(platform::errors::Unimplemented(
"Master Parma is not supported on npu"));
} else {
auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
param_out->mutable_data<T>(ctx.GetPlace());
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be phi::DenseTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
auto* param = ctx.Input<phi::DenseTensor>("Param");
const auto& runner =
NpuOpRunner("Mul",
{*param, decay},
{*const_cast<phi::DenseTensor*>(param)},
{});
runner.Run(stream);
}
}
AdamNPUKernel<platform::NPUDeviceContext, T>::Compute(ctx);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
adam,
ops::AdamNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::AdamNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(adamw,
ops::AdamWNPUKernel<float>,
ops::AdamWNPUKernel<paddle::platform::float16>);
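For reference, ApplyAdamD is used here as the fused form of the Adam rule documented by the fluid adam op; a hedged sketch of the intended per-element update (not a specification of the CANN operator itself):

m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
\theta_t = \theta_{t-1} - lr \cdot \frac{\sqrt{1 - \beta_2^t}}{1 - \beta_1^t} \cdot \frac{m_t}{\sqrt{v_t} + \epsilon}

The AdamW kernel above first scales the parameter by (1 - lr \cdot coeff), computed with the Muls/Sub/Mul runners, and then falls through to the same Adam step; unless use_global_beta_pow is set, beta1_pow and beta2_pow are each advanced by one extra Mul.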
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {
template <typename T>
class NPUMergedMomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto params = ctx.MultiInput<phi::DenseTensor>("Param");
auto params_out = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
size_t n = params.size();
PADDLE_ENFORCE_EQ(n,
params_out.size(),
platform::errors::InvalidArgument(
"The size of Output(ParamOut) must be equal to "
"Input(Param), but got the size of Output(ParamOut) "
"is %d, the size of Input(Param) is %d.",
params_out.size(),
n));
for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_EQ(params[i],
params_out[i],
platform::errors::InvalidArgument(
"The size of Input(Param) and Output(ParamOut) "
"must be the same Tensors."));
}
auto grads = ctx.MultiInput<phi::DenseTensor>("Grad");
PADDLE_ENFORCE_EQ(
n,
grads.size(),
platform::errors::InvalidArgument(
"The size of Input(Grad) must be equal to Input(Param), but got "
"the size of Input(Grad) is %d, the size of Input(Param) is %d.",
grads.size(),
n));
auto velocitys = ctx.MultiInput<phi::DenseTensor>("Velocity");
PADDLE_ENFORCE_EQ(n,
velocitys.size(),
platform::errors::InvalidArgument(
"The size of Input(Velocity) must be equal to "
"Input(Param), but got the size of Input(Velocity) "
"is %d, the size of Input(Param) is %d.",
velocitys.size(),
n));
auto velocitys_out = ctx.MultiOutput<phi::DenseTensor>("VelocityOut");
PADDLE_ENFORCE_EQ(
n,
velocitys_out.size(),
platform::errors::InvalidArgument(
"The size of Output(VelocityOut) must be "
"equal to Input(Param), but got the size of Output(VelocityOut) is "
"%d, the size of Input(Param) is %d.",
velocitys_out.size(),
n));
for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_EQ(velocitys[i],
velocitys_out[i],
platform::errors::InvalidArgument(
"Input(Velocity) and Output(VelocityOut) must be "
"the same Tensors."));
}
T mu = static_cast<T>(ctx.Attr<float>("mu"));
auto lrs = ctx.MultiInput<phi::DenseTensor>("LearningRate");
if (lrs.size() != 1) {
PADDLE_ENFORCE_EQ(
n,
lrs.size(),
platform::errors::InvalidArgument(
"If the size of Input(LearningRate) is not 1, the size of "
"Input(LearningRate) must be "
"equal to Input(Param), but got the size of Input(LearningRate) "
"is %d, the size of Input(Param) is %d.",
lrs.size(),
n));
}
auto use_nesterov = ctx.Attr<bool>("use_nesterov");
auto regularization_methods =
ctx.Attr<std::vector<std::string>>("regularization_method");
auto regularization_coeffs =
ctx.Attr<std::vector<float>>("regularization_coeff");
if (regularization_methods.size() != 0) {
PADDLE_ENFORCE_EQ(
n,
regularization_methods.size(),
platform::errors::InvalidArgument(
"The size of Attr(regularization_method) must be equal "
"to Input(Param), but got the size of "
"Attr(regularization_method) is %d, the size of Input(Param) is "
"%d.",
regularization_methods.size(),
n));
PADDLE_ENFORCE_EQ(
n,
regularization_coeffs.size(),
platform::errors::InvalidArgument(
"The size of Attr(regularization_coeff) must be equal "
"to Input(Param), but got the size of Attr(regularization_coeff) "
"is %d, the size of Input(Param) is %d.",
regularization_coeffs.size(),
n));
}
VLOG(5) << "use_nesterov: " << use_nesterov
<< ", regularization_methods.size(): "
<< regularization_methods.size()
<< ", regularization_coeffs.size(): "
<< regularization_coeffs.size();
auto& dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();
Tensor mu_tensor;
mu_tensor.mutable_data<T>(phi::make_ddim({1}), ctx.GetPlace());
FillNpuTensorWithConstant<T>(&mu_tensor, mu);
for (size_t idx = 0; idx < n; ++idx) {
phi::RegularizationType regularization_flag =
regularization_methods.size() > 0 &&
regularization_methods[idx] == "l2_decay"
? phi::RegularizationType::kL2DECAY
: phi::RegularizationType::kNONE;
float regularization_coeff = 0.0;
if (regularization_coeffs.size() != 0) {
regularization_coeff = regularization_coeffs[idx];
}
auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0];
auto param = params[idx];
auto param_out = params_out[idx];
auto velocity = velocitys[idx];
auto velocity_out = velocitys_out[idx];
auto grad = grads[idx];
Tensor regularized_grad;
if (regularization_flag == phi::RegularizationType::kL2DECAY) {
regularized_grad.mutable_data<T>(grad->dims(), ctx.GetPlace());
const auto& runner1 = NpuOpRunner("Muls",
{*param},
{regularized_grad},
{{"value", regularization_coeff}});
runner1.Run(dev_ctx.stream());
const auto& runner2 = NpuOpRunner(
"Add", {regularized_grad, *grad}, {regularized_grad}, {});
runner2.Run(dev_ctx.stream());
} else {
regularized_grad.ShareDataWith(*grad);
}
framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out);
framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out);
// NOTE: ApplyMomentum will change the input
const auto& runner = NpuOpRunner("ApplyMomentum",
{*param_out,
*velocity_out,
*learning_rate,
regularized_grad,
mu_tensor},
{*param_out},
{{"use_nesterov", use_nesterov}});
runner.Run(dev_ctx.stream());
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(merged_momentum,
ops::NPUMergedMomentumOpKernel<float>,
ops::NPUMergedMomentumOpKernel<plat::float16>);
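For reference, every iteration of the loop above applies the same momentum rule to one (param, velocity, grad) triple; a hedged restatement, writing g'_i for the optionally L2-regularized gradient:

g'_i = g_i + \lambda_i \, p_i \quad (\text{only when regularization\_method}[i] = \text{l2\_decay},\ \lambda_i = \text{regularization\_coeff}[i])
v_i \leftarrow \mu \, v_i + g'_i
p_i \leftarrow p_i - lr_i \cdot v_i \quad (\text{plain momentum})
p_i \leftarrow p_i - lr_i \cdot (g'_i + \mu \, v_i) \quad (\text{use\_nesterov} = \text{true})

The single-tensor momentum kernel in the next file implements the same rule for one parameter.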
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/fluid/operators/optimizers/sgd_op.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {
template <typename T>
class NPUMomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();
std::string regularization_method =
ctx.Attr<std::string>("regularization_method");
auto regularization_coeff = ctx.Attr<float>("regularization_coeff");
phi::RegularizationType regularization_flag{
phi::RegularizationType::kNONE}; // disable regularization
if (regularization_method == "l2_decay") {
regularization_flag = phi::RegularizationType::kL2DECAY;
}
T mu = static_cast<T>(ctx.Attr<float>("mu"));
bool use_nesterov = ctx.Attr<bool>("use_nesterov");
auto learning_rate = ctx.Input<phi::DenseTensor>("LearningRate");
auto param = ctx.Input<phi::DenseTensor>("Param");
auto velocity = ctx.Input<phi::DenseTensor>("Velocity");
auto param_out = ctx.Output<phi::DenseTensor>("ParamOut");
auto velocity_out = ctx.Output<phi::DenseTensor>("VelocityOut");
param_out->mutable_data<T>(ctx.GetPlace());
velocity_out->mutable_data<T>(ctx.GetPlace());
auto* grad_var = ctx.InputVar("Grad");
if (grad_var->IsType<phi::DenseTensor>()) {
auto grad = ctx.Input<phi::DenseTensor>("Grad");
Tensor mu_tensor;
mu_tensor.mutable_data<T>(phi::make_ddim({1}), ctx.GetPlace());
FillNpuTensorWithConstant<T>(&mu_tensor, mu);
Tensor regularized_grad;
if (regularization_flag == phi::RegularizationType::kL2DECAY) {
regularized_grad.mutable_data<T>(grad->dims(), ctx.GetPlace());
const auto& runner1 = NpuOpRunner("Muls",
{*param},
{regularized_grad},
{{"value", regularization_coeff}});
runner1.Run(dev_ctx.stream());
const auto& runner2 = NpuOpRunner(
"Add", {regularized_grad, *grad}, {regularized_grad}, {});
runner2.Run(dev_ctx.stream());
} else {
regularized_grad.ShareDataWith(*grad);
}
framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out);
framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out);
// NOTE: ApplyMomentum will change the input
const auto& runner = NpuOpRunner("ApplyMomentum",
{*param_out,
*velocity_out,
*learning_rate,
regularized_grad,
mu_tensor},
{*param_out},
{{"use_nesterov", use_nesterov}});
runner.Run(dev_ctx.stream());
} else if (grad_var->IsType<phi::SelectedRows>()) {
PADDLE_ENFORCE_EQ(
false,
true,
platform::errors::PermissionDenied("Unsupport SparseMomentum"));
} else {
PADDLE_ENFORCE_EQ(false,
true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in MomentumOp. Excepted LodTensor "
"or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(momentum,
ops::NPUMomentumOpKernel<float>,
ops::NPUMomentumOpKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class RMSPROPNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *grad_var = ctx.InputVar("Grad");
auto *param_out = ctx.Output<phi::DenseTensor>("ParamOut");
auto *moment_out = ctx.Output<phi::DenseTensor>("MomentOut");
auto *mean_square_out = ctx.Output<phi::DenseTensor>("MeanSquareOut");
param_out->mutable_data<T>(ctx.GetPlace());
moment_out->mutable_data<T>(ctx.GetPlace());
mean_square_out->mutable_data<T>(ctx.GetPlace());
auto epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto rho = static_cast<T>(ctx.Attr<float>("decay"));
auto momentum = static_cast<T>(ctx.Attr<float>("momentum"));
auto *p_tensor = ctx.Input<phi::DenseTensor>("Param");
auto *ms_tensor = ctx.Input<phi::DenseTensor>("MeanSquare");
auto *lr_tensor = ctx.Input<phi::DenseTensor>("LearningRate");
auto *mom_tensor = ctx.Input<phi::DenseTensor>("Moment");
bool centered = ctx.Attr<bool>("centered");
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (grad_var->IsType<phi::DenseTensor>()) {
auto *grad_tensor = ctx.Input<phi::DenseTensor>("Grad");
if (centered) {
framework::NPUAttributeMap attr_input = {{"use_locking", false}};
const phi::DenseTensor *rho_tensor = nullptr;
const phi::DenseTensor *momentum_tensor = nullptr;
const phi::DenseTensor *epsilon_tensor = nullptr;
phi::DenseTensor rho_tmp(phi::DataType::FLOAT32);
rho_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&rho_tmp, rho);
rho_tensor = &rho_tmp;
phi::DenseTensor momentum_tmp(phi::DataType::FLOAT32);
momentum_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&momentum_tmp, momentum);
momentum_tensor = &momentum_tmp;
phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);
epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&epsilon_tmp, epsilon);
epsilon_tensor = &epsilon_tmp;
auto *mg_tensor = ctx.Input<phi::DenseTensor>("MeanGrad");
auto *mean_grad_out = ctx.Output<phi::DenseTensor>("MeanGradOut");
mean_grad_out->mutable_data<T>(ctx.GetPlace());
const auto &runner_applycenterrmsprop = NpuOpRunner(
std::string("ApplyCenteredRMSPropD"),
{*p_tensor,
*mg_tensor,
*ms_tensor,
*mom_tensor,
*lr_tensor,
*rho_tensor,
*momentum_tensor,
*epsilon_tensor,
*grad_tensor},
{*param_out, *mean_grad_out, *mean_square_out, *moment_out},
{attr_input});
runner_applycenterrmsprop.Run(stream);
} else {
framework::NPUAttributeMap attr_input = {
{"rho", rho}, {"momentum", momentum}, {"epsilon", epsilon}};
const auto &runner_applyrmsprop = NpuOpRunner(
std::string("ApplyRMSPropD"),
{*p_tensor, *ms_tensor, *mom_tensor, *lr_tensor, *grad_tensor},
{*param_out, *mean_square_out, *moment_out},
{attr_input});
runner_applyrmsprop.Run(stream);
}
} else {
PADDLE_ENFORCE_EQ(false,
true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in RmspropOp. Excepted LodTensor, "
"But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
rmsprop, ops::RMSPROPNPUKernel<paddle::platform::NPUDeviceContext, float>);
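For reference, the two branches above correspond to the plain and centered RMSProp updates; a hedged sketch of the math behind the ApplyRMSPropD / ApplyCenteredRMSPropD calls as they are used here:

ms \leftarrow \rho \, ms + (1 - \rho) \, g^2
mg \leftarrow \rho \, mg + (1 - \rho) \, g \quad (\text{centered branch only})
mom \leftarrow \text{momentum} \cdot mom + lr \cdot \frac{g}{\sqrt{ms - mg^2 + \epsilon}} \quad (\text{drop } mg^2 \text{ in the plain branch})
p \leftarrow p - mom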
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/optimizers/sgd_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SGDNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* learning_rate = ctx.Input<phi::DenseTensor>("LearningRate");
auto* param_var = ctx.Input<phi::DenseTensor>("Param");
auto* grad_var = ctx.Input<phi::DenseTensor>("Grad");
auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
param_out->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("ApplyGradientDescent",
{*param_var, *learning_rate, *grad_var},
{*param_out},
{});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
// NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so
// if param and param_out is not same, we need to do copy.
if (param_out->data<T>() != param_var->data<T>()) {
framework::TensorCopy(
*param_var,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
param_out);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
sgd,
ops::SGDNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::SGDNPUKernel<paddle::platform::NPUDeviceContext, double>,
ops::SGDNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
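For reference, ApplyGradientDescent boils down to the plain SGD step theta <- theta - lr * g applied element-wise; a hedged, stand-alone C++ sketch of the equivalent host-side computation (sgd_update is an illustrative helper, not a Paddle API):

#include <cstddef>
#include <vector>

// Hedged reference sketch: the element-wise update the NPU runner performs.
void sgd_update(std::vector<float>* param,
                const std::vector<float>& grad,
                float lr) {
  for (size_t i = 0; i < param->size(); ++i) {
    (*param)[i] -= lr * grad[i];  // theta <- theta - lr * g
  }
}

The trailing TensorCopy in the kernel only matters because the NPU op writes its result in place into Param; when ParamOut aliases the same buffer, the copy is skipped.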
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceAnyNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const phi::DenseTensor* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
bool keep_dim = ctx.Attr<bool>("keep_dim");
auto dims = ctx.Attr<std::vector<int>>("dim");
out->mutable_data<T>(ctx.GetPlace());
// set attr
NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}};
const auto& runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(reduce_any, ops::ReduceAnyNPUKernel<bool>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
USE_OP_ITSELF(reduce_any);
USE_OP_DEVICE_KERNEL(reduce_any, NPU);
template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<phi::DenseTensor>();
std::vector<bool> init_x = {true, false, false, false};
f::TensorFromVector<bool>(init_x, ctx, tensor_x);
tensor_x->Resize(phi::make_ddim({2, 2}));
ctx.Wait();
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<phi::DenseTensor>();
// run
std::vector<int> axes;
f::AttributeMap attrs = {{"axes", axes}, {"keep_dims", true}};
auto op = f::OpRegistry::CreateOp(
"reduce_any", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
ctx.Wait();
std::vector<bool> out_vec;
f::TensorToVector<bool>(*tensor_out, ctx, &out_vec);
ctx.Wait();
std::vector<bool> expected_vec = {true};
EXPECT_EQ(out_vec.size(), expected_vec.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], expected_vec[i]);
}
}
TEST(reduce_any, NPU) {
f::Scope scope;
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<bool>(&scope, *ctx);
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ReduceMaxNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto dims = ctx.Attr<std::vector<int>>("dim");
bool keep_dim = ctx.Attr<bool>("keep_dim");
bool reduce_all = ctx.Attr<bool>("reduce_all");
int out_dtype = ctx.Attr<int>("out_dtype");
auto place = ctx.GetPlace();
phi::DenseTensor cast_out(x->type());
cast_out.Resize(out->dims());
cast_out.mutable_data<T>(place);
auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
if (out_dtype != -1) {
cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
}
if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
if (cast_out_dtype == framework::proto::VarType::FP32) {
out->mutable_data<float>(place);
} else if (cast_out_dtype == framework::proto::VarType::FP16) {
out->mutable_data<paddle::platform::float16>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT16) {
out->mutable_data<int16_t>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT32) {
out->mutable_data<int32_t>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT64) {
out->mutable_data<int64_t>(place);
} else if (cast_out_dtype == framework::proto::VarType::FP64) {
out->mutable_data<double>(place);
} else if (cast_out_dtype == framework::proto::VarType::BOOL) {
out->mutable_data<bool>(place);
}
} else {
out->ShareDataWith(cast_out);
}
framework::NPUAttributeMap attr_input = {{"axes", dims},
{"keep_dims", keep_dim}};
if (reduce_all) {
std::vector<int> dim_vec;
for (int i = 0; i < x->dims().size(); i++) {
dim_vec.push_back(i);
}
attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
}
const auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
const std::vector<phi::DenseTensor>& outputs,
const NPUAttributeMap& attrs,
const platform::NPUDeviceContext& dev_ctx) {
const auto& runner =
NpuOpRunner("ReduceMaxD", {inputs[0]}, {outputs[0]}, attrs);
runner.Run(dev_ctx.stream());
};
NpuOpRunner::TypeAdapter({*x},
{cast_out},
attr_input,
dev_ctx,
op_func,
{framework::proto::VarType::INT32},
{framework::proto::VarType::INT32});
} else {
const auto& runner =
NpuOpRunner("ReduceMaxD", {*x}, {cast_out}, attr_input);
runner.Run(dev_ctx.stream());
}
if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
const auto& runner_cast =
NpuOpRunner("Cast",
{cast_out},
{*out},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(dev_ctx.stream());
}
}
};
template <typename DeviceContext, typename T>
class ReduceMaxGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<phi::DenseTensor>("X");
auto* out = context.Input<phi::DenseTensor>("Out");
auto* out_grad =
context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto reduce_dims = context.Attr<std::vector<int>>("dim");
bool reduce_all = context.Attr<bool>("reduce_all");
int in_dtype = context.Attr<int>("in_dtype");
PADDLE_ENFORCE_EQ(
in_dtype == -1,
true,
platform::errors::InvalidArgument(
"NPU only support in_dtype == -1 in reduce_max_grad op."));
auto* x_grad =
context.Output<phi::DenseTensor>(framework::GradVarName("X"));
x_grad->mutable_data<T>(context.GetPlace());
auto& dev_ctx =
context.template device_context<paddle::platform::NPUDeviceContext>();
auto place = context.GetPlace();
auto stream = dev_ctx.stream();
// broadcast
auto x_dims_vec = phi::vectorize(x->dims());
if (reduce_all) {
reduce_dims.clear();
for (size_t d = 0; d < x_dims_vec.size(); ++d) {
reduce_dims.push_back(static_cast<int>(d));
}
}
phi::DenseTensor tmp_out, tmp_out_grad;
auto tmp_out_dims_vec = x_dims_vec;
for (auto d : reduce_dims) {
if (d < 0) {
d += x_dims_vec.size();
}
tmp_out_dims_vec[d] = 1;
}
tmp_out.ShareDataWith(*out);
tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec));
tmp_out_grad.ShareDataWith(*out_grad);
tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));
phi::DenseTensor transformed_out(x->type());
transformed_out.Resize(phi::make_ddim(x_dims_vec));
transformed_out.mutable_data<T>(place);
NpuOpRunner r_brd_out;
r_brd_out.SetType("BroadcastTo")
.AddInput(tmp_out)
.AddInput(std::move(x_dims_vec))
.AddOutput(transformed_out)
.Run(stream);
phi::DenseTensor transformed_out_grad(x->type());
transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
transformed_out_grad.mutable_data<T>(place);
NpuOpRunner r_brd_out_grad;
r_brd_out_grad.SetType("BroadcastTo")
.AddInput(tmp_out_grad)
.AddInput(std::move(x_dims_vec))
.AddOutput(transformed_out_grad)
.Run(stream);
// compare
phi::DenseTensor equal_cond;
equal_cond.mutable_data<bool>(x_grad->dims(), place);
const auto& r_equal =
NpuOpRunner("Equal", {*x, transformed_out}, {equal_cond}, {});
r_equal.Run(stream);
// select
phi::DenseTensor t_zero;
t_zero.mutable_data<T>(x_grad->dims(), place);
FillNpuTensorWithConstant(&t_zero, static_cast<T>(0));
t_zero.Resize(x_grad->dims());
const auto& r_sel = NpuOpRunner(
"SelectV2", {equal_cond, transformed_out_grad, t_zero}, {*x_grad}, {});
r_sel.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
reduce_max,
ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, float>,
ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, plat::float16>,
ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int64_t>,
ops::ReduceMaxNPUKernel<plat::NPUDeviceContext, int>);
REGISTER_OP_NPU_KERNEL(
reduce_max_grad,
ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, float>,
ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, plat::float16>,
ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, int64_t>,
ops::ReduceMaxGradNPUKernel<plat::NPUDeviceContext, int>);
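For reference, the BroadcastTo / Equal / SelectV2 sequence in the gradient kernel implements the usual subgradient of a max reduction; a hedged restatement:

\frac{\partial L}{\partial x_j} = \frac{\partial L}{\partial y} \cdot \mathbb{1}\left[x_j = y\right]

where y is the reduced maximum broadcast back to the shape of x, so every position that attains the maximum receives the upstream gradient (including ties) and all other positions receive zero.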
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
namespace paddle {
namespace operators {
template <typename T>
class NPUReduceMeanOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output = ctx.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(ctx.GetPlace());
bool reduce_all = ctx.Attr<bool>("reduce_all");
auto dims = ctx.Attr<std::vector<int>>("dim");
bool keep_dim = ctx.Attr<bool>("keep_dim");
auto input_dims = input->dims();
if (reduce_all) {
dims.clear();
for (int i = 0; i < input_dims.size(); i++) {
dims.push_back(static_cast<int>(i));
}
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
NpuOpRunner runner;
runner.SetType("ReduceMean")
.AddInput(*input)
.AddInput(std::move(dims))
.AddOutput(*output)
.AddAttrs({{"keep_dims", keep_dim}})
.Run(stream);
}
};
template <typename T>
class NPUReduceMeanGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("X");
auto* output_grad =
ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* input_grad =
ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(ctx.GetPlace());
bool reduce_all = ctx.Attr<bool>("reduce_all");
auto reduce_dims = ctx.Attr<std::vector<int>>("dim");
auto input_dims = input->dims();
int reduce_numel = 1;
if (reduce_all) {
reduce_dims.clear();
for (int d = 0; d < input_dims.size(); ++d) {
reduce_dims.push_back(static_cast<int>(d));
}
}
for (auto& d : reduce_dims) {
if (d < 0) {
d = d + input_dims.size();
}
reduce_numel *= input_dims[d];
}
phi::DenseTensor tensor_value(input_grad->dtype());
tensor_value.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(
&tensor_value, static_cast<T>(1.0f / static_cast<T>(reduce_numel)));
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
NpuOpRunner runner;
runner.SetType("Fill")
.AddInput(phi::vectorize(input_dims))
.AddInput(tensor_value)
.AddOutput(*input_grad)
.Run(stream);
phi::DenseTensor transformed_input_grad, transformed_out_grad;
phi::DenseTensor tmp_output_grad;
auto tmp_output_dims = input_dims;
for (auto d : reduce_dims) {
tmp_output_dims[d] = 1;
}
tmp_output_grad.ShareDataWith(*output_grad);
tmp_output_grad.Resize(tmp_output_dims);
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
NpuElementWiseOpBroadcast<T>(dev_ctx,
input_grad,
&tmp_output_grad,
0,
&transformed_input_grad,
&transformed_out_grad);
const auto& runner2 =
NpuOpRunner("Mul",
{transformed_input_grad, transformed_out_grad},
{*input_grad},
{});
runner2.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(reduce_mean, ops::NPUReduceMeanOpKernel<float>);
REGISTER_OP_NPU_KERNEL(reduce_mean_grad, ops::NPUReduceMeanGradOpKernel<float>);
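For reference, the gradient kernel above fills X@GRAD with the constant 1 / reduce_numel and multiplies it by the broadcast upstream gradient, i.e. a hedged restatement:

\frac{\partial L}{\partial x_j} = \frac{1}{\prod_{d \in \text{dims}} \dim_d(x)} \cdot \frac{\partial L}{\partial y}

with the upstream gradient first reshaped so the reduced axes become size-1 dimensions and then broadcast back to the shape of x.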
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ReduceMinNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto dims = ctx.Attr<std::vector<int>>("dim");
bool keep_dim = ctx.Attr<bool>("keep_dim");
bool reduce_all = ctx.Attr<bool>("reduce_all");
int out_dtype = ctx.Attr<int>("out_dtype");
auto place = ctx.GetPlace();
phi::DenseTensor cast_out(x->type());
cast_out.Resize(out->dims());
cast_out.mutable_data<T>(place);
auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
if (out_dtype != -1) {
cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
}
if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) {
if (cast_out_dtype == framework::proto::VarType::FP32) {
out->mutable_data<float>(place);
} else if (cast_out_dtype == framework::proto::VarType::FP16) {
out->mutable_data<paddle::platform::float16>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT16) {
out->mutable_data<int16_t>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT32) {
out->mutable_data<int32_t>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT64) {
out->mutable_data<int64_t>(place);
} else if (cast_out_dtype == framework::proto::VarType::FP64) {
out->mutable_data<double>(place);
} else if (cast_out_dtype == framework::proto::VarType::BOOL) {
out->mutable_data<bool>(place);
}
} else {
out->ShareDataWith(cast_out);
}
framework::NPUAttributeMap attr_input = {{"axes", dims},
{"keep_dims", keep_dim}};
if (reduce_all) {
std::vector<int> dim_vec;
for (int i = 0; i < x->dims().size(); i++) {
dim_vec.push_back(i);
}
attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
}
const auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
if (x->dtype() == phi::DataType::INT64) {
auto op_func = [](const std::vector<phi::DenseTensor>& inputs,
const std::vector<phi::DenseTensor>& outputs,
const NPUAttributeMap& attrs,
const platform::NPUDeviceContext& dev_ctx) {
const auto& runner =
NpuOpRunner("ReduceMinD", {inputs[0]}, {outputs[0]}, attrs);
runner.Run(dev_ctx.stream());
};
NpuOpRunner::TypeAdapter({*x},
{cast_out},
attr_input,
dev_ctx,
op_func,
{framework::proto::VarType::INT32},
{framework::proto::VarType::INT32});
} else {
const auto& runner =
NpuOpRunner("ReduceMinD", {*x}, {cast_out}, attr_input);
runner.Run(dev_ctx.stream());
}
if (framework::TransToProtoVarType(x->type()) != cast_out_dtype) {
auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
const auto& runner_cast =
NpuOpRunner("Cast",
{cast_out},
{*out},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(dev_ctx.stream());
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
reduce_min,
ops::ReduceMinNPUKernel<plat::NPUDeviceContext, float>,
ops::ReduceMinNPUKernel<plat::NPUDeviceContext, plat::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ReduceMinNPUKernel<plat::NPUDeviceContext, int64_t>,
#endif
ops::ReduceMinNPUKernel<plat::NPUDeviceContext, int>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ReduceProdNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto dims = ctx.Attr<std::vector<int>>("dim");
bool keep_dim = ctx.Attr<bool>("keep_dim");
bool reduce_all = ctx.Attr<bool>("reduce_all");
int out_dtype = ctx.Attr<int>("out_dtype");
auto place = ctx.GetPlace();
phi::DenseTensor cast_out(x->type());
cast_out.Resize(out->dims());
cast_out.mutable_data<T>(place);
auto cast_out_dtype = framework::TransToProtoVarType(x->dtype());
if (out_dtype != -1) {
cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
}
if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
if (cast_out_dtype == framework::proto::VarType::FP32) {
out->mutable_data<float>(place);
} else if (cast_out_dtype == framework::proto::VarType::FP16) {
out->mutable_data<paddle::platform::float16>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT16) {
out->mutable_data<int16_t>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT32) {
out->mutable_data<int32_t>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT64) {
out->mutable_data<int64_t>(place);
} else if (cast_out_dtype == framework::proto::VarType::FP64) {
out->mutable_data<double>(place);
} else if (cast_out_dtype == framework::proto::VarType::BOOL) {
out->mutable_data<bool>(place);
}
} else {
out->ShareDataWith(cast_out);
}
framework::NPUAttributeMap attr_input = {{"axes", dims},
{"keep_dims", keep_dim}};
if (reduce_all) {
std::vector<int> dim_vec;
for (int i = 0; i < x->dims().size(); i++) {
dim_vec.push_back(i);
}
attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}};
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner =
NpuOpRunner("ReduceProdD", {*x}, {cast_out}, attr_input);
runner.Run(stream);
if (framework::TransToProtoVarType(x->dtype()) != cast_out_dtype) {
auto dst_dtype = ConvertToNpuDtype(cast_out_dtype);
const auto& runner_cast =
NpuOpRunner("Cast",
{cast_out},
{*out},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
reduce_prod,
ops::ReduceProdNPUKernel<plat::NPUDeviceContext, float>,
ops::ReduceProdNPUKernel<plat::NPUDeviceContext, plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/operators/unsqueeze_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ReduceSumNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
bool reduce_all = ctx.Attr<bool>("reduce_all");
bool keep_dims = ctx.Attr<bool>("keep_dim");
auto dims = ctx.Attr<std::vector<int>>("dim");
out->mutable_data<T>(ctx.GetPlace());
// special case
if (x->dims().size() == 1 && keep_dims == false) {
keep_dims = true;
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
phi::DenseTensor cast_x;
phi::DenseTensor cast_out;
// NOTE: ReduceSumD only supports fp32 and fp16
if (framework::TransToProtoVarType(x->dtype()) !=
framework::proto::VarType::FP32 &&
framework::TransToProtoVarType(x->dtype()) !=
framework::proto::VarType::FP16) {
cast_x.Resize(x->dims());
cast_x.mutable_data<float>(ctx.GetPlace());
auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32);
const auto& runner_cast = NpuOpRunner(
"Cast", {*x}, {cast_x}, {{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream);
cast_out.Resize(out->dims());
cast_out.mutable_data<float>(ctx.GetPlace());
} else {
cast_x.ShareDataWith(*x);
cast_out.ShareDataWith(*out);
}
if (reduce_all) {
std::vector<int> dim_vec;
for (int i = 0; i < x->dims().size(); i++) {
dim_vec.push_back(i);
}
const auto& runner =
NpuOpRunner("ReduceSumD",
{cast_x},
{cast_out},
{{"axes", dim_vec}, {"keep_dims", keep_dims}});
runner.Run(stream);
} else {
const auto& runner =
NpuOpRunner("ReduceSumD",
{cast_x},
{cast_out},
{{"axes", dims}, {"keep_dims", keep_dims}});
runner.Run(stream);
}
if (framework::TransToProtoVarType(x->dtype()) !=
framework::proto::VarType::FP32 &&
framework::TransToProtoVarType(x->dtype()) !=
framework::proto::VarType::FP16) {
auto dst_dtype =
ConvertToNpuDtype(framework::TransToProtoVarType(out->dtype()));
const auto& runner_cast =
NpuOpRunner("Cast",
{cast_out},
{*out},
{{"dst_type", static_cast<int>(dst_dtype)}});
runner_cast.Run(stream);
}
}
};
template <typename DeviceContext, typename T>
class ReduceSumGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out_grad = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* x_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
bool reduce_all = ctx.Attr<bool>("reduce_all");
bool keep_dims = ctx.Attr<bool>("keep_dim");
auto dims = ctx.Attr<std::vector<int>>("dim");
x_grad->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (keep_dims || reduce_all) {
const auto& runner = NpuOpRunner("BroadcastToD",
{*out_grad},
{*x_grad},
{{"shape", phi::vectorize(x->dims())}});
runner.Run(stream);
} else {
framework::DDim out_dims;
out_dims = UnsqueezeKernel<DeviceContext, T>::GetOutputShape(
dims, out_grad->dims());
phi::DenseTensor out_grad_tmp(out_grad->type());
out_grad_tmp.Resize(out_dims);
out_grad_tmp.mutable_data<T>(ctx.GetPlace());
framework::TensorCopy(
*out_grad,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
&out_grad_tmp);
out_grad_tmp.Resize(out_dims);
const auto& runner = NpuOpRunner("BroadcastToD",
{out_grad_tmp},
{*x_grad},
{{"shape", phi::vectorize(x->dims())}});
runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
reduce_sum,
ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ReduceSumNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
reduce_sum_grad,
ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ReduceSumGradNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
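For reference, the gradient of a sum reduction is just the upstream gradient broadcast back over the reduced axes, which is what the BroadcastToD runners above perform; a hedged restatement:

\frac{\partial L}{\partial x_j} = \frac{\partial L}{\partial y}

where, when keep_dim is false, the reduced axes are first re-inserted as size-1 dimensions (via the unsqueeze shape computation) before broadcasting to the shape of x. The forward kernel routes non-fp32/fp16 inputs through a Cast to fp32 because ReduceSumD only accepts those two types, then casts the result back to the output dtype.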
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SequenceMaskNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Output<phi::DenseTensor>("Y");
int maxlen = ctx.Attr<int>("maxlen");
if (ctx.HasInput("MaxLenTensor")) {
auto max_len_tensor = ctx.Input<phi::DenseTensor>("MaxLenTensor");
PADDLE_ENFORCE_NOT_NULL(max_len_tensor,
platform::errors::InvalidArgument(
"Input(MaxLenTensor) should not be NULL."
"But received Input(MaxLenTensor) is NULL"));
phi::DenseTensor temp;
paddle::framework::TensorCopySync(
*max_len_tensor, platform::CPUPlace(), &temp);
maxlen = *temp.data<int32_t>();
PADDLE_ENFORCE_GT(
maxlen,
0,
platform::errors::InvalidArgument(
"Input(MaxLenTensor) value should be greater than 0. But "
"received Input(MaxLenTensor) value = %d.",
maxlen));
}
if (maxlen < 0) {
auto x_numel = x->numel();
if (x_numel == 0) {
maxlen = 0;
} else {
std::vector<T> x_vec;
framework::TensorToVector(*x, dev_ctx, &x_vec);
auto x_data = x_vec.data();
maxlen = static_cast<int>(*std::max_element(x_data, x_data + x_numel));
}
}
auto y_dim = phi::vectorize<int>(x->dims());
y_dim.push_back(maxlen);
phi::DenseTensor cast_x;
cast_x.mutable_data<int32_t>(x->dims(), ctx.GetPlace());
const auto& cast1_runner = NpuOpRunner(
"Cast",
{*x},
{cast_x},
{{"dst_type",
ConvertToNpuDtype(framework::TransToProtoVarType(cast_x.dtype()))}});
cast1_runner.Run(dev_ctx.stream());
phi::DenseTensor tmp;
tmp.mutable_data<int32_t>(phi::make_ddim({maxlen}), ctx.GetPlace());
NpuOpRunner range_runner;
range_runner.SetType("Range");
range_runner.AddInput(std::vector<int32_t>({0}));
range_runner.AddInput(std::vector<int32_t>({maxlen}));
range_runner.AddInput(std::vector<int32_t>({1}));
range_runner.AddOutput(tmp);
range_runner.Run(dev_ctx.stream());
phi::DenseTensor expand_tmp;
expand_tmp.mutable_data<int32_t>(phi::make_ddim(y_dim), ctx.GetPlace());
const auto& expand_runner =
NpuOpRunner("ExpandD", {tmp}, {expand_tmp}, {{"shape", y_dim}});
expand_runner.Run(dev_ctx.stream());
auto x_dims = phi::vectorize<int>(x->dims());
x_dims.push_back(1);
cast_x.Resize(phi::make_ddim(x_dims));
phi::DenseTensor x_tmp;
x_tmp.mutable_data<int32_t>(phi::make_ddim(y_dim), ctx.GetPlace());
const auto& tile_runner =
NpuOpRunner("TileWithAxis",
{cast_x},
{x_tmp},
{{"axis", x->dims().size()}, {"tiles", maxlen}});
tile_runner.Run(dev_ctx.stream());
phi::DenseTensor y_tmp;
y_tmp.mutable_data<uint8_t>(phi::make_ddim(y_dim), ctx.GetPlace());
const auto& less_runner =
NpuOpRunner("Less", {expand_tmp, x_tmp}, {y_tmp}, {});
less_runner.Run(dev_ctx.stream());
y->Resize(phi::make_ddim(y_dim));
auto out_dtype = static_cast<framework::proto::VarType::Type>(
ctx.Attr<int>("out_dtype"));
if (out_dtype == framework::proto::VarType::INT32) {
y->mutable_data<int32_t>(ctx.GetPlace());
} else if (out_dtype == framework::proto::VarType::INT64) {
y->mutable_data<int64_t>(ctx.GetPlace());
} else if (out_dtype == framework::proto::VarType::FP32) {
y->mutable_data<float>(ctx.GetPlace());
} else if (out_dtype == framework::proto::VarType::FP64) {
y->mutable_data<double>(ctx.GetPlace());
} else if (out_dtype == framework::proto::VarType::BOOL) {
y->mutable_data<bool>(ctx.GetPlace());
} else if (out_dtype == framework::proto::VarType::UINT8) {
y->mutable_data<uint8_t>(ctx.GetPlace());
} else {
PADDLE_ENFORCE(false,
platform::errors::InvalidArgument(
"out_dtype only supporing int32, int64, fp32, fp64, "
"bool, uint8, but receive out_dtype is %d",
out_dtype));
}
const auto& cast2_runner = NpuOpRunner(
"Cast", {y_tmp}, {*y}, {{"dst_type", ConvertToNpuDtype(out_dtype)}});
cast2_runner.Run(dev_ctx.stream());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
sequence_mask,
ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, int32_t>,
ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, int64_t>,
ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, float>,
ops::SequenceMaskNPUKernel<plat::NPUDeviceContext, double>);
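For reference, the Range / ExpandD / TileWithAxis / Less pipeline above builds the standard sequence mask; a hedged restatement of the boolean tensor it produces before the final Cast to out_dtype:

y_{i_1,\dots,i_k,\,j} = \mathbb{1}\left[\, j < x_{i_1,\dots,i_k} \,\right], \qquad j = 0,\dots,\text{maxlen}-1

where maxlen falls back to the maximum value found in X when neither the maxlen attribute nor MaxLenTensor supplies a positive length.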