/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

/**
 * Forward batch-norm kernel for NPU devices.
 *
 * Reads inputs "X" (must be 4-D), "Scale" and "Bias", and writes "Y" by
 * launching the "BatchNorm" NPU operator.
 *
 * Two modes are selected from the attributes:
 *  - inference (`is_test && !trainable_statistics`, or `use_global_stats`):
 *    the "Mean" / "Variance" inputs are fed to the operator and the extra
 *    operator outputs are written to scratch tensors that are discarded;
 *  - training: the operator fills "SavedMean" / "SavedVariance", and
 *    "MeanOut" / "VarianceOut" are then updated with two
 *    "AddMatMatElements" launches using the momentum value.
 *
 * Throws platform::errors::InvalidArgument (via PADDLE_ENFORCE_EQ) when X is
 * not 4-dimensional.
 */
template <typename T>
class NPUBatchNormOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto &dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();

    const float epsilon = ctx.Attr<float>("epsilon");
    float momentum = ctx.Attr<float>("momentum");
    const bool is_test = ctx.Attr<bool>("is_test");
    const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
    const bool test_mode = is_test && (!trainable_stats);
    const std::string data_layout = ctx.Attr<std::string>("data_layout");

    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
    PADDLE_ENFORCE_EQ(x_dims.size(), 4,
                      platform::errors::InvalidArgument(
                          "The input tensor X's dimension must equal to 4. But "
                          "received X's shape = [%s], X's dimension = [%d].",
                          x_dims, x_dims.size()));

    auto *y = ctx.Output<Tensor>("Y");
    y->mutable_data<T>(ctx.GetPlace());

    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");

    // Work on views that share storage with X / Y so the layout tag can be
    // changed without touching the tensors owned by the scope.
    Tensor x_tensor, y_tensor;
    x_tensor.ShareDataWith(*x);
    y_tensor.ShareDataWith(*y);
    if (data_layout == "NHWC") {
      x_tensor.set_layout(DataLayout::kNHWC);
      y_tensor.set_layout(DataLayout::kNHWC);
    }

    const bool training = !test_mode && !use_global_stats;
    auto stream = dev_ctx.stream();

    if (!training) {
      const auto *est_mean = ctx.Input<Tensor>("Mean");
      const auto *est_var = ctx.Input<Tensor>("Variance");

      // Scratch buffers for the four non-Y operator outputs; their contents
      // are never read back.
      framework::Tensor reserve_space1, reserve_space2;
      reserve_space1.mutable_data<float>(est_mean->dims(), ctx.GetPlace());
      reserve_space2.mutable_data<float>(est_var->dims(), ctx.GetPlace());

      const auto &runner = NpuOpRunner(
          "BatchNorm", {x_tensor, *scale, *bias, *est_mean, *est_var},
          {y_tensor, reserve_space1, reserve_space2, reserve_space1,
           reserve_space2},
          {{"epsilon", epsilon},
           {"is_training", training},
           {"data_format", data_layout}});
      runner.Run(stream);
      return;
    }

    // If MomentumTensor is set, use its value; momentum is only used in this
    // training branch.
    if (ctx.HasInput("MomentumTensor")) {
      const auto *mom_tensor = ctx.Input<Tensor>("MomentumTensor");
      Tensor mom_cpu;
      TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu);
      momentum = mom_cpu.data<float>()[0];
    }

    auto *mean_out = ctx.Output<Tensor>("MeanOut");
    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
    // NOTE(review): element type T is assumed for the statistics outputs —
    // confirm against the dtype declared for these variables.
    mean_out->mutable_data<T>(ctx.GetPlace());
    variance_out->mutable_data<T>(ctx.GetPlace());
    saved_mean->mutable_data<T>(ctx.GetPlace());
    saved_variance->mutable_data<T>(ctx.GetPlace());

    // The 2nd and 3rd operator outputs are written to scratch tensors and
    // dropped; the running statistics are updated separately below.
    framework::Tensor mean_tmp, variance_tmp;
    mean_tmp.mutable_data<float>(mean_out->dims(), ctx.GetPlace());
    variance_tmp.mutable_data<float>(variance_out->dims(), ctx.GetPlace());

    const auto &runner = NpuOpRunner(
        "BatchNorm", {x_tensor, *scale, *bias},
        {y_tensor, mean_tmp, variance_tmp, *saved_mean, *saved_variance},
        {{"epsilon", epsilon},
         {"is_training", training},
         {"data_format", data_layout}});
    runner.Run(stream);

    // Ascend can't output the estimated mean and variance, so they are
    // recomputed here from the saved statistics.
    framework::Tensor this_factor_tensor;
    this_factor_tensor.mutable_data<float>(framework::make_ddim({1}),
                                           ctx.GetPlace());
    framework::TensorFromVector<float>({static_cast<float>(1. - momentum)},
                                       dev_ctx, &this_factor_tensor);

    framework::Tensor momentum_tensor;
    momentum_tensor.mutable_data<float>(framework::make_ddim({1}),
                                        ctx.GetPlace());
    framework::TensorFromVector<float>({static_cast<float>(momentum)}, dev_ctx,
                                       &momentum_tensor);

    framework::Tensor ones_tensor;
    ones_tensor.mutable_data<float>(mean_out->dims(), ctx.GetPlace());
    framework::TensorFromVector<float>(
        std::vector<float>(framework::product(mean_out->dims()), 1.0f),
        dev_ctx, &ones_tensor);

    // Presumably computes
    //   out = momentum * out + (1 - momentum) * saved * ones
    // in place, given the operand order — TODO confirm against the
    // AddMatMatElements operator specification.
    const auto &runner1 = NpuOpRunner("AddMatMatElements",
                                      {*mean_out, *saved_mean, ones_tensor,
                                       momentum_tensor, this_factor_tensor},
                                      {*mean_out}, {});
    runner1.Run(stream);

    const auto &runner2 = NpuOpRunner(
        "AddMatMatElements", {*variance_out, *saved_variance, ones_tensor,
                              momentum_tensor, this_factor_tensor},
        {*variance_out}, {});
    runner2.Run(stream);
  }
};

/**
 * Backward batch-norm kernel for NPU devices.
 *
 * Reads "Y@GRAD", "X" (must be 4-D), "Scale", "Bias" and either
 * "SavedMean" / "SavedVariance" or, when `is_test || use_global_stats`,
 * "Mean" / "Variance". Launches the "BatchNormGrad" NPU operator to produce
 * "X@GRAD", "Scale@GRAD" and "Bias@GRAD". Any gradient output that is absent
 * is redirected to a temporary tensor so the operator always has somewhere to
 * write.
 *
 * Throws platform::errors::InvalidArgument (via PADDLE_ENFORCE_EQ) when X is
 * not 4-dimensional.
 */
template <typename T>
class NPUBatchNormGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto &dev_ctx = ctx.template device_context<platform::NPUDeviceContext>();

    const float epsilon = ctx.Attr<float>("epsilon");
    const std::string data_layout = ctx.Attr<std::string>("data_layout");
    // Test mode is treated exactly like use_global_stats in the backward pass.
    const bool use_global_stats =
        ctx.Attr<bool>("is_test") || ctx.Attr<bool>("use_global_stats");

    const auto *y_grad = ctx.Input<Tensor>(framework::GradVarName("Y"));
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");
    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
    const auto *saved_variance = ctx.Input<Tensor>("SavedVariance");

    auto *x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *scale_grad = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *bias_grad = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    const Tensor *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
    PADDLE_ENFORCE_EQ(x_dims.size(), 4,
                      platform::errors::InvalidArgument(
                          "The input tensor X's dimension must equal to 4. But "
                          "received X's shape = [%s], X's dimension = [%d].",
                          x_dims, x_dims.size()));

    // Init outputs. Scale/Bias gradients go to the real outputs only when both
    // are requested; otherwise both land in throw-away buffers.
    // NOTE(review): element type T is assumed for the gradient buffers —
    // confirm against the dtype declared for these variables.
    Tensor scale_grad_tmp, bias_grad_tmp;
    if (scale_grad && bias_grad) {
      scale_grad->mutable_data<T>(ctx.GetPlace());
      bias_grad->mutable_data<T>(ctx.GetPlace());
      scale_grad_tmp.ShareDataWith(*scale_grad);
      bias_grad_tmp.ShareDataWith(*bias_grad);
    } else {
      scale_grad_tmp.mutable_data<T>(scale->dims(), ctx.GetPlace());
      bias_grad_tmp.mutable_data<T>(bias->dims(), ctx.GetPlace());
    }

    Tensor x_tensor, y_grad_tensor, x_grad_tensor;
    x_tensor.ShareDataWith(*x);
    y_grad_tensor.ShareDataWith(*y_grad);
    if (x_grad) {
      x_grad->mutable_data<T>(ctx.GetPlace());
      x_grad_tensor.ShareDataWith(*x_grad);
    } else {
      x_grad_tensor.mutable_data<T>(x->dims(), ctx.GetPlace());
    }
    if (data_layout == "NHWC") {
      x_tensor.set_layout(DataLayout::kNHWC);
      y_grad_tensor.set_layout(DataLayout::kNHWC);
      x_grad_tensor.set_layout(DataLayout::kNHWC);
    }

    // Pick the statistics fed to the operator: the running estimates when
    // global stats are in use, the per-batch saved values otherwise.
    const Tensor *mean = saved_mean;
    const Tensor *variance = saved_variance;
    if (use_global_stats) {
      mean = ctx.Input<Tensor>("Mean");
      variance = ctx.Input<Tensor>("Variance");
    }

    // The last two outputs are reserve_space_3 / reserve_space_4: the operator
    // segfaults when they are omitted, so they are bound to the same tensors
    // as the mean / variance inputs.
    // NOTE(review): is_training is passed as true on the global-stats path as
    // well — confirm this is the intended operator mode.
    const auto &runner = NpuOpRunner(
        "BatchNormGrad", {y_grad_tensor, x_tensor, *scale, *mean, *variance},
        {x_grad_tensor, scale_grad_tmp, bias_grad_tmp, *mean, *variance},
        {{"epsilon", epsilon},
         {"is_training", true},
         {"data_format", data_layout}});
    auto stream = dev_ctx.stream();
    runner.Run(stream);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(batch_norm, ops::NPUBatchNormOpKernel<float>,
                       ops::NPUBatchNormOpKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(batch_norm_grad, ops::NPUBatchNormGradOpKernel<float>,
                       ops::NPUBatchNormGradOpKernel<plat::float16>);