From 906e7f921e6df8d6376902aa581e9c2f03fdc0dc Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Wed, 23 Sep 2020 10:34:02 +0800 Subject: [PATCH] add fuse_bn_act op (#27230) * add fused_bn_add_relu op --- cmake/operators.cmake | 3 +- paddle/fluid/operators/fused/CMakeLists.txt | 8 +- .../fused/fused_bn_add_activation_op.cc | 255 +++++++++++++ .../fused/fused_bn_add_activation_op.cu | 338 ++++++++++++++++++ .../fused/fused_bn_add_activation_op.h | 106 ++++++ python/paddle/fluid/contrib/layers/nn.py | 191 +++++++++- .../contrib/mixed_precision/fp16_lists.py | 1 + .../contrib/mixed_precision/fp16_utils.py | 9 +- .../tests/unittests/test_fused_bn_add_act.py | 215 +++++++++++ 9 files changed, 1120 insertions(+), 6 deletions(-) create mode 100644 paddle/fluid/operators/fused/fused_bn_add_activation_op.cc create mode 100644 paddle/fluid/operators/fused/fused_bn_add_activation_op.cu create mode 100644 paddle/fluid/operators/fused/fused_bn_add_activation_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py diff --git a/cmake/operators.cmake b/cmake/operators.cmake index aea972ab3db..21080fbe8fd 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -127,7 +127,8 @@ function(op_library TARGET) "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" -"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op") +"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" +"fused_bn_add_activation_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 3fc5f3bfc6b..477a9162fe3 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -8,7 +8,8 @@ register_operators(EXCLUDES multihead_matmul_op fused_embedding_eltwise_layernorm_op fusion_group_op - fusion_gru_op) + fusion_gru_op + fused_bn_add_activation_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -47,4 +48,9 @@ if (WITH_GPU) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_group);\n") cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op) endif() + # fused_bn_add_activation + if (NOT ${CUDNN_VERSION} VERSION_LESS 7401) + op_library(fused_bn_add_activation_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_bn_add_activation);\n") + endif() endif() diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc new file mode 100644 index 00000000000..5b3ed03bb64 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -0,0 +1,255 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +void FusedBatchNormAddActOp::InferShape( + framework::InferShapeContext *ctx) const { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", + "FusedBatchNormAddActOp"); + + // check output + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("MeanOut"), "Output", "MeanOut", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("VarianceOut"), "Output", "VarianceOut", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance", + "FusedBatchNormAddActOp"); + + const auto x_dims = ctx->GetInputDim("X"); + const auto z_dims = ctx->GetInputDim("Z"); + PADDLE_ENFORCE_EQ(x_dims, z_dims, + platform::errors::InvalidArgument( + "ShapeError: the shapes of input " + "must be equal. But received: the shape " + "of input X = [%s], and the shape of " + "input Y = [%s]", + x_dims, z_dims)); + PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::InvalidArgument( + "ShapeError: the dimensions of input " + "must greater than or equal to 2." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); + PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::InvalidArgument( + "ShapeError: the dimensions of input " + "must smaller than or equal to 5." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); + + const int64_t C = x_dims[x_dims.size() - 1]; + + auto scale_dim = ctx->GetInputDim("Scale"); + auto bias_dim = ctx->GetInputDim("Bias"); + + PADDLE_ENFORCE_EQ( + scale_dim.size(), 1UL, + platform::errors::InvalidArgument( + "ShapeError: the dimension of scale must equal to 1." + "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, scale_dim.size())); + PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL, + platform::errors::InvalidArgument( + "ShapeError: the dimension of bias must equal to 1." + "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, bias_dim.size())); + + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 || + framework::product(bias_dim) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], C, + platform::errors::InvalidArgument( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, scale_dim[0])); + PADDLE_ENFORCE_EQ(bias_dim[0], C, + platform::errors::InvalidArgument( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, bias_dim[0])); + } + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("MeanOut", {C}); + ctx->SetOutputDim("VarianceOut", {C}); + ctx->SetOutputDim("SavedMean", {C}); + ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); +} + +framework::OpKernelType FusedBatchNormAddActOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + // By default, the type of the scale, bias, mean, + // and var tensors should be float when input tensor's dtype is float16. + auto bn_param_type = framework::proto::VarType::FP32; + + PADDLE_ENFORCE_EQ( + bn_param_type, ctx.Input("Scale")->type(), + platform::errors::InvalidArgument("Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, ctx.Input("Bias")->type(), + platform::errors::InvalidArgument("Bias input should be of float type")); + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); +} + +void FusedBatchNormAddActOpMaker::Make() { + AddInput("X", "The input tensor"); + AddInput("Z", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddOutput("Y", "result after normalization"); + AddOutput("MeanOut", + "Share memory with Mean. " + "Store the global mean when training"); + AddOutput("VarianceOut", + "Share memory with Variance. " + "Store the global Variance when training"); + AddOutput("SavedMean", + "Mean of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("SavedVariance", + "Variance of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("ReserveSpace", + "Reserve GPU space for triggering the new semi-persistent " + "NHWC kernel"); + AddAttr("momentum", "").SetDefault(0.9); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, + platform::errors::InvalidArgument( + "'epsilon' should be between 0.0 and 0.001.")); + }); + AddAttr("act_type", "The activation type to be fused.") + .SetDefault("relu"); + AddComment(R"DOC( +Fused Batch Normalization with activation. + +Batch Norm has been implemented as discussed in the paper: +https://arxiv.org/pdf/1502.03167.pdf +Batch Norm can be used as a normalizer function for conv2d and fully_connected operations. +Now, the required data format for FusedBatchNormAddActOp is NHWC `[batch, in_height, in_width, in_channels]`. + +)DOC"); +} + +void FusedBatchNormAddActGradOp::InferShape( + framework::InferShapeContext *ctx) const { + // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", + framework::GradVarName("Y"), "FusedBatchNormAddActGradOp"); + + // check output + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Z")), "Output", + framework::GradVarName("Z"), "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale")), "Output", + framework::GradVarName("Scale"), "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias")), "Output", + framework::GradVarName("Bias"), "FusedBatchNormAddActGradOp"); + + const auto in_dims = ctx->GetInputDim("X"); + const int C = in_dims[in_dims.size() - 1]; + + ctx->SetOutputDim(framework::GradVarName("X"), in_dims); + ctx->SetOutputDim(framework::GradVarName("Z"), in_dims); + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); +} + +framework::OpKernelType FusedBatchNormAddActGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW(platform::errors::NotFound( + "Can not find Y@GRAD in the execution context.")); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW( + platform::errors::NotFound("Can not get the tensor value of Y@GRAD.")); + } + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout, + library); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_bn_add_activation, ops::FusedBatchNormAddActOp, + ops::FusedBatchNormAddActOpMaker, ops::FusedBatchNormAddActOpInferVarType, + ops::FusedBatchNormAddActGradOpMaker, + ops::FusedBatchNormAddActGradOpMaker); +REGISTER_OPERATOR(fused_bn_add_activation_grad, + ops::FusedBatchNormAddActGradOp); diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu new file mode 100644 index 00000000000..7f1d297cda3 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -0,0 +1,338 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/float16.h" + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +template +using CudnnDataType = platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +class FusedBatchNormAddActKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + double epsilon = static_cast(ctx.Attr("epsilon")); + float momentum = ctx.Attr("momentum"); + std::string act_type = ctx.Attr("act_type"); + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + // Get the size for each dimension. + // NHWC [batch_size, in_height, in_width, in_channels] + const auto *x = ctx.Input("X"); + const auto *z = ctx.Input("Z"); + const auto &in_dims = x->dims(); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + + auto *y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + + auto &dev_ctx = ctx.template device_context(); + + // ------------------- cudnn descriptors --------------------- + auto handle = dev_ctx.cudnn_handle(); + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, + data_desc_, mode_)); + + double this_factor = 1. - momentum; + cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; + platform::ScopedActivationDescriptor scope_act_desc; + cudnnActivationDescriptor_t activation_desc_ = + scope_act_desc.descriptor(act_type); + size_t workspace_size = 0; + size_t reserve_space_size = 0; + void *reserve_space_ptr = nullptr; + void *workspace_ptr = nullptr; + Tensor workspace_tensor; + // Create reserve space and workspace for batch norm. + // Create tensor for each batchnorm op, it will be used in the + // backward. Thus this tensor shouldn't be temp. + auto *reserve_space = ctx.Output("ReserveSpace"); + PADDLE_ENFORCE_NOT_NULL( + reserve_space, + platform::errors::NotFound( + "The argument ReserveSpace of batch_norm op is not found.")); + + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload:: + cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*xDesc=*/data_desc_, + /*zDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/activation_desc_, + /*sizeInBytes=*/&workspace_size)); + + // -------------- cudnn batchnorm reserve space -------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*activationDesc=*/activation_desc_, + /*xDesc=*/data_desc_, + /*sizeInBytes=*/&reserve_space_size)); + + reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(), + reserve_space_size); + workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), + workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnBatchNormalizationForwardTrainingEx( + handle, mode_, bnOps_, CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, z->template data(), data_desc_, + y->template data(), bn_param_desc_, + scale->template data>(), + bias->template data>(), this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()), + activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr, + reserve_space_size)); + + // clean when exit. + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +template +class FusedBatchNormAddActGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + double epsilon = static_cast(ctx.Attr("epsilon")); + std::string act_type = ctx.Attr("act_type"); + + const auto *x = ctx.Input("X"); + const auto *z = ctx.Input("Z"); + const auto *y = ctx.Input("Y"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *reserve_space = ctx.Input("ReserveSpace"); + + const auto &in_dims = x->dims(); + + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_z = ctx.Output(framework::GradVarName("Z")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_z->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE_EQ( + d_scale && d_bias, true, + platform::errors::PreconditionNotMet( + "Both the scale grad and the bias grad must not be null.")); + d_scale->mutable_data>(ctx.GetPlace()); + d_bias->mutable_data>(ctx.GetPlace()); + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL, + platform::errors::PreconditionNotMet( + "The scale only has one dimension.")); + PADDLE_ENFORCE_EQ( + scale->dims()[0], C, + platform::errors::PreconditionNotMet( + "The size of scale is equal to the channel of Input(X).")); + + auto &dev_ctx = ctx.template device_context(); + + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, + data_desc_, mode_)); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const auto *saved_mean_data = + saved_mean->template data>(); + const auto *saved_var_data = + saved_var->template data>(); + + size_t workspace_size = 0; + void *workspace_ptr = nullptr; + Tensor workspace_tensor; + auto reserve_space_size = reserve_space->memory_size(); + cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; + platform::ScopedActivationDescriptor scope_act_desc; + cudnnActivationDescriptor_t activation_desc_ = + scope_act_desc.descriptor(act_type); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( + /*handle=*/dev_ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*xDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*dyDesc=*/data_desc_, + /*dzDesc=*/data_desc_, + /*dxDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/activation_desc_, + /*sizeInBytes=*/&workspace_size)); + + workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), + workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnBatchNormalizationBackwardEx( + /*handle=*/dev_ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*alphaDataDiff=*/CudnnDataType::kOne(), + /*betaDataDiff=*/CudnnDataType::kZero(), + /*alphaParamDiff=*/CudnnDataType::kOne(), + /*betaParamDiff=*/CudnnDataType::kZero(), + /*xDesc=*/data_desc_, + /*xData=*/x->template data(), + /*yDesc=*/data_desc_, + /*yData=*/y->template data(), + /*dyDesc=*/data_desc_, + /*dyData=*/d_y->template data(), + /*dzDesc=*/data_desc_, + /*dzData=*/d_z->template data(), + /*dxDesc=*/data_desc_, + /*dxData=*/d_x->template data(), + /*dBnScaleBiasDesc=*/bn_param_desc_, + /*bnScaleData=*/scale->template data>(), + /*bnBiasData=*/bias->template data>(), + /*dBnScaleData=*/d_scale->template data>(), + /*dBnBiasData=*/d_bias->template data>(), + /*epsilon=*/epsilon, + /*savedMean=*/saved_mean_data, + /*savedInvVariance=*/saved_var_data, + /*activationDesmc=*/activation_desc_, + /*workspace=*/workspace_ptr, + /*workSpaceSizeInBytes=*/workspace_size, + /*reserveSpace=*/const_cast(reserve_space->template data()), + /*reserveSpaceSizeInBytes=*/reserve_space_size)); + + // clean when exit. + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +} // namespace operators +} // namespace paddle + +#if CUDNN_VERSION >= 7401 +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + fused_bn_add_activation, + ops::FusedBatchNormAddActKernel); +REGISTER_OP_CUDA_KERNEL(fused_bn_add_activation_grad, + ops::FusedBatchNormAddActGradKernel< + plat::CUDADeviceContext, plat::float16>); +#endif diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h new file mode 100644 index 00000000000..5c7df96e60d --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h @@ -0,0 +1,106 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class FusedBatchNormAddActOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusedBatchNormAddActGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusedBatchNormAddActOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +template +class FusedBatchNormAddActGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Z", this->Input("Z")); + op->SetInput("Y", this->Output("Y")); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + + op->SetInput("Scale", this->Input("Scale")); + op->SetInput("Bias", this->Input("Bias")); + op->SetInput("SavedMean", this->Output("SavedMean")); + op->SetInput("SavedVariance", this->Output("SavedVariance")); + op->SetInput("ReserveSpace", this->Output("ReserveSpace")); + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Z"), this->InputGrad("Z")); + op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale")); + op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); + } +}; + +class FusedBatchNormAddActOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map& GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Y"}}; + return m; + } +}; + +template +class FusedBatchNormAddActKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +template +class FusedBatchNormAddActGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 7b564b3f837..ac6493b1c29 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -45,6 +45,7 @@ from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from paddle.fluid import core +from paddle.fluid.param_attr import ParamAttr from paddle.fluid.entry_attr import ProbabilityEntry, CountFilterEntry from paddle.fluid.framework import Variable, convert_np_dtype_to_dtype_ @@ -57,7 +58,7 @@ __all__ = [ 'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat', 'sparse_embedding', 'partial_sum', 'tdm_child', 'rank_attention', 'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice', - 'correlation' + 'correlation', 'fused_bn_add_act' ] @@ -1625,3 +1626,191 @@ def correlation(x, }, outputs={"Output": output}) return output + + +def fused_bn_add_act(x, + y, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + moving_mean_name=None, + moving_variance_name=None, + act=None, + name=None): + """ + This Op performs batch norm on input x, and adds the result to input y. Then + it performs activation on the sum. The data format of inputs must be NHWC + `[batch, in_height, in_width, in_channels]`. + + Args: + x(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type + is float16. + y(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type + is float16. + momentum(float|Tensor, optional): The value used for the moving_mean and + moving_var computation. This should be a float number or a tensor with + shape [1] and data type as float32. The updated formula is: + :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)` + :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)` + Default is 0.9. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. + If the Initializer of the param_attr is not set, the parameter is initialized + with Xavier. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + Default: None. + moving_mean_name(str, optional): The name of moving_mean which store the global Mean. If it + is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm + will save global mean with the string. + moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. + If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm + will save global variance with the string. + act(string, optional): Activation type, linear|relu|prelu|... + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + + def build_program(main_program, startup_program): + with fluid.program_guard(main_program, startup_program): + x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') + y = fluid.layers.data(name="y", shape=[1], dtype='int64') + conv1_1 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + bias_attr=False, + data_format='NHWC') + conv1_2 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + bias_attr=False, + data_format='NHWC') + bn = fluid.layers.batch_norm( + input=conv1_1, + act=None, + data_layout='NHWC') + fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(conv1_2, bn) + prediction = fluid.layers.fc(input=fused_bn_add_act, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=y) + loss = fluid.layers.mean(loss) + sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = fluid.contrib.mixed_precision.decorate( + sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) + sgd.minimize(loss) + + return x, y, loss + + iters = 5 + batch_size = 16 + support_gpu = fluid.is_compiled_with_cuda() + if support_gpu: + main_program = fluid.Program() + startup_program = fluid.Program() + place = fluid.CUDAPlace(0) + x, y, loss = build_program(main_program, startup_program) + + feeder = fluid.DataFeeder(feed_list=[x, y], place=place) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + exe = fluid.Executor(place) + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe.run(startup_program) + for _ in range(iters): + data = next(train_reader()) + loss_v = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss]) + """ + helper = LayerHelper('fused_bn_add_act', **locals()) + + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'fused_bn_add_act') + check_variable_and_dtype(y, 'input', ['float16', 'float32', 'float64'], + 'fused_bn_add_act') + bn_param_dtype = core.VarDesc.VarType.FP32 + + x_shape = x.shape + channel_num = x_shape[-1] + param_shape = [channel_num] + + # create parameter + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=bn_param_dtype, + default_initializer=Constant(1.0)) + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=param_shape, + dtype=bn_param_dtype, + is_bias=True) + mean = helper.create_parameter( + attr=ParamAttr( + name=moving_mean_name, initializer=Constant(0.0), trainable=False), + shape=param_shape, + dtype=bn_param_dtype) + mean.stop_gradient = True + variance = helper.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False), + shape=param_shape, + dtype=bn_param_dtype) + variance.stop_gradient = True + + # create output + # mean and mean_out share the same memory + mean_out = mean + # variance and variance out share the same memory + variance_out = variance + saved_mean = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + reserve_space = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.FP16, stop_gradient=True) + batch_norm_out = helper.create_variable_for_type_inference( + core.VarDesc.VarType.FP16) + + inputs = { + "X": x, + "Z": y, + "Scale": scale, + "Bias": bias, + } + attrs = {"epsilon": epsilon, 'momentum': momentum} + + outputs = { + "Y": batch_norm_out, + "MeanOut": mean_out, + "VarianceOut": variance_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance, + "ReserveSpace": reserve_space + } + + helper.append_op( + type="fused_bn_add_activation", + inputs=inputs, + outputs=outputs, + attrs=attrs) + + return batch_norm_out diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 1f301b7148d..a9f080c514d 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -135,6 +135,7 @@ gray_list = { 'get_tensor_from_selected_rows', 'sign', 'cast', + 'fused_bn_add_activation', } ''' # The set of ops that don't support fp16 calculation diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 0b142ff33de..0ff166d8dc8 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -69,8 +69,10 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): ] for in_name in op.input_names: - if src_dtype == core.VarDesc.VarType.FP32 and op.type == 'batch_norm': - if in_name != 'X': + if src_dtype == core.VarDesc.VarType.FP32 and op.type in [ + 'batch_norm', 'fused_bn_add_activation' + ]: + if in_name not in {'X', 'Z'}: continue for in_var_name in op.input(in_name): in_var = block.var(in_var_name) @@ -102,7 +104,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): op._set_attr('in_dtype', dest_dtype) if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16: for out_name in op.output_names: - if op.type == 'batch_norm' and out_name != 'Y': + if op.type in ['batch_norm', 'fused_bn_add_activation' + ] and out_name != 'Y': continue for out_var_name in op.output(out_name): out_var = block.var(out_var_name) diff --git a/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py new file mode 100644 index 00000000000..1bc305cd1f4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py @@ -0,0 +1,215 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "Paddle core is not compiled with CUDA") +class TestFusedBnAddActAPI(unittest.TestCase): + def setUp(self): + self.conv_param_attr1 = fluid.ParamAttr( + name='conv2d_1.weight', + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + self.conv_param_attr2 = fluid.ParamAttr( + name='conv2d_2.weight', + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + self.bn_param_attr1 = fluid.ParamAttr( + name='batch_norm_w_1', + initializer=fluid.initializer.Constant(value=1.0)) + self.bn_bias_attr1 = fluid.ParamAttr( + name='batch_norm_b_1', + initializer=fluid.initializer.Constant(value=0.0)) + self.bn_param_attr2 = fluid.ParamAttr( + name='batch_norm_w_2', + initializer=fluid.initializer.Constant(value=1.0)) + self.bn_bias_attr2 = fluid.ParamAttr( + name='batch_norm_b_2', + initializer=fluid.initializer.Constant(value=0.0)) + self.fc_param_attr = fluid.ParamAttr( + name='fc.weight', + initializer=fluid.initializer.Xavier(uniform=False)) + + def build_fused_program(self, + main_program, + startup_program, + use_cuda, + seed=1): + with fluid.program_guard(main_program, startup_program): + x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') + y = fluid.layers.data(name="y", shape=[1], dtype='int64') + conv1_1 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr1, + bias_attr=False, + data_format='NHWC') + conv1_2 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr2, + bias_attr=False, + data_format='NHWC') + bn = fluid.layers.batch_norm( + input=conv1_1, + param_attr=self.bn_param_attr1, + bias_attr=self.bn_bias_attr1, + act=None, + data_layout='NHWC') + fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act( + conv1_2, + bn, + param_attr=self.bn_param_attr2, + bias_attr=self.bn_bias_attr2) + prediction = fluid.layers.fc(input=fused_bn_add_act, + size=10, + act='softmax', + param_attr=self.fc_param_attr) + loss = fluid.layers.cross_entropy(input=prediction, label=y) + loss = fluid.layers.mean(loss) + sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = fluid.contrib.mixed_precision.decorate( + sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) + sgd.minimize(loss) + + return x, y, loss + + def build_origin_program(self, + main_program, + startup_program, + use_cuda, + seed=1): + with fluid.program_guard(main_program, startup_program): + x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') + y = fluid.layers.data(name="y", shape=[1], dtype='int64') + conv1_1 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr1, + bias_attr=False, + data_format='NHWC') + conv1_2 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr2, + bias_attr=False, + data_format='NHWC') + bn1 = fluid.layers.batch_norm( + input=conv1_1, + param_attr=self.bn_param_attr1, + bias_attr=self.bn_bias_attr1, + act=None, + data_layout='NHWC') + bn2 = fluid.layers.batch_norm( + input=conv1_2, + param_attr=self.bn_param_attr2, + bias_attr=self.bn_bias_attr2, + act=None, + data_layout='NHWC') + out = bn1 + bn2 + out = fluid.layers.relu(out) + prediction = fluid.layers.fc(input=out, + size=10, + act='softmax', + param_attr=self.fc_param_attr) + loss = fluid.layers.cross_entropy(input=prediction, label=y) + loss = fluid.layers.mean(loss) + sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = fluid.contrib.mixed_precision.decorate( + sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) + sgd.minimize(loss) + + return x, y, loss + + def check(self, place, use_cuda): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) + iters = 5 + batch_size = 16 + + # build_fused_program + main_program = fluid.Program() + startup_program = fluid.Program() + x, y, loss = self.build_fused_program(main_program, startup_program, + use_cuda) + feeder = fluid.DataFeeder(feed_list=[x, y], place=place) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + exe = fluid.Executor(place) + loss_vals_fused = [] + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe.run(startup_program) + for _ in range(iters): + data = next(train_reader()) + loss_v = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss]) + loss_vals_fused.append(loss_v[0][0]) + + # build_origin_program + main_program = fluid.Program() + startup_program = fluid.Program() + x, y, loss = self.build_origin_program(main_program, startup_program, + use_cuda) + feeder = fluid.DataFeeder(feed_list=[x, y], place=place) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + loss_vals = [] + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe.run(startup_program) + for _ in range(iters): + data = next(train_reader()) + loss_v = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss]) + loss_vals.append(loss_v[0][0]) + + # check loss + for i in range(iters): + self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5) + + def test_fuse_bn_add_act(self): + place = fluid.CUDAPlace(0) + self.check(place, use_cuda=True) + + +if __name__ == '__main__': + unittest.main() -- GitLab