From 10fbb831edd0225d34639b5de476453a5ed0c1e0 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Tue, 10 Jul 2018 13:07:43 +0800 Subject: [PATCH] Skip BatchNorm when feature only has 1 element. (#11578) * Fix batch norm when only 1 elements in normzalize dimension during training. --- paddle/fluid/operators/batch_norm_op.cc | 21 ++++- paddle/fluid/operators/batch_norm_op.cu.cc | 77 +++++++++++-------- paddle/fluid/operators/cross_entropy_op.cc | 3 +- .../unittests/test_fake_dequantize_op.py | 1 - .../fluid/tests/unittests/test_parallel_op.py | 4 +- 5 files changed, 66 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 693bf973c2b..5912a1a17cb 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -216,6 +216,18 @@ class BatchNormKernel saved_mean_e.setZero(); saved_variance_e.setZero(); + EigenVectorArrayMap running_mean_arr( + mean_out->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap running_var_arr( + variance_out->mutable_data(ctx.GetPlace()), C); + + if ((N * sample_size) == 1) { + LOG(WARNING) << "Only 1 element in normalization dimension, " + << "we skip the batch norm calculation, let y = x."; + framework::TensorCopySync(*x, ctx.GetPlace(), y); + return; + } + switch (data_layout) { case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); @@ -247,10 +259,6 @@ class BatchNormKernel PADDLE_THROW("Unknown storage order: %s", data_layout_str); } - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), C); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), C); running_mean_arr = running_mean_arr * momentum + saved_mean_e * (1. - momentum); running_var_arr = @@ -427,6 +435,11 @@ class BatchNormGradKernel d_bias_arr.setZero(); d_scale_arr.setZero(); + if ((N * sample_size) == 1) { + framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x); + return; + } + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); switch (data_layout) { diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc index 550dd32d367..ca6cd866935 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu.cc @@ -72,6 +72,9 @@ class BatchNormKernel int N, C, H, W, D; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + auto *y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + // ------------------- cudnn descriptors --------------------- cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -93,7 +96,7 @@ class BatchNormKernel mode_ = CUDNN_BATCHNORM_SPATIAL; #endif - VLOG(1) << "Setting descriptors."; + VLOG(3) << "Setting descriptors."; std::vector dims; std::vector strides; if (data_layout == DataLayout::kNCHW) { @@ -113,11 +116,6 @@ class BatchNormKernel const auto *scale = ctx.Input("Scale"); const auto *bias = ctx.Input("Bias"); - auto *y = ctx.Output("Y"); - - // alloc memory - y->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); @@ -162,22 +160,28 @@ class BatchNormKernel functor(dev_ctx, saved_mean, static_cast>(0)); functor(dev_ctx, saved_variance, static_cast>(0)); - double this_factor = 1. - momentum; - - CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), - data_desc_, x->template data(), data_desc_, - y->template mutable_data(ctx.GetPlace()), bn_param_desc_, - scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); + if ((N * H * W * D) == 1) { + LOG(WARNING) << "Only 1 element in normalization dimension, " + << "we skip the batch norm calculation, let y = x."; + framework::TensorCopySync(*x, ctx.GetPlace(), y); + } else { + double this_factor = 1. - momentum; + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), + data_desc_, x->template data(), data_desc_, + y->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data>(), + bias->template data>(), this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()))); + } } // clean when exit. @@ -209,6 +213,25 @@ class BatchNormGradKernel int N, C, H, W, D; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); + if ((N * H * W * D) == 1) { + framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x); + math::SetConstant> + functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); PADDLE_ENFORCE_EQ(scale->dims()[0], C); @@ -247,21 +270,11 @@ class BatchNormGradKernel CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( bn_param_desc_, data_desc_, mode_)); - // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - d_x->mutable_data(ctx.GetPlace()); - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); const void *saved_mean_data = saved_mean->template data(); const void *saved_var_data = saved_var->template data(); - auto &dev_ctx = ctx.template device_context(); CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), CudnnDataType::kOne(), diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index d5e095f9cad..a3bec3da451 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -124,8 +124,7 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "Tensor with shape [N x D]."); AddOutput("Y", "(Tensor, default Tensor), a 2-D tensor with shape " - "[N x 1]. The cross entropy loss.") - .Reuse("X"); + "[N x 1]. The cross entropy loss."); AddAttr("soft_label", "(bool, default false), a flag indicating whether to " "interpretate the given labels as soft labels.") diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py index 281068e945e..026ac2112b2 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py @@ -40,7 +40,6 @@ class TestFakeDequantizeMaxAbsOp(OpTest): self.op_type = "fake_dequantize_max_abs" x = np.random.randn(31, 65).astype("float32") yq, scale = quantize_max_abs(x, self.num_bits) - print 'scale ', scale ydq = dequantize_max_abs(yq, self.num_bits, scale) self.inputs = {'X': yq} diff --git a/python/paddle/fluid/tests/unittests/test_parallel_op.py b/python/paddle/fluid/tests/unittests/test_parallel_op.py index 79bea148f93..9ba5f988f31 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_op.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py @@ -113,7 +113,9 @@ class BaseParallelForTest(unittest.TestCase): generator = callback() # Automatically insert parallel do if use_parallel = True if use_parallel: - places = fluid.layers.get_places() + thread_num = fluid.core.get_cuda_device_count( + ) if use_gpu else 8 + places = fluid.layers.get_places(thread_num) pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl) data = next(generator) -- GitLab