Call new cudnn batch norm API regardless of data type and data layout (#30157)

666efc23 · AshburnLee · GitHub · 5c8455d6 · 666efc23 · 666efc23
7 changed files
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -114,7 +114,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
                 << "CUDNN_BN_MIN_EPSILON instead.";
    }
    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-#if CUDNN_VERSION_MIN(7, 0, 0)
+#if CUDNN_VERSION_MIN(7, 0, 1)
    if (FLAGS_cudnn_batchnorm_spatial_persistent) {
      mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
    } else {
@@ -122,7 +122,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
    }
 #else
    mode_ = CUDNN_BATCHNORM_SPATIAL;
-#endif
+#endif  // CUDNN_VERSION_MIN(7, 0, 1)
    VLOG(3) << "Setting descriptors.";
    std::vector<int> dims;
@@ -151,7 +151,10 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
    auto handle = dev_ctx.cudnn_handle();
    // Now, depending on whether we are running test or not, we have two paths.
-    if (test_mode || use_global_stats) {
+    // It is training mode when it's not inference (test mode) AND not using
+    // the global statistics of a pre-trained model.
+    bool training = !test_mode && !use_global_stats;
+    if (!training) {
      // only when test we use input to do computation.
      const auto *est_mean = ctx.Input<Tensor>("Mean");
      const auto *est_var = ctx.Input<Tensor>("Variance");
@@ -234,7 +237,6 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
        bool called = false;
 #if CUDNN_VERSION_MIN(7, 4, 1)
-        if (compute_format == DataLayout::kNHWC) {
        called = true;
        size_t workspace_size = 0;
        size_t reserve_space_size = 0;
@@ -281,11 +283,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
            ctx.GetPlace(), transformed_x.type(), workspace_size);
        PADDLE_ENFORCE_CUDA_SUCCESS(
            platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
-                  handle, mode_, CUDNN_BATCHNORM_OPS_BN,
+                handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType<T>::kOne(),
-                  CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
+                CudnnDataType<T>::kZero(), data_desc_,
-                  data_desc_, transformed_x.template data<T>(), nullptr,
+                transformed_x.template data<T>(), nullptr, nullptr, data_desc_,
-                  nullptr, data_desc_, transformed_y.template data<T>(),
+                transformed_y.template data<T>(), bn_param_desc_,
-                  bn_param_desc_, scale->template data<BatchNormParamType<T>>(),
+                scale->template data<BatchNormParamType<T>>(),
                bias->template data<BatchNormParamType<T>>(), this_factor,
                mean_out->template mutable_data<BatchNormParamType<T>>(
                    ctx.GetPlace()),
@@ -298,8 +300,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
                    ctx.GetPlace()),
                nullptr, workspace_ptr, workspace_size, reserve_space_ptr,
                reserve_space_size));
-        }
+#endif  // CUDNN_VERSION_MIN(7, 4, 1)
-#endif
        if (!called) {
          PADDLE_ENFORCE_CUDA_SUCCESS(
              platform::dynload::cudnnBatchNormalizationForwardTraining(
@@ -640,7 +641,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
                   << "CUDNN_BN_MIN_EPSILON instead.";
      }
      epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
-#if CUDNN_VERSION_MIN(7, 0, 0)
+#if CUDNN_VERSION_MIN(7, 0, 1)
      if (FLAGS_cudnn_batchnorm_spatial_persistent) {
        mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
      } else {
@@ -648,7 +649,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
      }
 #else
      mode_ = CUDNN_BATCHNORM_SPATIAL;
-#endif
+#endif  // CUDNN_VERSION_MIN(7, 0, 1)
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
          data_desc_, CudnnDataType<T>::type,
@@ -672,10 +673,10 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
                        num, transformed_x.data<T>(), grid2, block, stream);
      }
+      // This branch calls CUDNN APIs
      if (d_scale && d_bias) {
        bool called = false;
 #if CUDNN_VERSION_MIN(7, 4, 1)
-        if (compute_format == DataLayout::kNHWC) {
        called = true;
        size_t workspace_size = 0;
        void *workspace_ptr = nullptr;
@@ -738,8 +739,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
                /*reserveSpace=*/const_cast<T *>(
                    reserve_space->template data<T>()),
                /*reserveSpaceSizeInBytes=*/reserve_space_size));
-        }
+#endif  // CUDNN_VERSION_MIN(7, 4, 1)
-#endif
        if (!called) {
          PADDLE_ENFORCE_CUDA_SUCCESS(
              platform::dynload::cudnnBatchNormalizationBackward(
@@ -764,6 +764,7 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
              ctx, &transformed_d_x, d_x);
        }
      } else {
+        // This branch calls CUDA kernels
        if (compute_format == DataLayout::kNCHW) {
          if (d_x) {
            BNBackwardData<T, block, framework::DataLayout::kNCHW><<<

--- a/paddle/fluid/operators/inplace_abn_op.cc
+++ b/paddle/fluid/operators/inplace_abn_op.cc
@@ -178,6 +178,9 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> {
    op->SetInput("Bias", this->Input("Bias"));
    op->SetInput("SavedMean", this->Output("SavedMean"));
    op->SetInput("SavedVariance", this->Output("SavedVariance"));
+    if (this->HasOutput("ReserveSpace")) {
+      op->SetInput("ReserveSpace", this->Output("ReserveSpace"));
+    }
    // used when setting use_global_stats True during training
    if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) {

--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -1309,12 +1309,6 @@ class BatchNorm(layers.Layer):
            dtype=self._dtype)
        self._variance.stop_gradient = True
-        self._has_reserve_space = False
-        if data_layout == 'NHWC':
-            flag = os.environ.get('FLAGS_cudnn_batchnorm_spatial_persistent')
-            if flag is not None and flag.lower() in ['true', '1']:
-                self._has_reserve_space = True
        self._in_place = in_place
        self._data_layout = data_layout
        self._momentum = momentum
@@ -1341,7 +1335,6 @@ class BatchNorm(layers.Layer):
            batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
                input, self.weight, self.bias, self._mean, self._variance,
                mean_out, variance_out, *attrs)
            return dygraph_utils._append_activation_in_dygraph(
                batch_norm_out, act=self._act, use_mkldnn=self._use_mkldnn)
@@ -1371,11 +1364,8 @@ class BatchNorm(layers.Layer):
            dtype=self._dtype, stop_gradient=True)
        saved_variance = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True)
-        reserve_space = None
-        if self._has_reserve_space:
        reserve_space = self._helper.create_variable_for_type_inference(
-                dtype=core.VarDesc.VarType.FP16, stop_gradient=True)
+            dtype=self._helper.input_dtype(input), stop_gradient=True)
        batch_norm_out = input if self._in_place else self._helper.create_variable_for_type_inference(
            self._dtype)
@@ -1388,7 +1378,7 @@ class BatchNorm(layers.Layer):
            "SavedVariance": [saved_variance]
        }
        if reserve_space is not None:
-            outputs["ReserveSpace"] = reserve_space
+            outputs["ReserveSpace"] = [reserve_space]
        self._helper.append_op(
            type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2792,12 +2792,6 @@ def batch_norm(input,
                             'batch_norm')
    dtype = helper.input_dtype()
-    has_reserve_space = False
-    if data_layout == 'NHWC':
-        flag = os.environ.get('FLAGS_cudnn_batchnorm_spatial_persistent')
-        if flag is not None and flag.lower() in ['true', '1']:
-            has_reserve_space = True
    # use fp32 for bn parameter
    if dtype == core.VarDesc.VarType.FP16:
        dtype = core.VarDesc.VarType.FP32
@@ -2845,17 +2839,16 @@ def batch_norm(input,
    # create output
    # mean and mean_out share the same memory
    mean_out = mean
-    # variance and variance out share the same memory
+    # variance and variance_out share the same memory
    variance_out = variance
    saved_mean = helper.create_variable_for_type_inference(
        dtype=dtype, stop_gradient=True)
    saved_variance = helper.create_variable_for_type_inference(
        dtype=dtype, stop_gradient=True)
    reserve_space = None
-    if has_reserve_space:
+    if not is_test:
        reserve_space = helper.create_variable_for_type_inference(
-            dtype=core.VarDesc.VarType.FP16, stop_gradient=True)
+            dtype=helper.input_dtype(), stop_gradient=True)
    batch_norm_out = input if in_place else \
            helper.create_variable_for_type_inference(dtype)
@@ -2998,12 +2991,6 @@ def inplace_abn(input,
                             'inplace_abn')
    dtype = helper.input_dtype()
-    has_reserve_space = False
-    if data_layout == 'NHWC':
-        flag = os.environ.get('FLAGS_cudnn_batchnorm_spatial_persistent')
-        if flag is not None and flag.lower() in ['true', '1']:
-            has_reserve_space = True
    input_shape = input.shape
    if data_layout == 'NCHW':
        channel_num = input_shape[1]
@@ -3053,12 +3040,8 @@ def inplace_abn(input,
        dtype=dtype, stop_gradient=True)
    saved_variance = helper.create_variable_for_type_inference(
        dtype=dtype, stop_gradient=True)
-    reserve_space = None
-    if has_reserve_space:
    reserve_space = helper.create_variable_for_type_inference(
-            dtype=core.VarDesc.VarType.FP16, stop_gradient=True)
+        dtype=dtype, stop_gradient=True)
    batch_norm_out = input
    inputs = {
@@ -3082,7 +3065,6 @@ def inplace_abn(input,
        inputs['MomemtumTensor'] = momentum
    else:
        attrs['momentum'] = momentum
    outputs = {
        "Y": batch_norm_out,
        "MeanOut": mean_out,

--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -440,16 +440,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
                    "SavedMean": block.var('saved_mean'),
                    "SavedVariance": block.var('saved_variance')
                }
-                has_reserve_space = False
+                block.create_var(name="reserve_space", dtype='float32')
-                if data_format == 'NHWC':
-                    flag = os.environ.get(
-                        'FLAGS_cudnn_batchnorm_spatial_persistent')
-                    if flag is not None and flag.lower() in ['true', '1']:
-                        has_reserve_space = True
-                if has_reserve_space:
-                    block.create_var(name="reserve_space", dtype='float16')
                outputs["ReserveSpace"] = block.var('reserve_space')
-                    del os.environ['FLAGS_cudnn_batchnorm_spatial_persistent']
                bn_op = block.append_op(
                    type="batch_norm",
                    inputs=inputs,

--- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py
@@ -122,7 +122,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
-                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
+                'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD'
            ]
            fetch_names += others
        bn_fetches = exe.run(program=main,
@@ -142,7 +142,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
        if not only_forward:
            others = [
                'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
-                'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
+                'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD'
            ]
            fetch_names += others
        for nm in fetch_names:

--- a/python/paddle/nn/functional/norm.py
+++ b/python/paddle/nn/functional/norm.py
@@ -166,7 +166,6 @@ def batch_norm(x,
          batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b)
          print(batch_norm_out)
    """
    assert len(x.shape) >= 2, "input dim must be larger than 1"
    # input and out must share the memory
@@ -196,7 +195,6 @@ def batch_norm(x,
        batch_norm_out, _, _, _, _, _ = core.ops.batch_norm(
            x, weight, bias, running_mean, running_var, mean_out, variance_out,
            *attrs)
        return dygraph_utils._append_activation_in_dygraph(
            batch_norm_out, act=None)
@@ -230,13 +228,16 @@ def batch_norm(x,
    saved_variance = helper.create_variable_for_type_inference(
        dtype=dtype, stop_gradient=True)
    batch_norm_out = helper.create_variable_for_type_inference(dtype)
+    reserve_space = helper.create_variable_for_type_inference(
+        dtype=x.dtype, stop_gradient=True)
    outputs = {
        "Y": [batch_norm_out],
        "MeanOut": [running_mean],
        "VarianceOut": [running_var],
        "SavedMean": [saved_mean],
-        "SavedVariance": [saved_variance]
+        "SavedVariance": [saved_variance],
+        "ReserveSpace": [reserve_space]
    }
    helper.append_op(