Unverified commit 6c770503, authored by qingqing01, committed by GitHub

Add trainable_statistics in attr for batch_norm. (#24072) (#24135)

* Add trainable_statistics in attr for batch_norm
* Unifying behavior of dynamic graph and static graph
Parent 839ba08e
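For reference, a minimal dygraph usage sketch of the new attribute, based on the unit tests added in this commit (the shapes and values are illustrative only):

    import numpy as np
    import paddle.fluid as fluid

    x = np.random.randn(4, 10, 4, 4).astype("float32")
    with fluid.dygraph.guard():
        # With trainable_statistics=True, batch_norm computes mean/variance
        # from the current batch even when is_test=True, so eval-mode output
        # matches train-mode output.
        bn = fluid.dygraph.BatchNorm(10, is_test=True, trainable_statistics=True)
        y = bn(fluid.dygraph.to_variable(x)).numpy()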
@@ -33,7 +33,9 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
   OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "BatchNorm");
 
   bool is_test = ctx->Attrs().Get<bool>("is_test");
-  if (!is_test) {
+  bool trainable_stats = ctx->Attrs().Get<bool>("trainable_statistics");
+  bool test_mode = is_test && (!trainable_stats);
+  if (!test_mode) {
     OP_INOUT_CHECK(ctx->HasOutput("MeanOut"), "Output", "MeanOut", "BatchNorm");
     OP_INOUT_CHECK(ctx->HasOutput("VarianceOut"), "Output", "VarianceOut",
                    "BatchNorm");
@@ -258,7 +260,11 @@ void BatchNormOpMaker::Make() {
                 "global mean and variance are also used during train time, "
                 "the BN acts as scaling and shiffting.")
       .SetDefault(false);
+  AddAttr<bool>("trainable_statistics",
+                "(bool, default false) Whether to calculate mean and variance "
+                "in test mode. If set to true in test mode, mean and variance "
+                "will be calculated from the current batch statistics.")
+      .SetDefault(false);
   AddComment(R"DOC(
 Batch Normalization.
@@ -281,8 +287,10 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
     float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
     const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
+    bool test_mode = is_test && (!trainable_stats);
 
-    bool global_stats = is_test || use_global_stats;
+    bool global_stats = test_mode || use_global_stats;
 
     const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
     const DataLayout data_layout =
......
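The CPU, CUDA, and MKL-DNN kernels below derive the same flags from the op attributes; a small Python sketch of that selection logic, mirroring the C++ above for illustration only:

    def resolve_modes(is_test, trainable_statistics, use_global_stats):
        # Statistics are only frozen when is_test is set and the statistics
        # are not marked trainable for the current batch.
        test_mode = is_test and not trainable_statistics
        # global_stats decides whether the stored running mean/variance are
        # used for normalization instead of the batch statistics.
        global_stats = test_mode or use_global_stats
        return test_mode, global_stats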
@@ -47,10 +47,13 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     float momentum = ctx.Attr<float>("momentum");
     const bool is_test = ctx.Attr<bool>("is_test");
     const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
     const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
     const DataLayout data_layout =
         framework::StringToDataLayout(data_layout_str);
 
+    bool test_mode = is_test && (!trainable_stats);
+
     // Get the size for each dimension.
     // NCHW [batch_size, in_channels, in_height, in_width]
     const auto *x = ctx.Input<Tensor>("X");
@@ -66,7 +69,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     auto dtype = platform::CudnnDataType<T>::type;
     const bool fast_nhwc_batch_norm =
-        is_test ||
+        test_mode ||
         (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent);
     auto compute_format =
@@ -133,7 +136,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     PADDLE_ENFORCE_CUDA_SUCCESS(
         platform::dynload::cudnnDeriveBNTensorDescriptor(
             bn_param_desc_, data_desc_,
-            is_test ? CUDNN_BATCHNORM_SPATIAL : mode_));
+            test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_));
 
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *bias = ctx.Input<Tensor>("Bias");
@@ -143,7 +146,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     auto handle = dev_ctx.cudnn_handle();
 
     // Now, depending on whether we are running test or not, we have two paths.
-    if (is_test || use_global_stats) {
+    if (test_mode || use_global_stats) {
       // only when test we use input to do computation.
       const auto *est_mean = ctx.Input<Tensor>("Mean");
       const auto *est_var = ctx.Input<Tensor>("Variance");
......
@@ -120,8 +120,10 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const bool is_test = ctx.Attr<bool>("is_test");
     const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
     const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
+    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
+    bool test_mode = is_test && (!trainable_stats);
 
-    bool global_stats = is_test || use_global_stats;
+    bool global_stats = test_mode || use_global_stats;
 
     auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
@@ -156,7 +158,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto flags = mkldnn::normalization_flags::use_scale_shift;  // 001
     if (global_stats)
       flags |= mkldnn::normalization_flags::use_global_stats;  // 010
-    if (fuse_with_relu && is_test)
+    if (fuse_with_relu && test_mode)
       flags |= mkldnn::normalization_flags::fuse_norm_relu;  // 100
 
     BatchNormMKLDNNHandler<T> handler(
......
@@ -28,6 +28,7 @@ class SyncBatchNormKernel<platform::CUDADeviceContext, T>
     const std::string layout_str = ctx.Attr<std::string>("data_layout");
     const DataLayout layout = framework::StringToDataLayout(layout_str);
     const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
+    const bool trainable_stats = ctx.Attr<bool>("trainable_statistics");
     PADDLE_ENFORCE_EQ(use_global_stats, false,
                       platform::errors::InvalidArgument(
                           "sync_batch_norm doesn't support "
@@ -47,9 +48,10 @@ class SyncBatchNormKernel<platform::CUDADeviceContext, T>
     auto *saved_mean = ctx.Output<Tensor>("SavedMean");
     auto *saved_inv_variance = ctx.Output<Tensor>("SavedVariance");
 
+    bool test_mode = is_test && (!trainable_stats);
     SyncBatchNormFunctor<platform::CUDADeviceContext, T>(
         ctx, layout, x, y, est_mean, est_var, mean_out, variance_out,
-        saved_mean, saved_inv_variance, epsilon, momentum, is_test,
+        saved_mean, saved_inv_variance, epsilon, momentum, test_mode,
         use_global_stats);
   }
 };
......
@@ -1276,12 +1276,12 @@ class BatchNorm(layers.Layer):
         variance_out = self._variance
 
         if in_dygraph_mode():
-            _is_test = not self.training and not self._trainable_statistics
             attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
-                     "is_test", _is_test, "data_layout", self._data_layout,
-                     "use_mkldnn", False, "fuse_with_relu",
-                     self._fuse_with_relu, "use_global_stats",
-                     self._use_global_stats)
+                     "is_test", not self.training, "data_layout",
+                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
+                     self._fuse_with_relu, "use_global_stats",
+                     self._use_global_stats, 'trainable_statistics',
+                     self._trainable_statistics)
             batch_norm_out, _, _, _, _ = core.ops.batch_norm(
                 input, self.weight, self.bias, self._mean, self._variance,
                 mean_out, variance_out, *attrs)
@@ -1298,7 +1298,8 @@ class BatchNorm(layers.Layer):
             "data_layout": self._data_layout,
             "use_mkldnn": False,
             "fuse_with_relu": self._fuse_with_relu,
-            "use_global_stats": self._use_global_stats
+            "use_global_stats": self._use_global_stats,
+            "trainable_statistics": self._trainable_statistics,
         }
 
         inputs = {
......
@@ -2522,16 +2522,7 @@ class Block(object):
         """
         if in_dygraph_mode():
             attrs = kwargs.get("attrs", {})
-            if _dygraph_tracer_._train_mode == False:
-                # eval mode
-                if ('trainable_statistics' not in attrs
-                    ) or not attrs['trainable_statistics']:
-                    attrs['is_test'] = True
-                else:
-                    attrs['is_test'] = False
             type = kwargs.get("type", None)
             op = Operator(
                 block=self,
                 desc=None,
......
@@ -623,5 +623,53 @@ class TestDygraphBatchNormAPIError(unittest.TestCase):
             self.assertRaises(TypeError, batch_norm, x2)
 
 
+class TestDygraphBatchNormTrainableStats(unittest.TestCase):
+    def test_dygraph(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            shape = [4, 10, 4, 4]
+
+            def compute(x, is_test, trainable_statistics):
+                with fluid.dygraph.guard(p):
+                    bn = fluid.dygraph.BatchNorm(
+                        shape[1],
+                        is_test=is_test,
+                        trainable_statistics=trainable_statistics)
+                    y = bn(fluid.dygraph.to_variable(x))
+                return y.numpy()
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute(x, False, False)
+            y2 = compute(x, True, True)
+            self.assertTrue(np.allclose(y1, y2))
+
+    def test_static(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            exe = fluid.Executor(p)
+            shape = [4, 10, 16, 16]
+
+            def compute(x_np, is_test, trainable_statistics):
+                with program_guard(Program(), Program()):
+                    bn = fluid.dygraph.BatchNorm(
+                        shape[1],
+                        is_test=is_test,
+                        trainable_statistics=trainable_statistics)
+                    x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
+                    y = bn(x)
+                    exe.run(fluid.default_startup_program())
+                    r = exe.run(feed={'x': x_np}, fetch_list=[y])[0]
+                return r
+
+            x = np.random.randn(*shape).astype("float32")
+            y1 = compute(x, False, False)
+            y2 = compute(x, True, True)
+            self.assertTrue(np.allclose(y1, y2))
+
+
 if __name__ == '__main__':
     unittest.main()