diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 7400f45e0592687b6169cebec3a8ef31a6c7a8b8..f43fcb2d9f8e7892866f26f96fff21fdf4c2464f 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -356,6 +356,16 @@ class CompiledProgram(object):
         if self._build_strategy.sync_batch_norm:
             self._build_strategy.enable_sequential_execution = True
 
+        if self._program is not None and self._program._enable_dgc:
+            assert use_cuda, "DGC only used under cuda"
+            assert self._build_strategy.num_trainers * len(
+                places) > 1, "DGC is not useful for single card training"
+            assert self._build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce, "DGC \
+                only used for AllReduce BuildStrategy"
+
+            # DGC doesn't support fuse for now, close fuse.
+            self._build_strategy.fuse_all_reduce_ops = False
+
         self._persistable_vars = []
         for node in self._graph.nodes():
             if node.is_var() and node.var() is not None and node.var().persistable() and \
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 179bac78ff8d1f684cb3ec59e9420be1b6e23ffa..58380cf8e1440cab766f8a88c33341e22a0300ca 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -175,15 +175,6 @@ class ParallelExecutor(object):
         ) if use_cuda else framework.cpu_places()
         self._scope = scope if scope is not None else executor.global_scope()
 
-        if main_program is not None and main_program._enable_dgc:
-            assert build_strategy.num_trainers > 1, "dgc is not useful when num_trainers <= 1"
-            assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce, "dgc \
-                only used for allreduce"
-
-            assert build_strategy.num_trainers * len(
-                self._places) > 1, "dgc is not useful for single card training"
-            assert use_cuda, "dgc only used under cuda"
-
         main_program = main_program if main_program is not None \
             else framework.default_main_program()
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index 4288a6c52af1297e00a49f03d9b06998d52d4798..ac0713d65ecc4477c9f50b386e8dbf739208729f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -334,10 +334,6 @@ class TestDistRunnerBase(object):
             build_stra.num_trainers = 1
             build_stra.trainer_id = 0
 
-        if args.use_dgc:
-            # fuse_all_reduce_ops require that gradients should not be sparse types
-            build_stra.fuse_all_reduce_ops = False
-
         print_to_err(type(self).__name__, "begin to compile with data parallel")
         binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
             loss_name=avg_cost.name,