diff --git a/PaddleCV/image_classification/models/se_resnext.py b/PaddleCV/image_classification/models/se_resnext.py
index 697c551e147bed481c0aa88682b0866716d59f93..a6d0d635ac46893f41858b7be9c2d1e5a903724b 100644
--- a/PaddleCV/image_classification/models/se_resnext.py
+++ b/PaddleCV/image_classification/models/se_resnext.py
@@ -69,7 +69,8 @@ class SE_ResNeXt():
                 pool_size=3,
                 pool_stride=2,
                 pool_padding=1,
-                pool_type='max')
+                pool_type='max',
+                use_cudnn=False)
         elif layers == 101:
             cardinality = 32
             reduction_ratio = 16
@@ -88,7 +89,8 @@ class SE_ResNeXt():
                 pool_size=3,
                 pool_stride=2,
                 pool_padding=1,
-                pool_type='max')
+                pool_type='max',
+                use_cudnn=False)
         elif layers == 152:
             cardinality = 64
             reduction_ratio = 16
@@ -118,7 +120,7 @@ class SE_ResNeXt():
                 name='conv3')
             conv = fluid.layers.pool2d(
                 input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
-                pool_type='max')
+                pool_type='max', use_cudnn=False)
         n = 1 if layers == 50 or layers == 101 else 3
         for block in range(len(depth)):
             n += 1
@@ -132,7 +134,11 @@ class SE_ResNeXt():
                     name=str(n) + '_' + str(i + 1))
 
         pool = fluid.layers.pool2d(
-            input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+            input=conv,
+            pool_size=7,
+            pool_type='avg',
+            global_pooling=True,
+            use_cudnn=False)
         drop = fluid.layers.dropout(
             x=pool, dropout_prob=0.5, seed=self.params['dropout_seed'])
         stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
@@ -224,7 +230,11 @@ class SE_ResNeXt():
                           reduction_ratio,
                           name=None):
         pool = fluid.layers.pool2d(
-            input=input, pool_size=0, pool_type='avg', global_pooling=True)
+            input=input,
+            pool_size=0,
+            pool_type='avg',
+            global_pooling=True,
+            use_cudnn=False)
         stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
         squeeze = fluid.layers.fc(
             input=pool,
diff --git a/PaddleCV/image_classification/train.py b/PaddleCV/image_classification/train.py
index cb33fd72c0fa4194132bfaed5cedc159e3963850..3c929e5fa70e9124d137528d7a60b3a92e0cd196 100644
--- a/PaddleCV/image_classification/train.py
+++ b/PaddleCV/image_classification/train.py
@@ -22,10 +22,23 @@ import time
 import sys
 import functools
 import math
+
+def set_paddle_flags(flags):
+    for key, value in flags.items():
+        if os.environ.get(key, None) is None:
+            os.environ[key] = str(value)
+
+
+# NOTE(paddle-dev): All of these flags should be
+# set before `import paddle`. Otherwise, they
+# will have no effect.
+set_paddle_flags({
+    'FLAGS_eager_delete_tensor_gb': 0,  # enable gc
+    'FLAGS_fraction_of_gpu_memory_to_use': 0.98
+})
 import argparse
 import functools
 import subprocess
-
 import paddle
 import paddle.fluid as fluid
 import paddle.dataset.flowers as flowers
@@ -50,6 +63,7 @@ add_arg('class_dim', int, 1000, "Class number.")
 add_arg('image_shape', str, "3,224,224", "input image size")
 add_arg('model_save_dir', str, "output", "model save directory")
 add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
+add_arg('with_inplace', bool, True, "Whether to use inplace memory optimization.")
 add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
 add_arg('checkpoint', str, None, "Whether to resume checkpoint.")
 add_arg('lr', float, 0.1, "set learning rate.")
@@ -412,10 +426,20 @@ def train(args):
     # use_ngraph is for CPU only, please refer to README_ngraph.md for details
     use_ngraph = os.getenv('FLAGS_use_ngraph')
     if not use_ngraph:
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.memory_optimize = args.with_mem_opt
+        build_strategy.enable_inplace = args.with_inplace
+        build_strategy.fuse_all_reduce_ops = True
+
+        exec_strategy = fluid.ExecutionStrategy()
+        exec_strategy.num_iteration_per_drop_scope = 10
+
         train_exe = fluid.ParallelExecutor(
             main_program=train_prog,
             use_cuda=bool(args.use_gpu),
-            loss_name=train_cost.name)
+            loss_name=train_cost.name,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy)
     else:
         train_exe = exe
 
@@ -429,6 +453,7 @@ def train(args):
     test_info = [[], [], []]
     train_time = []
     batch_id = 0
+    time_record = []
     try:
         while True:
             t1 = time.time()
@@ -450,6 +475,7 @@ def train(args):
 
             t2 = time.time()
             period = t2 - t1
+            time_record.append(period)
             loss = np.mean(np.array(loss))
             train_info[0].append(loss)
 
@@ -457,6 +483,8 @@ def train(args):
             train_time.append(period)
 
             if batch_id % 10 == 0:
+                period = np.mean(time_record)
+                time_record = []
                 if use_mixup:
                     print("Pass {0}, trainbatch {1}, loss {2}, lr {3}, time {4}"
                           .format(pass_id, batch_id, "%.5f"%loss, "%.5f" %lr, "%2.2f sec" % period))
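
For readers porting this change to another Fluid-era training script, the sketch below condenses the two patterns the patch combines: exporting `FLAGS_*` environment variables before `paddle` is imported, and passing a `BuildStrategy`/`ExecutionStrategy` pair into `fluid.ParallelExecutor`. It is a minimal illustration against the 1.x `paddle.fluid` API used in this repo; the helper `build_parallel_executor` and its parameters are illustrative names, not part of the patch.

```python
import os


def set_paddle_flags(flags):
    # Export each FLAGS_* variable unless the user already set it,
    # so values supplied in the environment still take precedence.
    for key, value in flags.items():
        if os.environ.get(key, None) is None:
            os.environ[key] = str(value)


# The flags are read when the paddle core is first loaded, so they
# must be in the environment before `import paddle` runs.
set_paddle_flags({
    'FLAGS_eager_delete_tensor_gb': 0,           # enable garbage collection
    'FLAGS_fraction_of_gpu_memory_to_use': 0.98  # pre-allocate most GPU memory
})

import paddle.fluid as fluid


def build_parallel_executor(train_prog, loss, use_gpu=True,
                            with_mem_opt=True, with_inplace=True):
    # Same strategy wiring as the patch: the legacy memory-optimize pass
    # and in-place operator reuse are both toggled through BuildStrategy.
    build_strategy = fluid.BuildStrategy()
    build_strategy.memory_optimize = with_mem_opt
    build_strategy.enable_inplace = with_inplace
    build_strategy.fuse_all_reduce_ops = True

    exec_strategy = fluid.ExecutionStrategy()
    # Drop local execution scopes every 10 iterations instead of every
    # iteration, trading a little peak memory for less cleanup overhead.
    exec_strategy.num_iteration_per_drop_scope = 10

    return fluid.ParallelExecutor(
        main_program=train_prog,
        use_cuda=use_gpu,
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
```

The defaults mirror the patch's new `with_inplace` and existing `with_mem_opt` arguments. Note that with `num_iteration_per_drop_scope = 10`, the iteration that drops scopes runs slower than its neighbors, which is presumably why the patch logs the mean of `time_record` every 10 batches rather than a single batch's time.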