Commit baf02328 authored by Qiao Longfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-async-ssa-graph-executor-communicator
test=develop
@@ -34,7 +34,7 @@ ExternalProject_Add(
     BUILD_IN_SOURCE  1
 )
-ADD_LIBRARY(dgc SHARED IMPORTED GLOBAL)
+ADD_LIBRARY(dgc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES})
 ADD_DEPENDENCIES(dgc extern_dgc)
......
@@ -95,7 +95,7 @@ paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size',
 paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '37042620f9bd3a2da6e5d3138b2f724b'))
 paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'a194fb80614023f543df3949fbd0d0b8'))
 paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '19ef6f9cdd27feac8a1ae060f19c10b4'))
-paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'f19dd380864e61134ce3814e4be0de4b'))
+paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', '59b1c6bf2f0fa9dc649c85fef3a3b2ea'))
 paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'bbd84e855e660cd1084bb71a2fd0cdaa'))
 paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625'))
 paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95'))
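
The only functional change in this hunk is that softmax gains an axis argument with default -1. A minimal usage sketch based solely on the ArgSpec above; the tensor shape and variable names are illustrative:

    import paddle.fluid as fluid

    # Apply softmax over the last axis of a [batch, 3, 10] tensor;
    # axis=-1 is the new default recorded in API.spec.
    x = fluid.layers.data(name='x', shape=[3, 10], dtype='float32')
    probs = fluid.layers.softmax(x, axis=-1)
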
@@ -134,7 +134,7 @@ paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits',
 paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', '80641ee6810b1cdc3fd6e14fc89ecc9d'))
 paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', 'b350b9a30a18e7efd7e1bb740eef6996'))
 paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e'))
-paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '013795af319e2e86d3506741941078ee'))
+paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23'))
 paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', 'de6a906950bae9f3c245cb744d22b94e'))
 paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '419c3a24a83cc89219a029cf4092788b'))
 paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b'))
@@ -225,10 +225,12 @@ paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=Non
 paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7ffc849e71f31dfe29030ff94e662de6'))
 paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', 'd5e6c494ac35100e2ed4d4bd9a1ed932'))
 paddle.fluid.layers.shuffle_channel (ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2fa6782d43d02ae64482d21235a82949'))
+paddle.fluid.layers.temporal_shift (ArgSpec(args=['x', 'seg_num', 'shift_ratio', 'name'], varargs=None, keywords=None, defaults=(0.25, None)), ('document', 'fe4481fb31363b09cfdd228fc6776ddf'))
 paddle.fluid.layers.py_func (ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)), ('document', '8404e472ac12b4a30a505d3d3a3e5fdb'))
 paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1546136806fef5c08f6918544bd9151d'))
 paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '2f6ff96864054a31aa4bb659c6722c99'))
 paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7'))
+paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
 paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
 paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
 paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
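
Two layers are newly registered in this hunk: temporal_shift and kldiv_loss. A hedged sketch of calling them with the signatures recorded above; shapes and variable names are made up for illustration:

    import paddle.fluid as fluid

    # kldiv_loss: x is usually log-probabilities and target probabilities;
    # reduction='mean' is the default listed in the ArgSpec.
    x = fluid.layers.data(name='x', shape=[100], dtype='float32')
    target = fluid.layers.data(name='target', shape=[100], dtype='float32')
    loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='mean')

    # temporal_shift: shifts a fraction of channels along the segment axis;
    # the leading dimension is expected to be batch * seg_num.
    frames = fluid.layers.data(name='frames', shape=[4, 32, 32], dtype='float32')
    shifted = fluid.layers.temporal_shift(frames, seg_num=2, shift_ratio=0.25)
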
@@ -276,7 +278,7 @@ paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None,
 paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a'))
 paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
 paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77'))
-paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', '0275133f1dde2aed528b4d3230edf823'))
+paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', 'dd68bead34dfbaf6b0a163fc1cc3c385'))
 paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2'))
 paddle.fluid.layers.IfElse.__init__ (ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.IfElse.false_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -357,6 +359,7 @@ paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], vara
 paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28'))
 paddle.fluid.layers.append_LARS (ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None), ('document', 'd24fa1e7d62ac8a534fc6a86002f84f8'))
 paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b'))
+paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', '2ef3f5ca5cd71ea4217c418e5a7a0565'))
 paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.StateCell.compute_state (ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None), ('document', '92973b3f222081a1d17069c683cf4a99'))
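
linear_lr_warmup is added next to the existing decay helpers. A sketch of prepending a warmup phase to a decayed rate, assuming the helper accepts the decayed-rate Variable as its learning_rate argument (all numbers are examples):

    import paddle.fluid as fluid

    # Cosine decay over 120 epochs of 1000 steps, preceded by a linear
    # warmup from 0.0 to 0.1 during the first 500 steps.
    decayed = fluid.layers.cosine_decay(learning_rate=0.1,
                                        step_each_epoch=1000, epochs=120)
    lr = fluid.layers.linear_lr_warmup(learning_rate=decayed, warmup_steps=500,
                                       start_lr=0.0, end_lr=0.1)
    sgd = fluid.optimizer.SGD(learning_rate=lr)
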
@@ -406,6 +409,7 @@ paddle.fluid.contrib.HDFSClient.rename (ArgSpec(args=['self', 'hdfs_src_path', '
 paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5)), ('document', '7d053b4bfd6dcfdd2c9dda0e0dbd9665'))
 paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a'))
 paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a'))
+paddle.fluid.contrib.extend_with_decoupled_weight_decay (ArgSpec(args=['base_optimizer'], varargs=None, keywords=None, defaults=None), ('document', 'a1095dfd4ec725747f662d69cd7659d4'))
 paddle.fluid.transpiler.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680'))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8'))
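
extend_with_decoupled_weight_decay takes an optimizer class and returns a subclass with decoupled (AdamW-style) weight decay. A rough sketch; only the factory signature comes from the spec above, and the coeff keyword of the generated class is an assumption:

    import paddle.fluid as fluid

    # Derive an AdamW-like optimizer class from the stock Adam optimizer.
    AdamW = fluid.contrib.extend_with_decoupled_weight_decay(fluid.optimizer.Adam)

    # 'coeff' (the decay coefficient) is assumed here; the remaining keyword
    # arguments are forwarded to the wrapped Adam constructor.
    optimizer = AdamW(learning_rate=0.001, coeff=0.01)
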
@@ -428,63 +432,75 @@ paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys',
 paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', '3802be78fbfb206dae64a2d9f8480970'))
 paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.SGDOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.MomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.AdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.AdamOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.AdamaxOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.FtrlOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.RMSPropOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.AdadeltaOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '46234a5470590feb336346f70a3db715'))
 paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.ModelAverage.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '18db9c70be9c4dd466f9844457b21bfe'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.LarsMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.DGCMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
......
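
Every optimizer class in the hunk above also gains an apply_optimize(loss, startup_program, params_grads) method. A sketch of the split flow it enables, roughly equivalent to calling minimize() directly; the toy regression program below is illustrative:

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

    opt = fluid.optimizer.SGD(learning_rate=0.01)
    # Split minimize() into its two halves: build the gradients, then apply them.
    params_grads = opt.backward(loss)
    opt.apply_optimize(loss, startup_program=None, params_grads=params_grads)
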
@@ -195,8 +195,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
         proto_desc)
-cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper)
+cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder)
 cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
 cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
......
@@ -25,8 +25,12 @@ if(WITH_DISTRIBUTE)
 endif()
 if(WITH_GPU)
+  set(dgc_deps "")
+  if(NOT WIN32)
+    set(dgc_deps dgc)
+  endif()
   nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-      dynload_cuda variable_visitor dgc)
+      dynload_cuda variable_visitor ${dgc_deps})
   nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
       dynload_cuda variable_visitor)
   if(WITH_DISTRIBUTE)
......
@@ -68,7 +68,7 @@ void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const {
       for (auto& o_it : outputs) {
         for (auto& v : o_it.second) {  // values
           vars[v] = order;
-          VLOG(1) << "in all_reduce_deps_pass:" << v;
+          VLOG(10) << "in all_reduce_deps_pass:" << v;
         }
       }
       order++;
......
@@ -156,7 +156,6 @@ void InplacePass::ApplyImpl(ir::Graph* graph) const {
       continue;
     TryInplaceOpInputOutput(op, graph);
   }
-  // graph->ResolveHazard(var_nodes_);
 }
 
 void InplacePass::InplaceModifyDesc(const std::string& var,
@@ -168,7 +167,7 @@ void InplacePass::InplaceModifyDesc(const std::string& var,
     auto* op_desc = op->Op();
     op_desc->RenameInput(var, cache_var);
     op_desc->RenameOutput(var, cache_var);
-    if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
     op_desc->Flush();
   }
 }
@@ -265,8 +264,6 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
 
 void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
                                           ir::Graph* graph) const {
   VLOG(4) << "Try to inplace op " << op->Name();
-  // PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
-  //                "op_desc is nullptr");
   // some pre-requirments need to meet if the op want to inplaced.
   PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr");
...@@ -446,19 +443,20 @@ bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const { ...@@ -446,19 +443,20 @@ bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const {
// check if op2 depends on op1's output // check if op2 depends on op1's output
bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const { bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const {
auto print_op = [&](ir::Node* op, const char* name) { if (VLOG_IS_ON(4)) {
std::ostringstream os; auto print_op = [&](ir::Node* op, const char* name) {
os << " " << name << " : " << op->Name() << " "; std::ostringstream os;
os << "Input args : "; os << " " << name << " : " << op->Name() << " ";
for (auto& arg : op->inputs) os << arg->Name() << " "; os << "Input args : ";
os << "Output args : "; for (auto& arg : op->inputs) os << arg->Name() << " ";
for (auto& arg : op->outputs) os << arg->Name() << " "; os << "Output args : ";
os << "Level : " << op_level_.at(op); for (auto& arg : op->outputs) os << arg->Name() << " ";
VLOG(4) << os.str(); os << "Level : " << op_level_.at(op);
}; VLOG(4) << os.str();
print_op(op1, "OP1"); };
print_op(op2, "OP2"); print_op(op1, "OP1");
print_op(op2, "OP2");
}
if (op1 == op2) return true; if (op1 == op2) return true;
if (op_level_.at(op1) >= op_level_.at(op2)) return false; if (op_level_.at(op1) >= op_level_.at(op2)) return false;
......
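The logging changes in the two passes above share one goal: keep routine runs quiet. all_reduce_deps_pass moves its per-variable dump from VLOG(1) to VLOG(10), and CheckOpDeps now builds its per-op debug strings only inside a VLOG_IS_ON(4) guard, so the formatting cost is paid only when that verbosity is actually enabled. A minimal standalone sketch of the same gating pattern with the glog-style macros Paddle uses (the function and its arguments are illustrative, not from the patch):

#include <sstream>
#include <string>
#include <vector>
#include "glog/logging.h"

// Build the (comparatively expensive) debug string only when verbose
// logging at level 4 or higher is enabled, e.g. via --v=4.
void DumpOpIfVerbose(const std::string& op_name,
                     const std::vector<std::string>& args) {
  if (VLOG_IS_ON(4)) {
    std::ostringstream os;
    os << "op " << op_name << " args:";
    for (const auto& a : args) os << " " << a;
    VLOG(4) << os.str();
  }
}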
...@@ -142,16 +142,15 @@ TEST(OrderedSet, FindBestFitNode) { ...@@ -142,16 +142,15 @@ TEST(OrderedSet, FindBestFitNode) {
for (auto& node : nodes) { for (auto& node : nodes) {
pool.Insert(node.get()); pool.Insert(node.get());
} }
// FIXME(liuwei1031) this API has changed,
// disable these tests temporarily auto* n = nodes[0].get();
// FindNextBestFitNode auto* cache = pool.FindBestFitNode(n);
// auto* n = nodes[0].get(); ASSERT_TRUE(cache->Name() == "a" || cache->Name() == "c");
// auto* cache = pool.FindBestFitNode(n); auto* cache_b = pool.FindNextBestFitNode(n, cache);
// PADDLE_ENFORCE(cache->Name() == "a"); ASSERT_TRUE(cache_b->Name() != cache->Name());
// cache = pool.FindNextBestFitNode(n, cache); ASSERT_TRUE(cache_b->Name() == "a" || cache_b->Name() == "c");
// PADDLE_ENFORCE(cache->Name() == "c"); cache = pool.FindNextBestFitNode(n, cache_b);
// cache = pool.FindNextBestFitNode(n, cache); ASSERT_TRUE(cache == nullptr);
// PADDLE_ENFORCE(cache->Name() == "b");
} }
} // namespace details } // namespace details
......
...@@ -12,9 +12,14 @@ ...@@ -12,9 +12,14 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <iostream>
#include <iterator> #include <iterator>
#include <memory>
#include <string> #include <string>
#include <vector>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/details/inplace_op_pass.h"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
...@@ -165,118 +170,147 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut, ...@@ -165,118 +170,147 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut,
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// TEST(InferInplace, SingleOpInplaceInToOut) { void FakeSuccData(ProgramDesc* prog) { // NOLINT
// ProgramDesc prog; prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
// auto* op = prog.MutableBlock(0)->AppendOp(); prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
// op->SetType("single_op"); prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
// op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
// op->SetOutput("Out", {"test2_out"}); prog->MutableBlock(0)->Var("test2_out");
// prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 32, 128, 128});
// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); }
// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); void FakeNoInplaceData(ProgramDesc* prog) { // NOLINT
// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("test2_out"); prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128});
// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128}); prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR);
// prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR);
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; prog->MutableBlock(0)->Var("test2_out");
// auto in_to_outs = infer_inplace(*op); prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 31, 128, 128});
// EXPECT_EQ(in_to_outs.size(), 1ul); }
// auto it = in_to_outs.begin();
// EXPECT_EQ(it->first, "test2_a"); ir::Node* GetNodeFromGraph(ir::Graph* g, std::string name) {
// EXPECT_EQ(it->second, "test2_out"); ir::Node* op_node = nullptr;
// } for (auto& item : g->Nodes()) {
// if (item->Name() == name) {
// TEST(InferInplace, SingleGradOpInplaceInToOut) { op_node = item;
// ProgramDesc prog; break;
// auto* op = prog.MutableBlock(0)->AppendOp(); }
// op->SetType("single_op_grad"); }
// op->SetInput(GradVarName("Out"), {"test2_out"}); return op_node;
// op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); }
//
// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); std::unique_ptr<ir::Graph> test_SingleOpInplaceInToOut(
// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024}); std::unique_ptr<ir::Graph> g) {
// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); ir::Node* op_node = GetNodeFromGraph(g.get(), "single_op");
// prog.MutableBlock(0)->Var("test2_out"); EXPECT_NE(op_node, nullptr);
// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024}); pass->Apply(g.get());
// return g;
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; }
// auto in_to_outs = infer_inplace(*op);
// EXPECT_EQ(in_to_outs.size(), 1ul); TEST(InferInplace, SingleOpInplaceInToOut) {
// auto it = in_to_outs.begin(); ProgramDesc prog;
// EXPECT_EQ(it->first, "test2_out"); auto* op = prog.MutableBlock(0)->AppendOp();
// EXPECT_EQ(it->second, "test2_a"); op->SetType("single_op");
// } op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
// op->SetOutput("Out", {"test2_out"});
// TEST(InferInplace, MultiOutInplaceInToOut) {
// ProgramDesc prog; FakeSuccData(&prog);
// auto* op = prog.MutableBlock(0)->AppendOp(); std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
// op->SetType("multi_out_op"); g = test_SingleOpInplaceInToOut(std::move(g));
// op->SetInput("X", {"a0", "a1"}); auto op_node = GetNodeFromGraph(g.get(), "single_op");
// op->SetInput("Y", {"b0"});
// op->SetInput("Z", {"c0", "c1"}); EXPECT_EQ(op_node->outputs[0]->Name(), "test2_a");
// op->SetOutput("Out", {"o0"}); }
// op->SetOutput("YOut", {"y0"});
// op->SetOutput("ZOut", {"z0"}); TEST(InferInplace, SingleOpInplaceInToOutNoInplace) {
// ProgramDesc prog;
// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); auto* op = prog.MutableBlock(0)->AppendOp();
// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); op->SetType("single_op");
// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); op->SetOutput("Out", {"test2_out"});
// prog.MutableBlock(0)->Var("o0");
// prog.MutableBlock(0)->Var("y0"); FakeNoInplaceData(&prog);
// prog.MutableBlock(0)->Var("z0"); std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); g = test_SingleOpInplaceInToOut(std::move(g));
// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); auto op_node = GetNodeFromGraph(g.get(), "single_op");
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); EXPECT_EQ(op_node->outputs[0]->Name(), "test2_out");
// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); }
// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
// TEST(InferInplace, MultiOutInplaceInToOut) {
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; ProgramDesc prog;
// auto in_to_outs = infer_inplace(*op); auto* op = prog.MutableBlock(0)->AppendOp();
// EXPECT_EQ(in_to_outs.size(), 3ul); op->SetType("multi_out_op");
// std::unordered_map<std::string, std::string> expects = { op->SetInput("X", {"a0", "a1"});
// {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"}, op->SetInput("Y", {"b0"});
// }; op->SetInput("Z", {"c0", "c1"});
// EXPECT_TRUE(expects == in_to_outs); op->SetOutput("Out", {"o0"});
// } op->SetOutput("YOut", {"y0"});
// op->SetOutput("ZOut", {"z0"});
// TEST(InferInplace, MultiGradInplaceInToOut) {
// ProgramDesc prog; prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
// auto* op = prog.MutableBlock(0)->AppendOp(); prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
// op->SetType("multi_out_grad"); prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
// op->SetInput(GradVarName("Out"), {"o0"}); prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
// op->SetInput(GradVarName("YOut"), {"y0"}); prog.MutableBlock(0)->Var("o0");
// op->SetInput(GradVarName("ZOut"), {"z0"}); prog.MutableBlock(0)->Var("y0");
// op->SetOutput(GradVarName("X"), {"a0", "a1"}); prog.MutableBlock(0)->Var("z0");
// op->SetOutput(GradVarName("Y"), {"b0"}); prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
// op->SetOutput(GradVarName("Z"), {"c0", "c1"}); prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024});
// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
// prog.MutableBlock(0)->Var("o0"); std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
// prog.MutableBlock(0)->Var("y0"); std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
// prog.MutableBlock(0)->Var("z0"); pass->Apply(g.get());
// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); auto op_node = GetNodeFromGraph(g.get(), "multi_out_op");
// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); ASSERT_TRUE(op_node != nullptr);
// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); EXPECT_EQ(op_node->outputs[0]->Name(), "a0");
// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); EXPECT_EQ(op_node->outputs[1]->Name(), "b0");
// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); EXPECT_EQ(op_node->outputs[2]->Name(), "c0");
// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); }
//
// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; TEST(InferInplace, MultiGradInplaceInToOut) {
// auto in_to_outs = infer_inplace(*op); ProgramDesc prog;
// auto* op = prog.MutableBlock(0)->AppendOp();
// EXPECT_EQ(in_to_outs.size(), 3ul); op->SetType("multi_out_grad");
// std::unordered_map<std::string, std::string> expects = { op->SetInput(GradVarName("Out"), {"o0"});
// {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, op->SetInput(GradVarName("YOut"), {"y0"});
// }; op->SetInput(GradVarName("ZOut"), {"z0"});
// EXPECT_TRUE(expects == in_to_outs); op->SetOutput(GradVarName("X"), {"a0", "a1"});
// } op->SetOutput(GradVarName("Y"), {"b0"});
op->SetOutput(GradVarName("Z"), {"c0", "c1"});
prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR);
prog.MutableBlock(0)->Var("o0");
prog.MutableBlock(0)->Var("y0");
prog.MutableBlock(0)->Var("z0");
prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024});
prog.MutableBlock(0)->Var("z0")->SetShape({32, 15, 1024, 1024});
std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
std::unique_ptr<details::InplacePass> pass(new details::InplacePass());
pass->Apply(g.get());
auto op_node = GetNodeFromGraph(g.get(), "multi_out_grad");
ASSERT_TRUE(op_node != nullptr);
EXPECT_EQ(op_node->outputs[0]->Name(), "o0");
EXPECT_EQ(op_node->outputs[2]->Name(), "y0");
EXPECT_EQ(op_node->outputs[3]->Name(), "c0");
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
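For reference, the two fixtures above differ only in the shape given to test2_out: FakeSuccData keeps the element count of test2_a ({32, 64, 128, 128} vs {64, 32, 128, 128}), while FakeNoInplaceData shrinks it to {64, 31, 128, 128}, which is presumably why the pass leaves the output name untouched in the second test instead of reusing test2_a's buffer. A standalone check of the element counts (the Numel helper is illustrative, not part of the patch):

#include <cstdint>
#include <initializer_list>
#include <iostream>

// Illustrative helper: multiply out a shape to compare the fixtures above.
int64_t Numel(std::initializer_list<int64_t> dims) {
  int64_t n = 1;
  for (int64_t d : dims) n *= d;
  return n;
}

int main() {
  std::cout << Numel({32, 64, 128, 128}) << "\n";  // 33554432, test2_a
  std::cout << Numel({64, 32, 128, 128}) << "\n";  // 33554432, FakeSuccData output
  std::cout << Numel({64, 31, 128, 128}) << "\n";  // 32505856, FakeNoInplaceData output
  return 0;
}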
...@@ -68,21 +68,12 @@ pass_library(transpose_flatten_concat_fuse_pass inference) ...@@ -68,21 +68,12 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
pass_library(identity_scale_op_clean_pass base) pass_library(identity_scale_op_clean_pass base)
pass_library(sync_batch_norm_pass base) pass_library(sync_batch_norm_pass base)
pass_library(runtime_context_cache_pass base) pass_library(runtime_context_cache_pass base)
pass_library(simplify_anakin_detection_pattern_pass inference) pass_library(quant_conv2d_dequant_fuse_pass inference)
pass_library(anakin_fillconstant_elementwisemul_fuse inference) pass_library(fillconstant_elementwisemul_fuse inference)
# There may be many transpose-flatten structures in a model, and the output of if(ANAKIN_FOUND)
# these structures will be used as inputs to the concat Op. This pattern will pass_library(simplify_anakin_priorbox_detection_out_pass inference)
# be detected by our pass. The index here represents the number of structures in the endif()
# pattern. We use index 3 ~ 6, because these quantities of structures are
# common in the models.
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
endforeach()
foreach (index RANGE 2 6)
file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
endforeach()
if(WITH_MKLDNN) if(WITH_MKLDNN)
pass_library(mkldnn_placement_pass base mkldnn) pass_library(mkldnn_placement_pass base mkldnn)
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h" #include "paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle { namespace paddle {
...@@ -29,8 +29,8 @@ namespace ir { ...@@ -29,8 +29,8 @@ namespace ir {
GET_IR_NODE(elementwise_mul); \ GET_IR_NODE(elementwise_mul); \
GET_IR_NODE(elementwise_mul_out); GET_IR_NODE(elementwise_mul_out);
void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const { void FillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse"; const std::string pattern_name = "fillconstant_elementwisemul_fuse";
FusePassBase::Init(pattern_name, graph); FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd; GraphPatternDetector gpd;
...@@ -39,8 +39,8 @@ void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const { ...@@ -39,8 +39,8 @@ void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
->assert_is_op_input("elementwise_mul", "X") ->assert_is_op_input("elementwise_mul", "X")
->AsInput(); ->AsInput();
patterns::AnakinFillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(), patterns::FillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
pattern_name); pattern_name);
pattern(x); pattern(x);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
...@@ -79,5 +79,5 @@ void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const { ...@@ -79,5 +79,5 @@ void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const {
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
REGISTER_PASS(anakin_fillconstant_elementwisemul_fuse, REGISTER_PASS(fillconstant_elementwisemul_fuse,
paddle::framework::ir::AnakinFillconstantElementwisemulFuse); paddle::framework::ir::FillconstantElementwisemulFuse);
...@@ -21,9 +21,9 @@ namespace paddle { ...@@ -21,9 +21,9 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
class AnakinFillconstantElementwisemulFuse : public FusePassBase { class FillconstantElementwisemulFuse : public FusePassBase {
public: public:
virtual ~AnakinFillconstantElementwisemulFuse() {} virtual ~FillconstantElementwisemulFuse() {}
protected: protected:
void ApplyImpl(ir::Graph* graph) const override; void ApplyImpl(ir::Graph* graph) const override;
......
...@@ -1471,7 +1471,8 @@ PDNode *patterns::TransposeFlattenConcat::operator()( ...@@ -1471,7 +1471,8 @@ PDNode *patterns::TransposeFlattenConcat::operator()(
} }
PDNode *patterns::AnakinDetectionPattern::operator()( PDNode *patterns::AnakinDetectionPattern::operator()(
std::vector<PDNode *> conv_in, int times) { std::vector<PDNode *> conv_in, int times, std::string priorbox_type,
bool is_reshape) {
// The times represents the repeat times of the // The times represents the repeat times of the
// {prior_box, prior_box_loc_out, flatten, prior_box_var_out, reshape} // {prior_box, prior_box_loc_out, flatten, prior_box_var_out, reshape}
const int kNumFields = 7; const int kNumFields = 7;
...@@ -1486,37 +1487,38 @@ PDNode *patterns::AnakinDetectionPattern::operator()( ...@@ -1486,37 +1487,38 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
const int kMultiClassSecondInputNmsOffset = times + 1; const int kMultiClassSecondInputNmsOffset = times + 1;
std::vector<PDNode *> nodes; std::vector<PDNode *> nodes;
std::string op_after_priorbox = is_reshape ? "reshape2" : "flatten2";
for (int i = 0; i < times; i++) { for (int i = 0; i < times; i++) {
nodes.push_back( nodes.push_back(
pattern->NewNode(GetNodeName("prior_box" + std::to_string(i))) pattern->NewNode(GetNodeName("prior_box" + std::to_string(i)))
->assert_is_op("density_prior_box")); ->assert_is_op(priorbox_type));
nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i))) nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Boxes") ->assert_is_op_output(priorbox_type, "Boxes")
->assert_is_op_input("reshape2", "X") ->assert_is_op_input(op_after_priorbox, "X")
->AsIntermediate()); ->AsIntermediate());
nodes.push_back( nodes.push_back(
pattern->NewNode(GetNodeName("reshape1" + std::to_string(i))) pattern->NewNode(GetNodeName("reshape1" + std::to_string(i)))
->assert_is_op("reshape2")); ->assert_is_op(op_after_priorbox));
nodes.push_back( nodes.push_back(
pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i))) pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i)))
->assert_is_op_output("reshape2") ->assert_is_op_output(op_after_priorbox)
->assert_is_op_nth_input("concat", "X", i) ->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate()); ->AsIntermediate());
nodes.push_back( nodes.push_back(
pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i))) pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i)))
->assert_is_op_output("density_prior_box", "Variances") ->assert_is_op_output(priorbox_type, "Variances")
->assert_is_op_input("reshape2", "X") ->assert_is_op_input(op_after_priorbox, "X")
->AsIntermediate()); ->AsIntermediate());
nodes.push_back( nodes.push_back(
pattern->NewNode(GetNodeName("reshape2" + std::to_string(i))) pattern->NewNode(GetNodeName("reshape2" + std::to_string(i)))
->assert_is_op("reshape2")); ->assert_is_op(op_after_priorbox));
nodes.push_back( nodes.push_back(
pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i))) pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i)))
->assert_is_op_output("reshape2") ->assert_is_op_output(op_after_priorbox)
->assert_is_op_nth_input("concat", "X", i) ->assert_is_op_nth_input("concat", "X", i)
->AsIntermediate()); ->AsIntermediate());
} }
...@@ -1612,7 +1614,7 @@ PDNode *patterns::AnakinDetectionPattern::operator()( ...@@ -1612,7 +1614,7 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
return multiclass_nms_out; return multiclass_nms_out;
} }
PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()( PDNode *patterns::FillConstantElementWiseMulFuse::operator()(
PDNode *elementwise_op_input) { PDNode *elementwise_op_input) {
auto fill_constant = auto fill_constant =
pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant"); pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant");
...@@ -1635,6 +1637,76 @@ PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()( ...@@ -1635,6 +1637,76 @@ PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
return elementwise_mul_out; return elementwise_mul_out;
} }
void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
const std::string &op_type,
const std::string &weight_name,
int times) {
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kQuantizedOpOutOffset = 2;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
  // there is always exactly one quant op.
auto quant_op_in_scale =
pattern->NewNode(GetNodeName("quant_op_in_scale"))
->assert_is_op_input("fake_quantize_range_abs_max", "InScale")
->AsInput();
auto quant_op = pattern->NewNode(GetNodeName("quant_op"))
->assert_is_op("fake_quantize_range_abs_max");
auto quant_op_out_scale =
pattern->NewNode(GetNodeName("quant_op_out_scale"))
->assert_is_op_output("fake_quantize_range_abs_max", "OutScale")
->assert_is_op_input("fake_dequantize_max_abs", "Scale")
->AsIntermediate();
auto quant_op_out =
pattern->NewNode(GetNodeName("quant_op_out"))
->assert_is_op_output("fake_quantize_range_abs_max", "Out")
->assert_is_op_input(op_type)
->AsIntermediate();
  // there are 'times' pairs of quantized and dequant ops
std::vector<PDNode *> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(
pattern->NewNode(GetNodeName("quantized_op_weight") + std::to_string(i))
->assert_is_op_input(op_type, weight_name)
->AsInput());
nodes.push_back(
pattern->NewNode(GetNodeName("quantized_op") + std::to_string(i))
->assert_is_op(op_type));
nodes.push_back(
pattern->NewNode(GetNodeName("quantized_op_out") + std::to_string(i))
->assert_is_op_output(op_type)
->assert_is_op_input("fake_dequantize_max_abs", "X")
->AsIntermediate());
nodes.push_back(
pattern->NewNode(GetNodeName("dequant_op") + std::to_string(i))
->assert_is_op("fake_dequantize_max_abs"));
nodes.push_back(
pattern->NewNode(GetNodeName("dequant_op_out") + std::to_string(i))
->assert_is_op_output("fake_dequantize_max_abs", "Out")
->AsOutput());
}
quant_op->LinksFrom({quant_op_input, quant_op_in_scale});
quant_op_out->LinksFrom({quant_op});
for (int i = 0; i < times; i++) {
nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom(
{quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]});
nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOffset]});
nodes[i * kNumFields + kDequantOpOffset]->LinksFrom(
{nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale});
nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom(
{nodes[i * kNumFields + kDequantOpOffset]});
}
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
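The pattern above keeps the per-repeat nodes in a single flat vector, so repeat i's node for a given role lives at i * kNumFields + that role's offset; the linking loop at the end relies on exactly this layout. A tiny standalone sketch of the indexing (constants mirror the ones above):

#include <cassert>

int main() {
  const int kNumFields = 5;          // weight, quantized op, its out, dequant op, its out
  const int kDequantOpOutOffset = 4;
  const int times = 3;               // three quantized/dequant repeats
  // The flat vector holds times * kNumFields nodes; the dequant output of
  // the last repeat sits at (times - 1) * kNumFields + kDequantOpOutOffset.
  assert(times * kNumFields == 15);
  assert((times - 1) * kNumFields + kDequantOpOutOffset == 14);
  return 0;
}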
...@@ -848,7 +848,8 @@ struct AnakinDetectionPattern : public PatternBase { ...@@ -848,7 +848,8 @@ struct AnakinDetectionPattern : public PatternBase {
AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope) AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "anakin_detect_pattern") {} : PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
PDNode* operator()(std::vector<PDNode*> conv_inputs, int times); PDNode* operator()(std::vector<PDNode*> conv_inputs, int times,
std::string priorbox_type, bool is_reshape);
std::string GetNodeName(const std::string& op_type) { std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type); return PDNodeName(name_scope_, repr_, id_, op_type);
...@@ -859,9 +860,9 @@ struct AnakinDetectionPattern : public PatternBase { ...@@ -859,9 +860,9 @@ struct AnakinDetectionPattern : public PatternBase {
} }
}; };
struct AnakinFillConstantElementWiseMulFuse : public PatternBase { struct FillConstantElementWiseMulFuse : public PatternBase {
AnakinFillConstantElementWiseMulFuse(PDPattern* pattern, FillConstantElementWiseMulFuse(PDPattern* pattern,
const std::string& name_scope) const std::string& name_scope)
: PatternBase(pattern, name_scope, : PatternBase(pattern, name_scope,
"anakin_fillconstant_elementwisemul_fuse") {} "anakin_fillconstant_elementwisemul_fuse") {}
...@@ -874,6 +875,22 @@ struct AnakinFillConstantElementWiseMulFuse : public PatternBase { ...@@ -874,6 +875,22 @@ struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
PATTERN_DECL_NODE(elementwise_mul_out); PATTERN_DECL_NODE(elementwise_mul_out);
}; };
struct QuantDequantOpFuse : public PatternBase {
QuantDequantOpFuse(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "quant_dequant_fuse") {}
void operator()(PDNode* quant_op_input, const std::string& op_name,
const std::string& weight_name, int times = 1);
std::string GetNodeName(const std::string& op_type) {
return PDNodeName(name_scope_, repr_, id_, op_type);
}
PDNode* GetPDNode(const std::string& op_type) {
return pattern->RetrieveNode(GetNodeName(op_type));
}
};
} // namespace patterns } // namespace patterns
// Link two ir::Nodes from each other. // Link two ir::Nodes from each other.
......
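Wiring the new QuantDequantOpFuse pattern follows the other PatternBase helpers: build it against a GraphPatternDetector's mutable pattern and invoke it with the quantizer's input node, the quantized op type, that op's weight slot, and the repeat count. A short sketch in the style of RunQuantDequant in the pass below (the caller is assumed to own the detector and the input PDNode):

#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"

// Sketch only: mirrors how RunQuantDequant builds the pattern for conv2d.
void BuildQuantDequantPattern(paddle::framework::ir::GraphPatternDetector* gpd,
                              paddle::framework::ir::PDNode* x, int times) {
  namespace patterns = paddle::framework::ir::patterns;
  patterns::QuantDequantOpFuse pattern(gpd->mutable_pattern(),
                                       "quant_dequant_fuse");
  pattern(x, "conv2d", "Filter", times);
}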
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
void RunQuantDequant(ir::Graph* graph, Scope* scope, int times,
std::string op_type) {
const std::string pattern_name = "quant_dequant_fuse";
// FusePassBase::Init(pattern_name, graph);
const int kNumFields = 5;
const int kQuantizedWeightOffset = 0;
const int kQuantizedOpOffset = 1;
const int kQuantizedOpOutOffset = 2;
const int kDequantOpOffset = 3;
const int kDequantOpOutOffset = 4;
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("x")
->assert_is_op_input("fake_quantize_range_abs_max", "X")
->AsInput();
std::string quantized_op_type = "";
std::string weight_name = "";
if (op_type == "conv2d") {
quantized_op_type = "conv2d";
weight_name = "Filter";
} else if (op_type == "conv2d_fusion") {
quantized_op_type = "conv2d_fusion";
weight_name = "Filter";
} else if (op_type == "mul") {
quantized_op_type = "mul";
weight_name = "Y";
} else if (op_type == "fc") {
quantized_op_type = "fc";
weight_name = "W";
} else {
    PADDLE_THROW(
        "QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for "
        "now.");
}
patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name);
pattern(x, quantized_op_type, weight_name, times);
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
PADDLE_ENFORCE(subgraph.count(x));
auto* input_node = subgraph.at(x);
Node* quant_op_in_scale =
subgraph.at(pattern.GetPDNode("quant_op_in_scale"));
Node* quant_op = subgraph.at(pattern.GetPDNode("quant_op"));
Node* quant_op_out_scale =
subgraph.at(pattern.GetPDNode("quant_op_out_scale"));
Node* quant_op_out = subgraph.at(pattern.GetPDNode("quant_op_out"));
std::vector<Node*> nodes;
for (int i = 0; i < times; i++) {
nodes.push_back(subgraph.at(
pattern.GetPDNode("quantized_op_weight" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("quantized_op" + std::to_string(i))));
nodes.push_back(subgraph.at(
pattern.GetPDNode("quantized_op_out" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("dequant_op" + std::to_string(i))));
nodes.push_back(
subgraph.at(pattern.GetPDNode("dequant_op_out" + std::to_string(i))));
}
int bit_length = boost::get<int>(quant_op->Op()->GetAttr("bit_length"));
int range = ((1 << (bit_length - 1)) - 1);
// Prepare input scale
std::string input_scale_var_name = quant_op->Op()->Input("InScale").front();
PADDLE_ENFORCE(scope);
const LoDTensor& input_scale_tensor =
scope->FindVar(input_scale_var_name)->Get<LoDTensor>();
PADDLE_ENFORCE(paddle::platform::is_cpu_place(input_scale_tensor.place()));
const float* input_scale_data = input_scale_tensor.data<float>();
float input_scale = input_scale_data[0];
std::unordered_set<const Node*> delete_nodes;
for (int i = 0; i < times; i++) {
// max_range = (range * range) / weight_scale
float max_range = boost::get<float>(
nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range"));
float weight_scale = (range * range) / max_range;
auto base_op_desc =
*nodes[i * kNumFields + kQuantizedOpOffset]->Op()->Proto();
std::string new_input = input_node->Name();
std::string new_output =
nodes[i * kNumFields + kDequantOpOutOffset]->Name();
framework::OpDesc new_op_desc(base_op_desc, nullptr);
new_op_desc.SetType(quantized_op_type);
if (quantized_op_type == "conv2d" ||
quantized_op_type == "conv2d_fusion") {
new_op_desc.SetInput("Input", {new_input});
new_op_desc.SetOutput("Output", {new_output});
} else if (quantized_op_type == "fc") {
new_op_desc.SetInput("Input", {new_input});
new_op_desc.SetOutput("Out", {new_output});
} else if (quantized_op_type == "mul") {
new_op_desc.SetInput("X", {new_input});
new_op_desc.SetOutput("Out", {new_output});
}
new_op_desc.SetAttr("enable_int8", true);
new_op_desc.SetAttr("input_scale", input_scale);
new_op_desc.SetAttr("weight_scale", weight_scale);
new_op_desc.Flush();
auto* new_op = graph->CreateOpNode(&new_op_desc);
IR_NODE_LINK_TO(input_node, new_op);
IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset], new_op);
IR_NODE_LINK_TO(new_op, nodes[i * kNumFields + kDequantOpOutOffset]);
delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOffset]);
delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOutOffset]);
delete_nodes.insert(nodes[i * kNumFields + kDequantOpOffset]);
}
delete_nodes.insert(quant_op_in_scale);
delete_nodes.insert(quant_op);
delete_nodes.insert(quant_op_out);
delete_nodes.insert(quant_op_out_scale);
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph, delete_nodes);
};
gpd(graph, handler);
}
void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "quant_dequant_fuse";
FusePassBase::Init(pattern_name, graph);
std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul"};
auto* scope = param_scope();
for (auto& op_type : quantized_op_types) {
for (int i = 1; i <= 6; i++) {
RunQuantDequant(graph, scope, i, op_type);
}
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(quant_conv2d_dequant_fuse_pass,
paddle::framework::ir::QuantDequantFusePass);
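The scale recovery in RunQuantDequant simply inverts how fake_dequantize_max_abs folded the weight scale into its max_range attribute: max_range = range^2 / weight_scale with range = 2^(bit_length - 1) - 1, so weight_scale = range^2 / max_range. A standalone sketch with illustrative numbers (bit_length = 8 and an original weight scale of 0.5 are assumptions, not values taken from the patch):

#include <iostream>

int main() {
  // Same arithmetic as RunQuantDequant above.
  const int bit_length = 8;                         // from fake_quantize_range_abs_max
  const int range = (1 << (bit_length - 1)) - 1;    // 127 for int8
  const float original_weight_scale = 0.5f;         // assumed for illustration
  const float max_range = (range * range) / original_weight_scale;  // 32258
  const float weight_scale = (range * range) / max_range;           // recovers 0.5
  std::cout << range << " " << max_range << " " << weight_scale << "\n";
  return 0;
}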
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {
class QuantDequantFusePass : public FusePassBase {
public:
virtual ~QuantDequantFusePass() {}
protected:
void ApplyImpl(ir::Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
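Driving the new pass directly looks like the inplace pass usage in the test file earlier in this change: construct it and call Apply on the graph. A minimal sketch (the pass reads param_scope(), so the caller is assumed to have attached the parameter scope attribute to the graph beforehand):

#include <memory>
#include "paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h"

// Sketch only: apply the fuse pass to an existing graph.
void RunQuantDequantFusePass(paddle::framework::ir::Graph* graph) {
  std::unique_ptr<paddle::framework::ir::QuantDequantFusePass> pass(
      new paddle::framework::ir::QuantDequantFusePass());
  pass->Apply(graph);
}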
...@@ -17,25 +17,24 @@ ...@@ -17,25 +17,24 @@
#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h" #include "paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
template <int times> void RunSimplifyAnakinDetection(ir::Graph *graph, int times, bool is_density,
void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl( bool is_reshape) {
ir::Graph *graph) const {
const std::string pattern_name = const std::string pattern_name =
"simplify_anakin_detection_pattern_pass" + std::to_string(times); "simplify_anakin_detection_pattern_pass" + std::to_string(times);
FusePassBase::Init(pattern_name, graph); std::string priorbox_type = is_density ? "density_prior_box" : "prior_box";
GraphPatternDetector gpd; GraphPatternDetector gpd;
std::vector<PDNode *> input_nodes; std::vector<PDNode *> input_nodes;
for (int i = 0; i < times; i++) { for (int i = 0; i < times; i++) {
input_nodes.push_back(gpd.mutable_pattern() input_nodes.push_back(gpd.mutable_pattern()
->NewNode("x" + std::to_string(i)) ->NewNode("x" + std::to_string(i))
->assert_is_op_input("density_prior_box", "Input") ->assert_is_op_input(priorbox_type, "Input")
->AsInput()); ->AsInput());
} }
input_nodes.push_back(gpd.mutable_pattern() input_nodes.push_back(gpd.mutable_pattern()
...@@ -49,7 +48,7 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl( ...@@ -49,7 +48,7 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
->AsInput()); ->AsInput());
patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name); patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
pattern(input_nodes, times); pattern(input_nodes, times, priorbox_type, is_reshape);
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) { Graph *g) {
...@@ -119,8 +118,7 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl( ...@@ -119,8 +118,7 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
boost::get<std::string>(box_coder_op->Op()->GetAttr("code_type")); boost::get<std::string>(box_coder_op->Op()->GetAttr("code_type"));
bool box_normalized = bool box_normalized =
boost::get<bool>(box_coder_op->Op()->GetAttr("box_normalized")); boost::get<bool>(box_coder_op->Op()->GetAttr("box_normalized"));
// auto variance =
// boost::get<std::vector<float>>(box_coder_op->Op()->GetAttr("variance"));
int background_label = int background_label =
boost::get<int>(multiclass_nms->Op()->GetAttr("background_label")); boost::get<int>(multiclass_nms->Op()->GetAttr("background_label"));
float score_threshold = float score_threshold =
...@@ -138,7 +136,6 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl( ...@@ -138,7 +136,6 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
nodes[i * kNumFields + kPriorBoxLocOffset]->Name()); nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
} }
// int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
framework::OpDesc concat1_desc; framework::OpDesc concat1_desc;
concat1_desc.SetType("concat"); concat1_desc.SetType("concat");
concat1_desc.SetInput("X", concat1_input_names); concat1_desc.SetInput("X", concat1_input_names);
...@@ -213,31 +210,24 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl( ...@@ -213,31 +210,24 @@ void SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
gpd(graph, handler); gpd(graph, handler);
} }
template class SimplifyAnakinDetectionPatternPass<1>; void SimplifyAnakinDetectionPatternPass::ApplyImpl(ir::Graph *graph) const {
template class SimplifyAnakinDetectionPatternPass<2>; const int pattern_nums = 6;
template class SimplifyAnakinDetectionPatternPass<3>; const std::string pattern_name = "simplify_anakin_detection_pattern_pass";
template class SimplifyAnakinDetectionPatternPass<4>; FusePassBase::Init(pattern_name, graph);
template class SimplifyAnakinDetectionPatternPass<5>; std::vector<bool> options = {true, false};
template class SimplifyAnakinDetectionPatternPass<6>; for (const auto &is_density : options) {
for (const auto &is_reshape : options) {
for (int i = 1; i <= pattern_nums; i++) {
RunSimplifyAnakinDetection(graph, i, is_density, is_reshape);
}
}
}
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
REGISTER_PASS(simplify_anakin_detection_pattern_pass, typedef paddle::framework::ir::SimplifyAnakinDetectionPatternPass
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>); priorbox_pattern;
REGISTER_PASS(simplify_anakin_priorbox_detection_out_pass, priorbox_pattern);
REGISTER_PASS(simplify_anakin_detection_pattern_pass2,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<2>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass4,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<4>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass5,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<5>);
REGISTER_PASS(simplify_anakin_detection_pattern_pass6,
paddle::framework::ir::SimplifyAnakinDetectionPatternPass<6>);
...@@ -26,7 +26,6 @@ namespace ir { ...@@ -26,7 +26,6 @@ namespace ir {
// these structures will be used as inputs to the concat Op. This pattern will // these structures will be used as inputs to the concat Op. This pattern will
// be detected by our pass. The times here represents the repeat times of this // be detected by our pass. The times here represents the repeat times of this
// structure. // structure.
template <int times>
class SimplifyAnakinDetectionPatternPass : public FusePassBase { class SimplifyAnakinDetectionPatternPass : public FusePassBase {
public: public:
virtual ~SimplifyAnakinDetectionPatternPass() {} virtual ~SimplifyAnakinDetectionPatternPass() {}
......
...@@ -25,11 +25,9 @@ namespace paddle { ...@@ -25,11 +25,9 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
template <int times> void RunTransposeFlattenConcatFuse(ir::Graph *graph, int times) {
void TransposeFlattenConcatFusePass<times>::ApplyImpl(ir::Graph *graph) const {
const std::string pattern_name = const std::string pattern_name =
"transpose_flatten" + std::to_string(times) + "_concat_fuse"; "transpose_flatten" + std::to_string(times) + "_concat_fuse";
FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd; GraphPatternDetector gpd;
std::vector<PDNode *> input_nodes; std::vector<PDNode *> input_nodes;
...@@ -122,31 +120,18 @@ void TransposeFlattenConcatFusePass<times>::ApplyImpl(ir::Graph *graph) const { ...@@ -122,31 +120,18 @@ void TransposeFlattenConcatFusePass<times>::ApplyImpl(ir::Graph *graph) const {
gpd(graph, handler); gpd(graph, handler);
} }
template class TransposeFlattenConcatFusePass<1>; void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const {
template class TransposeFlattenConcatFusePass<2>; const int pattern_nums = 6;
template class TransposeFlattenConcatFusePass<3>; const std::string pattern_name = "transpose_flatten_concat_fuse";
template class TransposeFlattenConcatFusePass<4>; FusePassBase::Init(pattern_name, graph);
template class TransposeFlattenConcatFusePass<5>; for (int i = 1; i <= pattern_nums; i++) {
template class TransposeFlattenConcatFusePass<6>; RunTransposeFlattenConcatFuse(graph, i);
}
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
REGISTER_PASS(transpose_flatten_concat_fuse_pass, REGISTER_PASS(transpose_flatten_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<1>); paddle::framework::ir::TransposeFlattenConcatFusePass);
REGISTER_PASS(transpose_flatten2_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<2>);
REGISTER_PASS(transpose_flatten3_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<3>);
REGISTER_PASS(transpose_flatten4_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<4>);
REGISTER_PASS(transpose_flatten5_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<5>);
REGISTER_PASS(transpose_flatten6_concat_fuse_pass,
paddle::framework::ir::TransposeFlattenConcatFusePass<6>);
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <memory>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...@@ -24,7 +26,6 @@ namespace ir { ...@@ -24,7 +26,6 @@ namespace ir {
// these structures will be used as inputs to the concat Op. This pattern will // these structures will be used as inputs to the concat Op. This pattern will
// be detected by our pass. The times here represents the repeat times of this // be detected by our pass. The times here represents the repeat times of this
// structure. // structure.
template <int times>
class TransposeFlattenConcatFusePass : public FusePassBase { class TransposeFlattenConcatFusePass : public FusePassBase {
public: public:
virtual ~TransposeFlattenConcatFusePass() {} virtual ~TransposeFlattenConcatFusePass() {}
......
...@@ -56,8 +56,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) { ...@@ -56,8 +56,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
} }
} }
static DDim GetDims(const Scope& scope, const std::string& name, static DDim GetDimsDebug(const Scope& scope, const std::string& name,
bool get_actual_dim = false) { bool get_actual_dim = false) {
Variable* var = scope.FindVar(name); Variable* var = scope.FindVar(name);
if (var == nullptr) { if (var == nullptr) {
return DDim({-1}); return DDim({-1});
...@@ -65,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name, ...@@ -65,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name,
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>(); const LoDTensor& tensor = var->Get<LoDTensor>();
// if (UNLIKELY(!tensor.IsInitialized())) { if (UNLIKELY(!tensor.IsInitialized())) {
// return DDim({-1}); return DDim({-1});
// } }
return tensor.dims(); return tensor.dims();
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
if (get_actual_dim) { if (get_actual_dim) {
...@@ -123,7 +123,7 @@ static int GetRowSize(const Scope& scope, const std::string& name) { ...@@ -123,7 +123,7 @@ static int GetRowSize(const Scope& scope, const std::string& name) {
return -1; return -1;
} }
static LoD GetLoD(const Scope& scope, const std::string& name) { static LoD GetLoDDebug(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name); Variable* var = scope.FindVar(name);
auto default_lod = LoD({{}}); auto default_lod = LoD({{}});
...@@ -133,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { ...@@ -133,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
const LoDTensor& tensor = var->Get<LoDTensor>(); const LoDTensor& tensor = var->Get<LoDTensor>();
// if (UNLIKELY(!tensor.IsInitialized())) { if (UNLIKELY(!tensor.IsInitialized())) {
// return default_lod; return default_lod;
// } }
return tensor.lod(); return tensor.lod();
} else { } else {
return default_lod; return default_lod;
...@@ -274,8 +274,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { ...@@ -274,8 +274,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
} }
std::string dtype = GetDtype(*scope, var_name); std::string dtype = GetDtype(*scope, var_name);
ss << ":" << dtype; ss << ":" << dtype;
ss << "[" << GetDims(*scope, var_name, true) << "]"; ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
ss << "(" << GetLoD(*scope, var_name) << ")"; ss << "(" << GetLoDDebug(*scope, var_name) << ")";
} }
} }
if (i != input.second.size() - 1) { if (i != input.second.size() - 1) {
...@@ -305,8 +305,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { ...@@ -305,8 +305,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
} }
std::string dtype = GetDtype(*scope, output.second[i]); std::string dtype = GetDtype(*scope, output.second[i]);
ss << ":" << dtype; ss << ":" << dtype;
ss << "[" << GetDims(*scope, var_name, true) << "]"; ss << "[" << GetDimsDebug(*scope, var_name, true) << "]";
ss << "(" << GetLoD(*scope, var_name) << ")"; ss << "(" << GetLoDDebug(*scope, var_name) << ")";
} }
} }
if (i != output.second.size() - 1) { if (i != output.second.size() - 1) {
...@@ -1017,7 +1017,7 @@ Scope* OperatorWithKernel::PrepareData( ...@@ -1017,7 +1017,7 @@ Scope* OperatorWithKernel::PrepareData(
// of search key even though the set is empty. // of search key even though the set is empty.
if (!no_buffer_ins.empty() && if (!no_buffer_ins.empty() &&
no_buffer_ins.count(var_name_item.first) > 0) { no_buffer_ins.count(var_name_item.first) > 0) {
VLOG(1) << "Skip scanning input " << var_name_item.first VLOG(7) << "Skip scanning input " << var_name_item.first
<< " in Operator " << type_; << " in Operator " << type_;
continue; continue;
} }
......
...@@ -34,25 +34,41 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op, ...@@ -34,25 +34,41 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
auto input_name = op_desc.Input("Input").front(); auto input_name = op_desc.Input("Input").front();
auto image_name = op_desc.Input("Image").front(); auto image_name = op_desc.Input("Image").front();
auto output_name = op_desc.Output("Boxes").front(); auto output_name = op_desc.Output("Boxes").front();
auto op_type = op_desc.Type();
auto op_name = op_type + ":" + op_desc.Output("Boxes").front();
auto op_name = op_desc.Type() + ":" + op_desc.Output("Boxes").front(); // only for density_prior_box
std::vector<float> fixed_sizes = {};
std::vector<float> fixed_ratios = {};
std::vector<int> densities = {};
auto fixed_sizes = std::vector<float> min_sizes = {};
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes")); std::vector<float> max_sizes = {};
auto fixed_ratios = std::vector<float> aspect_ratios = {};
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios")); bool is_clip = false;
auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities")); bool is_flip = false;
if (op_type == "density_prior_box") {
fixed_sizes =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
fixed_ratios =
boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
is_clip = boost::get<bool>(op_desc.GetAttr("clip"));
} else if (op_type == "prior_box") {
min_sizes = boost::get<std::vector<float>>(op_desc.GetAttr("min_sizes"));
max_sizes = boost::get<std::vector<float>>(op_desc.GetAttr("max_sizes"));
aspect_ratios =
boost::get<std::vector<float>>(op_desc.GetAttr("aspect_ratios"));
is_clip = boost::get<bool>(op_desc.GetAttr("clip"));
is_flip = boost::get<bool>(op_desc.GetAttr("flip"));
}
std::vector<float> dens; std::vector<float> dens;
for (auto& ele : densities) { for (auto& ele : densities) {
dens.push_back(static_cast<float>(ele)); dens.push_back(static_cast<float>(ele));
} }
// lack flip
// auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances")); auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
for (auto& ele : variances) {
    VLOG(4) << "prior_box variance: " << ele;
}
// lack img_h, img_w // lack img_h, img_w
auto step_h = boost::get<float>(op_desc.GetAttr("step_h")); auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
...@@ -66,14 +82,14 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op, ...@@ -66,14 +82,14 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
std::vector<float> temp_v = {}; std::vector<float> temp_v = {};
engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name}); engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v); engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", min_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v); engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", max_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v); engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", aspect_ratios);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes); engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_size", fixed_sizes);
engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios); engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratio", fixed_ratios);
engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens); engine_->AddOpAttr<PTuple<float>>(op_name, "density", dens);
engine_->AddOpAttr(op_name, "is_flip", static_cast<bool>(false)); engine_->AddOpAttr(op_name, "is_flip", is_flip);
engine_->AddOpAttr(op_name, "is_clip", static_cast<bool>(false)); engine_->AddOpAttr(op_name, "is_clip", is_clip);
engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances); engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0)); engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0)); engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
...@@ -88,3 +104,4 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op, ...@@ -88,3 +104,4 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
} // namespace paddle } // namespace paddle
REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter); REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
REGISTER_ANAKIN_OP_CONVERTER(prior_box, DensityPriorBoxOpConverter);
...@@ -48,7 +48,7 @@ class AnakinOpConverter { ...@@ -48,7 +48,7 @@ class AnakinOpConverter {
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
std::string op_type = op_desc.Type(); std::string op_type = op_desc.Type();
AnakinOpConverter *it = nullptr; AnakinOpConverter *it = nullptr;
if (op_type == "depthwise_conv2d") op_type = "conv2d";
if (op_type == "reshape2") op_type = "reshape"; if (op_type == "reshape2") op_type = "reshape";
if (op_type == "transpose2") op_type = "transpose"; if (op_type == "transpose2") op_type = "transpose";
if (op_type == "flatten2") op_type = "flatten"; if (op_type == "flatten2") op_type = "flatten";
......
...@@ -42,6 +42,8 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -42,6 +42,8 @@ struct SimpleOpTypeSetTeller : public Teller {
teller_set.insert("dropout"); teller_set.insert("dropout");
teller_set.insert("sigmoid"); teller_set.insert("sigmoid");
teller_set.insert("sum"); teller_set.insert("sum");
teller_set.insert("depthwise_conv2d");
teller_set.insert("prior_box");
} }
bool operator()(const std::string& op_type, bool operator()(const std::string& op_type,
......
...@@ -37,14 +37,14 @@ using framework::ir::Node; ...@@ -37,14 +37,14 @@ using framework::ir::Node;
void analysis::AnakinSubgraphPass::ApplyImpl( void analysis::AnakinSubgraphPass::ApplyImpl(
framework::ir::Graph *graph) const { framework::ir::Graph *graph) const {
framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get()); framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph);
auto teller = [](const framework::ir::Node *node) { auto teller = [](const framework::ir::Node *node) {
if (!node->IsOp() || !node->Op()) return false; if (!node->IsOp() || !node->Op()) return false;
return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
}; };
SubGraphFuser fuser(graph.get(), teller, 6 /* min_subgraph_size */); SubGraphFuser fuser(graph, teller, 6 /* min_subgraph_size */);
fuser(); fuser();
std::vector<std::string> graph_param_names = std::vector<std::string> graph_param_names =
...@@ -56,10 +56,10 @@ void analysis::AnakinSubgraphPass::ApplyImpl( ...@@ -56,10 +56,10 @@ void analysis::AnakinSubgraphPass::ApplyImpl(
for (auto *node : graph->Nodes()) { for (auto *node : graph->Nodes()) {
if (node->IsOp() && !Agent(node).subgraph()->empty()) { if (node->IsOp() && !Agent(node).subgraph()->empty()) {
CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params); CreateAnakinOp(node, graph, graph_param_names, &repetitive_params);
std::unordered_set<const Node *> nodes2remove( std::unordered_set<const Node *> nodes2remove(
Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
} }
} }
...@@ -69,7 +69,7 @@ void analysis::AnakinSubgraphPass::ApplyImpl( ...@@ -69,7 +69,7 @@ void analysis::AnakinSubgraphPass::ApplyImpl(
nodes2remove.insert(node); nodes2remove.insert(node);
} }
} }
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
graph->Set(framework::ir::kRepetitiveParamAttr, graph->Set(framework::ir::kRepetitiveParamAttr,
new std::vector<std::string>(repetitive_params)); new std::vector<std::string>(repetitive_params));
} }
......
...@@ -192,6 +192,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( ...@@ -192,6 +192,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
block_desc.Proto()->SerializeAsString()); block_desc.Proto()->SerializeAsString());
SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size")); SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size")); SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
SetAttr(op_desc->Proto(), "gpu_id", Get<int>("gpu_device_id"));
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
SetAttr(op_desc->Proto(), "parameters", params); SetAttr(op_desc->Proto(), "parameters", params);
......
...@@ -52,6 +52,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { ...@@ -52,6 +52,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
for (auto &var_name : all_vars) { for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(), if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) { var_name)) {
scope->EraseVars({var_name});
continue; continue;
} }
auto *var = scope->FindLocalVar(var_name); auto *var = scope->FindLocalVar(var_name);
......
...@@ -886,4 +886,5 @@ USE_ANAKIN_CONVERTER(detection_out); ...@@ -886,4 +886,5 @@ USE_ANAKIN_CONVERTER(detection_out);
USE_ANAKIN_CONVERTER(density_prior_box); USE_ANAKIN_CONVERTER(density_prior_box);
USE_ANAKIN_CONVERTER(dropout); USE_ANAKIN_CONVERTER(dropout);
USE_ANAKIN_CONVERTER(sum); USE_ANAKIN_CONVERTER(sum);
USE_ANAKIN_CONVERTER(prior_box);
#endif #endif
...@@ -70,17 +70,15 @@ void GpuPassStrategy::EnableMKLDNN() { ...@@ -70,17 +70,15 @@ void GpuPassStrategy::EnableMKLDNN() {
// The following passes work for the Anakin sub-graph engine. // The following passes work for the Anakin sub-graph engine.
const std::vector<std::string> kAnakinSubgraphPasses({ const std::vector<std::string> kAnakinSubgraphPasses({
"infer_clean_graph_pass", // "infer_clean_graph_pass", //
"simplify_anakin_detection_pattern_pass5", // "simplify_anakin_priorbox_detection_out_pass", //
"simplify_anakin_detection_pattern_pass4", // "fillconstant_elementwisemul_fuse", //
"simplify_anakin_detection_pattern_pass3", // "fc_fuse_pass", //
"simplify_anakin_detection_pattern_pass2", // "conv_elementwise_add_fuse_pass", //
"anakin_fillconstant_elementwisemul_fuse", // "conv_bn_fuse_pass", //
"fc_fuse_pass", // "conv_elementwise_add_fuse_pass", //
"conv_elementwise_add_fuse_pass", // "fc_gru_fuse_pass", //
"conv_bn_fuse_pass", // "quant_conv2d_dequant_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"fc_gru_fuse_pass", //
"anakin_subgraph_pass", "anakin_subgraph_pass",
}); });
...@@ -97,13 +95,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { ...@@ -97,13 +95,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add2_act_fuse_pass", //
"conv_elementwise_add_fuse_pass", // "conv_elementwise_add_fuse_pass", //
"runtime_context_cache_pass", // "runtime_context_cache_pass", //
#endif #endif //
"transpose_flatten_concat_fuse_pass",
}); });
for (int i = 6; i >= 2; i--) {
passes_.push_back("transpose_flatten" + std::to_string(i) +
"_concat_fuse_pass");
}
use_gpu_ = true; use_gpu_ = true;
} }
......
...@@ -23,6 +23,12 @@ function(inference_analysis_api_test target install_dir filename) ...@@ -23,6 +23,12 @@ function(inference_analysis_api_test target install_dir filename)
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt) ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
endfunction() endfunction()
function(inference_analysis_api_int8_test target model_dir data_dir filename)
inference_analysis_test(${target} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
ARGS --infer_model=${model_dir}/model --infer_data=${data_dir}/data.bin --batch_size=100)
endfunction()
function(inference_analysis_api_test_with_fake_data target install_dir filename model_name) function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
download_model(${install_dir} ${model_name}) download_model(${install_dir} ${model_name})
inference_analysis_test(${target} SRCS ${filename} inference_analysis_test(${target} SRCS ${filename}
...@@ -138,6 +144,28 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 ...@@ -138,6 +144,28 @@ inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv
"${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL)
# int8 image classification tests
if(WITH_MKLDNN)
set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8")
if (NOT EXISTS ${INT8_DATA_DIR})
inference_download_and_uncompress(${INT8_DATA_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "imagenet_val_100.tar.gz")
endif()
#resnet50 int8
set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR})
inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "resnet50_int8_model.tar.gz" )
endif()
inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
#mobilenet int8
set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet")
if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR})
inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "mobilenetv1_int8_model.tar.gz" )
endif()
inference_analysis_api_int8_test(test_analyzer_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
endif()
# bert, max_len=20, embedding_dim=128 # bert, max_len=20, embedding_dim=128
set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128")
download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz")
......
...@@ -53,19 +53,6 @@ void Split(const std::string &line, char sep, std::vector<T> *v) { ...@@ -53,19 +53,6 @@ void Split(const std::string &line, char sep, std::vector<T> *v) {
} }
} }
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
// Parse tensor from string // Parse tensor from string
template <typename T> template <typename T>
bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) {
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_int32(iterations, 0, "Number of iterations");
namespace paddle {
namespace inference {
namespace analysis {
void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model);
cfg->SetProgFile("__model__");
cfg->DisableGpu();
cfg->SwitchIrOptim();
cfg->SwitchSpecifyInputNames(false);
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
cfg->EnableMKLDNN();
}
template <typename T>
class TensorReader {
public:
TensorReader(std::ifstream &file, size_t beginning_offset,
std::vector<int> shape, std::string name)
: file_(file), position(beginning_offset), shape_(shape), name_(name) {
numel =
std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<T>());
}
PaddleTensor NextBatch() {
PaddleTensor tensor;
tensor.name = name_;
tensor.shape = shape_;
tensor.dtype = GetPaddleDType<T>();
tensor.data.Resize(numel * sizeof(T));
file_.seekg(position);
file_.read(static_cast<char *>(tensor.data.data()), numel * sizeof(T));
position = file_.tellg();
if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream";
if (file_.fail())
throw std::runtime_error(name_ + ": failed reading file.");
return tensor;
}
protected:
std::ifstream &file_;
size_t position;
std::vector<int> shape_;
std::string name_;
size_t numel;
};
std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
const std::vector<std::vector<PaddleTensor>> &test_data, int num_images) {
int test_data_batch_size = test_data[0][0].shape[0];
CHECK_LE(static_cast<size_t>(num_images),
test_data.size() * test_data_batch_size);
PaddleTensor images;
images.name = "input";
images.shape = {num_images, 3, 224, 224};
images.dtype = PaddleDType::FLOAT32;
images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224);
PaddleTensor labels;
labels.name = "labels";
labels.shape = {num_images, 1};
labels.dtype = PaddleDType::INT64;
labels.data.Resize(sizeof(int64_t) * num_images);
for (int i = 0; i < num_images; i++) {
auto batch = i / test_data_batch_size;
auto element_in_batch = i % test_data_batch_size;
std::copy_n(static_cast<float *>(test_data[batch][0].data.data()) +
element_in_batch * 3 * 224 * 224,
3 * 224 * 224,
static_cast<float *>(images.data.data()) + i * 3 * 224 * 224);
std::copy_n(static_cast<int64_t *>(test_data[batch][1].data.data()) +
element_in_batch,
1, static_cast<int64_t *>(labels.data.data()) + i);
}
auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(2);
(*warmup_data)[0] = std::move(images);
(*warmup_data)[1] = std::move(labels);
return warmup_data;
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
int32_t batch_size = FLAGS_batch_size) {
std::ifstream file(FLAGS_infer_data, std::ios::binary);
if (!file) {
FAIL() << "Couldn't open file: " << FLAGS_infer_data;
}
int64_t total_images{0};
file.read(reinterpret_cast<char *>(&total_images), sizeof(total_images));
LOG(INFO) << "Total images in file: " << total_images;
std::vector<int> image_batch_shape{batch_size, 3, 224, 224};
std::vector<int> label_batch_shape{batch_size, 1};
auto labels_offset_in_file =
static_cast<size_t>(file.tellg()) +
sizeof(float) * total_images *
std::accumulate(image_batch_shape.begin() + 1,
image_batch_shape.end(), 1, std::multiplies<int>());
TensorReader<float> image_reader(file, 0, image_batch_shape, "input");
TensorReader<int64_t> label_reader(file, labels_offset_in_file,
label_batch_shape, "label");
auto iterations = total_images / batch_size;
if (FLAGS_iterations > 0 && FLAGS_iterations < iterations)
iterations = FLAGS_iterations;
for (auto i = 0; i < iterations; i++) {
auto images = image_reader.NextBatch();
auto labels = label_reader.NextBatch();
inputs->emplace_back(
std::vector<PaddleTensor>{std::move(images), std::move(labels)});
}
}
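As a quick check of the offset arithmetic in SetInput above, a minimal Python sketch with hypothetical numbers; the binary layout (one int64 image count, then all float32 images, then all int64 labels) matches the preprocessing script further below.
SIZE_INT64, SIZE_FLOAT32 = 8, 4
total_images = 100  # hypothetical image count read from the int64 header
labels_offset = SIZE_INT64 + SIZE_FLOAT32 * total_images * 3 * 224 * 224
print(labels_offset)  # 60211208 bytes: this is where the int64 labels begin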
TEST(Analyzer_int8_resnet50, quantization) {
AnalysisConfig cfg;
SetConfig(&cfg);
AnalysisConfig q_cfg;
SetConfig(&q_cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all, 100);
std::shared_ptr<std::vector<PaddleTensor>> warmup_data =
GetWarmupData(input_slots_all, 100);
q_cfg.EnableMkldnnQuantizer();
q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100);
CompareQuantizedAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
reinterpret_cast<const PaddlePredictor::Config *>(&q_cfg),
input_slots_all);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import os
import numpy as np
import time
import sys
import random
import functools
import contextlib
from PIL import Image, ImageEnhance
import math
from paddle.dataset.common import download
random.seed(0)
np.random.seed(0)
DATA_DIM = 224
SIZE_FLOAT32 = 4
SIZE_INT64 = 8
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
def resize_short(img, target_size):
percent = float(target_size) / min(img.size[0], img.size[1])
resized_width = int(round(img.size[0] * percent))
resized_height = int(round(img.size[1] * percent))
img = img.resize((resized_width, resized_height), Image.LANCZOS)
return img
def crop_image(img, target_size, center):
width, height = img.size
size = target_size
    if center:
w_start = (width - size) / 2
h_start = (height - size) / 2
else:
w_start = np.random.randint(0, width - size + 1)
h_start = np.random.randint(0, height - size + 1)
w_end = w_start + size
h_end = h_start + size
img = img.crop((w_start, h_start, w_end, h_end))
return img
def process_image(img_path, mode, color_jitter, rotate):
img = Image.open(img_path)
img = resize_short(img, target_size=256)
img = crop_image(img, target_size=DATA_DIM, center=True)
if img.mode != 'RGB':
img = img.convert('RGB')
img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255
img -= img_mean
img /= img_std
return img
def download_unzip():
int8_download = 'int8/download'
target_name = 'data'
cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
int8_download)
target_folder = os.path.join(cache_folder, target_name)
data_urls = []
data_md5s = []
data_urls.append(
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa'
)
data_md5s.append('60f6525b0e1d127f345641d75d41f0a8')
data_urls.append(
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab'
)
data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5')
file_names = []
for i in range(0, len(data_urls)):
download(data_urls[i], cache_folder, data_md5s[i])
file_names.append(data_urls[i].split('/')[-1])
zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz')
if not os.path.exists(zip_path):
cat_command = 'cat'
for file_name in file_names:
cat_command += ' ' + os.path.join(cache_folder, file_name)
cat_command += ' > ' + zip_path
os.system(cat_command)
        print('Data is downloaded at {0}\n'.format(zip_path))
if not os.path.exists(target_folder):
cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder, zip_path)
os.system(cmd)
print('Data is unzipped at {0}\n'.format(target_folder))
data_dir = os.path.join(target_folder, 'ILSVRC2012')
print('ILSVRC2012 full val set at {0}\n'.format(data_dir))
return data_dir
def reader():
data_dir = download_unzip()
file_list = os.path.join(data_dir, 'val_list.txt')
output_file = os.path.join(data_dir, 'int8_full_val.bin')
with open(file_list) as flist:
lines = [line.strip() for line in flist]
num_images = len(lines)
if not os.path.exists(output_file):
print(
'Preprocessing to binary file...<num_images><all images><all labels>...\n'
)
with open(output_file, "w+b") as of:
#save num_images(int64_t) to file
of.seek(0)
num = np.array(int(num_images)).astype('int64')
of.write(num.tobytes())
for idx, line in enumerate(lines):
img_path, label = line.split()
img_path = os.path.join(data_dir, img_path)
if not os.path.exists(img_path):
continue
#save image(float32) to file
img = process_image(
img_path, 'val', color_jitter=False, rotate=False)
np_img = np.array(img)
of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3
* idx)
of.write(np_img.astype('float32').tobytes())
#save label(int64_t) to file
                label_int = int(label)
np_label = np.array(label_int)
of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3
* num_images + idx * SIZE_INT64)
of.write(np_label.astype('int64').tobytes())
print('The preprocessed binary file path {}\n'.format(output_file))
if __name__ == '__main__':
reader()
...@@ -50,6 +50,7 @@ DEFINE_bool(use_analysis, true, ...@@ -50,6 +50,7 @@ DEFINE_bool(use_analysis, true,
DEFINE_bool(record_benchmark, false, DEFINE_bool(record_benchmark, false,
"Record benchmark after profiling the model"); "Record benchmark after profiling the model");
DEFINE_double(accuracy, 1e-3, "Result Accuracy."); DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy.");
DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
DECLARE_bool(profile); DECLARE_bool(profile);
...@@ -58,6 +59,19 @@ DECLARE_int32(paddle_num_threads); ...@@ -58,6 +59,19 @@ DECLARE_int32(paddle_num_threads);
namespace paddle { namespace paddle {
namespace inference { namespace inference {
template <typename T>
constexpr paddle::PaddleDType GetPaddleDType();
template <>
constexpr paddle::PaddleDType GetPaddleDType<int64_t>() {
return paddle::PaddleDType::INT64;
}
template <>
constexpr paddle::PaddleDType GetPaddleDType<float>() {
return paddle::PaddleDType::FLOAT32;
}
void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
const auto *analysis_config = const auto *analysis_config =
reinterpret_cast<const AnalysisConfig *>(config); reinterpret_cast<const AnalysisConfig *>(config);
...@@ -392,6 +406,32 @@ void TestPrediction(const PaddlePredictor::Config *config, ...@@ -392,6 +406,32 @@ void TestPrediction(const PaddlePredictor::Config *config,
} }
} }
void CompareTopAccuracy(const std::vector<PaddleTensor> &output_slots1,
const std::vector<PaddleTensor> &output_slots2) {
// first output: avg_cost
if (output_slots1.size() == 0 || output_slots2.size() == 0)
throw std::invalid_argument(
"CompareTopAccuracy: output_slots vector is empty.");
PADDLE_ENFORCE(output_slots1.size() >= 2UL);
PADDLE_ENFORCE(output_slots2.size() >= 2UL);
// second output: acc_top1
if (output_slots1[1].lod.size() > 0 || output_slots2[1].lod.size() > 0)
throw std::invalid_argument(
"CompareTopAccuracy: top1 accuracy output has nonempty LoD.");
if (output_slots1[1].dtype != paddle::PaddleDType::FLOAT32 ||
output_slots2[1].dtype != paddle::PaddleDType::FLOAT32)
throw std::invalid_argument(
"CompareTopAccuracy: top1 accuracy output is of a wrong type.");
float *top1_quantized = static_cast<float *>(output_slots1[1].data.data());
float *top1_reference = static_cast<float *>(output_slots2[1].data.data());
LOG(INFO) << "top1 INT8 accuracy: " << *top1_quantized;
LOG(INFO) << "top1 FP32 accuracy: " << *top1_reference;
LOG(INFO) << "Accepted accuracy drop threshold: " << FLAGS_quantized_accuracy;
CHECK_LE(std::abs(*top1_quantized - *top1_reference),
FLAGS_quantized_accuracy);
}
void CompareDeterministic( void CompareDeterministic(
const PaddlePredictor::Config *config, const PaddlePredictor::Config *config,
const std::vector<std::vector<PaddleTensor>> &inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
...@@ -421,6 +461,17 @@ void CompareNativeAndAnalysis( ...@@ -421,6 +461,17 @@ void CompareNativeAndAnalysis(
CompareResult(analysis_outputs, native_outputs); CompareResult(analysis_outputs, native_outputs);
} }
void CompareQuantizedAndAnalysis(
const PaddlePredictor::Config *config,
const PaddlePredictor::Config *qconfig,
const std::vector<std::vector<PaddleTensor>> &inputs) {
PrintConfig(config, true);
std::vector<PaddleTensor> analysis_outputs, quantized_outputs;
TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
TestOneThreadPrediction(qconfig, inputs, &quantized_outputs, true);
CompareTopAccuracy(quantized_outputs, analysis_outputs);
}
void CompareNativeAndAnalysis( void CompareNativeAndAnalysis(
PaddlePredictor *native_pred, PaddlePredictor *analysis_pred, PaddlePredictor *native_pred, PaddlePredictor *analysis_pred,
const std::vector<std::vector<PaddleTensor>> &inputs) { const std::vector<std::vector<PaddleTensor>> &inputs) {
......
...@@ -23,6 +23,16 @@ limitations under the License. */ ...@@ -23,6 +23,16 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
// Whether to use the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode in batch_norm.
// This mode can be faster in some tasks because an optimized path may be
// selected for the CUDNN_DATA_FLOAT and CUDNN_DATA_HALF data types on devices
// with compute capability 6.0 or higher. The flag defaults to false because
// this mode may use a scaled atomic integer reduction, which can cause a
// numerical overflow for certain input data ranges.
DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
            "Whether to enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn "
            "batch_norm, default is False.");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -76,7 +86,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T> ...@@ -76,7 +86,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
} }
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else #else
mode_ = CUDNN_BATCHNORM_SPATIAL; mode_ = CUDNN_BATCHNORM_SPATIAL;
#endif #endif
...@@ -302,7 +316,11 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T> ...@@ -302,7 +316,11 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
} }
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
#if CUDNN_VERSION_MIN(7, 0, 0) #if CUDNN_VERSION_MIN(7, 0, 0)
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; if (FLAGS_cudnn_batchnorm_spatial_persistent) {
mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
} else {
mode_ = CUDNN_BATCHNORM_SPATIAL;
}
#else #else
mode_ = CUDNN_BATCHNORM_SPATIAL; mode_ = CUDNN_BATCHNORM_SPATIAL;
#endif #endif
......
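For reference, a minimal sketch of how the new cudnn_batchnorm_spatial_persistent flag could be switched on from Python, assuming it is exposed through the FLAGS_* environment-variable mechanism that paddle.fluid reads at import time (whether this particular flag is whitelisted may differ between releases).
import os
# Assumption: FLAGS_* environment variables are picked up when paddle.fluid is imported.
os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = 'true'
import paddle.fluid as fluid  # the persistent cuDNN batch-norm mode is now requested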
...@@ -386,7 +386,7 @@ void BenchKernelSoftmax() { ...@@ -386,7 +386,7 @@ void BenchKernelSoftmax() {
RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f); RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* x_data = x.data<T>(); const T* x_data = x.data<T>();
T* y_data = y.mutable_data<T>(PlaceType()); T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KernelTuple, PlaceType>(n, x_data, y_data, n, bs); BenchAllImpls<KernelTuple, PlaceType>(n, x_data, y_data, n, bs, 1);
} }
} }
} }
......
...@@ -34,6 +34,7 @@ const char* to_string(KernelType kt) { ...@@ -34,6 +34,7 @@ const char* to_string(KernelType kt) {
ONE_CASE(kVAddRelu); ONE_CASE(kVAddRelu);
ONE_CASE(kVSub); ONE_CASE(kVSub);
ONE_CASE(kVScal); ONE_CASE(kVScal);
ONE_CASE(kStrideScal);
ONE_CASE(kVAddBias); ONE_CASE(kVAddBias);
ONE_CASE(kVRelu); ONE_CASE(kVRelu);
ONE_CASE(kVBroadcast); ONE_CASE(kVBroadcast);
...@@ -55,6 +56,7 @@ const char* to_string(KernelType kt) { ...@@ -55,6 +56,7 @@ const char* to_string(KernelType kt) {
ONE_CASE(kMatMul); ONE_CASE(kMatMul);
ONE_CASE(kHMax); ONE_CASE(kHMax);
ONE_CASE(kHSum); ONE_CASE(kHSum);
ONE_CASE(kStrideASum);
ONE_CASE(kSoftmax); ONE_CASE(kSoftmax);
ONE_CASE(kEmbSeqPool); ONE_CASE(kEmbSeqPool);
ONE_CASE(kSgd); ONE_CASE(kSgd);
......
...@@ -38,6 +38,8 @@ typedef enum { ...@@ -38,6 +38,8 @@ typedef enum {
kNCHW16CMulNC, kNCHW16CMulNC,
kSeqPool, kSeqPool,
kSoftmax, kSoftmax,
kStrideASum,
kStrideScal,
kVAdd, kVAdd,
kVAddBias, kVAddBias,
kVAddRelu, kVAddRelu,
...@@ -74,6 +76,14 @@ struct XYZNTuple { ...@@ -74,6 +76,14 @@ struct XYZNTuple {
template <typename T> template <typename T>
struct AXYNTuple : public XYZNTuple<T> {}; struct AXYNTuple : public XYZNTuple<T> {};
// a, x, y, n, stride
template <typename T>
struct AXYNSTuple {
typedef T data_type;
typedef int attr_type;
typedef void (*func_type)(const T*, const T*, T*, int, int);
};
// x, y, n // x, y, n
template <typename T> template <typename T>
struct XYNTuple { struct XYNTuple {
...@@ -86,6 +96,14 @@ struct XYNTuple { ...@@ -86,6 +96,14 @@ struct XYNTuple {
template <typename T> template <typename T>
struct XRNTuple : public XYNTuple<T> {}; struct XRNTuple : public XYNTuple<T> {};
// x, returned value, n, stride
template <typename T>
struct XRNSTuple {
typedef T data_type;
typedef int attr_type;
typedef void (*func_type)(const T*, T*, int, int);
};
#define DECLARE_KERNELTUPLE(kernel_tuple, type) \ #define DECLARE_KERNELTUPLE(kernel_tuple, type) \
template <typename T> \ template <typename T> \
struct type##Tuple : public kernel_tuple<T> { \ struct type##Tuple : public kernel_tuple<T> { \
...@@ -101,6 +119,8 @@ DECLARE_KERNELTUPLE(XYZNTuple, VSub); ...@@ -101,6 +119,8 @@ DECLARE_KERNELTUPLE(XYZNTuple, VSub);
DECLARE_KERNELTUPLE(AXYNTuple, VScal); DECLARE_KERNELTUPLE(AXYNTuple, VScal);
DECLARE_KERNELTUPLE(AXYNTuple, VAddBias); DECLARE_KERNELTUPLE(AXYNTuple, VAddBias);
DECLARE_KERNELTUPLE(AXYNSTuple, StrideScal);
DECLARE_KERNELTUPLE(XYNTuple, VRelu); DECLARE_KERNELTUPLE(XYNTuple, VRelu);
DECLARE_KERNELTUPLE(XYNTuple, VIdentity); DECLARE_KERNELTUPLE(XYNTuple, VIdentity);
DECLARE_KERNELTUPLE(XYNTuple, VSquare); DECLARE_KERNELTUPLE(XYNTuple, VSquare);
...@@ -112,6 +132,8 @@ DECLARE_KERNELTUPLE(XYNTuple, VCopy); ...@@ -112,6 +132,8 @@ DECLARE_KERNELTUPLE(XYNTuple, VCopy);
DECLARE_KERNELTUPLE(XRNTuple, HMax); DECLARE_KERNELTUPLE(XRNTuple, HMax);
DECLARE_KERNELTUPLE(XRNTuple, HSum); DECLARE_KERNELTUPLE(XRNTuple, HSum);
DECLARE_KERNELTUPLE(XRNSTuple, StrideASum);
typedef struct { typedef struct {
void* gates; // gates: x_ch, x_ih, x_fh, x_oh void* gates; // gates: x_ch, x_ih, x_fh, x_oh
const void* ct_1; const void* ct_1;
...@@ -285,7 +307,7 @@ struct SoftmaxTuple { ...@@ -285,7 +307,7 @@ struct SoftmaxTuple {
static constexpr KernelType kernel_type = kSoftmax; static constexpr KernelType kernel_type = kSoftmax;
typedef T data_type; typedef T data_type;
typedef int attr_type; typedef int attr_type;
typedef void (*func_type)(const T*, T*, int, int); typedef void (*func_type)(const T*, T*, int, int, int);
}; };
// nChw16c = nChw16c .* NC // nChw16c = nChw16c .* NC
......
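To make the two strided tuple signatures above concrete, here is a small NumPy sketch of the intended semantics (an illustration only, not the jit kernels themselves): StrideASum accumulates absolute values over every stride-th element, and StrideScal scales only those elements while leaving the rest untouched.
import numpy as np

x = np.arange(-3, 3, dtype='float32')    # n = 6
stride = 2
a = np.float32(0.5)
stride_asum = np.abs(x[::stride]).sum()  # StrideASum over x[0], x[2], x[4]
y = x.copy()
y[::stride] *= a                         # StrideScal with x == y (in-place form)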
...@@ -50,10 +50,15 @@ void VTanh(const T* x, T* y, int n) { ...@@ -50,10 +50,15 @@ void VTanh(const T* x, T* y, int n) {
compute_addbias(&b, y, y, n); compute_addbias(&b, y, y, n);
} }
void Softmax(const T* x, T* y, int n, int bs) { // remain is the product of dimension shapes after the axis dimension
void Softmax(const T* x, T* y, int n, int bs, int remain) {
auto compute_hmax = KernelFuncs<HMaxTuple<T>, CPUPlace>::Cache().At(n); auto compute_hmax = KernelFuncs<HMaxTuple<T>, CPUPlace>::Cache().At(n);
auto compute_hsum = KernelFuncs<HSumTuple<T>, CPUPlace>::Cache().At(n); auto compute_hsum = KernelFuncs<HSumTuple<T>, CPUPlace>::Cache().At(n);
auto compute_vscal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n); auto compute_vscal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
auto compute_strideasum =
KernelFuncs<StrideASumTuple<T>, CPUPlace>::Cache().At(n);
auto compute_stridescal =
KernelFuncs<StrideScalTuple<T>, CPUPlace>::Cache().At(n);
auto compute_vaddbias = auto compute_vaddbias =
KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n); KernelFuncs<VAddBiasTuple<T>, CPUPlace>::Cache().At(n);
auto compute_vexp = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n); auto compute_vexp = KernelFuncs<VExpTuple<T>, CPUPlace>::Cache().At(n);
...@@ -64,9 +69,17 @@ void Softmax(const T* x, T* y, int n, int bs) { ...@@ -64,9 +69,17 @@ void Softmax(const T* x, T* y, int n, int bs) {
scalar = static_cast<T>(0) - scalar; scalar = static_cast<T>(0) - scalar;
compute_vaddbias(&scalar, x, y, n); // x - max compute_vaddbias(&scalar, x, y, n); // x - max
compute_vexp(y, y, n); compute_vexp(y, y, n);
compute_hsum(y, &scalar, n); if (remain == 1) {
scalar = static_cast<T>(1) / scalar; compute_hsum(y, &scalar, n);
compute_vscal(&scalar, y, y, n); scalar = static_cast<T>(1) / scalar;
compute_vscal(&scalar, y, y, n);
} else {
for (int j = 0; j < remain; ++j) {
compute_strideasum(&y[j], &scalar, n, remain);
scalar = static_cast<T>(1) / scalar;
compute_stridescal(&scalar, &y[j], &y[j], n, remain);
}
}
x += n; x += n;
y += n; y += n;
} }
......
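The remain argument introduced above is easiest to see with a shaped example. The following NumPy sketch (an illustration of the semantics, not Paddle code) computes the axis-wise softmax that the strided path implements for a [2, 3, 4] input with the softmax axis at dimension 1, i.e. bs = 2, n = 3 * 4 = 12 and remain = 4.
import numpy as np

x = np.random.rand(2, 3, 4).astype('float32')
axis = 1                              # axis_dim = 3, remain = 4, bs = 2
y = np.exp(x - x.max(axis=axis, keepdims=True))
y /= y.sum(axis=axis, keepdims=True)  # per-column StrideASum followed by StrideScal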
...@@ -26,7 +26,7 @@ using T = float; ...@@ -26,7 +26,7 @@ using T = float;
void VSigmoid(const T* x, T* y, int n); void VSigmoid(const T* x, T* y, int n);
void VTanh(const T* x, T* y, int n); void VTanh(const T* x, T* y, int n);
void Softmax(const T* x, T* y, int n, int bs); void Softmax(const T* x, T* y, int n, int bs, int remain);
void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr); void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr);
void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr); void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr);
......
...@@ -7,6 +7,7 @@ USE_JITKERNEL_MORE(kMatMul, mkl) ...@@ -7,6 +7,7 @@ USE_JITKERNEL_MORE(kMatMul, mkl)
USE_JITKERNEL_MORE(kVMul, mkl) USE_JITKERNEL_MORE(kVMul, mkl)
USE_JITKERNEL_MORE(kVAdd, mkl) USE_JITKERNEL_MORE(kVAdd, mkl)
USE_JITKERNEL_MORE(kVScal, mkl) USE_JITKERNEL_MORE(kVScal, mkl)
USE_JITKERNEL_MORE(kStrideScal, mkl)
USE_JITKERNEL_MORE(kVExp, mkl) USE_JITKERNEL_MORE(kVExp, mkl)
USE_JITKERNEL_MORE(kVSquare, mkl) USE_JITKERNEL_MORE(kVSquare, mkl)
USE_JITKERNEL_MORE(kVCopy, mkl) USE_JITKERNEL_MORE(kVCopy, mkl)
......
...@@ -78,6 +78,26 @@ void VScal<double>(const double* a, const double* x, double* y, int n) { ...@@ -78,6 +78,26 @@ void VScal<double>(const double* a, const double* x, double* y, int n) {
} }
} }
template <>
void StrideScal<float>(const float* a, const float* x, float* y, int n,
int stride) {
if (x == y) {
platform::dynload::cblas_sscal(n / stride, *a, y, stride);
} else {
refer::StrideScal<float>(a, x, y, n, stride);
}
}
template <>
void StrideScal<double>(const double* a, const double* x, double* y, int n,
int stride) {
if (x == y) {
platform::dynload::cblas_dscal(n / stride, *a, y, stride);
} else {
refer::StrideScal<double>(a, x, y, n, stride);
}
}
template <> template <>
void VExp<float>(const float* x, float* y, int n) { void VExp<float>(const float* x, float* y, int n) {
platform::dynload::vsExp(n, x, y); platform::dynload::vsExp(n, x, y);
...@@ -128,6 +148,16 @@ void ASum<double>(const double* x, double* res, int n) { ...@@ -128,6 +148,16 @@ void ASum<double>(const double* x, double* res, int n) {
res[0] = platform::dynload::cblas_dasum(n, x, 1); res[0] = platform::dynload::cblas_dasum(n, x, 1);
} }
template <>
void StrideASum<float>(const float* x, float* res, int n, int stride) {
res[0] = platform::dynload::cblas_sasum(n / stride, x, stride);
}
template <>
void StrideASum<double>(const double* x, double* res, int n, int stride) {
res[0] = platform::dynload::cblas_dasum(n / stride, x, stride);
}
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template <> template <>
bool VMulKernel<float>::CanBeUsed(const int& d) const { bool VMulKernel<float>::CanBeUsed(const int& d) const {
...@@ -144,6 +174,11 @@ bool VScalKernel<float>::CanBeUsed(const int& d) const { ...@@ -144,6 +174,11 @@ bool VScalKernel<float>::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx512f) && d > 512; return platform::MayIUse(platform::avx512f) && d > 512;
} }
template <>
bool StrideScalKernel<float>::CanBeUsed(const int& d) const {
return true;
}
template <> template <>
bool VExpKernel<float>::CanBeUsed(const int& d) const { bool VExpKernel<float>::CanBeUsed(const int& d) const {
return d > 7; return d > 7;
...@@ -235,6 +270,7 @@ bool SoftmaxKernel<float>::CanBeUsed(const int& d) const { ...@@ -235,6 +270,7 @@ bool SoftmaxKernel<float>::CanBeUsed(const int& d) const {
AWALYS_USE_ME_WITH_DOUBLE(VMul); AWALYS_USE_ME_WITH_DOUBLE(VMul);
AWALYS_USE_ME_WITH_DOUBLE(VAdd); AWALYS_USE_ME_WITH_DOUBLE(VAdd);
AWALYS_USE_ME_WITH_DOUBLE(VScal); AWALYS_USE_ME_WITH_DOUBLE(VScal);
AWALYS_USE_ME_WITH_DOUBLE(StrideScal);
AWALYS_USE_ME_WITH_DOUBLE(VExp); AWALYS_USE_ME_WITH_DOUBLE(VExp);
AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
AWALYS_USE_ME_WITH_DOUBLE(VTanh); AWALYS_USE_ME_WITH_DOUBLE(VTanh);
...@@ -259,6 +295,7 @@ REGISTER_MKL_KERNEL(MatMul); ...@@ -259,6 +295,7 @@ REGISTER_MKL_KERNEL(MatMul);
REGISTER_MKL_KERNEL(VMul); REGISTER_MKL_KERNEL(VMul);
REGISTER_MKL_KERNEL(VAdd); REGISTER_MKL_KERNEL(VAdd);
REGISTER_MKL_KERNEL(VScal); REGISTER_MKL_KERNEL(VScal);
REGISTER_MKL_KERNEL(StrideScal);
REGISTER_MKL_KERNEL(VExp); REGISTER_MKL_KERNEL(VExp);
REGISTER_MKL_KERNEL(VSquare); REGISTER_MKL_KERNEL(VSquare);
REGISTER_MKL_KERNEL(VCopy); REGISTER_MKL_KERNEL(VCopy);
......
...@@ -129,7 +129,14 @@ template <typename T> ...@@ -129,7 +129,14 @@ template <typename T>
void ASum(const T* x, T* res, int n); void ASum(const T* x, T* res, int n);
template <typename T> template <typename T>
void Softmax(const T* x, T* y, int n, int bs) { void StrideASum(const T* x, T* res, int n, int stride);
template <typename T>
void StrideScal(const T* a, const T* x, T* y, int n, int stride);
// remain is the product of dimension shapes after the axis dimension
template <typename T>
void Softmax(const T* x, T* y, int n, int bs, int remain = 1) {
std::vector<T> entities(bs); std::vector<T> entities(bs);
for (int i = 0; i < bs; ++i) { for (int i = 0; i < bs; ++i) {
entities[i] = x[i * n]; entities[i] = x[i * n];
...@@ -143,9 +150,17 @@ void Softmax(const T* x, T* y, int n, int bs) { ...@@ -143,9 +150,17 @@ void Softmax(const T* x, T* y, int n, int bs) {
VExp(y, y, n * bs); VExp(y, y, n * bs);
for (int i = 0; i < bs; ++i) { for (int i = 0; i < bs; ++i) {
T sum; T sum;
ASum(&y[i * n], &sum, n); if (remain == 1) {
sum = static_cast<T>(1) / sum; ASum(&y[i * n], &sum, n);
VScal(&sum, &y[i * n], &y[i * n], n); sum = static_cast<T>(1) / sum;
VScal(&sum, &y[i * n], &y[i * n], n);
} else {
for (int j = 0; j < remain; ++j) {
StrideASum(&y[i * n + j], &sum, n, remain);
sum = static_cast<T>(1) / sum;
StrideScal(&sum, &y[i * n + j], &y[i * n + j], n, remain);
}
}
} }
} }
...@@ -193,6 +208,7 @@ DECLARE_MKL_KERNEL(VAdd); ...@@ -193,6 +208,7 @@ DECLARE_MKL_KERNEL(VAdd);
// AXYN // AXYN
DECLARE_MKL_KERNEL(VScal); DECLARE_MKL_KERNEL(VScal);
DECLARE_MKL_KERNEL(StrideScal);
// XYN // XYN
DECLARE_MKL_KERNEL(VExp); DECLARE_MKL_KERNEL(VExp);
......
...@@ -12,6 +12,7 @@ USE_JITKERNEL_REFER(kVAdd) ...@@ -12,6 +12,7 @@ USE_JITKERNEL_REFER(kVAdd)
USE_JITKERNEL_REFER(kVAddRelu) USE_JITKERNEL_REFER(kVAddRelu)
USE_JITKERNEL_REFER(kVSub) USE_JITKERNEL_REFER(kVSub)
USE_JITKERNEL_REFER(kVScal) USE_JITKERNEL_REFER(kVScal)
USE_JITKERNEL_REFER(kStrideScal)
USE_JITKERNEL_REFER(kVAddBias) USE_JITKERNEL_REFER(kVAddBias)
USE_JITKERNEL_REFER(kVCopy) USE_JITKERNEL_REFER(kVCopy)
USE_JITKERNEL_REFER(kVRelu) USE_JITKERNEL_REFER(kVRelu)
...@@ -32,6 +33,7 @@ USE_JITKERNEL_REFER(kMatMul) ...@@ -32,6 +33,7 @@ USE_JITKERNEL_REFER(kMatMul)
USE_JITKERNEL_REFER(kVSquare) USE_JITKERNEL_REFER(kVSquare)
USE_JITKERNEL_REFER(kHSum) USE_JITKERNEL_REFER(kHSum)
USE_JITKERNEL_REFER(kHMax) USE_JITKERNEL_REFER(kHMax)
USE_JITKERNEL_REFER(kStrideASum)
USE_JITKERNEL_REFER(kSoftmax) USE_JITKERNEL_REFER(kSoftmax)
USE_JITKERNEL_REFER(kEmbSeqPool) USE_JITKERNEL_REFER(kEmbSeqPool)
USE_JITKERNEL_REFER(kSgd) USE_JITKERNEL_REFER(kSgd)
......
...@@ -27,6 +27,7 @@ REGISTER_REFER_KERNEL(VAddRelu); ...@@ -27,6 +27,7 @@ REGISTER_REFER_KERNEL(VAddRelu);
REGISTER_REFER_KERNEL(VSub); REGISTER_REFER_KERNEL(VSub);
REGISTER_REFER_KERNEL(VScal); REGISTER_REFER_KERNEL(VScal);
REGISTER_REFER_KERNEL(StrideScal);
REGISTER_REFER_KERNEL(VAddBias); REGISTER_REFER_KERNEL(VAddBias);
REGISTER_REFER_KERNEL(VRelu); REGISTER_REFER_KERNEL(VRelu);
...@@ -51,6 +52,7 @@ REGISTER_REFER_KERNEL(SeqPool); ...@@ -51,6 +52,7 @@ REGISTER_REFER_KERNEL(SeqPool);
REGISTER_REFER_KERNEL(MatMul); REGISTER_REFER_KERNEL(MatMul);
REGISTER_REFER_KERNEL(HMax); REGISTER_REFER_KERNEL(HMax);
REGISTER_REFER_KERNEL(HSum); REGISTER_REFER_KERNEL(HSum);
REGISTER_REFER_KERNEL(StrideASum);
REGISTER_REFER_KERNEL(Softmax); REGISTER_REFER_KERNEL(Softmax);
REGISTER_REFER_KERNEL(EmbSeqPool); REGISTER_REFER_KERNEL(EmbSeqPool);
REGISTER_REFER_KERNEL(Sgd); REGISTER_REFER_KERNEL(Sgd);
......
...@@ -411,19 +411,47 @@ void HSum(const T* x, T* res, int n) { ...@@ -411,19 +411,47 @@ void HSum(const T* x, T* res, int n) {
} }
} }
template <typename T>
void StrideASum(const T* x, T* res, int n, int stride) {
res[0] = std::abs(x[0]);  // absolute sum, consistent with the MKL cblas_?asum path
for (int i = stride; i < n; i += stride) {
res[0] += std::abs(x[i]);
}
}
template <typename T>
void StrideScal(const T* a, const T* x, T* y, int n, int stride) {
for (int i = 0; i < n; ++i) {
if (i % stride == 0) {
y[i] = x[i] * a[0];
} else {
y[i] = x[i];
}
}
}
// y = e^(x - max(x)) // y = e^(x - max(x))
// y = y / sum(y) // y = y / sum(y)
// remain is the product of dimension shapes after the axis dimension
template <typename T> template <typename T>
void Softmax(const T* x, T* y, int n, int bs = 1) { void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) {
for (int i = 0; i < bs; ++i) { for (int i = 0; i < bs; ++i) {
T scalar; T scalar;
HMax(x, &scalar, n); HMax(x, &scalar, n);
scalar = static_cast<T>(0) - scalar; scalar = static_cast<T>(0) - scalar;
VAddBias(&scalar, x, y, n); // x - max VAddBias(&scalar, x, y, n); // x - max
VExp(y, y, n); VExp(y, y, n);
HSum(y, &scalar, n); if (remain == 1) {
scalar = static_cast<T>(1) / scalar; HSum(y, &scalar, n);
VScal(&scalar, y, y, n); scalar = static_cast<T>(1) / scalar;
VScal(&scalar, y, y, n);
} else {
for (int j = 0; j < remain; j++) {
StrideASum(&y[j], &scalar, n, remain);
scalar = static_cast<T>(1) / scalar;
StrideScal(&scalar, &y[j], &y[j], n, remain);
}
}
x += n; x += n;
y += n; y += n;
} }
...@@ -507,6 +535,9 @@ DECLARE_REFER_KERNEL(VSub); ...@@ -507,6 +535,9 @@ DECLARE_REFER_KERNEL(VSub);
DECLARE_REFER_KERNEL(VScal); DECLARE_REFER_KERNEL(VScal);
DECLARE_REFER_KERNEL(VAddBias); DECLARE_REFER_KERNEL(VAddBias);
// const T* a, const T* x, T* y, int n, int stride
DECLARE_REFER_KERNEL(StrideScal);
// const T* x, T* y, int n // const T* x, T* y, int n
DECLARE_REFER_KERNEL(VRelu); DECLARE_REFER_KERNEL(VRelu);
DECLARE_REFER_KERNEL(VIdentity); DECLARE_REFER_KERNEL(VIdentity);
...@@ -528,6 +559,8 @@ DECLARE_REFER_KERNEL(GRUHtPart2); ...@@ -528,6 +559,8 @@ DECLARE_REFER_KERNEL(GRUHtPart2);
DECLARE_REFER_KERNEL(HMax); DECLARE_REFER_KERNEL(HMax);
DECLARE_REFER_KERNEL(HSum); DECLARE_REFER_KERNEL(HSum);
DECLARE_REFER_KERNEL(StrideASum);
// others // others
DECLARE_REFER_KERNEL(CRFDecoding); DECLARE_REFER_KERNEL(CRFDecoding);
DECLARE_REFER_KERNEL(LayerNorm); DECLARE_REFER_KERNEL(LayerNorm);
......
...@@ -723,39 +723,122 @@ void TestKernelSoftmax() { ...@@ -723,39 +723,122 @@ void TestKernelSoftmax() {
VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type); VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
for (int bs : {1, 2, 10}) { for (int bs : {1, 2, 10}) {
for (int n : TestSizes()) { for (int n : TestSizes()) {
for (int m : {1, 2, 3}) { // remain
if (m > n || n % m != 0) {
continue;
}
auto ref = jit::GetReferFunc<KernelTuple>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(bs * n), y(bs * n);
RandomVec<T>(bs * n, x.data());
const T* x_data = x.data();
T* y_data = y.data();
std::vector<T> xinp(x.size()); // inplace test
std::copy(x.begin(), x.end(), xinp.begin());
ref(x_data, y_data, n, bs, m);
T* xinp_data = xinp.data();
ref(xinp_data, xinp_data, n, bs, m);
ExpectEQ<T>(xinp_data, y_data, n * bs);
auto verifier = [](const typename KernelTuple::func_type tgt,
const std::vector<T>& x, const std::vector<T>& yref,
int n, int bs, int m) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(yref.size(), x.size());
EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));
const T* x_data = x.data();
const T* yref_data = yref.data();
std::vector<T> ytgt(n * bs);
T* ytgt_data = ytgt.data();
// test normal
tgt(x_data, ytgt_data, n, bs, m);
ExpectEQ<T>(ytgt_data, yref_data, n * bs);
// test inplace x
std::copy(x.begin(), x.end(), ytgt.begin());
tgt(ytgt_data, ytgt_data, n, bs, m);
ExpectEQ<T>(ytgt_data, yref_data, n * bs);
};
TestAllImpls<KernelTuple, PlaceType>(n, verifier, x, y, n, bs, m);
}
}
}
}
template <typename KernelTuple, typename PlaceType>
void TestKernelStrideASum() {
using T = typename KernelTuple::data_type;
VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
for (int d : TestSizes()) {
for (int m : {1, 2, 3}) { // stride
if (m > d || d % m != 0) {
continue;
}
auto ref = jit::GetReferFunc<KernelTuple>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(d);
RandomVec<T>(d, x.data());
T ref_res;
ref(x.data(), &ref_res, d, m);
auto verifier = [](const typename KernelTuple::func_type tgt,
const std::vector<T>& x, const T ref_res,
const int m) {
EXPECT_TRUE(tgt != nullptr);
T tgt_res;
tgt(x.data(), &tgt_res, x.size(), m);
ExpectEQ<T>(&tgt_res, &ref_res, 1);
};
TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, ref_res, m);
}
}
}
template <typename KernelTuple, typename PlaceType>
void TestKernelStrideScal() {
using T = typename KernelTuple::data_type;
VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
for (int d : TestSizes()) {
for (int m : {1, 2, 3}) { // stride
if (m > d || d % m != 0) {
continue;
}
auto ref = jit::GetReferFunc<KernelTuple>(); auto ref = jit::GetReferFunc<KernelTuple>();
EXPECT_TRUE(ref != nullptr); EXPECT_TRUE(ref != nullptr);
std::vector<T> x(bs * n), y(bs * n);
RandomVec<T>(bs * n, x.data());
const T* x_data = x.data();
T* y_data = y.data();
std::vector<T> xinp(x.size()); // inplace test const T a = static_cast<T>(3);
std::vector<T> x(d), yref(d);
std::vector<T> xinp(d); // inplace test
RandomVec<T>(d, x.data());
std::copy(x.begin(), x.end(), xinp.begin()); std::copy(x.begin(), x.end(), xinp.begin());
ref(x_data, y_data, n, bs);
const T* x_data = x.data();
T* yref_data = yref.data();
T* xinp_data = xinp.data(); T* xinp_data = xinp.data();
ref(xinp_data, xinp_data, n, bs); // test refer code inplace
ExpectEQ<T>(xinp_data, y_data, n * bs); ref(&a, x_data, yref_data, d, m);
ref(&a, xinp_data, xinp_data, d, m);
ExpectEQ<T>(xinp_data, yref_data, d);
auto verifier = [](const typename KernelTuple::func_type tgt, auto verifier = [](const typename KernelTuple::func_type tgt, const T a,
const std::vector<T>& x, const std::vector<T>& yref, const std::vector<T>& x, const std::vector<T>& yref,
int n, int bs) { const int m) {
EXPECT_TRUE(tgt != nullptr); EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(yref.size(), x.size()); EXPECT_EQ(yref.size(), x.size());
EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));
const T* x_data = x.data(); const T* x_data = x.data();
const T* yref_data = yref.data(); const T* yref_data = yref.data();
std::vector<T> ytgt(n * bs); const int d = yref.size();
std::vector<T> ytgt(d);
T* ytgt_data = ytgt.data(); T* ytgt_data = ytgt.data();
// test normal // test normal
tgt(x_data, ytgt_data, n, bs); tgt(&a, x_data, ytgt_data, d, m);
ExpectEQ<T>(ytgt_data, yref_data, n * bs); ExpectEQ<T>(ytgt_data, yref_data, d);
// test inplace x // test inplace x
std::copy(x.begin(), x.end(), ytgt.begin()); std::copy(x.begin(), x.end(), ytgt.begin());
tgt(ytgt_data, ytgt_data, n, bs); tgt(&a, ytgt_data, ytgt_data, d, m);
ExpectEQ<T>(ytgt_data, yref_data, n * bs); ExpectEQ<T>(ytgt_data, yref_data, d);
}; };
TestAllImpls<KernelTuple, PlaceType>(n, verifier, x, y, n, bs); TestAllImpls<KernelTuple, PlaceType>(d, verifier, a, x, yref, m);
} }
} }
} }
...@@ -912,7 +995,7 @@ TEST(JITKernel_pool, more) { ...@@ -912,7 +995,7 @@ TEST(JITKernel_pool, more) {
EXPECT_EQ(kers.size(), 10UL); EXPECT_EQ(kers.size(), 10UL);
#else #else
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
EXPECT_EQ(kers.size(), 21UL); EXPECT_EQ(kers.size(), 22UL);
#else #else
EXPECT_EQ(kers.size(), 8UL); EXPECT_EQ(kers.size(), 8UL);
#endif #endif
...@@ -921,7 +1004,7 @@ TEST(JITKernel_pool, more) { ...@@ -921,7 +1004,7 @@ TEST(JITKernel_pool, more) {
TEST(JITKernel_pool, refer) { TEST(JITKernel_pool, refer) {
const auto& kers = jit::ReferKernelPool::Instance().AllKernels(); const auto& kers = jit::ReferKernelPool::Instance().AllKernels();
EXPECT_EQ(kers.size(), 29UL); EXPECT_EQ(kers.size(), 31UL);
} }
// test helper // test helper
...@@ -1292,3 +1375,6 @@ TEST_CPU_KERNEL(MatMul); ...@@ -1292,3 +1375,6 @@ TEST_CPU_KERNEL(MatMul);
TEST_CPU_KERNEL(Softmax); TEST_CPU_KERNEL(Softmax);
TEST_CPU_KERNEL(Sgd); TEST_CPU_KERNEL(Sgd);
TEST_CPU_KERNEL(VBroadcast); TEST_CPU_KERNEL(VBroadcast);
TEST_CPU_KERNEL(StrideASum);
TEST_CPU_KERNEL(StrideScal);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/kldiv_loss_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class KLDivLossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of KLDivLossOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Target"),
"Input(Target) of KLDivLossOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Loss"),
"Output(Loss) of KLDivLossOp should not be null.");
auto dim_x = ctx->GetInputDim("X");
auto dim_target = ctx->GetInputDim("Target");
PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(),
"Input(X) rank and Input(Target) rank should be same.");
for (int i = 0; i < dim_x.size(); i++) {
PADDLE_ENFORCE_EQ(dim_x[i], dim_target[i],
"Input(X) and Input(Target) should in same shape.");
}
auto reduction = ctx->Attrs().Get<std::string>("reduction");
PADDLE_ENFORCE(
"mean" == reduction || "sum" == reduction || "batchmean" == reduction ||
"none" == reduction,
"Attr(reduction) can only be 'none'|'batchmean'|'sum'|'mean'.");
if ("none" == reduction) {
ctx->SetOutputDim("Loss", dim_x);
} else {
ctx->SetOutputDim("Loss", {1});
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace());
}
};
class KLDivLossOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor of KL divergence loss operator. "
"This is a tensor with shape of [N, *], where N is the "
"batch size, * means any number of additional dimensions.");
AddInput("Target",
"The tensor of KL divergence loss operator. "
"This is a tensor with shape of Input(X).");
AddOutput(
"Loss",
"The output KL divergence loss tensor. if Attr(reduction) is "
"'none', this tensor should be in same shape of of Input(X), else "
"this tensor should be in shape of [1].");
AddAttr<std::string>(
"reduction",
"The reduction type to apply to the output, available types "
"are 'none' | 'batchmean' | 'mean' | 'sum', 'none' for no "
"reduction, 'batchmean' for the sum of output divided by "
"batch size, 'mean' for the average value of all output, "
"'sum' for the sum of the output.")
.SetDefault("mean");
AddComment(R"DOC(
This operator calculates the Kullback-Leibler divergence loss
between Input(X) and Input(Target).
KL divergence loss is calculated as follows:
$$l(x, y) = y * (\log(y) - x)$$
where :math:`x` is Input(X) and :math:`y` is Input(Target).
When :attr:`reduction` is :attr:`none`, the output loss has
the same shape as Input(X); the loss at each point is calculated
separately and no reduction is applied.
When :attr:`reduction` is :attr:`mean`, the output loss has
shape [1] and its value is the mean of all point-wise losses.
When :attr:`reduction` is :attr:`sum`, the output loss has
shape [1] and its value is the sum of all point-wise losses.
When :attr:`reduction` is :attr:`batchmean`, the output loss has
shape [1] and its value is the sum of all point-wise losses
divided by the batch size.
)DOC");
}
};
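A short NumPy illustration of the loss formula and the 'batchmean' reduction documented above (plain NumPy, not the operator code; like the kernel, it treats non-positive targets as contributing zero loss).
import numpy as np

x = np.log(np.array([[0.2, 0.3, 0.5],
                     [0.1, 0.6, 0.3]], dtype='float32'))  # Input(X): log-probabilities
y = np.array([[0.1, 0.2, 0.7],
              [0.3, 0.3, 0.4]], dtype='float32')          # Input(Target)
point_loss = np.where(y > 0, y * (np.log(y) - x), 0.0)    # l(x, y) = y * (log(y) - x)
print(point_loss.sum() / x.shape[0])                      # 'batchmean' reduction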
class KLDivLossOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput("Target"), "Input(Target) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
"Input(Loss@GRAD) should not be null");
auto dim_x = ctx->GetInputDim("X");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace());
}
};
class KLDivLossOpGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("kldiv_loss_grad");
op->SetInput("X", Input("X"));
op->SetInput("Target", Input("Target"));
op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker,
ops::KLDivLossOpGradMaker);
REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad);
REGISTER_OP_CPU_KERNEL(
kldiv_loss, ops::KLDivLossKernel<paddle::platform::CPUDeviceContext, float>,
ops::KLDivLossKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
kldiv_loss_grad,
ops::KLDivLossGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::KLDivLossGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/kldiv_loss_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
kldiv_loss,
ops::KLDivLossKernel<paddle::platform::CUDADeviceContext, float>,
ops::KLDivLossKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
kldiv_loss_grad,
ops::KLDivLossGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::KLDivLossGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
using Array1 = Eigen::DSizes<int64_t, 1>;
template <typename T>
struct KLDivLossForward {
HOSTDEVICE KLDivLossForward() {}
HOSTDEVICE T operator()(const T& target, const T& input) const {
if (target <= 0) {
return 0;
} else {
return target * (std::log(target) - input);
}
}
};
template <typename T>
struct KLDivLossBackward {
HOSTDEVICE KLDivLossBackward() {}
HOSTDEVICE T operator()(const T& target, const T& grad) const {
if (target <= 0) {
return 0;
} else {
return static_cast<T>(-1.) * grad;
}
}
};
template <typename DeviceContext, typename T>
class KLDivLossKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto* input = ctx.Input<Tensor>("X");
auto* target = ctx.Input<Tensor>("Target");
auto* loss = ctx.Output<Tensor>("Loss");
auto reduction = ctx.Attr<std::string>("reduction");
const int n = input->dims()[0];
loss->mutable_data<T>(ctx.GetPlace());
auto input_t = EigenVector<T>::Flatten(*input);
auto target_t = EigenVector<T>::Flatten(*target);
auto loss_t = EigenVector<T>::Flatten(*loss);
auto output = target_t.binaryExpr(input_t, KLDivLossForward<T>());
if ("none" == reduction) {
loss_t.device(place) = output;
} else if ("batchmean" == reduction) {
auto output_sum = output.sum().eval();
loss_t.device(place) = output_sum / output_sum.constant(n);
} else if ("mean" == reduction) {
loss_t.device(place) = output.mean();
} else if ("sum" == reduction) {
loss_t.device(place) = output.sum();
}
}
};
template <typename DeviceContext, typename T>
class KLDivLossGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto* target = ctx.Input<Tensor>("Target");
auto reduction = ctx.Attr<std::string>("reduction");
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
const int n = input_grad->dims()[0];
const int numel = input_grad->numel();
const int expand = numel / loss_grad->numel();
input_grad->mutable_data<T>(ctx.GetPlace());
auto target_t = EigenVector<T>::Flatten(*target);
auto input_grad_t = EigenVector<T>::Flatten(*input_grad);
auto loss_grad_t = EigenVector<T>::Flatten(*loss_grad);
auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand));
auto grad_t = target_t * loss_grad_expand;
input_grad_t.device(place) =
target_t.binaryExpr(grad_t, KLDivLossBackward<T>());
if ("mean" == reduction) {
input_grad_t.device(place) = input_grad_t / static_cast<T>(numel);
} else if ("batchmean" == reduction) {
input_grad_t.device(place) = input_grad_t / static_cast<T>(n);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -23,15 +23,16 @@ template <typename DeviceContext, typename T, bool is_test, ...@@ -23,15 +23,16 @@ template <typename DeviceContext, typename T, bool is_test,
typename Enable = void> typename Enable = void>
class SoftmaxFunctor { class SoftmaxFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor* X, void operator()(const DeviceContext& context, const int axis_dim,
framework::Tensor* Y); const framework::Tensor* X, framework::Tensor* Y);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class SoftmaxGradFunctor { class SoftmaxGradFunctor {
public: public:
void operator()(const DeviceContext& context, const framework::Tensor* y, void operator()(const DeviceContext& context, const int axis_dim,
const framework::Tensor* y_grad, framework::Tensor* x_grad); const framework::Tensor* y, const framework::Tensor* y_grad,
framework::Tensor* x_grad);
}; };
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -36,8 +36,8 @@ struct ValueClip { ...@@ -36,8 +36,8 @@ struct ValueClip {
template <typename DeviceContext, typename T, bool is_test, typename Enable> template <typename DeviceContext, typename T, bool is_test, typename Enable>
void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()( void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()(
const DeviceContext& context, const framework::Tensor* X, const DeviceContext& context, const int axis_dim,
framework::Tensor* Y) { const framework::Tensor* X, framework::Tensor* Y) {
auto logits = EigenMatrix<T>::From(*X); auto logits = EigenMatrix<T>::From(*X);
auto softmax = EigenMatrix<T>::From(*Y); auto softmax = EigenMatrix<T>::From(*Y);
...@@ -46,10 +46,13 @@ void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()( ...@@ -46,10 +46,13 @@ void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()(
const int batch_size = logits.dimension(kBatchDim); const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim); const int num_classes = logits.dimension(kClassDim);
const int num_remain = num_classes / axis_dim;
Eigen::DSizes<int, 1> along_class(kClassDim); Eigen::DSizes<int, 1> along_class(kClassDim);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1); Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
Eigen::DSizes<int, 2> one_by_class(1, num_classes); Eigen::DSizes<int, 2> one_by_class(1, num_classes);
Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
Eigen::DSizes<int, 2> one_axis(1, axis_dim);
auto shifted_logits = (logits - auto shifted_logits = (logits -
logits.maximum(along_class) logits.maximum(along_class)
...@@ -60,11 +63,11 @@ void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()( ...@@ -60,11 +63,11 @@ void SoftmaxFunctor<DeviceContext, T, is_test, Enable>::operator()(
softmax.device(*context.eigen_device()) = shifted_logits.exp(); softmax.device(*context.eigen_device()) = shifted_logits.exp();
softmax.device(*context.eigen_device()) = (softmax * softmax.device(*context.eigen_device()) = (softmax *
softmax.sum(along_class) softmax.reshape(batch_axis_remain)
.sum(along_class)
.inverse() .inverse()
.eval() .eval()
.reshape(batch_by_one) .broadcast(one_axis));
.broadcast(one_by_class));
} }
template <class DeviceContext> template <class DeviceContext>
...@@ -73,8 +76,8 @@ using enable_if_CPU = typename std::enable_if< ...@@ -73,8 +76,8 @@ using enable_if_CPU = typename std::enable_if<
template <typename DeviceContext> template <typename DeviceContext>
class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> { class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
void operator()(const DeviceContext& context, const framework::Tensor* X, void operator()(const DeviceContext& context, const int axis_dim,
framework::Tensor* Y) { const framework::Tensor* X, framework::Tensor* Y) {
auto in_dims = X->dims(); auto in_dims = X->dims();
const float* in_data = X->data<float>(); const float* in_data = X->data<float>();
float* out_data = Y->data<float>(); float* out_data = Y->data<float>();
...@@ -84,14 +87,16 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> { ...@@ -84,14 +87,16 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
auto compute_softmax = auto compute_softmax =
jit::KernelFuncs<jit::SoftmaxTuple<float>, platform::CPUPlace>::Cache() jit::KernelFuncs<jit::SoftmaxTuple<float>, platform::CPUPlace>::Cache()
.At(in_dims[kClassDim]); .At(in_dims[kClassDim]);
compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim],
in_dims[kClassDim] / axis_dim);
} }
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void SoftmaxGradFunctor<DeviceContext, T>::operator()( void SoftmaxGradFunctor<DeviceContext, T>::operator()(
const DeviceContext& context, const framework::Tensor* y, const DeviceContext& context, const int axis_dim,
const framework::Tensor* y_grad, framework::Tensor* x_grad) { const framework::Tensor* y, const framework::Tensor* y_grad,
framework::Tensor* x_grad) {
auto softmax = EigenMatrix<T>::From(*y); auto softmax = EigenMatrix<T>::From(*y);
auto softmax_grad = EigenMatrix<T>::From(*y_grad); auto softmax_grad = EigenMatrix<T>::From(*y_grad);
auto logits_grad = EigenMatrix<T>::From(*x_grad); auto logits_grad = EigenMatrix<T>::From(*x_grad);
...@@ -101,16 +106,19 @@ void SoftmaxGradFunctor<DeviceContext, T>::operator()( ...@@ -101,16 +106,19 @@ void SoftmaxGradFunctor<DeviceContext, T>::operator()(
const int batch_size = softmax.dimension(kBatchDim); const int batch_size = softmax.dimension(kBatchDim);
const int num_classes = softmax.dimension(kClassDim); const int num_classes = softmax.dimension(kClassDim);
const int num_remain = num_classes / axis_dim;
Eigen::DSizes<int, 1> along_class(kClassDim); Eigen::DSizes<int, 1> along_class(kClassDim);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1); Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
Eigen::DSizes<int, 2> one_by_class(1, num_classes); Eigen::DSizes<int, 2> one_by_class(1, num_classes);
Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
Eigen::DSizes<int, 2> one_axis(1, axis_dim);
auto dot = (softmax * softmax_grad) auto dot = (softmax * softmax_grad)
.reshape(batch_axis_remain)
.sum(along_class) .sum(along_class)
.eval() .eval()
.reshape(batch_by_one) .broadcast(one_axis);
.broadcast(one_by_class);
logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax; logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax;
} }
......
...@@ -39,6 +39,20 @@ class SoftmaxOp : public framework::OperatorWithKernel { ...@@ -39,6 +39,20 @@ class SoftmaxOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SoftmaxOp should not be null."); "Output(Out) of SoftmaxOp should not be null.");
auto dim_x = ctx->GetInputDim("X");
auto rank_x = dim_x.size();
auto axis = ctx->Attrs().Get<int>("axis");
PADDLE_ENFORCE(axis >= -rank_x && axis < rank_x,
"Attr(axis) value should be in range [-R, R-1], "
"R is the rank of Input(X).");
auto use_cudnn = ctx->Attrs().Get<bool>("use_cudnn");
auto use_mkldnn = ctx->Attrs().Get<bool>("use_mkldnn");
if (axis != rank_x - 1 && axis != -1) {
      PADDLE_ENFORCE(!use_cudnn, "CUDNN kernel only supports axis = -1.");
      PADDLE_ENFORCE(!use_mkldnn, "MKLDNN kernel only supports axis = -1.");
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
...@@ -80,8 +94,12 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -80,8 +94,12 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override { void Make() override {
AddInput("X", AddInput("X",
"The input tensor of softmax, " "The input tensor of softmax, "
"whose last dimension is the input_feature_dimensions."); "whose dimension :attr:`axis` is the input_feature_dimensions.");
AddOutput("Out", "The normalized values with the same shape as X."); AddOutput("Out", "The normalized values with the same shape as X.");
AddAttr<int>("axis",
"The dimension index of Input(x) to perform softmax,"
"default -1 for last dimension")
.SetDefault(-1);
AddAttr<bool>( AddAttr<bool>(
"use_cudnn", "use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn") "(bool, default false) Only used in cudnn kernel, need install cudnn")
...@@ -106,12 +124,13 @@ Softmax Operator. ...@@ -106,12 +124,13 @@ Softmax Operator.
The input of the softmax operator is a tensor of any rank. The output tensor The input of the softmax operator is a tensor of any rank. The output tensor
has the same shape as the input. has the same shape as the input.
The input tensor will first be logically flattened to a 2-D matrix. The matrix's The dimension :attr:`axis` of the input tensor will be permuted to the last.
second dimension(row length) is as same as the last dimension of the input Then the input tensor will be logically flattened to a 2-D matrix. The matrix's
second dimension(row length) is the same as the dimension :attr:`axis` of the input
tensor, and the first dimension(column length) is the product of all other tensor, and the first dimension(column length) is the product of all other
dimensions of the input tensor. For each row of the matrix, the softmax operator dimensions of the input tensor. For each row of the matrix, the softmax operator
squashes the K-dimensional(K is the width of the matrix, which is also the size squashes the K-dimensional(K is the width of the matrix, which is also the size
of the input tensor's last dimension) vector of arbitrary real values to a of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a
K-dimensional vector of real values in the range [0, 1] that add up to 1. K-dimensional vector of real values in the range [0, 1] that add up to 1.
It computes the exponential of the given dimension and the sum of exponential It computes the exponential of the given dimension and the sum of exponential
values of all the other dimensions in the K-dimensional vector input. values of all the other dimensions in the K-dimensional vector input.
......
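An illustrative NumPy check of the axis semantics introduced above, mirroring the CanonicalAxis/SizeToAxis/SizeFromAxis helpers added in the header below: the input is viewed as an [n, d] matrix with n = prod(dims[:axis]) and d = prod(dims[axis:]), and each length-axis_dim slice along the chosen axis is normalized independently. This is a sketch, not Paddle code.
import numpy as np

def softmax_along_axis(x, axis=-1):
    axis = axis + x.ndim if axis < 0 else axis               # CanonicalAxis
    n = int(np.prod(x.shape[:axis], dtype=np.int64))         # SizeToAxis
    d = int(np.prod(x.shape[axis:], dtype=np.int64))         # SizeFromAxis
    axis_dim = x.shape[axis]
    x3d = x.reshape(n, d).reshape(n, axis_dim, d // axis_dim)
    shifted = x3d - x3d.max(axis=1, keepdims=True)           # subtract max for stability
    e = np.exp(shifted)
    return (e / e.sum(axis=1, keepdims=True)).reshape(x.shape)

x = np.random.rand(2, 3, 4).astype("float32")
ref = np.exp(x - x.max(axis=1, keepdims=True))
ref /= ref.sum(axis=1, keepdims=True)
assert np.allclose(softmax_along_axis(x, axis=1), ref, atol=1e-6)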
...@@ -20,6 +20,30 @@ namespace paddle { ...@@ -20,6 +20,30 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using DDim = framework::DDim;
static inline int CanonicalAxis(const int axis, const int rank) {
if (axis < 0) {
return axis + rank;
}
return axis;
}
static inline int SizeToAxis(const int axis, DDim dims) {
int size = 1;
for (int i = 0; i < axis; i++) {
size *= dims[i];
}
return size;
}
static inline int SizeFromAxis(const int axis, DDim dims) {
int size = 1;
for (int i = axis; i < dims.size(); i++) {
size *= dims[i];
}
return size;
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class SoftmaxKernel : public framework::OpKernel<T> { class SoftmaxKernel : public framework::OpKernel<T> {
...@@ -27,20 +51,27 @@ class SoftmaxKernel : public framework::OpKernel<T> { ...@@ -27,20 +51,27 @@ class SoftmaxKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* X = context.Input<Tensor>("X"); auto* X = context.Input<Tensor>("X");
auto* Out = context.Output<Tensor>("Out"); auto* Out = context.Output<Tensor>("Out");
const int rank = X->dims().size();
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
int axis_dim = X->dims()[axis];
// allocate memory on device. // allocate memory on device.
Out->mutable_data<T>(context.GetPlace()); Out->mutable_data<T>(context.GetPlace());
int rank = X->dims().size(); const int n = SizeToAxis(axis, X->dims());
Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1); const int d = SizeFromAxis(axis, X->dims());
Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); Tensor X_2d, Out_2d;
X_2d.ShareDataWith(*X).Resize({n, d});
Out_2d.ShareDataWith(*Out).Resize({n, d});
#ifdef PADDLE_ON_INFERENCE #ifdef PADDLE_ON_INFERENCE
math::SoftmaxFunctor<DeviceContext, T, true>()( math::SoftmaxFunctor<DeviceContext, T, true>()(
context.template device_context<DeviceContext>(), &X_2d, &Out_2d); context.template device_context<DeviceContext>(), axis_dim, &X_2d,
&Out_2d);
#else #else
math::SoftmaxFunctor<DeviceContext, T, false>()( math::SoftmaxFunctor<DeviceContext, T, false>()(
context.template device_context<DeviceContext>(), &X_2d, &Out_2d); context.template device_context<DeviceContext>(), axis_dim, &X_2d,
&Out_2d);
#endif #endif
} }
}; };
...@@ -52,18 +83,23 @@ class SoftmaxGradKernel : public framework::OpKernel<T> { ...@@ -52,18 +83,23 @@ class SoftmaxGradKernel : public framework::OpKernel<T> {
auto* Out = context.Input<Tensor>("Out"); auto* Out = context.Input<Tensor>("Out");
auto* dOut = context.Input<Tensor>(framework::GradVarName("Out")); auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
auto* dX = context.Output<Tensor>(framework::GradVarName("X")); auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
const int rank = dX->dims().size();
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
int axis_dim = dX->dims()[axis];
// allocate memory on device. // allocate memory on device.
dX->mutable_data<T>(context.GetPlace()); dX->mutable_data<T>(context.GetPlace());
int rank = Out->dims().size(); const int n = SizeToAxis(axis, dX->dims());
Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1); const int d = SizeFromAxis(axis, dX->dims());
Tensor dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1); Tensor dX_2d, Out_2d, dOut_2d;
Tensor dX_2d = framework::ReshapeToMatrix(*dX, rank - 1); dX_2d.ShareDataWith(*dX).Resize({n, d});
Out_2d.ShareDataWith(*Out).Resize({n, d});
dOut_2d.ShareDataWith(*dOut).Resize({n, d});
math::SoftmaxGradFunctor<DeviceContext, T>()( math::SoftmaxGradFunctor<DeviceContext, T>()(
context.template device_context<DeviceContext>(), &Out_2d, &dOut_2d, context.template device_context<DeviceContext>(), axis_dim, &Out_2d,
&dX_2d); &dOut_2d, &dX_2d);
} }
}; };
......
...@@ -40,10 +40,12 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> { ...@@ -40,10 +40,12 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
softmax->mutable_data<T>(context.GetPlace()); softmax->mutable_data<T>(context.GetPlace());
loss->mutable_data<T>(context.GetPlace()); loss->mutable_data<T>(context.GetPlace());
int axis_dim = logits->dims()[logits->dims().size() - 1];
auto& dev_ctx = auto& dev_ctx =
context.template device_context<platform::CPUDeviceContext>(); context.template device_context<platform::CPUDeviceContext>();
math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()( math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()(
dev_ctx, logits, softmax); dev_ctx, axis_dim, logits, softmax);
math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()( math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"), dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"),
context.Attr<int>("ignore_index")); context.Attr<int>("ignore_index"));
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/temporal_shift_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class TemporalShiftOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of TemporalShiftOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of TemporalShiftOp should not be null.");
auto dim_x = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(dim_x.size(), 4,
"Input(X) rank should be 4 in shape of [N*T, C, H, W].");
int seg_num = ctx->Attrs().Get<int>("seg_num");
float shift_ratio = ctx->Attrs().Get<float>("shift_ratio");
PADDLE_ENFORCE_GT(seg_num, 0, "Attr(seg_num) should be greater than 0.");
    PADDLE_ENFORCE(shift_ratio > 0 && shift_ratio < .5,
                   "Attr(shift_ratio) should be greater than 0 and less "
                   "than 0.5.");
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(
dim_x[0] % seg_num, 0,
"Input(X) dims[0] should be divided exactly by Attr(seg_num).");
}
ctx->SetOutputDim("Out", dim_x);
ctx->ShareLoD("X", "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace());
}
};
class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor of temporal shift operator. "
"This is a 4-D tensor with shape of [N*T, C, H, W]. "
"While N is the batch size, T is the temporal segment "
"number, C is the channel number, H is the height of "
"features and W is the width of features.");
AddOutput("Out",
"The output tensor of temporal shift operator. "
"This is a 4-D tensor in the same shape with Input(X).");
AddAttr<int>("seg_num",
"The temporal segment number, this should be a positive "
"integer.");
AddAttr<float>(
"shift_ratio",
"The shift ratio of the channels, the first :attr:`shift_ratio` part "
"of channels will be shifted by -1 along the temporal dimension, "
"and the second :attr:`shift_ratio` part of channels will be shifted "
"by 1 along the temporal dimension. Default 0.25.")
.SetDefault(0.25);
AddComment(R"DOC(
This operator calculates the temporal shifting features for Input(X).
Input(X) should be in shape of [N*T, C, H, W], where N is the batch
size, T is the temporal segment number specified by :attr:`seg_num`,
C is the channel number, and H and W are the height and width of features.
Temporal Shifting is calculated as follows:
Step 1: Reshape Input(X) to [N, T, C, H, W].
Step 2: Pad the reshaped result with zeros along the 2nd (T) dimension,
with a padding width of 1 on each side; the padded result will be in shape
of [N, T+2, C, H, W].
Step 3: Assume :attr:`shift_ratio` is :math:`1/4`, slice padding
result as follows:
$$
slice1 = x[:, :T, :C/4, :, :]
$$
$$
slice2 = x[:, 2:T+2, C/4:C/2, :, :]
$$
$$
slice3 = x[:, 1:T+1, C/2:, :, :]
$$
Step 4: Concatenate the three slices along the 3rd (C) dimension and
reshape the result to [N*T, C, H, W].
For details of temporal shifting, please refer to paper:
`Temporal Shift Module <http://arxiv.org/abs/1811.08383>`_ .
)DOC");
}
};
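A NumPy sketch, illustrative only and not part of this file, of the four steps described in the DOC block above (reshape, zero-pad T, slice three channel groups, concatenate, reshape back), assuming shift_ratio = 0.25 as in the example slices.
import numpy as np

def temporal_shift(x, seg_num, shift_ratio=0.25):
    nt, c, h, w = x.shape
    n = nt // seg_num
    c1 = int(c * shift_ratio)
    c2 = int(c * 2 * shift_ratio)
    y = x.reshape(n, seg_num, c, h, w)                        # Step 1
    y = np.pad(y, ((0, 0), (1, 1), (0, 0), (0, 0), (0, 0)))   # Step 2: zero-pad T
    slice1 = y[:, :seg_num, :c1]                              # reads frame t-1
    slice2 = y[:, 2:seg_num + 2, c1:c2]                       # reads frame t+1
    slice3 = y[:, 1:seg_num + 1, c2:]                         # reads frame t
    out = np.concatenate([slice1, slice2, slice3], axis=2)    # Steps 3 and 4
    return out.reshape(nt, c, h, w)

x = np.arange(8 * 8, dtype=np.float32).reshape(8, 8, 1, 1)    # N=2, T=4, C=8
print(temporal_shift(x, seg_num=4).shape)                     # (8, 8, 1, 1)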
class TemporalShiftOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
auto dim_x = ctx->GetInputDim("X");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
}
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(temporal_shift, ops::TemporalShiftOp,
ops::TemporalShiftOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(temporal_shift_grad, ops::TemporalShiftOpGrad);
REGISTER_OP_CPU_KERNEL(temporal_shift, ops::TemporalShiftKernel<float>,
ops::TemporalShiftKernel<double>);
REGISTER_OP_CPU_KERNEL(temporal_shift_grad, ops::TemporalShiftGradKernel<float>,
ops::TemporalShiftGradKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/temporal_shift_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
using framework::Tensor;
template <typename T>
__global__ void KeTemporalShiftFw(const T* input, T* output, const int ntchw,
const int tchw, const int chw, const int hw,
const int w, const int t, const int c,
const float shift_ratio) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int src_it = 0;
for (; tid < ntchw; tid += stride) {
int in = tid / tchw;
int it = (tid % tchw) / chw;
int ic = (tid % chw) / hw;
int ih = (tid % hw) / w;
int iw = tid % w;
    const int c1 = static_cast<int>(c * shift_ratio);
    const int c2 = static_cast<int>(c * 2 * shift_ratio);
if (ic < c1) {
src_it = it - 1;
} else if (ic < c2) {
src_it = it + 1;
} else {
src_it = it;
}
if (src_it < 0 || src_it >= t) {
output[tid] = 0;
} else {
int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
output[tid] = input[src_idx];
}
}
}
template <typename T>
__global__ void KeTemporalShiftBw(const T* output_grad, T* input_grad,
const int ntchw, const int tchw,
const int chw, const int hw, const int w,
const int t, const int c,
const float shift_ratio) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int src_it = 0;
for (; tid < ntchw; tid += stride) {
int in = tid / tchw;
int it = (tid % tchw) / chw;
int ic = (tid % chw) / hw;
int ih = (tid % hw) / w;
int iw = tid % w;
    const int c1 = static_cast<int>(c * shift_ratio);
    const int c2 = static_cast<int>(c * 2 * shift_ratio);
if (ic < c1) {
src_it = it - 1;
} else if (ic < c2) {
src_it = it + 1;
} else {
src_it = it;
}
if (src_it >= 0 && src_it < t) {
int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
input_grad[src_idx] = output_grad[tid];
}
}
}
template <typename T>
class TemporalShiftOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"This kernel only runs on GPU device.");
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
int t = ctx.Attr<int>("seg_num");
float shift_ratio = ctx.Attr<float>("shift_ratio");
const int nt = input->dims()[0];
const int c = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int hw = h * w;
const int chw = c * hw;
const int tchw = t * chw;
const int ntchw = nt * chw;
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
int pixelNum = nt * chw;
int grid_dim = (pixelNum + 512 - 1) / 512;
grid_dim = grid_dim > 8 ? 8 : grid_dim;
KeTemporalShiftFw<
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
input_data, output_data, ntchw, tchw, chw, hw, w, t, c, shift_ratio);
}
};
template <typename T>
class TemporalShiftGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
int t = ctx.Attr<int>("seg_num");
float shift_ratio = ctx.Attr<float>("shift_ratio");
const int nt = output_grad->dims()[0];
const int c = output_grad->dims()[1];
const int h = output_grad->dims()[2];
const int w = output_grad->dims()[3];
const int hw = h * w;
const int chw = c * hw;
const int tchw = t * chw;
const int ntchw = nt * chw;
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data =
input_grad->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T>()(
ctx.template device_context<platform::CUDADeviceContext>(), input_grad,
static_cast<T>(0));
int pixelNum = nt * chw;
int grid_dim = (pixelNum + 512 - 1) / 512;
grid_dim = grid_dim > 8 ? 8 : grid_dim;
KeTemporalShiftBw<
T><<<grid_dim, 512, 0, ctx.cuda_device_context().stream()>>>(
output_grad_data, input_grad_data, ntchw, tchw, chw, hw, w, t, c,
shift_ratio);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(temporal_shift, ops::TemporalShiftOpCUDAKernel<float>,
ops::TemporalShiftOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(temporal_shift_grad,
ops::TemporalShiftGradOpCUDAKernel<float>,
ops::TemporalShiftGradOpCUDAKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
static HOSTDEVICE inline int GetEntryIndex(int in, int it, int ic, int ih,
int iw, const int tchw,
const int chw, const int hw,
const int w) {
return in * tchw + it * chw + ic * hw + ih * w + iw;
}
template <typename T>
class TemporalShiftKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
int t = ctx.Attr<int>("seg_num");
float shift_ratio = ctx.Attr<float>("shift_ratio");
const int nt = input->dims()[0];
const int c = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int c1 = static_cast<int>(c * shift_ratio);
const int c2 = static_cast<int>(c * 2 * shift_ratio);
const int hw = h * w;
const int chw = c * hw;
const int tchw = t * chw;
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
int src_it = 0;
for (int i = 0; i < output->numel(); i++) {
int in = i / tchw;
int it = (i % tchw) / chw;
int ic = (i % chw) / hw;
int ih = (i % hw) / w;
int iw = i % w;
if (ic < c1) {
src_it = it - 1;
} else if (ic < c2) {
src_it = it + 1;
} else {
src_it = it;
}
if (src_it < 0 || src_it >= t) {
output_data[i] = 0;
} else {
int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
output_data[i] = input_data[src_idx];
}
}
}
};
template <typename T>
class TemporalShiftGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
int t = ctx.Attr<int>("seg_num");
float shift_ratio = ctx.Attr<float>("shift_ratio");
const int nt = output_grad->dims()[0];
const int c = output_grad->dims()[1];
const int h = output_grad->dims()[2];
const int w = output_grad->dims()[3];
const int c1 = static_cast<int>(c * shift_ratio);
const int c2 = static_cast<int>(c * 2 * shift_ratio);
const int hw = h * w;
const int chw = c * hw;
const int tchw = t * chw;
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data =
input_grad->mutable_data<T>({nt, c, h, w}, ctx.GetPlace());
memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
int src_it = 0;
for (int i = 0; i < output_grad->numel(); i++) {
int in = i / tchw;
int it = (i % tchw) / chw;
int ic = (i % chw) / hw;
int ih = (i % hw) / w;
int iw = i % w;
if (ic < c1) {
src_it = it - 1;
} else if (ic < c2) {
src_it = it + 1;
} else {
src_it = it;
}
if (src_it >= 0 && src_it < t) {
int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
input_grad_data[src_idx] = output_grad_data[i];
}
}
}
};
} // namespace operators
} // namespace paddle
...@@ -52,6 +52,7 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -52,6 +52,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
std::string engine_key_; std::string engine_key_;
std::string engine_serialized_data_; std::string engine_serialized_data_;
bool calibration_mode_; bool calibration_mode_;
int device_id_;
public: public:
TensorRTEngineOp(const std::string &type, TensorRTEngineOp(const std::string &type,
...@@ -62,6 +63,7 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -62,6 +63,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
input_names_ = Inputs("Xs"); input_names_ = Inputs("Xs");
max_batch_size_ = Attr<int>("max_batch_size"); max_batch_size_ = Attr<int>("max_batch_size");
workspace_size_ = Attr<int>("workspace_size"); workspace_size_ = Attr<int>("workspace_size");
device_id_ = Attr<int>("gpu_id");
enable_int8_ = Attr<bool>("enable_int8"); enable_int8_ = Attr<bool>("enable_int8");
calibration_data_ = Attr<std::string>("calibration_data"); calibration_data_ = Attr<std::string>("calibration_data");
engine_key_ = Attr<std::string>("engine_key"); engine_key_ = Attr<std::string>("engine_key");
...@@ -79,6 +81,17 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -79,6 +81,17 @@ class TensorRTEngineOp : public framework::OperatorBase {
if (enable_int8_ && calibration_data_.size()) { if (enable_int8_ && calibration_data_.size()) {
calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); calibrator_.reset(new TRTInt8Calibrator(calibration_data_));
} }
if (!calibration_mode_ && !engine_serialized_data_.empty()) {
trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
device_id_));
PADDLE_ENFORCE(engine_serialized_data_.size(),
"TRT serialized data should not be empty here,"
"there must be error when generate serialized data in TRT "
"subgraph detect pass.");
trt_engine_->Deserialize(engine_serialized_data_);
}
} }
protected: protected:
...@@ -225,12 +238,8 @@ class TensorRTEngineOp : public framework::OperatorBase { ...@@ -225,12 +238,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
if (!trt_engine_) { if (!trt_engine_) {
trt_engine_.reset(new inference::tensorrt::TensorRTEngine( trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(), max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
boost::get<platform::CUDAPlace>(dev_place).device)); device_id_));
if (!engine_serialized_data_.empty()) { PrepareTRTEngine(scope, trt_engine_.get());
trt_engine_->Deserialize(engine_serialized_data_);
} else {
PrepareTRTEngine(scope, trt_engine_.get());
}
} }
return trt_engine_.get(); return trt_engine_.get();
} }
......
...@@ -108,6 +108,8 @@ TEST(TensorRTEngineOp, manual) { ...@@ -108,6 +108,8 @@ TEST(TensorRTEngineOp, manual) {
std::vector<std::string>({"z0"})); std::vector<std::string>({"z0"}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
engine_op_desc.SetAttr("engine_serialized_data", std::string("")); engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
int device_id = 0;
engine_op_desc.SetAttr("gpu_id", device_id);
LOG(INFO) << "create engine op"; LOG(INFO) << "create engine op";
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
...@@ -204,6 +206,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { ...@@ -204,6 +206,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
std::vector<std::string>({"z3"})); std::vector<std::string>({"z3"}));
engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
engine_op_desc.SetAttr("engine_serialized_data", std::string("")); engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
int device_id = 0;
engine_op_desc.SetAttr("gpu_id", device_id);
auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
......
...@@ -67,9 +67,11 @@ class CudnnCTCKernel : public framework::OpKernel<T> { ...@@ -67,9 +67,11 @@ class CudnnCTCKernel : public framework::OpKernel<T> {
softmax_logits.mutable_data<T>(logits->dims(), ctx.GetPlace()); softmax_logits.mutable_data<T>(logits->dims(), ctx.GetPlace());
softmax_logits.set_lod(logits_lod); softmax_logits.set_lod(logits_lod);
int rank = logits->dims().size(); int rank = logits->dims().size();
int axis_dim = logits->dims()[rank - 1];
Tensor in_2d = framework::ReshapeToMatrix(*logits, rank - 1); Tensor in_2d = framework::ReshapeToMatrix(*logits, rank - 1);
Tensor out_2d = framework::ReshapeToMatrix(softmax_logits, rank - 1); Tensor out_2d = framework::ReshapeToMatrix(softmax_logits, rank - 1);
math::SoftmaxFunctor<DeviceContext, T, false>()(dev_ctx, &in_2d, &out_2d); math::SoftmaxFunctor<DeviceContext, T, false>()(dev_ctx, axis_dim, &in_2d,
&out_2d);
// ctc needs sequences data stored in transposed padding format // ctc needs sequences data stored in transposed padding format
// logits and grad using padding data of layout 'TNC' // logits and grad using padding data of layout 'TNC'
......
...@@ -44,9 +44,12 @@ add_subdirectory(dynload) ...@@ -44,9 +44,12 @@ add_subdirectory(dynload)
cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce) cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper) cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
set(dgc_deps "")
IF(WITH_GPU) IF(WITH_GPU)
set(GPU_CTX_DEPS dynload_cuda dynamic_loader) set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
set(dgc_deps dgc) if(NOT WIN32)
set(dgc_deps dgc)
endif()
ELSE() ELSE()
set(dgc_deps) set(dgc_deps)
ENDIF() ENDIF()
......
...@@ -34,7 +34,7 @@ from . import io ...@@ -34,7 +34,7 @@ from . import io
from . import evaluator from . import evaluator
from . import initializer from . import initializer
from . import layers from . import layers
from . import imperative from . import dygraph
from . import contrib from . import contrib
from . import nets from . import nets
from . import optimizer from . import optimizer
...@@ -71,7 +71,7 @@ __all__ = framework.__all__ + executor.__all__ + \ ...@@ -71,7 +71,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'initializer', 'initializer',
'layers', 'layers',
'contrib', 'contrib',
'imperative', 'dygraph',
'transpiler', 'transpiler',
'nets', 'nets',
'optimizer', 'optimizer',
...@@ -180,7 +180,7 @@ def __bootstrap__(): ...@@ -180,7 +180,7 @@ def __bootstrap__():
'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
'sync_nccl_allreduce', 'limit_of_tmp_allocation', 'sync_nccl_allreduce', 'limit_of_tmp_allocation',
'times_excess_than_required_tmp_allocation', 'times_excess_than_required_tmp_allocation',
'enable_inplace_whitelist' 'enable_inplace_whitelist', 'cudnn_batchnorm_spatial_persistent'
] ]
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
......
...@@ -30,6 +30,8 @@ from . import slim ...@@ -30,6 +30,8 @@ from . import slim
from .slim import * from .slim import *
from . import utils from . import utils
from .utils import * from .utils import *
from . import extend_optimizer
from .extend_optimizer import *
__all__ = [] __all__ = []
__all__ += decoder.__all__ __all__ += decoder.__all__
...@@ -40,3 +42,4 @@ __all__ += int8_inference.__all__ ...@@ -40,3 +42,4 @@ __all__ += int8_inference.__all__
__all__ += reader.__all__ __all__ += reader.__all__
__all__ += slim.__all__ __all__ += slim.__all__
__all__ += utils.__all__ __all__ += utils.__all__
__all__ += extend_optimizer.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import extend_optimizer_with_weight_decay
from .extend_optimizer_with_weight_decay import *
__all__ = []
__all__ += extend_optimizer_with_weight_decay.__all__
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid
from paddle.fluid import framework as framework
__all__ = ["extend_with_decoupled_weight_decay"]
class DecoupledWeightDecay(object):
def __init__(self, coeff=0.0, apply_decay_param_fun=None, **kwargs):
if not isinstance(coeff, float) and \
not isinstance(coeff, framework.Variable):
raise TypeError("coeff should be float or Variable.")
self._params_name = set()
self._apply_decay_param_fun = apply_decay_param_fun
self._coeff = coeff
super(DecoupledWeightDecay, self).__init__(**kwargs)
def _scale_parameters(self, params_and_grads):
"""
Adds weight decay ops.
scaled_parameter = parameter * coeff
Args:
            params_and_grads: A list of (parameter, gradient) pairs for
                the parameters that weight decay should be applied to.
Raises:
Exception: The type of coeff and parameter is not consistent.
"""
if isinstance(self._coeff, float) and self._coeff == 0.0:
            return []
scaled_params = []
for param, grad in params_and_grads:
# If no gradient then we don't need to do anything
if grad is None:
continue
if self._apply_decay_param_fun is not None \
and not self._apply_decay_param_fun(param.name):
continue
if isinstance(self._coeff, float):
                assert param.dtype == paddle.fluid.core.VarDesc.VarType.FP32, \
                    "the type of coeff(float) and parameter(%s) is not consistent." % (param.dtype)
else:
assert self._coeff.dtype == param.dtype, \
"the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
assert param.name not in self._params_name
scaled_params.append((param, grad, param * self._coeff))
self._params_name.add(param.name)
return scaled_params
def backward(self, **kargs):
return super(DecoupledWeightDecay, self).backward(**kargs)
def apply_optimize(self, **kargs):
return super(DecoupledWeightDecay, self).apply_optimize(**kargs)
def minimize(self,
loss,
startup_program=None,
parameter_list=None,
no_grad_set=None):
params_grads = self.backward(
loss=loss,
startup_program=startup_program,
parameter_list=parameter_list,
no_grad_set=no_grad_set)
scaled_params = self._scale_parameters(params_grads)
for p_grad_sgrad in scaled_params:
param, grad, scaled_param = p_grad_sgrad
with param.block.program._optimized_guard(
[param, grad]), framework.name_scope('weight decay'):
updated_param = paddle.fluid.layers.elementwise_sub(
x=param, y=scaled_param)
paddle.fluid.layers.assign(input=updated_param, output=param)
optimize_ops = self.apply_optimize(
loss=loss,
params_grads=params_grads,
startup_program=startup_program)
return optimize_ops, params_grads
def __str__(self):
return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
def extend_with_decoupled_weight_decay(base_optimizer):
"""
    extend_with_decoupled_weight_decay is a decorator function that returns an
    optimizer class with decoupled weight decay. The returned optimizer applies
    weight decay to the optimized parameters using the parameter values from
    before the optimization step, i.e., new_parameter = optimized_parameter - parameter * coeff.
    For the details of decoupled weight decay, please refer to
`DECOUPLED WEIGHT DECAY REGULARIZATION <https://arxiv.org/pdf/1711.05101.pdf>`_.
Args:
base_optimizer (Optimizer): The base_optimizer should be a derived class of Optimizer.
Returns:
        OptimizerWithDecoupledWeightDecay: the optimizer with decoupled weight decay.
Examples:
.. code-block:: python
AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
fluid.optimizer.Adam)
optimizer = AdamW(learning_rate=0.1,
weight_decay=0.01)
optimizer.minimize(cost)
"""
if not issubclass(base_optimizer, paddle.fluid.optimizer.Optimizer):
raise TypeError(
"The input(base_optimizer) should be a derived class of Optimizer.")
class OptimizerWithDecoupledWeightDecay(DecoupledWeightDecay,
base_optimizer):
"""
OptimizerWithDecoupledWeightDecay is used to update the optimized parameters
        with the parameter values from before optimization. For more information, please refer to:
https://arxiv.org/pdf/1711.05101.pdf.
Args:
weight_decay (float|Variable): The weight decay coefficient, it can be
float or Variable.
apply_decay_param_fun (function|None): If it is not None,
                only variables for which apply_decay_param_fun(variable) == True
                will be decayed. It only works when we want to specify variables.
Default: None.
"""
def __init__(self, weight_decay, apply_decay_param_fun=None, **kwargs):
super(OptimizerWithDecoupledWeightDecay, self).__init__(
weight_decay, apply_decay_param_fun, **kwargs)
return OptimizerWithDecoupledWeightDecay
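A conceptual NumPy sketch of the update rule stated in the docstring above, new_parameter = optimized_parameter - parameter * coeff; the plain-SGD inner step and the numbers are illustrative assumptions, not Paddle code.
import numpy as np

def sgd_with_decoupled_weight_decay(param, grad, lr=0.1, coeff=0.01):
    scaled = param * coeff           # decay term uses the pre-update parameter
    optimized = param - lr * grad    # base optimizer step (plain SGD here)
    return optimized - scaled        # new_parameter = optimized_parameter - parameter * coeff

p = np.array([1.0, -2.0])
g = np.array([0.5, 0.5])
print(sgd_with_decoupled_weight_decay(p, g))  # [ 0.94 -2.03]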
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Example:
>>from paddle.fluid.contrib.model_stat import summary
>>main_program = ...
>>summary(main_program)
+-----+------------+----------------+----------------+---------+------------+
| No. | TYPE | INPUT | OUTPUT | PARAMs | FLOPs |
+-----+------------+----------------+----------------+---------+------------+
| 0 | conv2d | (3, 200, 200) | (64, 100, 100) | 9408 | 188160000 |
| 1 | batch_norm | (64, 100, 100) | (64, 100, 100) | 256 | 640000 |
| 2 | relu | (64, 100, 100) | (64, 100, 100) | 0 | 640000 |
| 3 | pool2d | (64, 100, 100) | (64, 50, 50) | 0 | 1440000 |
...
| 176 | conv2d | (512, 7, 7) | (512, 7, 7) | 2359296 | 231211008 |
| 177 | relu | (512, 7, 7) | (512, 7, 7) | 0 | 25088 |
| 178 | conv2d | (512, 7, 7) | (2048, 7, 7) | 1048576 | 102760448 |
| 179 | relu | (2048, 7, 7) | (2048, 7, 7) | 0 | 100352 |
| 180 | pool2d | (2048, 7, 7) | (2048, 1, 1) | 0 | 100352 |
+-----+------------+----------------+----------------+---------+------------+
Total PARAMs: 48017344(0.0480G)
Total FLOPs: 11692747751(11.69G)
'''
from collections import OrderedDict
from prettytable import PrettyTable
def summary(main_prog):
'''
    It summarizes the model's PARAMs and FLOPs.
    It supports common operators like conv, fc, pool, relu, sigmoid, bn, etc.
    Args:
        main_prog: main program
    Returns:
        None, the summary table is printed to the terminal
'''
collected_ops_list = []
for one_b in main_prog.blocks:
block_vars = one_b.vars
for one_op in one_b.ops:
op_info = OrderedDict()
spf_res = _summary_model(block_vars, one_op)
if spf_res is None:
continue
# TODO: get the operator name
op_info['type'] = one_op.type
op_info['input_shape'] = spf_res[0][1:]
op_info['out_shape'] = spf_res[1][1:]
op_info['PARAMs'] = spf_res[2]
op_info['FLOPs'] = spf_res[3]
collected_ops_list.append(op_info)
summary_table, total = _format_summary(collected_ops_list)
_print_summary(summary_table, total)
def _summary_model(block_vars, one_op):
'''
Compute operator's params and flops.
Args:
block_vars: all vars of one block
one_op: one operator to count
Returns:
in_data_shape: one operator's input data shape
out_data_shape: one operator's output data shape
params: one operator's PARAMs
        flops: one operator's FLOPs
'''
if one_op.type in ['conv2d', 'depthwise_conv2d']:
k_arg_shape = block_vars[one_op.input("Filter")[0]].shape
in_data_shape = block_vars[one_op.input("Input")[0]].shape
out_data_shape = block_vars[one_op.output("Output")[0]].shape
c_out, c_in, k_h, k_w = k_arg_shape
_, c_out_, h_out, w_out = out_data_shape
assert c_out == c_out_, 'shape error!'
k_groups = one_op.attr("groups")
kernel_ops = k_h * k_w * (c_in / k_groups)
bias_ops = 0 if one_op.input("Bias") == [] else 1
params = c_out * (kernel_ops + bias_ops)
flops = h_out * w_out * c_out * (kernel_ops + bias_ops)
# base nvidia paper, include mul and add
flops = 2 * flops
elif one_op.type == 'pool2d':
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Out")[0]].shape
_, c_out, h_out, w_out = out_data_shape
k_size = one_op.attr("ksize")
params = 0
flops = h_out * w_out * c_out * (k_size[0] * k_size[1])
elif one_op.type == 'mul':
k_arg_shape = block_vars[one_op.input("Y")[0]].shape
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Out")[0]].shape
# TODO: fc has mul ops
# add attr to mul op, tell us whether it belongs to 'fc'
        # this is not the best way
if 'fc' not in one_op.output("Out")[0]:
return None
k_in, k_out = k_arg_shape
# bias in sum op
params = k_in * k_out + 1
flops = k_in * k_out
elif one_op.type in ['sigmoid', 'tanh', 'relu', 'leaky_relu', 'prelu']:
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Out")[0]].shape
params = 0
if one_op.type == 'prelu':
params = 1
flops = 1
for one_dim in in_data_shape:
flops *= one_dim
elif one_op.type == 'batch_norm':
in_data_shape = block_vars[one_op.input("X")[0]].shape
out_data_shape = block_vars[one_op.output("Y")[0]].shape
_, c_in, h_out, w_out = in_data_shape
# gamma, beta
params = c_in * 2
# compute mean and std
flops = h_out * w_out * c_in * 2
else:
return None
return in_data_shape, out_data_shape, params, flops
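A worked check of the conv2d accounting above against row 0 of the example table in this file's docstring (conv2d, (3, 200, 200) -> (64, 100, 100), PARAMs 9408, FLOPs 188160000); a 7x7 kernel with groups=1 and no bias is implied by 9408 = 64 * 3 * 7 * 7.
c_in, c_out, k_h, k_w, groups, bias_ops = 3, 64, 7, 7, 1, 0
h_out, w_out = 100, 100
kernel_ops = k_h * k_w * (c_in / groups)
params = c_out * (kernel_ops + bias_ops)                     # 9408
flops = 2 * h_out * w_out * c_out * (kernel_ops + bias_ops)  # 188160000 (mul + add)
print(int(params), int(flops))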
def _format_summary(collected_ops_list):
'''
Format summary report.
Args:
collected_ops_list: the collected operator with summary
Returns:
summary_table: summary report format
        total: dict with the total PARAMs and FLOPs
'''
summary_table = PrettyTable(
["No.", "TYPE", "INPUT", "OUTPUT", "PARAMs", "FLOPs"])
summary_table.align = 'r'
total = {}
total_params = []
total_flops = []
for i, one_op in enumerate(collected_ops_list):
# notice the order
table_row = [
i,
one_op['type'],
one_op['input_shape'],
one_op['out_shape'],
int(one_op['PARAMs']),
int(one_op['FLOPs']),
]
summary_table.add_row(table_row)
total_params.append(int(one_op['PARAMs']))
total_flops.append(int(one_op['FLOPs']))
total['params'] = total_params
total['flops'] = total_flops
return summary_table, total
def _print_summary(summary_table, total):
'''
Print all the summary on terminal.
Args:
summary_table: summary report format
        total: dict with the total PARAMs and FLOPs
'''
    params = total['params']
    flops = total['flops']
    print(summary_table)
    print('Total PARAMs: {}({:.4f}M)'.format(
        sum(params), sum(params) / (10**6)))
print('Total FLOPs: {}({:.2f}G)'.format(sum(flops), sum(flops) / 10**9))
print(
"Notice: \n now supported ops include [Conv, DepthwiseConv, FC(mul), BatchNorm, Pool, Activation(sigmoid, tanh, relu, leaky_relu, prelu)]"
)
...@@ -19,7 +19,7 @@ from .... import Program ...@@ -19,7 +19,7 @@ from .... import Program
from .... import program_guard from .... import program_guard
from .... import regularizer from .... import regularizer
__all__ = ['FSPDistiller', 'L2Distiller'] __all__ = ['FSPDistiller', 'L2Distiller', 'SoftLabelDistiller']
class L2Distiller(object): class L2Distiller(object):
...@@ -186,3 +186,91 @@ class FSPDistillerPass(object): ...@@ -186,3 +186,91 @@ class FSPDistillerPass(object):
def _fsp_matrix(self, fea_map_0, fea_map_1): def _fsp_matrix(self, fea_map_0, fea_map_1):
return layers.fsp_matrix(fea_map_0, fea_map_1) return layers.fsp_matrix(fea_map_0, fea_map_1)
class SoftLabelDistiller(object):
"""
    Combine a layer from the student net and a layer from the teacher net with a
    softmax_with_cross_entropy loss, and add this loss to the total loss used for
    distillation training.
"""
def __init__(self,
student_feature_map=None,
teacher_feature_map=None,
student_temperature=1.0,
teacher_temperature=1.0,
distillation_loss_weight=1):
"""
Args:
student_feature_map(str): The name of feature map from student network.
teacher_feature_map(str): The name of feature map from teacher network.
                                      Its shape should be the same as that of the student feature map.
            student_temperature(float): Temperature used to divide student_feature_map before softmax_with_cross_entropy. default: 1.0
            teacher_temperature(float): Temperature used to divide teacher_feature_map before softmax_with_cross_entropy. default: 1.0
            distillation_loss_weight(float): The weight of the soft-label distillation loss. default: 1.0
"""
self.student_feature_map = student_feature_map
self.teacher_feature_map = teacher_feature_map
self.distillation_loss_weight = distillation_loss_weight
self.student_temperature = student_temperature
self.teacher_temperature = teacher_temperature
def distiller_loss(self, graph):
"""
        Modify the graph in place to add the softmax_with_cross_entropy loss.
Args:
graph(GraphWrapper): The graph to be modified.
Returns:
GraphWrapper: The modified graph.
"""
distiller_pass = SoftLabelDistillerPass(
self.student_feature_map, self.teacher_feature_map,
self.student_temperature, self.teacher_temperature,
self.distillation_loss_weight)
dis_graph = distiller_pass.apply(graph)
return dis_graph
class SoftLabelDistillerPass(object):
def __init__(self,
student_feature_map,
teacher_feature_map,
student_temperature,
teacher_temperature,
distillation_loss_weight=1):
"""
Args:
student_feature_map(str): The name of feature map from student network.
teacher_feature_map(str): The name of feature map from teacher network.
                                      Its shape should be the same as that of the student feature map.
            student_temperature(float): Temperature used to divide student_feature_map before softmax_with_cross_entropy.
            teacher_temperature(float): Temperature used to divide teacher_feature_map before softmax_with_cross_entropy.
            distillation_loss_weight(float): The weight of the soft-label distillation loss.
"""
self.student_feature_map = student_feature_map
self.teacher_feature_map = teacher_feature_map
self.student_temperature = student_temperature
self.teacher_temperature = teacher_temperature
self.distillation_loss_weight = distillation_loss_weight
def apply(self, graph):
ret_graph = graph
with program_guard(ret_graph.program):
student_feature_map = ret_graph.var(self.student_feature_map)._var
teacher_feature_map = ret_graph.var(self.teacher_feature_map)._var
s_fea = student_feature_map / self.student_temperature
            t_fea = teacher_feature_map / self.teacher_temperature
t_fea.stop_gradient = True
ce_loss = layers.softmax_with_cross_entropy(
s_fea, t_fea, soft_label=True)
distillation_loss = ce_loss * self.distillation_loss_weight
student_loss = ret_graph.var(ret_graph.out_nodes['loss'])._var
loss = distillation_loss + student_loss
ret_graph.out_nodes[
'soft_label_loss_' + self.student_feature_map + "_" +
self.teacher_feature_map] = distillation_loss.name
ret_graph.out_nodes['loss'] = loss.name
return ret_graph
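A NumPy sketch, illustrative only, of the loss assembled above: temperature-scaled student and teacher features, a soft-label cross entropy, and the distillation_loss_weight scale. Normalizing the scaled teacher features into a softmax distribution here is a simplifying assumption of the sketch, not something the graph code above does explicitly.
import numpy as np

def soft_label_distill_loss(student, teacher, t_s=1.0, t_t=1.0, weight=0.001):
    s = student / t_s                                   # student_temperature
    t = teacher / t_t                                   # teacher_temperature
    p_s = np.exp(s - s.max(-1, keepdims=True))
    p_s /= p_s.sum(-1, keepdims=True)
    p_t = np.exp(t - t.max(-1, keepdims=True))          # teacher soft labels (assumed softmax)
    p_t /= p_t.sum(-1, keepdims=True)
    ce = -(p_t * np.log(p_s)).sum(-1).mean()            # soft-label cross entropy
    return weight * ce                                  # distillation_loss_weight

student = np.random.randn(4, 10)
teacher = np.random.randn(4, 10)
print(soft_label_distill_loss(student, teacher))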
...@@ -204,6 +204,10 @@ class GraphWrapper(object): ...@@ -204,6 +204,10 @@ class GraphWrapper(object):
""" """
super(GraphWrapper, self).__init__() super(GraphWrapper, self).__init__()
self.program = Program() if program is None else program self.program = Program() if program is None else program
self.persistables = {}
for var in self.program.list_vars():
if var.persistable:
self.persistables[var.name] = var
self.compiled_graph = None self.compiled_graph = None
self.in_nodes = OrderedDict(in_nodes) self.in_nodes = OrderedDict(in_nodes)
self.out_nodes = OrderedDict(out_nodes) self.out_nodes = OrderedDict(out_nodes)
...@@ -467,7 +471,12 @@ class GraphWrapper(object): ...@@ -467,7 +471,12 @@ class GraphWrapper(object):
path(str): The path to save the persistables. path(str): The path to save the persistables.
exe(framework.Executor): The executor used to save the persistables. exe(framework.Executor): The executor used to save the persistables.
""" """
io.save_persistables(exe.exe, path, main_program=self.program) # update persistables from program
for var in self.program.list_vars():
if var.persistable and var.name not in self.persistables:
self.persistables[var.name] = var
io.save_vars(exe.exe, path, vars=self.persistables.values())
def load_persistables(self, path, exe): def load_persistables(self, path, exe):
""" """
...@@ -481,7 +490,7 @@ class GraphWrapper(object): ...@@ -481,7 +490,7 @@ class GraphWrapper(object):
return os.path.exists(os.path.join(path, var.name)) return os.path.exists(os.path.join(path, var.name))
io.load_vars( io.load_vars(
exe.exe, path, main_program=self.program, predicate=if_exist) exe.exe, path, vars=self.persistables.values(), predicate=if_exist)
def update_param_shape(self, scope): def update_param_shape(self, scope):
""" """
......
...@@ -20,7 +20,7 @@ from .... import io ...@@ -20,7 +20,7 @@ from .... import io
from .... import core from .... import core
from ....compiler import CompiledProgram from ....compiler import CompiledProgram
from ....compiler import BuildStrategy from ....compiler import BuildStrategy
from ....framework import IrGraph from ....framework import IrGraph, Variable, Program
from ..core.strategy import Strategy from ..core.strategy import Strategy
from .quantization_pass import * from .quantization_pass import *
...@@ -88,41 +88,76 @@ class QuantizationStrategy(Strategy): ...@@ -88,41 +88,76 @@ class QuantizationStrategy(Strategy):
self.save_out_nodes = save_out_nodes self.save_out_nodes = save_out_nodes
self.save_in_nodes = save_in_nodes self.save_in_nodes = save_in_nodes
def on_compression_begin(self, context):
"""
        Restore the graph when the compression task is initialized from a checkpoint.
"""
        # It is initialized from a checkpoint and has missed the start epoch.
if context.epoch_id != 0 and context.epoch_id > self.start_epoch:
_logger.info("Restore quantization task from checkpoint")
self._modify_graph_for_quantization(context)
_logger.info("Finish restoring quantization task from checkpoint")
def _modify_graph_for_quantization(self, context):
"""
        Insert fake_quantize_op and fake_dequantize_op before training and testing.
"""
train_ir_graph = IrGraph(
core.Graph(context.optimize_graph.program.clone().desc),
for_test=False)
test_ir_graph = IrGraph(
core.Graph(context.eval_graph.program.clone().desc), for_test=True)
transform_pass = QuantizationTransformPass(
scope=context.scope,
place=context.place,
weight_bits=self.weight_bits,
activation_bits=self.activation_bits,
activation_quantize_type=self.activation_quantize_type,
weight_quantize_type=self.weight_quantize_type)
transform_pass.apply(train_ir_graph)
transform_pass.apply(test_ir_graph)
# Put persistables created by transform_pass into context.optimize_graph.persistables
# for saving checkpoint.
program_persistables = set()
for var in context.optimize_graph.program.list_vars():
if var.persistable:
program_persistables.add(var.name)
program = Program()
for var_node in train_ir_graph.all_persistable_nodes():
if var_node.name() not in program_persistables:
var_desc = var_node.var()
var = program.global_block().create_var(
name=var_node.name(),
shape=var_desc.shape(),
dtype=var_desc.dtype(),
type=var_desc.type(),
lod_level=var_desc.lod_level())
context.optimize_graph.persistables[var.name] = var
build_strategy = BuildStrategy()
build_strategy.enable_inplace = False
build_strategy.memory_optimize = False
# for quantization training
context.optimize_graph.compiled_graph = CompiledProgram(
train_ir_graph.graph).with_data_parallel(
loss_name=context.optimize_graph.out_nodes['loss'],
build_strategy=build_strategy)
# for evaluation. And program compiled from ir graph must be with data parallel.
context.eval_graph.compiled_graph = CompiledProgram(
test_ir_graph.graph).with_data_parallel(
build_strategy=build_strategy)
# for saving inference model after training
context.put('quantization_test_ir_graph_backup', test_ir_graph)
def on_epoch_begin(self, context): def on_epoch_begin(self, context):
""" """
        Insert fake_quantize_op and fake_dequantize_op before training and testing.         Insert fake_quantize_op and fake_dequantize_op before training and testing.
""" """
super(QuantizationStrategy, self).on_compression_begin(context) super(QuantizationStrategy, self).on_epoch_begin(context)
if self.start_epoch == context.epoch_id: if self.start_epoch == context.epoch_id:
_logger.info('QuantizationStrategy::on_epoch_begin') _logger.info('QuantizationStrategy::on_epoch_begin')
train_ir_graph = IrGraph( self._modify_graph_for_quantization(context)
core.Graph(context.optimize_graph.program.desc), for_test=False)
test_ir_graph = IrGraph(
core.Graph(context.eval_graph.program.desc), for_test=True)
transform_pass = QuantizationTransformPass(
scope=context.scope,
place=context.place,
weight_bits=self.weight_bits,
activation_bits=self.activation_bits,
activation_quantize_type=self.activation_quantize_type,
weight_quantize_type=self.weight_quantize_type)
transform_pass.apply(train_ir_graph)
transform_pass.apply(test_ir_graph)
build_strategy = BuildStrategy()
build_strategy.enable_inplace = False
build_strategy.memory_optimize = False
# for quantization training
context.optimize_graph.compiled_graph = CompiledProgram(
train_ir_graph.graph).with_data_parallel(
loss_name=context.optimize_graph.out_nodes['loss'],
build_strategy=build_strategy)
# for evaluation. And program compiled from ir graph must be with data parallel.
context.eval_graph.compiled_graph = CompiledProgram(
test_ir_graph.graph).with_data_parallel(
build_strategy=build_strategy)
# for saving inference model after training
context.put('quantization_test_ir_graph_backup', test_ir_graph)
_logger.info('Finish QuantizationStrategy::on_epoch_begin') _logger.info('Finish QuantizationStrategy::on_epoch_begin')
def on_epoch_end(self, context): def on_epoch_end(self, context):
......
...@@ -33,10 +33,17 @@ distillers: ...@@ -33,10 +33,17 @@ distillers:
teacher_feature_map: 'teacher.tmp_2' teacher_feature_map: 'teacher.tmp_2'
student_feature_map: 'student.tmp_2' student_feature_map: 'student.tmp_2'
distillation_loss_weight: 1 distillation_loss_weight: 1
soft_label_distiller:
class: 'SoftLabelDistiller'
student_temperature: 1.0
teacher_temperature: 1.0
teacher_feature_map: 'teacher.tmp_1'
student_feature_map: 'student.tmp_1'
distillation_loss_weight: 0.001
strategies: strategies:
distillation_strategy: distillation_strategy:
class: 'DistillationStrategy' class: 'DistillationStrategy'
distillers: ['fsp_distiller', 'l2_distiller'] distillers: ['fsp_distiller', 'l2_distiller', 'soft_label_distiller']
start_epoch: 0 start_epoch: 0
end_epoch: 1 end_epoch: 1
compressor: compressor:
......
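The new soft_label_distiller entry pairs temperature-softened teacher and student outputs. As a rough NumPy sketch of the usual soft-label distillation loss (an assumption based on the standard knowledge-distillation recipe, not the exact SoftLabelDistiller formula):

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def soft_label_loss(student_logits, teacher_logits,
                    student_temperature=1.0, teacher_temperature=1.0):
    # Cross-entropy between temperature-softened teacher and student distributions.
    t = softmax(teacher_logits / teacher_temperature)
    s = softmax(student_logits / student_temperature)
    return float(np.mean(-np.sum(t * np.log(s + 1e-10), axis=-1)))

student = np.random.randn(4, 10)
teacher = np.random.randn(4, 10)
print(soft_label_loss(student, teacher, 1.0, 1.0))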
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from functools import partial
import numpy as np
import paddle
import paddle.fluid as fluid
import contextlib
def get_places():
places = [fluid.CPUPlace()]
if fluid.core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
return places
@contextlib.contextmanager
def prog_scope_guard(main_prog, startup_prog):
scope = fluid.core.Scope()
with fluid.unique_name.guard():
with fluid.scope_guard(scope):
with fluid.program_guard(main_prog, startup_prog):
yield
def bow_net(data,
label,
dict_dim,
is_sparse=False,
emb_dim=128,
hid_dim=128,
hid_dim2=96,
class_dim=2):
"""
BOW net
This model is from https://github.com/PaddlePaddle/models:
fluid/PaddleNLP/text_classification/nets.py
"""
emb = fluid.layers.embedding(
input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bow_tanh = fluid.layers.tanh(bow)
fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
return avg_cost
class TestWeightDecay(unittest.TestCase):
def setUp(self):
self.word_dict = paddle.dataset.imdb.word_dict()
reader = paddle.batch(
paddle.dataset.imdb.train(self.word_dict), batch_size=2)()
self.train_data = [next(reader) for _ in range(5)]
self.learning_rate = .5
def run_program(self, place, feed_list):
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
exe.run(fluid.default_startup_program())
main_prog = fluid.default_main_program()
param_list = [var.name for var in main_prog.block(0).all_parameters()]
param_sum = []
for data in self.train_data:
out = exe.run(main_prog,
feed=feeder.feed(data),
fetch_list=param_list)
p_sum = 0
for v in out:
p_sum += np.sum(np.abs(v))
param_sum.append(p_sum)
return param_sum
def check_weight_decay(self, place, model):
main_prog = fluid.framework.Program()
startup_prog = fluid.framework.Program()
startup_prog.random_seed = 1
with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
avg_cost = model(data, label, len(self.word_dict))
AdamW = fluid.contrib.extend_with_decoupled_weight_decay(
fluid.optimizer.Adam)
optimizer = AdamW(
learning_rate=self.learning_rate,
weight_decay=self.learning_rate)
optimizer.minimize(avg_cost)
param_sum = self.run_program(place, [data, label])
return param_sum
def check_weight_decay2(self, place, model):
main_prog = fluid.framework.Program()
startup_prog = fluid.framework.Program()
startup_prog.random_seed = 1
with prog_scope_guard(main_prog=main_prog, startup_prog=startup_prog):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
avg_cost = model(data, label, len(self.word_dict))
param_list = [(var, var * self.learning_rate)
for var in main_prog.block(0).all_parameters()]
optimizer = fluid.optimizer.Adam(learning_rate=self.learning_rate)
optimizer.minimize(avg_cost)
for params in param_list:
updated_p = fluid.layers.elementwise_sub(
x=params[0], y=params[1])
fluid.layers.assign(input=updated_p, output=params[0])
param_sum = self.run_program(place, [data, label])
return param_sum
def test_weight_decay(self):
for place in get_places():
model = partial(bow_net, is_sparse=False)
param_sum1 = self.check_weight_decay(place, model)
param_sum2 = self.check_weight_decay2(place, model)
for i in range(len(param_sum1)):
assert np.isclose(a=param_sum1[i], b=param_sum2[i], rtol=5e-5)
if __name__ == '__main__':
unittest.main()
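For reference, a small NumPy sketch (using plain SGD rather than Adam, with illustrative names) of the distinction the test above probes: decoupled weight decay applies the shrink directly to the parameters instead of folding it into the gradient. For vanilla SGD the two forms coincide, but for adaptive optimizers such as Adam they differ, which is why the test compares the extended AdamW optimizer against plain Adam plus an explicit parameter shrink.

import numpy as np

def sgd_with_coupled_l2(param, grad, lr, decay):
    # L2 regularization folded into the gradient.
    return param - lr * (grad + decay * param)

def sgd_with_decoupled_decay(param, grad, lr, decay):
    # Gradient step first, then an explicit shrink of the weights.
    return param - lr * grad - lr * decay * param

p = np.array([1.0, -2.0])
g = np.array([0.1, 0.3])
print(sgd_with_coupled_l2(p, g, lr=0.5, decay=0.5))
print(sgd_with_decoupled_decay(p, g, lr=0.5, decay=0.5))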
...@@ -32,6 +32,9 @@ from .profiler import * ...@@ -32,6 +32,9 @@ from .profiler import *
from . import checkpoint from . import checkpoint
from .checkpoint import * from .checkpoint import *
from . import learning_rate_scheduler
from .learning_rate_scheduler import *
__all__ = [] __all__ = []
__all__ += layers.__all__ __all__ += layers.__all__
__all__ += base.__all__ __all__ += base.__all__
...@@ -39,3 +42,4 @@ __all__ += nn.__all__ ...@@ -39,3 +42,4 @@ __all__ += nn.__all__
__all__ += tracer.__all__ __all__ += tracer.__all__
__all__ += profiler.__all__ __all__ += profiler.__all__
__all__ += checkpoint.__all__ __all__ += checkpoint.__all__
__all__ += learning_rate_scheduler.__all__
...@@ -22,7 +22,7 @@ __all__ = ['enabled', 'guard', 'to_variable'] ...@@ -22,7 +22,7 @@ __all__ = ['enabled', 'guard', 'to_variable']
def enabled(): def enabled():
return framework._in_imperative_mode() return framework._in_dygraph_mode()
@signature_safe_contextmanager @signature_safe_contextmanager
...@@ -39,14 +39,14 @@ def guard(place=None): ...@@ -39,14 +39,14 @@ def guard(place=None):
with framework.program_guard(train, startup): with framework.program_guard(train, startup):
with framework.unique_name.guard(): with framework.unique_name.guard():
with framework._imperative_guard(tracer): with framework._dygraph_guard(tracer):
with framework._imperative_place_guard(place): with framework._dygraph_place_guard(place):
yield yield
def to_variable(value, block=None, name=None): def to_variable(value, block=None, name=None):
if isinstance(value, np.ndarray): if isinstance(value, np.ndarray):
assert enabled(), "to_variable could only be called in imperative mode" assert enabled(), "to_variable could only be called in dygraph mode"
if not block: if not block:
block = framework.default_main_program().current_block() block = framework.default_main_program().current_block()
......
...@@ -68,7 +68,7 @@ def save_persistables(vardict, dirname, filename=None): ...@@ -68,7 +68,7 @@ def save_persistables(vardict, dirname, filename=None):
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell) init_cell)
param_path = "./my_paddle_model" param_path = "./my_paddle_model"
fluid.imperative.checkpoint.save_persistables(ptb_model.state_dict(), dirname=param_path, fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path,
layer=ptb_model) layer=ptb_model)
""" """
if isinstance(vardict, collections.OrderedDict): if isinstance(vardict, collections.OrderedDict):
...@@ -97,17 +97,17 @@ def load_persistables(vardict, dirname, filename=None): ...@@ -97,17 +97,17 @@ def load_persistables(vardict, dirname, filename=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
my_layer = layer(fluid.imperative.Layer) my_layer = layer(fluid.dygraph.Layer)
param_path = "./my_paddle_model" param_path = "./my_paddle_model"
param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.parameters(), param_path) param_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path)
param_1 = param_dict['PtbModel_0.w_1'] param_1 = param_dict['PtbModel_0.w_1']
or: or:
my_layer = layer(fluid.imperative.Layer) my_layer = layer(fluid.dygraph.Layer)
param_path = "./my_paddle_model" param_path = "./my_paddle_model"
filename = "model.file" filename = "model.file"
param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.state_dict(), param_path, param_dict = fluid.dygraph.load_persistables(my_layer.state_dict(), param_path,
filename=filename) filename=filename)
param_1 = param_dict['PtbModel_0.w_1'] param_1 = param_dict['PtbModel_0.w_1']
......
...@@ -16,7 +16,7 @@ from __future__ import print_function ...@@ -16,7 +16,7 @@ from __future__ import print_function
import copy import copy
import six import six
from ..framework import Parameter, _in_imperative_mode from ..framework import Parameter, _in_dygraph_mode
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from .. import core from .. import core
from six.moves import zip from six.moves import zip
......
...@@ -283,7 +283,7 @@ class PyLayer(core.PyLayer): ...@@ -283,7 +283,7 @@ class PyLayer(core.PyLayer):
@classmethod @classmethod
def __call__(cls, *inputs): def __call__(cls, *inputs):
tracer = framework._imperative_tracer() tracer = framework._dygraph_tracer()
block = framework.default_main_program().current_block() block = framework.default_main_program().current_block()
ivar_inputs = [x._ivar for x in inputs] ivar_inputs = [x._ivar for x in inputs]
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import math
from .. import unique_name
__all__ = [
'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay'
]
class LearningRateDecay(object):
"""
Base class of learning rate decay
"""
def __init__(self, begin=0, step=1, dtype='float32'):
self.step_num = begin
self.step_size = step
self.dtype = dtype
def __call__(self):
lr = self.step()
if isinstance(lr, float):
lr = self.create_lr_var(lr)
self.step_num += self.step_size
return lr
def create_lr_var(self, lr):
from .. import layers
lr = layers.create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(lr),
dtype=self.dtype,
persistable=True)
return lr
def step(self):
raise NotImplementedError()
class PiecewiseDecay(LearningRateDecay):
def __init__(self, boundaries, values, begin, step=1, dtype='float32'):
super(PiecewiseDecay, self).__init__(begin, step, dtype)
self.boundaries = boundaries
self.values = values
self.vars = []
for value in values:
self.vars.append(self.create_lr_var(value))
def step(self):
for i in range(len(self.boundaries)):
if self.step_num < self.boundaries[i]:
return self.vars[i]
return self.vars[len(self.values) - 1]
class NaturalExpDecay(LearningRateDecay):
def __init__(self,
learning_rate,
decay_steps,
decay_rate,
staircase=False,
begin=0,
step=1,
dtype='float32'):
super(NaturalExpDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.decay_steps = decay_steps
self.decay_rate = decay_rate
self.staircase = staircase
def step(self):
from .. import layers
div_res = self.create_lr_var(self.step_num / self.decay_steps)
if self.staircase:
div_res = layers.floor(div_res)
decayed_lr = self.learning_rate * layers.exp(-1 * self.decay_rate *
div_res)
return decayed_lr
class ExponentialDecay(LearningRateDecay):
def __init__(self,
learning_rate,
decay_steps,
decay_rate,
staircase=False,
begin=0,
step=1,
dtype='float32'):
super(ExponentialDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.decay_steps = decay_steps
self.decay_rate = decay_rate
self.staircase = staircase
def step(self):
from .. import layers
div_res = self.create_lr_var(self.step_num / self.decay_steps)
if self.staircase:
div_res = layers.floor(div_res)
decayed_lr = self.learning_rate * (self.decay_rate**div_res)
return decayed_lr
class InverseTimeDecay(LearningRateDecay):
def __init__(self,
learning_rate,
decay_steps,
decay_rate,
staircase=False,
begin=0,
step=1,
dtype='float32'):
super(InverseTimeDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.decay_steps = decay_steps
self.decay_rate = decay_rate
self.staircase = staircase
def step(self):
from .. import layers
div_res = self.create_lr_var(self.step_num / self.decay_steps)
if self.staircase:
div_res = layers.floor(div_res)
decayed_lr = self.learning_rate / (1 + self.decay_rate * div_res)
return decayed_lr
class PolynomialDecay(LearningRateDecay):
def __init__(self,
learning_rate,
decay_steps,
end_learning_rate=0.0001,
power=1.0,
cycle=False,
begin=0,
step=1,
dtype='float32'):
super(PolynomialDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.decay_steps = decay_steps
self.end_learning_rate = end_learning_rate
self.power = power
self.cycle = cycle
def step(self):
from .. import layers
tmp_step_num = self.step_num
tmp_decay_steps = self.decay_steps
if self.cycle:
div_res = layers.ceil(
self.create_lr_var(tmp_step_num / float(self.decay_steps)))
if tmp_step_num == 0:
div_res = self.create_lr_var(1.0)
tmp_decay_steps = self.decay_steps * div_res
else:
tmp_step_num = self.create_lr_var(tmp_step_num
if tmp_step_num < self.decay_steps
else self.decay_steps)
decayed_lr = (self.learning_rate - self.end_learning_rate) * \
((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate
return decayed_lr
class CosineDecay(LearningRateDecay):
def __init__(self,
learning_rate,
step_each_epoch,
epochs,
begin=0,
step=1,
dtype='float32'):
super(CosineDecay, self).__init__(begin, step, dtype)
self.learning_rate = learning_rate
self.step_each_epoch = step_each_epoch
self.epochs = epochs
def step(self):
from .. import layers
cur_epoch = layers.floor(
self.create_lr_var(self.step_num / self.step_each_epoch))
decayed_lr = self.learning_rate * 0.5 * (
layers.cos(cur_epoch * math.pi / self.epochs) + 1)
return decayed_lr
class NoamDecay(LearningRateDecay):
def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
super(NoamDecay, self).__init__(begin, step, dtype)
self.d_model = d_model
self.warmup_steps = warmup_steps
def step(self):
from .. import layers
a = self.create_lr_var(self.step_num**-0.5)
b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num)
lr_value = (self.d_model**-0.5) * layers.elementwise_min(a, b)
return lr_value
...@@ -133,7 +133,7 @@ class Conv2D(layers.Layer): ...@@ -133,7 +133,7 @@ class Conv2D(layers.Layer):
outputs={'Out': [pre_act]}, outputs={'Out': [pre_act]},
attrs={'axis': 1}) attrs={'axis': 1})
# Currently, we don't support inplace in imperative mode # Currently, we don't support inplace in dygraph mode
return self._helper.append_activation(pre_act, act=self._act) return self._helper.append_activation(pre_act, act=self._act)
...@@ -265,7 +265,7 @@ class FC(layers.Layer): ...@@ -265,7 +265,7 @@ class FC(layers.Layer):
attrs={'axis': self._num_flatten_dims}) attrs={'axis': self._num_flatten_dims})
else: else:
pre_activation = pre_bias pre_activation = pre_bias
# Currently, we don't support inplace in imperative mode # Currently, we don't support inplace in dygraph mode
return self._helper.append_activation(pre_activation, act=self._act) return self._helper.append_activation(pre_activation, act=self._act)
...@@ -387,7 +387,7 @@ class BatchNorm(layers.Layer): ...@@ -387,7 +387,7 @@ class BatchNorm(layers.Layer):
"use_global_stats": self._use_global_stats "use_global_stats": self._use_global_stats
}) })
# Currently, we don't support inplace in imperative mode # Currently, we don't support inplace in dygraph mode
return self._helper.append_activation(batch_norm_out, self._act) return self._helper.append_activation(batch_norm_out, self._act)
...@@ -426,7 +426,7 @@ class Embedding(layers.Layer): ...@@ -426,7 +426,7 @@ class Embedding(layers.Layer):
dict_size = len(dataset.ids) dict_size = len(dataset.ids)
input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32')
embedding = fluid.imperative.Embedding(size=[dict_size, 16]) embedding = fluid.dygraph.Embedding(size=[dict_size, 16])
fc = embedding(input) fc = embedding(input)
""" """
......
...@@ -24,12 +24,12 @@ __all__ = ['Tracer'] ...@@ -24,12 +24,12 @@ __all__ = ['Tracer']
def release_op(op): def release_op(op):
del framework._imperative_tracer()._ops[op._trace_id] del framework._dygraph_tracer()._ops[op._trace_id]
class Tracer(core.Tracer): class Tracer(core.Tracer):
""" """
Python wrapper of imperative tracer Python wrapper of dygraph tracer
""" """
def __init__(self, block): def __init__(self, block):
......
...@@ -75,20 +75,20 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ...@@ -75,20 +75,20 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix()
ZERO_VAR_SUFFIX = core.kZeroVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
_imperative_tracer_ = None _dygraph_tracer_ = None
_imperative_current_expected_place_ = None _dygraph_current_expected_place_ = None
def _in_imperative_mode(): def _in_dygraph_mode():
return _imperative_tracer_ is not None return _dygraph_tracer_ is not None
def _imperative_tracer(): def _dygraph_tracer():
return _imperative_tracer_ return _dygraph_tracer_
def _current_expected_place(): def _current_expected_place():
return _imperative_current_expected_place_ return _dygraph_current_expected_place_
def _cpu_num(): def _cpu_num():
...@@ -396,7 +396,7 @@ class Variable(object): ...@@ -396,7 +396,7 @@ class Variable(object):
if not isinstance(dtype, core.VarDesc.VarType): if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype) dtype = convert_np_dtype_to_dtype_(dtype)
if _in_imperative_mode(): if _in_dygraph_mode():
# record vars in tracer rather than blocks # record vars in tracer rather than blocks
self._ivar = kwargs.get("ivar", None) self._ivar = kwargs.get("ivar", None)
if not self._ivar: if not self._ivar:
...@@ -406,7 +406,7 @@ class Variable(object): ...@@ -406,7 +406,7 @@ class Variable(object):
_current_expected_place(), stop_gradient, True _current_expected_place(), stop_gradient, True
if persistable else False) if persistable else False)
if persistable: if persistable:
_imperative_tracer().trace_var(name, self) _dygraph_tracer().trace_var(name, self)
else: else:
self.error_clip = error_clip self.error_clip = error_clip
...@@ -515,8 +515,8 @@ class Variable(object): ...@@ -515,8 +515,8 @@ class Variable(object):
Returns: Returns:
str: The debug string. str: The debug string.
""" """
if _in_imperative_mode(): if _in_dygraph_mode():
# TODO(panyx0718): add more imperative debug info. # TODO(panyx0718): add more dygraph debug info.
return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype, return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype,
self.shape) self.shape)
...@@ -548,42 +548,42 @@ class Variable(object): ...@@ -548,42 +548,42 @@ class Variable(object):
@property @property
def _stop_gradient(self): def _stop_gradient(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.stop_gradient return self._ivar.stop_gradient
else: else:
return self.stop_gradient return self.stop_gradient
@_stop_gradient.setter @_stop_gradient.setter
def _stop_gradient(self, s): def _stop_gradient(self, s):
if _in_imperative_mode(): if _in_dygraph_mode():
self._ivar.stop_gradient = s self._ivar.stop_gradient = s
else: else:
self.stop_gradient = s self.stop_gradient = s
@property @property
def persistable(self): def persistable(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.persistable return self._ivar.persistable
else: else:
return self.desc.persistable() return self.desc.persistable()
@persistable.setter @persistable.setter
def persistable(self, p): def persistable(self, p):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.persistable return self._ivar.persistable
else: else:
self.desc.set_persistable(p) self.desc.set_persistable(p)
@property @property
def name(self): def name(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.name return self._ivar.name
else: else:
return cpt.to_text(self.desc.name()) return cpt.to_text(self.desc.name())
@name.setter @name.setter
def name(self, new_name): def name(self, new_name):
if _in_imperative_mode(): if _in_dygraph_mode():
self._ivar.name = new_name self._ivar.name = new_name
else: else:
self.desc.set_name(new_name) self.desc.set_name(new_name)
...@@ -591,26 +591,26 @@ class Variable(object): ...@@ -591,26 +591,26 @@ class Variable(object):
@property @property
def shape(self): def shape(self):
# convert to tuple, make it as same as numpy API. # convert to tuple, make it as same as numpy API.
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.shape return self._ivar.shape
else: else:
return tuple(self.desc.shape()) return tuple(self.desc.shape())
@property @property
def dtype(self): def dtype(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.dtype return self._ivar.dtype
else: else:
return self.desc.dtype() return self.desc.dtype()
@property @property
def lod_level(self): def lod_level(self):
# TODO(minqiyang): Support lod_level in imperative mode # TODO(minqiyang): Support lod_level in dygraph mode
return self.desc.lod_level() return self.desc.lod_level()
@property @property
def type(self): def type(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self._ivar.dtype return self._ivar.dtype
else: else:
return self.desc.type() return self.desc.type()
...@@ -789,13 +789,24 @@ class Variable(object): ...@@ -789,13 +789,24 @@ class Variable(object):
if isinstance(item, tuple): if isinstance(item, tuple):
if len(item) > len(self.shape): if len(item) > len(self.shape):
raise IndexError("Too many indexes") raise IndexError("Too many indexes")
fixedSize = True
for i in range(len(self.shape)):
if self.shape[i] == -1:
fixedSize = False
break
newitem = self._reconstructSliceinfo(item) or item newitem = self._reconstructSliceinfo(item) or item
check, info = self._detectContinuesSlice(newitem) if fixedSize:
if check: check, info = self._detectContinuesSlice(newitem)
starts = info[0] if check:
ends = info[1] starts = info[0]
axes = [i for i in range(len(starts))] ends = info[1]
return self._sliceVar(axes, starts, ends) axes = [i for i in range(len(starts))]
return self._sliceVar(axes, starts, ends)
else:
new_var = self
for index, o in enumerate(newitem):
new_var = new_var._sliceAndConcatVar(o, index)
else: else:
new_var = self new_var = self
for index, o in enumerate(newitem): for index, o in enumerate(newitem):
...@@ -918,7 +929,7 @@ class Operator(object): ...@@ -918,7 +929,7 @@ class Operator(object):
inputs=None, inputs=None,
outputs=None, outputs=None,
attrs=None): attrs=None):
if _in_imperative_mode(): if _in_dygraph_mode():
if type is None: if type is None:
raise ValueError( raise ValueError(
"`type` to initialized an Operator can not be None.") "`type` to initialized an Operator can not be None.")
...@@ -1037,7 +1048,7 @@ class Operator(object): ...@@ -1037,7 +1048,7 @@ class Operator(object):
for arg in out_args: for arg in out_args:
out_arg_names.append(cpt.to_text(arg.name)) out_arg_names.append(cpt.to_text(arg.name))
# TODO(minqiyang): could we remove variable's op in static mode? # TODO(minqiyang): could we remove variable's op in static mode?
if not _in_imperative_mode(): if not _in_dygraph_mode():
arg.op = self arg.op = self
self.desc.set_output(out_proto.name, out_arg_names) self.desc.set_output(out_proto.name, out_arg_names)
...@@ -1083,7 +1094,7 @@ class Operator(object): ...@@ -1083,7 +1094,7 @@ class Operator(object):
@property @property
def type(self): def type(self):
if _in_imperative_mode(): if _in_dygraph_mode():
return self.iop.type return self.iop.type
else: else:
return self.desc.type() return self.desc.type()
...@@ -1626,7 +1637,7 @@ class Block(object): ...@@ -1626,7 +1637,7 @@ class Block(object):
Returns: Returns:
Operator: the append Operator. Operator: the append Operator.
""" """
if _in_imperative_mode(): if _in_dygraph_mode():
op = Operator( op = Operator(
block=self, block=self,
desc=None, desc=None,
...@@ -1638,9 +1649,8 @@ class Block(object): ...@@ -1638,9 +1649,8 @@ class Block(object):
# record ops in tracer rather than blocks # record ops in tracer rather than blocks
# #
# TODO(minqiyang): add op stop_gradient support in static mode too. # TODO(minqiyang): add op stop_gradient support in static mode too.
# currently, we only support stop_gradient in imperative mode. # currently, we only support stop_gradient in dygraph mode.
_imperative_tracer().trace_op(op, _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False))
kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc.append_op() op_desc = self.desc.append_op()
op = Operator( op = Operator(
...@@ -1699,7 +1709,7 @@ class Block(object): ...@@ -1699,7 +1709,7 @@ class Block(object):
return self.ops[start:end] return self.ops[start:end]
def _prepend_op(self, *args, **kwargs): def _prepend_op(self, *args, **kwargs):
if _in_imperative_mode(): if _in_dygraph_mode():
op = Operator( op = Operator(
self, self,
None, None,
...@@ -1707,8 +1717,7 @@ class Block(object): ...@@ -1707,8 +1717,7 @@ class Block(object):
inputs=kwargs.get("inputs", None), inputs=kwargs.get("inputs", None),
outputs=kwargs.get("outputs", None), outputs=kwargs.get("outputs", None),
attrs=kwargs.get("attrs", None)) attrs=kwargs.get("attrs", None))
_imperative_tracer().trace_op(op, _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False))
kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc._prepend_op() op_desc = self.desc._prepend_op()
op = Operator( op = Operator(
...@@ -3511,22 +3520,22 @@ def _get_var(name, program=None): ...@@ -3511,22 +3520,22 @@ def _get_var(name, program=None):
@signature_safe_contextmanager @signature_safe_contextmanager
def _imperative_guard(tracer): def _dygraph_guard(tracer):
global _imperative_tracer_ global _dygraph_tracer_
tmp_trace = _imperative_tracer_ tmp_trace = _dygraph_tracer_
_imperative_tracer_ = tracer _dygraph_tracer_ = tracer
yield yield
_imperative_tracer_ = tmp_trace _dygraph_tracer_ = tmp_trace
@signature_safe_contextmanager @signature_safe_contextmanager
def _imperative_place_guard(place): def _dygraph_place_guard(place):
global _imperative_current_expected_place_ global _dygraph_current_expected_place_
tmp_place = _imperative_current_expected_place_ tmp_place = _dygraph_current_expected_place_
_imperative_current_expected_place_ = place _dygraph_current_expected_place_ = place
yield yield
_imperative_current_expected_place_ = tmp_place _dygraph_current_expected_place_ = tmp_place
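The renamed guards keep the same save/swap/restore pattern around a module-level global. A stripped-down, framework-free sketch of that pattern (generic names; the restore here is wrapped in try/finally for safety, whereas the guards above restore after the yield):

from contextlib import contextmanager

_current_tracer = None  # module-level global, analogous to _dygraph_tracer_

@contextmanager
def tracer_guard(tracer):
    # Save the old value, install the new one, restore on exit.
    global _current_tracer
    previous = _current_tracer
    _current_tracer = tracer
    try:
        yield
    finally:
        _current_tracer = previous

with tracer_guard("my-tracer"):
    print(_current_tracer)  # my-tracer
print(_current_tracer)      # None again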
...@@ -165,7 +165,7 @@ class ConstantInitializer(Initializer): ...@@ -165,7 +165,7 @@ class ConstantInitializer(Initializer):
'force_cpu': self._force_cpu or force_init_on_cpu() 'force_cpu': self._force_cpu or force_init_on_cpu()
}, },
stop_gradient=True) stop_gradient=True)
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -245,7 +245,7 @@ class UniformInitializer(Initializer): ...@@ -245,7 +245,7 @@ class UniformInitializer(Initializer):
attrs={"in_dtype": out_var.dtype, attrs={"in_dtype": out_var.dtype,
"out_dtype": var.dtype}) "out_dtype": var.dtype})
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -324,7 +324,7 @@ class NormalInitializer(Initializer): ...@@ -324,7 +324,7 @@ class NormalInitializer(Initializer):
outputs={"Out": var}, outputs={"Out": var},
attrs={"in_dtype": out_var.dtype, attrs={"in_dtype": out_var.dtype,
"out_dtype": var.dtype}) "out_dtype": var.dtype})
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -403,7 +403,7 @@ class TruncatedNormalInitializer(Initializer): ...@@ -403,7 +403,7 @@ class TruncatedNormalInitializer(Initializer):
outputs={"Out": var}, outputs={"Out": var},
attrs={"in_dtype": out_var.dtype, attrs={"in_dtype": out_var.dtype,
"out_dtype": var.dtype}) "out_dtype": var.dtype})
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -509,7 +509,7 @@ class XavierInitializer(Initializer): ...@@ -509,7 +509,7 @@ class XavierInitializer(Initializer):
"seed": self._seed "seed": self._seed
}, },
stop_gradient=True) stop_gradient=True)
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -610,7 +610,7 @@ class MSRAInitializer(Initializer): ...@@ -610,7 +610,7 @@ class MSRAInitializer(Initializer):
"seed": self._seed "seed": self._seed
}, },
stop_gradient=True) stop_gradient=True)
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -709,7 +709,7 @@ class BilinearInitializer(Initializer): ...@@ -709,7 +709,7 @@ class BilinearInitializer(Initializer):
'shape': list(shape), 'shape': list(shape),
value_name: values value_name: values
}) })
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
...@@ -768,7 +768,7 @@ class NumpyArrayInitializer(Initializer): ...@@ -768,7 +768,7 @@ class NumpyArrayInitializer(Initializer):
value_name: values value_name: values
}, },
stop_gradient=True) stop_gradient=True)
if not framework._in_imperative_mode(): if not framework._in_dygraph_mode():
var.op = op var.op = op
return op return op
......
...@@ -17,7 +17,7 @@ from .param_attr import ParamAttr ...@@ -17,7 +17,7 @@ from .param_attr import ParamAttr
from .initializer import Constant from .initializer import Constant
from . import layers from . import layers
from . import backward from . import backward
from .imperative import Layer, nn from .dygraph import Layer, nn
from . import executor from . import executor
from . import core from . import core
......
...@@ -17,7 +17,7 @@ from __future__ import print_function ...@@ -17,7 +17,7 @@ from __future__ import print_function
import copy import copy
import six import six
from .framework import Parameter, dtype_is_floating, _in_imperative_mode from .framework import Parameter, dtype_is_floating, _in_dygraph_mode
from . import unique_name from . import unique_name
from paddle.fluid.initializer import Constant, Xavier from paddle.fluid.initializer import Constant, Xavier
from .param_attr import ParamAttr from .param_attr import ParamAttr
...@@ -30,9 +30,9 @@ class LayerHelper(LayerHelperBase): ...@@ -30,9 +30,9 @@ class LayerHelper(LayerHelperBase):
def __init__(self, layer_type, **kwargs): def __init__(self, layer_type, **kwargs):
self.kwargs = kwargs self.kwargs = kwargs
name = self.kwargs.get('name', None) name = self.kwargs.get('name', None)
# TODO(panyx0718, minqiyang): imperative mode # TODO(panyx0718, minqiyang): dygraph mode
# can not use both `layer_type` and `name`. Deprecate LayerHelper # can not use both `layer_type` and `name`. Deprecate LayerHelper
# and write a Helper for imperative mode. # and write a Helper for dygraph mode.
if name is None: if name is None:
self.kwargs['name'] = unique_name.generate(layer_type) self.kwargs['name'] = unique_name.generate(layer_type)
......
...@@ -17,7 +17,7 @@ from __future__ import print_function ...@@ -17,7 +17,7 @@ from __future__ import print_function
import copy import copy
import numpy as np import numpy as np
from .framework import Variable, default_main_program, default_startup_program, _in_imperative_mode, _current_expected_place from .framework import Variable, default_main_program, default_startup_program, _in_dygraph_mode, _current_expected_place
from . import unique_name from . import unique_name
from .param_attr import ParamAttr, WeightNormParamAttr from .param_attr import ParamAttr, WeightNormParamAttr
from . import core from . import core
...@@ -54,8 +54,8 @@ class LayerHelperBase(object): ...@@ -54,8 +54,8 @@ class LayerHelperBase(object):
Return Variable construct from value Return Variable construct from value
""" """
if isinstance(value, np.ndarray): if isinstance(value, np.ndarray):
assert _in_imperative_mode( assert _in_dygraph_mode(
), "to_variable could only be called in imperative mode" ), "to_variable could only be called in dygraph mode"
if not block: if not block:
block = default_main_program().current_block() block = default_main_program().current_block()
...@@ -302,8 +302,8 @@ class LayerHelperBase(object): ...@@ -302,8 +302,8 @@ class LayerHelperBase(object):
param = self._create_weight_normalize(attr, shape, dtype) param = self._create_weight_normalize(attr, shape, dtype)
WeightNormParamAttr.params_with_weight_norm.append(param) WeightNormParamAttr.params_with_weight_norm.append(param)
return param return param
if _in_imperative_mode(): if _in_dygraph_mode():
# In imperative mode, we want the returned parameter to be # In dygraph mode, we want the returned parameter to be
# initialized so that it can be used imperatively. # initialized so that it can be used imperatively.
return self.main_program.global_block().create_parameter( return self.main_program.global_block().create_parameter(
dtype=dtype, dtype=dtype,
...@@ -370,7 +370,7 @@ class LayerHelperBase(object): ...@@ -370,7 +370,7 @@ class LayerHelperBase(object):
initializer: initializer to use initializer: initializer to use
""" """
assert isinstance(var, Variable) assert isinstance(var, Variable)
if _in_imperative_mode(): if _in_dygraph_mode():
initializer(var, var.block) initializer(var, var.block)
else: else:
self.startup_program.global_block().create_var( self.startup_program.global_block().create_var(
......
...@@ -929,9 +929,9 @@ def array_read(array, i): ...@@ -929,9 +929,9 @@ def array_read(array, i):
Examples: Examples:
.. code-block:: python .. code-block:: python
tmp = fluid.layers.zeros(shape=[10], dtype='int32') array = fluid.layers.create_array(dtype='float32')
i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
arr = layers.array_read(tmp, i=i) item = fluid.layers.array_read(array, i)
""" """
helper = LayerHelper('array_read', **locals()) helper = LayerHelper('array_read', **locals())
if not isinstance( if not isinstance(
......
...@@ -22,18 +22,21 @@ strategy according to this module. ...@@ -22,18 +22,21 @@ strategy according to this module.
from __future__ import print_function from __future__ import print_function
import math
from . import control_flow from . import control_flow
from . import nn from . import nn
from . import ops from . import ops
from . import tensor from . import tensor
from ..initializer import init_on_cpu from ..initializer import init_on_cpu
from ..framework import default_main_program, Parameter, unique_name, name_scope from ..framework import default_main_program, Parameter, unique_name, name_scope
import math from ..dygraph import base as imperative_base
from ..dygraph import learning_rate_scheduler as imperate_lr
__all__ = [ __all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS', 'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS',
'cosine_decay' 'cosine_decay', 'linear_lr_warmup'
] ]
...@@ -66,13 +69,17 @@ def noam_decay(d_model, warmup_steps): ...@@ -66,13 +69,17 @@ def noam_decay(d_model, warmup_steps):
The decayed learning rate. The decayed learning rate.
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter(1) if imperative_base.enabled():
decay = imperate_lr.NoamDecay(d_model, warmup_steps)
return decay
else:
global_step = _decay_step_counter(1)
a = global_step**-0.5 a = global_step**-0.5
b = (warmup_steps**-1.5) * global_step b = (warmup_steps**-1.5) * global_step
lr_value = (d_model**-0.5) * nn.elementwise_min(a, b) lr_value = (d_model**-0.5) * nn.elementwise_min(a, b)
return lr_value return lr_value
def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
...@@ -112,14 +119,19 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -112,14 +119,19 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.ExponentialDecay(learning_rate, decay_steps,
decay_rate, staircase)
return decay
else:
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate * (decay_rate**div_res) decayed_lr = learning_rate * (decay_rate**div_res)
return decayed_lr return decayed_lr
def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
...@@ -141,14 +153,19 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -141,14 +153,19 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
The decayed learning rate The decayed learning rate
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.NaturalExpDecay(learning_rate, decay_steps,
decay_rate, staircase)
return decay
else:
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
return decayed_lr return decayed_lr
def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
...@@ -187,15 +204,20 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -187,15 +204,20 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.InverseTimeDecay(learning_rate, decay_steps,
decay_rate, staircase)
return decay
else:
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate / (1 + decay_rate * div_res) decayed_lr = learning_rate / (1 + decay_rate * div_res)
return decayed_lr return decayed_lr
def polynomial_decay(learning_rate, def polynomial_decay(learning_rate,
...@@ -227,27 +249,33 @@ def polynomial_decay(learning_rate, ...@@ -227,27 +249,33 @@ def polynomial_decay(learning_rate,
Variable: The decayed learning rate Variable: The decayed learning rate
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.PolynomialDecay(learning_rate, decay_steps,
if cycle: end_learning_rate, power, cycle)
div_res = ops.ceil(global_step / decay_steps) return decay
zero_var = tensor.fill_constant(
shape=[1], dtype='float32', value=0.0)
one_var = tensor.fill_constant(
shape=[1], dtype='float32', value=1.0)
with control_flow.Switch() as switch:
with switch.case(global_step == zero_var):
tensor.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res
else: else:
decay_steps_var = tensor.fill_constant( global_step = _decay_step_counter()
shape=[1], dtype='float32', value=float(decay_steps))
global_step = nn.elementwise_min(x=global_step, y=decay_steps_var) if cycle:
div_res = ops.ceil(global_step / decay_steps)
zero_var = tensor.fill_constant(
shape=[1], dtype='float32', value=0.0)
one_var = tensor.fill_constant(
shape=[1], dtype='float32', value=1.0)
with control_flow.Switch() as switch:
with switch.case(global_step == zero_var):
tensor.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res
else:
decay_steps_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(decay_steps))
global_step = nn.elementwise_min(
x=global_step, y=decay_steps_var)
decayed_lr = (learning_rate - end_learning_rate) * \ decayed_lr = (learning_rate - end_learning_rate) * \
((1 - global_step / decay_steps) ** power) + end_learning_rate ((1 - global_step / decay_steps) ** power) + end_learning_rate
return decayed_lr return decayed_lr
def piecewise_decay(boundaries, values): def piecewise_decay(boundaries, values):
...@@ -279,34 +307,38 @@ def piecewise_decay(boundaries, values): ...@@ -279,34 +307,38 @@ def piecewise_decay(boundaries, values):
if len(values) - len(boundaries) != 1: if len(values) - len(boundaries) != 1:
raise ValueError("len(values) - len(boundaries) should be 1") raise ValueError("len(values) - len(boundaries) should be 1")
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.PiecewiseDecay(boundaries, values, 0)
return decay
else:
global_step = _decay_step_counter()
lr = tensor.create_global_var( lr = tensor.create_global_var(
shape=[1], shape=[1],
value=0.0, value=0.0,
dtype='float32', dtype='float32',
persistable=True, persistable=True,
name="learning_rate") name="learning_rate")
with control_flow.Switch() as switch: with control_flow.Switch() as switch:
for i in range(len(boundaries)): for i in range(len(boundaries)):
boundary_val = tensor.fill_constant( boundary_val = tensor.fill_constant(
shape=[1],
dtype='float32',
value=float(boundaries[i]),
force_cpu=True)
value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1], shape=[1],
dtype='float32', dtype='float32',
value=float(boundaries[i]), value=float(values[len(values) - 1]))
force_cpu=True) with switch.default():
value_var = tensor.fill_constant( tensor.assign(last_value_var, lr)
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1],
dtype='float32',
value=float(values[len(values) - 1]))
with switch.default():
tensor.assign(last_value_var, lr)
return lr return lr
def cosine_decay(learning_rate, step_each_epoch, epochs): def cosine_decay(learning_rate, step_each_epoch, epochs):
...@@ -336,12 +368,17 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): ...@@ -336,12 +368,17 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
learning_rate = base_lr, step_each_epoch=10000, epochs=120) learning_rate = base_lr, step_each_epoch=10000, epochs=120)
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter() if imperative_base.enabled():
decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch,
epochs)
return decay
else:
global_step = _decay_step_counter()
cur_epoch = ops.floor(global_step / step_each_epoch) cur_epoch = ops.floor(global_step / step_each_epoch)
decayed_lr = learning_rate * 0.5 * ( decayed_lr = learning_rate * 0.5 * (
ops.cos(cur_epoch * math.pi / epochs) + 1) ops.cos(cur_epoch * math.pi / epochs) + 1)
return decayed_lr return decayed_lr
def append_LARS(params_grads, learning_rate, weight_decay): def append_LARS(params_grads, learning_rate, weight_decay):
...@@ -363,6 +400,9 @@ def append_LARS(params_grads, learning_rate, weight_decay): ...@@ -363,6 +400,9 @@ def append_LARS(params_grads, learning_rate, weight_decay):
/ (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
""" """
assert not imperative_base.enabled(
), "append_LARS is NOT supported in dygraph mode now"
def _balanced_weight(param_norm, grad_norm): def _balanced_weight(param_norm, grad_norm):
if weight_decay == 1.0: if weight_decay == 1.0:
return grad_norm + param_norm return grad_norm + param_norm
...@@ -383,3 +423,59 @@ def append_LARS(params_grads, learning_rate, weight_decay): ...@@ -383,3 +423,59 @@ def append_LARS(params_grads, learning_rate, weight_decay):
/ _balanced_weight(param_norm, grad_norm) / _balanced_weight(param_norm, grad_norm)
# set back param local learning rate # set back param local learning rate
param.optimize_attr['learning_rate'] = decayed_lr param.optimize_attr['learning_rate'] = decayed_lr
def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
"""
Applies linear learning rate warmup before the normal learning rate
scheduling.
.. code-block:: python
if global_step < warmup_steps:
linear_step = end_lr - start_lr
lr = start_lr + linear_step * (global_step / warmup_steps)
Args:
learning_rate (float | Variable): A float value or Variable.
warmup_steps (int): The warmup steps.
start_lr (float): The start learning rate of warmup.
end_lr (float): The end learning rate of warmup.
Returns:
The decayed learning rate in warmup period.
Examples:
.. code-block:: python
boundaries = [100, 200]
lr_steps = [0.1, 0.01, 0.001]
warmup_steps = 50
start_lr = 1. / 3.
end_lr = 0.1
decayed_lr = fluid.layers.linear_lr_warmup(
fluid.layers.piecewise_decay(boundaries, lr_steps),
warmup_steps, start_lr, end_lr)
"""
assert (isinstance(end_lr, float))
assert (isinstance(start_lr, float))
linear_step = end_lr - start_lr
with default_main_program()._lr_schedule_guard():
lr = tensor.create_global_var(
shape=[1],
value=0.0,
dtype='float32',
persistable=True,
name="learning_rate_warmup")
global_step = _decay_step_counter()
with control_flow.Switch() as switch:
with switch.case(global_step < warmup_steps):
decayed_lr = start_lr + linear_step * (global_step /
float(warmup_steps))
tensor.assign(decayed_lr, lr)
with switch.default():
tensor.assign(learning_rate, lr)
return lr
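A pure-Python sketch of the schedule linear_lr_warmup builds (illustrative names), combined with the piecewise decay from the docstring example: ramp linearly from start_lr to end_lr over warmup_steps, then hand over to the base schedule.

def piecewise(step, boundaries, values):
    for b, v in zip(boundaries, values):
        if step < b:
            return v
    return values[-1]

def linear_warmup(step, warmup_steps, start_lr, end_lr, after_warmup_lr):
    # Linear ramp during warmup, base schedule afterwards.
    if step < warmup_steps:
        return start_lr + (end_lr - start_lr) * (step / float(warmup_steps))
    return after_warmup_lr

boundaries, lr_steps = [100, 200], [0.1, 0.01, 0.001]
for s in (0, 25, 49, 50, 150, 250):
    print(s, linear_warmup(s, 50, 1. / 3., 0.1, piecewise(s, boundaries, lr_steps)))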
...@@ -23,8 +23,8 @@ import os ...@@ -23,8 +23,8 @@ import os
import inspect import inspect
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant, NumpyArrayInitializer from ..initializer import Normal, Constant, NumpyArrayInitializer
from ..framework import Variable, OpProtoHolder, _in_imperative_mode from ..framework import Variable, OpProtoHolder, _in_dygraph_mode
from ..imperative import base from ..dygraph import base
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
from .tensor import concat, assign from .tensor import concat, assign
...@@ -32,7 +32,7 @@ from . import utils ...@@ -32,7 +32,7 @@ from . import utils
from .. import unique_name from .. import unique_name
from functools import reduce from functools import reduce
from .. import core from .. import core
from ..imperative import layers from ..dygraph import layers
__all__ = [ __all__ = [
'fc', 'fc',
...@@ -183,10 +183,12 @@ __all__ = [ ...@@ -183,10 +183,12 @@ __all__ = [
'get_tensor_from_selected_rows', 'get_tensor_from_selected_rows',
'lstm', 'lstm',
'shuffle_channel', 'shuffle_channel',
'temporal_shift',
'py_func', 'py_func',
'psroi_pool', 'psroi_pool',
'teacher_student_sigmoid_loss', 'teacher_student_sigmoid_loss',
'huber_loss', 'huber_loss',
'kldiv_loss',
'tree_conv', 'tree_conv',
'npair_loss', 'npair_loss',
'fsp_matrix', 'fsp_matrix',
...@@ -296,7 +298,6 @@ def fc(input, ...@@ -296,7 +298,6 @@ def fc(input,
data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32") data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32")
fc = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh") fc = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh")
""" """
helper = LayerHelper("fc", **locals()) helper = LayerHelper("fc", **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
...@@ -1820,17 +1821,18 @@ def sequence_softmax(input, use_cudnn=False, name=None): ...@@ -1820,17 +1821,18 @@ def sequence_softmax(input, use_cudnn=False, name=None):
return softmax_out return softmax_out
def softmax(input, use_cudnn=False, name=None): def softmax(input, use_cudnn=False, name=None, axis=-1):
""" """
The input of the softmax operator is a tensor of any rank. The output tensor The input of the softmax operator is a tensor of any rank. The output tensor
has the same shape as the input. has the same shape as the input.
The input tensor will first be logically flattened to a 2-D matrix. The matrix's The dimension :attr:`axis` of the input tensor will be permuted to the last.
second dimension(row length) is as same as the last dimension of the input Then the input tensor will be logically flattened to a 2-D matrix. The matrix's
second dimension(row length) is the same as the dimension :attr:`axis` of the input
tensor, and the first dimension(column length) is the product of all other tensor, and the first dimension(column length) is the product of all other
dimensions of the input tensor. For each row of the matrix, the softmax operator dimensions of the input tensor. For each row of the matrix, the softmax operator
squashes the K-dimensional(K is the width of the matrix, which is also the size squashes the K-dimensional(K is the width of the matrix, which is also the size
of the input tensor's last dimension) vector of arbitrary real values to a of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a
K-dimensional vector of real values in the range [0, 1] that add up to 1. K-dimensional vector of real values in the range [0, 1] that add up to 1.
It computes the exponential of the given dimension and the sum of exponential It computes the exponential of the given dimension and the sum of exponential
...@@ -1852,6 +1854,9 @@ def softmax(input, use_cudnn=False, name=None): ...@@ -1852,6 +1854,9 @@ def softmax(input, use_cudnn=False, name=None):
False by default. Default: False False by default. Default: False
name (str|None): A name for this layer(optional). If set None, the layer name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically. Default: None. will be named automatically. Default: None.
axis (int): The index of dimension to perform softmax calculations, it should
be in range :math:`[-1, rank - 1]`, while :math:`rank` is the rank of
input variable. Default: -1.
Returns: Returns:
Variable: output of softmax Variable: output of softmax
...@@ -1861,7 +1866,10 @@ def softmax(input, use_cudnn=False, name=None): ...@@ -1861,7 +1866,10 @@ def softmax(input, use_cudnn=False, name=None):
.. code-block:: python .. code-block:: python
fc = fluid.layers.fc(input=x, size=10) fc = fluid.layers.fc(input=x, size=10)
softmax = fluid.layers.softmax(input=fc) # perform softmax in the second dimension
softmax = fluid.layers.softmax(input=fc, axis=1)
# perform softmax in the last dimension
softmax = fluid.layers.softmax(input=fc, axis=-1)
""" """
helper = LayerHelper('softmax', **locals()) helper = LayerHelper('softmax', **locals())
...@@ -1871,7 +1879,8 @@ def softmax(input, use_cudnn=False, name=None): ...@@ -1871,7 +1879,8 @@ def softmax(input, use_cudnn=False, name=None):
type="softmax", type="softmax",
inputs={"X": input}, inputs={"X": input},
outputs={"Out": softmax_out}, outputs={"Out": softmax_out},
attrs={"use_cudnn": use_cudnn}) attrs={"axis": axis,
"use_cudnn": use_cudnn})
return softmax_out return softmax_out
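To illustrate the new axis argument, a NumPy version of the normalize-along-one-dimension semantics described in the updated docstring (illustrative only, not the operator implementation):

import numpy as np

def softmax_along_axis(x, axis=-1):
    # Shift for numerical stability, then normalize along the chosen axis.
    shifted = x - x.max(axis=axis, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=axis, keepdims=True)

x = np.random.randn(2, 3, 4)
print(softmax_along_axis(x, axis=1).sum(axis=1))    # each slice sums to 1 along axis 1
print(softmax_along_axis(x, axis=-1).sum(axis=-1))  # last-axis behaviour, the default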
...@@ -3279,6 +3288,8 @@ def layer_norm(input, ...@@ -3279,6 +3288,8 @@ def layer_norm(input,
>>> dtype='float32') >>> dtype='float32')
>>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
""" """
assert _in_dygraph_mode(
) is not True, "please use FC instead of fc in dygraph mode!"
helper = LayerHelper('layer_norm', **locals()) helper = LayerHelper('layer_norm', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
...@@ -5866,11 +5877,49 @@ def multiplex(inputs, index): ...@@ -5866,11 +5877,49 @@ def multiplex(inputs, index):
""" """
${comment} ${comment}
>>> import paddle.fluid as fluid For Example:
>>> x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
>>> x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32') .. code-block:: text
>>> index = fluid.layers.data(name='index', shape=[1], dtype='int32')
>>> out = fluid.layers.multiplex(inputs=[x1, x2], index=index) case 1:
Given:
X = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]],
[[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]],
[[2,0,3,4], [2,1,7,8], [2,2,4,2], [2,3,3,4]],
[[3,0,3,4], [3,1,7,8], [3,2,4,2], [3,3,3,4]]]
index = [3,0,1,2]
out:[[3 0 3 4] // X[3,0] (3 = index[i], 0 = i); i=0
[0 1 3 4] // X[0,1] (0 = index[i], 1 = i); i=1
[1 2 4 2] // X[1,2] (1 = index[i], 2 = i); i=2
[2 3 3 4]] // X[2,3] (2 = index[i], 3 = i); i=3
case 2:
Given:
X = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]],
[[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]]]
index = [1,0]
out:[[1 0 3 4] // X[1,0] (1 = index[0], 0 = i); i=0
[0 1 3 4] // X[0,1] (0 = index[1], 1 = i); i=1
[0 2 4 4] // X[0,2] (0 = 0, 2 = i); i=2
[0 3 3 4]] // X[0,3] (0 = 0, 3 = i); i=3
Examples:
.. code-block:: python
import paddle.fluid as fluid
x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
index = fluid.layers.data(name='index', shape=[1], dtype='int32')
out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
Args: Args:
inputs (list): ${x_comment}. inputs (list): ${x_comment}.
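Editorial note: the selection rule illustrated in case 1 above can be checked with a short NumPy sketch (not part of this patch); output row ``i`` is row ``i`` of the input chosen by ``index[i]``.

.. code-block:: python

    import numpy as np

    # X stacks the candidate inputs: X[k] is the k-th input tensor.
    X = np.array([[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]],
                  [[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]],
                  [[2,0,3,4], [2,1,7,8], [2,2,4,2], [2,3,3,4]],
                  [[3,0,3,4], [3,1,7,8], [3,2,4,2], [3,3,3,4]]])
    index = np.array([3, 0, 1, 2])

    # row i of the output comes from row i of input index[i]
    out = np.stack([X[index[i], i] for i in range(len(index))])
    print(out)  # [[3 0 3 4], [0 1 3 4], [1 2 4 2], [2 3 3 4]]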
...@@ -6405,8 +6454,8 @@ def squeeze(input, axes, name=None): ...@@ -6405,8 +6454,8 @@ def squeeze(input, axes, name=None):
x = layers.data(name='x', shape=[5, 1, 10]) x = layers.data(name='x', shape=[5, 1, 10])
y = layers.squeeze(input=x, axes=[1]) y = layers.squeeze(input=x, axes=[1])
""" """
assert not _in_imperative_mode(), ( assert not _in_dygraph_mode(), (
"squeeze layer is not supported in imperative mode yet.") "squeeze layer is not supported in dygraph mode yet.")
helper = LayerHelper("squeeze", **locals()) helper = LayerHelper("squeeze", **locals())
out = helper.create_variable_for_type_inference(dtype=input.dtype) out = helper.create_variable_for_type_inference(dtype=input.dtype)
x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
...@@ -9144,7 +9193,7 @@ def _elementwise_op(helper): ...@@ -9144,7 +9193,7 @@ def _elementwise_op(helper):
op_type = helper.layer_type op_type = helper.layer_type
x = helper.kwargs.get('x', None) x = helper.kwargs.get('x', None)
y = helper.kwargs.get('y', None) y = helper.kwargs.get('y', None)
if _in_imperative_mode(): if _in_dygraph_mode():
x = base.to_variable(x) x = base.to_variable(x)
y = base.to_variable(y) y = base.to_variable(y)
...@@ -10352,6 +10401,48 @@ def shuffle_channel(x, group, name=None): ...@@ -10352,6 +10401,48 @@ def shuffle_channel(x, group, name=None):
return out return out
@templatedoc()
def temporal_shift(x, seg_num, shift_ratio=0.25, name=None):
"""
**Temporal Shift Operator**
${comment}
Args:
x(Variable): ${x_comment}
seg_num(int): ${seg_num_comment}
shift_ratio(float): ${shift_ratio_comment}
name (str, default None): The name of this layer.
Returns:
out(Variable): The temporal shifting result is a tensor variable with the
same shape and same type as the input.
Raises:
TypeError: seg_num must be int type.
Examples:
.. code-block:: python
input = fluid.layers.data(name='input', shape=[4,2,2], dtype='float32')
out = fluid.layers.temporal_shift(x=input, seg_num=2, shift_ratio=0.2)
"""
helper = LayerHelper("temporal_shift", **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
if not isinstance(seg_num, int):
raise TypeError("seg_num must be int type.")
helper.append_op(
type="temporal_shift",
inputs={"X": x},
outputs={"Out": out},
attrs={"seg_num": seg_num,
"shift_ratio": shift_ratio})
return out
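Editorial note: a hedged NumPy sketch of the TSM-style shift this op is understood to perform (zero-padded shifts of two channel slices along the temporal dimension); it is an illustration only and may differ in detail from the actual kernel.

.. code-block:: python

    import numpy as np

    def temporal_shift_ref(x, seg_num, shift_ratio=0.25):
        # x: [N*T, C, H, W] with T == seg_num
        nt, c, h, w = x.shape
        n, t = nt // seg_num, seg_num
        x = x.reshape(n, t, c, h, w)
        fold = int(c * shift_ratio)
        out = np.zeros_like(x)
        out[:, :-1, :fold] = x[:, 1:, :fold]                  # first slice: shift one step in time
        out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]  # second slice: shift the other way
        out[:, :, 2 * fold:] = x[:, :, 2 * fold:]             # remaining channels unchanged
        return out.reshape(nt, c, h, w)

    y = temporal_shift_ref(np.random.rand(4, 8, 2, 2).astype('float32'), seg_num=2)
    print(y.shape)  # (4, 8, 2, 2)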
class PyFuncRegistry(object): class PyFuncRegistry(object):
_register_funcs = [] _register_funcs = []
...@@ -10672,6 +10763,38 @@ def huber_loss(input, label, delta): ...@@ -10672,6 +10763,38 @@ def huber_loss(input, label, delta):
return out return out
@templatedoc()
def kldiv_loss(x, target, reduction='mean', name=None):
"""
${comment}
Args:
x (Variable): ${x_comment}
target (Variable): ${target_comment}
reduction (str): ${reduction_comment}
name (str, default None): The name of this layer.
Returns:
kldiv\_loss (Variable): The KL divergence loss.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[4,2,2], dtype='float32')
target = fluid.layers.data(name='target', shape=[4,2,2], dtype='float32')
loss = fluid.layers.kldiv_loss(x=x, target=target, reduction='batchmean')
"""
helper = LayerHelper('kldiv_loss', **locals())
loss = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type='kldiv_loss',
inputs={'X': x,
'Target': target},
outputs={'Loss': loss},
attrs={'reduction': reduction})
return loss
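Editorial note: a hedged NumPy sketch of the KL-divergence loss this wrapper exposes, assuming ``x`` holds log-probabilities and ``target`` holds probabilities; the reduction names follow the usual 'none'/'mean'/'sum'/'batchmean' convention and are an assumption here, not read from the kernel.

.. code-block:: python

    import numpy as np

    def kldiv_loss_ref(x, target, reduction='mean'):
        # elementwise term: target * (log(target) - x); the where() guard treats 0 * log(0) as 0
        loss = np.where(target > 0, target * (np.log(target) - x), 0.0)
        if reduction == 'none':
            return loss
        if reduction == 'sum':
            return loss.sum()
        if reduction == 'batchmean':
            return loss.sum() / loss.shape[0]
        return loss.mean()  # 'mean'

    x = np.log(np.full((4, 3), 1.0 / 3.0))   # uniform log-probabilities
    target = np.full((4, 3), 1.0 / 3.0)      # identical target distribution
    print(kldiv_loss_ref(x, target, 'batchmean'))  # ~0.0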
@templatedoc() @templatedoc()
def tree_conv(nodes_vector, def tree_conv(nodes_vector,
edge_set, edge_set,
......
...@@ -20,7 +20,6 @@ from ..framework import convert_np_dtype_to_dtype_ ...@@ -20,7 +20,6 @@ from ..framework import convert_np_dtype_to_dtype_
from ..framework import Variable from ..framework import Variable
from ..initializer import Constant, force_init_on_cpu from ..initializer import Constant, force_init_on_cpu
from ..core import VarDesc from ..core import VarDesc
from ..imperative import base as imperative_base
from .layer_function_generator import templatedoc from .layer_function_generator import templatedoc
import numpy import numpy
......
...@@ -30,7 +30,8 @@ from .initializer import Constant ...@@ -30,7 +30,8 @@ from .initializer import Constant
from .layer_helper import LayerHelper from .layer_helper import LayerHelper
from .layers import ops from .layers import ops
from .regularizer import append_regularization_ops from .regularizer import append_regularization_ops
from .imperative import base as imperative_base from .dygraph import base as imperative_base
from .dygraph.learning_rate_scheduler import LearningRateDecay
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.layers import tensor from paddle.fluid.layers import tensor
from functools import reduce from functools import reduce
...@@ -54,9 +55,19 @@ class Optimizer(object): ...@@ -54,9 +55,19 @@ class Optimizer(object):
""" """
def __init__(self, learning_rate, regularization=None, name=None): def __init__(self, learning_rate, regularization=None, name=None):
if not isinstance(learning_rate, float) and \ if framework._in_dygraph_mode():
not isinstance(learning_rate, framework.Variable): if not isinstance(learning_rate, float) and \
raise TypeError("learning rate should be float or Variable") not isinstance(learning_rate, LearningRateDecay):
raise TypeError(
"learning rate should be float or LearningRateDecay, got %s here"
% type(learning_rate))
else:
if not isinstance(learning_rate, float) and \
not isinstance(learning_rate, framework.Variable):
raise TypeError(
"learning rate should be float or Variable, got %s here" %
type(learning_rate))
self._name = name self._name = name
self.regularization = regularization self.regularization = regularization
self._learning_rate = learning_rate self._learning_rate = learning_rate
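Editorial note: with the check above, the accepted learning-rate types now depend on the execution mode; a minimal usage sketch under that assumption (static graph takes a float or an LR Variable, dygraph takes a float or a LearningRateDecay object).

.. code-block:: python

    import paddle.fluid as fluid
    from paddle.fluid.optimizer import SGDOptimizer

    # static graph mode: a plain float learning rate is accepted
    sgd_static = SGDOptimizer(learning_rate=1e-3)

    # dygraph mode: a float (or a LearningRateDecay instance) is accepted
    with fluid.dygraph.guard():
        sgd_dygraph = SGDOptimizer(learning_rate=1e-3)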
...@@ -80,24 +91,49 @@ class Optimizer(object): ...@@ -80,24 +91,49 @@ class Optimizer(object):
return self._opti_name_list return self._opti_name_list
def _create_global_learning_rate(self): def _create_global_learning_rate(self):
lr = self._global_learning_rate() if imperative_base.enabled():
# create learning rate Variable
if isinstance(self._learning_rate, float):
lr = self._global_learning_rate()
if isinstance(lr, framework.Variable): if isinstance(lr, framework.Variable):
return return
else: else:
if not isinstance(self._learning_rate, float): self._learning_rate_map[framework.default_main_program(
)] = layers.create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype='float32' if self._dtype is None else self._dtype,
persistable=True)
# get learning rate Variable from LearningRateDecay
elif isinstance(self._learning_rate, LearningRateDecay):
self._learning_rate_map[framework.default_main_program(
)] = self._learning_rate()
else:
raise TypeError( raise TypeError(
"learning rate variable is create outside optimizer," "optimizer's learning rate must be float or LearningRateDecay"
"can not create new learning rate variable for new program") )
else:
lr = self._global_learning_rate()
# create learning rate in the current main program if isinstance(lr, framework.Variable):
self._learning_rate_map[framework.default_main_program( return
)] = layers.create_global_var( else:
name=unique_name.generate("learning_rate"), if not isinstance(self._learning_rate, float):
shape=[1], raise TypeError(
value=float(self._learning_rate), "learning rate variable is created outside optimizer,"
dtype='float32' if self._dtype is None else self._dtype, "can not create new learning rate variable for new program"
persistable=True) )
# create learning rate in the current main program
self._learning_rate_map[framework.default_main_program(
)] = layers.create_global_var(
name=unique_name.generate("learning_rate"),
shape=[1],
value=float(self._learning_rate),
dtype='float32' if self._dtype is None else self._dtype,
persistable=True)
def _global_learning_rate(self, program=None): def _global_learning_rate(self, program=None):
""" """
...@@ -169,7 +205,7 @@ class Optimizer(object): ...@@ -169,7 +205,7 @@ class Optimizer(object):
name = self._name + "_" + name name = self._name + "_" + name
if (name in self._accumulators and if (name in self._accumulators and
param.name in self._accumulators[name]): param.name in self._accumulators[name]):
if framework._in_imperative_mode(): if framework._in_dygraph_mode():
return self._accumulators[name][param.name] return self._accumulators[name][param.name]
raise Exception("Accumulator {} already exists for parameter {}". raise Exception("Accumulator {} already exists for parameter {}".
format(name, param.name)) format(name, param.name))
...@@ -326,12 +362,38 @@ class Optimizer(object): ...@@ -326,12 +362,38 @@ class Optimizer(object):
Examples: Examples:
See examples in `apply_gradients`. See examples in `apply_gradients`.
""" """
if callbacks is None: self._dtype = loss.dtype
callbacks = [error_clip_callback] if framework._in_dygraph_mode():
if parameter_list is not None:
parameters = parameter_list
else:
parameters = framework._dygraph_tracer().all_parameters()
params_grads = []
for param in parameters:
if not param.trainable:
continue
if param._ivar._grad_ivar() is not None:
# create gradient variable
grad_var = Variable(
block=loss.block,
name=param._ivar._grad_name(),
stop_gradient=True,
ivar=param._ivar._grad_ivar())
params_grads.append((param, grad_var))
else: else:
assert (isinstance(callbacks, list)) if callbacks is None:
callbacks.append(error_clip_callback) callbacks = [error_clip_callback]
return append_backward(loss, parameter_list, no_grad_set, callbacks) else:
assert (isinstance(callbacks, list))
program = loss.block.program
with program_guard(program, startup_program):
params_grads = append_backward(loss, parameter_list,
no_grad_set, callbacks)
# Note: since we can't use all_reduce_op now,
# dgc_op should be the last op of one grad.
self._append_dgc_ops(params_grads)
return params_grads
def apply_gradients(self, params_grads): def apply_gradients(self, params_grads):
""" """
...@@ -372,6 +434,30 @@ class Optimizer(object): ...@@ -372,6 +434,30 @@ class Optimizer(object):
return optimize_ops return optimize_ops
def apply_optimize(self, loss, startup_program, params_grads):
"""
Second part of `minimize`, appending optimization operators for
given `params_grads` pairs.
Args:
loss (Variable): loss variable to run optimizations.
startup_program (Program): startup_program for initializing parameters
in `parameter_list`.
params_grads (list): list of (param, grad) pair to do optimization.
Returns:
list: A list of operators appended to the current program.
"""
if framework._in_dygraph_mode():
with program_guard(framework.default_main_program(),
framework.default_startup_program()):
optimize_ops = self._create_optimization_pass(params_grads)
else:
program = loss.block.program
with program_guard(program, startup_program):
optimize_ops = self.apply_gradients(params_grads)
return optimize_ops
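Editorial note: with ``backward`` and ``apply_optimize`` split out, ``minimize`` becomes a thin wrapper around the two; a hedged static-graph sketch of calling the halves explicitly (the small network below is a placeholder, not taken from this patch).

.. code-block:: python

    import paddle.fluid as fluid
    from paddle.fluid.optimizer import SGDOptimizer

    x = fluid.layers.data(name='x', shape=[4], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

    opt = SGDOptimizer(learning_rate=1e-3)
    # equivalent to opt.minimize(loss): collect (param, grad) pairs, then append update ops
    params_grads = opt.backward(loss)
    optimize_ops = opt.apply_optimize(loss, startup_program=None, params_grads=params_grads)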
def minimize(self, def minimize(self,
loss, loss,
startup_program=None, startup_program=None,
...@@ -394,38 +480,13 @@ class Optimizer(object): ...@@ -394,38 +480,13 @@ class Optimizer(object):
tuple: (optimize_ops, params_grads) which are, list of operators appended; tuple: (optimize_ops, params_grads) which are, list of operators appended;
and list of (param, grad) Variables pair for optimization. and list of (param, grad) Variables pair for optimization.
""" """
self._dtype = loss.dtype params_grads = self.backward(
optimize_ops = [] loss,
if framework._in_imperative_mode(): startup_program=startup_program,
if parameter_list is not None: parameter_list=parameter_list,
parameters = parameter_list no_grad_set=no_grad_set)
else: optimize_ops = self.apply_optimize(
parameters = framework._imperative_tracer().all_parameters() loss, startup_program=startup_program, params_grads=params_grads)
params_grads = []
for param in parameters:
if not param.trainable:
continue
if param._ivar._grad_ivar() is not None:
# create gradient variable
grad_var = Variable(
block=loss.block,
name=param._ivar._grad_name(),
stop_gradient=True,
ivar=param._ivar._grad_ivar())
params_grads.append((param, grad_var))
with program_guard(framework.default_main_program(),
framework.default_startup_program()):
optimize_ops = self._create_optimization_pass(params_grads)
else:
program = loss.block.program
with program_guard(program, startup_program):
params_grads = self.backward(loss, startup_program,
parameter_list, no_grad_set)
# Note: since we can't use all_reduce_op now,
# dgc_op should be the last op of one grad.
self._append_dgc_ops(params_grads)
optimize_ops = self.apply_gradients(params_grads)
return optimize_ops, params_grads return optimize_ops, params_grads
...@@ -581,10 +642,10 @@ class DGCMomentumOptimizer(MomentumOptimizer): ...@@ -581,10 +642,10 @@ class DGCMomentumOptimizer(MomentumOptimizer):
DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication.
This optimizer will do two things: This optimizer will do two things:
1. Compress the gradient by getting the TopK important values from the tensor \ 1. Compress the gradient by getting the TopK important values from the tensor \
and use them for allreduce to reduce network bandwidth. and use them for allreduce to reduce network bandwidth.
2. Call momentum to optimize on the cost. 2. Call momentum to optimize on the cost.
Args: Args:
......
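Editorial note: a hedged NumPy sketch of the top-k gradient sparsification idea described in the DGC docstring above (keep only the largest-magnitude entries, accumulate the rest locally); it illustrates the concept only and is not the DGC kernel.

.. code-block:: python

    import numpy as np

    def topk_sparsify(grad, ratio=0.001):
        # keep the k largest-magnitude entries; the remainder stays in a local residual
        k = max(1, int(grad.size * ratio))
        flat = grad.ravel()
        idx = np.argpartition(np.abs(flat), -k)[-k:]
        sparse = np.zeros_like(flat)
        sparse[idx] = flat[idx]
        residual = flat - sparse
        return sparse.reshape(grad.shape), residual.reshape(grad.shape)

    g = np.random.randn(8, 8).astype('float32')
    sparse_g, local_residual = topk_sparsify(g, ratio=0.1)  # sparse_g would be allreduced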
...@@ -78,7 +78,7 @@ list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) ...@@ -78,7 +78,7 @@ list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
list(REMOVE_ITEM TEST_OPS test_imperative_resnet) list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
list(REMOVE_ITEM TEST_OPS test_imperative_optimizer) list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
foreach(TEST_OP ${TEST_OPS}) foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP}) py_test_modules(${TEST_OP} MODULES ${TEST_OP})
...@@ -89,7 +89,7 @@ py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL) ...@@ -89,7 +89,7 @@ py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)
py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
FLAGS_cudnn_deterministic=1) FLAGS_cudnn_deterministic=1)
py_test_modules(test_imperative_optimizer MODULES test_imperative_optimizer ENVS py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS
FLAGS_cudnn_deterministic=1) FLAGS_cudnn_deterministic=1)
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
py_test_modules(test_dist_train MODULES test_dist_train SERIAL) py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
......
...@@ -262,14 +262,14 @@ class OpTest(unittest.TestCase): ...@@ -262,14 +262,14 @@ class OpTest(unittest.TestCase):
if isinstance(value, tuple): if isinstance(value, tuple):
data = value[0] data = value[0]
lod = value[1] lod = value[1]
v = fluid.imperative.base.to_variable(value=data) v = fluid.dygraph.base.to_variable(value=data)
v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod) v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod)
return v return v
else: else:
return fluid.imperative.base.to_variable(value) return fluid.dygraph.base.to_variable(value)
def _calc_imperative_output(self, place, parallel=False, no_check_set=None): def _calc_dygraph_output(self, place, parallel=False, no_check_set=None):
with fluid.imperative.base.guard(place=place): with fluid.dygraph.base.guard(place=place):
block = fluid.default_main_program().global_block() block = fluid.default_main_program().global_block()
# prepare input variable # prepare input variable
...@@ -316,7 +316,7 @@ class OpTest(unittest.TestCase): ...@@ -316,7 +316,7 @@ class OpTest(unittest.TestCase):
return outputs return outputs
def _calc_output(self, place, parallel=False, no_check_set=None): def _calc_output(self, place, parallel=False, no_check_set=None, loss=None):
program = Program() program = Program()
block = program.global_block() block = program.global_block()
self._append_ops(block) self._append_ops(block)
...@@ -329,8 +329,14 @@ class OpTest(unittest.TestCase): ...@@ -329,8 +329,14 @@ class OpTest(unittest.TestCase):
use_cuda = False use_cuda = False
if isinstance(place, fluid.CUDAPlace): if isinstance(place, fluid.CUDAPlace):
use_cuda = True use_cuda = True
executor = fluid.ParallelExecutor( if loss:
use_cuda=use_cuda, loss_name=loss.name, main_program=program) executor = fluid.ParallelExecutor(
use_cuda=use_cuda,
loss_name=loss.name,
main_program=program)
else:
executor = fluid.ParallelExecutor(
use_cuda=use_cuda, main_program=program)
else: else:
executor = Executor(place) executor = Executor(place)
...@@ -364,9 +370,9 @@ class OpTest(unittest.TestCase): ...@@ -364,9 +370,9 @@ class OpTest(unittest.TestCase):
atol, atol,
no_check_set=None, no_check_set=None,
equal_nan=False, equal_nan=False,
check_imperative=False): check_dygraph=False):
if check_imperative: if check_dygraph:
imperative_outs = self._calc_imperative_output( dygraph_outs = self._calc_dygraph_output(
place, no_check_set=no_check_set) place, no_check_set=no_check_set)
outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
...@@ -393,8 +399,8 @@ class OpTest(unittest.TestCase): ...@@ -393,8 +399,8 @@ class OpTest(unittest.TestCase):
type(sub_out)) type(sub_out))
for item in sub_out: for item in sub_out:
sub_out_name, expect = item[0], item[1] sub_out_name, expect = item[0], item[1]
if check_imperative: if check_dygraph:
imperative_actual = imperative_outs[sub_out_name][0] imperative_actual = dygraph_outs[sub_out_name][0]
imperative_actual_t = np.array( imperative_actual_t = np.array(
imperative_actual._ivar.value().get_tensor()) imperative_actual._ivar.value().get_tensor())
idx = find_actual(sub_out_name, fetch_list) idx = find_actual(sub_out_name, fetch_list)
...@@ -407,7 +413,7 @@ class OpTest(unittest.TestCase): ...@@ -407,7 +413,7 @@ class OpTest(unittest.TestCase):
actual_t, expect_t, atol=atol, equal_nan=equal_nan), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " + "Output (" + sub_out_name + ") has diff at " +
str(place)) str(place))
if check_imperative: if check_dygraph:
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
imperative_actual_t, imperative_actual_t,
...@@ -415,21 +421,21 @@ class OpTest(unittest.TestCase): ...@@ -415,21 +421,21 @@ class OpTest(unittest.TestCase):
atol=atol, atol=atol,
equal_nan=equal_nan), equal_nan=equal_nan),
"Output (" + sub_out_name + ") has diff at " + "Output (" + sub_out_name + ") has diff at " +
str(place) + " in imperative mode") str(place) + " in dygraph mode")
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual( self.assertListEqual(
actual.recursive_sequence_lengths(), expect[1], actual.recursive_sequence_lengths(), expect[1],
"Output (" + sub_out_name + "Output (" + sub_out_name +
") has different lod at " + str(place)) ") has different lod at " + str(place))
if check_imperative: if check_dygraph:
self.assertListEqual( self.assertListEqual(
imperative_actual._ivar.value().get_tensor() imperative_actual._ivar.value().get_tensor()
.recursive_sequence_lengths(), expect[1], .recursive_sequence_lengths(), expect[1],
"Output (" + out_name + ") has different lod at " + "Output (" + out_name + ") has different lod at " +
str(place) + " in imperative mode") str(place) + " in dygraph mode")
else: else:
if check_imperative: if check_dygraph:
imperative_actual = imperative_outs[out_name][0] imperative_actual = dygraph_outs[out_name][0]
imperative_actual_t = np.array( imperative_actual_t = np.array(
imperative_actual._ivar.value().get_tensor()) imperative_actual._ivar.value().get_tensor())
idx = find_actual(out_name, fetch_list) idx = find_actual(out_name, fetch_list)
...@@ -443,7 +449,7 @@ class OpTest(unittest.TestCase): ...@@ -443,7 +449,7 @@ class OpTest(unittest.TestCase):
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t) + " in class " + self.__class__.__name__) str(actual_t) + " in class " + self.__class__.__name__)
if check_imperative: if check_dygraph:
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
imperative_actual_t, imperative_actual_t,
...@@ -458,12 +464,12 @@ class OpTest(unittest.TestCase): ...@@ -458,12 +464,12 @@ class OpTest(unittest.TestCase):
self.assertListEqual(actual.recursive_sequence_lengths(), self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name + expect[1], "Output (" + out_name +
") has different lod at " + str(place)) ") has different lod at " + str(place))
if check_imperative: if check_dygraph:
self.assertListEqual( self.assertListEqual(
imperative_actual._ivar.value().get_tensor() imperative_actual._ivar.value().get_tensor()
.recursive_sequence_lengths(), expect[1], .recursive_sequence_lengths(), expect[1],
"Output (" + out_name + ") has different lod at " + "Output (" + out_name + ") has different lod at " +
str(place) + " in imperative mode") str(place) + " in dygraph mode")
def _get_places(self): def _get_places(self):
if self.dtype == np.float16: if self.dtype == np.float16:
...@@ -490,11 +496,11 @@ class OpTest(unittest.TestCase): ...@@ -490,11 +496,11 @@ class OpTest(unittest.TestCase):
atol=1e-5, atol=1e-5,
no_check_set=None, no_check_set=None,
equal_nan=False, equal_nan=False,
check_imperative=False): check_dygraph=False):
places = self._get_places() places = self._get_places()
for place in places: for place in places:
self.check_output_with_place(place, atol, no_check_set, equal_nan, self.check_output_with_place(place, atol, no_check_set, equal_nan,
check_imperative) check_dygraph)
def check_output_customized(self, checker): def check_output_customized(self, checker):
places = self._get_places() places = self._get_places()
......
...@@ -18,7 +18,7 @@ import numpy as np ...@@ -18,7 +18,7 @@ import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
class L1(fluid.imperative.Layer): class L1(fluid.dygraph.Layer):
def __init__(self, prefix): def __init__(self, prefix):
super(L1, self).__init__(prefix) super(L1, self).__init__(prefix)
self._param_attr = fluid.ParamAttr( self._param_attr = fluid.ParamAttr(
...@@ -32,7 +32,7 @@ class L1(fluid.imperative.Layer): ...@@ -32,7 +32,7 @@ class L1(fluid.imperative.Layer):
return self.w1 + self.w2 return self.w1 + self.w2
class L2(fluid.imperative.Layer): class L2(fluid.dygraph.Layer):
def __init__(self, prefix): def __init__(self, prefix):
super(L2, self).__init__(prefix) super(L2, self).__init__(prefix)
self.layer1 = L1(self.full_name()) self.layer1 = L1(self.full_name())
...@@ -42,7 +42,7 @@ class L2(fluid.imperative.Layer): ...@@ -42,7 +42,7 @@ class L2(fluid.imperative.Layer):
return self.layer1() + self.layer2() return self.layer1() + self.layer2()
class L3(fluid.imperative.Layer): class L3(fluid.dygraph.Layer):
def __init__(self, prefix): def __init__(self, prefix):
super(L3, self).__init__(prefix) super(L3, self).__init__(prefix)
self.layer1 = L2(self.full_name()) self.layer1 = L2(self.full_name())
...@@ -54,7 +54,7 @@ class L3(fluid.imperative.Layer): ...@@ -54,7 +54,7 @@ class L3(fluid.imperative.Layer):
class TestBaseLayer(unittest.TestCase): class TestBaseLayer(unittest.TestCase):
def test_one_level(self): def test_one_level(self):
with fluid.imperative.guard(): with fluid.dygraph.guard():
l = L1('test_one_level') l = L1('test_one_level')
ret = l() ret = l()
self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0") self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0")
...@@ -62,7 +62,7 @@ class TestBaseLayer(unittest.TestCase): ...@@ -62,7 +62,7 @@ class TestBaseLayer(unittest.TestCase):
self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2]))) self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))
def test_three_level(self): def test_three_level(self):
with fluid.imperative.guard(): with fluid.dygraph.guard():
l = L3('test_three_level') l = L3('test_three_level')
names = [p.name for p in l.parameters()] names = [p.name for p in l.parameters()]
ret = l() ret = l()
......
...@@ -156,7 +156,7 @@ class TestGRUOp(OpTest): ...@@ -156,7 +156,7 @@ class TestGRUOp(OpTest):
} }
def test_check_output(self): def test_check_output(self):
self.check_output(atol=1e-8, check_imperative=True) self.check_output(atol=1e-8, check_dygraph=True)
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
......
...@@ -18,11 +18,11 @@ import numpy as np ...@@ -18,11 +18,11 @@ import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.imperative.nn import FC from paddle.fluid.dygraph.nn import FC
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
class MyLayer(fluid.imperative.Layer): class MyLayer(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(MyLayer, self).__init__(name_scope) super(MyLayer, self).__init__(name_scope)
...@@ -34,7 +34,7 @@ class MyLayer(fluid.imperative.Layer): ...@@ -34,7 +34,7 @@ class MyLayer(fluid.imperative.Layer):
return [x] return [x]
class MyPyLayer(fluid.imperative.PyLayer): class MyPyLayer(fluid.dygraph.PyLayer):
def __init__(self): def __init__(self):
super(MyPyLayer, self).__init__() super(MyPyLayer, self).__init__()
...@@ -48,7 +48,7 @@ class MyPyLayer(fluid.imperative.PyLayer): ...@@ -48,7 +48,7 @@ class MyPyLayer(fluid.imperative.PyLayer):
return np.array(dout) * (1 - np.square(np.array(out))) return np.array(dout) * (1 - np.square(np.array(out)))
class MLP(fluid.imperative.Layer): class MLP(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(MLP, self).__init__(name_scope) super(MLP, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), self._fc1 = FC(self.full_name(),
...@@ -71,7 +71,7 @@ class MLP(fluid.imperative.Layer): ...@@ -71,7 +71,7 @@ class MLP(fluid.imperative.Layer):
return x return x
class SimpleRNNCell(fluid.imperative.Layer): class SimpleRNNCell(fluid.dygraph.Layer):
def __init__(self, name_scope, step_input_size, hidden_size, output_size, def __init__(self, name_scope, step_input_size, hidden_size, output_size,
param_attr): param_attr):
super(SimpleRNNCell, self).__init__(name_scope) super(SimpleRNNCell, self).__init__(name_scope)
...@@ -159,7 +159,7 @@ class SimpleRNNCell(fluid.imperative.Layer): ...@@ -159,7 +159,7 @@ class SimpleRNNCell(fluid.imperative.Layer):
return reduce_out, hidden return reduce_out, hidden
class SimpleRNN(fluid.imperative.Layer): class SimpleRNN(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(SimpleRNN, self).__init__(name_scope) super(SimpleRNN, self).__init__(name_scope)
self.seq_len = 4 self.seq_len = 4
...@@ -194,10 +194,10 @@ class SimpleRNN(fluid.imperative.Layer): ...@@ -194,10 +194,10 @@ class SimpleRNN(fluid.imperative.Layer):
class TestImperative(unittest.TestCase): class TestImperative(unittest.TestCase):
def test_sum_op(self): def test_sum_op(self):
x = np.ones([2, 2], np.float32) x = np.ones([2, 2], np.float32)
with fluid.imperative.guard(): with fluid.dygraph.guard():
inputs = [] inputs = []
for _ in range(10): for _ in range(10):
inputs.append(fluid.imperative.base.to_variable(x)) inputs.append(fluid.dygraph.base.to_variable(x))
ret = fluid.layers.sums(inputs) ret = fluid.layers.sums(inputs)
loss = fluid.layers.reduce_sum(ret) loss = fluid.layers.reduce_sum(ret)
loss._backward() loss._backward()
...@@ -205,17 +205,17 @@ class TestImperative(unittest.TestCase): ...@@ -205,17 +205,17 @@ class TestImperative(unittest.TestCase):
self.assertTrue(np.allclose(inputs[0]._gradient(), x)) self.assertTrue(np.allclose(inputs[0]._gradient(), x))
def test_layer(self): def test_layer(self):
with fluid.imperative.guard(): with fluid.dygraph.guard():
cl = core.Layer() cl = core.Layer()
cl.forward([]) cl.forward([])
l = fluid.imperative.Layer("l") l = fluid.dygraph.Layer("l")
self.assertRaises(NotImplementedError, l.forward, []) self.assertRaises(NotImplementedError, l.forward, [])
def test_pylayer_func_id(self): def test_pylayer_func_id(self):
with fluid.imperative.guard(): with fluid.dygraph.guard():
class PyLayer1(fluid.imperative.PyLayer): class PyLayer1(fluid.dygraph.PyLayer):
def __init__(self): def __init__(self):
super(PyLayer1, self).__init__() super(PyLayer1, self).__init__()
...@@ -227,7 +227,7 @@ class TestImperative(unittest.TestCase): ...@@ -227,7 +227,7 @@ class TestImperative(unittest.TestCase):
def backward(input): def backward(input):
return input return input
class PyLayer2(fluid.imperative.PyLayer): class PyLayer2(fluid.dygraph.PyLayer):
def __init__(self): def __init__(self):
super(PyLayer2, self).__init__() super(PyLayer2, self).__init__()
...@@ -241,21 +241,21 @@ class TestImperative(unittest.TestCase): ...@@ -241,21 +241,21 @@ class TestImperative(unittest.TestCase):
py_layer_1 = PyLayer1() py_layer_1 = PyLayer1()
py_layer_2 = PyLayer2() py_layer_2 = PyLayer2()
py_layer_1(fluid.imperative.base.to_variable(np.ones([2, 2]))) py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2])))
py_layer_2(fluid.imperative.base.to_variable(np.ones([2, 2]))) py_layer_2(fluid.dygraph.base.to_variable(np.ones([2, 2])))
id = py_layer_1.forward_id id = py_layer_1.forward_id
self.assertGreater(id, 0) self.assertGreater(id, 0)
self.assertEqual(py_layer_1.backward_id, id + 1) self.assertEqual(py_layer_1.backward_id, id + 1)
self.assertEqual(py_layer_2.forward_id, id + 2) self.assertEqual(py_layer_2.forward_id, id + 2)
self.assertEqual(py_layer_2.backward_id, id + 3) self.assertEqual(py_layer_2.backward_id, id + 3)
py_layer_1(fluid.imperative.base.to_variable(np.ones([2, 2]))) py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2])))
self.assertEqual(py_layer_1.forward_id, id) self.assertEqual(py_layer_1.forward_id, id)
def test_pylayer(self): def test_pylayer(self):
np_inp = np.ones([2, 2], np.float32) np_inp = np.ones([2, 2], np.float32)
with fluid.imperative.guard(): with fluid.dygraph.guard():
my_py_layer = MyPyLayer() my_py_layer = MyPyLayer()
var_inp = fluid.imperative.base.to_variable(np_inp) var_inp = fluid.dygraph.base.to_variable(np_inp)
outs = my_py_layer(var_inp) outs = my_py_layer(var_inp)
dy_out = np.sum(outs[0]._numpy()) dy_out = np.sum(outs[0]._numpy())
outs[0]._backward() outs[0]._backward()
...@@ -282,8 +282,8 @@ class TestImperative(unittest.TestCase): ...@@ -282,8 +282,8 @@ class TestImperative(unittest.TestCase):
def test_layer_in_out(self): def test_layer_in_out(self):
np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
with fluid.imperative.guard(): with fluid.dygraph.guard():
var_inp = fluid.imperative.base.to_variable(np_inp) var_inp = fluid.dygraph.base.to_variable(np_inp)
l = MyLayer("my_layer") l = MyLayer("my_layer")
x = l(var_inp)[0] x = l(var_inp)[0]
self.assertIsNotNone(x) self.assertIsNotNone(x)
...@@ -310,8 +310,8 @@ class TestImperative(unittest.TestCase): ...@@ -310,8 +310,8 @@ class TestImperative(unittest.TestCase):
def test_mlp(self): def test_mlp(self):
np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
with fluid.imperative.guard(): with fluid.dygraph.guard():
var_inp = fluid.imperative.base.to_variable(np_inp) var_inp = fluid.dygraph.base.to_variable(np_inp)
mlp = MLP("mlp") mlp = MLP("mlp")
out = mlp(var_inp) out = mlp(var_inp)
dy_out = out._numpy() dy_out = out._numpy()
...@@ -353,8 +353,8 @@ class TestImperative(unittest.TestCase): ...@@ -353,8 +353,8 @@ class TestImperative(unittest.TestCase):
[10.0, 11.0, 12.0]]) [10.0, 11.0, 12.0]])
np_inp = np_inp.reshape((1, 4, 3)) np_inp = np_inp.reshape((1, 4, 3))
np_inp = np_inp.astype(np.float32) np_inp = np_inp.astype(np.float32)
with fluid.imperative.guard(): with fluid.dygraph.guard():
var_inp = fluid.imperative.base.to_variable(np_inp) var_inp = fluid.dygraph.base.to_variable(np_inp)
var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3]) var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3])
simple_rnn = SimpleRNN("simple_rnn") simple_rnn = SimpleRNN("simple_rnn")
outs, pre_hiddens = simple_rnn.forward(var_inp) outs, pre_hiddens = simple_rnn.forward(var_inp)
......
...@@ -18,11 +18,11 @@ import numpy as np ...@@ -18,11 +18,11 @@ import numpy as np
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
class SimpleImgConvPool(fluid.imperative.Layer): class SimpleImgConvPool(fluid.dygraph.Layer):
def __init__(self, def __init__(self,
name_scope, name_scope,
num_channels, num_channels,
...@@ -71,7 +71,7 @@ class SimpleImgConvPool(fluid.imperative.Layer): ...@@ -71,7 +71,7 @@ class SimpleImgConvPool(fluid.imperative.Layer):
return x return x
class MNIST(fluid.imperative.Layer): class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope) super(MNIST, self).__init__(name_scope)
...@@ -98,12 +98,12 @@ class MNIST(fluid.imperative.Layer): ...@@ -98,12 +98,12 @@ class MNIST(fluid.imperative.Layer):
return x return x
class TestImperativeCheckpoint(unittest.TestCase): class TestDygraphCheckpoint(unittest.TestCase):
def save_load_persistables(self): def save_load_persistables(self):
seed = 90 seed = 90
epoch_num = 1 epoch_num = 1
with fluid.imperative.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
...@@ -135,14 +135,14 @@ class TestImperativeCheckpoint(unittest.TestCase): ...@@ -135,14 +135,14 @@ class TestImperativeCheckpoint(unittest.TestCase):
avg_loss._backward() avg_loss._backward()
sgd.minimize(avg_loss) sgd.minimize(avg_loss)
fluid.imperative.save_persistables(mnist, "save_dir") fluid.dygraph.save_persistables(mnist, "save_dir")
mnist.clear_gradients() mnist.clear_gradients()
for param in mnist.parameters(): for param in mnist.parameters():
dy_param_init_value[param.name] = param._numpy() dy_param_init_value[param.name] = param._numpy()
mnist.load_dict( mnist.load_dict(
fluid.imperative.load_persistables(mnist, "save_dir")) fluid.dygraph.load_persistables(mnist, "save_dir"))
restore = mnist.parameters() restore = mnist.parameters()
......
...@@ -22,7 +22,7 @@ import paddle ...@@ -22,7 +22,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
# Can use Amusic dataset as the DeepCF describes. # Can use Amusic dataset as the DeepCF describes.
DATA_PATH = os.environ.get('DATA_PATH', '') DATA_PATH = os.environ.get('DATA_PATH', '')
...@@ -32,11 +32,11 @@ NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5)) ...@@ -32,11 +32,11 @@ NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5))
NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1)) NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1))
class DMF(fluid.imperative.Layer): class DMF(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(DMF, self).__init__(name_scope) super(DMF, self).__init__(name_scope)
self._user_latent = fluid.imperative.FC(self.full_name(), 256) self._user_latent = fluid.dygraph.FC(self.full_name(), 256)
self._item_latent = fluid.imperative.FC(self.full_name(), 256) self._item_latent = fluid.dygraph.FC(self.full_name(), 256)
self._user_layers = [] self._user_layers = []
self._item_layers = [] self._item_layers = []
...@@ -45,12 +45,12 @@ class DMF(fluid.imperative.Layer): ...@@ -45,12 +45,12 @@ class DMF(fluid.imperative.Layer):
self._user_layers.append( self._user_layers.append(
self.add_sublayer( self.add_sublayer(
'user_layer_%d' % i, 'user_layer_%d' % i,
fluid.imperative.FC( fluid.dygraph.FC(
self.full_name(), self._hid_sizes[i], act='relu'))) self.full_name(), self._hid_sizes[i], act='relu')))
self._item_layers.append( self._item_layers.append(
self.add_sublayer( self.add_sublayer(
'item_layer_%d' % i, 'item_layer_%d' % i,
fluid.imperative.FC( fluid.dygraph.FC(
self.full_name(), self._hid_sizes[i], act='relu'))) self.full_name(), self._hid_sizes[i], act='relu')))
def forward(self, users, items): def forward(self, users, items):
...@@ -63,18 +63,18 @@ class DMF(fluid.imperative.Layer): ...@@ -63,18 +63,18 @@ class DMF(fluid.imperative.Layer):
return fluid.layers.elementwise_mul(users, items) return fluid.layers.elementwise_mul(users, items)
class MLP(fluid.imperative.Layer): class MLP(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(MLP, self).__init__(name_scope) super(MLP, self).__init__(name_scope)
self._user_latent = fluid.imperative.FC(self.full_name(), 256) self._user_latent = fluid.dygraph.FC(self.full_name(), 256)
self._item_latent = fluid.imperative.FC(self.full_name(), 256) self._item_latent = fluid.dygraph.FC(self.full_name(), 256)
self._match_layers = [] self._match_layers = []
self._hid_sizes = [128, 64] self._hid_sizes = [128, 64]
for i in range(len(self._hid_sizes)): for i in range(len(self._hid_sizes)):
self._match_layers.append( self._match_layers.append(
self.add_sublayer( self.add_sublayer(
'match_layer_%d' % i, 'match_layer_%d' % i,
fluid.imperative.FC( fluid.dygraph.FC(
self.full_name(), self._hid_sizes[i], act='relu'))) self.full_name(), self._hid_sizes[i], act='relu')))
self._mat self._mat
...@@ -88,7 +88,7 @@ class MLP(fluid.imperative.Layer): ...@@ -88,7 +88,7 @@ class MLP(fluid.imperative.Layer):
return match_vec return match_vec
class DeepCF(fluid.imperative.Layer): class DeepCF(fluid.dygraph.Layer):
def __init__(self, name_scope, num_users, num_items, matrix): def __init__(self, name_scope, num_users, num_items, matrix):
super(DeepCF, self).__init__(name_scope) super(DeepCF, self).__init__(name_scope)
self._num_users = num_users self._num_users = num_users
...@@ -103,7 +103,7 @@ class DeepCF(fluid.imperative.Layer): ...@@ -103,7 +103,7 @@ class DeepCF(fluid.imperative.Layer):
self._mlp = MLP(self.full_name()) self._mlp = MLP(self.full_name())
self._dmf = DMF(self.full_name()) self._dmf = DMF(self.full_name())
self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid') self._match_fc = fluid.dygraph.FC(self.full_name(), 1, act='sigmoid')
def forward(self, users, items): def forward(self, users, items):
# users_emb = self._user_emb(users) # users_emb = self._user_emb(users)
...@@ -191,7 +191,7 @@ def load_data(DATA_PATH): ...@@ -191,7 +191,7 @@ def load_data(DATA_PATH):
np.expand_dims(labels_np, -1), num_users, num_items, matrix np.expand_dims(labels_np, -1), num_users, num_items, matrix
class TestImperativeDeepCF(unittest.TestCase): class TestDygraphDeepCF(unittest.TestCase):
def test_deefcf(self): def test_deefcf(self):
seed = 90 seed = 90
if DATA_PATH: if DATA_PATH:
...@@ -237,7 +237,7 @@ class TestImperativeDeepCF(unittest.TestCase): ...@@ -237,7 +237,7 @@ class TestImperativeDeepCF(unittest.TestCase):
fetch_list=[loss])[0] fetch_list=[loss])[0]
sys.stderr.write('static loss %s\n' % static_loss) sys.stderr.write('static loss %s\n' % static_loss)
with fluid.imperative.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
......
...@@ -22,12 +22,12 @@ import paddle ...@@ -22,12 +22,12 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
class Discriminator(fluid.imperative.Layer): class Discriminator(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(Discriminator, self).__init__(name_scope) super(Discriminator, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), size=32, act='elu') self._fc1 = FC(self.full_name(), size=32, act='elu')
...@@ -38,7 +38,7 @@ class Discriminator(fluid.imperative.Layer): ...@@ -38,7 +38,7 @@ class Discriminator(fluid.imperative.Layer):
return self._fc2(x) return self._fc2(x)
class Generator(fluid.imperative.Layer): class Generator(fluid.dygraph.Layer):
def __init__(self, name_scope): def __init__(self, name_scope):
super(Generator, self).__init__(name_scope) super(Generator, self).__init__(name_scope)
self._fc1 = FC(self.full_name(), size=64, act='elu') self._fc1 = FC(self.full_name(), size=64, act='elu')
...@@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer): ...@@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer):
return self._fc3(x) return self._fc3(x)
class TestImperativeGAN(unittest.TestCase): class TestDygraphGAN(unittest.TestCase):
def test_gan_float32(self): def test_gan_float32(self):
seed = 90 seed = 90
...@@ -130,7 +130,7 @@ class TestImperativeGAN(unittest.TestCase): ...@@ -130,7 +130,7 @@ class TestImperativeGAN(unittest.TestCase):
scope.find_var(param.name).get_tensor()) scope.find_var(param.name).get_tensor())
dy_params = dict() dy_params = dict()
with fluid.imperative.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
......
...@@ -22,16 +22,16 @@ import paddle ...@@ -22,16 +22,16 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.optimizer import AdamOptimizer
from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
def gen_data(): def gen_data():
pass pass
class GraphConv(fluid.imperative.Layer): class GraphConv(fluid.dygraph.Layer):
def __init__(self, name_scope, in_features, out_features): def __init__(self, name_scope, in_features, out_features):
super(GraphConv, self).__init__(name_scope) super(GraphConv, self).__init__(name_scope)
...@@ -50,7 +50,7 @@ class GraphConv(fluid.imperative.Layer): ...@@ -50,7 +50,7 @@ class GraphConv(fluid.imperative.Layer):
return fluid.layers.matmul(adj, support) + self.bias return fluid.layers.matmul(adj, support) + self.bias
class GCN(fluid.imperative.Layer): class GCN(fluid.dygraph.Layer):
def __init__(self, name_scope, num_hidden): def __init__(self, name_scope, num_hidden):
super(GCN, self).__init__(name_scope) super(GCN, self).__init__(name_scope)
self.gc = GraphConv(self.full_name(), num_hidden, 32) self.gc = GraphConv(self.full_name(), num_hidden, 32)
...@@ -61,7 +61,7 @@ class GCN(fluid.imperative.Layer): ...@@ -61,7 +61,7 @@ class GCN(fluid.imperative.Layer):
return self.gc2(x, adj) return self.gc2(x, adj)
class TestImperativeGNN(unittest.TestCase): class TestDygraphGNN(unittest.TestCase):
def test_gnn_float32(self): def test_gnn_float32(self):
seed = 90 seed = 90
...@@ -115,7 +115,7 @@ class TestImperativeGNN(unittest.TestCase): ...@@ -115,7 +115,7 @@ class TestImperativeGNN(unittest.TestCase):
static_weight = np.array( static_weight = np.array(
scope.find_var(model.gc.weight.name).get_tensor()) scope.find_var(model.gc.weight.name).get_tensor())
with fluid.imperative.guard(): with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import contextlib
import unittest
import numpy as np
import six
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
class SimpleImgConvPool(fluid.dygraph.Layer):
def __init__(self,
name_scope,
num_channels,
num_filters,
filter_size,
pool_size,
pool_stride,
pool_padding=0,
pool_type='max',
global_pooling=False,
conv_stride=1,
conv_padding=0,
conv_dilation=1,
conv_groups=1,
act=None,
use_cudnn=False,
param_attr=None,
bias_attr=None):
super(SimpleImgConvPool, self).__init__(name_scope)
self._conv2d = Conv2D(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=conv_stride,
padding=conv_padding,
dilation=conv_dilation,
groups=conv_groups,
param_attr=None,
bias_attr=None,
use_cudnn=use_cudnn)
self._pool2d = Pool2D(
self.full_name(),
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
pool_padding=pool_padding,
global_pooling=global_pooling,
use_cudnn=use_cudnn)
def forward(self, inputs):
x = self._conv2d(inputs)
x = self._pool2d(x)
return x
class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope)
self._simple_img_conv_pool_1 = SimpleImgConvPool(
self.full_name(), 1, 20, 5, 2, 2, act="relu")
self._simple_img_conv_pool_2 = SimpleImgConvPool(
self.full_name(), 20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(self.full_name(),
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs):
x = self._simple_img_conv_pool_1(inputs)
x = self._simple_img_conv_pool_2(x)
x = self._fc(x)
return x
class TestImperativeMnist(unittest.TestCase):
def test_mnist_float32(self):
seed = 90
epoch_num = 1
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
dy_param_init_value = {}
for epoch in range(epoch_num):
for batch_id, data in enumerate(train_reader()):
dy_x_data = np.array(
[x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(128, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
label._stop_gradient = True
cost = mnist(img)
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
dy_out = avg_loss._numpy()
if epoch == 0 and batch_id == 0:
for param in mnist.parameters():
dy_param_init_value[param.name] = param._numpy()
avg_loss._backward()
sgd.minimize(avg_loss)
mnist.clear_gradients()
dy_param_value = {}
for param in mnist.parameters():
dy_param_value[param.name] = param._numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
img = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
cost = mnist(img)
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
sgd.minimize(avg_loss)
# initialize params and fetch them
static_param_init_value = {}
static_param_name_list = []
for param in mnist.parameters():
static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init_value[static_param_name_list[i]] = out[i]
for epoch in range(epoch_num):
for batch_id, data in enumerate(train_reader()):
static_x_data = np.array(
[x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape([128, 1])
fetch_list = [avg_loss.name]
fetch_list.extend(static_param_name_list)
out = exe.run(
fluid.default_main_program(),
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
static_param_value = {}
static_out = out[0]
for i in range(1, len(out)):
static_param_value[static_param_name_list[i - 1]] = out[
i]
self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key]))
self.assertTrue(np.allclose(static_out, dy_out))
for key, value in six.iteritems(static_param_value):
self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))
if __name__ == '__main__':
unittest.main()
...@@ -22,131 +22,71 @@ import six ...@@ -22,131 +22,71 @@ import six
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import core from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer from paddle.fluid.optimizer import SGDOptimizer, Adam
from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from paddle.fluid.dygraph.nn import FC
from paddle.fluid.imperative.base import to_variable from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope from test_imperative_base import new_program_scope
class SimpleImgConvPool(fluid.imperative.Layer): class MLP(fluid.dygraph.Layer):
def __init__(self, def __init__(self, name_scope, param_attr=None, bias_attr=None):
name_scope, super(MLP, self).__init__(name_scope)
num_channels,
num_filters,
filter_size,
pool_size,
pool_stride,
pool_padding=0,
pool_type='max',
global_pooling=False,
conv_stride=1,
conv_padding=0,
conv_dilation=1,
conv_groups=1,
act=None,
use_cudnn=False,
param_attr=None,
bias_attr=None):
super(SimpleImgConvPool, self).__init__(name_scope)
self._conv2d = Conv2D(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=conv_stride,
padding=conv_padding,
dilation=conv_dilation,
groups=conv_groups,
param_attr=None,
bias_attr=None,
use_cudnn=use_cudnn)
self._pool2d = Pool2D(
self.full_name(),
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
pool_padding=pool_padding,
global_pooling=global_pooling,
use_cudnn=use_cudnn)
def forward(self, inputs): self._fc1 = FC(self.full_name(), 10)
x = self._conv2d(inputs) self._fc2 = FC(self.full_name(), 10)
x = self._pool2d(x)
return x
class MNIST(fluid.imperative.Layer):
def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope)
self._simple_img_conv_pool_1 = SimpleImgConvPool( def forward(self, inputs):
self.full_name(), 1, 20, 5, 2, 2, act="relu") y = self._fc1(inputs)
+        y = self._fc2(y)
-        self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(), 20, 50, 5, 2, 2, act="relu")
-
-        pool_2_shape = 50 * 4 * 4
-        SIZE = 10
-        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(self.full_name(),
-                      10,
-                      param_attr=fluid.param_attr.ParamAttr(
-                          initializer=fluid.initializer.NormalInitializer(
-                              loc=0.0, scale=scale)),
-                      act="softmax")
-
-    def forward(self, inputs):
-        x = self._simple_img_conv_pool_1(inputs)
-        x = self._simple_img_conv_pool_2(x)
-        x = self._fc(x)
-        return x
+        return y


-class TestImperativeMnist(unittest.TestCase):
-    def test_mnist_float32(self):
+class TestImperativeOptimizerBase(unittest.TestCase):
+    def setUp(self):
+        self.batch_num = 20
+
+    def get_optimizer(self):
+        raise NotImplementedError()
+
+    def _check_mlp(self):
        seed = 90
-        epoch_num = 1
-        with fluid.imperative.guard():
+        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

-            mnist = MNIST("mnist")
-            sgd = SGDOptimizer(learning_rate=1e-3)
+            mlp = MLP('mlp')
+            optimizer = self.get_optimizer()
            train_reader = paddle.batch(
                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

            dy_param_init_value = {}
-            for epoch in range(epoch_num):
-                for batch_id, data in enumerate(train_reader()):
-                    dy_x_data = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(128, 1)
-
-                    img = to_variable(dy_x_data)
-                    label = to_variable(y_data)
-                    label._stop_gradient = True
-
-                    cost = mnist(img)
-                    loss = fluid.layers.cross_entropy(cost, label)
-                    avg_loss = fluid.layers.mean(loss)
-                    dy_out = avg_loss._numpy()
-
-                    if epoch == 0 and batch_id == 0:
-                        for param in mnist.parameters():
-                            dy_param_init_value[param.name] = param._numpy()
-
-                    avg_loss._backward()
-                    sgd.minimize(avg_loss)
-                    mnist.clear_gradients()
-
-                    dy_param_value = {}
-                    for param in mnist.parameters():
-                        dy_param_value[param.name] = param._numpy()
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id >= self.batch_num:
+                    break
+
+                dy_x_data = np.array(
+                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    128, 1)
+
+                img = to_variable(dy_x_data)
+                label = to_variable(y_data)
+                label._stop_gradient = True
+
+                cost = mlp(img)
+                avg_loss = fluid.layers.reduce_mean(cost)
+                dy_out = avg_loss._numpy()
+
+                if batch_id == 0:
+                    for param in mlp.parameters():
+                        dy_param_init_value[param.name] = param._numpy()
+
+                avg_loss._backward()
+                optimizer.minimize(avg_loss)
+                mlp.clear_gradients()
+                dy_param_value = {}
+                for param in mlp.parameters():
+                    dy_param_value[param.name] = param._numpy()

        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
...@@ -155,23 +95,22 @@ class TestImperativeMnist(unittest.TestCase):
            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

-            mnist = MNIST("mnist")
-            sgd = SGDOptimizer(learning_rate=1e-3)
+            mlp = MLP('mlp')
+            optimizer = self.get_optimizer()
            train_reader = paddle.batch(
                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

            img = fluid.layers.data(
                name='pixel', shape=[1, 28, 28], dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-            cost = mnist(img)
-            loss = fluid.layers.cross_entropy(cost, label)
-            avg_loss = fluid.layers.mean(loss)
-            sgd.minimize(avg_loss)
+            cost = mlp(img)
+            avg_loss = fluid.layers.reduce_mean(cost)
+            optimizer.minimize(avg_loss)

            # initialize params and fetch them
            static_param_init_value = {}
            static_param_name_list = []
-            for param in mnist.parameters():
+            for param in mlp.parameters():
                static_param_name_list.append(param.name)

            out = exe.run(fluid.default_startup_program(),
...@@ -180,29 +119,26 @@ class TestImperativeMnist(unittest.TestCase):
            for i in range(len(static_param_name_list)):
                static_param_init_value[static_param_name_list[i]] = out[i]

-            for epoch in range(epoch_num):
-                for batch_id, data in enumerate(train_reader()):
-                    static_x_data = np.array(
-                        [x[0].reshape(1, 28, 28)
-                         for x in data]).astype('float32')
-                    y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape([128, 1])
-
-                    fetch_list = [avg_loss.name]
-                    fetch_list.extend(static_param_name_list)
-                    out = exe.run(
-                        fluid.default_main_program(),
-                        feed={"pixel": static_x_data,
-                              "label": y_data},
-                        fetch_list=fetch_list)
-
-                    static_param_value = {}
-                    static_out = out[0]
-                    for i in range(1, len(out)):
-                        static_param_value[static_param_name_list[i - 1]] = out[
-                            i]
-
-            self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
+            for batch_id, data in enumerate(train_reader()):
+                if batch_id >= self.batch_num:
+                    break
+
+                static_x_data = np.array(
+                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    [128, 1])
+
+                fetch_list = [avg_loss.name]
+                fetch_list.extend(static_param_name_list)
+                out = exe.run(fluid.default_main_program(),
+                              feed={"pixel": static_x_data,
+                                    "label": y_data},
+                              fetch_list=fetch_list)
+
+                static_param_value = {}
+                static_out = out[0]
+                for i in range(1, len(out)):
+                    static_param_value[static_param_name_list[i - 1]] = out[i]

        for key, value in six.iteritems(static_param_init_value):
            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
...@@ -210,7 +146,92 @@ class TestImperativeMnist(unittest.TestCase):
        self.assertTrue(np.allclose(static_out, dy_out))

        for key, value in six.iteritems(static_param_value):
-            self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5))
+            self.assertTrue(np.allclose(value, dy_param_value[key]))
class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase):
def get_optimizer(self):
bd = [3, 6, 9]
optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase):
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase):
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase):
def get_optimizer(self):
optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay(
learning_rate=0.1,
decay_steps=10000,
decay_rate=0.5,
staircase=True))
return optimizer
def test_adam(self):
self._check_mlp()
class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase):
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
learning_rate=0.1, decay_steps=5, cycle=self.cycle))
return optimizer
def test_sgd_cycle(self):
self.cycle = True
self._check_mlp()
def test_sgd(self):
self.cycle = False
self._check_mlp()
class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase):
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
learning_rate=0.1, step_each_epoch=10000, epochs=120))
return optimizer
def test_sgd(self):
self._check_mlp()
class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase):
def get_optimizer(self):
optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
d_model=512, warmup_steps=8000))
return optimizer
def test_sgd(self):
self._check_mlp()
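# For reference, the schedules exercised above roughly follow these closed
# forms (a sketch based on the fluid.layers decay documentation, not on this
# patch; staircase=True floors step / decay_steps):
#   natural_exp_decay(step)  ~ lr * exp(-decay_rate * step / decay_steps)
#   exponential_decay(step)  ~ lr * decay_rate ** (step / decay_steps)
#   inverse_time_decay(step) ~ lr / (1 + decay_rate * step / decay_steps)
#   noam_decay(step)         ~ d_model**-0.5 * min(step**-0.5,
#                                                  step * warmup_steps**-1.5)
# _check_mlp only asserts that the dygraph run matches the static-graph run,
# so these exact values are not checked by the tests.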
if __name__ == '__main__':
......
...@@ -16,17 +16,17 @@ from __future__ import print_function
import unittest
import paddle.fluid as fluid
-from paddle.fluid.imperative.nn import Embedding
+from paddle.fluid.dygraph.nn import Embedding
import paddle.fluid.framework as framework
from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.imperative.base import to_variable
+from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
import numpy as np
import six
from paddle.fluid.backward import append_backward


-class SimpleLSTMRNN(fluid.imperative.Layer):
+class SimpleLSTMRNN(fluid.dygraph.Layer):
    def __init__(self,
                 name_scope,
                 hidden_size,
...@@ -131,7 +131,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
        return real_res, last_hidden, last_cell


-class PtbModel(fluid.imperative.Layer):
+class PtbModel(fluid.dygraph.Layer):
    def __init__(self,
                 name_scope,
                 hidden_size,
...@@ -214,7 +214,7 @@ class PtbModel(fluid.imperative.Layer):
        return loss, last_hidden, last_cell


-class TestImperativePtbRnn(unittest.TestCase):
+class TestDygraphPtbRnn(unittest.TestCase):
    def test_ptb_rnn_cpu_float32(self):
        seed = 90
        hidden_size = 10
...@@ -224,7 +224,7 @@ class TestImperativePtbRnn(unittest.TestCase):
        init_scale = 0.1
        batch_size = 4

-        with fluid.imperative.guard():
+        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            # TODO: marsyang1993 Change seed to
......
...@@ -21,8 +21,8 @@ import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC
-from paddle.fluid.imperative.base import to_variable
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
+from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope

batch_size = 8
...@@ -57,7 +57,7 @@ def optimizer_setting(params):
        lr = []
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
        optimizer = fluid.optimizer.SGD(learning_rate=0.01)
-        # TODO(minqiyang): Add learning rate scheduler support to imperative mode
+        # TODO(minqiyang): Add learning rate scheduler support to dygraph mode
        #  optimizer = fluid.optimizer.Momentum(
        #  learning_rate=params["lr"],
        #  learning_rate=fluid.layers.piecewise_decay(
...@@ -68,7 +68,7 @@ def optimizer_setting(params):
    return optimizer


-class ConvBNLayer(fluid.imperative.Layer):
+class ConvBNLayer(fluid.dygraph.Layer):
    def __init__(self,
                 name_scope,
                 num_channels,
...@@ -99,7 +99,7 @@ class ConvBNLayer(fluid.imperative.Layer):
        return y


-class BottleneckBlock(fluid.imperative.Layer):
+class BottleneckBlock(fluid.dygraph.Layer):
    def __init__(self,
                 name_scope,
                 num_channels,
...@@ -156,7 +156,7 @@ class BottleneckBlock(fluid.imperative.Layer):
        return layer_helper.append_activation(y)


-class ResNet(fluid.imperative.Layer):
+class ResNet(fluid.dygraph.Layer):
    def __init__(self, name_scope, layers=50, class_dim=102):
        super(ResNet, self).__init__(name_scope)
...@@ -226,13 +226,13 @@ class ResNet(fluid.imperative.Layer):
        return y


-class TestImperativeResnet(unittest.TestCase):
+class TestDygraphResnet(unittest.TestCase):
    def test_resnet_float32(self):
        seed = 90
        batch_size = train_parameters["batch_size"]
        batch_num = 20
-        with fluid.imperative.guard():
+        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
......
...@@ -16,7 +16,7 @@ from __future__ import print_function
import unittest
import paddle.fluid as fluid
-from paddle.fluid.imperative import Embedding, LayerNorm, FC, to_variable, Layer, guard
+from paddle.fluid.dygraph import Embedding, LayerNorm, FC, to_variable, Layer, guard
from test_imperative_base import new_program_scope
from paddle.fluid import core
import numpy as np
...@@ -623,7 +623,7 @@ class PrepareEncoderDecoderLayer(Layer):
                initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
                trainable=False))

-        # use in imperative_mode to fit different length batch
+        # use in dygraph_mode to fit different length batch
        # self._pos_emb._w = to_variable(
        #     position_encoding_init(self._src_max_len, self._src_emb_dim))
...@@ -946,7 +946,7 @@ class TransFormer(Layer):
        return sum_cost, avg_cost, predict, token_num


-class TestImperativeTransformer(unittest.TestCase):
+class TestDygraphTransformer(unittest.TestCase):
    def test_transformer_float32(self):
        seed = 90
        with guard():
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import unittest
import numpy as np
from op_test import OpTest
def kldiv_loss(x, target, reduction):
output = target * (np.log(target) - x)
loss = np.where(target >= 0, output, np.zeros_like(x))
if reduction == "batchmean":
return loss.sum() / x.shape[0]
if reduction == "mean":
return loss.mean()
if reduction == "sum":
return loss.sum()
return loss
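# In this reference, 'x' is treated as log-probabilities and 'target' as
# probabilities, so each element contributes target * (log(target) - x);
# entries with negative targets are zeroed out by the np.where mask.
# Rough worked example (illustrative numbers, not taken from the test cases):
# target = 0.5 and x = log(0.25) give 0.5 * (log(0.5) - log(0.25))
# = 0.5 * log(2) ~= 0.3466, and 'batchmean' divides the summed loss by
# x.shape[0].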
class TestKLDivLossOp(OpTest):
def setUp(self):
self.initTestCase()
self.op_type = 'kldiv_loss'
x = np.random.uniform(-10, 10, self.x_shape).astype('float32')
target = np.random.uniform(-10, 10, self.x_shape).astype('float32')
self.attrs = {"reduction": self.reduction}
self.inputs = {
'X': x,
'Target': target,
}
loss = kldiv_loss(x, target, self.reduction)
self.outputs = {'Loss': loss.astype('float32')}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(
['X'], 'Loss', no_grad_set=set(["Target"]), max_relative_error=0.06)
def initTestCase(self):
self.x_shape = (2, 5, 5)
self.reduction = 'batchmean'
class TestKLDivLossOp2(TestKLDivLossOp):
def initTestCase(self):
self.x_shape = (3, 2, 7, 7)
self.reduction = 'none'
class TestKLDivLossOp3(TestKLDivLossOp):
def initTestCase(self):
self.x_shape = (2, 3, 5, 7, 9)
self.reduction = 'mean'
class TestKLDivLossOp4(TestKLDivLossOp):
def initTestCase(self):
self.x_shape = (5, 7)
self.reduction = 'sum'
if __name__ == "__main__":
unittest.main()
...@@ -29,8 +29,8 @@ from paddle.fluid import core
from paddle.fluid.initializer import Constant
import paddle.fluid.layers as layers
from test_imperative_base import new_program_scope
-from paddle.fluid.imperative import nn
-from paddle.fluid.imperative import base
+from paddle.fluid.dygraph import nn
+from paddle.fluid.dygraph import base


class LayerTest(unittest.TestCase):
...@@ -68,7 +68,7 @@ class LayerTest(unittest.TestCase):
    @contextlib.contextmanager
    def dynamic_graph(self, force_to_use_cpu=False):
-        with fluid.imperative.guard(
+        with fluid.dygraph.guard(
                self._get_place(force_to_use_cpu=force_to_use_cpu)):
            fluid.default_startup_program().random_seed = self.seed
            fluid.default_main_program().random_seed = self.seed
...@@ -845,7 +845,7 @@ class TestBook(unittest.TestCase):
        with program_guard(program):
            data = layers.data(name='data', shape=[10], dtype='float32')
            hid = layers.fc(input=data, size=20)
-            self.assertIsNotNone(layers.softmax(hid))
+            self.assertIsNotNone(layers.softmax(hid, axis=1))
            print(str(program))

    def test_space_to_depth(self):
...@@ -1591,6 +1591,23 @@ class TestBook(unittest.TestCase):
            out = layers.spectral_norm(weight, dim=1, power_iters=1)
            self.assertIsNotNone(out)
def test_kldiv_loss(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[32, 128, 128], dtype="float32")
target = layers.data(
name='target', shape=[32, 128, 128], dtype="float32")
loss = layers.kldiv_loss(x=x, target=target, reduction='batchmean')
self.assertIsNotNone(loss)
print(str(program))
def test_temporal_shift(self):
program = Program()
with program_guard(program):
x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
out = layers.temporal_shift(x, seg_num=4, shift_ratio=0.2)
self.assertIsNotNone(out)
            print(str(program))

    def test_shuffle_channel(self):
......
...@@ -120,9 +120,9 @@ class TestLearningRateDecay(unittest.TestCase):
            self.assertAlmostEqual(
                python_decayed_lr,
                lr_val[0],
-                msg='Failed fn is {0}, Python result is {1}, Fluid result is {2}'.
-                format(python_decay_fn.__name__,
-                       str(python_decayed_lr), str(lr_val[0])))
+                msg='Failed lr scheduler is {0}, step {1}, Python result is {2}, Fluid result is {3}'.
+                format(python_decay_fn.__name__,
+                       str(step), str(python_decayed_lr), str(lr_val[0])))

    def test_decay(self):
        common_kwargs_true = {
...@@ -164,12 +164,53 @@ class TestLearningRateDecay(unittest.TestCase):
        ]

        for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
-            print("decay_fn=" + py_decay_fn.__name__ + " kwargs=" + str(kwargs))
+            print("class=" + self.__class__.__name__ + "decay_fn=" +
+                  py_decay_fn.__name__ + " kwargs=" + str(kwargs))
            main_program = framework.Program()
            startup_program = framework.Program()
            with framework.program_guard(main_program, startup_program):
                self.check_decay(py_decay_fn, fluid_decay_fn, kwargs)
def linear_lr_warmup(global_step, warmup_steps, start_lr, end_lr):
linear_step = end_lr - start_lr
decayed_lr = start_lr + linear_step * (global_step / warmup_steps)
return decayed_lr
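# This reference interpolates linearly from start_lr to end_lr over the warmup
# window: lr(step) = start_lr + (end_lr - start_lr) * step / warmup_steps.
# Illustrative numbers (not taken from the test): with start_lr = 1/3,
# end_lr = 0.1 and warmup_steps = 10, step 5 gives roughly
# 1/3 + (0.1 - 1/3) * 0.5 ~= 0.2167; once step reaches warmup_steps, the
# wrapped decay function takes over, as checked in the test below.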
class TestLinearWamrupLearningRateDecay(TestLearningRateDecay):
def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
kwargs):
main_prog = fluid.Program()
startup_prog = fluid.Program()
warmup_steps = 10
start_lr = 1. / 3.
end_lr = 0.1
with fluid.program_guard(main_prog, startup_prog):
decayed_lr = layers.linear_lr_warmup(
fluid_decay_fn(**kwargs), warmup_steps, start_lr, end_lr)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
for step in range(20):
lr_val, = exe.run(main_prog, feed={}, fetch_list=[decayed_lr])
if step < warmup_steps:
python_decayed_lr = linear_lr_warmup(
float(step), warmup_steps, start_lr, end_lr)
else:
python_decayed_lr = python_decay_fn(
global_step=float(step), **kwargs)
self.assertAlmostEqual(
python_decayed_lr,
lr_val[0],
msg='Test {0} Failed, step {1}, Python result is {2}, Fluid result is {3}'.
format(python_decay_fn.__name__,
str(step), str(python_decayed_lr), str(lr_val[0])))
if __name__ == '__main__':
    unittest.main()
...@@ -177,6 +177,9 @@ class TestMNIST(TestParallelExecutorBase):
        for use_fast_executor in (False, True):
            self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor)
# FIXME(wuyi): should checkout why this fails when merging
# https://github.com/PaddlePaddle/Paddle/pull/16545
@unittest.skip("should fix this later")
    def test_batchnorm_fc_with_new_strategy(self):
        # NOTE: the computation result of nccl_reduce is non-deterministic,
        # related issue: https://github.com/NVIDIA/nccl/issues/157
......
...@@ -31,6 +31,9 @@ class TestSoftmaxOp(OpTest):
    def get_x_shape(self):
        return [10, 10]
def get_axis(self):
return -1
    def setUp(self):
        self.op_type = "softmax"
        self.use_cudnn = False
...@@ -38,15 +41,15 @@ class TestSoftmaxOp(OpTest):
        self.dtype = np.float32
        self.init_kernel_type()
        self.shape = self.get_x_shape()
+        self.axis = self.get_axis()

        x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
-        out = np.apply_along_axis(stable_softmax, 1,
-                                  x.reshape([-1, self.shape[-1]]))
-        out = out.reshape(self.shape)
+        out = np.apply_along_axis(stable_softmax, self.axis, x)

        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
        self.outputs = {'Out': out}
        self.attrs = {
+            'axis': self.axis,
            'use_cudnn': self.use_cudnn,
            'use_mkldnn': self.use_mkldnn
        }
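        # The reference output now applies stable_softmax along self.axis via
        # np.apply_along_axis; the default axis of -1 reproduces the previous
        # behaviour of normalizing over the last dimension.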
...@@ -76,6 +79,38 @@ class TestSoftmaxOp2(TestSoftmaxOp):
        return [2, 3, 4, 5]
class TestSoftmaxOp3(TestSoftmaxOp):
def get_x_shape(self):
return [2, 3, 4, 5]
def get_axis(self):
return 0
class TestSoftmaxOp4(TestSoftmaxOp):
def get_x_shape(self):
return [2, 3, 4, 5]
def get_axis(self):
return 1
class TestSoftmaxOp5(TestSoftmaxOp):
def get_x_shape(self):
return [2, 3, 4, 5]
def get_axis(self):
return 2
class TestSoftmaxOp6(TestSoftmaxOp):
def get_x_shape(self):
return [2, 3, 4, 5]
def get_axis(self):
return 3
@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestSoftmaxCUDNNOp(TestSoftmaxOp):
...@@ -90,6 +125,16 @@ class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp):
        return [2, 3, 4, 5]
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxCUDNNOp5(TestSoftmaxCUDNNOp):
def get_x_shape(self):
return [2, 3, 4, 5]
def get_axis(self):
return 3
@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestSoftmaxFP16Op(TestSoftmaxOp):
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import unittest
import numpy as np
from op_test import OpTest
from paddle.fluid import core
def temporal_shift(x, seg_num, shift_ratio):
shape = x.shape
reshape_x = x.reshape((-1, seg_num, shape[1], shape[2], shape[3]))
pad_x = np.pad(reshape_x, ((0, 0), (1, 1), (0, 0), (0, 0), (0, 0)),
'constant')
c1 = int(shape[1] * shift_ratio)
c2 = int(shape[1] * 2 * shift_ratio)
slice1 = pad_x[:, :seg_num, :c1, :, :]
slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :]
slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :]
concat_x = np.concatenate([slice1, slice2, slice3], axis=2)
return concat_x.reshape(shape)
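# In this reference, the input is reshaped to (N / seg_num, seg_num, C, H, W)
# and zero-padded along the segment axis; the first c1 = int(C * shift_ratio)
# channels of the output are then read from the previous segment, the next
# c2 - c1 channels from the following segment, and the remaining channels from
# the current segment. With the default initTestCase values below
# (x_shape = (6, 4, 4, 4), seg_num = 3, shift_ratio = 0.25) this gives c1 = 1
# and c2 = 2, so channel 0 comes from the previous segment, channel 1 from the
# next segment, and channels 2 and 3 are left in place.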
class TestTemporalShift(OpTest):
def setUp(self):
self.initTestCase()
self.op_type = 'temporal_shift'
x = np.random.random(self.x_shape).astype('float32')
self.attrs = {
"seg_num": self.seg_num,
"shift_ratio": self.shift_ratio,
}
self.inputs = {"X": x, }
output = temporal_shift(x, self.seg_num, self.shift_ratio)
self.outputs = {"Out": output}
def test_check_output(self):
self.check_output()
def test_check_grad_ignore_uv(self):
self.check_grad(['X'], 'Out')
def initTestCase(self):
self.x_shape = (6, 4, 4, 4)
self.seg_num = 3
self.shift_ratio = 0.25
class TestTemporalShift2(TestTemporalShift):
def initTestCase(self):
self.x_shape = (4, 9, 7, 7)
self.seg_num = 2
self.shift_ratio = 0.2
class TestTemporalShift3(TestTemporalShift):
def initTestCase(self):
self.x_shape = (3, 10, 5, 5)
self.seg_num = 1
self.shift_ratio = 0.3
if __name__ == "__main__":
unittest.main()
...@@ -19,7 +19,6 @@ from paddle.fluid.framework import default_main_program, Program, convert_np_dty
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
-from test_imperative_base import new_program_scope


class TestVariable(unittest.TestCase):
...@@ -62,7 +61,7 @@ class TestVariable(unittest.TestCase):
            name='step_scopes', type=core.VarDesc.VarType.STEP_SCOPES)
        self.assertEqual(core.VarDesc.VarType.STEP_SCOPES, var.type)

-    def _test_slice(self):
+    def _test_slice(self, place):
        b = default_main_program().current_block()
        w = b.create_var(dtype="float64", shape=[784, 100, 100], lod_level=0)
...@@ -84,7 +83,6 @@ class TestVariable(unittest.TestCase):
        self.assertEqual(0, nw.lod_level)

-        place = fluid.CPUPlace()
        main = fluid.Program()
        with fluid.program_guard(main):
            exe = fluid.Executor(place)
...@@ -101,10 +99,23 @@ class TestVariable(unittest.TestCase):
            var6 = var[1, 1:, 1:]
            var7 = var[1, ..., 1:]
            var8 = var[1, ...]
var_reshape = fluid.layers.reshape(var, [3, -1, 3])
var9 = var_reshape[1, ..., 2]
var10 = var_reshape[:, :, -1]
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.fc(input=x, size=1, act=None)
var11 = y[:, 0]
feeder = fluid.DataFeeder(place=place, feed_list=[x])
data = []
data.append((np.random.randint(10, size=[13]).astype('float32')))
exe.run(fluid.default_startup_program())
            local_out = exe.run(main,
+                               feed=feeder.feed([data]),
                                fetch_list=[
                                    var, var1, var2, var3, var4, var5, var6,
-                                   var7, var8
+                                   var7, var8, var9, var10, var11
                                ])

            self.assertTrue((np.array(local_out[1]) == np.array(tensor_array[
...@@ -123,38 +134,16 @@ class TestVariable(unittest.TestCase):
                1, ..., 1:])).all())
            self.assertTrue((np.array(local_out[8]) == np.array(tensor_array[
                1, ...])).all())
+            self.assertEqual(local_out[9].shape, (1, 3, 1))
+            self.assertEqual(local_out[10].shape, (3, 3, 1))
+            self.assertEqual(local_out[11].shape, (1, 1))

    def test_slice(self):
-        self._test_slice()
-
-
-class TestVariableImperative(unittest.TestCase):
-    def _test_slice(self):
-        b = default_main_program().current_block()
-        w = b.create_var(dtype="float64", shape=[784, 100, 100], lod_level=0)
-        for i in range(3):
-            nw = w[i]
-            self.assertEqual([1, 100, 100], nw.shape)
-        nw = w[:]
-        self.assertEqual([784, 100, 100], nw.shape)
-        nw = w[:, :, :]
-        self.assertEqual([784, 100, 100], nw.shape)
-        nw = w[::2, ::2, :]
-        self.assertEqual([392, 50, 100], nw.shape)
-        nw = w[::-2, ::-2, :]
-        self.assertEqual([392, 50, 100], nw.shape)
-        nw = w[0::-2, 0::-2, :]
-        self.assertEqual([1, 1, 100], nw.shape)
-
-    def test_slice(self):
-        with fluid.imperative.guard():
-            self._test_slice()
+        place = fluid.CPUPlace()
+        self._test_slice(place)
+
+        if core.is_compiled_with_cuda():
+            self._test_slice(core.CUDAPlace(0))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -102,7 +102,7 @@ packages=['paddle',
          'paddle.reader',
          'paddle.distributed',
          'paddle.fluid',
-         'paddle.fluid.imperative',
+         'paddle.fluid.dygraph',
          'paddle.fluid.proto',
          'paddle.fluid.proto.profiler',
          'paddle.fluid.distributed',
...@@ -119,6 +119,7 @@ packages=['paddle',
          'paddle.fluid.contrib.slim.quantization',
          'paddle.fluid.contrib.slim.distillation',
          'paddle.fluid.contrib.utils',
+         'paddle.fluid.contrib.extend_optimizer',
          'paddle.fluid.transpiler',
          'paddle.fluid.transpiler.details']
......
...@@ -28,7 +28,7 @@ import hashlib

member_dict = collections.OrderedDict()

-experimental_namespace = {"paddle.fluid.imperative"}
+experimental_namespace = {"paddle.fluid.dygraph"}


def md5(doc):
......