diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index fc204dc9193bb28b654936048dd61a9b461abb2f..ba8b5fc6c838b221fcfb559f1f01051fc09072a4 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -24,7 +24,7 @@ set(BOOST_PROJECT       "extern_boost")
 # So we use 1.41.0 here.
 set(BOOST_VER           "1.41.0")
 set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
-set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
 
 MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
 
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index c5754da59bf2053931be413eb10c481adecbae6b..d96da470b3cbbd8092dbf80ec5f500af9afa2ce4 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -44,7 +44,7 @@ ExternalProject_Add(
     # 3. keep only zlib, cares, protobuf, boringssl under "third_party",
     #    checkout and clean other dirs under third_party
     # 4. remove .git, and package the directory.
-    URL "http://paddlepaddledeps.cdn.bcebos.com/grpc-v1.10.x.tar.gz"
+    URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz"
     URL_MD5  "1f268a2aff6759839dccd256adcc91cf"
     PREFIX          ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index ae2679db4aed7a77ad407f881c4482fd3914ac27..142fce816de4f06aa0a36b91e3e4ecb962a8dc2a 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -34,7 +34,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 SET(TIME_VERSION "2019.0.1.20181227")
 IF(WIN32)
     SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE)
-    SET(MKLML_URL "https://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+    SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
     SET(MKLML_LIB                 ${MKLML_LIB_DIR}/mklml.lib)
     SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5md.lib)
     SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/mklml.dll)
@@ -43,7 +43,7 @@ ELSE()
     #TODO(intel-huying):
     #  Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
     SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
-    SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+    SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
     SET(MKLML_LIB                 ${MKLML_LIB_DIR}/libmklml_intel.so)
     SET(MKLML_IOMP_LIB            ${MKLML_LIB_DIR}/libiomp5.so)
     SET(MKLML_SHARED_LIB          ${MKLML_LIB_DIR}/libmklml_intel.so)
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 9b111e09e07682b6be0c72a4a0d5a3e86e271e62..032da0cad85ce43ab2630123f9f2cfd8dee4224e 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -10,6 +10,9 @@ paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=No
 paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5430f54ab4895f9f47db6bebbaf71659'))
 paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b54f403e57825a1592aece03afe3afb6'))
 paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2'))
+paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3'))
+paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210'))
+paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912'))
 paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
 paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d'))
@@ -44,7 +47,7 @@ paddle.fluid.AsyncExecutor.run (ArgSpec(args=['self', 'program', 'data_feed', 'f
 paddle.fluid.AsyncExecutor.save_model (ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None), ('document', 'c8ac0dfcb3b187aba25d03af7fea56b2'))
 paddle.fluid.AsyncExecutor.stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5f23d043607bb5d55e466ec3f578e093'))
 paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'e1af7fd53cf868554f312779fc803864'))
+paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a8c7793803cf976680d9478e378fa356'))
 paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8'))
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
@@ -58,6 +61,12 @@ paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program'
 paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2'))
 paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', '70f4f53f13572436ac72d1c8b5efeb9d'))
 paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '7a5255386075dac3c75b7058254fcdcb'))
+paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable'], varargs=None, keywords=None, defaults=(True, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.io.PyReader.decorate_batch_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a3fefec8bacd6ce83f49906a9d05e779'))
+paddle.fluid.io.PyReader.decorate_sample_generator (ArgSpec(args=['self', 'sample_generator', 'batch_size', 'drop_last', 'places'], varargs=None, keywords=None, defaults=(True, None)), ('document', '7abd9cf7d695bab5bb6cf7ded5903cb2'))
+paddle.fluid.io.PyReader.decorate_sample_list_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'faef298f73e91aedcfaf5d184f3109b7'))
+paddle.fluid.io.PyReader.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ff1cc1e2beb8824d453656c72c28ddfb'))
+paddle.fluid.io.PyReader.start (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'b7ea0a548991924e4cfe61a577b8e56d'))
 paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -222,6 +231,7 @@ paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label'
 paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7'))
 paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
 paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
+paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
 paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
 paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e'))
@@ -229,7 +239,7 @@ paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=Non
 paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'f563d376d35e1a4c4db100fd11b381a0'))
 paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07e5b796674796eb1ef3fee9c10d24e3'))
 paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '9b7f0f86ec24bbc97643cadcb6499cff'))
-paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '13dabc57863f62ab3141586784ee356b'))
+paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '4357643685cfd65454ba5a15f0151709'))
 paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '350f74d93fab9adb2ac4950f1c26416b'))
 paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -255,6 +265,7 @@ paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=
 paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '8f8c0306117ea441f20dcbbdba1f0ecc'))
 paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8'))
 paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292'))
+paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb'))
 paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -376,23 +387,9 @@ paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args',
 paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958'))
 paddle.fluid.contrib.Calibrator.save_int8_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab'))
-paddle.fluid.contrib.build_compressor (ArgSpec(args=['place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'config'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.CompressPass.__init__ (ArgSpec(args=['self', 'place', 'data_reader', 'data_feeder', 'scope', 'metrics', 'epoch', 'program_exe'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.CompressPass.add_strategy (ArgSpec(args=['self', 'strategy'], varargs=None, keywords=None, defaults=None), ('document', '3bf6010b6f47d3c86df0ec8957be95e0'))
-paddle.fluid.contrib.CompressPass.apply (ArgSpec(args=['self', 'graph'], varargs=None, keywords=None, defaults=None), ('document', 'a92bf85d4b59bd4f2ac1706d7c4899a6'))
-paddle.fluid.contrib.ImitationGraph.__init__ (ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.ImitationGraph.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.__init__ (ArgSpec(args=['self', 'pruner', 'start_epoch', 'end_epoch', 'delta_rate', 'acc_loss_threshold', 'sensitivities'], varargs=None, keywords=None, defaults=(None, 0, 10, 0.2, 0.2, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_batch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_batch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_compress_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_compress_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_begin (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.SensitivePruneStrategy.on_epoch_end (ArgSpec(args=['self', 'context'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d'))
-paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645'))
+paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], './checkpoints', None, None)), ('document', '31ae143830c9bf6b43547dd546c5ba80'))
+paddle.fluid.contrib.Compressor.config (ArgSpec(args=['self', 'config_file'], varargs=None, keywords=None, defaults=None), ('document', '780d9c007276ccbb95b292400d7807b0'))
+paddle.fluid.contrib.Compressor.run (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'c6e43d6a078d307672283c1f36e04fe9'))
 paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67'))
 paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f'))
 paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b'))
@@ -432,48 +429,59 @@ paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'poo
 paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '46234a5470590feb336346f70a3db715'))
 paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '18db9c70be9c4dd466f9844457b21bfe'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1a79bd7d10ae54ca763ec81bca36ba24'))
 paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -512,6 +520,7 @@ paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, ke
 paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310'))
 paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7'))
 paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope
+paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c'))
 paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
 paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb'))
 paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d'))
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 5f29ebc70f0c5c5aaa924bc1ced5c6131a69b5c0..7a371af510b8050aec3708d82923c707fd9d3a90 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -51,9 +51,7 @@ else()
     cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle)
 endif()
 
-cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor)
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
-cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
 if(WITH_GPU)
 cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info)
@@ -74,7 +72,7 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
 
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
 
 cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
 
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
index ff223e616f7ef0c794e72a0028c7e5bb3f234ec0..c084410864b06b972407d50bc0998499c6f9ee80 100644
--- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -52,13 +53,28 @@ std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
   //               Note that must assert topology sort is stable
   auto& ops = graph->Get<const std::vector<OpDesc*>>(kStaleProgramOpDescs);
   for (auto* op_desc : ops) {
-    auto outputs = op_desc->Outputs();
-    for (auto& o_it : outputs) {
-      for (auto& v : o_it.second) {  // values
-        vars[v] = order;
+    try {
+      bool is_bk_op =
+          static_cast<bool>(boost::get<int>(op_desc->GetAttr(
+                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                            static_cast<int>(OpRole::kBackward));
+      if (!is_bk_op) continue;
+
+      auto backward_vars =
+          boost::get<std::vector<std::string>>(op_desc->GetNullableAttr(
+              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
+
+      auto outputs = op_desc->Outputs();
+      for (auto& o_it : outputs) {
+        for (auto& v : o_it.second) {  // values
+          vars[v] = order;
+          VLOG(1) << "in all_reduce_deps_pass:" << v;
+        }
       }
+      order++;
+    } catch (boost::bad_get e) {
     }
-    order++;
   }
 
   std::vector<OpHandleBase*> dist_ops;
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index c1f9c2b60c915370df7793f26fe83812a7ced96d..fdaff08e53755dc43df01e4734d355a286bb5863 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -11,9 +11,8 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include <algorithm>
-
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include <algorithm>
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
@@ -56,6 +55,7 @@ void AllReduceOpHandle::RunImpl() {
   platform::RecordEvent record_event(Name());
 
   WaitInputVarGenerated();
+
   auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
   auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
   PADDLE_ENFORCE_EQ(
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index 0c75e05f861636565ae855ddd534c1082d40d237..0b4d33513506d41a63db8316abaa5cd0458ff352 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -57,7 +57,7 @@ struct BroadcastOpHandle : public OpHandleBase {
 
   std::string Name() const override;
 
-  bool IsMultiDeviceTransfer() override { return false; };
+  bool IsMultiDeviceTransfer() override { return true; };
 
  protected:
   void RunImpl() override;
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 4184353bcbd96e0b13f0dc11794e5d4f35cd5a25..5d9db237538599ec9a6887317b61af73f1113b97 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -147,6 +147,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     // Verify that the graph is correct for multi-device executor.
     AppendPass("multi_devices_check_pass");
 
+    if (VLOG_IS_ON(2)) {
+      AppendPass("all_reduce_deps_pass");
+    }
+
     if (SeqOnlyAllReduceOps(strategy)) {
       VLOG(10) << "Add all_reduce_deps_pass";
       AppendPass("all_reduce_deps_pass");
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc
deleted file mode 100644
index c9b52b68205ade000e21a3d06b80af86cbe01f34..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/data_balance_op_handle.cc
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/data_balance_op_handle.h"
-#include <algorithm>
-#include "paddle/fluid/framework/details/container_cast.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-DataBalanceOpHandle::DataBalanceOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places,
-    const platform::NCCLContextMap *ctxs)
-    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
-  if (ctxs) {
-    for (auto &p : places_) {
-      this->SetDeviceContext(p, ctxs->DevCtx(p));
-    }
-  }
-}
-#else
-DataBalanceOpHandle::DataBalanceOpHandle(
-    ir::Node *node, const std::vector<Scope *> &local_scopes,
-    const std::vector<platform::Place> &places)
-    : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
-#endif
-
-std::string DataBalanceOpHandle::Name() const { return "data balance"; }
-
-std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
-    const std::vector<int> &device_sizes) {
-  int device_num = device_sizes.size();
-  int total_size = 0;
-  int empty_num = 0;
-  std::vector<std::array<int, 2>> size_device_vec;
-  size_device_vec.reserve(device_num);
-  for (int i = 0; i < device_num; ++i) {
-    if (device_sizes[i] == 0) {
-      ++empty_num;
-    }
-    total_size += device_sizes[i];
-    size_device_vec.push_back({{device_sizes[i], i}});
-  }
-  std::vector<std::array<int, 3>> res;
-  if (empty_num == 0) {
-    // No need to do data balance.
-    return res;
-  }
-  if (total_size < device_num) {
-    // No enough data.
-    PADDLE_THROW_EOF();
-  }
-  std::sort(size_device_vec.begin(), size_device_vec.end(),
-            [](const std::array<int, 2> &a, const std::array<int, 2> &b) {
-              return a[0] > b[0];
-            });
-  int expected_device_size = total_size / device_num;
-  int src_idx = 0;
-  for (int dst_idx = device_num - empty_num; dst_idx < device_num; ++dst_idx) {
-    if (size_device_vec[src_idx][0] <= expected_device_size) {
-      ++src_idx;
-      PADDLE_ENFORCE_LT(
-          src_idx, device_num - empty_num,
-          "In current srategy an empty tensor should not be copy source.");
-    }
-    size_device_vec[src_idx][0] -= expected_device_size;
-    size_device_vec[dst_idx][0] += expected_device_size;
-    res.push_back({{size_device_vec[src_idx][1], size_device_vec[dst_idx][1],
-                    expected_device_size}});
-  }
-  return res;
-}
-
-void DataBalanceOpHandle::RunImpl() {
-  PADDLE_ENFORCE_GT(places_.size(), 1UL,
-                    "Data balance can only be enabled when the number of "
-                    "places to run larger than 1.");
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0);
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-  int data_num = in_var_handles.size() / places_.size();
-  WaitInputVarGenerated();
-  std::vector<std::vector<LoDTensor *>> lod_tensors(data_num);
-  std::vector<int> device_sizes;
-  for (int i = 0; i < static_cast<int>(in_var_handles.size()); ++i) {
-    PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
-                      "The name of input and output should be equal.");
-    int place_idx = i / data_num;
-    int data_idx = i % data_num;
-    auto *local_scope =
-        local_scopes_[place_idx]->FindVar(kLocalExecScopeName)->Get<Scope *>();
-    auto *tensor_var = local_scope->FindVar(in_var_handles[i]->name());
-    PADDLE_ENFORCE(tensor_var->IsType<LoDTensor>());
-    auto *tensor = tensor_var->GetMutable<LoDTensor>();
-    lod_tensors[data_idx].push_back(tensor);
-    int ins_size =
-        tensor->lod().empty() ? tensor->dims()[0] : tensor->NumElements();
-    if (data_idx == 0) {
-      device_sizes.emplace_back(ins_size);
-    } else {
-      PADDLE_ENFORCE_EQ(
-          ins_size, device_sizes.at(place_idx),
-          "All data on the same device shall have the same batch size.");
-    }
-  }
-  const auto &balance_plan = GetBalancePlan(device_sizes);
-
-  for (const auto &trans : balance_plan) {
-    for (int data_idx = 0; data_idx < data_num; ++data_idx) {
-      LoDTensor *src_tensor = lod_tensors[data_idx][trans[0]];
-      LoDTensor *dst_tensor = lod_tensors[data_idx][trans[1]];
-      int trans_ins_size = trans[2];
-      LoD src_lod = src_tensor->lod();
-      int src_ins_size =
-          src_lod.empty() ? src_tensor->dims()[0] : src_tensor->NumElements();
-      int cut_point = src_ins_size - trans_ins_size;
-      if (!src_lod.empty()) {
-        for (auto &level : src_lod) {
-          cut_point = level[cut_point];
-        }
-      }
-      TensorCopySync(src_tensor->Slice(cut_point, src_tensor->dims()[0]),
-                     dst_tensor->place(), dst_tensor);
-      src_tensor->ShareDataWith(src_tensor->Slice(0, cut_point));
-      if (!src_lod.empty()) {
-        dst_tensor->set_lod(SliceInLevel(
-            src_lod, 0, src_ins_size - trans_ins_size, src_ins_size));
-        src_tensor->set_lod(
-            SliceInLevel(src_lod, 0, 0, src_ins_size - trans_ins_size));
-      }
-    }
-  }
-}
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h
deleted file mode 100644
index 2db18a1a7203f85aac6338576f2e68c7b37d7c69..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/data_balance_op_handle.h
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <vector>
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "paddle/fluid/platform/nccl_helper.h"
-#endif
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct DataBalanceOpHandle : public OpHandleBase {
- public:
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                      const std::vector<platform::Place> &places,
-                      const platform::NCCLContextMap *ctxs);
-#else
-  DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
-                      const std::vector<platform::Place> &places);
-#endif
-
-  std::string Name() const override;
-
-  bool IsMultiDeviceTransfer() override { return false; };
-
- protected:
-  void RunImpl() override;
-
- private:
-  // std::vector<(src_dev_id, dst_dev_id, trans_size)>
-  std::vector<std::array<int, 3>> GetBalancePlan(
-      const std::vector<int> &batch_size_per_device);
-
-  const std::vector<Scope *> local_scopes_;
-  const std::vector<platform::Place> places_;
-};
-
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index bbf81e1b8e49cae133858f7aa121701fb0f5456f..232d82a5da596a78d2999c4a4c4f7dda0c7cad7e 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -82,6 +82,8 @@ void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
   }
 }
 
+bool FetchOpHandle::IsMultiDeviceTransfer() { return true; }
+
 std::string FetchOpHandle::Name() const { return "Fetch"; }
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h
index 6ce42f92d7f1e81eeafd1eb5c28ce3564a5ffebc..dbb7f4f6582f6e0f0b9b5702533852d12da1051c 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_op_handle.h
@@ -39,6 +39,8 @@ struct FetchOpHandle : public OpHandleBase {
 
   std::string Name() const override;
 
+  bool IsMultiDeviceTransfer() override;
+
  protected:
   void RunImpl() override;
 
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.cc b/paddle/fluid/framework/details/fuse_vars_op_handle.cc
deleted file mode 100644
index 14292c0a5d06aa3ff12b46b5768b136fa925752d..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.cc
+++ /dev/null
@@ -1,51 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/details/fuse_vars_op_handle.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-void FuseVarsOpHandle::RunImpl() {
-  WaitInputVarGenerated(place_);
-
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
-  PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
-
-  auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
-
-  auto out_var_handle = out_var_handles[0];
-  auto out_var = scope->Var(out_var_handle->name());
-
-  auto out_tensor = out_var->GetMutable<LoDTensor>();
-  out_tensor->Resize({total_numel_}).mutable_data(this->place_, type_);
-
-  int64_t s = 0;
-  for (size_t i = 1; i < out_var_handles.size(); ++i) {
-    auto out_name = out_var_handles[i]->name();
-    auto out_t = scope->Var(out_name)->GetMutable<LoDTensor>();
-    auto numel = this->inputs_numel_.at(out_name);
-    out_t->ShareDataWith(out_tensor->Slice(s, s + numel));
-    s += numel;
-  }
-  this->RunAndRecordEvent([] {});
-}
-
-std::string FuseVarsOpHandle::Name() const { return "fuse vars"; }
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h
deleted file mode 100644
index b40b01df36479543e8b2779762210ae144d7d9be..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/details/fuse_vars_op_handle.h
+++ /dev/null
@@ -1,65 +0,0 @@
-//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <map>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/details/container_cast.h"
-#include "paddle/fluid/framework/details/op_handle_base.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace framework {
-namespace details {
-
-struct FuseVarsOpHandle : public OpHandleBase {
- public:
-  FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
-                   const platform::Place &place,
-                   const std::unordered_map<std::string, int64_t> &inputs_numel,
-                   const proto::VarType::Type var_type)
-      : OpHandleBase(node),
-        local_scope_(local_scope),
-        place_(place),
-        inputs_numel_(inputs_numel),
-        type_(var_type) {
-    total_numel_ = 0;
-    for (auto in_numel : inputs_numel) {
-      PADDLE_ENFORCE_GT(in_numel.second, 0);
-      total_numel_ += in_numel.second;
-    }
-  }
-
-  std::string Name() const override;
-
-  bool IsMultiDeviceTransfer() override { return false; };
-
- protected:
-  void RunImpl() override;
-
- private:
-  Scope *local_scope_;
-  const platform::Place place_;
-  const std::unordered_map<std::string, int64_t> inputs_numel_;
-  const proto::VarType::Type type_;
-  int64_t total_numel_;
-};
-}  // namespace details
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index e3cd2340c97c5a3e1315e4eefc0b6f6475d247db..125dbf746c3880e142af4d4bffd3ccda8654c0a1 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -14,13 +14,15 @@
 #include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include <algorithm>
 #include <fstream>
+#include <memory>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/details/all_reduce_op_handle.h"
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
-#include "paddle/fluid/framework/details/data_balance_op_handle.h"
 #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/rpc_op_handle.h"
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 4822627ac3b65972f41d9a23d9fe3dba3de3f97d..158da6f606f3f5a7062a4aaed7cf7e3fe71c817a 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include <map>
+#include <unordered_set>
 
 namespace paddle {
 namespace framework {
@@ -41,15 +42,42 @@ OpHandleBase::~OpHandleBase() {
 
 void OpHandleBase::Run(bool use_cuda) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_cuda) {
+  if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) {
     for (auto &p : dev_ctxes_) {
       int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
       PADDLE_ENFORCE(cudaSetDevice(dev_id));
       PADDLE_ENFORCE(
           cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
     }
+    if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
+      for (auto &out_var : outputs_) {
+        auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
+        if (out_var_handle) {
+          int dev_id =
+              boost::get<platform::CUDAPlace>(out_var_handle->place()).device;
+          out_var_handle->SetGenerateEvent(events_[dev_id]);
+        }
+      }
+    } else {
+      PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
+                        "%s should have only one dev_ctx.", Name());
+      auto &place = dev_ctxes_.begin()->first;
+      int dev_id = boost::get<platform::CUDAPlace>(place).device;
+      for (auto &out_var : outputs_) {
+        auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
+        if (out_var_handle) {
+          PADDLE_ENFORCE(
+              platform::is_same_place(place, out_var_handle->place()),
+              "The place of input(%s) is not consistent with the "
+              "place of current op(%s).",
+              out_var_handle->Name(), Name());
+          out_var_handle->SetGenerateEvent(events_[dev_id]);
+        }
+      }
+    }
   }
 #else
+
   PADDLE_ENFORCE(!use_cuda);
 #endif
 
@@ -93,17 +121,48 @@ void OpHandleBase::AddOutput(VarHandleBase *out) {
 void OpHandleBase::WaitInputVarGenerated() {
   for (auto in_var : inputs_) {
     if (NeedWait(in_var)) {
-      for (auto &pair : dev_ctxes_) {
-        in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second);
+      // Dummy Variable is used to represent dependencies between operators, so
+      // there doesn't add event for it.
+      auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
+      if (in_var_handle) {
+        auto &place = in_var_handle->place();
+        if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
+          auto stream =
+              static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
+                  ->stream();
+          PADDLE_ENFORCE(
+              cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
+          PADDLE_THROW("Doesn't compile the GPU.");
+#endif
+        }
+        // There are nothing to do when the place is CPUPlace.
       }
     }
   }
 }
 
 void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
-  for (auto *in : inputs_) {
-    if (NeedWait(in)) {
-      in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place));
+  for (auto in_var : inputs_) {
+    if (NeedWait(in_var)) {
+      // Dummy Variable is used to represent dependencies between operators, so
+      // there doesn't add event for it.
+      auto *in_var_handle = dynamic_cast<VarHandle *>(in_var);
+      if (in_var_handle) {
+        if (platform::is_gpu_place(in_var_handle->place())) {
+#ifdef PADDLE_WITH_CUDA
+          auto stream = static_cast<platform::CUDADeviceContext *>(
+                            dev_ctxes_.at(in_var_handle->place()))
+                            ->stream();
+          PADDLE_ENFORCE(
+              cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
+#else
+          PADDLE_THROW("Doesn't compile the GPU.");
+#endif
+        }
+        // There are nothing to do when the place is CPUPlace.
+      }
     }
   }
 }
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 9ba295a2b06a5ee9c3069e95fa688595fe72d6fd..c4254bbadfa17682f437f46f02adc9c884d24304 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -14,7 +14,6 @@
 
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 
-#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -27,62 +26,49 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
     : graph_(graph),
       pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
                                        : nullptr),
+      prepare_pool_(1),
       local_scopes_(local_scopes),
       places_(places),
       fetch_ctxs_(places),
-      running_ops_(0),
-      strategy_(strategy) {}
+      strategy_(strategy) {
+  PrepareOpDeps();
+  CopyOpDeps();
+}
 
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
   std::unique_ptr<platform::RecordEvent> event(
       new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
-  std::unordered_map<OpHandleBase *, size_t> pending_ops;
-  std::unordered_set<VarHandleBase *> pending_vars;
-  auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
-  std::unordered_set<OpHandleBase *> ready_ops;
+  std::unique_ptr<OpDependentData> op_deps = op_deps_futures_.get();
+  CopyOpDeps();
+  VLOG(10) << "ThreadedSSAGraphExecutor::Run";
+  std::shared_ptr<BlockingQueue<VarHandleBase *>> ready_vars(
+      new BlockingQueue<VarHandleBase *>);
+  auto &pending_ops = op_deps->pending_ops_;
+  auto &pending_vars = op_deps->pending_vars_;
+  auto &ready_ops = op_deps->ready_ops_;
+
   // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
   // streams from multiple GPUs, it's faster to buffer them and schedule
   // together since we currently cannot overlap computation and memcpy streams.
   // Should revisit it if overlapping is available.
   std::unordered_set<OpHandleBase *> delayed_ops;
 
-  // Transform SSAGraph to pending_ops & pending_vars
-  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
-    for (auto &name_pair : var_map) {
-      for (auto &version_pair : name_pair.second) {
-        InsertPendingVar(&pending_vars, ready_vars.get(), version_pair);
-      }
-    }
-  }
-  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
-    InsertPendingVar(&pending_vars, ready_vars.get(), var);
-  }
-
-  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
-    if (op->Inputs().empty()) {  // Special case, Op has no input.
-      ready_ops.insert(op);
-    } else {
-      InsertPendingOp(&pending_ops, op);
-    }
-  }
-
   // Step 2. Insert FetchOps
   std::vector<FetchOpHandle *> fetch_ops;
   std::unordered_set<VarHandleBase *> fetch_dependencies;
   FeedFetchList fetch_data(fetch_tensors.size());
 
-  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
-                 &pending_vars, ready_vars.get(), &fetch_data);
+  InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &ready_ops,
+                 &pending_ops, &pending_vars, &fetch_data);
 
   auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
     for (auto *op : set) {
-      running_ops_++;
       RunOp(ready_vars, op);
     }
     set.clear();
   };
-
+  auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); };
   // Clean run context
   run_op_futures_.clear();
   exception_holder_.Clear();
@@ -91,19 +77,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   while (!pending_vars.empty()) {
     // 1. Run All Ready ops
     // Keep loop until all vars are ready.
-    //
-    // NOTE: DelayedOps have a lower priority. It will be scheduled after all
-    // ready_ops have been performed.
-    if (ready_ops.empty() && strategy_.allow_op_delay_ && running_ops_ == 0) {
-      run_all_ops(delayed_ops);
-    } else {
-      run_all_ops(ready_ops);
-    }
+    run_all_ops(ready_ops);
 
     // 2. Find ready variable
     bool timeout;
     auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
-
     if (timeout) {
       if (exception_holder_.IsCaught()) {
         for (auto &run_op_future : run_op_futures_) {
@@ -115,6 +93,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         continue;
       }
     }
+
     // 3. Remove the dependency of ready_var.
     // Find the ready_ops after the ready_var.
     for (auto ready_var : cur_ready_vars) {
@@ -123,11 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         auto &deps = pending_ops[op];
         --deps;
         if (deps == 0) {
-          if (op->IsMultiDeviceTransfer() && strategy_.allow_op_delay_) {
-            delayed_ops.insert(op);
-          } else {
-            ready_ops.insert(op);
-          }
+          run_all_op(op);
         }
       }
     }
@@ -143,16 +118,17 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     const std::vector<std::string> &fetch_tensors,
     std::vector<FetchOpHandle *> *fetch_ops,
     std::unordered_set<VarHandleBase *> *fetch_dependencies,
+    std::unordered_set<OpHandleBase *> *ready_ops,
     std::unordered_map<OpHandleBase *, size_t> *pending_ops,
     std::unordered_set<VarHandleBase *> *pending_vars,
-    BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data) {
+    FeedFetchList *fetch_data) {
   std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
-
+  std::unordered_set<VarHandleBase *> local_ready_vars;
   for (auto &fetch_var_name : fetch_tensors) {
     for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
       auto it = var_map.find(fetch_var_name);
       if (it != var_map.end()) {
-        fetched_vars[fetch_var_name].push_back(*it->second.rbegin());
+        fetched_vars[fetch_var_name].emplace_back(*it->second.rbegin());
       }
     }
   }
@@ -161,8 +137,9 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     auto &var_name = fetch_tensors[i];
     auto fetched_var_it = fetched_vars.find(var_name);
     PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
-                   "Cannot find fetched variable.(Perhaps the main_program "
-                   "is not set to ParallelExecutor)");
+                   "Cannot find fetched variable(%s).(Perhaps the main_program "
+                   "is not set to ParallelExecutor)",
+                   var_name);
 
     auto &vars = fetched_var_it->second;
 
@@ -184,9 +161,23 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
     auto *fetch_dummy = new DummyVarHandle(fetch_var);
     op->AddOutput(fetch_dummy);
     fetch_dependencies->emplace(fetch_dummy);
-    this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy);
-    this->InsertPendingOp(pending_ops, op);
+
+    this->InsertPendingVar(pending_vars, &local_ready_vars, fetch_dummy);
+
+    size_t wait_input_num = 0;
+    std::unordered_set<VarHandleBase *> input_set(vars.begin(), vars.end());
+    for (auto *var : input_set) {
+      if (pending_vars->count(var)) {
+        ++wait_input_num;
+      }
+    }
+    if (wait_input_num) {
+      pending_ops->insert({op, wait_input_num});
+    } else {
+      ready_ops->insert(static_cast<OpHandleBase *>(op));
+    }
   }
+  PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
 }
 
 void ThreadedSSAGraphExecutor::InsertPendingOp(
@@ -197,11 +188,63 @@ void ThreadedSSAGraphExecutor::InsertPendingOp(
 
 void ThreadedSSAGraphExecutor::InsertPendingVar(
     std::unordered_set<VarHandleBase *> *pending_vars,
-    BlockingQueue<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
+    std::unordered_set<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
   pending_vars->insert(var);
   if (var->GeneratedOp() == nullptr) {
-    ready_vars->Push(var);
+    ready_vars->insert(var);
+  }
+}
+
+void ThreadedSSAGraphExecutor::PrepareOpDeps() {
+  op_deps_.reset(new OpDependentData());
+  std::unordered_map<OpHandleBase *, size_t> &pending_ops =
+      op_deps_->pending_ops_;
+  std::unordered_set<VarHandleBase *> &pending_vars = op_deps_->pending_vars_;
+  std::unordered_set<OpHandleBase *> &ready_ops = op_deps_->ready_ops_;
+  std::unordered_set<VarHandleBase *> ready_vars;
+
+  // Transform SSAGraph to pending_ops & pending_vars
+  for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
+    for (auto &name_pair : var_map) {
+      for (auto &version_pair : name_pair.second) {
+        InsertPendingVar(&pending_vars, &ready_vars, version_pair);
+      }
+    }
+  }
+  for (auto &var : graph_->Get<details::GraphDepVars>(details::kGraphDepVars)) {
+    InsertPendingVar(&pending_vars, &ready_vars, var);
+  }
+
+  for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(*graph_)) {
+    if (op->Inputs().empty()) {  // Special case, Op has no input.
+      ready_ops.insert(op);
+    } else {
+      InsertPendingOp(&pending_ops, op);
+    }
   }
+  for (auto ready_var : ready_vars) {
+    pending_vars.erase(ready_var);
+    for (auto *op : ready_var->PendingOps()) {
+      auto &deps = pending_ops[op];
+      --deps;
+      if (deps == 0) {
+        ready_ops.insert(op);
+      }
+    }
+  }
+}
+
+void ThreadedSSAGraphExecutor::CopyOpDeps() {
+  op_deps_futures_ = prepare_pool_.enqueue([&] {
+    auto *op_deps = new OpDependentData();
+    op_deps->pending_ops_.insert(op_deps_->pending_ops_.begin(),
+                                 op_deps_->pending_ops_.end());
+    op_deps->pending_vars_.insert(op_deps_->pending_vars_.begin(),
+                                  op_deps_->pending_vars_.end());
+    op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(),
+                               op_deps_->ready_ops_.end());
+    return std::unique_ptr<OpDependentData>(op_deps);
+  });
 }
 
 void ThreadedSSAGraphExecutor::RunOp(
@@ -216,7 +259,6 @@ void ThreadedSSAGraphExecutor::RunOp(
         op->Run(strategy_.use_cuda_);
       }
       VLOG(10) << op << " " << op->Name() << " Done ";
-      running_ops_--;
       ready_var_q->Extend(op->Outputs());
       VLOG(10) << op << " " << op->Name() << " Signal posted";
     } catch (...) {
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 0867f6210480ec405e7cc4ea42c74b750133ea4e..b9bccba8fa2fa13d99a9a39a5135106101daa903 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -15,18 +15,20 @@
 #pragma once
 
 #include <deque>
+#include <functional>
 #include <list>
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <utility>
 #include <vector>
-
-#include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
 #include "paddle/fluid/framework/ir/graph.h"
 
@@ -36,6 +38,12 @@ class Scope;
 
 namespace details {
 
+struct OpDependentData {
+  std::unordered_map<OpHandleBase *, size_t> pending_ops_;
+  std::unordered_set<VarHandleBase *> pending_vars_;
+  std::unordered_set<OpHandleBase *> ready_ops_;
+};
+
 class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
   ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
@@ -57,29 +65,35 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  private:
   ir::Graph *graph_;
   std::unique_ptr<::ThreadPool> pool_;
+  ::ThreadPool prepare_pool_;
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
   ExceptionHolder exception_holder_;
-  std::atomic<int> running_ops_;
 
   void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
                        OpHandleBase *op_instance) const;
 
   void InsertPendingVar(std::unordered_set<VarHandleBase *> *pending_vars,
-                        BlockingQueue<VarHandleBase *> *ready_vars,
+                        std::unordered_set<VarHandleBase *> *ready_vars,
                         VarHandleBase *var) const;
 
   void InsertFetchOps(const std::vector<std::string> &fetch_tensors,
                       std::vector<FetchOpHandle *> *fetch_ops,
                       std::unordered_set<VarHandleBase *> *fetch_dependencies,
+                      std::unordered_set<OpHandleBase *> *ready_ops,
                       std::unordered_map<OpHandleBase *, size_t> *pending_ops,
                       std::unordered_set<VarHandleBase *> *pending_vars,
-                      BlockingQueue<VarHandleBase *> *ready_vars,
                       FeedFetchList *fetch_data);
 
+  void PrepareOpDeps();
+  void CopyOpDeps();
+
  private:
+  std::future<std::unique_ptr<OpDependentData>> op_deps_futures_;
+
   ExecutionStrategy strategy_;
+  std::unique_ptr<OpDependentData> op_deps_;
   // use std::list because clear(), push_back, and for_each are O(1)
   std::list<std::future<void>> run_op_futures_;
 };
diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h
index 8321c32f8b1d73bf5e6080b4b314abc9fd20536d..93060ef2593cbc032a382b617f9690e392a15b63 100644
--- a/paddle/fluid/framework/details/var_handle.h
+++ b/paddle/fluid/framework/details/var_handle.h
@@ -43,6 +43,7 @@ struct VarHandleBase {
   virtual ~VarHandleBase();
 
   virtual std::string DebugString() const = 0;
+  virtual const std::string& Name() const = 0;
 
   void AddInput(OpHandleBase* in, ir::Node* node) {
     node_->inputs.clear();
@@ -95,8 +96,6 @@ struct VarHandleBase {
 //
 // NOTE: runtime variables have place.
 struct VarHandle : public VarHandleBase {
-  explicit VarHandle(ir::Node* node) : VarHandleBase(node) {}
-
   virtual ~VarHandle();
 
   std::string DebugString() const override;
@@ -109,6 +108,20 @@ struct VarHandle : public VarHandleBase {
         name_(std::move(name)),
         place_(std::move(place)) {}
 
+#ifdef PADDLE_WITH_CUDA
+  bool HasEvent() { return has_event_; }
+
+  const cudaEvent_t& GetEvent() {
+    PADDLE_ENFORCE(HasEvent(), "The event is not set.");
+    return event_;
+  }
+
+  void SetGenerateEvent(const cudaEvent_t& event) {
+    has_event_ = true;
+    event_ = event;
+  }
+#endif
+
   // version field currently is not used, however, just store the version to
   // debug easily.
  private:
@@ -116,6 +129,11 @@ struct VarHandle : public VarHandleBase {
   size_t scope_idx_;
   std::string name_;
   platform::Place place_;
+#ifdef PADDLE_WITH_CUDA
+  // Only when this event is triggered, var is generated.
+  cudaEvent_t event_;
+  bool has_event_{false};
+#endif
 
  public:
   bool IsTheSameVar(const VarHandle& o) const {
@@ -125,6 +143,7 @@ struct VarHandle : public VarHandleBase {
 
   size_t version() const { return version_; }
   size_t scope_idx() const { return scope_idx_; }
+  const std::string& Name() const override { return name_; }
   const std::string& name() const { return name_; }
   const platform::Place& place() const { return place_; }
 };
@@ -136,6 +155,10 @@ struct DummyVarHandle : public VarHandleBase {
   virtual ~DummyVarHandle();
 
   std::string DebugString() const override;
+
+ public:
+  const std::string& Name() const override { return name_; }
+  std::string name_{"DummyVar"};
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 87c69d3accbfc07af9049bd492d8b4cd1e731465..81b8ffa83f612f5b67cd91a7a2c1228519a1fbb7 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -46,9 +46,6 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(lock_free_optimize_pass base)
-pass_library(cpu_quantize_placement_pass base)
-pass_library(cpu_quantize_pass inference)
-pass_library(cpu_quantize_squash_pass inference)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
@@ -93,6 +90,9 @@ if(WITH_MKLDNN)
     pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn)
     pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn)
     pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn)
+    pass_library(cpu_quantize_placement_pass base mkldnn)
+    pass_library(cpu_quantize_pass inference mkldnn)
+    pass_library(cpu_quantize_squash_pass inference mkldnn)
 endif()
 
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
@@ -111,9 +111,6 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
-cc_test(test_cpu_quantize_placement_pass SRCS cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
-cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
-cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
 if(NOT WIN32)
     cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
 endif()
@@ -123,4 +120,7 @@ if (WITH_MKLDNN)
     cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
     cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
     cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass)
+    cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
+    cc_test(test_cpu_quantize_pass SRCS mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
+    cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
 endif ()
diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
similarity index 99%
rename from paddle/fluid/framework/ir/cpu_quantize_pass.cc
rename to paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index ed80f9cae347cfb2bf23859daea2f1f47dba599b..b3a8c208916f699dc032496c6d0fa5bf86227903 100644
--- a/paddle/fluid/framework/ir/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
similarity index 100%
rename from paddle/fluid/framework/ir/cpu_quantize_pass.h
rename to paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
similarity index 99%
rename from paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
rename to paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index 89601be7d1c0f5c9d3c3dcefa4327be7c20a7d65..0d0ed989012fced7f639c2bc12a3bafa6edf27f6 100644
--- a/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
similarity index 96%
rename from paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
rename to paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
index 50bbe4915b3502a867be397ae0922d982108d12c..511003dce59f91272802766544577e9c473a3a1d 100644
--- a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
 #include <string>
 #include <unordered_set>
 
diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
similarity index 100%
rename from paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
rename to paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h
diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
similarity index 98%
rename from paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
rename to paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
index 5a4d622645a4377526351bbf4acbcea95a780d22..11d72a56bd66792ff3ed5cc8184f5b242d9cdba5 100644
--- a/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h"
 
 #include <gtest/gtest.h>
 #include <boost/logic/tribool.hpp>
diff --git a/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
similarity index 98%
rename from paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc
rename to paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
index de62a69de4f25912c5f56973de0dca5343bbe906..6e74cc7787b73d06b1093ed4e846ab83b1234803 100644
--- a/paddle/fluid/framework/ir/cpu_quantize_squash_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
@@ -13,7 +13,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
 #include <string>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/fluid/framework/ir/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
similarity index 100%
rename from paddle/fluid/framework/ir/cpu_quantize_squash_pass.h
rename to paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h
diff --git a/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
similarity index 98%
rename from paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc
rename to paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
index 3a3eb53f79955b37f5f9af6a09b2f9c8e934aa3e..3cf51d97aa4b8be468b8c2a78dd17aafbbf0e15b 100644
--- a/paddle/fluid/framework/ir/cpu_quantize_squash_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ir/cpu_quantize_squash_pass.h"
+#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h"
 #include <gtest/gtest.h>
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 3d1de95f58ded4af7fcc3d4c75b4d5e1aa63f13f..036d2a50a4a7ea3ce7e052a56202b1d54465b03e 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -315,6 +315,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
       for (size_t i = 0; i < outputs.size(); ++i) {
         framework::Variable* grad = outputs[i]->var_;
         framework::Variable* orig_grad = origin_outputs[i]->var_;
+        VLOG(3) << "AddTo Called with orig_grad is: "
+                << origin_outputs[i]->name_ << " Grad to be added is "
+                << outputs[i]->name_;
         AddTo(grad, orig_grad, place_);
         delete grad;
       }
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 0cfdea030eb4ef297e26fabb7fc394e5cbc19033..7c9d0af3ecd647604ab46ee6239fc352e5fd8d85 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -277,6 +277,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
           VarBase* var = current_vars_map[var_it->second];
           InitGrad(var, prepared_op.GetDeviceContext());
           grad_out_vars.push_back(var->grads_);
+          VLOG(3) << "grads output var name: " << var->name_;
         }
       }
     }
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 555b57810e3f13357b7f5ebcb2541d763d5cec66..29f16943e0c13fbe080e8e073b081583f1d14d11 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -42,8 +42,11 @@ namespace inference {
 namespace analysis {
 
 using framework::ir::Graph;
+
+#ifdef PADDLE_WITH_MKLDNN
 using VarQuantScale =
     std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
+#endif
 
 /*
  * The argument definition of both Pass and PassManagers.
@@ -137,6 +140,7 @@ struct Argument {
   DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                       std::unordered_set<std::string>);
 
+#ifdef PADDLE_WITH_MKLDNN
   // A set of op types to enable their quantized kernels
   DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes,
                       std::unordered_set<std::string>);
@@ -147,6 +151,7 @@ struct Argument {
 
   // Scales for variables to be quantized
   DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
+#endif
 
   // Passed from config.
   DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 86ff06c6ae454b6d830a605f97272671e26c3bb6..7a96ac11d8ef754f38070862a70744947412882b 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -64,6 +64,7 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("mkldnn_enabled_op_types",
                 new std::unordered_set<std::string>(
                     argument->mkldnn_enabled_op_types()));
+#ifdef PADDLE_WITH_MKLDNN
     } else if (pass_name == "cpu_quantize_placement_pass") {
       pass->Set("quantize_enabled_op_types",
                 new std::unordered_set<std::string>(
@@ -74,22 +75,8 @@ void IRPassManager::CreatePasses(Argument *argument,
     } else if (pass_name == "cpu_quantize_pass") {
       pass->Set("quant_var_scales",
                 new VarQuantScale(argument->quant_var_scales()));
-    }
-
-    if (pass_name == "anakin_subgraph_pass") {
-      pass->Set("program",
-                new framework::ProgramDesc *(&argument->main_program()));
-      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
-      pass->Set("model_from_memory", new bool(argument->model_from_memory()));
-      pass->Set("engine_opt_info", new std::map<std::string, std::string>(
-                                       argument->engine_opt_info()));
-      pass->Set("predictor_id", new int(argument->predictor_id()));
-      pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
-                                       argument->anakin_max_input_shape()));
-      pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
-    }
-
-    if (pass_name == "tensorrt_subgraph_pass") {
+#endif
+    } else if (pass_name == "tensorrt_subgraph_pass") {
       pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
       pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
       pass->Set("min_subgraph_size",
@@ -124,6 +111,19 @@ void IRPassManager::CreatePasses(Argument *argument,
                                        argument->engine_opt_info()));
     }
 
+    if (pass_name == "anakin_subgraph_pass") {
+      pass->Set("program",
+                new framework::ProgramDesc *(&argument->main_program()));
+      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
+      pass->Set("model_from_memory", new bool(argument->model_from_memory()));
+      pass->Set("engine_opt_info", new std::map<std::string, std::string>(
+                                       argument->engine_opt_info()));
+      pass->Set("predictor_id", new int(argument->predictor_id()));
+      pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
+                                       argument->anakin_max_input_shape()));
+      pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
+    }
+
     pre_pass = pass_name;
 
     passes_.emplace_back(std::move(pass));
diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh
index 963986f245cdafa737d76953f0e5323e4f74e669..bf2e3593c2beadaea2cb08aa3dcc2370c3e06bf4 100755
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
@@ -27,7 +27,7 @@ if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then
 fi
 
 PREFIX=inference-vis-demos%2F
-URL_ROOT=http://paddlemodels.cdn.bcebos.com/${PREFIX}
+URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
 
 # download vis_demo data
 function download() {
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 94e14228df84f2c462881e9a8ce24578c74ffeaa..35dd1176718a3d7e4f3867ce048216ea45e5ba7f 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -86,8 +86,8 @@ const std::vector<std::string> kAnakinSubgraphPasses({
 
 GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   passes_.assign({
-    "infer_clean_graph_pass",                        //
-        "identity_scale_op_clean_pass",              //
+    "infer_clean_graph_pass",  //
+        //   "identity_scale_op_clean_pass",              //
         "conv_affine_channel_fuse_pass",             //
         "conv_eltwiseadd_affine_channel_fuse_pass",  //
         "conv_bn_fuse_pass",                         //
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index d9ac73b0638ad356501a9883b49e65f8f3e32245..2f17a44e0c08ef7d9204a115512a1cd76790efdf 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -115,14 +115,14 @@ inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_test
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
 if (NOT EXISTS ${OCR_INSTALL_DIR})
-    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
+    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
 
 # mobilenet with transpose op
 set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
 if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
-    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
+    inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz")
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL)
 
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
index f551b322fe00892be79dd966235504bb4f54c718..df7af71d9b32ba11822e066f574146cfa5c50edd 100644
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -1,5 +1,5 @@
 include(ExternalProject)
-set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
+set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url")
 set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
     "A path setting inference demo download directories.")
 
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 7c44e18f8f39cfcdf749441ba7530e5227c44b5f..ac77c3d2a500816a4eb41ed13f23ee628290f287 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -61,4 +61,6 @@ nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocat
 
 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator)
 
-cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
+cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade)
+
+cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade)
diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..67905973ff620a7e0fb863fef80778aceba7aeb2
--- /dev/null
+++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+#ifdef PADDLE_WITH_CUDA
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+DECLARE_int64(gpu_allocator_retry_time);
+#endif
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+//! Run allocate test cases for different places
+void AllocateTestCases() {
+  auto &instance = AllocatorFacade::Instance();
+  platform::Place place;
+  size_t size = 1024;
+
+  {
+    place = platform::CPUPlace();
+    size = 1024;
+    auto cpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(cpu_allocation, nullptr);
+    ASSERT_NE(cpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(cpu_allocation->place(), place);
+    ASSERT_EQ(cpu_allocation->size(), size);
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    place = platform::CUDAPlace(0);
+    size = 1024;
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
+  }
+
+  {
+    // Allocate 2GB gpu memory
+    place = platform::CUDAPlace(0);
+    size = 2 * static_cast<size_t>(1 << 30);
+    auto gpu_allocation = instance.Alloc(place, size);
+    ASSERT_NE(gpu_allocation, nullptr);
+    ASSERT_NE(gpu_allocation->ptr(), nullptr);
+    ASSERT_EQ(gpu_allocation->place(), place);
+    ASSERT_GE(gpu_allocation->size(), size);
+  }
+
+  {
+    place = platform::CUDAPinnedPlace();
+    size = (1 << 20);
+    auto cuda_pinned_allocation =
+        instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20);
+    ASSERT_NE(cuda_pinned_allocation, nullptr);
+    ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr);
+    ASSERT_EQ(cuda_pinned_allocation->place(), place);
+    ASSERT_GE(cuda_pinned_allocation->size(), size);
+  }
+#endif
+}
+
+TEST(Allocator, SpecifyGpuMemory) {
+#ifdef PADDLE_WITH_CUDA
+  // Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and
+  // FLAGS_reallocate_gpu_memory_in_mb
+  FLAGS_fraction_of_gpu_memory_to_use = 0.0;
+  // 512 MB
+  FLAGS_initial_gpu_memory_in_mb = 512;
+  // 4 MB
+  FLAGS_reallocate_gpu_memory_in_mb = 4;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+
+  AllocateTestCases();
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator_facade_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
similarity index 92%
rename from paddle/fluid/memory/allocation/allocator_facade_test.cc
rename to paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
index 802d79e15de253d4e67e35046bdf1d689258da6d..decdc62f1361a9c159b8ccb09910e0f164b35210 100644
--- a/paddle/fluid/memory/allocation/allocator_facade_test.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc
@@ -19,6 +19,8 @@
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_int64(gpu_allocator_retry_time);
 #endif
 
@@ -26,13 +28,8 @@ namespace paddle {
 namespace memory {
 namespace allocation {
 
-TEST(allocator, allocator) {
-#ifdef PADDLE_WITH_CUDA
-  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
-  FLAGS_gpu_allocator_retry_time = 500;
-  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
-#endif
-
+//! Run allocate test cases for different places
+void AllocateTestCases() {
   auto &instance = AllocatorFacade::Instance();
   platform::Place place;
   size_t size = 1024;
@@ -82,6 +79,16 @@ TEST(allocator, allocator) {
 #endif
 }
 
+TEST(Allocator, Allocator) {
+#ifdef PADDLE_WITH_CUDA
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_gpu_allocator_retry_time = 500;
+  FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
+#endif
+
+  AllocateTestCases();
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc
index c233bf4edf5462dc48f6c3f4f22a517a03585b45..514ac7883ad2effdf3518be8afe3f448a5ac10b2 100644
--- a/paddle/fluid/memory/allocation/legacy_allocator.cc
+++ b/paddle/fluid/memory/allocation/legacy_allocator.cc
@@ -37,6 +37,8 @@ DEFINE_bool(init_allocated_mem, false,
             "that initializing the allocated memory with a small value "
             "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
 DECLARE_bool(benchmark);
 
 namespace paddle {
@@ -153,12 +155,18 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
                                     platform::GpuMinChunkSize(),
                                     platform::GpuMaxChunkSize());
 
-      VLOG(10) << "\n\nNOTE: each GPU device use "
-               << FLAGS_fraction_of_gpu_memory_to_use * 100
-               << "% of GPU memory.\n"
-               << "You can set GFlags environment variable '"
-               << "FLAGS_fraction_of_gpu_memory_to_use"
-               << "' to change the fraction of GPU usage.\n\n";
+      VLOG(10) << "\n\nNOTE:\n"
+               << "You can set GFlags environment variable "
+               << "'FLAGS_fraction_of_gpu_memory_to_use' "
+               << "or 'FLAGS_initial_gpu_memory_in_mb' "
+               << "or 'FLAGS_reallocate_gpu_memory_in_mb' "
+               << "to change the memory size for GPU usage.\n"
+               << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
+               << FLAGS_fraction_of_gpu_memory_to_use
+               << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
+               << FLAGS_initial_gpu_memory_in_mb
+               << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
+               << FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
     }
   });
 
diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt
index c725dba5e98c200c2542d97cb8f53a938f6b614a..a555b6b299228720c7559e610f4d6f31167e1555 100644
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
@@ -9,3 +9,5 @@ endif(${WITH_GPU})
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
 
 cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog)
+
+cc_test(buddy_allocator_test SRCS buddy_allocator_test.cc DEPS buddy_allocator)
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index 26ef27c3caafadb4801b0ae52133f6175655ce0a..edd6ea4adec2e080d294fdb207d8dd4880fdcf79 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
+
+#include <algorithm>
+#include <utility>
+
 #include "glog/logging.h"
 
 DEFINE_bool(free_idle_memory, false,
@@ -36,9 +40,10 @@ BuddyAllocator::~BuddyAllocator() {
               "have actually been freed";
   while (!pool_.empty()) {
     auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
-    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
+    VLOG(10) << "Free from block (" << block << ", " << block->size(cache_)
+             << ")";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
     cache_.invalidate(block);
     pool_.erase(pool_.begin());
   }
@@ -71,7 +76,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
 
   // refill the pool if failure
   if (it == pool_.end()) {
-    it = RefillPool();
+    it = RefillPool(size);
     // if still failure, fail fatally
     if (it == pool_.end()) {
       return nullptr;
@@ -184,19 +189,28 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
   return static_cast<MemoryBlock*>(p)->data();
 }
 
-BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
+    size_t request_bytes) {
+  size_t allocate_bytes = max_chunk_size_;
+  size_t index = 0;
+
 #ifdef PADDLE_WITH_CUDA
   if (system_allocator_->UseGpu()) {
     if ((total_used_ + total_free_) == 0) {
-      // Compute the maximum allocation size for the first allocation.
-      max_chunk_size_ = platform::GpuMaxChunkSize();
+      // Compute the allocation size for gpu for the first allocation.
+      allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
+    } else {
+      // Reallocation size
+      if (realloc_size_ == 0) {
+        realloc_size_ = platform::GpuReallocSize();
+      }
+      allocate_bytes = std::max(realloc_size_, request_bytes);
     }
   }
 #endif
 
-  // Allocate a new maximum sized block
-  size_t index = 0;
-  void* p = system_allocator_->Alloc(&index, max_chunk_size_);
+  // Allocate a new block
+  void* p = system_allocator_->Alloc(&index, allocate_bytes);
 
   if (p == nullptr) return pool_.end();
 
@@ -204,7 +218,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
            << " from system allocator";
 
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
-                                     max_chunk_size_, nullptr, nullptr);
+                                     allocate_bytes, nullptr, nullptr);
 
   // gpu fallback allocation
   if (system_allocator_->UseGpu() &&
@@ -212,10 +226,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
     fallback_alloc_count_++;
   }
 
-  total_free_ += max_chunk_size_;
+  total_free_ += allocate_bytes;
 
   // dump the block into pool
-  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+  return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first;
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
@@ -286,12 +300,12 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
 
     VLOG(10) << "Return block " << block << " to fallback allocator.";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
     cache_.invalidate(block);
 
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
 
-    total_free_ -= max_chunk_size_;
+    total_free_ -= block->size(cache_);
     fallback_alloc_count_--;
 
     // If no fall allocation exists, return directly
@@ -322,12 +336,12 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
 
     VLOG(10) << "Return block " << block << " to base allocator.";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
     cache_.invalidate(block);
 
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
 
-    total_free_ -= max_chunk_size_;
+    total_free_ -= block->size(cache_);
 
     if (!shall_free_alloc()) return;
   }
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index 3f86a51f0d0b8504bbc4b0477f123093b343e9cf..bdc8cca4b55e6fe67618fb13cd8bf40c2c24858b 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -60,7 +60,7 @@ class BuddyAllocator {
   void* SystemAlloc(size_t size);
 
   /*! \brief If existing chunks are not suitable, refill pool */
-  PoolSet::iterator RefillPool();
+  PoolSet::iterator RefillPool(size_t request_bytes);
 
   /**
    *  \brief   Find the suitable chunk from existing pool and split
@@ -89,6 +89,8 @@ class BuddyAllocator {
   size_t min_chunk_size_;  // the minimum size of each chunk
   size_t max_chunk_size_;  // the maximum size of each chunk
 
+  size_t realloc_size_ = 0;  // the size of re-allocated chunk
+
  private:
   /**
    * \brief A list of free allocation
diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1edc9f2034c87d4dbd655135c557bdb86ec4354d
--- /dev/null
+++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc
@@ -0,0 +1,133 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/memory/detail/buddy_allocator.h"
+
+#include <memory>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/detail/system_allocator.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+#ifdef PADDLE_WITH_CUDA
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+#endif
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+constexpr static int test_gpu_id = 0;
+
+void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
+  bool freed = false;
+  size_t used_bytes = allocator->Used();
+
+  if (size_bytes > 0) {
+    void* p = allocator->Alloc(size_bytes);
+
+    EXPECT_NE(p, nullptr);
+#ifdef PADDLE_WITH_CUDA
+    if (size_bytes < platform::GpuMaxChunkSize()) {
+#else
+    if (size_bytes < platform::CpuMaxChunkSize()) {
+#endif
+      // Not allocate from SystemAllocator
+      EXPECT_GE(allocator->Used(), used_bytes + size_bytes);
+    } else {
+      // Allocate from SystemAllocator doesn't count in Used()
+      EXPECT_EQ(allocator->Used(), used_bytes);
+    }
+
+    int* intp = static_cast<int*>(p);
+    std::shared_ptr<int> ptr(intp, [&](void* p) {
+      allocator->Free(intp);
+      freed = true;
+    });
+  } else {
+    freed = true;
+  }
+
+  EXPECT_EQ(used_bytes, allocator->Used());
+  EXPECT_TRUE(freed);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(BuddyAllocator, GpuFraction) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  TestBuddyAllocator(&buddy_allocator, 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+}
+
+TEST(BuddyAllocator, InitRealloc) {
+  FLAGS_initial_gpu_memory_in_mb = 100;
+  FLAGS_reallocate_gpu_memory_in_mb = 50;
+
+  EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
+
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  // Less then initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  // Between initial size and reallocate size and not exceed pool
+  TestBuddyAllocator(&buddy_allocator, 80 << 20);
+  // Less then reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 40 << 20);
+  // Greater then reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 80 << 20);
+  // Greater then initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+}
+
+TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
+  FLAGS_initial_gpu_memory_in_mb = 5;
+  FLAGS_reallocate_gpu_memory_in_mb = 10;
+
+  EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
+
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  // Less then initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 1 << 20);
+  // Between initial size and reallocate size and not exceed pool
+  TestBuddyAllocator(&buddy_allocator, 3 << 20);
+  // Less then initial size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 3 << 20);
+  // Less then reallocate size and not exceed pool (now pool is 15 MB, used 7
+  // MB)
+  TestBuddyAllocator(&buddy_allocator, 7 << 20);
+  // Less then reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 8 << 20);
+  // Greater then initial size and reallocate size
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+}
+#endif
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index 197d1c2f21fd818879aafe17599bc87d33caa198..41d79c5beb1367907a401b572d3d0eaf3a8ac67b 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -32,6 +32,9 @@ limitations under the License. */
 
 DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+
 namespace paddle {
 namespace memory {
 namespace detail {
@@ -119,11 +122,18 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
     gpu_alloc_size_ += size;
     return p;
   } else {
-    LOG(WARNING)
-        << "Cannot malloc " << size / 1024.0 / 1024.0
-        << " MB GPU memory. Please shrink FLAGS_fraction_of_gpu_memory_to_use "
-           "environment variable to a lower value. Current value is "
-        << FLAGS_fraction_of_gpu_memory_to_use;
+    LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
+                 << " MB GPU memory. Please shrink "
+                    "FLAGS_fraction_of_gpu_memory_to_use or "
+                    "FLAGS_initial_gpu_memory_in_mb or "
+                    "FLAGS_reallocate_gpu_memory_in_mb"
+                    "environment variable to a lower value. "
+                 << "Current FLAGS_fraction_of_gpu_memory_to_use value is "
+                 << FLAGS_fraction_of_gpu_memory_to_use
+                 << ". Current FLAGS_initial_gpu_memory_in_mb value is "
+                 << FLAGS_initial_gpu_memory_in_mb
+                 << ". Current FLAGS_reallocate_gpu_memory_in_mb value is "
+                 << FLAGS_reallocate_gpu_memory_in_mb;
     return nullptr;
   }
 }
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 5d5ad9e711a97fb2ab83df40c7605902d5aa7751..6e3c9f28649b9f15a2a78fc832ab5e52986fcf46 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -57,7 +57,7 @@ class ConcatOp : public framework::OperatorWithKernel {
                               "elements except the specify axis.");
           } else {
             // not check -1 with other in compile time
-            if (out_dims[j] != -1 && ins[i][j] != -1) {
+            if (out_dims[j] > 0 && ins[i][j] > 0) {
               PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
                                 "Input tensors should have the same "
                                 "elements except the specify axis.");
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..66c56da417487e3b2ee94ad572d83a971958ab62
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
@@ -0,0 +1,38 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseFloorDivOpMaker : public ElementwiseOpMaker {
+ protected:
+  std::string GetName() const override { return "FloorDiv"; }
+  std::string GetEquation() const override { return "Out = X // Y"; }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
+                             ops::ElementwiseFloorDivOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
+                                   int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..60846d1e8fee1c7f68ac101f18355750c2c15a4d
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_floordiv,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d24e394d5c823dbd22c837210e46cefeceba1be
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct FloorDivFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a / b; }
+};
+
+template <typename DeviceContext, typename T>
+void elementwise_floor_div(const framework::ExecutionContext &ctx,
+                           const framework::Tensor *x,
+                           const framework::Tensor *y, framework::Tensor *z) {
+  int axis = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
+      ctx, x, y, axis, FloorDivFunctor<T>(), z);
+}
+
+template <typename DeviceContext, typename T>
+class ElementwiseFloorDivKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<framework::LoDTensor>("X");
+    auto *y = ctx.Input<framework::LoDTensor>("Y");
+    auto *z = ctx.Output<framework::LoDTensor>("Out");
+
+    z->mutable_data<T>(ctx.GetPlace());
+
+    // dtype of x and y is int64 or int32
+    elementwise_floor_div<DeviceContext, T>(ctx, x, y, z);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d63a7df03d0de7489a507825b066ab365e1ef8b9
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
@@ -0,0 +1,36 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
+#include <string>
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseModOpMaker : public ElementwiseOpMaker {
+ protected:
+  std::string GetName() const override { return "Mod"; }
+  std::string GetEquation() const override { return "Out = X % Y"; }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_mod, ops::ElementwiseOp,
+                             ops::ElementwiseModOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    elementwise_mod,
+    ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseModKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..da3304a83952d448ffcad61f1878b06d354168b9
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/elementwise/elementwise_mod_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_mod, ops::ElementwiseModKernel<plat::CUDADeviceContext, int>,
+    ops::ElementwiseModKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b139fd4b33152b4a340c6c5a0f094338bbdffc8
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct ModFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a % b; }
+};
+
+template <typename DeviceContext, typename T>
+void elementwise_mod(const framework::ExecutionContext &ctx,
+                     const framework::Tensor *x, const framework::Tensor *y,
+                     framework::Tensor *z) {
+  int axis = ctx.Attr<int>("axis");
+  ElementwiseComputeEx<ModFunctor<T>, DeviceContext, T>(ctx, x, y, axis,
+                                                        ModFunctor<T>(), z);
+}
+
+template <typename DeviceContext, typename T>
+class ElementwiseModKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<framework::LoDTensor>("X");
+    auto *y = ctx.Input<framework::LoDTensor>("Y");
+    auto *z = ctx.Output<framework::LoDTensor>("Out");
+
+    z->mutable_data<T>(ctx.GetPlace());
+
+    // dtype of x and y is int64 or int32
+    elementwise_mod<DeviceContext, T>(ctx, x, y, z);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index 68c7227e5a7123e1e751dd55e243ee481bf36540..4a8937ba1c7ef9827ecc9bf575d9893c95a3b22b 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -33,8 +33,51 @@ struct DequantizeFunctor<platform::CPUDeviceContext, T> {
   }
 };
 
+template <typename T>
+struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& dev_ctx,
+                  const framework::Tensor* in, const framework::Tensor** scales,
+                  const int scale_num, T max_range, framework::Tensor* out) {
+    if (scale_num == 1) {
+      const int channel = in->dims()[0];
+      const T* scale_factor = scales[0]->data<T>();
+      for (int i = 0; i < channel; i++) {
+        T s = scale_factor[i];
+        framework::Tensor one_channel_in = in->Slice(i, i + 1);
+        framework::Tensor one_channel_out = out->Slice(i, i + 1);
+        auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
+        auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+        auto& dev = *dev_ctx.eigen_device();
+        out_e.device(dev) = (s / max_range) * in_e;
+      }
+    } else if (scale_num == 2) {
+      int batch_size = in->dims()[0];
+      int channel = in->dims()[1];
+      const T* scale_one = scales[0]->data<T>();
+      const T* scale_two = scales[1]->data<T>();
+      for (int i = 0; i < batch_size; i++) {
+        framework::Tensor one_batch_in = in->Slice(i, i + 1).Resize(
+            framework::slice_ddim(in->dims(), 1, in->dims().size()));
+        framework::Tensor one_batch_out = out->Slice(i, i + 1).Resize(
+            framework::slice_ddim(out->dims(), 1, out->dims().size()));
+        for (int j = 0; j < channel; j++) {
+          T s = scale_one[j];
+          framework::Tensor one_channel_in = one_batch_in.Slice(j, j + 1);
+          framework::Tensor one_channel_out = one_batch_out.Slice(j, j + 1);
+          auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
+          auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+          auto& dev = *dev_ctx.eigen_device();
+          out_e.device(dev) = (s * scale_two[0] / max_range) * in_e;
+        }
+      }
+    }
+  }
+};
+
 template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
 template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
+template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, float>;
+template struct ChannelDequantizeFunctor<platform::CPUDeviceContext, double>;
 
 class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
  public:
diff --git a/paddle/fluid/operators/fake_dequantize_op.cu b/paddle/fluid/operators/fake_dequantize_op.cu
index 35dcc69279d0119e75c4c5072e7817c839b9e819..02f9dc827d68cbb58447ed1557ff4bf310b2c017 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cu
+++ b/paddle/fluid/operators/fake_dequantize_op.cu
@@ -44,8 +44,66 @@ struct DequantizeFunctor<platform::CUDADeviceContext, T> {
   }
 };
 
+template <typename T>
+__global__ void DequantizeOneScale(const T* in, const T* scale, T max_range,
+                                   int num, int channel, T* out) {
+  int tid = threadIdx.x;
+  int channel_size = num / channel;
+  const T* in_c = in + blockIdx.x * channel_size;
+  T* out_c = out + blockIdx.x * channel_size;
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    out_c[i] = in_c[i] * scale[blockIdx.x] / max_range;
+  }
+}
+
+template <typename T>
+__global__ void DequantizeTwoScale(const T* in, const T* scale_one,
+                                   const T* scale_two, T max_range, int num,
+                                   int batch_size, int channel, T* out) {
+  int tid = threadIdx.x;
+  int channel_size = num / (batch_size * channel);
+  int scale_index = blockIdx.x % channel;
+  const T* in_c = in + blockIdx.x * channel_size;
+  T* out_c = out + blockIdx.x * channel_size;
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    out_c[i] = in_c[i] * scale_one[scale_index] * scale_two[0] / max_range;
+  }
+}
+
+template <typename T>
+struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& dev_ctx,
+                  const framework::Tensor* in, const framework::Tensor** scales,
+                  const int scale_num, T max_range, framework::Tensor* out) {
+    const T* in_data = in->data<T>();
+    T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
+    if (scale_num == 1) {
+      int num = in->numel();
+      int channel = in->dims()[0];
+      const T* scale_factor = scales[0]->data<T>();
+      int block = 1024;
+      int grid = channel;
+      DequantizeOneScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
+          in_data, scale_factor, max_range, num, channel, out_data);
+    } else if (scale_num == 2) {
+      int num = in->numel();
+      int batch_size = in->dims()[0];
+      int channel = in->dims()[1];
+      const T* scale_one = scales[0]->data<T>();
+      const T* scale_two = scales[1]->data<T>();
+      int block = 1024;
+      int grid = batch_size * channel;
+      DequantizeTwoScale<T><<<grid, block, 0, dev_ctx.stream()>>>(
+          in_data, scale_one, scale_two, max_range, num, batch_size, channel,
+          out_data);
+    }
+  }
+};
+
 template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
 template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
+template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, float>;
+template struct ChannelDequantizeFunctor<platform::CUDADeviceContext, double>;
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/fake_dequantize_op.h b/paddle/fluid/operators/fake_dequantize_op.h
index d05f2038531bbe9c35da54c94d2ef4d659acca70..ed9a0a4d65fab5ce1ef48835c332fade978d2bae 100644
--- a/paddle/fluid/operators/fake_dequantize_op.h
+++ b/paddle/fluid/operators/fake_dequantize_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
+#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
@@ -28,6 +29,13 @@ struct DequantizeFunctor {
                   framework::Tensor* out);
 };
 
+template <typename DeviceContext, typename T>
+struct ChannelDequantizeFunctor {
+  void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
+                  const framework::Tensor** scales, const int scale_num,
+                  T max_range, framework::Tensor* out);
+};
+
 template <typename DeviceContext, typename T>
 class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
  public:
@@ -54,32 +62,33 @@ class FakeChannelWiseDequantizeMaxAbsKernel : public framework::OpKernel<T> {
     auto scales = ctx.MultiInput<framework::Tensor>("Scales");
     auto* out = ctx.Output<framework::Tensor>("Out");
 
-    PADDLE_ENFORCE_EQ(scales[0]->numel(), in->dims()[0],
-                      "The number of first scale values must be the same with "
-                      "first dimension value of Input(X).");
-
     auto quant_bits = ctx.Attr<std::vector<int>>("quant_bits");
-    int max_range = std::pow(2, quant_bits[0] - 1) - 1;
+    int max_range = 1;
 
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     out->mutable_data<T>(dev_ctx.GetPlace());
-
-    auto dequant = DequantizeFunctor<DeviceContext, T>();
-    for (int64_t i = 0; i < in->dims()[0]; i++) {
-      framework::Tensor one_channel_in = in->Slice(i, i + 1);
-      framework::Tensor one_channel_out = out->Slice(i, i + 1);
-      framework::Tensor one_channel_scale = scales[0]->Slice(i, i + 1);
-      dequant(dev_ctx, &one_channel_in, &one_channel_scale,
-              static_cast<T>(max_range), &one_channel_out);
-    }
-
-    if (scales.size() == 2) {
+    int scale_num = scales.size();
+    if (scale_num == 1) {
+      PADDLE_ENFORCE_EQ(
+          scales[0]->numel(), in->dims()[0],
+          "The number of first scale values must be the same with "
+          "first dimension value of Input(X) when the `Scales` has only one "
+          "element.");
+      max_range *= (std::pow(2, quant_bits[0] - 1) - 1);
+    } else if (scale_num == 2) {
+      PADDLE_ENFORCE_EQ(
+          scales[0]->numel(), in->dims()[1],
+          "The number of first scale values must be the same with "
+          "second dimension value of Input(X) when the `Scales` has two "
+          "elements.");
       PADDLE_ENFORCE_EQ(
           scales[1]->numel(), 1,
           "The second scale tensor should only have one value at now.");
-      max_range = std::pow(2, quant_bits[1] - 1) - 1;
-      dequant(dev_ctx, out, scales[1], static_cast<T>(max_range), out);
+      max_range *= (std::pow(2, quant_bits[0] - 1) - 1) *
+                   (std::pow(2, quant_bits[1] - 1) - 1);
     }
+    ChannelDequantizeFunctor<DeviceContext, T>()(
+        dev_ctx, in, scales.data(), scale_num, static_cast<T>(max_range), out);
   }
 };
 
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index d51d51b4953073e9a350806f041bb3112fad239c..054ef4658cc0c4448d49870849017d3191d57db9 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -37,6 +37,21 @@ struct FindAbsMaxFunctor<platform::CPUDeviceContext, T> {
 
 template struct FindAbsMaxFunctor<platform::CPUDeviceContext, float>;
 
+template <typename T>
+struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const T* in,
+                  const int num, const int channel, T* out) {
+    const int channel_size = num / channel;
+    for (int i = 0; i < channel; i++) {
+      auto* start = in + i * channel_size;
+      auto* end = in + (i + 1) * channel_size;
+      out[i] = std::abs(*(std::max_element(start, end, Compare<T>())));
+    }
+  }
+};
+
+template struct FindChannelAbsMaxFunctor<platform::CPUDeviceContext, float>;
+
 template <typename T>
 struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& ctx,
@@ -53,6 +68,36 @@ struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
 
 template struct ClipAndFakeQuantFunctor<platform::CPUDeviceContext, float>;
 
+template <typename T>
+struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int channel,
+                  framework::Tensor* out) {
+    auto* scale_data = scale.data<T>();
+    auto* in_data = in.data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+    const int channel_size = in.numel() / channel;
+    platform::Transform<platform::CPUDeviceContext> trans;
+    for (int i = 0; i < channel; i++) {
+      T s = scale_data[i];
+      auto* start = in_data + i * channel_size;
+      auto* end = in_data + (i + 1) * channel_size;
+      trans(ctx, start, end, out_data + i * channel_size,
+            ClipFunctor<T>(-s, s));
+    }
+    for (int i = 0; i < channel; i++) {
+      T s = scale_data[i];
+      framework::Tensor one_channel_out = out->Slice(i, i + 1);
+      auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
+      out_e.device(*ctx.eigen_device()) = (bin_cnt / s * out_e).round();
+    }
+  }
+};
+
+template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
+                                               float>;
+
 template <typename T>
 struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& ctx,
@@ -169,10 +214,10 @@ class FakeChannelWiseQuantizeAbsMaxOp : public framework::OperatorWithKernel {
         ctx->HasOutput("Out"),
         "Output(Out) of FakeChannelWiseQuantizeOp should not be null.");
     PADDLE_ENFORCE(
-        ctx->HasOutput("OutScales"),
-        "Output(Scales) of FakeChannelWiseQuantizeOp should not be null.");
+        ctx->HasOutput("OutScale"),
+        "Output(Scale) of FakeChannelWiseQuantizeOp should not be null.");
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
-    ctx->SetOutputDim("OutScales", {ctx->GetInputDim("X")[0]});
+    ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[0]});
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 
@@ -192,7 +237,7 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker
     AddOutput("Out",
               "(Tensor) Output of quantized low level tensor, "
               "but also saved as float data type.");
-    AddOutput("OutScales", "(Tensor) Current channel wise scale");
+    AddOutput("OutScale", "(Tensor) Current channel wise scale");
     AddAttr<int>("bit_length", "(int, default 8)")
         .SetDefault(8)
         .AddCustomChecker([](const int& bit_length) {
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 3707f6772eac0d568c170d60c17d431e254d0b6b..33bd275e5cc507ec700b3694cd8b1df9672ec512 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -74,6 +74,45 @@ struct FindAbsMaxFunctor<platform::CUDADeviceContext, T> {
 
 template struct FindAbsMaxFunctor<platform::CUDADeviceContext, float>;
 
+template <typename T>
+__global__ void FindChannelAbsMaxKernel(const T* in, const int n, const int c,
+                                        T* out) {
+  int tid = threadIdx.x;
+  int channel_size = n / c;
+  const T* in_c = in + blockIdx.x * channel_size;
+  extern __shared__ T shared_max_data[];
+  shared_max_data[tid] = T(0);
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    T tmp = fabs(in_c[i]);
+    if (tmp > shared_max_data[tid]) {
+      shared_max_data[tid] = tmp;
+    }
+  }
+  __syncthreads();
+  for (int i = blockDim.x / 2; i > 0; i >>= 1) {
+    if (tid < i && (shared_max_data[tid] < shared_max_data[tid + i])) {
+      shared_max_data[tid] = shared_max_data[tid + i];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {
+    out[blockIdx.x] = shared_max_data[0];
+  }
+}
+
+template <typename T>
+struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx, const T* in,
+                  const int num, const int channel, T* out) {
+    int block = 1024;
+    int grid = channel;
+    FindChannelAbsMaxKernel<T><<<grid, block, 1024 * sizeof(T), ctx.stream()>>>(
+        in, num, channel, out);
+  }
+};
+
+template struct FindChannelAbsMaxFunctor<platform::CUDADeviceContext, float>;
+
 template <typename T>
 __global__ void ClipAndQuantKernel(const T* in, const T* scale,
                                    const int bin_cnt, const int n, T* out) {
@@ -82,14 +121,76 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale,
 
   T s = scale[0];
   for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
-    T x = in[bid];
+    T x = in[i];
     T v = x > s ? s : x;
     v = v < -s ? -s : v;
     v = bin_cnt / s * v;
-    out[bid] = round(v);
+    out[i] = round(v);
   }
 }
 
+template <typename T>
+struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, framework::Tensor* out) {
+    int num = in.numel();
+    int block = 1024;
+    int grid = (block - 1 + num) / block;
+
+    const T* in_data = in.data<T>();
+    const T* scale_data = scale.data<T>();
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
+        in_data, scale_data, bin_cnt, num, out_data);
+  }
+};
+
+template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
+
+template <typename T>
+__global__ void ChannelClipAndQuantKernel(const T* in, const T* scale,
+                                          const int bin_cnt, const int n,
+                                          const int c, T* out) {
+  int tid = threadIdx.x;
+
+  int channel_size = n / c;
+  const T* in_c = in + blockIdx.x * channel_size;
+  T* out_c = out + blockIdx.x * channel_size;
+
+  T s = scale[blockIdx.x];
+  for (int i = tid; i < channel_size; i += blockDim.x) {
+    T x = in_c[i];
+    T v = x > s ? s : x;
+    v = v < -s ? -s : v;
+    v = bin_cnt / s * v;
+    out_c[i] = round(v);
+  }
+}
+
+template <typename T>
+struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in, const framework::Tensor& scale,
+                  const int bin_cnt, const int channel,
+                  framework::Tensor* out) {
+    int num = in.numel();
+    int block = 1024;
+    int grid = channel;
+
+    const T* in_data = in.data<T>();
+    const T* scale_data = scale.data<T>();
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    ChannelClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
+        in_data, scale_data, bin_cnt, num, channel, out_data);
+  }
+};
+
+template struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext,
+                                               float>;
+
 template <typename T>
 __global__ void FindRangeAbsMaxAndFillArray(const T* cur_scale,
                                             const T* last_scale,
@@ -182,26 +283,6 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
 template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
                                                float>;
 
-template <typename T>
-struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& ctx,
-                  const framework::Tensor& in, const framework::Tensor& scale,
-                  const int bin_cnt, framework::Tensor* out) {
-    int num = in.numel();
-    int block = 1024;
-    int grid = (block - 1 + num) / block;
-
-    const T* in_data = in.data<T>();
-    const T* scale_data = scale.data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
-
-    ClipAndQuantKernel<T><<<grid, block, 0, ctx.stream()>>>(
-        in_data, scale_data, bin_cnt, num, out_data);
-  }
-};
-
-template struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, float>;
-
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index ec667e89e7699d87db9423f17014a2761ce62763..5ab38b086df7f9df33996ec83b5ec07047c204ba 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -42,6 +42,19 @@ struct FindRangeAbsMaxFunctor {
                   framework::Tensor* scales_arr, framework::Tensor* out_scale);
 };
 
+template <typename DeviceContext, typename T>
+struct FindChannelAbsMaxFunctor {
+  void operator()(const DeviceContext& ctx, const T* in, const int num,
+                  const int channel, T* out);
+};
+
+template <typename DeviceContext, typename T>
+struct ChannelClipAndFakeQuantFunctor {
+  void operator()(const DeviceContext& ctx, const framework::Tensor& in,
+                  const framework::Tensor& scale, const int bin_cnt,
+                  const int channel, framework::Tensor* out);
+};
+
 template <typename DeviceContext, typename T>
 struct FindMovingAverageAbsMaxFunctor {
   void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
@@ -78,29 +91,18 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
     auto* in = context.Input<framework::Tensor>("X");
 
     auto* out = context.Output<framework::Tensor>("Out");
-    auto* out_scales = context.Output<framework::Tensor>("OutScales");
-    T* out_scales_data = out_scales->mutable_data<T>(context.GetPlace());
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
     out->mutable_data<T>(context.GetPlace());
 
     int bit_length = context.Attr<int>("bit_length");
     int bin_cnt = std::pow(2, bit_length - 1) - 1;
 
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto find_abs_max = FindAbsMaxFunctor<DeviceContext, T>();
-    for (int64_t i = 0; i < in->dims()[0]; i++) {
-      framework::Tensor one_channel = in->Slice(i, i + 1);
-      const T* one_channel_data = one_channel.data<T>();
-      find_abs_max(dev_ctx, one_channel_data, one_channel.numel(),
-                   &out_scales_data[i]);
-    }
-    auto clip_quant = ClipAndFakeQuantFunctor<DeviceContext, T>();
-    for (int64_t i = 0; i < in->dims()[0]; i++) {
-      framework::Tensor one_channel_in = in->Slice(i, i + 1);
-      framework::Tensor one_channel_out = out->Slice(i, i + 1);
-      framework::Tensor one_channel_scale = out_scales->Slice(i, i + 1);
-      clip_quant(dev_ctx, one_channel_in, one_channel_scale, bin_cnt,
-                 &one_channel_out);
-    }
+    FindChannelAbsMaxFunctor<DeviceContext, T>()(
+        dev_ctx, in->data<T>(), in->numel(), in->dims()[0], out_scale_data);
+    ChannelClipAndFakeQuantFunctor<DeviceContext, T>()(
+        dev_ctx, *in, *out_scale, bin_cnt, in->dims()[0], out);
   }
 };
 
diff --git a/paddle/fluid/operators/fsp_op.cc b/paddle/fluid/operators/fsp_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fbe8e56a6160219175bd573a2ff186eb35e56fdf
--- /dev/null
+++ b/paddle/fluid/operators/fsp_op.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fsp_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FSPOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of FSPOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of FSPOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FSPOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE(
+        x_dims.size() == 4,
+        "The Input(X) must have shape [batch_size, channel, height, width].");
+    PADDLE_ENFORCE(
+        y_dims.size() == 4,
+        "The Input(Y) must have shape [batch_size, channel, height, width].");
+    PADDLE_ENFORCE(
+        (x_dims[2] == y_dims[2]) && (x_dims[3] == y_dims[3]),
+        "The Input(X) and Input(Y) should have the same height and width.");
+
+    ctx->SetOutputDim("Out", {x_dims[0], x_dims[1], y_dims[1]});
+    ctx->ShareLoD("X", "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context(), layout_, library_);
+  }
+};
+
+class FSPOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor) The input of FSP op with shape [batch_size, x_channel, "
+             "height, width]");
+    AddInput("Y",
+             "(Tensor) The input of FSP op with shape"
+             "[batch_size, y_channel, height, width]."
+             "The y_channel can be different with the x_channel of Input(X)"
+             " while the other dimensions must be the same with Input(X)'s.");
+    AddOutput(
+        "Out",
+        "(Tensor) The output of FSP op with shape "
+        "[batch_size, x_channel, y_channel]. The x_channel is the channel "
+        "of Input(X) and the y_channel is the channel of Input(Y).");
+    AddComment(R"DOC(
+    This op is used to calculate the flow of solution procedure (FSP) matrix of two feature maps.
+    Given feature map x with shape [x_channel, h, w] and feature map y with shape
+    [y_channel, h, w], we can get the fsp matrix of x and y in two steps:
+
+        step 1: reshape x into matrix with shape [x_channel, h * w] and reshape and
+                transpose y into matrix with shape [h * w, y_channel]
+        step 2: multiply x and y to get fsp matrix with shape [x_channel, y_channel]
+
+    The output is a batch of fsp matrices.
+    )DOC");
+  }
+};
+
+class FSPOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fsp, ops::FSPOp, ops::FSPOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(fsp_grad, ops::FSPOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    fsp, ops::FSPOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FSPOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    fsp_grad, ops::FSPGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FSPGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/fsp_op.cu b/paddle/fluid/operators/fsp_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4fd7ba04ff9af1806963427ad58c68fc216e82ac
--- /dev/null
+++ b/paddle/fluid/operators/fsp_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/fsp_op.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(fsp, ops::FSPOpKernel<plat::CUDADeviceContext, float>,
+                        ops::FSPOpKernel<plat::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(fsp_grad,
+                        ops::FSPGradOpKernel<plat::CUDADeviceContext, float>,
+                        ops::FSPGradOpKernel<plat::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/fsp_op.h b/paddle/fluid/operators/fsp_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..544af2b7d9b9729fe5dce08793da6c983fbcc6fa
--- /dev/null
+++ b/paddle/fluid/operators/fsp_op.h
@@ -0,0 +1,136 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class FSPOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Input<Tensor>("Y");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+    auto x_dims = x->dims();
+    auto y_dims = y->dims();
+
+    auto batch_size = x_dims[0];
+    auto x_channel = x_dims[1];
+    auto y_channel = y_dims[1];
+    auto height = x_dims[2];
+    auto width = x_dims[3];
+
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+
+    math::MatDescriptor x_mat_desc;
+    x_mat_desc.height_ = x_channel;
+    x_mat_desc.width_ = height * width;
+    x_mat_desc.batch_size_ = batch_size;
+    x_mat_desc.stride_ = x_channel * height * width;
+
+    math::MatDescriptor y_mat_desc;
+    y_mat_desc.height_ = height * width;
+    y_mat_desc.width_ = y_channel;
+    y_mat_desc.batch_size_ = batch_size;
+    y_mat_desc.stride_ = y_channel * height * width;
+    y_mat_desc.trans_ = true;
+
+    blas.MatMul(*x, x_mat_desc, *y, y_mat_desc,
+                static_cast<T>(1.0 / (height * width)), output,
+                static_cast<T>(0.0));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FSPGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_y = context.Output<Tensor>(framework::GradVarName("Y"));
+    if (d_x == nullptr && d_y == nullptr) {
+      return;
+    }
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto d_out_dims = d_out->dims();
+    auto batch_size = d_out_dims[0];
+    auto x_channel = d_out_dims[1];
+    auto y_channel = d_out_dims[2];
+    int64_t h = 0;
+    int64_t w = 0;
+
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    math::SetConstant<DeviceContext, T> set_zero;
+    if (d_x != nullptr) {
+      d_x->mutable_data<T>(context.GetPlace());
+      set_zero(context.template device_context<DeviceContext>(), d_x,
+               static_cast<T>(0));
+      auto* y = context.Input<Tensor>("Y");
+      auto y_dims = y->dims();
+      h = y_dims[2];
+      w = y_dims[3];
+
+      math::MatDescriptor d_out_mat_desc;
+      d_out_mat_desc.height_ = x_channel;
+      d_out_mat_desc.width_ = y_channel;
+      d_out_mat_desc.batch_size_ = batch_size;
+      d_out_mat_desc.stride_ = x_channel * y_channel;
+
+      math::MatDescriptor y_mat_desc;
+      y_mat_desc.height_ = y_channel;
+      y_mat_desc.width_ = h * w;
+      y_mat_desc.batch_size_ = batch_size;
+      y_mat_desc.stride_ = y_channel * h * w;
+
+      blas.MatMul(*d_out, d_out_mat_desc, *y, y_mat_desc,
+                  static_cast<T>(1.0 / (h * w)), d_x, static_cast<T>(0.0));
+    }
+
+    if (d_y != nullptr) {
+      d_y->mutable_data<T>(context.GetPlace());
+      set_zero(context.template device_context<DeviceContext>(), d_y,
+               static_cast<T>(0));
+      auto* x = context.Input<Tensor>("X");
+      auto x_dims = x->dims();
+      h = x_dims[2];
+      w = x_dims[3];
+
+      math::MatDescriptor d_out_mat_desc;
+      d_out_mat_desc.height_ = y_channel;
+      d_out_mat_desc.width_ = x_channel;
+      d_out_mat_desc.batch_size_ = batch_size;
+      d_out_mat_desc.stride_ = x_channel * y_channel;
+      d_out_mat_desc.trans_ = true;
+
+      math::MatDescriptor x_mat_desc;
+      x_mat_desc.height_ = x_channel;
+      x_mat_desc.width_ = h * w;
+      x_mat_desc.batch_size_ = batch_size;
+      x_mat_desc.stride_ = x_channel * h * w;
+
+      blas.MatMul(*d_out, d_out_mat_desc, *x, x_mat_desc,
+                  static_cast<T>(1.0 / (h * w)), d_y, static_cast<T>(0.0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index f5c802986e0573e81b3ab6187b57657b52b37215..2948cf71a911b296f8cee7ff9a2fb75f644dbe71 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -11,89 +11,27 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/operators/load_combine_op.h"
 
 namespace paddle {
 namespace operators {
 
-class LoadCombineOp : public framework::OperatorBase {
+class LoadCombineOp : public framework::OperatorWithKernel {
  public:
-  LoadCombineOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto filename = Attr<std::string>("file_path");
-    auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    auto model_from_memory = Attr<bool>("model_from_memory");
-    auto out_var_names = Outputs("Out");
-    PADDLE_ENFORCE_GT(
-        static_cast<int>(out_var_names.size()), 0,
-        "The number of output variables should be greater than 0.");
-    if (!model_from_memory) {
-      std::ifstream fin(filename, std::ios::binary);
-      PADDLE_ENFORCE(static_cast<bool>(fin),
-                     "Cannot open file %s for load_combine op", filename);
-      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
-    } else {
-      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
-      std::stringstream fin(filename, std::ios::in | std::ios::binary);
-      LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
-    }
-  }
-  void LoadParamsFromBuffer(
-      const framework::Scope &scope, const platform::Place &place,
-      std::istream *buffer, bool load_as_fp16,
-      const std::vector<std::string> &out_var_names) const {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    for (size_t i = 0; i < out_var_names.size(); i++) {
-      auto *out_var = scope.FindVar(out_var_names[i]);
-
-      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
-                     out_var_names[i]);
-
-      auto *tensor = out_var->GetMutable<framework::LoDTensor>();
-
-      // Error checking
-      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
-
-      // Get data from fin to tensor
-      DeserializeFromStream(*buffer, tensor, dev_ctx);
-
-      auto in_dtype = tensor->type();
-      auto out_dtype =
-          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-      if (in_dtype != out_dtype) {
-        // convert to float16 tensor
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor fp16_tensor;
-        // copy LoD info to the new tensor
-        fp16_tensor.set_lod(tensor->lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                                 &fp16_tensor);
-
-        // reset output tensor
-        out_var->Clear();
-        tensor = out_var->GetMutable<framework::LoDTensor>();
-        tensor->set_lod(fp16_tensor.lod());
-        tensor->ShareDataWith(fp16_tensor);
-      }
-    }
-    buffer->peek();
-    PADDLE_ENFORCE(buffer->eof(),
-                   "You are not allowed to load partial data via "
-                   "load_combine_op, use load_op instead.");
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::proto::VarType::FP32, ctx.GetPlace());
+    return kt;
   }
 };
 
@@ -124,21 +62,30 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 LoadCombine Operator.
 
-LoadCombine operator loads LoDTensor variables from a file, which could be 
-loaded in memory already. The file should contain one or more LoDTensors 
+LoadCombine operator loads LoDTensor variables from a file, which could be
+loaded in memory already. The file should contain one or more LoDTensors
 serialized using the SaveCombine operator. The
-LoadCombine operator applies a deserialization strategy to appropriately load 
-the LodTensors, and this strategy complements the serialization strategy used 
+LoadCombine operator applies a deserialization strategy to appropriately load
+the LodTensors, and this strategy complements the serialization strategy used
 in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
-with the SaveCombine operator, and can only deserialize one or more LoDTensors 
+with the SaveCombine operator, and can only deserialize one or more LoDTensors
 that were saved using the SaveCombine operator.
 
 )DOC");
   }
 };
+
 }  // namespace operators
 }  // namespace paddle
+
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
                   ops::LoadCombineOpProtoMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_combine_op.cu b/paddle/fluid/operators/load_combine_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2a42c0daa7fc58165e85d851c602a65ec287c905
--- /dev/null
+++ b/paddle/fluid/operators/load_combine_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_combine_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    load_combine,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::LoadCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f620ba7d2f1c2797ad4fd76a16af9aeee9c2806
--- /dev/null
+++ b/paddle/fluid/operators/load_combine_op.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class LoadCombineOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    auto filename = ctx.Attr<std::string>("file_path");
+    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
+    auto model_from_memory = ctx.Attr<bool>("model_from_memory");
+    auto &out_var_names = ctx.Outputs("Out");
+
+    PADDLE_ENFORCE_GT(
+        static_cast<int>(out_var_names.size()), 0,
+        "The number of output variables should be greater than 0.");
+    if (!model_from_memory) {
+      std::ifstream fin(filename, std::ios::binary);
+      PADDLE_ENFORCE(static_cast<bool>(fin),
+                     "Cannot open file %s for load_combine op", filename);
+      LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
+    } else {
+      PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
+      std::stringstream fin(filename, std::ios::in | std::ios::binary);
+      LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names);
+    }
+  }
+
+  void LoadParamsFromBuffer(
+      const framework::ExecutionContext &context, const platform::Place &place,
+      std::istream *buffer, bool load_as_fp16,
+      const std::vector<std::string> &out_var_names) const {
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    auto out_vars = context.MultiOutputVar("Out");
+
+    for (size_t i = 0; i < out_var_names.size(); i++) {
+      PADDLE_ENFORCE(out_vars[i] != nullptr,
+                     "Output variable %s cannot be found", out_var_names[i]);
+
+      auto *tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
+
+      // Error checking
+      PADDLE_ENFORCE(static_cast<bool>(*buffer), "Cannot read more");
+
+      // Get data from fin to tensor
+      DeserializeFromStream(*buffer, tensor, dev_ctx);
+
+      auto in_dtype = tensor->type();
+      auto out_dtype =
+          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+      if (in_dtype != out_dtype) {
+        // convert to float16 tensor
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor fp16_tensor;
+        // copy LoD info to the new tensor
+        fp16_tensor.set_lod(tensor->lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                                 &fp16_tensor);
+
+        // reset output tensor
+        out_vars[i]->Clear();
+        tensor = out_vars[i]->GetMutable<framework::LoDTensor>();
+        tensor->set_lod(fp16_tensor.lod());
+        tensor->ShareDataWith(fp16_tensor);
+      }
+    }
+    buffer->peek();
+    PADDLE_ENFORCE(buffer->eof(),
+                   "You are not allowed to load partial data via "
+                   "load_combine_op, use load_op instead.");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc
index 4bce4eba22e4a8900f8d12454fd233e17c9ad617..2d8e6ca854b55e01dacd1e0e7898ba59ea6078dc 100644
--- a/paddle/fluid/operators/load_op.cc
+++ b/paddle/fluid/operators/load_op.cc
@@ -11,89 +11,26 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <fstream>
 
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/profiler.h"
+#include <string>
+
+#include "paddle/fluid/operators/load_op.h"
 
 namespace paddle {
 namespace operators {
 
-class LoadOp : public framework::OperatorBase {
+class LoadOp : public framework::OperatorWithKernel {
  public:
-  LoadOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    auto filename = Attr<std::string>("file_path");
-    std::ifstream fin(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
-                   filename);
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-    auto out_var_name = Output("Out");
-    auto *out_var = scope.FindVar(out_var_name);
-    PADDLE_ENFORCE(out_var != nullptr,
-                   "Output variable %s cannot be found in scope %p",
-                   out_var_name, &scope);
+  void InferShape(framework::InferShapeContext *ctx) const override {}
 
-    if (out_var->IsType<framework::LoDTensor>()) {
-      LoadLodTensor(fin, place, out_var);
-    } else if (out_var->IsType<framework::SelectedRows>()) {
-      LoadSelectedRows(fin, place, out_var);
-    } else {
-      PADDLE_ENFORCE(
-          false,
-          "Load only support LoDTensor and SelectedRows, %s has wrong type",
-          out_var_name);
-    }
-  }
-
-  void LoadLodTensor(std::istream &fin, const platform::Place &place,
-                     framework::Variable *var) const {
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    auto *tensor = var->GetMutable<framework::LoDTensor>();
-    DeserializeFromStream(fin, tensor, dev_ctx);
-
-    auto load_as_fp16 = Attr<bool>("load_as_fp16");
-    auto in_dtype = tensor->type();
-    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-    if (in_dtype != out_dtype) {
-      // convert to float16 tensor
-      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-      framework::LoDTensor fp16_tensor;
-      // copy LoD info to the new tensor
-      fp16_tensor.set_lod(tensor->lod());
-      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
-                               &fp16_tensor);
-
-      // reset output tensor
-      var->Clear();
-      tensor = var->GetMutable<framework::LoDTensor>();
-      tensor->set_lod(fp16_tensor.lod());
-      tensor->ShareDataWith(fp16_tensor);
-    }
-  }
-
-  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
-                        framework::Variable *var) const {
-    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
-    selectedRows->SyncIndex();
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = framework::OpKernelType(
+        framework::proto::VarType::FP32, platform::CPUPlace());
+    return kt;
   }
 };
 
@@ -116,8 +53,15 @@ class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         "file.");
   }
 };
+
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_op.cu b/paddle/fluid/operators/load_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..90f78110f8f349ebc834570c4fb9f15af24b144d
--- /dev/null
+++ b/paddle/fluid/operators/load_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/load_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    load, ops::LoadOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::LoadOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3bf3c6bed2f0ddf352a2bad65b0d710097016b28
--- /dev/null
+++ b/paddle/fluid/operators/load_op.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <fstream>
+#include <string>
+
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class LoadOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    auto filename = ctx.Attr<std::string>("file_path");
+    std::ifstream fin(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
+                   filename);
+
+    auto out_var_name = ctx.Outputs("Out").data();
+    auto *out_var = ctx.OutputVar("Out");
+
+    PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found ",
+                   out_var_name);
+
+    PADDLE_ENFORCE(out_var != nullptr, "Output variable cannot be found ");
+
+    if (out_var->IsType<framework::LoDTensor>()) {
+      LoadLodTensor(fin, place, out_var, ctx);
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      LoadSelectedRows(fin, place, out_var);
+    } else {
+      PADDLE_ENFORCE(
+          false,
+          "Load only support LoDTensor and SelectedRows, %s has wrong type",
+          out_var_name);
+    }
+  }
+
+  void LoadLodTensor(std::istream &fin, const platform::Place &place,
+                     framework::Variable *var,
+                     const framework::ExecutionContext &ctx) const {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    auto *tensor = var->GetMutable<framework::LoDTensor>();
+    DeserializeFromStream(fin, tensor, dev_ctx);
+
+    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
+    auto in_dtype = tensor->type();
+    auto out_dtype = load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+    if (in_dtype != out_dtype) {
+      // convert to float16 tensor
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor fp16_tensor;
+      // copy LoD info to the new tensor
+      fp16_tensor.set_lod(tensor->lod());
+      framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                               &fp16_tensor);
+
+      // reset output tensor
+      var->Clear();
+      tensor = var->GetMutable<framework::LoDTensor>();
+      tensor->set_lod(fp16_tensor.lod());
+      tensor->ShareDataWith(fp16_tensor);
+    }
+  }
+
+  void LoadSelectedRows(std::istream &fin, const platform::Place &place,
+                        framework::Variable *var) const {
+    auto *selectedRows = var->GetMutable<framework::SelectedRows>();
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
+    selectedRows->SyncIndex();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
index 54c6a71111a2cc2f9e5004922ae5d3541a9d0a70..97387af92ffbd123ae6e795f17ef2273dadeab9d 100644
--- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <memory>
 #include "paddle/fluid/operators/concat_op.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace operators {
@@ -38,15 +39,20 @@ static void EnforceLayouts(const std::vector<const Tensor*> inputs) {
 }
 
 static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
-                                                const mkldnn::engine& engine) {
-  constexpr auto data_type = mkldnn::memory::f32;
+                                                const mkldnn::engine& engine,
+                                                const memory::data_type& dt) {
   const auto dims = paddle::framework::vectorize2int(input.dims());
   const auto format = input.format();
-  auto description = memory::desc(dims, data_type, format);
+  auto description = memory::desc(dims, dt, format);
   auto mem_prim_desc = memory::primitive_desc(description, engine);
   return mem_prim_desc;
 }
 
+static mkldnn::memory::format GetDstMemFormat(
+    const concat::primitive_desc& concat_pd) {
+  return (memory::format)concat_pd.dst_primitive_desc().desc().data.format;
+}
+
 static platform::CPUPlace GetCpuPlace(
     const paddle::framework::ExecutionContext& ctx) {
   auto place = ctx.GetPlace();
@@ -61,14 +67,30 @@ static const mkldnn::engine& GetMKLDNNEngine(
   return dev_ctx.GetEngine();
 }
 
+std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
+                      const std::vector<const Tensor*> multi_input,
+                      const int64_t& concat_axis, const memory::data_type& dt) {
+  std::string key;
+  key.reserve(platform::MKLDNNHandler::MaxKeyLength);
+  for (size_t i = 0; i < multi_input.size(); i++) {
+    platform::MKLDNNHandler::AppendKeyDims(
+        &key, paddle::framework::vectorize2int(multi_input[i]->dims()));
+  }
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(concat_axis));
+  platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Out"));
+  platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
+  return key;
+}
+
 template <typename T>
 class ConcatPrimitiveFactory {
  public:
   concat::primitive_desc CreateConcatPrimDescriptor(
       const std::vector<const Tensor*> multi_input, Tensor* output,
-      int concat_axis, const mkldnn::engine& mkldnn_engine) {
-    CreateSourcesDescriptors(multi_input, mkldnn_engine);
-    auto dst_desc = CreateDstMemDescriptor(output);
+      int concat_axis, const mkldnn::engine& mkldnn_engine,
+      const memory::data_type& dt = memory::data_type::f32) {
+    CreateSourcesDescriptors(multi_input, mkldnn_engine, dt);
+    auto dst_desc = CreateDstMemDescriptor(output, dt);
     return concat::primitive_desc(dst_desc, concat_axis, srcs_pd);
   }
 
@@ -79,23 +101,39 @@ class ConcatPrimitiveFactory {
     return concat(concat_pd, inputs, dst_mem.get());
   }
 
+  void SetSrcDataHandleByIndex(const std::vector<memory>& srcs, const size_t& i,
+                               void* handler) {
+    srcs[i].set_data_handle(handler);
+  }
+
+  void SetDstDataHandle(const memory& dst_mem, void* handler) {
+    dst_mem.set_data_handle(handler);
+  }
+
+  std::vector<memory> GetSrcs() { return srcs; }
+
+  memory GetDst() { return dst_mem.get(); }
+
  private:
-  memory::desc CreateDstMemDescriptor(Tensor* output) {
+  memory::desc CreateDstMemDescriptor(Tensor* output,
+                                      const memory::data_type& dt) {
     auto dst_dims = paddle::framework::vectorize2int(output->dims());
-    return memory::desc(dst_dims, platform::MKLDNNGetDataType<T>(),
-                        memory::format::any);
+    return memory::desc(dst_dims, dt, memory::format::any);
   }
 
   mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd,
-                                 Tensor* output, platform::CPUPlace place) {
+                                 Tensor* output,
+                                 const platform::CPUPlace& place) {
     return memory(concat_pd.dst_primitive_desc(),
                   output->mutable_data<T>(place));
   }
 
   void CreateSourcesDescriptors(const std::vector<const Tensor*> multi_input,
-                                const mkldnn::engine& mkldnn_engine) {
+                                const mkldnn::engine& mkldnn_engine,
+                                const memory::data_type& dt) {
     for (size_t i = 0; i < multi_input.size(); i++) {
-      auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine);
+      auto mem_prim_desc =
+          CreateMemPrimDesc(*multi_input[i], mkldnn_engine, dt);
       srcs_pd.push_back(mem_prim_desc);
       srcs.push_back(
           memory(mem_prim_desc, to_void_cast(multi_input[i]->data<T>())));
@@ -120,21 +158,59 @@ template <typename T>
 class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  public:
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    auto place = GetCpuPlace(ctx);
-    const auto& mkldnn_engine = GetMKLDNNEngine(ctx);
-
     auto multi_input = ctx.MultiInput<Tensor>("X");
     EnforceLayouts(multi_input);
     Tensor* output = ctx.Output<Tensor>("Out");
     int64_t concat_axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    auto place = GetCpuPlace(ctx);
+
+    memory::data_type dt =
+        paddle::framework::ToMKLDNNDataType(multi_input[0]->type());
 
     ConcatPrimitiveFactory<T> prim_creator;
-    auto concat_pd = prim_creator.CreateConcatPrimDescriptor(
-        multi_input, output, static_cast<int>(concat_axis), mkldnn_engine);
-    auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place);
-    stream(stream::kind::eager).submit({concat}).wait();
+    std::string key = CreateKey(ctx, multi_input, concat_axis, dt);
+    const std::string key_prim = key + "@concat_p";
+    const std::string key_concat_pd = key + "@concat_pd";
+    const std::string key_srcs = key + "@concat_srcs";
+    const std::string key_dst = key + "@concat_dst";
+
+    std::shared_ptr<concat::primitive_desc> concat_pd;
+    std::shared_ptr<std::vector<memory>> srcs;
+    std::shared_ptr<memory> dst_mem;
+    auto concat_p = std::static_pointer_cast<concat>(dev_ctx.GetBlob(key_prim));
+
+    if (concat_p == nullptr) {
+      const auto& mkldnn_engine = dev_ctx.GetEngine();
+      concat_pd = std::make_shared<concat::primitive_desc>(
+          prim_creator.CreateConcatPrimDescriptor(multi_input, output,
+                                                  static_cast<int>(concat_axis),
+                                                  mkldnn_engine, dt));
+      concat_p = std::make_shared<concat>(
+          prim_creator.CreateConcatPrimitive(*concat_pd, output, place));
+      srcs = std::make_shared<std::vector<memory>>(prim_creator.GetSrcs());
+      dst_mem = std::make_shared<memory>(prim_creator.GetDst());
+      dev_ctx.SetBlob(key_prim, concat_p);
+      dev_ctx.SetBlob(key_concat_pd, concat_pd);
+      dev_ctx.SetBlob(key_srcs, srcs);
+      dev_ctx.SetBlob(key_dst, dst_mem);
+    } else {
+      srcs = std::static_pointer_cast<std::vector<memory>>(
+          dev_ctx.GetBlob(key_srcs));
+      dst_mem = std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_dst));
+      concat_pd = std::static_pointer_cast<concat::primitive_desc>(
+          dev_ctx.GetBlob(key_concat_pd));
+      for (size_t i = 0; i < multi_input.size(); i++) {
+        prim_creator.SetSrcDataHandleByIndex(
+            *srcs, i, to_void_cast<T>(multi_input[i]->data<T>()));
+      }
+      prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data<T>(place));
+    }
+
+    stream(stream::kind::eager).submit({*concat_p}).wait();
 
-    output->set_mkldnn_prim_desc(concat_pd.dst_primitive_desc());
+    output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc());
   }
 };
 }  // namespace operators
@@ -143,4 +219,6 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 namespace ops = paddle::operators;
 
 REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ConcatMKLDNNOpKernel<float>)
+                   ops::ConcatMKLDNNOpKernel<float>,
+                   ops::ConcatMKLDNNOpKernel<int8_t>,
+                   ops::ConcatMKLDNNOpKernel<uint8_t>);
diff --git a/paddle/fluid/operators/range_op.cc b/paddle/fluid/operators/range_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee8c68fd008c8c9764e9ef74dc37fa08cf31be19
--- /dev/null
+++ b/paddle/fluid/operators/range_op.cc
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/range_op.h"
+
+namespace paddle {
+namespace operators {
+
+class RangeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    if (ctx->HasInput("Start")) {
+      auto s_dims = ctx->GetInputDim("Start");
+      PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1),
+                     "The shape of Input(Start) should be [1].");
+    }
+    if (ctx->HasInput("End")) {
+      auto e_dims = ctx->GetInputDim("End");
+      PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1),
+                     "The shape of Input(End) should be [1].");
+    }
+    if (ctx->HasInput("Step")) {
+      auto step_dims = ctx->GetInputDim("Step");
+      PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1),
+                     "The shape of Input(Step) should be [1].");
+    }
+    ctx->SetOutputDim("Out", {-1});
+  }
+};
+
+class RangeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Start",
+             "Start of interval. The interval includes this value. It is a "
+             "tensor with shape=[1].");
+    AddInput("End",
+             "End of interval. The interval does not include this value, "
+             "except in some cases where step is not an integer and floating "
+             "point round-off affects the length of out. It is a tensor with "
+             "shape=[1].");
+    AddInput("Step", "Spacing between values. It is a tensor with shape=[1].");
+    AddOutput("Out", "A sequence of numbers.");
+    AddComment(R"DOC(
+    Return evenly spaced values within a given interval. Values are generated within the half-open interval [start, stop) (in other words, the interval including start but excluding stop). Like arange function of numpy.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(range, ops::RangeOp, ops::RangeOpMaker);
+REGISTER_OP_CPU_KERNEL(range, ops::CPURangeKernel<int>,
+                       ops::CPURangeKernel<float>, ops::CPURangeKernel<double>,
+                       ops::CPURangeKernel<int64_t>);
diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e2c03716d55ee41ce3a9053b48b5c6d4c70e391f
--- /dev/null
+++ b/paddle/fluid/operators/range_op.cu
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/range_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void RangeKernel(T start, T step, int64_t size, T* out) {
+  CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+}
+
+template <typename T>
+class CUDARangeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* start_t = context.Input<framework::Tensor>("Start");
+    auto* end_t = context.Input<framework::Tensor>("End");
+    auto* step_t = context.Input<framework::Tensor>("Step");
+    auto* out = context.Output<framework::Tensor>("Out");
+
+    framework::Tensor n;
+    framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
+    T start = n.data<T>()[0];
+    framework::TensorCopy(*end_t, platform::CPUPlace(), &n);
+    T end = n.data<T>()[0];
+    framework::TensorCopy(*step_t, platform::CPUPlace(), &n);
+    T step = n.data<T>()[0];
+
+    int64_t size = 0;
+    GetSize(start, end, step, &size);
+    out->Resize(framework::make_ddim({size}));
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+
+    auto stream = context.cuda_device_context().stream();
+    int block = 512;
+    int grid = (size + block - 1) / block;
+    RangeKernel<T><<<grid, block, 0, stream>>>(start, step, size, out_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(range, ops::CUDARangeKernel<int>,
+                        ops::CUDARangeKernel<int64_t>,
+                        ops::CUDARangeKernel<float>,
+                        ops::CUDARangeKernel<double>);
diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..fce58b45c96ad76dfdd4ed7f54becde327070002
--- /dev/null
+++ b/paddle/fluid/operators/range_op.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <functional>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+void GetSize(T start, T end, T step, int64_t* size) {
+  PADDLE_ENFORCE(!std::equal_to<T>()(step, 0),
+                 "The step of range op should not be 0.");
+  PADDLE_ENFORCE(((start < end) && (step > 0)) || ((start > end) && (step < 0)),
+                 "The step should be greater than 0 while start < end. And the "
+                 "step should be less than 0 while start > end.");
+  *size = std::is_integral<T>::value
+              ? ((std::abs(end - start) + std::abs(step) - 1) / std::abs(step))
+              : std::ceil(std::abs((end - start) / step));
+}
+
+template <typename T>
+class CPURangeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
+    T end = context.Input<framework::Tensor>("End")->data<T>()[0];
+    T step = context.Input<framework::Tensor>("Step")->data<T>()[0];
+    auto* out = context.Output<framework::Tensor>("Out");
+    int64_t size = 0;
+    GetSize(start, end, step, &size);
+    out->Resize(framework::make_ddim({size}));
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    T value = start;
+    for (int64_t i = 0; i < size; ++i) {
+      out_data[i] = value;
+      value += step;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 7c284312df912ad758f6fffc44f111dfe765feb8..5ee1206175600cd668ccbbf5b98053708a4406d3 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -17,7 +17,9 @@ function(reader_library TARGET_NAME)
         PARENT_SCOPE)
 endfunction()
 
+cc_library(py_reader SRCS py_reader.cc DEPS reader)
 cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
+
 reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
 reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
 reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
@@ -26,7 +28,7 @@ reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_o
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
 reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
 reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
-reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
+reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader)
 
 if (NOT WIN32 AND NOT ON_INFER)
     cc_library(ctr_reader SRCS ctr_reader.cc DEPS gzstream reader zlib)
@@ -38,7 +40,7 @@ cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
 # Export local libraries to parent
 # set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
 
-op_library(read_op)
+op_library(read_op DEPS py_reader buffered_reader)
 
 foreach(src ${LOCAL_READER_LIBS})
     set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs")
diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h
index 51b980acb5a08d431d96a3a92479dec09119c27e..78d238aa6115265023d5d87c01048a87180448d0 100644
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -16,6 +16,7 @@
 
 #include <condition_variable>  // NOLINT
 #include <deque>
+#include <utility>
 
 #include "paddle/fluid/platform/enforce.h"
 
@@ -34,7 +35,7 @@ class BlockingQueue {
   explicit BlockingQueue(size_t capacity, bool speed_test_mode = false)
       : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) {
     PADDLE_ENFORCE_GT(
-        capacity_, 0,
+        capacity_, static_cast<size_t>(0),
         "The capacity of a reader::BlockingQueue must be greater than 0.");
   }
 
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index 134807092d59329ce93381da67a98b8230db5767..c24e9aedc4ebd8f4fa9e483b1c1cc71fe0bf0aa7 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -30,8 +30,10 @@ BufferedReader::~BufferedReader() {
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    PADDLE_ENFORCE(cudaStreamDestroy(stream));
-    for (auto &event : events) PADDLE_ENFORCE(cudaEventDestroy(event));
+    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+    for (auto &event : events_) {
+      PADDLE_ENFORCE(cudaEventDestroy(event));
+    }
   }
 #endif
 }
@@ -46,15 +48,15 @@ BufferedReader::BufferedReader(
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    compute_stream =
+    compute_stream_ =
         ((platform::CUDADeviceContext *)(platform::DeviceContextPool::Instance()
                                              .Get(place_)))
             ->stream();
-    events.resize(buffer_size);
-    for (auto &event : events) {
+    events_.resize(buffer_size);
+    for (auto &event : events_) {
       PADDLE_ENFORCE(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
     }
-    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    PADDLE_ENFORCE(cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
   }
 #endif
   cpu_buffer_.resize(buffer_size);
@@ -73,7 +75,7 @@ void BufferedReader::ReadAsync(size_t i) {
 #ifdef PADDLE_WITH_CUDA
   if (platform::is_gpu_place(place_)) {
     platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-    PADDLE_ENFORCE(cudaEventRecord(events[i], compute_stream));
+    PADDLE_ENFORCE(cudaEventRecord(events_[i], compute_stream_));
   }
 #endif
   position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
@@ -91,7 +93,7 @@ void BufferedReader::ReadAsync(size_t i) {
     // commands from different streams cannot run concurrently.
     if (platform::is_gpu_place(place_)) {
       platform::SetDeviceId(boost::get<platform::CUDAPlace>(place_).device);
-      PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events[i], 0));
+      PADDLE_ENFORCE(cudaStreamWaitEvent(stream_, events_[i], 0));
       TensorVec &gpu = gpu_buffer_[i];
       gpu.resize(cpu.size());
       platform::RecordEvent record_event("BufferedReader:MemoryCopy");
@@ -106,12 +108,14 @@ void BufferedReader::ReadAsync(size_t i) {
         if (platform::is_cuda_pinned_place(cpu_place)) {
           memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                        boost::get<platform::CUDAPinnedPlace>(cpu_place),
-                       cpu_ptr, size, stream);
+                       cpu_ptr, size, stream_);
         } else if ((platform::is_gpu_place(cpu_place))) {
           memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                        boost::get<platform::CUDAPlace>(cpu_place), cpu_ptr,
-                       size, stream);
+                       size, stream_);
         } else {
+          // if cpu place is not pinned, async copy is slower than sync copy,
+          // so we use sync copy instead.
           // TODO(zcd): The default stream should not be used here.
           memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
                        boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
@@ -119,7 +123,7 @@ void BufferedReader::ReadAsync(size_t i) {
         }
         gpu[i].set_lod(cpu[i].lod());
       }
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
     }
 #endif
     return i;
diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h
index 87680da01a1f51cfdfe4d100508440eda9d1877f..5f8b2d47c22d0a15d53c8d30d39608fd64d4bddd 100644
--- a/paddle/fluid/operators/reader/buffered_reader.h
+++ b/paddle/fluid/operators/reader/buffered_reader.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <list>
+#include <memory>
 #include <queue>
 #include <vector>
 #include "ThreadPool.h"
@@ -63,9 +64,9 @@ class BufferedReader : public framework::DecoratedReader {
   std::vector<TensorVec> gpu_buffer_;
   size_t prev_pos_{-1UL};
 #ifdef PADDLE_WITH_CUDA
-  cudaStream_t stream;
-  cudaStream_t compute_stream;
-  std::vector<cudaEvent_t> events;
+  cudaStream_t stream_;
+  cudaStream_t compute_stream_;
+  std::vector<cudaEvent_t> events_;
 #endif
 };
 
diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc
index 901a92ab5b5c74b071be8b57a7653d90e2a4fb29..4a6581bbbd00019db33896371adac6d4e420e48c 100644
--- a/paddle/fluid/operators/reader/create_py_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_py_reader_op.cc
@@ -12,37 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+#include "paddle/fluid/operators/reader/py_reader.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
 namespace paddle {
 namespace operators {
 namespace reader {
 
-class PyReader : public framework::FileReader {
- public:
-  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
-      : framework::FileReader() {
-    PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
-    queue_ = queue;
-  }
-
-  void ReadNext(std::vector<framework::LoDTensor>* out) override {
-    bool success;
-    *out = queue_->Pop(&success);
-    if (!success) out->clear();
-  }
-
-  ~PyReader() { queue_->Close(); }
-
-  void Shutdown() override { queue_->Close(); }
-
-  void Start() override { queue_->ReOpen(); }
-
- private:
-  std::shared_ptr<LoDTensorBlockingQueue> queue_;
-};
-
 class CreatePyReaderOp : public framework::OperatorBase {
  public:
   using framework::OperatorBase::OperatorBase;
diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc
new file mode 100644
index 0000000000000000000000000000000000000000..155ae859defcf20a5e226a4abfb99dc308dfb23c
--- /dev/null
+++ b/paddle/fluid/operators/reader/py_reader.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/py_reader.h"
+#include <memory>
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+PyReader::PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue)
+    : framework::FileReader() {
+  PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
+  queue_ = queue;
+}
+
+void PyReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  bool success;
+  *out = queue_->Pop(&success);
+  if (!success) out->clear();
+}
+
+PyReader::~PyReader() { queue_->Close(); }
+
+void PyReader::Shutdown() { queue_->Close(); }
+
+void PyReader::Start() { queue_->ReOpen(); }
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reader/py_reader.h b/paddle/fluid/operators/reader/py_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..43079075142e8db22c0e3b7c86de4249d447f961
--- /dev/null
+++ b/paddle/fluid/operators/reader/py_reader.h
@@ -0,0 +1,45 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <vector>
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class PyReader : public framework::FileReader {
+ public:
+  explicit PyReader(const std::shared_ptr<LoDTensorBlockingQueue>& queue);
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+
+  ~PyReader();
+
+  void Shutdown() override;
+
+  void Start() override;
+
+ private:
+  std::shared_ptr<LoDTensorBlockingQueue> queue_;
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc
index d0edcc170f0afbccdcdf83eed9a167b7602e34ab..62b1e09737a4af4d0fe08eafcb3b2999d97032c1 100644
--- a/paddle/fluid/operators/save_combine_op.cc
+++ b/paddle/fluid/operators/save_combine_op.cc
@@ -12,87 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <stdint.h>
-#include <fstream>
-#include <numeric>
-#include <sstream>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/port.h"
+#include <string>
+
+#include "paddle/fluid/operators/save_combine_op.h"
 
 namespace paddle {
 namespace operators {
 
-class SaveCombineOp : public framework::OperatorBase {
+class SaveCombineOp : public framework::OperatorWithKernel {
  public:
-  SaveCombineOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto filename = Attr<std::string>("file_path");
-    auto overwrite = Attr<bool>("overwrite");
-    auto save_as_fp16 = Attr<bool>("save_as_fp16");
-
-    bool is_present = FileExists(filename);
-    if (is_present && !overwrite) {
-      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
-                   filename, overwrite);
-    }
-
-    MkDirRecursively(DirName(filename).c_str());
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-
-    auto inp_var_names = Inputs("X");
-    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
-                      "The number of input variables should be greater than 0");
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-    for (size_t i = 0; i < inp_var_names.size(); i++) {
-      auto *var = scope.FindVar(inp_var_names[i]);
-
-      PADDLE_ENFORCE(var != nullptr,
-                     "Cannot find variable %s for save_combine_op",
-                     inp_var_names[i]);
-      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
-                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
-                     inp_var_names[i]);
-
-      auto &tensor = var->Get<framework::LoDTensor>();
-      // Serialize tensors one by one
-
-      // Check types to see if a fp16 transformation is required
-      auto in_dtype = tensor.type();
-      auto out_dtype =
-          save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-      if (in_dtype != out_dtype) {
-        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-        framework::LoDTensor out;
-        // copy LoD info to the new tensor
-        out.set_lod(tensor.lod());
-        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
-        framework::SerializeToStream(fout, out, dev_ctx);
-      } else {
-        framework::SerializeToStream(fout, tensor, dev_ctx);
-      }
-    }
-    fout.close();
-  }
+  void InferShape(framework::InferShapeContext *ctx) const override {}
 };
 
 class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
@@ -105,7 +36,7 @@ class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 SaveCombine operator
 
-This operator will serialize and write a list of input LoDTensor variables 
+This operator will serialize and write a list of input LoDTensor variables
 to a file on disk.
 )DOC");
     AddAttr<bool>("overwrite",
@@ -134,3 +65,10 @@ namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
                   ops::SaveCombineOpProtoMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    save_combine,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SaveCombineOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/save_combine_op.cu b/paddle/fluid/operators/save_combine_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bc4478b51b111518439fe250a70b8dee0df53ad9
--- /dev/null
+++ b/paddle/fluid/operators/save_combine_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/save_combine_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    save_combine,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::SaveCombineOpKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ee82e17dd5e8173ce7dfb5c248890912d2cc7ef
--- /dev/null
+++ b/paddle/fluid/operators/save_combine_op.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include <string>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/port.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class SaveCombineOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+    auto filename = ctx.Attr<std::string>("file_path");
+    auto overwrite = ctx.Attr<bool>("overwrite");
+    auto save_as_fp16 = ctx.Attr<bool>("save_as_fp16");
+
+    bool is_present = FileExists(filename);
+    if (is_present && !overwrite) {
+      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
+                   filename, overwrite);
+    }
+
+    MkDirRecursively(DirName(filename).c_str());
+    std::ofstream fout(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+
+    auto &inp_var_names = ctx.Inputs("X");
+    auto &inp_vars = ctx.MultiInputVar("X");
+    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
+                      "The number of input variables should be greater than 0");
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    for (size_t i = 0; i < inp_var_names.size(); i++) {
+      PADDLE_ENFORCE(inp_vars[i] != nullptr,
+                     "Cannot find variable %s for save_combine_op",
+                     inp_var_names[i]);
+      PADDLE_ENFORCE(inp_vars[i]->IsType<framework::LoDTensor>(),
+                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                     inp_var_names[i]);
+
+      auto &tensor = inp_vars[i]->Get<framework::LoDTensor>();
+      // Serialize tensors one by one
+
+      // Check types to see if a fp16 transformation is required
+      auto in_dtype = tensor.type();
+      auto out_dtype =
+          save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+      if (in_dtype != out_dtype) {
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor out;
+        // copy LoD info to the new tensor
+        out.set_lod(tensor.lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+        framework::SerializeToStream(fout, out, dev_ctx);
+      } else {
+        framework::SerializeToStream(fout, tensor, dev_ctx);
+      }
+    }
+    fout.close();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc
index 4743e0d9499b111d8baa921dbb245431713fd7a8..5594de16b6789e99d5c4cc6828889eb0e311624a 100644
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
@@ -19,8 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
 
-USE_NO_KERNEL_OP(save_combine);
-USE_NO_KERNEL_OP(load_combine);
+USE_CPU_ONLY_OP(save_combine);
+USE_CPU_ONLY_OP(load_combine);
 
 template <typename T, typename U>
 T* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
diff --git a/paddle/fluid/operators/save_load_op_test.cc b/paddle/fluid/operators/save_load_op_test.cc
index ccaea0eef2906953d922e097348b6c0a86dad6f1..d277198a2f92c426586e774873c6770b93660e85 100644
--- a/paddle/fluid/operators/save_load_op_test.cc
+++ b/paddle/fluid/operators/save_load_op_test.cc
@@ -16,8 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
 
-USE_NO_KERNEL_OP(save);
-USE_NO_KERNEL_OP(load);
+USE_CPU_ONLY_OP(save);
+USE_CPU_ONLY_OP(load);
 
 TEST(SaveLoadOp, CPU) {
   paddle::framework::Scope scope;
diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc
index b02c098099625ca544fd889d5bb1c13ef2374450..338e2fbb5d868f146c9ff420b2d5d4cf6088316e 100644
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -15,118 +15,24 @@ limitations under the License. */
 #include <stdint.h>
 #include <fstream>
 #include <numeric>
+#include <string>
+#include <vector>
 
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/data_type_transform.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/port.h"
+#include "paddle/fluid/operators/save_op.h"
 
 namespace paddle {
 namespace operators {
-
-// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
-// to directory specified.
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
-
-class SaveOp : public framework::OperatorBase {
+class SaveOp : public framework::OperatorWithKernel {
  public:
-  SaveOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto iname = Input("X");
-    auto *var = scope.FindVar(iname);
-    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
-                   iname);
-
-    if (var->IsType<framework::LoDTensor>()) {
-      SaveLodTensor(place, var);
-    } else if (var->IsType<framework::SelectedRows>()) {
-      SaveSelectedRows(scope, place, var);
-    } else {
-      PADDLE_ENFORCE(
-          false,
-          "SaveOp only support LoDTensor and SelectedRows, %s has wrong type",
-          iname);
-    }
-  }
+  using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void SaveLodTensor(const platform::Place &place,
-                     framework::Variable *var) const {
-    auto filename = Attr<std::string>("file_path");
-    auto overwrite = Attr<bool>("overwrite");
-
-    if (FileExists(filename) && !overwrite) {
-      PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
-                   filename, overwrite);
-    }
-
-    MkDirRecursively(DirName(filename).c_str());
-
-    auto &tensor = var->Get<framework::LoDTensor>();
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-
-    auto save_as_fp16 = Attr<bool>("save_as_fp16");
-    auto in_dtype = tensor.type();
-    auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
-
-    if (in_dtype != out_dtype) {
-      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
-      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
-      framework::LoDTensor out;
-      framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
-      // copy LoD info to the new tensor
-      out.set_lod(tensor.lod());
-      framework::SerializeToStream(fout, out, dev_ctx);
-    } else {
-      framework::SerializeToStream(fout, tensor, dev_ctx);
-    }
-    fout.close();
-  }
+  void InferShape(framework::InferShapeContext *ctx) const override {}
 
-  void SaveSelectedRows(const framework::Scope &scope,
-                        const platform::Place &place,
-                        framework::Variable *var) const {
-    auto *lt_var = scope.FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
-    PADDLE_ENFORCE(
-        lt_var != nullptr,
-        "Can not find variable kLookupTablePath for SaveSelectedRows");
-    std::string filename = lt_var->data();
-    VLOG(4) << "SaveSelectedRows get File name: " << filename;
-
-    MkDirRecursively(DirName(filename).c_str());
-
-    auto &selectedRows = var->Get<framework::SelectedRows>();
-
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-
-    // FIXME(yuyang18): We save variable to local file now, but we should change
-    // it to save an output stream.
-    std::ofstream fout(filename, std::ios::binary);
-    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
-                   filename);
-    framework::SerializeToStream(fout, selectedRows, dev_ctx);
-    fout.close();
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.GetPlace());
   }
 };
 
@@ -154,14 +60,20 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file
                          "The \"file_path\" where the variable will be saved.")
         .AddCustomChecker(
             [](const std::string &path) { return !path.empty(); });
+    AddOutput(LOOKUP_TABLE_PATH,
+              "(string)"
+              "for pserver: The \"kLookupTablePath\" where checkpoint notify "
+              "to save lookup table variables"
+              " to directory specified.")
+        .AsDispensable();
   }
 };
 
 class SaveOpVarTypeInference : public framework::VarTypeInference {
  public:
   void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto out_var_name = ctx->Output(LOOKUP_TABLE_PATH).front();
-    ctx->SetType(out_var_name, framework::proto::VarType::RAW);
+    auto var_type = framework::proto::VarType::RAW;
+    ctx->SetType(LOOKUP_TABLE_PATH, var_type);
   }
 };
 
@@ -169,11 +81,18 @@ class SaveOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {}
 };
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
-REGISTER_OPERATOR(save, ops::SaveOp, paddle::framework::EmptyGradOpMaker,
-                  ops::SaveOpProtoMaker, ops::SaveOpVarTypeInference,
-                  ops::SaveOpShapeInference);
+REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker,
+                  ops::SaveOpVarTypeInference, ops::SaveOpShapeInference);
+
+REGISTER_OP_CPU_KERNEL(
+    save, ops::SaveOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int8_t>,
+    ops::SaveOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/save_op.cu b/paddle/fluid/operators/save_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0a778a694e52f146b6cceddb969b8af08f40ef9e
--- /dev/null
+++ b/paddle/fluid/operators/save_op.cu
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/save_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    save, ops::SaveOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::SaveOpKernel<paddle::platform::CUDADeviceContext,
+                      paddle::platform::float16>);
diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..642235aad58bef2ec7f741ee5fb5a65a2081f4ce
--- /dev/null
+++ b/paddle/fluid/operators/save_op.h
@@ -0,0 +1,133 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+#include <fstream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace operators {
+// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables
+// to directory specified.
+constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
+template <typename DeviceContext, typename T>
+class SaveOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto place = ctx.GetPlace();
+
+    auto *input_var = ctx.InputVar("X");
+    auto iname = ctx.Inputs("X").data();
+    PADDLE_ENFORCE(input_var != nullptr, "Cannot find variable %s for save_op",
+                   iname);
+
+    if (input_var->IsType<framework::LoDTensor>()) {
+      SaveLodTensor(ctx, place, input_var);
+    } else if (input_var->IsType<framework::SelectedRows>()) {
+      SaveSelectedRows(ctx, place, input_var);
+    } else {
+      PADDLE_ENFORCE(
+          false,
+          "SaveOp only support LoDTensor and SelectedRows, %s has wrong type",
+          iname);
+    }
+  }
+
+  void SaveLodTensor(const framework::ExecutionContext &ctx,
+                     const platform::Place &place,
+                     const framework::Variable *var) const {
+    auto filename = ctx.Attr<std::string>("file_path");
+    auto overwrite = ctx.Attr<bool>("overwrite");
+
+    if (FileExists(filename) && !overwrite) {
+      PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
+                   filename, overwrite);
+    }
+
+    MkDirRecursively(DirName(filename).c_str());
+
+    auto &tensor = var->Get<framework::LoDTensor>();
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    std::ofstream fout(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+
+    auto save_as_fp16 = ctx.Attr<bool>("save_as_fp16");
+    auto in_dtype = tensor.type();
+    auto out_dtype = save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+    if (in_dtype != out_dtype) {
+      auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+      auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+      framework::LoDTensor out;
+      framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out);
+      // copy LoD info to the new tensor
+      out.set_lod(tensor.lod());
+      framework::SerializeToStream(fout, out, dev_ctx);
+    } else {
+      framework::SerializeToStream(fout, tensor, dev_ctx);
+    }
+    fout.close();
+  }
+
+  void SaveSelectedRows(const framework::ExecutionContext &ctx,
+                        const platform::Place &place,
+                        const framework::Variable *var) const {
+    framework::Variable *out_put_var = ctx.OutputVar(LOOKUP_TABLE_PATH);
+    PADDLE_ENFORCE(
+        out_put_var != nullptr,
+        "Can not find variable kLookupTablePath for SaveSelectedRows");
+    auto *lt_var = out_put_var->GetMutable<std::string>();
+
+    std::string filename = lt_var->data();
+    VLOG(4) << "SaveSelectedRows get File name: " << filename;
+
+    MkDirRecursively(DirName(filename).c_str());
+
+    auto &selectedRows = var->Get<framework::SelectedRows>();
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    // FIXME(yuyang18): We save variable to local file now, but we should change
+    // it to save an output stream.
+    std::ofstream fout(filename, std::ios::binary);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+    framework::SerializeToStream(fout, selectedRows, dev_ctx);
+    fout.close();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 400a6d7bfa5912774c4bbb2a5868dd9a471afd00..47cca879b4b71f58778cf3d1f24cab463ac73418 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/gpu_info.h"
-
 #include <algorithm>
 #include <cstdlib>
 #include <string>
@@ -31,6 +30,8 @@ constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
 constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
 #endif
 
+constexpr static float fraction_reserve_gpu_memory = 0.05f;
+
 DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
               "Allocate a trunk of gpu memory that is this fraction of the "
               "total gpu memory size. Future memory usage will be allocated "
@@ -38,6 +39,24 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
               "additional trunks of the same size will be requested from gpu "
               "until the gpu has no memory left for another trunk.");
 
+DEFINE_uint64(
+    initial_gpu_memory_in_mb, 0ul,
+    "Allocate a trunk of gpu memory whose byte size is specified by "
+    "the flag. Future memory usage will be allocated from the "
+    "truck. If the trunk doesn't have enough gpu memory, additional "
+    "trunks of the gpu memory will be requested from gpu with size "
+    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
+    "no memory left for the additional trunk. Note: if you set this "
+    "flag, the memory size set by "
+    "FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
+    "flag. If you don't set this flag, PaddlePaddle will use "
+    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
+
+DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
+              "If this flag is set, Paddle will reallocate the gpu memory with "
+              "size specified by this flag. Else Paddle will reallocate by "
+              "FLAGS_fraction_of_gpu_memory_to_use");
+
 DEFINE_bool(
     enable_cublas_tensor_op_math, false,
     "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
@@ -180,13 +199,43 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
 }
 
 size_t GpuMaxAllocSize() {
+  return std::max(GpuInitAllocSize(), GpuReallocSize());
+}
+
+size_t GpuInitAllocSize() {
+  if (FLAGS_initial_gpu_memory_in_mb > 0ul) {
+    // Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb
+    return static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb << 20);
+  }
+
+  // FLAGS_initial_gpu_memory_in_mb is 0, initial memory will be allocated by
+  // fraction
   size_t total = 0;
   size_t available = 0;
 
   GpuMemoryUsage(&available, &total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
 
-  // Reserve the rest for page tables, etc.
-  return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
+  return static_cast<size_t>((total - reserving) *
+                             FLAGS_fraction_of_gpu_memory_to_use);
+}
+
+size_t GpuReallocSize() {
+  if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) {
+    // Additional memory will be allocated by FLAGS_reallocate_gpu_memory_in_mb
+    return static_cast<size_t>(FLAGS_reallocate_gpu_memory_in_mb << 20);
+  }
+
+  // FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be allocated
+  // by fraction
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(&available, &total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
+
+  return static_cast<size_t>((total - reserving) *
+                             FLAGS_fraction_of_gpu_memory_to_use);
 }
 
 size_t GpuMinChunkSize() {
@@ -201,16 +250,13 @@ size_t GpuMaxChunkSize() {
   GpuMemoryUsage(&available, &total);
   VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
            << total / 1024 / 1024 << "M";
-  size_t reserving = static_cast<size_t>(0.05 * total);
+  size_t reserving = static_cast<size_t>(fraction_reserve_gpu_memory * total);
   // If available less than minimum chunk size, no usable memory exists.
   available =
       std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
                total - reserving);
 
-  // Reserving the rest memory for page tables, etc.
-
-  size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
-                                          (total - reserving));
+  size_t allocating = GpuMaxAllocSize();
 
   PADDLE_ENFORCE_LE(allocating, available,
                     "Insufficient GPU memory to allocation.");
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 1e1ab2503f53fe20bbe62c48f65d8535947f1aa8..d4be7ac97b2df6fe578582ae296e1dfc5548260c 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -60,6 +60,12 @@ void GpuMemoryUsage(size_t *available, size_t *total);
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
 
+//! Get the initial allocation size of current GPU device.
+size_t GpuInitAllocSize();
+
+//! Get the re-allocation size of current GPU device.
+size_t GpuReallocSize();
+
 //! Get the minimum chunk size for GPU buddy allocator.
 size_t GpuMinChunkSize();
 
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index f1385f57184eceec49b791cf6c89641b098f036a..0991eff0fdaaca80ada2d8dd3c68eba72fd3f6e6 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -5,7 +5,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
 if(WITH_PYTHON)
   list(APPEND PYBIND_DEPS py_func_op)
 endif()
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc)
 
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index a57083a1444a164cdeecf7e3e6eff6dc0e1e7be7..cef95de2ef675e417b5a2c49d01e3c85e23f9718 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -55,6 +55,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/ir.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
+#include "paddle/fluid/pybind/reader_py.h"
 #include "paddle/fluid/pybind/recordio.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 
@@ -128,6 +129,11 @@ static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
   return paddle::platform::Place(p1) == paddle::platform::Place(p2);
 }
 
+template <typename PlaceType>
+static inline int PlaceIndex(const PlaceType &p) {
+  return static_cast<int>(paddle::platform::Place(p).which());
+}
+
 PYBIND11_MODULE(core, m) {
   // Not used, just make sure cpu_info.cc is linked.
   paddle::platform::CpuTotalPhysicalMemory();
@@ -531,6 +537,7 @@ PYBIND11_MODULE(core, m) {
 
 All parameter, weight, gradient are variables in Paddle.
 )DOC")
+      .def(py::init<>())
       .def("is_int", [](const Variable &var) { return var.IsType<int>(); })
       .def("set_int",
            [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
@@ -572,14 +579,13 @@ All parameter, weight, gradient are variables in Paddle.
            },
            py::return_value_policy::reference);
 
-  py::class_<framework::ReaderHolder>(m, "Reader", "")
-      .def("start", &framework::ReaderHolder::Start)
-      .def("reset", &framework::ReaderHolder::ResetAll);
+  BindReader(&m);
 
   using LoDTensorBlockingQueue =
       ::paddle::operators::reader::LoDTensorBlockingQueue;
   using LoDTensorBlockingQueueHolder =
       ::paddle::operators::reader::LoDTensorBlockingQueueHolder;
+
   py::class_<LoDTensorBlockingQueue, std::shared_ptr<LoDTensorBlockingQueue>>(
       m, "LoDTensorBlockingQueue", "")
       .def("push",
@@ -776,6 +782,7 @@ All parameter, weight, gradient are variables in Paddle.
              PADDLE_THROW("Cannot use CUDAPlace in CPU only version");
 #endif
            })
+      .def("_type", &PlaceIndex<platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
@@ -785,6 +792,7 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
       .def(py::init<>())
+      .def("_type", &PlaceIndex<platform::CPUPlace>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
@@ -800,6 +808,7 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
              new (&self) platform::CUDAPinnedPlace();
            })
+      .def("_type", &PlaceIndex<platform::CUDAPinnedPlace>)
       .def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
       .def("_equals",
            &IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
@@ -811,16 +820,25 @@ All parameter, weight, gradient are variables in Paddle.
 
   py::class_<platform::Place>(m, "Place")
       .def(py::init<>())
+      .def("_type", &PlaceIndex<platform::Place>)
       .def("_equals", &IsSamePlace<platform::Place, platform::Place>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
       .def("is_gpu_place",
            [](platform::Place &self) { return platform::is_gpu_place(self); })
+      .def("is_cpu_place",
+           [](platform::Place &self) { return platform::is_cpu_place(self); })
+      .def("is_cuda_pinned_place",
+           [](platform::Place &self) {
+             return platform::is_cuda_pinned_place(self);
+           })
       .def("gpu_device_id",
            [](platform::Place &self) {
              return boost::get<platform::CUDAPlace>(self).device;
            })
+      .def("set_place", [](platform::Place &self,
+                           const platform::Place &other) { self = other; })
       .def("set_place",
            [](platform::Place &self, const platform::CPUPlace &cpu_place) {
              self = cpu_place;
diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af7d30552ed47c0fbe26090b328cc7128b90f84d
--- /dev/null
+++ b/paddle/fluid/pybind/reader_py.cc
@@ -0,0 +1,161 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/pybind/reader_py.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/operators/reader/buffered_reader.h"
+#include "paddle/fluid/operators/reader/py_reader.h"
+#include "paddle/fluid/platform/place.h"
+#include "pybind11/stl.h"
+
+namespace paddle {
+namespace pybind {
+
+class MultiDeviceFeedReader {
+ public:
+  using ResultDictList =
+      std::vector<std::unordered_map<std::string, framework::LoDTensor>>;
+
+  MultiDeviceFeedReader(
+      const std::shared_ptr<operators::reader::LoDTensorBlockingQueue> &queue,
+      const std::vector<std::string> &names,
+      const std::vector<platform::Place> &dst_places, bool use_double_buffer)
+      : queue_(queue),
+        names_(names),
+        pool_(new ::ThreadPool(dst_places.size())) {
+    std::shared_ptr<framework::ReaderBase> reader(
+        new operators::reader::PyReader(queue));
+
+    readers_.reserve(dst_places.size());
+    for (auto &p : dst_places) {
+      auto *holder = new framework::ReaderHolder();
+      if (use_double_buffer) {
+        holder->Reset(
+            framework::MakeDecoratedReader<operators::reader::BufferedReader>(
+                reader, p, 2));
+      } else {
+        if (platform::is_gpu_place(p)) {
+          PADDLE_THROW(
+              "Place cannot be CUDAPlace when use_double_buffer is False");
+        }
+        holder->Reset(reader);
+      }
+      readers_.emplace_back(holder);
+    }
+
+    futures_.resize(dst_places.size());
+    ret_.resize(dst_places.size());
+    ReadAsync();
+  }
+
+  ResultDictList ReadNext() {
+    bool success = WaitFutures();
+
+    if (!success) {
+      return {};
+    }
+
+    ResultDictList result(ret_.size());
+    for (size_t i = 0; i < ret_.size(); ++i) {
+      for (size_t j = 0; j < names_.size(); ++j) {
+        result[i].emplace(names_[j], std::move(ret_[i][j]));
+      }
+    }
+    ReadAsync();
+    return result;
+  }
+
+  void Reset() {
+    Shutdown();
+    Start();
+    ReadAsync();
+  }
+
+  ~MultiDeviceFeedReader() {
+    queue_->Close();
+    pool_.reset();
+  }
+
+ private:
+  bool WaitFutures() {
+    bool success = true;
+    for (auto &f : futures_) {
+      success &= f.get();
+    }
+    return success;
+  }
+
+  void Shutdown() {
+    for (auto &r : readers_) r->Shutdown();
+  }
+
+  void Start() {
+    for (auto &r : readers_) r->Start();
+  }
+
+  void ReadAsync() {
+    for (size_t i = 0; i < readers_.size(); ++i) {
+      futures_[i] = pool_->enqueue([this, i] {
+        readers_[i]->ReadNext(&ret_[i]);
+        return !ret_[i].empty();
+      });
+    }
+  }
+
+  std::shared_ptr<operators::reader::LoDTensorBlockingQueue> queue_;
+  std::vector<std::string> names_;
+  std::unique_ptr<::ThreadPool> pool_;
+
+  std::vector<std::unique_ptr<framework::ReaderHolder>> readers_;
+
+  std::vector<std::future<bool>> futures_;
+  std::vector<std::vector<framework::LoDTensor>> ret_;
+};
+
+namespace py = pybind11;
+
+void BindReader(py::module *module) {
+  auto &m = *module;
+
+  namespace reader = ::paddle::operators::reader;
+
+  py::class_<framework::ReaderHolder>(m, "Reader", "")
+      .def("start", &framework::ReaderHolder::Start)
+      .def("reset", &framework::ReaderHolder::ResetAll);
+
+  py::class_<MultiDeviceFeedReader>(m, "MultiDeviceFeedReader", "")
+      .def("read_next", &MultiDeviceFeedReader::ReadNext,
+           py::call_guard<py::gil_scoped_release>())
+      .def("reset", &MultiDeviceFeedReader::Reset,
+           py::call_guard<py::gil_scoped_release>());
+
+  m.def("create_py_reader",
+        [](const std::shared_ptr<operators::reader::LoDTensorBlockingQueue>
+               &queue,
+           const std::vector<std::string> &names,
+           const std::vector<platform::Place> &dst_places,
+           bool use_double_buffer) {
+          return new MultiDeviceFeedReader(queue, names, dst_places,
+                                           use_double_buffer);
+        },
+        py::return_value_policy::take_ownership);
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/reader_py.h b/paddle/fluid/pybind/reader_py.h
new file mode 100644
index 0000000000000000000000000000000000000000..472ff65368f3fb206ae599ae5d9d11e9ae8195ae
--- /dev/null
+++ b/paddle/fluid/pybind/reader_py.h
@@ -0,0 +1,25 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pybind11/pybind11.h"
+
+namespace paddle {
+namespace pybind {
+
+void BindReader(pybind11::module *module);
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index e91fa9292438532a5f696082a179aea7ff3e093f..614a3586156b0a858e2c5d2decec6dc6844c8886 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -41,6 +41,8 @@ int main(int argc, char** argv) {
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   envs.push_back("fraction_of_gpu_memory_to_use");
+  envs.push_back("initial_gpu_memory_in_mb");
+  envs.push_back("reallocate_gpu_memory_in_mb");
   envs.push_back("allocator_strategy");
 #elif __clang__
   envs.push_back("use_mkldnn");
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index 57c5e83c82d216f55a33e568849d87689f86270f..5728a37fc33467968ca68de316d963f31f66da03 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -46,9 +46,9 @@ import six
 from six.moves import cPickle as pickle
 __all__ = ['train', 'test', 'valid']
 
-DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
-LABEL_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/imagelabels.mat'
-SETID_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/setid.mat'
+DATA_URL = 'http://paddlemodels.bj.bcebos.com/flowers/102flowers.tgz'
+LABEL_URL = 'http://paddlemodels.bj.bcebos.com/flowers/imagelabels.mat'
+SETID_URL = 'http://paddlemodels.bj.bcebos.com/flowers/setid.mat'
 DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py
index adc0c1aac80cbdb0b0c04535fc39b6a172d23eec..450f159f9d10c282849e6e26fb595fb683b1a02e 100644
--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -15,7 +15,7 @@
 WMT14 dataset.
 The original WMT14 dataset is too large and a small set of data for set is
 provided. This module will download dataset from
-http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
+http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
 parse training set and test set into paddle reader creators.
 
 """
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 3bc5cd4448198c189fff1bd23e383acc71d04374..63b7b28948a783bc5910d53f6e65a8c09d77bdb1 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -163,7 +163,8 @@ def __bootstrap__():
 
     if core.is_compiled_with_cuda():
         read_env_flags += [
-            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
+            'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb',
+            'reallocate_gpu_memory_in_mb', 'cudnn_deterministic',
             'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
             'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus',
             'sync_nccl_allreduce', 'limit_of_tmp_allocation',
diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py
index 5732377bd60f849494ae7e463f40d4843ffa2c23..ac2a40a7c25f7c3ff0cc103647355da55d27fec3 100644
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -17,9 +17,10 @@ import os
 import six
 import sys
 from .. import compat as cpt
+from . import framework
+from .framework import cuda_places, cpu_places
 
 from . import core
-from . import framework
 
 __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
 
@@ -44,21 +45,6 @@ def _is_pserver_mode(main_program):
     return False
 
 
-def get_available_places(use_cuda):
-    if use_cuda:
-        gpus_env = os.getenv("FLAGS_selected_gpus")
-        if gpus_env:
-            gpus = [int(s) for s in gpus_env.split(",")]
-        else:
-            gpus = [i for i in six.moves.range(core.get_cuda_device_count())]
-        places = [core.CUDAPlace(i) for i in gpus]
-    else:
-        cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-        places = [core.CPUPlace() for _ in six.moves.range(cpu_num)]
-    assert places, "no place for execution"
-    return places
-
-
 class CompiledProgram(object):
     """
     Compiles to Graph for execution.
@@ -117,7 +103,8 @@ class CompiledProgram(object):
                            loss_name=None,
                            build_strategy=None,
                            exec_strategy=None,
-                           share_vars_from=None):
+                           share_vars_from=None,
+                           places=None):
         """Configs the program to run in data parallel way.
 
         Args:
@@ -132,10 +119,18 @@ class CompiledProgram(object):
                 threads are used, how many iterations to clean up the temp
                 variables. For more information, please refer
                 to fluid.ExecutionStrategy. Default None.
-            share_vars_from(CompiledProgram): If provide, this CompiledProgram
+            share_vars_from(CompiledProgram): If provided, this CompiledProgram
                 will share variables from `share_vars_from`. `share_vars_from`
                 must be run by the executor before this CompiledProgram so that
                 vars are ready.
+            places(list(CUDAPlace)|list(CPUPlace)|None): If provided, only compile
+                program in the given places. Otherwise, the places used when compiled 
+                is determined by the Executor, and the places used are controlled 
+                by environment variables: FLAGS_selected_gpus or CUDA_VISIBLE_DEVICES
+                if using GPU; or CPU_NUM if using CPU. For example, if you want to 
+                run on GPU 0 and 1, set places=[fluid.CUDAPlace(0), fluid.CUDAPlace(1)].
+                If you want to run on 2 CPU cores, set places=[fluid.CPUPlace()]*2.  
+
         Returns:
             self
         """
@@ -150,6 +145,12 @@ class CompiledProgram(object):
             self._exec_strategy = ExecutionStrategy()
         if self._build_strategy is None:
             self._build_strategy = BuildStrategy()
+        if places is not None:
+            if not isinstance(places, (list, tuple)):
+                places = [places]
+            self._places = places
+        else:
+            self._places = None
         self._build_strategy.is_distribution = _is_pserver_mode(self._program)
         return self
 
@@ -192,7 +193,15 @@ class CompiledProgram(object):
             self._local_scopes = []
 
         self._exec_strategy.use_cuda = use_cuda
-        self._places = get_available_places(self._exec_strategy.use_cuda)
+        has_set_place = (self._places is not None)
+        if has_set_place:
+            for p in self._places:
+                assert p._type() == self._place._type(), \
+                    "Place type not match. You may set the wrong type of places"
+        else:
+            self._places = cuda_places(
+            ) if self._exec_strategy.use_cuda else cpu_places()
+        assert self._places, "no place for execution"
 
         if self._exec_strategy.num_threads == 0:
             if self._exec_strategy.use_cuda:
@@ -200,9 +209,7 @@ class CompiledProgram(object):
                 # performance. Worth tunning for other models in the future.
                 self._exec_strategy.num_threads = len(self._places) * 4
             else:
-                cpu_num = int(
-                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
-                self._exec_strategy.num_threads = cpu_num * 2
+                self._exec_strategy.num_threads = len(self._places) * 2
 
         # FIXME(dzhwinter): enable_inplace should be after memory_optimize
         # if turn on python memory optimize, turn off the inplace_pass.
diff --git a/python/paddle/fluid/contrib/int8_inference/README.md b/python/paddle/fluid/contrib/int8_inference/README.md
index 460ae393f158ae320c93601365a68b8cfe2ba50e..55a21ed1c55d1eca51118e726e7e2cf041ace45c 100644
--- a/python/paddle/fluid/contrib/int8_inference/README.md
+++ b/python/paddle/fluid/contrib/int8_inference/README.md
@@ -45,28 +45,41 @@ You can load INT8 model by load_inference_model [API](https://github.com/PaddleP
 ```
 
 ## 3. Result
-We provide the results of accuracy measurd on [Intel® Xeon® Platinum Gold Processor](https://ark.intel.com/products/120489/Intel-Xeon-Gold-6148-Processor-27-5M-Cache-2-40-GHz- "Intel® Xeon® Gold 6148 Processor") (also known as Intel® Xeon® Skylake6148).
+We provide the results of accuracy and performance measured on Intel(R) Xeon(R) Gold 6271 (single core).
+
+**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271**
 
 | Model  | Dataset  | FP32 Accuracy  | INT8 Accuracy  | Accuracy Diff  |
-| ------------ | ------------ | ------------ | ------------ | ------------ |
-| ResNet-50  | Small  | 72.00%  | 72.00%  |  0.00% |
-| MobileNet-V1  | Small  | 62.00%  | 62.00%  | 0.00%  |
-| ResNet-50  | Full ImageNet Val  |  76.63%  | 76.17%  | 0.46% |
-| MobileNet-V1 | Full ImageNet Val  | 70.78%  | 70.49%  | 0.29%  |
+| :------------: | :------------: | :------------: | :------------: | :------------: |
+| ResNet-50  | Full ImageNet Val  |  76.63%  | 76.23%  | 0.40% |
+| MobileNet-V1 | Full ImageNet Val  | 70.78%  | 70.47%  | 0.31%  |
+
+**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)**
+
+| Model  | Dataset  | FP32 Throughput  | INT8 Throughput  |  Ratio(INT8/FP32)  |
+| :------------: | :------------: | :------------: | :------------: | :------------: |
+| ResNet-50  | Full ImageNet Val  |  11.54 images/s | 32.2 images/s | 2.79 |
+| MobileNet-V1 | Full ImageNet Val  | 49.21 images/s | 108.37 images/s | 2.2  |
 
-Please note that [Small](http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz "Small") is a subset of [full ImageNet validation dataset](http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar "full ImageNet validation dataset"). 
+Please note that [full ImageNet validation dataset](http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar "full ImageNet validation dataset") can be downloaded by script `test_calibration.py` with `DATASET=full`. 
 
 Notes:
 * The accuracy measurement requires the model with `label`.
-* The INT8 theoretical speedup is ~1.33X on Intel® Xeon® Skylake Server (please refer to `This allows for 4x more input at the cost of 3x more instructions or 33.33% more compute` in  [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")).
+* The INT8 theoretical speedup is 4X on Intel® Xeon® Cascadelake Server (please refer to `providing a theoretical peak compute gain of 4x int8 OPS over fp32 OPS` in  [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")). However, the actual test results at the model level will be less than 4X, and in general the average is about 2X. In addition, the calculation library optimization of batch size 1 is not as good as the large batch size.
 
 ## 4. How to reproduce the results
-* Small dataset
+* Small dataset (Single core)
 ```bash
 FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py
 ```
 
-* Full dataset
+* Full dataset (Single core)
 ```bash
 FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
 ```
+
+* Full dataset (Multi-core)
+```bash
+FLAGS_use_mkldnn=true OMP_NUM_THREADS=20 DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py
+```
+> Notes: This is an example command with 20 cores by using set `OMP_NUM_THREADS` value.
diff --git a/python/paddle/fluid/contrib/slim/__init__.py b/python/paddle/fluid/contrib/slim/__init__.py
index 22dbf7c8b6bb2da7c310a20bdcbaffca248575b0..4a71fab6d0fc73aa3bbe9c9fe56278e473f354e1 100644
--- a/python/paddle/fluid/contrib/slim/__init__.py
+++ b/python/paddle/fluid/contrib/slim/__init__.py
@@ -13,13 +13,4 @@
 # limitations under the License.
 
 from .core import *
-from .graph import *
-from .prune import *
-__all__ = [
-    'build_compressor',
-    'CompressPass',
-    'ImitationGraph',
-    'SensitivePruneStrategy',
-    'MagnitudePruner',
-    'RatioPruner',
-]
+__all__ = ['Compressor', ]
diff --git a/python/paddle/fluid/contrib/slim/core/__init__.py b/python/paddle/fluid/contrib/slim/core/__init__.py
index 7826d5830a6f7f6d42cb1275c2289695c080e52f..831bd70ecc62f8d576b304c52b0abea994fd2ceb 100644
--- a/python/paddle/fluid/contrib/slim/core/__init__.py
+++ b/python/paddle/fluid/contrib/slim/core/__init__.py
@@ -14,11 +14,9 @@
 
 from . import config
 from .config import *
-from . import compress_pass
-from .compress_pass import *
+from . import compressor
+from .compressor import *
 from . import strategy
 from .strategy import *
-from . import pass_builder
-from .pass_builder import *
 
-__all__ = config.__all__ + compress_pass.__all__ + strategy.__all__ + pass_builder.__all__
+__all__ = config.__all__ + compressor.__all__ + strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/core/compress_pass.py b/python/paddle/fluid/contrib/slim/core/compress_pass.py
deleted file mode 100644
index c4c348b878a1df43d7fb909f506c8cf65366866f..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/core/compress_pass.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ....core import CPUPlace
-from ..graph import get_executor
-
-__all__ = ['Context', 'CompressPass']
-
-
-class Context(object):
-    """
-    The context in the process of compression.
-    Args:
-        exe: The executor used to execute graph.
-        graph: The graph to be compressed.
-        scope: The scope used to execute graph.
-        program_exe: The program_exe is used to execute the program
-                     created for modifying the variables in scope.
-    """
-
-    def __init__(self, exe, graph, scope, program_exe=None):
-        # The total number of epoches to be trained.
-        self.epoch = 0
-        # Current epoch
-        self.epoch_id = 0
-        # Current batch
-        self.batch_id = 0
-        self.exe = exe
-        self.graph = graph
-        self.scope = scope
-        self.program_exe = program_exe
-
-
-class CompressPass(object):
-    """
-    The pass used to compress model.
-    Args:
-        place: The device used in compression.
-        data_reader: The data_reader used to run graph.
-        data_feeder: The data_feeder used to run graph.
-        scope: The scope used to run graph.
-        metrics: The metrics for evaluating model.
-        epoch: The total epoches of trainning in compression.
-        program_exe: The program_exe is used to execute the program
-                     created for modifying the variables in scope.
-    """
-
-    def __init__(self,
-                 place=None,
-                 data_reader=None,
-                 data_feeder=None,
-                 scope=None,
-                 metrics=None,
-                 epoch=None,
-                 program_exe=None):
-        self.strategies = []
-        self.place = CPUPlace() if place is None else place
-        self.data_reader = data_reader
-        self.data_feeder = data_feeder
-        self.scope = scope
-        self.metrics = metrics
-        self.epoch = epoch
-        self.program_exe = program_exe
-
-    def add_strategy(self, strategy):
-        """
-        Add a strategy to current compress pass.
-        Args:
-            strategy: The strategy to be added into current compress pass.
-        """
-        self.strategies.append(strategy)
-        self.epoch = max(strategy.end_epoch, self.epoch)
-
-    def apply(self, graph):
-        """
-        Compress a model.
-        Args:
-            graph: The target graph to be compressed.
-        """
-        self.executor = get_executor(graph, self.place)
-        context = Context(
-            self.executor, graph, self.scope, program_exe=self.program_exe)
-
-        for strategy in self.strategies:
-            strategy.on_compress_begin(context)
-
-        for epoch in range(self.epoch):
-
-            for strategy in self.strategies:
-                strategy.on_epoch_begin(context)
-
-            for data in self.data_reader():
-
-                for strategy in self.strategies:
-                    strategy.on_batch_begin(context)
-                fetches = None
-                if self.metrics:
-                    fetches = self.metrics.values()
-                feed = None
-                if self.data_feeder:
-                    feed = self.data_feeder.feed(data)
-                results = self.executor.run(graph,
-                                            fetches=fetches,
-                                            scope=self.scope,
-                                            feed=feed)
-                if results:
-                    print("results: {}".format(
-                        zip(self.metrics.keys(), results)))
-                for strategy in self.strategies:
-                    strategy.on_batch_end(context)
-                context.batch_id += 1
-
-            for strategy in self.strategies:
-                strategy.on_epoch_end(context)
-            context.epoch_id += 1
-
-        for strategy in self.strategies:
-            strategy.on_compress_end(context)
diff --git a/python/paddle/fluid/contrib/slim/core/compressor.py b/python/paddle/fluid/contrib/slim/core/compressor.py
new file mode 100644
index 0000000000000000000000000000000000000000..1547b6abbe660b6be7a681a4e270e3080a5dac36
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/core/compressor.py
@@ -0,0 +1,481 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....core import CPUPlace
+from .... import compiler
+from .... import io
+from .... import profiler
+from .... import scope_guard
+from ....data_feeder import DataFeeder
+from ..graph import *
+from .config import ConfigFactory
+import numpy as np
+from collections import Iterable
+import time
+import os
+import logging
+import sys
+import pickle
+import functools
+
+__all__ = ['Context', 'Compressor']
+
+logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
+_logger = logging.getLogger(__name__)
+_logger.setLevel(logging.INFO)
+
+
+def cached_reader(reader, sampled_rate, cache_path, cached_id):
+    """
+    Sample partial data from reader and cache them into local file system.
+    Args:
+        reader: Iterative data source.
+        sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None.
+        cache_path(str): The path to cache the sampled data.
+        cached_id(int): The id of dataset sampled. Evaluations with same cached_id use the same sampled dataset. default: 0.
+    """
+    np.random.seed(cached_id)
+    cache_path = os.path.join(cache_path, str(cached_id))
+    _logger.debug('read data from: {}'.format(cache_path))
+
+    def s_reader():
+        if os.path.isdir(cache_path):
+            for file_name in open(os.path.join(cache_path, "list")):
+                yield np.load(os.path.join(cache_path, file_name.strip()))
+        else:
+            os.makedirs(cache_path)
+            list_file = open(os.path.join(cache_path, "list"), 'w')
+            batch = 0
+            dtype = None
+            for data in reader():
+                if batch == 0 or (np.random.uniform() < sampled_rate):
+                    np.save(
+                        os.path.join(cache_path, 'batch' + str(batch)), data)
+                    list_file.write('batch' + str(batch) + '.npy\n')
+                    batch += 1
+                    yield data
+
+    return s_reader
+
+
+class Context(object):
+    """
+    The context in the process of compression.
+    """
+
+    def __init__(self,
+                 place,
+                 scope,
+                 train_graph=None,
+                 train_reader=None,
+                 eval_graph=None,
+                 eval_reader=None,
+                 teacher_graphs=None,
+                 train_optimizer=None,
+                 distiller_optimizer=None):
+        """
+        Args:
+            place: The device place where the compression job running.
+            scope: The scope used in compression job.
+            train_graph: The graph with loss as output node.
+            eval_graph: The graph used for evaluation.
+            eval_reader: The data reader used for evaluation.
+            teacher_graphs: The teacher graphs used in distillation strategies.
+            train_optimizer: The optimizer used to append backward ops and
+                             optimization ops into train_graph.
+            distiller_optimizer: The optimizer used by distillation strategies.
+        """
+        # The total number of epoches to be trained.
+        self.epoch = 0
+        # Current epoch
+        self.epoch_id = 0
+        # Current batch
+        self.batch_id = 0
+
+        self.k_v = {}
+
+        self.place = place
+        self.scope = scope
+        self.train_graph = train_graph
+        self.train_reader = train_reader
+        self.eval_graph = eval_graph
+        self.eval_reader = eval_reader
+        self.executor = None
+        self.teacher_graphs = teacher_graphs
+        self.train_optimizer = train_optimizer
+        self.distiller_optimizer = distiller_optimizer
+        self.optimize_graph = None
+        self.cache_path = './eval_cache'
+        self.eval_results = {}
+
+    def to_file(self, file_name):
+        """
+        Save the context into file.
+        """
+        data = {}
+        data['epoch_id'] = self.epoch_id
+        data['eval_results'] = self.eval_results
+        with open(file_name, 'wb') as context_file:
+            pickle.dump(data, context_file)
+
+    def from_file(self, file_name):
+        """
+        Load the context from file.
+        """
+        with open(file_name) as context_file:
+            if sys.version_info < (3, 0):
+                data = pickle.load(context_file)
+            else:
+                data = pickle.load(context_file, encoding='bytes')
+            self.epoch_id = data['epoch_id']
+            self.eval_results = data['eval_results']
+
+    def eval_converged(self, metric_name, delta=0.001):
+        """
+        Check whether the training has been converged.
+        Args:
+            metric_name(str): The metric used to check convergence.
+            delta(float): '(metric[k] - metric[k-1] / metric[k-1]) < delta'
+                          means that the training has been converged.
+        Returns:
+            bool: True means the training has been converged.
+        """
+        # TODO(wanghaoshuang@baidu.com): enhence this method.
+        if (metric_name not in self.eval_results
+            ) or len(self.eval_results[metric_name]) < 2:
+            return False
+        results = self.eval_results[metric_name][-2:]
+        _logger.info('Latest evaluations: {}'.format(results))
+        return abs(results[1] - results[0]) / results[0] < delta
+
+    def run_eval_graph(self, sampled_rate=None, cached_id=0):
+        """
+        Evaluate the current mode in context.
+        Args:
+            sampled_rate(float): The sampled rate used to sample partial data
+            for evaluation. None means using all data in eval_reader. default: None.
+            cached_id(int): The id of dataset sampled. Evaluations with same
+                            cached_id use the same sampled dataset. default: 0.
+        """
+        _logger.info('Running evaluation')
+        assert self.eval_graph is not None
+        assert self.eval_reader is not None
+        eval_graph = self.eval_graph.clone(for_test=True)
+
+        executor = SlimGraphExecutor(self.place)
+        results = []
+        batch_id = 0
+        s_time = time.time()
+        reader = self.eval_reader
+        if sampled_rate:
+            reader = cached_reader(reader, sampled_rate, self.cache_path,
+                                   cached_id)
+        for data in reader():
+            result = executor.run(eval_graph, self.scope, data=data)
+            result = [np.mean(r) for r in result]
+            results.append(result)
+            if batch_id % 20 == 0:
+                _logger.info("batch-{}; {}={}".format(
+                    batch_id, eval_graph.out_nodes.keys(), result))
+            batch_id += 1
+        result = np.mean(np.array(results), axis=0)
+        _logger.info("Final eval result: {}={}".format(
+            eval_graph.out_nodes.keys(), result))
+        if not isinstance(result, Iterable):
+            result = [result]
+        _logger.info('Finish evaluation')
+        return result, eval_graph.out_nodes.keys()
+
+    def put(self, key, value):
+        self.k_v[key] = value
+
+    def get(self, key):
+        return self.k_v.get(key)
+
+
+class Compressor(object):
+    """
+    The pass used to compress model.
+    """
+
+    def __init__(self,
+                 place,
+                 scope,
+                 train_program,
+                 train_reader=None,
+                 train_feed_list=None,
+                 train_fetch_list=None,
+                 eval_program=None,
+                 eval_reader=None,
+                 eval_feed_list=None,
+                 eval_fetch_list=None,
+                 teacher_programs=[],
+                 checkpoint_path='./checkpoints',
+                 train_optimizer=None,
+                 distiller_optimizer=None):
+        """
+        Args:
+            place(fluid.Place): The device place where the compression job running.
+            scope(fluid.core.Scope): The scope used to run graph.
+            train_program(Program): The main program to be compressed. It must have loss op.
+            train_reader: The data reader used for training.
+            train_feed_list(dict): A dict to indicate the input variable of the training program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            train_fetch_list(dict): A dict to indicate the output variable of the training program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            eval_program(Program): The program used for evaluation.
+            eval_reader: The data reader used for evaluation.
+            eval_feed_list(dict): A dict to indicate the input variable of the evaluation program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            eval_fetch_list(dict): A dict to indicate the output variable of the evaluation program.
+                                   The key is user-defined and human-readable name.
+                                   The value is the name of Variable.
+            teacher_programs: The teacher graphs used in distillation strategies.
+            train_optimizer: The optimizer used to append backward ops and
+                             optimization ops into train_graph.
+            distiller_optimizer: The optimizer used by distillation strategies. In distillation strategy,
+                                 this optimizer is used to minimize the combined loss of student-net and
+                                 teacher-net while train_optimizer is used to minimize loss of
+                                 student-net in fine-tune stage. 
+
+        """
+        assert isinstance(
+            train_feed_list, list
+        ), "train_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]"
+        assert isinstance(
+            eval_feed_list, list
+        ), "eval_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]"
+        self.strategies = []
+        self.epoch = 0
+        self.place = CPUPlace() if place is None else place
+        self.scope = scope
+        self.train_graph = GraphWrapper(
+            train_program, in_nodes=train_feed_list, out_nodes=train_fetch_list)
+        self.eval_graph = GraphWrapper(
+            eval_program, in_nodes=eval_feed_list, out_nodes=eval_fetch_list)
+        self.train_reader = train_reader
+        self.eval_reader = eval_reader
+        self.teacher_graphs = []
+        for teacher in teacher_programs:
+            self.teacher_graphs.append(GraphWrapper(teacher))
+
+        self.checkpoint = None
+        self.checkpoint_path = checkpoint_path
+        self.eval_epoch = 1
+
+        self.train_optimizer = train_optimizer
+        self.distiller_optimizer = distiller_optimizer
+        self.init_model = None
+
+    def _add_strategy(self, strategy):
+        """
+        Add a strategy to current compress pass.
+        Args:
+            strategy: The strategy to be added into current compress pass.
+        """
+        self.strategies.append(strategy)
+        self.epoch = max(strategy.end_epoch, self.epoch)
+
+    def config(self, config_file):
+        """
+        Configure the compress pass from file with yaml format.
+        Args:
+            config_file(str): The config file in local file system.
+        """
+        factory = ConfigFactory(config_file)
+        self.epoch = factory.compressor['epoch']
+        for strategy in factory.compressor['strategies']:
+            self._add_strategy(strategy)
+        if 'checkpoint_path' in factory.compressor:
+            self.checkpoint_path = factory.compressor['checkpoint_path']
+
+        if 'init_model' in factory.compressor:
+            self.init_model = factory.compressor['init_model']
+
+    def _init_model(self, context):
+        """
+        Load model that has been compressed. 
+        """
+        if self.init_model and os.path.exists(self.init_model):
+            exe = SlimGraphExecutor(context.place)
+            with scope_guard(context.scope):
+                context.train_graph.load_persistables(self.init_model, exe)
+            flops = context.eval_graph.flops()
+            conv_flops = context.eval_graph.flops(only_conv=True)
+            context.eval_graph.update_param_shape(context.scope)
+            context.eval_graph.update_groups_of_conv()
+            _logger.info("conv flops: -{}".format(1 - float(
+                context.eval_graph.flops(only_conv=True)) / conv_flops))
+            _logger.info("total flops: -{}".format(1 - float(
+                context.eval_graph.flops()) / flops))
+            context.train_graph.update_param_shape(context.scope)
+            context.train_graph.update_groups_of_conv()
+            context.train_graph.infer_shape()
+            _logger.info("Init model from: {}".format(self.init_model))
+
+    def _load_checkpoint(self, context):
+        """
+        Load checkpoints from file.
+        """
+        _logger.debug('_load_checkpoint')
+        strategies = self.strategies
+        if self.checkpoint_path:
+            if not os.path.exists(self.checkpoint_path):
+                _logger.warning("Checkpints path doesn't exist: [{}]".format(
+                    self.checkpoint_path))
+                return context, strategies
+            checkpoints = [
+                dir for dir in os.listdir(self.checkpoint_path)
+                if os.path.isdir(os.path.join(self.checkpoint_path, dir))
+            ]
+            _logger.debug('self.checkpoint_path: {}'.format(
+                self.checkpoint_path))
+            _logger.info('checkpoints: {}'.format(checkpoints))
+            if len(checkpoints) > 0:
+                latest = max([int(ck) for ck in checkpoints])
+                latest_ck_path = os.path.join(self.checkpoint_path, str(latest))
+
+                model_path = os.path.join(latest_ck_path, 'model')
+                context_path = os.path.join(latest_ck_path, 'context')
+                strategy_path = os.path.join(latest_ck_path, 'strategies')
+                if os.path.exists(context_path):
+                    context.from_file(context_path)
+                    context.epoch_id += 1
+                if os.path.exists(strategy_path):
+                    with open(strategy_path, 'rb') as strategy_file:
+                        if sys.version_info < (3, 0):
+                            strategies = pickle.load(strategy_file)
+                        else:
+                            strategies = pickle.load(
+                                strategy_file, encoding='bytes')
+
+                if os.path.exists(model_path):
+                    exe = SlimGraphExecutor(context.place)
+                    with scope_guard(context.scope):
+                        context.optimize_graph.load_persistables(model_path,
+                                                                 exe)
+                    context.optimize_graph.update_param_shape(context.scope)
+                    context.optimize_graph.update_groups_of_conv()
+                    context.eval_graph.update_param_shape(context.scope)
+                    context.eval_graph.update_groups_of_conv()
+                    _logger.info("Loaded params from: {}".format(model_path))
+        return context, strategies
+
+    def _save_checkpoint(self, context):
+        """
+        Save checkpoints to file.
+        """
+        if context.epoch_id % 1 == 0 and self.checkpoint_path:
+            checkpoint_path = os.path.join(self.checkpoint_path,
+                                           str(context.epoch_id))
+            model_path = os.path.join(checkpoint_path, 'model')
+            context_path = os.path.join(checkpoint_path, 'context')
+            strategy_path = os.path.join(checkpoint_path, 'strategies')
+            if not os.path.isdir(model_path):
+                os.makedirs(model_path)
+            exe = SlimGraphExecutor(context.place)
+            with scope_guard(context.scope):
+                context.optimize_graph.save_persistables(model_path, exe)
+            context.to_file(context_path)
+            with open(strategy_path, 'wb') as strategy_file:
+                pickle.dump(self.strategies, strategy_file)
+            _logger.info('Saved checkpoint to: {}'.format(checkpoint_path))
+
+    def _train_one_epoch(self, context):
+        """
+        Train one epoch.
+        """
+
+        executor = SlimGraphExecutor(self.place)
+
+        if context.optimize_graph.compiled_graph is None:
+            context.optimize_graph.compiled_graph = compiler.CompiledProgram(
+                context.optimize_graph.program).with_data_parallel(
+                    loss_name=context.optimize_graph.out_nodes['loss'])
+
+        for data in context.train_reader():
+            for strategy in self.strategies:
+                strategy.on_batch_begin(context)
+            results = executor.run(context.optimize_graph,
+                                   context.scope,
+                                   data=data)
+            results = [float(np.mean(result)) for result in results]
+            if context.batch_id % 20 == 0:
+                _logger.info("epoch:{}; batch_id:{}; {} = {}".format(
+                    context.epoch_id, context.batch_id,
+                    context.optimize_graph.out_nodes.keys(
+                    ), [round(r, 3) for r in results]))
+            for strategy in self.strategies:
+                strategy.on_batch_end(context)
+            context.batch_id += 1
+        context.batch_id = 0
+
+    def _eval(self, context):
+        """
+        Runing evaluation.
+        """
+        results, names = context.run_eval_graph()
+        for name, result in zip(names, results):
+            if name not in context.eval_results:
+                context.eval_results[name] = []
+            context.eval_results[name].append(result)
+
+    def run(self):
+        """
+        Execute compressiong pass.
+        """
+        context = Context(
+            place=self.place,
+            scope=self.scope,
+            train_graph=self.train_graph,
+            train_reader=self.train_reader,
+            eval_graph=self.eval_graph,
+            eval_reader=self.eval_reader,
+            teacher_graphs=self.teacher_graphs,
+            train_optimizer=self.train_optimizer,
+            distiller_optimizer=self.distiller_optimizer)
+        self.context = context
+        if self.teacher_graphs:
+            context.put('teachers', self.teacher_graphs)
+        self._init_model(context)
+        if not context.optimize_graph:
+            if context.train_optimizer:
+                context.train_optimizer._name = 'train_opt'
+                context.optimize_graph = context.train_graph.get_optimize_graph(
+                    context.train_optimizer, context.place, context.scope)
+            else:
+                context.optimize_graph = context.train_graph
+
+        context, self.strategies = self._load_checkpoint(context)
+
+        for strategy in self.strategies:
+            strategy.on_compression_begin(context)
+        start = context.epoch_id
+        self._eval(context)
+        for epoch in range(start, self.epoch):
+            context.epoch_id = epoch
+            for strategy in self.strategies:
+                strategy.on_epoch_begin(context)
+            self._train_one_epoch(context)
+            for strategy in self.strategies:
+                strategy.on_epoch_end(context)
+            if self.eval_epoch and epoch % self.eval_epoch == 0:
+                self._eval(context)
+            self._save_checkpoint(context)
+        for strategy in self.strategies:
+            strategy.on_compression_end(context)
+        return context.eval_graph
diff --git a/python/paddle/fluid/contrib/slim/core/config.py b/python/paddle/fluid/contrib/slim/core/config.py
index 811c45700376aff9883fe197007b582f63817f03..9bb395aee95b5236850ca51096ed870ab1d27b62 100644
--- a/python/paddle/fluid/contrib/slim/core/config.py
+++ b/python/paddle/fluid/contrib/slim/core/config.py
@@ -17,8 +17,9 @@ import funcsigs
 import yaml
 from collections import OrderedDict
 from ..prune import *
-from .compress_pass import *
+from ..quantization import *
 from .strategy import *
+from ..distillation import *
 
 __all__ = ['ConfigFactory']
 """This factory is used to create instances by loading and parsing configure file with yaml format.
@@ -29,15 +30,10 @@ class ConfigFactory(object):
     def __init__(self, config):
         """Init a factory from configure file."""
         self.instances = {}
+        self.compressor = {}
         self.version = None
         self._parse_config(config)
 
-    def get_compress_pass(self):
-        """
-        Get compress pass from factory.
-        """
-        return self.instance('compress_pass')
-
     def instance(self, name):
         """
         Get instance from factory.
@@ -59,8 +55,16 @@ class ConfigFactory(object):
             args = {}
             for key in keys:
                 value = attrs[key]
+                if isinstance(value, str) and value.lower() == 'none':
+                    value = None
                 if isinstance(value, str) and value in self.instances:
                     value = self.instances[value]
+                if isinstance(value, list):
+                    for i in range(len(value)):
+                        if isinstance(value[i],
+                                      str) and value[i] in self.instances:
+                            value[i] = self.instances[value[i]]
+
                 args[key] = value
             self.instances[name] = class_(**args)
         return self.instances.get(name)
@@ -76,16 +80,23 @@ class ConfigFactory(object):
                     assert self.version == int(key_values['version'])
 
                 # parse pruners
-                if key == 'pruners' or key == 'strategies':
+                if key == 'distillers' or key == 'pruners' or key == 'quantizers' or key == 'strategies':
                     instances = key_values[key]
                     for name in instances:
                         self._new_instance(name, instances[name])
 
-                if key == 'compress_pass':
-                    compress_pass = self._new_instance(key, key_values[key])
-                    for name in key_values[key]['strategies']:
-                        strategy = self.instance(name)
-                        compress_pass.add_strategy(strategy)
+                if key == 'compressor':
+                    self.compressor['strategies'] = []
+                    self.compressor['epoch'] = key_values[key]['epoch']
+                    if 'init_model' in key_values[key]:
+                        self.compressor['init_model'] = key_values[key][
+                            'init_model']
+                    self.compressor['checkpoint_path'] = key_values[key][
+                        'checkpoint_path']
+                    if 'strategies' in key_values[key]:
+                        for name in key_values[key]['strategies']:
+                            strategy = self.instance(name)
+                            self.compressor['strategies'].append(strategy)
 
                 if key == 'include':
                     for config_file in key_values[key]:
diff --git a/python/paddle/fluid/contrib/slim/core/pass_builder.py b/python/paddle/fluid/contrib/slim/core/pass_builder.py
deleted file mode 100644
index fc1ddc94e04f1d606292071ba7e5cc74fedd5d36..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/core/pass_builder.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .compress_pass import CompressPass
-from .config import ConfigFactory
-
-__all__ = ['build_compressor']
-
-
-def build_compressor(place=None,
-                     data_reader=None,
-                     data_feeder=None,
-                     scope=None,
-                     metrics=None,
-                     epoch=None,
-                     config=None):
-    if config is not None:
-        factory = ConfigFactory(config)
-        comp_pass = factory.get_compress_pass()
-    else:
-        comp_pass = CompressPass()
-    comp_pass.place = place
-    comp_pass.data_reader = data_reader
-    comp_pass.data_feeder = data_feeder
-    comp_pass.scope = scope
-    comp_pass.metrics = metrics
-    comp_pass.epoch = epoch
-    return comp_pass
diff --git a/python/paddle/fluid/contrib/slim/core/strategy.py b/python/paddle/fluid/contrib/slim/core/strategy.py
index 74d98e98b0c390599acfaefeb0636a599b46d391..28bf24f4e341dd528d2cd25f6fb24543886150d6 100644
--- a/python/paddle/fluid/contrib/slim/core/strategy.py
+++ b/python/paddle/fluid/contrib/slim/core/strategy.py
@@ -20,7 +20,7 @@ class Strategy(object):
     Base class for all strategies.
     """
 
-    def __init__(self, start_epoch=0, end_epoch=10):
+    def __init__(self, start_epoch=0, end_epoch=0):
         """
         Args:
             start_epoch: The first epoch to apply the strategy.
@@ -29,7 +29,7 @@ class Strategy(object):
         self.start_epoch = start_epoch
         self.end_epoch = end_epoch
 
-    def on_compress_begin(self, context):
+    def on_compression_begin(self, context):
         pass
 
     def on_epoch_begin(self, context):
@@ -44,5 +44,5 @@ class Strategy(object):
     def on_batch_end(self, context):
         pass
 
-    def on_compress_end(self, context):
+    def on_compression_end(self, context):
         pass
diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml b/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
deleted file mode 100644
index ea888fa2c74a23b4769f75dce6a776afcca41a51..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/demo/filter_prune/config.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-version: 1.0
-pruners:
-    pruner_1:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.3
-            'conv1_2.w': 0.4
-            '*': 0.9
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
-strategies:
-    strategy_1:
-        class: 'SensitivePruneStrategy'
-        pruner: 'pruner_1'
-        start_epoch: 0
-        end_epoch: 10
-        delta_rate: 0.20
-        acc_loss_threshold: 0.2
-        sensitivities:
-            'conv1_1.w': 0.4
-
-compress_pass:
-    class: 'CompressPass'
-    epoch: 100
-    strategies:
-        - strategy_1
diff --git a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py b/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
deleted file mode 100644
index 21c59c0c9d2d9b76932ab6eeff73754940a3bfa0..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/demo/filter_prune/demo.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import paddle
-import os
-import sys
-from paddle.fluid.contrib.slim import CompressPass
-from paddle.fluid.contrib.slim import build_compressor
-from paddle.fluid.contrib.slim import ImitationGraph
-
-
-class LinearModel(object):
-    def __init__(slef):
-        pass
-
-    def train(self):
-        train_program = fluid.Program()
-        startup_program = fluid.Program()
-        startup_program.random_seed = 10
-        with fluid.program_guard(train_program, startup_program):
-            x = fluid.layers.data(name='x', shape=[13], dtype='float32')
-            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-            predict = fluid.layers.fc(input=x, size=1, act=None)
-            cost = fluid.layers.square_error_cost(input=predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
-            eval_program = train_program.clone()
-            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-            sgd_optimizer.minimize(avg_cost)
-
-        train_reader = paddle.batch(
-            paddle.dataset.uci_housing.train(), batch_size=1)
-        eval_reader = paddle.batch(
-            paddle.dataset.uci_housing.test(), batch_size=1)
-        place = fluid.CPUPlace()
-        train_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-        eval_feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-        exe = fluid.Executor(place)
-        exe.run(startup_program)
-        train_metrics = {"loss": avg_cost.name}
-        eval_metrics = {"loss": avg_cost.name}
-
-        graph = ImitationGraph(train_program)
-        config = './config.yaml'
-        comp_pass = build_compressor(
-            place,
-            data_reader=train_reader,
-            data_feeder=train_feeder,
-            scope=fluid.global_scope(),
-            metrics=train_metrics,
-            epoch=1,
-            config=config)
-        comp_pass.apply(graph)
-
-
-if __name__ == "__main__":
-    model = LinearModel()
-    model.train()
diff --git a/python/paddle/fluid/contrib/slim/distillation/__init__.py b/python/paddle/fluid/contrib/slim/distillation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..455c7c563318daec42892e71dcf0a48f22f376a1
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/distillation/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import distiller
+from .distiller import *
+from . import distillation_strategy
+from .distillation_strategy import *
+
+__all__ = distiller.__all__
+__all__ += distillation_strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f11f07a51e713d42cee5e63bd2a9a02d82232f7
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..core.strategy import Strategy
+from ....framework import Program, program_guard
+from .... import Executor
+import logging
+
+__all__ = ['DistillationStrategy']
+
+logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
+_logger = logging.getLogger(__name__)
+_logger.setLevel(logging.INFO)
+
+
+class DistillationStrategy(Strategy):
+    def __init__(self, distillers=None, start_epoch=0, end_epoch=0):
+        """
+        Args:
+            distillers(list): A list of distiller used to combine student graph and teacher graph
+                              by adding some loss.
+            start_epoch(int): The epoch when to merge student graph and teacher graph for
+                              distillation training. default: 0
+            end_epoch(int): The epoch when to finish distillation training. default: 0
+            
+        """
+        super(DistillationStrategy, self).__init__(start_epoch, end_epoch)
+        self.distillers = distillers
+
+    def on_compression_begin(self, context):
+        # load from checkpoint
+        if context.epoch_id > 0:
+            if context.epoch_id > self.start_epoch and context.epoch_id < self.end_epoch:
+                _logger.info('Restore DistillationStrategy')
+                self._create_distillation_graph(context)
+                _logger.info('Restore DistillationStrategy finish.')
+
+    def on_epoch_begin(self, context):
+        if self.start_epoch == context.epoch_id:
+            _logger.info('DistillationStrategy::on_epoch_begin.')
+            self._create_distillation_graph(context)
+            _logger.info('DistillationStrategy set optimize_graph.')
+
+    def _create_distillation_graph(self, context):
+        """
+        step 1: Merge student graph and teacher graph into distillation graph.
+        step 2: Add loss into distillation graph by distillers.
+        step 3: Append backward ops and optimize ops into distillation graph for training.
+        """
+        # step 1
+        teacher = context.teacher_graphs[0]
+        for var in teacher.program.list_vars():
+            var.stop_gradient = True
+        graph = context.train_graph.clone()
+        graph.merge(teacher)
+        graph.out_nodes['student_loss'] = graph.out_nodes['loss']
+
+        # step 2
+        for distiller in self.distillers:
+            graph = distiller.distiller_loss(graph)
+
+        # step 3
+        startup_program = Program()
+        with program_guard(graph.program, startup_program):
+            context.distiller_optimizer._name = 'distillation_optimizer'
+            context.distiller_optimizer.minimize(
+                graph.var(graph.out_nodes['loss'])._var)
+        exe = Executor(context.place)
+        exe.run(startup_program, scope=context.scope)
+
+        # backup graph for fine-tune after distillation
+        context.put('distillation_backup_optimize_graph',
+                    context.optimize_graph)
+        context.optimize_graph = graph
+
+    def on_epoch_end(self, context):
+        if context.epoch_id == (self.end_epoch - 1):
+            _logger.info('DistillationStrategy::on_epoch_end.')
+            # restore optimize_graph for fine-tune or other strategy in next stage.
+            context.optimize_graph = context.get(
+                'distillation_backup_optimize_graph')
+            _logger.info(
+                'DistillationStrategy set context.optimize_graph to None.')
diff --git a/python/paddle/fluid/contrib/slim/distillation/distiller.py b/python/paddle/fluid/contrib/slim/distillation/distiller.py
new file mode 100644
index 0000000000000000000000000000000000000000..13bb35a8be73ed29e907308d08a33cdc13dee069
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/distillation/distiller.py
@@ -0,0 +1,188 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .... import layers
+from .... import optimizer
+from .... import Executor
+from .... import Program
+from .... import program_guard
+from .... import regularizer
+
+__all__ = ['FSPDistiller', 'L2Distiller']
+
+
+class L2Distiller(object):
+    """
+    Combine two layers from student net and teacher net by l2-loss.
+    And add the loss into the total loss using for distillation training.
+    """
+
+    def __init__(self,
+                 student_feature_map,
+                 teacher_feature_map,
+                 distillation_loss_weight=1):
+        """
+        Args:
+            student_feature_map(str): The name of feature map from student network.
+            teacher_feature_map(str): The name of feature map from teacher network.
+                                      It's shape should be the same with student network.
+            distillation_loss_weight(float): The weight of the l2-loss.
+        """
+        self.student_feature_map = student_feature_map
+        self.teacher_feature_map = teacher_feature_map
+        self.distillation_loss_weight = distillation_loss_weight
+
+    def distiller_loss(self, graph):
+        """
+        Modify graph inplace to add l2-loss.
+        Args: 
+            graph(GraphWrapper): The graph to be modified.
+        Returns:
+            GraphWrapper: The modified graph.
+        """
+        distiller_pass = L2DistillerPass(self.student_feature_map,
+                                         self.teacher_feature_map,
+                                         self.distillation_loss_weight)
+        dis_graph = distiller_pass.apply(graph)
+        return dis_graph
+
+
+class L2DistillerPass(object):
+    """
+    The pass used to add l2-loss.
+    """
+
+    def __init__(self,
+                 student_feature_map,
+                 teacher_feature_map,
+                 distillation_loss_weight=1):
+        """
+        Args:
+            student_feature_map(str): The name of feature map from student network.
+            teacher_feature_map(str): The name of feature map from teacher network.
+                                      It's shape should be the same with student network.
+            distillation_loss_weight(float): The weight of the l2-loss.
+        """
+        self.student_feature_map = student_feature_map
+        self.teacher_feature_map = teacher_feature_map
+        self.distillation_loss_weight = distillation_loss_weight
+
+    def apply(self, graph):
+        ret_graph = graph
+        with program_guard(ret_graph.program):
+
+            student_feature_map = ret_graph.var(self.student_feature_map)._var
+            teacher_feature_map = ret_graph.var(self.teacher_feature_map)._var
+            l2loss = layers.reduce_mean(
+                layers.square(student_feature_map - teacher_feature_map))
+
+            distillation_loss = l2loss * self.distillation_loss_weight
+            student_loss = ret_graph.var(ret_graph.out_nodes['loss'])._var
+            loss = distillation_loss + student_loss
+
+            ret_graph.out_nodes[
+                'l2loss_' + self.student_feature_map + "_" +
+                self.teacher_feature_map] = distillation_loss.name
+            ret_graph.out_nodes['loss'] = loss.name
+        return ret_graph
+
+
+class FSPDistiller(object):
+    """
+    Combine layers from student net and teacher net by fsp-loss.
+    """
+
+    def __init__(self, student_pairs, teacher_pairs,
+                 distillation_loss_weight=1):
+        """
+        Args:
+            student_pairs(list<tuple>): Each tuple, with two variable names, in student_pairs indicates
+                                        a section in student network. The variables in a tuple should
+                                        have the same feature map size.
+            teacher_pairs(list<tuple>): Each tuple, with two variable names, in teacher_pairs indicates
+                                        a section in teacher network. The variables in a tuple should
+                                        have the same feature map size. Varibale named teacher_pairs[i][j]
+                                        should has the save channel number with that of variable named 
+                                        student_pairs[i][j].
+
+            distillation_loss_weight(float): The weight of the fsp-loss. default: 1.
+        """
+        self.student_pairs = student_pairs
+        self.teacher_pairs = teacher_pairs
+        self.distillation_loss_weight = distillation_loss_weight
+
+    def distiller_loss(self, graph):
+        """
+        Modify graph inplace to add fsp-loss.
+        Args: 
+            graph(GraphWrapper): The graph to be modified.
+        Returns:
+            GraphWrapper: The modified graph.
+        """
+        distiller_pass = FSPDistillerPass(self.student_pairs,
+                                          self.teacher_pairs,
+                                          self.distillation_loss_weight)
+        dis_graph = distiller_pass.apply(graph)
+        return dis_graph
+
+
+class FSPDistillerPass(object):
+    '''
+    Combine layers from student net and teacher net by fsp-loss.
+    '''
+
+    def __init__(self, s_pairs, t_pairs, distillation_loss_weight=1):
+        """
+        Args:
+            s_pairs(list<tuple>): Each tuple, with two variable names, in student_pairs indicates
+                                        a section in student network. The variables in a tuple should
+                                        have the same feature map size.
+            t_pairs(list<tuple>): Each tuple, with two variable names, in teacher_pairs indicates
+                                        a section in teacher network. The variables in a tuple should
+                                        have the same feature map size. Varibale named teacher_pairs[i][j]
+                                        should has the save channel number with that of variable named 
+                                        student_pairs[i][j].
+
+            distillation_loss_weight(float): The weight of the fsp-loss. default: 1.
+        """
+        self.s_pairs = s_pairs
+        self.t_pairs = t_pairs
+        self.distillation_loss_weight = distillation_loss_weight
+
+    def apply(self, graph):
+        ret_graph = graph
+        with program_guard(ret_graph.program):
+            losses = []
+            for s_pair, t_pair in zip(self.s_pairs, self.t_pairs):
+                s_pair_start = ret_graph.var(s_pair[0])._var
+                s_pair_end = ret_graph.var(s_pair[1])._var
+                s_fsp_matrix = self._fsp_matrix(s_pair_start, s_pair_end)
+                t_pair_start = ret_graph.var(t_pair[0])._var
+                t_pair_end = ret_graph.var(t_pair[1])._var
+                t_fsp_matrix = self._fsp_matrix(t_pair_start, t_pair_end)
+                l2_loss = layers.reduce_mean(
+                    layers.square(s_fsp_matrix - t_fsp_matrix))
+                losses.append(l2_loss)
+            distillation_loss = layers.sum(
+                losses) * self.distillation_loss_weight
+            student_loss = ret_graph.var(ret_graph.out_nodes['loss'])._var
+            loss = distillation_loss + student_loss
+
+            ret_graph.out_nodes[
+                'fsp_distillation_loss'] = distillation_loss.name
+            ret_graph.out_nodes['loss'] = loss.name
+        return ret_graph
+
+    def _fsp_matrix(self, fea_map_0, fea_map_1):
+        return layers.fsp_matrix(fea_map_0, fea_map_1)
diff --git a/python/paddle/fluid/contrib/slim/graph/__init__.py b/python/paddle/fluid/contrib/slim/graph/__init__.py
index d65472d193b639f0766e278ec14b5dc36c5d62bc..c5d1c4dbdfb208ea66bb3dc315e502309799492e 100644
--- a/python/paddle/fluid/contrib/slim/graph/__init__.py
+++ b/python/paddle/fluid/contrib/slim/graph/__init__.py
@@ -14,10 +14,7 @@
 
 from . import executor
 from .executor import *
-from . import graph
-from .graph import *
-from . import graph_pass
-from .graph_pass import *
+from . import graph_wrapper
+from .graph_wrapper import *
 __all__ = executor.__all__
-__all__ += graph.__all__
-__all__ += graph_pass.__all__
+__all__ += graph_wrapper.__all__
diff --git a/python/paddle/fluid/contrib/slim/graph/executor.py b/python/paddle/fluid/contrib/slim/graph/executor.py
index c02c3af82013287bf19e1869cb60dc65239b720a..70438a90eb790e7ca5d00be0bc09efc6c00cafe4 100644
--- a/python/paddle/fluid/contrib/slim/graph/executor.py
+++ b/python/paddle/fluid/contrib/slim/graph/executor.py
@@ -12,51 +12,46 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import abc
-from abc import abstractmethod
+from ....compiler import CompiledProgram
+from ....data_feeder import DataFeeder
 from .... import executor
-from .graph import IRGraph, ImitationGraph
+from .graph_wrapper import GraphWrapper
 
-__all__ = ['get_executor']
+__all__ = ['SlimGraphExecutor']
 
 
-class GraphExecutor(object):
-    __metaclass__ = abc.ABCMeta
+class SlimGraphExecutor(object):
+    """
+    Wrapper of executor used to run GraphWrapper.
+    """
 
     def __init__(self, place):
-        self.place = place
-
-    @abstractmethod
-    def run(self, graph, feches=None, feed=None):
-        pass
-
-
-class IRGraphExecutor(GraphExecutor):
-    def run(self, grah, fetches, feed=None):
-        pass
-
-
-class ImitationGraphExecutor(GraphExecutor):
-    def __init__(self, place):
-        super(ImitationGraphExecutor, self).__init__(place)
         self.exe = executor.Executor(place)
+        self.place = place
 
-    def run(self, graph, scope=None, fetches=None, feed=None):
-        assert isinstance(graph, ImitationGraph)
-        fetch_list = None
-        if fetches:
-            fetch_list = [
-                graph.program.global_block().var(name) for name in fetches
-            ]
-        results = self.exe.run(graph.program,
+    def run(self, graph, scope, data=None):
+        """
+        Runing a graph with a batch of data.
+        Args:
+            graph(GraphWrapper): The graph to be executed.
+            scope(fluid.core.Scope): The scope to be used.
+            data(list<tuple>): A batch of data. Each tuple in this list is a sample.
+                               It will feed the items of tuple to the in_nodes of graph.
+        Returns:
+            results(list): A list of result with the same order indicated by graph.out_nodes.
+        """
+        assert isinstance(graph, GraphWrapper)
+        if data is not None:
+            feeder = DataFeeder(
+                feed_list=graph.in_nodes.values(),
+                place=self.place,
+                program=graph.program)
+            feed = feeder.feed(data)
+
+        fetch_list = graph.out_nodes.values()
+        program = graph.compiled_graph if graph.compiled_graph else graph.program
+        results = self.exe.run(program,
                                scope=scope,
                                fetch_list=fetch_list,
                                feed=feed)
         return results
-
-
-def get_executor(graph, place):
-    if isinstance(graph, ImitationGraph):
-        return ImitationGraphExecutor(place)
-    if isinstance(graph, IRGraph):
-        return IRGraphExecutor(place)
diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py
deleted file mode 100644
index f38d9783413a01cd1005a014c0aba5ecf5cc79c2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/graph/graph.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import os
-import subprocess
-from ....framework import Program
-from ....framework import Block
-from .... import core
-
-__all__ = ['Graph', 'ImitationGraph', 'IRGraph']
-
-
-class Graph(object):
-    """
-    Base class for all graph.
-    """
-
-    def __init__(self):
-        pass
-
-    def all_parameters(self):
-        """
-        Return all the parameters in current graph.
-        """
-        pass
-
-
-class ImitationGraph(Graph):
-    def __init__(self, program=None):
-        super(ImitationGraph, self).__init__()
-        self.program = Program() if program is None else program
-
-    def all_parameters(self):
-        return self.program.global_block().all_parameters()
-
-
-class IRGraph(Graph):
-    pass
diff --git a/python/paddle/fluid/contrib/slim/graph/graph_pass.py b/python/paddle/fluid/contrib/slim/graph/graph_pass.py
deleted file mode 100644
index 1db6c4f110daa44be7fcbcc36f47224797b6dc88..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/graph/graph_pass.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-__all__ = ['GraphPass', 'PruneParameterPass']
-
-
-class GraphPass(object):
-    """
-    Base class for all graph pass.
-    """
-
-    def __init__(self):
-        pass
-
-    def apply(self, graph):
-        pass
-
-
-class PruneParameterPass(GraphPass):
-    """
-    Generate a graph for pruning parameters from target graph.
-    """
-
-    def __init__(self, pruned_params, thresholds):
-        super(PruneParameterPass, self).__init__()
-        self.pruned_params = pruned_params
-        self.thresholds = thresholds
-        self.default_threshold = thresholds['*']
-
-    def apply(self, graph):
-        pass
diff --git a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..c208553fd811c7b18f9168b8fcae4da6e5856070
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
@@ -0,0 +1,502 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import OrderedDict
+from .... import io
+from .... import compiler
+from ....framework import Program
+from ....framework import program_guard
+from ....framework import Parameter
+from ....framework import Variable
+from ....executor import Executor
+import copy
+from collections import Iterable
+from ....io import save_inference_model, load_inference_model, save_persistables
+import numpy as np
+import pickle
+import os
+
+__all__ = ['GraphWrapper', 'VarWrapper', 'OpWrapper']
+
+OPTIMIZER_OPS = [
+    'momentum',
+    'lars_momentum',
+    'adagrad',
+    'adam',
+    'adamax',
+    'decayed_adagrad',
+    'adadelta',
+    'rmsprop',
+]
+
+
+class VarWrapper(object):
+    def __init__(self, var, graph):
+        assert isinstance(var, Variable)
+        assert isinstance(graph, GraphWrapper)
+        self._var = var
+        self._graph = graph
+
+    def __eq__(self, v):
+        """
+        Overwrite this function for ...in... syntax in python.
+        """
+        return self._var.name == v._var.name
+
+    def name(self):
+        """
+        Get the name of the variable.
+        """
+        return self._var.name
+
+    def shape(self):
+        """
+        Get the shape of the varibale.
+        """
+        return self._var.shape
+
+    def set_shape(self, shape):
+        """
+        Set the shape of the variable.
+        """
+        self._var.desc.set_shape(shape)
+
+    def inputs(self):
+        """
+        Get all the operators that use this variable as output.
+        Returns:
+            list<OpWrapper>: A list of operators.
+        """
+        ops = []
+        for op in self._graph.ops():
+            if self in op.all_inputs():
+                ops.append(op)
+        return ops
+
+    def outputs(self):
+        """
+        Get all the operators that use this variable as input.
+        Returns:
+            list<OpWrapper>: A list of operators.
+        """
+        ops = []
+        for op in self._graph.ops():
+            if self in op.all_outputs():
+                ops.append(op)
+        return ops
+
+
+class OpWrapper(object):
+    def __init__(self, op, graph):
+        assert isinstance(graph, GraphWrapper)
+        self._op = op
+        self._graph = graph
+
+    def __eq__(self, op):
+        """
+        Overwrite this function for ...in... syntax in python.
+        """
+        return self.idx() == op.idx()
+
+    def all_inputs(self):
+        """
+        Get all the input variables of this operator.
+        """
+        return [
+            self._graph.var(var_name) for var_name in self._op.input_arg_names
+        ]
+
+    def all_outputs(self):
+        """
+        Get all the output variables of this operator.
+        """
+        return [
+            self._graph.var(var_name) for var_name in self._op.output_arg_names
+        ]
+
+    def idx(self):
+        """
+        Get the id of this operator.
+        """
+        return self._op.idx
+
+    def type(self):
+        """
+        Get the type of this operator.
+        """
+        return self._op.type
+
+    def is_bwd_op(self):
+        """
+        Whether this operator is backward op.
+        """
+        return self.type().endswith('_grad')
+
+    def is_opt_op(self):
+        """
+        Whether this operator is optimizer op.
+        """
+        return self.type() in OPTIMIZER_OPS
+
+    def inputs(self, name):
+        """
+        Get all the varibales by the input name.
+        """
+        return [self._graph.var(var_name) for var_name in self._op.input(name)]
+
+    def outputs(self, name):
+        """
+        Get all the varibales by the output name.
+        """
+        return [self._graph.var(var_name) for var_name in self._op.output(name)]
+
+    def set_attr(self, key, value):
+        """
+        Set the value of attribute by attribute's name.
+
+        Args:
+            key(str): the attribute name.
+            value(bool|int|str|float|list): the value of the attribute.
+        """
+        self._op._set_attr(key, value)
+
+    def attr(self, name):
+        """
+        Get the attribute by name.
+
+        Args:
+            name(str): the attribute name.
+
+        Returns:
+            bool|int|str|float|list: The attribute value. The return value
+            can be any valid attribute type.
+        """
+        return self._op.attr(name)
+
+
+class GraphWrapper(object):
+    """
+    It is a wrapper of paddle.fluid.framework.IrGraph with some special functions
+    for paddle slim framework.
+    """
+
+    def __init__(self, program=None, in_nodes=[], out_nodes=[]):
+        """
+        Args:
+            program(framework.Program): A program with 
+            in_nodes(dict): A dict to indicate the input nodes of the graph.
+                            The key is user-defined and human-readable name.
+                            The value is the name of Variable.
+            out_nodes(dict): A dict to indicate the input nodes of the graph.
+                            The key is user-defined and human-readable name.
+                            The value is the name of Variable.
+        """
+        super(GraphWrapper, self).__init__()
+        self.program = Program() if program is None else program
+        self.compiled_graph = None
+        self.in_nodes = OrderedDict(in_nodes)
+        self.out_nodes = OrderedDict(out_nodes)
+        self._attrs = OrderedDict()
+
+    def all_parameters(self):
+        """
+        Get all the parameters in this graph.
+        Returns:
+            list<VarWrapper>: A list of VarWrapper instances.
+        """
+        params = []
+        for block in self.program.blocks:
+            for param in block.all_parameters():
+                params.append(VarWrapper(param, self))
+        return params
+
+    def is_parameter(self, var):
+        """
+        Whether the given variable is parameter.
+        Args:
+            var(VarWrapper): The given varibale.
+        """
+        return isinstance(var._var, Parameter)
+
+    def is_persistable(self, var):
+        """
+        Whether the given variable is persistable.
+        Args:
+            var(VarWrapper): The given varibale.
+        """
+        return var._var.persistable
+
+    def compile(self, for_parallel=True, for_test=False):
+        """
+        Compile the program in this wrapper to framework.CompiledProgram for next running.
+        This function must be called if the program is modified.
+        Args:
+            for_parallel(bool): Whether the program to run in data parallel way. default: True.
+            for_test(bool): Whether the compiled program is used for test.
+        """
+        target = self.program
+        if for_test:
+            loss = None
+        else:
+            loss = self.out_nodes['loss']
+        if for_parallel:
+            # disable memory optimize for stable training
+            build_strategy = compiler.BuildStrategy()
+            build_strategy.enable_inplace = False
+            build_strategy.memory_optimize = False
+            self.compiled_graph = compiler.CompiledProgram(
+                target).with_data_parallel(
+                    loss_name=loss, build_strategy=build_strategy)
+        else:
+            self.compiled_graph = compiler.CompiledProgram(target)
+
+    def ops(self):
+        """
+        Return all operator nodes included in the graph as a set.
+        """
+        ops = []
+        for block in self.program.blocks:
+            for op in block.ops:
+                ops.append(OpWrapper(op, self))
+        return ops
+
+    def vars(self):
+        """
+        Get all the variables.
+        """
+        return [VarWrapper(var, self) for var in self.program.list_vars()]
+
+    def var(self, name):
+        """
+        Get the variable by variable name.
+        """
+        return VarWrapper(self.program.global_block().var(name), self)
+
+    def clone(self, for_test=False):
+        """
+        Clone a new graph from current graph.
+        Returns:
+            (GraphWrapper): The wrapper of a new graph.
+        """
+        return GraphWrapper(
+            self.program.clone(for_test),
+            copy.deepcopy(self.in_nodes), copy.deepcopy(self.out_nodes))
+
+    def merge(self, graph):
+        """
+        Merge a graph into current graph.
+        Args:
+            graph(GraphWrapper): The graph to be merged by current graph.
+        """
+        for var in graph.program.list_vars():
+            new_var = self.program.global_block()._clone_variable(
+                var, force_persistable=False)
+            new_var.stop_gradient = var.stop_gradient
+            # TODO: parameters should be cloned
+        for op in graph.ops():
+            op = op._op
+            inputs = {}
+            outputs = {}
+            attrs = {}
+            for input_name in op.input_names:
+                inputs[input_name] = [
+                    self.var(in_var_name)._var
+                    for in_var_name in op.input(input_name)
+                ]
+            for output_name in op.output_names:
+                outputs[output_name] = [
+                    self.var(out_var_name)._var
+                    for out_var_name in op.output(output_name)
+                ]
+            for attr_name in op.attr_names:
+                attrs[attr_name] = op.attr(attr_name)
+            self.program.global_block().append_op(
+                type=op.type, inputs=inputs, outputs=outputs, attrs=attrs)
+
+    def program(self):
+        """
+        Get the program in current wrapper.
+        """
+        return self.program
+
+    def pre_ops(self, op):
+        """
+        Get all the previous operators of target operator.
+        Args:
+            op(OpWrapper): Target operator..
+        Returns:
+            list<OpWrapper>: A list of operators.
+        """
+        ops = []
+        for p in self.ops():
+            for in_var in op.all_inputs():
+                if in_var in p.all_outputs():
+                    ops.append(p)
+        return ops
+
+    def next_ops(self, op):
+        """
+        Get all the next operators of target operator.
+        Args:
+            op(OpWrapper): Target operator..
+        Returns:
+            list<OpWrapper>: A list of operators.
+        """
+        ops = []
+        for p in self.ops():
+            for out_var in op.all_outputs():
+                if out_var in p.all_inputs():
+                    ops.append(p)
+        return ops
+
+    def get_param_by_op(self, op):
+        """
+        Get the parameters used by target operator.
+        """
+        assert isinstance(op, OpWrapper)
+        params = []
+        for var in op.all_inputs():
+            if isinstance(var._var, Parameter):
+                params.append(var)
+        assert len(params) > 0
+        return params
+
+    def numel_params(self):
+        """
+        Get the number of elements in all parameters.
+        """
+        ret = 0
+        for param in self.all_parameters():
+            ret += np.product(param.shape())
+        return ret
+
+    def get_optimize_graph(self, optimizer, place, scope, no_grad_var_names=[]):
+        """
+        Get a new graph for training by appending some backward operators and optimization operators.
+        Args:
+            optimizer: The optimzier used to generate training graph.
+            place: The place to run the graph.
+            scope: The scope used to run the graph. Some new variable will be added into this scope.
+            no_grad_var_names(list<str>): Names of variables that should be ignored while computing gradients. default: [].
+        Returns:
+            (GraphWrapper): The wrapper of new graph with backward ops and optimization ops. 
+        """
+        graph = self.clone()
+        startup_program = Program()
+        with program_guard(
+                main_program=graph.program, startup_program=startup_program):
+            target_name = None
+            if 'loss' in graph.out_nodes:
+                target_name = graph.out_nodes['loss']
+            elif 'cost' in graph.out_nodes:
+                target_name = graph.out_nodes['cost']
+            target = graph.var(target_name)._var
+            optimizer.minimize(target, no_grad_set=no_grad_var_names)
+
+        exe = Executor(place)
+        exe.run(program=startup_program, scope=scope)
+        return graph
+
+    def flops(self, only_conv=False):
+        """
+        Get the flops of current graph.
+        Args:
+            only_conv: Only calculating the conv layers. default: False.
+        Returns:
+            int: The flops of current graph.
+        """
+        flops = 0
+        for op in self.ops():
+            if op.type() in ['conv2d', 'depthwise_conv2d']:
+                filter_shape = op.inputs("Filter")[0].shape()
+                input_shape = op.inputs("Input")[0].shape()
+                output_shape = op.outputs("Output")[0].shape()
+                c_out, c_in, k_h, k_w = filter_shape
+                _, _, h_out, w_out = output_shape
+                groups = op.attr("groups")
+                kernel_ops = k_h * k_w * (c_in / groups)
+                if len(op.inputs("Bias")) > 0:
+                    with_bias = 1
+                else:
+                    with_bias = 0
+                flops += 2 * h_out * w_out * c_out * (kernel_ops + with_bias)
+            elif op.type() == 'pool2d' and not only_conv:
+                input_shape = op.inputs("X")[0].shape()
+                output_shape = op.outputs("Out")[0].shape()
+                _, c_out, h_out, w_out = output_shape
+                k_size = op.attr("ksize")
+                flops += h_out * w_out * c_out * (k_size[0]**2)
+
+            elif op.type() == 'mul' and not only_conv:
+                x_shape = list(op.inputs("X")[0].shape())
+                y_shape = op.inputs("Y")[0].shape()
+                if x_shape[0] == -1:
+                    x_shape[0] = 1
+                flops += 2 * x_shape[0] * x_shape[1] * y_shape[1]
+
+            elif op.type() in ['relu', 'sigmoid', 'batch_norm'
+                               ] and not only_conv:
+                input_shape = list(op.inputs("X")[0].shape())
+                if input_shape[0] == -1:
+                    input_shape[0] = 1
+                flops += np.product(input_shape)
+
+        return flops
+
+    def save_persistables(self, path, exe):
+        """
+        Save all the persistable variables into file.
+        Args:
+            path(str): The path to save the persistables.
+            exe(framework.Executor): The executor used to save the persistables.
+        """
+        io.save_persistables(exe.exe, path, main_program=self.program)
+
+    def load_persistables(self, path, exe):
+        """
+        Load the persistable variables from file.
+        Args:
+            path(str): The path to load the persistables.
+            exe(framework.Executor): The executor used to load the persistables.
+        """
+
+        def if_exist(var):
+            return os.path.exists(os.path.join(path, var.name))
+
+        io.load_vars(
+            exe.exe, path, main_program=self.program, predicate=if_exist)
+
+    def update_param_shape(self, scope):
+        """
+        Update the shape of parameters in the graph according to tensors in scope.
+        It is used after loading pruned parameters from file.
+        """
+        for param in self.all_parameters():
+            tensor_shape = np.array(scope.find_var(param.name()).get_tensor(
+            )).shape
+            param.set_shape(tensor_shape)
+
+    def infer_shape(self):
+        """
+        Update the groups of convolution layer according to current filters.
+        It is used after loading pruned parameters from file.
+        """
+        for op in self.ops():
+            if op.type() != 'conditional_block':
+                op._op.desc.infer_shape(op._op.block.desc)
+
+    def update_groups_of_conv(self):
+        for op in self.ops():
+            if op.type() == 'depthwise_conv2d':
+                op.set_attr('groups', op.inputs('Filter')[0].shape()[0])
diff --git a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
index 34c5107daa3cde10e7995902be37e34e19664da8..7a25c3a61e0815a20fa9b0477a6c69a4f8d2a066 100644
--- a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
+++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py
@@ -13,54 +13,919 @@
 # limitations under the License.
 
 from ..core.strategy import Strategy
-from ....framework import Program, program_guard
+from ..graph import VarWrapper, OpWrapper, GraphWrapper
+from ....framework import Program, program_guard, Parameter
 from .... import layers
+import prettytable as pt
 import numpy as np
+from scipy.optimize import leastsq
+import copy
+import re
+import os
+import pickle
+import logging
+import sys
 
-__all__ = ['SensitivePruneStrategy', 'PruneStrategy']
+__all__ = ['SensitivePruneStrategy', 'UniformPruneStrategy']
 
+logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
+_logger = logging.getLogger(__name__)
+_logger.setLevel(logging.INFO)
+
+
+class PruneStrategy(Strategy):
+    """
+    The base class of all pruning strategies.
+    """
 
-class SensitivePruneStrategy(Strategy):
     def __init__(self,
                  pruner=None,
                  start_epoch=0,
-                 end_epoch=10,
-                 delta_rate=0.20,
-                 acc_loss_threshold=0.2,
-                 sensitivities=None):
-        super(SensitivePruneStrategy, self).__init__(start_epoch, end_epoch)
+                 end_epoch=0,
+                 target_ratio=0.5,
+                 metric_name=None,
+                 pruned_params='conv.*_weights'):
+        """
+        Args:
+            pruner(slim.Pruner): The pruner used to prune the parameters.
+            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
+            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
+            target_ratio(float): The flops ratio to be pruned from current model.
+            metric_name(str): The metric used to evaluate the model.
+                         It should be one of keys in out_nodes of graph wrapper.
+            pruned_params(str): The pattern str to match the parameter names to be pruned.
+        """
+        super(PruneStrategy, self).__init__(start_epoch, end_epoch)
         self.pruner = pruner
-        self.delta_rate = delta_rate
-        self.acc_loss_threshold = acc_loss_threshold
-        self.sensitivities = sensitivities
+        self.target_ratio = target_ratio
+        self.metric_name = metric_name
+        self.pruned_params = pruned_params
+        self.pruned_list = []
+        self.backup = {}
+        self.param_shape_backup = {}
 
+    def _eval_graph(self, context, sampled_rate=None, cached_id=0):
+        """
+        Evaluate the current mode in context.
+        Args:
+            context(slim.core.Context): The context storing all information used to evaluate the current model.
+            sampled_rate(float): The sampled rate used to sample partial data for evaluation. None means using all data in eval_reader. default: None.
+            cached_id(int): The id of dataset sampled. Evaluations with same cached_id use the same sampled dataset. default: 0.
+        """
+        results, names = context.run_eval_graph(sampled_rate, cached_id)
+        metric = np.mean(results[list(names).index(self.metric_name)])
+        return metric
 
-class PruneStrategy(Strategy):
+    def _prune_filters_by_ratio(self,
+                                scope,
+                                params,
+                                ratio,
+                                place,
+                                lazy=False,
+                                only_graph=False):
+        """
+        Pruning filters by given ratio.
+        Args:
+            scope(fluid.core.Scope): The scope used to pruning filters.
+            params(list<VarWrapper>): A list of filter parameters.
+            ratio(float): The ratio to be pruned.
+            place(fluid.Place): The device place of filter parameters.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means cutting down the pruned elements.
+            only_graph(bool): True means only modifying the graph.
+                              False means modifying graph and variables in  scope.
+        """
+        if params[0].name() in self.pruned_list[0]:
+            return
+        param_t = scope.find_var(params[0].name()).get_tensor()
+        pruned_idx = self.pruner.cal_pruned_idx(
+            params[0].name(), np.array(param_t), ratio, axis=0)
+        for param in params:
+            assert isinstance(param, VarWrapper)
+            param_t = scope.find_var(param.name()).get_tensor()
+            if lazy:
+                self.backup[param.name()] = copy.deepcopy(np.array(param_t))
+            pruned_param = self.pruner.prune_tensor(
+                np.array(param_t), pruned_idx, pruned_axis=0, lazy=lazy)
+            if not only_graph:
+                param_t.set(pruned_param, place)
+            ori_shape = param.shape()
+            if param.name() not in self.param_shape_backup:
+                self.param_shape_backup[param.name()] = copy.deepcopy(
+                    param.shape())
+            new_shape = list(param.shape())
+            new_shape[0] = pruned_param.shape[0]
+            param.set_shape(new_shape)
+            _logger.debug(
+                '|----------------------------------------+----+------------------------------+------------------------------|'
+            )
+            _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format(
+                str(param.name()), str(0), str(ori_shape), str(param.shape())))
+            self.pruned_list[0].append(param.name())
+        return pruned_idx
+
+    def _prune_parameter_by_idx(self,
+                                scope,
+                                params,
+                                pruned_idx,
+                                pruned_axis,
+                                place,
+                                lazy=False,
+                                only_graph=False):
+        """
+        Pruning parameters in given axis.
+        Args:
+            scope(fluid.core.Scope): The scope storing paramaters to be pruned.
+            params(VarWrapper): The parameter to be pruned.
+            pruned_idx(list): The index of elements to be pruned.
+            pruned_axis(int): The pruning axis.
+            place(fluid.Place): The device place of filter parameters.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means cutting down the pruned elements.
+            only_graph(bool): True means only modifying the graph.
+                              False means modifying graph and variables in  scope.
+        """
+        if params[0].name() in self.pruned_list[pruned_axis]:
+            return
+        for param in params:
+            assert isinstance(param, VarWrapper)
+            param_t = scope.find_var(param.name()).get_tensor()
+            if lazy:
+                self.backup[param.name()] = copy.deepcopy(np.array(param_t))
+            pruned_param = self.pruner.prune_tensor(
+                np.array(param_t), pruned_idx, pruned_axis, lazy=lazy)
+            if not only_graph:
+                param_t.set(pruned_param, place)
+            ori_shape = param.shape()
+            if param.name() not in self.param_shape_backup:
+                self.param_shape_backup[param.name()] = copy.deepcopy(
+                    param.shape())
+            new_shape = list(param.shape())
+            new_shape[pruned_axis] = pruned_param.shape[pruned_axis]
+            param.set_shape(new_shape)
+            _logger.debug(
+                '|----------------------------------------+----+------------------------------+------------------------------|'
+            )
+            _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format(
+                str(param.name()),
+                str(pruned_axis), str(ori_shape), str(param.shape())))
+            self.pruned_list[pruned_axis].append(param.name())
+
+    def _forward_search_related_op(self, graph, param):
+        """
+        Forward search operators that will be affected by pruning of param.
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            param(VarWrapper): The current pruned parameter.
+        Returns:
+            list<OpWrapper>: A list of operators.
+        """
+        assert isinstance(param, VarWrapper)
+        visited = {}
+        for op in graph.ops():
+            visited[op.idx()] = False
+        stack = []
+        for op in graph.ops():
+            if (not op.is_bwd_op()) and (param in op.all_inputs()):
+                stack.append(op)
+        visit_path = []
+        while len(stack) > 0:
+            top_op = stack[len(stack) - 1]
+            if visited[top_op.idx()] == False:
+                visit_path.append(top_op)
+                visited[top_op.idx()] = True
+            next_ops = None
+            if top_op.type() == "conv2d" and param not in top_op.all_inputs():
+                next_ops = None
+            elif top_op.type() == "mul":
+                next_ops = None
+            else:
+                next_ops = self._get_next_unvisited_op(graph, visited, top_op)
+            if next_ops == None:
+                stack.pop()
+            else:
+                stack += next_ops
+        return visit_path
+
+    def _get_next_unvisited_op(self, graph, visited, top_op):
+        """
+        Get next unvisited adjacent operators of given operators.
+        Args:
+            graph(GraphWrapper): The graph used to search. 
+            visited(list): The ids of operators that has been visited.
+            top_op: The given operator.
+        Returns:
+            list<OpWrapper>: A list of operators. 
+        """
+        assert isinstance(top_op, OpWrapper)
+        next_ops = []
+        for op in graph.next_ops(top_op):
+            if (visited[op.idx()] == False) and (not op.is_bwd_op()):
+                next_ops.append(op)
+        return next_ops if len(next_ops) > 0 else None
+
+    def _get_accumulator(self, graph, param):
+        """
+        Get accumulators of given parameter. The accumulator was created by optimizer.
+        Args:
+            graph(GraphWrapper): The graph used to search.
+            param(VarWrapper): The given parameter.
+        Returns:
+            list<VarWrapper>: A list of accumulators which are variables.
+        """
+        assert isinstance(param, VarWrapper)
+        params = []
+        for op in param.outputs():
+            if op.is_opt_op():
+                for out_var in op.all_outputs():
+                    if graph.is_persistable(out_var) and out_var.name(
+                    ) != param.name():
+                        params.append(out_var)
+        return params
+
+    def _forward_pruning_ralated_params(self,
+                                        graph,
+                                        scope,
+                                        param,
+                                        place,
+                                        ratio=None,
+                                        pruned_idxs=None,
+                                        lazy=False,
+                                        only_graph=False):
+        """
+        Pruning all the parameters affected by the pruning of given parameter.
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            scope(fluid.core.Scope): The scope storing paramaters to be pruned.
+            param(VarWrapper): The given parameter.
+            place(fluid.Place): The device place of filter parameters.
+            ratio(float): The target ratio to be pruned.
+            pruned_idx(list): The index of elements to be pruned.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means cutting down the pruned elements.
+            only_graph(bool): True means only modifying the graph.
+                              False means modifying graph and variables in  scope.
+        """
+        assert isinstance(
+            graph,
+            GraphWrapper), "graph must be instance of slim.core.GraphWrapper"
+        assert isinstance(
+            param, VarWrapper), "param must be instance of slim.core.VarWrapper"
+
+        if param.name() in self.pruned_list[0]:
+            return
+        related_ops = self._forward_search_related_op(graph, param)
+
+        if ratio is None:
+            assert pruned_idxs is not None
+            self._prune_parameter_by_idx(
+                scope, [param] + self._get_accumulator(graph, param),
+                pruned_idxs,
+                pruned_axis=0,
+                place=place,
+                lazy=lazy,
+                only_graph=only_graph)
+
+        else:
+            pruned_idxs = self._prune_filters_by_ratio(
+                scope, [param] + self._get_accumulator(graph, param),
+                ratio,
+                place,
+                lazy=lazy,
+                only_graph=only_graph)
+        corrected_idxs = pruned_idxs[:]
+
+        for idx, op in enumerate(related_ops):
+            if op.type() == "conv2d" and (param not in op.all_inputs()):
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        conv_param = in_var
+                        self._prune_parameter_by_idx(
+                            scope, [conv_param] + self._get_accumulator(
+                                graph, conv_param),
+                            corrected_idxs,
+                            pruned_axis=1,
+                            place=place,
+                            lazy=lazy,
+                            only_graph=only_graph)
+            if op.type() == "depthwise_conv2d":
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        conv_param = in_var
+                        self._prune_parameter_by_idx(
+                            scope, [conv_param] + self._get_accumulator(
+                                graph, conv_param),
+                            corrected_idxs,
+                            pruned_axis=0,
+                            place=place,
+                            lazy=lazy,
+                            only_graph=only_graph)
+            elif op.type() == "elementwise_add":
+                # pruning bias
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        bias_param = in_var
+                        self._prune_parameter_by_idx(
+                            scope, [bias_param] + self._get_accumulator(
+                                graph, bias_param),
+                            pruned_idxs,
+                            pruned_axis=0,
+                            place=place,
+                            lazy=lazy,
+                            only_graph=only_graph)
+            elif op.type() == "mul":  # pruning fc layer
+                fc_input = None
+                fc_param = None
+                for in_var in op.all_inputs():
+                    if graph.is_parameter(in_var):
+                        fc_param = in_var
+                    else:
+                        fc_input = in_var
+
+                idx = []
+                feature_map_size = fc_input.shape()[2] * fc_input.shape()[3]
+                range_idx = np.array(range(feature_map_size))
+                for i in corrected_idxs:
+                    idx += list(range_idx + i * feature_map_size)
+                corrected_idxs = idx
+                self._prune_parameter_by_idx(
+                    scope, [fc_param] + self._get_accumulator(graph, fc_param),
+                    corrected_idxs,
+                    pruned_axis=0,
+                    place=place,
+                    lazy=lazy,
+                    only_graph=only_graph)
+
+            elif op.type() == "concat":
+                concat_inputs = op.all_inputs()
+                last_op = related_ops[idx - 1]
+                for out_var in last_op.all_outputs():
+                    if out_var in concat_inputs:
+                        concat_idx = concat_inputs.index(out_var)
+                offset = 0
+                for ci in range(concat_idx):
+                    offset += concat_inputs[ci].shape()[1]
+                corrected_idxs = [x + offset for x in pruned_idxs]
+            elif op.type() == "batch_norm":
+                bn_inputs = op.all_inputs()
+                mean = bn_inputs[2]
+                variance = bn_inputs[3]
+                alpha = bn_inputs[0]
+                beta = bn_inputs[1]
+                self._prune_parameter_by_idx(
+                    scope, [mean] + self._get_accumulator(graph, mean),
+                    corrected_idxs,
+                    pruned_axis=0,
+                    place=place,
+                    lazy=lazy,
+                    only_graph=only_graph)
+                self._prune_parameter_by_idx(
+                    scope, [variance] + self._get_accumulator(graph, variance),
+                    corrected_idxs,
+                    pruned_axis=0,
+                    place=place,
+                    lazy=lazy,
+                    only_graph=only_graph)
+                self._prune_parameter_by_idx(
+                    scope, [alpha] + self._get_accumulator(graph, alpha),
+                    corrected_idxs,
+                    pruned_axis=0,
+                    place=place,
+                    lazy=lazy,
+                    only_graph=only_graph)
+                self._prune_parameter_by_idx(
+                    scope, [beta] + self._get_accumulator(graph, beta),
+                    corrected_idxs,
+                    pruned_axis=0,
+                    place=place,
+                    lazy=lazy,
+                    only_graph=only_graph)
+
+    def _prune_parameters(self,
+                          graph,
+                          scope,
+                          params,
+                          ratios,
+                          place,
+                          lazy=False,
+                          only_graph=False):
+        """
+        Pruning the given parameters.
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            scope(fluid.core.Scope): The scope storing paramaters to be pruned.
+            params(list<str>): A list of parameter names to be pruned.
+            ratios(list<float>): A list of ratios to be used to pruning parameters.
+            place(fluid.Place): The device place of filter parameters.
+            pruned_idx(list): The index of elements to be pruned.
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means cutting down the pruned elements.
+            only_graph(bool): True means only modifying the graph.
+                              False means modifying graph and variables in  scope.
+
+        """
+        _logger.debug('\n################################')
+        _logger.debug('#       pruning parameters       #')
+        _logger.debug('################################\n')
+        _logger.debug(
+            '|----------------------------------------+----+------------------------------+------------------------------|'
+        )
+        _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format('parameter', 'axis',
+                                                            'from', 'to'))
+        assert len(params) == len(ratios)
+        self.pruned_list = [[], []]
+        for param, ratio in zip(params, ratios):
+            assert isinstance(param, str) or isinstance(param, unicode)
+            param = graph.var(param)
+            self._forward_pruning_ralated_params(
+                graph,
+                scope,
+                param,
+                place,
+                ratio=ratio,
+                lazy=lazy,
+                only_graph=only_graph)
+            ops = param.outputs()
+            for op in ops:
+                if op.type() == 'conv2d':
+                    brother_ops = self._search_brother_ops(graph, op)
+                    for broher in brother_ops:
+                        for p in graph.get_param_by_op(broher):
+                            self._forward_pruning_ralated_params(
+                                graph,
+                                scope,
+                                p,
+                                place,
+                                ratio=ratio,
+                                lazy=lazy,
+                                only_graph=only_graph)
+        _logger.debug(
+            '|----------------------------------------+----+------------------------------+------------------------------|'
+        )
+
+    def _search_brother_ops(self, graph, op_node):
+        """
+        Search brother operators that was affected by pruning of given operator.
+        Args:
+            graph(GraphWrapper): The graph to be searched.
+            op_node(OpWrapper): The start node for searching.
+        Returns: 
+            list<VarWrapper>: A list of operators.
+        """
+        visited = [op_node.idx()]
+        stack = []
+        brothers = []
+        for op in graph.next_ops(op_node):
+            if (op.type() != 'conv2d') and (op.type() != 'fc') and (
+                    not op._is_bwd_op()):
+                stack.append(op)
+                visited.append(op.idx())
+        while len(stack) > 0:
+            top_op = stack.pop()
+            for parent in graph.pre_ops(top_op):
+                if parent.idx() not in visited and (not parent._is_bwd_op()):
+                    if ((parent.type == 'conv2d') or (parent.type == 'fc')):
+                        brothers.append(parent)
+                    else:
+                        stack.append(parent)
+                    visited.append(parent.idx())
+
+            for child in graph.next_ops(top_op):
+                if (child.type != 'conv2d') and (child.type != 'fc') and (
+                        child.idx() not in visited) and (
+                            not child._is_bwd_op()):
+                    stack.append(child)
+                    visited.append(child.idx())
+        return brothers
+
+    def _prune_graph(self, graph, target_graph):
+        """
+        Pruning parameters of graph according to target graph.
+        Args:
+            graph(GraphWrapper): The graph to be pruned.
+            target_graph(GraphWrapper): The reference graph.
+        Return: None
+        """
+        count = 1
+        _logger.debug(
+            '|----+----------------------------------------+------------------------------+------------------------------|'
+        )
+        _logger.debug('|{:^4}|{:^40}|{:^30}|{:^30}|'.format('id', 'parammeter',
+                                                            'from', 'to'))
+        for param in target_graph.all_parameters():
+            var = graph.var(param.name())
+            ori_shape = var.shape()
+            var.set_shape(param.shape())
+            _logger.debug(
+                '|----+----------------------------------------+------------------------------+------------------------------|'
+            )
+            _logger.debug('|{:^4}|{:^40}|{:^30}|{:^30}|'.format(
+                str(count),
+                str(param.name()), str(ori_shape), str(param.shape())))
+            count += 1
+        _logger.debug(
+            '|----+----------------------------------------+------------------------------+------------------------------|'
+        )
+
+
+class UniformPruneStrategy(PruneStrategy):
     """
-    The strategy that pruning weights by threshold or ratio iteratively.
+    The uniform pruning strategy. The parameters will be pruned by uniform ratio.
     """
 
     def __init__(self,
-                 pruner,
-                 mini_batch_pruning_frequency=1,
+                 pruner=None,
                  start_epoch=0,
-                 end_epoch=10):
-        super(PruneStrategy, self).__init__(start_epoch, end_epoch)
-        self.pruner = pruner
-        self.mini_batch_pruning_frequency = mini_batch_pruning_frequency
-
-    def _triger(self, context):
-        return (context.batch_id % self.mini_batch_pruning_frequency == 0 and
-                self.start_epoch <= context.epoch_id < self.end_epoch)
-
-    def on_batch_end(self, context):
-        if self._triger(context):
-            prune_program = Program()
-            with program_guard(prune_program):
-                for param in context.graph.all_parameters():
-                    prune_program.global_block().clone_variable(param)
-                    p = prune_program.global_block().var(param.name)
-                    zeros_mask = self.pruner.prune(p)
-                    pruned_param = p * zeros_mask
-                    layers.assign(input=pruned_param, output=param)
-            context.program_exe.run(prune_program, scope=context.scope)
+                 end_epoch=0,
+                 target_ratio=0.5,
+                 metric_name=None,
+                 pruned_params='conv.*_weights'):
+        """
+        Args:
+            pruner(slim.Pruner): The pruner used to prune the parameters.
+            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
+            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
+            target_ratio(float): The flops ratio to be pruned from current model.
+            metric_name(str): The metric used to evaluate the model.
+                         It should be one of keys in out_nodes of graph wrapper.
+            pruned_params(str): The pattern str to match the parameter names to be pruned.
+        """
+        super(UniformPruneStrategy, self).__init__(pruner, start_epoch,
+                                                   end_epoch, target_ratio,
+                                                   metric_name, pruned_params)
+
+    def _get_best_ratios(self, context):
+        """
+        Search a group of ratios for pruning target flops.
+        """
+        _logger.info('_get_best_ratios')
+        pruned_params = []
+        for param in context.eval_graph.all_parameters():
+            if re.match(self.pruned_params, param.name()):
+                pruned_params.append(param.name())
+
+        min_ratio = 0.
+        max_ratio = 1.
+
+        flops = context.eval_graph.flops()
+        model_size = context.eval_graph.numel_params()
+
+        while min_ratio < max_ratio:
+            ratio = (max_ratio + min_ratio) / 2
+            _logger.debug(
+                '-----------Try pruning ratio: {:.2f}-----------'.format(ratio))
+            ratios = [ratio] * len(pruned_params)
+            self._prune_parameters(
+                context.eval_graph,
+                context.scope,
+                pruned_params,
+                ratios,
+                context.place,
+                only_graph=True)
+
+            pruned_flops = 1 - (float(context.eval_graph.flops()) / flops)
+            pruned_size = 1 - (float(context.eval_graph.numel_params()) /
+                               model_size)
+            _logger.debug('Pruned flops: {:.2f}'.format(pruned_flops))
+            _logger.debug('Pruned model size: {:.2f}'.format(pruned_size))
+            for param in self.param_shape_backup.keys():
+                context.eval_graph.var(param).set_shape(self.param_shape_backup[
+                    param])
+            self.param_shape_backup = {}
+
+            if abs(pruned_flops - self.target_ratio) < 1e-2:
+                break
+            if pruned_flops > self.target_ratio:
+                max_ratio = ratio
+            else:
+                min_ratio = ratio
+        _logger.info('Get ratios: {}'.format([round(r, 2) for r in ratios]))
+        return pruned_params, ratios
+
+    def on_epoch_begin(self, context):
+        if context.epoch_id == self.start_epoch:
+            params, ratios = self._get_best_ratios(context)
+
+            self._prune_parameters(context.optimize_graph, context.scope,
+                                   params, ratios, context.place)
+
+            model_size = context.eval_graph.numel_params()
+            flops = context.eval_graph.flops()
+            _logger.debug('\n################################')
+            _logger.debug('#          pruning eval graph    #')
+            _logger.debug('################################\n')
+            self._prune_graph(context.eval_graph, context.optimize_graph)
+            context.optimize_graph.update_groups_of_conv()
+            context.eval_graph.update_groups_of_conv()
+
+            _logger.info(
+                '------------------finish pruning--------------------------------'
+            )
+            _logger.info('Pruned size: {:.2f}'.format(1 - (float(
+                context.eval_graph.numel_params()) / model_size)))
+            _logger.info('Pruned flops: {:.2f}'.format(1 - (float(
+                context.eval_graph.flops()) / flops)))
+            #            metric = self._eval_graph(context)
+            #            _logger.info('Metric after pruning: {:.2f}'.format(metric))
+            _logger.info(
+                '------------------UniformPruneStrategy.on_compression_begin finish--------------------------------'
+            )
+
+
+class SensitivePruneStrategy(PruneStrategy):
+    """
+    Sensitive pruning strategy. Different pruned ratio was applied on each layer.
+    """
+
+    def __init__(self,
+                 pruner=None,
+                 start_epoch=0,
+                 end_epoch=0,
+                 delta_rate=0.20,
+                 target_ratio=0.5,
+                 metric_name='top1_acc',
+                 pruned_params='conv.*_weights',
+                 sensitivities_file='./sensitivities.data',
+                 sensitivities={},
+                 num_steps=1,
+                 eval_rate=None):
+        """
+        Args:
+            pruner(slim.Pruner): The pruner used to prune the parameters.
+            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0.
+            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 10.
+            delta_rate(float): The delta used to generate ratios when calculating sensitivities. default: 0.2
+            target_ratio(float): The flops ratio to be pruned from current model. default: 0.5
+            metric_name(str): The metric used to evaluate the model.
+                         It should be one of keys in out_nodes of graph wrapper. default: 'top1_acc'
+            pruned_params(str): The pattern str to match the parameter names to be pruned. default: 'conv.*_weights'.
+            sensitivities_file(str): The sensitivities file. default: './sensitivities.data'
+            sensitivities(dict): The user-defined sensitivities. default: {}.
+            num_steps(int): The number of pruning steps. default: 1.
+            eval_rate(float): The rate of sampled data used to calculate sensitivities.
+                              None means using all the data. default: None.
+        """
+        super(SensitivePruneStrategy, self).__init__(pruner, start_epoch,
+                                                     end_epoch, target_ratio,
+                                                     metric_name, pruned_params)
+        self.delta_rate = delta_rate
+        self.pruned_list = []
+        self.sensitivities = sensitivities
+        self.sensitivities_file = sensitivities_file
+        self.backup = {}
+        self.param_shape_backup = {}
+        self.num_steps = num_steps
+        self.eval_rate = eval_rate
+        self.pruning_step = 1 - pow((1 - target_ratio), 1.0 / self.num_steps)
+
+    def _save_sensitivities(self, sensitivities, sensitivities_file):
+        """
+        Save sensitivities into file.
+        """
+        with open(sensitivities_file, 'wb') as f:
+            pickle.dump(sensitivities, f)
+
+    def _load_sensitivities(self, sensitivities_file):
+        """
+        Load sensitivities from file.
+        """
+        sensitivities = {}
+        if sensitivities_file and os.path.exists(sensitivities_file):
+            with open(sensitivities_file, 'rb') as f:
+                if sys.version_info < (3, 0):
+                    sensitivities = pickle.load(f)
+                else:
+                    sensitivities = pickle.load(f, encoding='bytes')
+
+        for param in sensitivities:
+            sensitivities[param]['pruned_percent'] = [
+                round(p, 2) for p in sensitivities[param]['pruned_percent']
+            ]
+        self._format_sensitivities(sensitivities)
+        return sensitivities
+
+    def _format_sensitivities(self, sensitivities):
+        """
+        Print formated sensitivities in debug log level.
+        """
+        tb = pt.PrettyTable()
+        tb.field_names = ["parameter", "size"] + [
+            str(round(i, 2))
+            for i in np.arange(self.delta_rate, 1, self.delta_rate)
+        ]
+        for param in sensitivities:
+            if len(sensitivities[param]['loss']) == (len(tb.field_names) - 2):
+                tb.add_row([param, sensitivities[param]['size']] + [
+                    round(loss, 2) for loss in sensitivities[param]['loss']
+                ])
+        _logger.debug('\n################################')
+        _logger.debug('#      sensitivities table     #')
+        _logger.debug('################################\n')
+        _logger.debug(tb)
+
+    def _compute_sensitivities(self, context):
+        """
+        Computing the sensitivities of all parameters.
+        """
+        _logger.info("calling _compute_sensitivities.")
+        self.param_shape_backup = {}
+        self.backup = {}
+        cached_id = np.random.randint(1000)
+        if self.start_epoch == context.epoch_id:
+            sensitivities_file = self.sensitivities_file
+        else:
+            sensitivities_file = self.sensitivities_file + ".epoch" + str(
+                context.epoch_id)
+        sensitivities = self._load_sensitivities(sensitivities_file)
+
+        for param in context.eval_graph.all_parameters():
+            if not re.match(self.pruned_params, param.name()):
+                continue
+            if param.name() not in sensitivities:
+                sensitivities[param.name()] = {
+                    'pruned_percent': [],
+                    'loss': [],
+                    'size': param.shape()[0]
+                }
+
+        metric = None
+
+        for param in sensitivities.keys():
+            ratio = self.delta_rate
+            while ratio < 1:
+                ratio = round(ratio, 2)
+                if ratio in sensitivities[param]['pruned_percent']:
+                    _logger.debug('{}, {} has computed.'.format(param, ratio))
+                    ratio += self.delta_rate
+                    continue
+                if metric is None:
+                    metric = self._eval_graph(context, self.eval_rate,
+                                              cached_id)
+                # prune parameter by ratio
+                self._prune_parameters(
+                    context.eval_graph,
+                    context.scope, [param], [ratio],
+                    context.place,
+                    lazy=True)
+                self.pruned_list[0]
+                # get accuracy after pruning and update self.sensitivities
+                pruned_metric = self._eval_graph(context, self.eval_rate,
+                                                 cached_id)
+                loss = metric - pruned_metric
+                _logger.info("pruned param: {}; {}; loss={}".format(
+                    param, ratio, loss))
+                for brother in self.pruned_list[0]:
+                    if re.match(self.pruned_params, brother):
+                        if brother not in sensitivities:
+                            sensitivities[brother] = {
+                                'pruned_percent': [],
+                                'loss': []
+                            }
+                        sensitivities[brother]['pruned_percent'].append(ratio)
+                        sensitivities[brother]['loss'].append(loss)
+
+                self._save_sensitivities(sensitivities, sensitivities_file)
+
+                # restore pruned parameters
+                for param_name in self.backup.keys():
+                    param_t = context.scope.find_var(param_name).get_tensor()
+                    param_t.set(self.backup[param_name], context.place)
+
+#                pruned_metric = self._eval_graph(context)
+                self.backup = {}
+
+                ratio += self.delta_rate
+        return sensitivities
+
+    def _get_best_ratios(self, context, sensitivities, target_ratio):
+        """
+        Search a group of ratios for pruning target flops.
+        """
+        _logger.info('_get_best_ratios for pruning ratie: {}'.format(
+            target_ratio))
+        self.param_shape_backup = {}
+        self.backup = {}
+
+        def func(params, x):
+            a, b, c, d = params
+            return a * x * x * x + b * x * x + c * x + d
+
+        def error(params, x, y):
+            return func(params, x) - y
+
+        def slove_coefficient(x, y):
+            init_coefficient = [10, 10, 10, 10]
+            coefficient, loss = leastsq(error, init_coefficient, args=(x, y))
+            return coefficient
+
+        min_loss = 0.
+        max_loss = 0.
+
+        # step 1: fit curve by sensitivities
+        coefficients = {}
+        for param in sensitivities:
+            losses = np.array([0] * 5 + sensitivities[param]['loss'])
+            precents = np.array([0] * 5 + sensitivities[param][
+                'pruned_percent'])
+            coefficients[param] = slove_coefficient(precents, losses)
+            loss = np.max(losses)
+            max_loss = np.max([max_loss, loss])
+
+        # step 2: Find a group of ratios by binary searching.
+        flops = context.eval_graph.flops()
+        model_size = context.eval_graph.numel_params()
+        ratios = []
+        while min_loss < max_loss:
+            loss = (max_loss + min_loss) / 2
+            _logger.info(
+                '-----------Try pruned ratios while acc loss={:.4f}-----------'.
+                format(loss))
+            ratios = []
+            # step 2.1: Get ratios according to current loss
+            for param in sensitivities:
+                coefficient = copy.deepcopy(coefficients[param])
+                coefficient[-1] = coefficient[-1] - loss
+                roots = np.roots(coefficient)
+                for root in roots:
+                    min_root = 1
+                    if np.isreal(root) and root > 0 and root < 1:
+                        selected_root = min(root.real, min_root)
+                ratios.append(selected_root)
+            _logger.info('Pruned ratios={}'.format(
+                [round(ratio, 3) for ratio in ratios]))
+            # step 2.2: Pruning by current ratios
+            self._prune_parameters(
+                context.eval_graph,
+                context.scope,
+                sensitivities.keys(),
+                ratios,
+                context.place,
+                only_graph=True)
+
+            pruned_flops = 1 - (float(context.eval_graph.flops()) / flops)
+            pruned_size = 1 - (float(context.eval_graph.numel_params()) /
+                               model_size)
+            _logger.info('Pruned flops: {:.4f}'.format(pruned_flops))
+            _logger.info('Pruned model size: {:.4f}'.format(pruned_size))
+            for param in self.param_shape_backup.keys():
+                context.eval_graph.var(param).set_shape(self.param_shape_backup[
+                    param])
+            self.param_shape_backup = {}
+
+            # step 2.3: Check whether current ratios is enough
+            if abs(pruned_flops - target_ratio) < 0.015:
+                break
+            if pruned_flops > target_ratio:
+                max_loss = loss
+            else:
+                min_loss = loss
+        return sensitivities.keys(), ratios
+
+    def _current_pruning_target(self, context):
+        '''
+        Get the target pruning rate in current epoch.
+        '''
+        _logger.info('Left number of pruning steps: {}'.format(self.num_steps))
+        if self.num_steps <= 0:
+            return None
+        if (self.start_epoch == context.epoch_id) or context.eval_converged(
+                self.metric_name, 0.005):
+            self.num_steps -= 1
+            return self.pruning_step
+
+    def on_epoch_begin(self, context):
+        current_ratio = self._current_pruning_target(context)
+        if current_ratio is not None:
+            sensitivities = self._compute_sensitivities(context)
+            params, ratios = self._get_best_ratios(context, sensitivities,
+                                                   current_ratio)
+            self._prune_parameters(context.optimize_graph, context.scope,
+                                   params, ratios, context.place)
+
+            self.param_shape_backup = {}
+            self.backup = {}
+
+            model_size = context.eval_graph.numel_params()
+            flops = context.eval_graph.flops()
+            _logger.debug('################################')
+            _logger.debug('#          pruning eval graph    #')
+            _logger.debug('################################')
+            self._prune_graph(context.eval_graph, context.optimize_graph)
+            context.optimize_graph.update_groups_of_conv()
+            context.eval_graph.update_groups_of_conv()
+            context.optimize_graph.compile()  # to update the compiled program
+            context.eval_graph.compile(
+                for_parallel=False,
+                for_test=True)  # to update the compiled program
+            _logger.info(
+                '------------------finish pruning--------------------------------'
+            )
+            _logger.info('Pruned size: {:.3f}'.format(1 - (float(
+                context.eval_graph.numel_params()) / model_size)))
+            _logger.info('Pruned flops: {:.3f}'.format(1 - (float(
+                context.eval_graph.flops()) / flops)))
+            metric = self._eval_graph(context)
+            _logger.info('Metric after pruning: {:.2f}'.format(metric))
+            _logger.info(
+                '------------------SensitivePruneStrategy.on_epoch_begin finish--------------------------------'
+            )
diff --git a/python/paddle/fluid/contrib/slim/prune/pruner.py b/python/paddle/fluid/contrib/slim/prune/pruner.py
index ca72bcb6f6004c18f3ec794850e0aeaecb92d7ac..506b8fbe1de2e0f8a036f591bd2baacd5759c9c8 100644
--- a/python/paddle/fluid/contrib/slim/prune/pruner.py
+++ b/python/paddle/fluid/contrib/slim/prune/pruner.py
@@ -13,9 +13,10 @@
 # limitations under the License.
 
 import numpy as np
+import collections
 from .... import layers
 
-__all__ = ['Pruner', 'MagnitudePruner', 'RatioPruner']
+__all__ = ['Pruner', 'StructurePruner']
 
 
 class Pruner(object):
@@ -30,54 +31,77 @@ class Pruner(object):
         pass
 
 
-class MagnitudePruner(Pruner):
+class StructurePruner(Pruner):
     """
-    Pruner used to pruning a parameter by threshold.
+    Pruner used to pruning parameters by groups.
     """
 
-    def __init__(self, threshold):
-        self.threshold = threshold
-
-    def prune(self, param, threshold=None):
-        if threshold is None:
-            thres = layers.fill_constant(
-                shape=[1], dtype='float32', value=self.threshold)
-        else:
-            thres = threshold
-        zeros_mask = layers.less_than(x=param, y=thres)
-        return zeros_mask
-
-
-class RatioPruner(Pruner):
-    """
-    Pruner used to pruning a parameter by ratio.
-    """
+    def __init__(self, pruning_axis, criterions):
+        """
+        Args:
+            pruning_axis(dict): The key is the name of parameter to be pruned,
+                                '*' means all the parameters.
+                                The value is the axis to be used. Given a parameter
+                                with shape [3, 4], the result of pruning 50% on aixs 1
+                                is a parameter with shape [3, 2].
+            criterions(dict): The key is the name of parameter to be pruned,
+                              '*' means all the parameters.
+                              The value is the criterion used to sort groups for pruning.
+                              It only supports 'l1_norm' currently.
+        """
+        self.pruning_axis = pruning_axis
+        self.criterions = criterions
 
-    def __init__(self, ratios=None):
+    def cal_pruned_idx(self, name, param, ratio, axis=None):
         """
+        Calculate the index to be pruned on axis by given pruning ratio.
         Args:
-            ratios: dict with pair (paramer_name, pruned_ratio). 
+            name(str): The name of parameter to be pruned.
+            param(np.array): The data of parameter to be pruned.
+            ratio(float): The ratio to be pruned.
+            axis(int): The axis to be used for pruning given parameter.
+                       If it is None, the value in self.pruning_axis will be used.
+                       default: None.
+        Returns:
+            list<int>: The indexes to be pruned on axis.
         """
-        self.ratios = ratios
+        criterion = self.criterions[
+            name] if name in self.criterions else self.criterions['*']
+        if axis is None:
+            assert self.pruning_axis is not None, "pruning_axis should set if axis is None."
+            axis = self.pruning_axis[
+                name] if name in self.pruning_axis else self.pruning_axis['*']
+        prune_num = int(round(param.shape[axis] * ratio))
+        reduce_dims = [i for i in range(len(param.shape)) if i != axis]
+        if criterion == 'l1_norm':
+            criterions = np.sum(np.abs(param), axis=tuple(reduce_dims))
+        pruned_idx = criterions.argsort()[:prune_num]
+        return pruned_idx
 
-    def prune(self, param, ratio=None):
+    def prune_tensor(self, tensor, pruned_idx, pruned_axis, lazy=False):
         """
+        Pruning a array by indexes on given axis.
         Args:
-            ratio: `ratio=40%` means pruning (1 - 40%) weights to zero.
+            tensor(numpy.array): The target array to be pruned.
+            pruned_idx(list<int>): The indexes to be pruned.
+            pruned_axis(int): The axis of given array to be pruned on. 
+            lazy(bool): True means setting the pruned elements to zero.
+                        False means remove the pruned elements from memory.
+                        default: False.
+        Returns:
+            numpy.array: The pruned array.
         """
-        if ratio is None:
-            rat = self.ratios[
-                param.name] if param.name in self.ratios else self.ratios['*']
-        else:
-            rat = ratio
-        if rat < 1.0:
-            k = max(int(rat * np.prod(param.shape)), 1)
-            param_vec = layers.reshape(x=param, shape=[1, -1])
-            param_topk, _ = layers.topk(param_vec, k=k)
-            threshold = layers.slice(
-                param_topk, axes=[1], starts=[-1], ends=[k])
-            threshold = layers.reshape(x=threshold, shape=[1])
-            zeros_mask = layers.less_than(x=param, y=threshold)
+        mask = np.zeros(tensor.shape[pruned_axis], dtype=bool)
+        mask[pruned_idx] = True
+
+        def func(data):
+            return data[~mask]
+
+        def lazy_func(data):
+            data[mask] = 0
+            return data
+
+        if lazy:
+            return np.apply_along_axis(lazy_func, pruned_axis, tensor)
         else:
-            zeros_mask = layers.ones(param.shape)
-        return zeros_mask
+            return np.apply_along_axis(func, pruned_axis, tensor)
diff --git a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py
index 6c26475f48855674d97abf5778a631646734fcf8..1c51aa15373779b06273296a27d913c070079f41 100644
--- a/python/paddle/fluid/contrib/slim/quantization/__init__.py
+++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py
@@ -16,5 +16,7 @@ from __future__ import print_function
 
 from . import quantization_pass
 from .quantization_pass import *
+from . import quantization_strategy
+from .quantization_strategy import *
 
-__all__ = quantization_pass.__all__
+__all__ = quantization_pass.__all__ + quantization_strategy.__all__
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 919db4c78e52edc9a8be44744f4b7704e3f62de4..5dcef506711b78c2aef30d16719f8766359ae8f3 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -22,6 +22,7 @@ from ....framework import IrGraph
 from ....framework import IrNode
 from ....framework import Program
 from ....initializer import Constant
+from ....initializer import NumpyArrayInitializer
 from .... import unique_name
 
 __all__ = [
@@ -54,14 +55,15 @@ class QuantizationTransformPass(object):
                 the bias is not quantized.
             activation_bits (int): quantization bit number for activation.
             activation_quantize_type (str): quantization type for activation,
-                now support 'abs_max', 'range_abs_max'. If use 'abs_max' mode,
-                the quantization scale will be calculated dynamically each step
-                in both training and testing period. If use 'range_abs_max',
-                a static quantization scale will be calculated during training
-                and used in inference.
+                now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
+                If use 'abs_max' mode, the quantization scale will be calculated
+                dynamically each step in both training and testing period. If use
+                'range_abs_max', a static quantization scale will be calculated
+                during training and used in inference.
             weight_quantize_type (str): quantization type for weights,
-                support 'abs_max'. The 'range_abs_max' usually is not used for
-                weight, since weights are fixed once the model is well trained.
+                support 'abs_max' and 'channel_wise_abs_max'. The 'range_abs_max'
+                usually is not used for weight, since weights are fixed once the
+                model is well trained.
             window_size (int): the window size for 'range_abs_max' quantization.
 
         Examples:
@@ -84,7 +86,11 @@ class QuantizationTransformPass(object):
         self._weight_bits = weight_bits
         self._activation_bits = activation_bits
 
-        quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max']
+        quant_type = [
+            'abs_max', 'channel_wise_abs_max', 'range_abs_max',
+            'moving_average_abs_max'
+        ]
+        assert activation_quantize_type != 'channel_wise_abs_max', "The activation quantization type does not support 'channel_wise_abs_max'."
         if activation_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown activation_quantize_type : '%s'. It can only be ",
@@ -93,7 +99,7 @@ class QuantizationTransformPass(object):
         if weight_quantize_type not in quant_type:
             raise ValueError(
                 "Unknown weight_quantize_type: '%s'. It can only be ",
-                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
+                "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'.",
                 str(weight_quantize_type))
 
         self._activation_quantize_type = activation_quantize_type
@@ -103,6 +109,7 @@ class QuantizationTransformPass(object):
 
         self._need_initialized = collections.OrderedDict()
         self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+        self._conv_ops = ['conv2d', 'depthwise_conv2d']
         self._quantizable_grad_ops = [
             '%s_grad' % (op) for op in self._quantizable_ops
         ]
@@ -135,10 +142,26 @@ class QuantizationTransformPass(object):
                     else self._activation_bits
                     quant_type = self._weight_quantize_type if var_node.name() \
                         in persistable_vars else self._activation_quantize_type
-                    quant_var_node, scale_var_node = self._insert_quant_op(
-                        graph, var_node, quant_bits, quant_type)
-                    dequant_var_node = self._insert_dequant_op(
-                        graph, quant_var_node, scale_var_node, quant_bits)
+                    if quant_type == 'channel_wise_abs_max':
+                        assert var_node.name(
+                        ) in persistable_vars, "'channel_wise_abs_max' can only be applied on weights."
+                        if op.name() in self._conv_ops:
+                            quant_var_node, scale_var_node = self._insert_channel_quant_op(
+                                graph, var_node, quant_bits)
+                            dequant_var_node = self._insert_channel_dequant_op(
+                                graph, quant_var_node, [scale_var_node],
+                                [quant_bits])
+                        else:
+                            quant_var_node, scale_var_node = self._insert_quant_op(
+                                graph, var_node, quant_bits, 'abs_max')
+                            dequant_var_node = self._insert_dequant_op(
+                                graph, quant_var_node, scale_var_node,
+                                quant_bits)
+                    else:
+                        quant_var_node, scale_var_node = self._insert_quant_op(
+                            graph, var_node, quant_bits, quant_type)
+                        dequant_var_node = self._insert_dequant_op(
+                            graph, quant_var_node, scale_var_node, quant_bits)
                     dequantized_vars[var_node.name()] = dequant_var_node
                 graph.update_input_link(var_node, dequant_var_node, op)
 
@@ -244,7 +267,7 @@ class QuantizationTransformPass(object):
         scale_var_node = graph.create_var_node(
             name=self._quantized_scale_name(var_node.name()),
             var_type=var_node.type(),
-            shape=var_node.shape(),
+            shape=[1],
             var_dtype=var_node.dtype())
         quant_op_node = graph.create_op_node(
             op_type='fake_quantize_abs_max',
@@ -384,6 +407,36 @@ class QuantizationTransformPass(object):
 
         return quant_var_node, scale_out_node
 
+    def _insert_channel_quant_op(self, graph, var_node, quant_bits):
+        """
+        Insert fake_channel_wise_quantize_abs_max op in the graph.
+        """
+        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
+
+        quant_var_node = graph.create_var_node(
+            name=self._quantized_var_name(var_node.name()),
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
+        scale_var_node = graph.create_var_node(
+            name=self._quantized_scale_name(var_node.name()),
+            var_type=var_node.type(),
+            shape=[var_node.shape()[0]],
+            var_dtype=var_node.dtype())
+        quant_op_node = graph.create_op_node(
+            op_type='fake_channel_wise_quantize_abs_max',
+            attrs={
+                'bit_length': quant_bits,
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
+            inputs={'X': var_node},
+            outputs={'Out': quant_var_node,
+                     'OutScale': scale_var_node})
+        graph.link_to(var_node, quant_op_node)
+        graph.link_to(quant_op_node, quant_var_node)
+        graph.link_to(quant_op_node, scale_var_node)
+        return quant_var_node, scale_var_node
+
     def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits):
         """
         Insert fake_dequantize_op in the graph.
@@ -410,6 +463,33 @@ class QuantizationTransformPass(object):
         graph.link_to(dequant_op_node, dequant_var_node)
         return dequant_var_node
 
+    def _insert_channel_dequant_op(self, graph, var_node, scale_var_nodes,
+                                   quant_bits):
+        """
+        Insert fake_channel_wise_dequantize_max_abs in the graph.
+        """
+        assert var_node.is_var(), '{} is not a var'.format(var_node.name())
+
+        dequant_var_node = graph.create_var_node(
+            name=self._dequantized_var_name(var_node.name()),
+            var_type=var_node.type(),
+            shape=var_node.shape(),
+            var_dtype=var_node.dtype())
+        dequant_op_node = graph.create_op_node(
+            op_type='fake_channel_wise_dequantize_max_abs',
+            attrs={
+                'quant_bits': quant_bits,
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
+            inputs={'X': var_node,
+                    'Scales': scale_var_nodes},
+            outputs={'Out': dequant_var_node})
+        graph.link_to(var_node, dequant_op_node)
+        for scale_n in scale_var_nodes:
+            graph.link_to(scale_n, dequant_op_node)
+        graph.link_to(dequant_op_node, dequant_var_node)
+        return dequant_var_node
+
     def _quantized_var_name(self, var_name):
         """
         Return quantized variable name for the input `var_name`.
@@ -442,7 +522,7 @@ class QuantizationFreezePass(object):
         place(fluid.CPUPlace|fluid.CUDAPlace): place is used to restore the weight tensors.
         weight_bits (int): quantization bit number for weights.
         activation_bits (int): quantization bit number for activation.
-        weight_quantize_type (str): quantization type for weights, support 'abs_max'.
+        weight_quantize_type (str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'.
         The 'range_abs_max' usually is not used for weight, since weights are fixed once the
         model is well trained.
     """
@@ -463,11 +543,15 @@ class QuantizationFreezePass(object):
         self._activation_bits = activation_bits
         self._weight_quantize_type = weight_quantize_type
         self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul']
+        self._conv_ops = ['conv2d', 'depthwise_conv2d']
         self._fake_quant_op_names = [
             'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
-            'fake_quantize_moving_average_abs_max'
+            'fake_quantize_moving_average_abs_max',
+            'fake_channel_wise_quantize_abs_max'
+        ]
+        self._fake_dequant_op_names = [
+            'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
         ]
-        self._fake_dequant_op_names = ['fake_dequantize_max_abs']
         self._op_input_rename_map = collections.OrderedDict()
         self._op_output_rename_map = collections.OrderedDict()
         self._var_scale_map = collections.OrderedDict()
@@ -489,20 +573,27 @@ class QuantizationFreezePass(object):
                     if self._weight_quantize_type == 'abs_max':
                         param = self._load_var(input_arg_name)
                         scale_v = np.max(np.abs(param))
+                    elif self._weight_quantize_type == 'channel_wise_abs_max':
+                        param = self._load_var(input_arg_name)
+                        if len(param.shape) == 4:  # conv2d or depthwise_conv2d
+                            scale_v = []
+                            for i in range(param.shape[0]):
+                                scale_v.append(np.max(np.abs(param[i])))
+                        else:
+                            scale_v = np.max(np.abs(param))
                     else:
                         scale_v = self._load_var(
                             op_node.output('OutScale')[0])[0]
                     self._var_scale_map[input_arg_name] = scale_v
-                else:
-                    scale_v = graph.var_node(op_node.output('OutScale')[0])
-                    self._var_scale_map[input_arg_name] = scale_v
-                if input_arg_name in persistable_vars:
                     self._remove_fake_quant_and_dequant_op(graph, op_node)
                     # quantize weight and restore
                     param_v = self._load_var(input_arg_name)
                     quantized_param_v = self._quant(param_v, scale_v,
                                                     self._weight_bits)
                     self._restore_var(input_arg_name, quantized_param_v)
+                else:
+                    scale_v = graph.var_node(op_node.output('OutScale')[0])
+                    self._var_scale_map[input_arg_name] = scale_v
 
         ops = graph.all_op_nodes()
         for op_node in ops:
@@ -514,7 +605,10 @@ class QuantizationFreezePass(object):
         for op_node in ops:
             op_name = op_node.name()
             if op_name in self._quantizable_ops:
-                self._insert_post_dequant_op(graph, op_node)
+                if self._weight_quantize_type == 'channel_wise_abs_max' and op_name in self._conv_ops:
+                    self._insert_post_channel_dequant_op(graph, op_node)
+                else:
+                    self._insert_post_dequant_op(graph, op_node)
 
         for op_node in ops:
             # insert dequant_op after fc/conv, need to rename inputs of the followed ops
@@ -538,9 +632,73 @@ class QuantizationFreezePass(object):
             self._op_input_rename_map[k] = self._op_input_rename_map[v]
         graph.safe_remove_nodes(op_node)
 
+    def _insert_post_channel_dequant_op(self, graph, op_node):
+        persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
+        for var_node in op_node.inputs:
+            name = var_node.name()
+            if name in self._op_input_rename_map:
+                old_in = graph.var_node(name)
+                new_in = graph.var_node(self._op_input_rename_map[name])
+                new_in.clear_outputs()
+                graph.update_input_link(old_in, new_in, op_node)
+            original_var_name = self._original_var_name(name)
+            scale_v = self._var_scale_map[original_var_name]
+            if original_var_name in persistable_vars:
+                assert isinstance(
+                    scale_v,
+                    list), 'The scale of parameter %s is not a list.' % (
+                        original_var_name)
+                channel_scale = np.array(scale_v)
+            else:
+                assert isinstance(scale_v, IrNode)
+                scale_var_node = self._var_scale_map[original_var_name]
+
+        if len(op_node.outputs) != 1:
+            raise ValueError("Only support one output, but op %s has"
+                             " more than one output." % (op_node.name()))
+
+        output_var_node = op_node.outputs[0]
+        weight_scale_node = graph.create_persistable_node(
+            name=unique_name.generate('channel_scale'),
+            var_type=core.VarDesc.VarType.LOD_TENSOR,
+            shape=[channel_scale.shape[0]],
+            var_dtype=output_var_node.dtype())
+        init_program = Program()
+        weight_scale_var = init_program.global_block().create_var(
+            name=weight_scale_node.name(),
+            shape=weight_scale_node.shape(),
+            dtype=weight_scale_node.dtype(),
+            type=weight_scale_node.type(),
+            lod_level=weight_scale_node.var().lod_level(),
+            persistable=weight_scale_node.persistable())
+        initializer = NumpyArrayInitializer(value=channel_scale)
+        initializer(weight_scale_var, init_program.global_block())
+        exe = Executor(self._place)
+        exe.run(program=init_program, scope=self._scope)
+        dequant_var_node = graph.create_var_node(
+            name=self._dequantized_var_name(output_var_node.name()),
+            var_type=output_var_node.type(),
+            shape=output_var_node.shape(),
+            var_dtype=output_var_node.dtype())
+        dequant_op_node = graph.create_op_node(
+            op_type='fake_channel_wise_dequantize_max_abs',
+            attrs={
+                'quant_bits': [self._weight_bits, self._activation_bits],
+                'op_role': core.op_proto_and_checker_maker.OpRole.Forward
+            },
+            inputs={
+                'X': output_var_node,
+                'Scales': [weight_scale_node, scale_var_node]
+            },
+            outputs={'Out': dequant_var_node})
+        graph.link_to(output_var_node, dequant_op_node)
+        graph.link_to(scale_var_node, dequant_op_node)
+        graph.link_to(weight_scale_node, dequant_op_node)
+        graph.link_to(dequant_op_node, dequant_var_node)
+        self._op_output_rename_map[output_var_node.name()] = dequant_var_node
+        return dequant_var_node
+
     def _insert_post_dequant_op(self, graph, op_node):
-        max_range = None
-        scale_var_node = None
         persistable_vars = [p.name() for p in graph.all_persistable_nodes()]
         for var_node in op_node.inputs:
             name = var_node.name()
@@ -637,7 +795,12 @@ class QuantizationFreezePass(object):
             or isinstance(v, np.float64)
 
     def _quant(self, x, scale, num_bits):
-        return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
+        if isinstance(scale, list):
+            for i, s in enumerate(scale):
+                x[i] = np.round(x[i] / s * ((1 << (num_bits - 1)) - 1))
+            return x
+        else:
+            return np.round(x / scale * ((1 << (num_bits - 1)) - 1))
 
 
 class ConvertToInt8Pass(object):
@@ -731,9 +894,13 @@ class TransformForMobilePass(object):
 
     def __init__(self):
         self._fake_quant_op_names = [
-            'fake_quantize_abs_max', 'fake_quantize_range_abs_max'
+            'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
+            'fake_quantize_moving_average_abs_max',
+            'fake_channel_wise_quantize_abs_max'
+        ]
+        self._fake_dequant_op_names = [
+            'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
         ]
-        self._fake_dequant_op_names = ['fake_dequantize_max_abs']
 
     def apply(self, graph):
         """
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..6812b4c633d5b55d84fff935b696297f30b18c6b
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import sys
+import numpy as np
+from .... import Executor
+from .... import io
+from .... import core
+from ....compiler import CompiledProgram
+from ....compiler import BuildStrategy
+from ....framework import IrGraph
+from ..core.strategy import Strategy
+from .quantization_pass import *
+
+__all__ = ['QuantizationStrategy']
+
+logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
+_logger = logging.getLogger(__name__)
+_logger.setLevel(logging.INFO)
+
+
+class QuantizationStrategy(Strategy):
+    """
+    The strategy for Quantization.
+    """
+
+    def __init__(self,
+                 start_epoch=0,
+                 end_epoch=0,
+                 float_model_save_path=None,
+                 mobile_model_save_path=None,
+                 int8_model_save_path=None,
+                 activation_bits=8,
+                 weight_bits=8,
+                 activation_quantize_type='abs_max',
+                 save_in_nodes=None,
+                 save_out_nodes=None):
+        """
+        Args:
+            start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0
+            end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0
+            float_model_save_path(str): The path to save model with float weights. 
+                            None means it doesn't save float model. defalut: None.
+            mobile_model_save_path(str): The path to save model for paddle-mobile execution.
+                            None means it doesn't save mobile model. defalut: None.
+            int8_model_save_path(str): The path to save model with int8_t weight.
+                            None means it doesn't save int8 model. defalut: None.
+            activation_bits(int): quantization bit number for activation. default: 8.
+            weight_bits(int): quantization bit number for weights. The bias is not quantized.
+                              default: 8.
+            activation_quantize_type(str): quantization type for activation,
+                now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
+                If use 'abs_max' mode, the quantization scale will be calculated
+                dynamically each step in both training and testing period. If use
+                'range_abs_max', a static quantization scale will be calculated
+                during training and used in inference.
+            save_in_nodes(list<str>): A list of variable names used to prune graph 
+                                      for saving inference model.
+            save_out_nodes(list<str>): A list of variable names used to prune graph 
+                                      for saving inference model.
+
+        """
+        super(QuantizationStrategy, self).__init__(start_epoch, end_epoch)
+        self.start_epoch = start_epoch
+        self.end_epoch = end_epoch
+        self.float_model_save_path = float_model_save_path
+        self.mobile_model_save_path = mobile_model_save_path
+        self.int8_model_save_path = int8_model_save_path
+        self.activation_bits = activation_bits
+        self.weight_bits = weight_bits
+        self.activation_quantize_type = activation_quantize_type
+        self.save_out_nodes = save_out_nodes
+        self.save_in_nodes = save_in_nodes
+
+    def on_epoch_begin(self, context):
+        """
+        Insert fake_quantize_op and fake_dequantize_op before trainging and testing.
+        """
+        super(QuantizationStrategy, self).on_compression_begin(context)
+        if self.start_epoch == context.epoch_id:
+            _logger.info('QuantizationStrategy::on_epoch_begin')
+            train_ir_graph = IrGraph(
+                core.Graph(context.optimize_graph.program.desc), for_test=False)
+            test_ir_graph = IrGraph(
+                core.Graph(context.eval_graph.program.desc), for_test=True)
+            transform_pass = QuantizationTransformPass(
+                scope=context.scope,
+                place=context.place,
+                weight_bits=self.weight_bits,
+                activation_bits=self.activation_bits,
+                activation_quantize_type=self.activation_quantize_type)
+            transform_pass.apply(train_ir_graph)
+            transform_pass.apply(test_ir_graph)
+
+            build_strategy = BuildStrategy()
+            build_strategy.enable_inplace = False
+            build_strategy.memory_optimize = False
+            # for quantization training
+            context.optimize_graph.compiled_graph = CompiledProgram(
+                train_ir_graph.graph).with_data_parallel(
+                    loss_name=context.optimize_graph.out_nodes['loss'],
+                    build_strategy=build_strategy)
+            # for evaluation. And program compiled from ir graph must be with data parallel.
+            context.eval_graph.compiled_graph = CompiledProgram(
+                test_ir_graph.graph).with_data_parallel(
+                    build_strategy=build_strategy)
+            # for saving inference model after training
+            context.put('quantization_test_ir_graph_backup', test_ir_graph)
+            _logger.info('Finish QuantizationStrategy::on_epoch_begin')
+
+    def on_epoch_end(self, context):
+        """
+        Free and save inference model.
+        """
+        super(QuantizationStrategy, self).on_compression_end(context)
+
+        if context.epoch_id == self.end_epoch:
+            _logger.info('QuantizationStrategy::on_epoch_end')
+            test_ir_graph = context.get('quantization_test_ir_graph_backup')
+            # freeze the graph after training
+            freeze_pass = QuantizationFreezePass(
+                scope=context.scope,
+                place=context.place,
+                weight_bits=self.weight_bits,
+                activation_bits=self.activation_bits)
+            freeze_pass.apply(test_ir_graph)
+
+            # for other strategies
+            context.eval_graph.program = test_ir_graph.to_program()
+
+            if self.save_out_nodes == None:
+                out_vars = [
+                    context.eval_graph.var(var_name)._var
+                    for var_name in context.eval_graph.out_nodes.values()
+                ]
+            else:
+                out_vars = [
+                    context.eval_graph.var(var_name)._var
+                    for var_name in self.save_out_nodes
+                ]
+
+            if self.save_in_nodes == None:
+                in_vars = list(context.eval_graph.out_nodes.values())
+            else:
+                in_vars = self.save_in_nodes
+
+            # save float model
+            if self.float_model_save_path:
+                executor = Executor(context.place)
+                io.save_inference_model(
+                    self.float_model_save_path,
+                    in_vars,
+                    out_vars,
+                    executor,
+                    main_program=test_ir_graph.to_program(),
+                    model_filename='model',
+                    params_filename='weights',
+                    export_for_deployment=True)
+
+            # save int8 model
+            if self.int8_model_save_path:
+                convert_int8_pass = ConvertToInt8Pass(
+                    scope=context.scope, place=context.place)
+                convert_int8_pass.apply(test_ir_graph)
+
+                executor = Executor(context.place)
+                io.save_inference_model(
+                    self.int8_model_save_path,
+                    in_vars,
+                    out_vars,
+                    executor,
+                    main_program=test_ir_graph.to_program(),
+                    model_filename='model',
+                    params_filename='weights',
+                    export_for_deployment=True)
+
+            # save mobile model
+            if self.mobile_model_save_path:
+                if not self.int8_model_save_path:
+                    # convert the weights as int8_t type
+                    convert_int8_pass = ConvertToInt8Pass(
+                        scope=context.scope, place=context.place)
+                    convert_int8_pass.apply(test_ir_graph)
+                # make some changes on the graph for the mobile inference
+                mobile_pass = TransformForMobilePass()
+                mobile_pass.apply(test_ir_graph)
+                executor = Executor(context.place)
+                io.save_inference_model(
+                    self.mobile_model_save_path,
+                    in_vars,
+                    out_vars,
+                    executor,
+                    main_program=test_ir_graph.to_program(),
+                    model_filename='model',
+                    params_filename='weights',
+                    export_for_deployment=True)
+            _logger.info('Finish QuantizationStrategy::on_epoch_end')
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/config.yaml b/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
deleted file mode 100644
index d9b49029d3e34d487ad65fe0f7e54e2cee1d5838..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/configs/config.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-version: 1.0
-include: ["./configs/pruners.yaml", "./configs/pruners_0.yaml"]
-pruners:
-    pruner_1:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.3
-            'conv1_2.w': 0.4
-            '*': 0.9
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
-strategies:
-    strategy_1:
-        class: 'SensitivePruneStrategy'
-        pruner: 'pruner_2'
-        start_epoch: 0
-        end_epoch: 10
-        delta_rate: 0.20
-        acc_loss_threshold: 0.2
-        sensitivities:
-            'conv1_1.w': 0.4
-
-compress_pass:
-    class: 'CompressPass'
-    epoch: 100
-    strategies:
-        - strategy_1
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml b/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..570c60026d55c242106f7e2dc5c3f47bfbdbe884
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml
@@ -0,0 +1,34 @@
+#start_epoch:         The 'on_epoch_begin' function will be called in start_epoch. default: 0.
+#end_epoch:           The 'on_epoch_end' function will be called in end_epoch. default: 10.
+#delta_rate:          The delta used to generate ratios when calculating sensitivities.
+#target_ratio:        The flops ratio to be pruned from current model.
+#metric_name:         The metric used to evaluate the model.
+#pruned_params:       The pattern str to match the parameter names to be pruned.
+#sensitivities_file:  The sensitivities file.
+#num_steps:           The number of pruning steps.
+#eval_rate:           The rate of sampled data used to calculate sensitivities.
+version: 1.0
+pruners:
+    pruner_1:
+        class: 'StructurePruner'
+        pruning_axis:
+            '*': 0
+        criterions:
+            '*': 'l1_norm'
+strategies:
+    sensitive_pruning_strategy:
+        class: 'SensitivePruneStrategy'
+        pruner: 'pruner_1'
+        start_epoch: 0
+        delta_rate: 0.1
+        target_ratio: 0.3
+        num_steps: 1
+        eval_rate: 0.5
+        pruned_params: '.*_sep_weights'
+        sensitivities_file: 'mobilenet_acc_top1_sensitive.data'
+        metric_name: 'acc_top1'
+compressor:
+    epoch: 120
+    checkpoint_path: './checkpoints/'
+    strategies:
+        - sensitive_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
deleted file mode 100644
index 235092c595bf7c653221c7fe2b381fecf487fa49..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/configs/pruners.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-version: 1.0
-pruners:
-    pruner_2:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.5
-            'conv1_2.w': 0.2
-            '*': 0.7
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
diff --git a/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml b/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
deleted file mode 100644
index cd2ef9eb56ddbc1367ce2e3b413372fbcd542bde..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/contrib/slim/tests/configs/pruners_0.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-version: 1.0
-pruners:
-    pruner_3:
-        class: 'RatioPruner'
-        ratios:
-            'conv1_1.w': 0.5
-            'conv1_2.w': 0.2
-            '*': 0.7
-        group_dims:
-            '*': [1, 2, 3]
-        criterions:
-            '*': 'l1-norm'
diff --git a/python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml b/python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ef89dfb7801e6df8a2cf842a5fcc745d70254977
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml
@@ -0,0 +1,46 @@
+#start_epoch(int): The epoch when to merge student graph and teacher graph for
+#                  distillation training. default: 0
+#
+#end_epoch(int): The epoch when to finish distillation training. default: 0
+#
+#student_feature_map(str): The name of feature map from student network.
+#
+#teacher_feature_map(str): The name of feature map from teacher network.
+#                          It's shape should be the same with student network.
+#
+#student_pairs(list<tuple>): Each tuple, with two variable names, in student_pairs indicates
+#                            a section in student network. The variables in a tuple should
+#                            have the same feature map size.
+#
+#teacher_pairs(list<tuple>): Each tuple, with two variable names, in teacher_pairs indicates
+#                            a section in teacher network. The variables in a tuple should
+#                            have the same feature map size. Varibale named teacher_pairs[i][j]
+#                            should has the save channel number with that of variable named 
+#                            student_pairs[i][j].
+#
+#distillation_loss_weight(float): The weight of the loss.
+version: 1.0
+distillers:
+    fsp_distiller:
+        class: 'FSPDistiller'
+#        teacher_pairs: [['teacher_depthwise_conv2d_1.tmp_0', 'teacher_conv2d_3.tmp_0']]
+#        student_pairs: [['student_depthwise_conv2d_1.tmp_0', 'student_conv2d_3.tmp_0']]
+        teacher_pairs: [['teacher_conv2_1_dw.tmp_0', 'teacher_conv1.tmp_0']]
+        student_pairs: [['student_conv2_1_dw.tmp_0', 'student_conv1.tmp_0']]
+        distillation_loss_weight: 1
+    l2_distiller:
+        class: 'L2Distiller'
+        teacher_feature_map: 'teacher.tmp_2'
+        student_feature_map: 'student.tmp_2'
+        distillation_loss_weight: 1
+strategies:
+    distillation_strategy:
+        class: 'DistillationStrategy'
+        distillers: ['fsp_distiller', 'l2_distiller']
+        start_epoch: 0
+        end_epoch: 1
+compressor:
+    epoch: 1
+    checkpoint_path: './distillation_checkpoints/'
+    strategies:
+        - distillation_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml b/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5f747a049e95a5920236336c69a80a9492e6190d
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml
@@ -0,0 +1,34 @@
+#start_epoch:         The 'on_epoch_begin' function will be called in start_epoch. default: 0.
+#end_epoch:           The 'on_epoch_end' function will be called in end_epoch. default: 10.
+#delta_rate:          The delta used to generate ratios when calculating sensitivities.
+#target_ratio:        The flops ratio to be pruned from current model.
+#metric_name:         The metric used to evaluate the model.
+#pruned_params:       The pattern str to match the parameter names to be pruned.
+#sensitivities_file:  The sensitivities file.
+#num_steps:           The number of pruning steps.
+#eval_rate:           The rate of sampled data used to calculate sensitivities.
+version: 1.0
+pruners:
+    pruner_1:
+        class: 'StructurePruner'
+        pruning_axis:
+            '*': 0
+        criterions:
+            '*': 'l1_norm'
+strategies:
+    sensitive_pruning_strategy:
+        class: 'SensitivePruneStrategy'
+        pruner: 'pruner_1'
+        start_epoch: 1
+        delta_rate: 0.2
+        target_ratio: 0.08
+        num_steps: 1
+        eval_rate: 0.5
+        pruned_params: 'conv6_sep_weights'
+        sensitivities_file: 'mobilenet_acc_top1_sensitive.data'
+        metric_name: 'acc_top1'
+compressor:
+    epoch: 2
+    checkpoint_path: './checkpoints_pruning/'
+    strategies:
+        - sensitive_pruning_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/mobilenet.py b/python/paddle/fluid/contrib/slim/tests/mobilenet.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5dbef17e8d4a7c474881d88b6619061a3424177
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/mobilenet.py
@@ -0,0 +1,215 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle.fluid as fluid
+from paddle.fluid.initializer import MSRA
+from paddle.fluid.param_attr import ParamAttr
+
+__all__ = ['MobileNet']
+
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "piecewise_decay",
+        "batch_size": 256,
+        "epochs": [30, 60, 90],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    }
+}
+
+
+class MobileNet():
+    def __init__(self, name=""):
+        self.params = train_parameters
+        self.name = name
+
+    def net(self, input, class_dim=1000, scale=1.0):
+        # conv1: 112x112
+        input = self.conv_bn_layer(
+            input,
+            filter_size=3,
+            channels=3,
+            num_filters=int(32 * scale),
+            stride=2,
+            padding=1,
+            name=self.name + "_conv1")
+
+        # 56x56
+        input = self.depthwise_separable(
+            input,
+            num_filters1=32,
+            num_filters2=64,
+            num_groups=32,
+            stride=1,
+            scale=scale,
+            name=self.name + "_conv2_1")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=64,
+            num_filters2=128,
+            num_groups=64,
+            stride=2,
+            scale=scale,
+            name=self.name + "_conv2_2")
+
+        # 28x28
+        input = self.depthwise_separable(
+            input,
+            num_filters1=128,
+            num_filters2=128,
+            num_groups=128,
+            stride=1,
+            scale=scale,
+            name=self.name + "_conv3_1")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=128,
+            num_filters2=256,
+            num_groups=128,
+            stride=2,
+            scale=scale,
+            name=self.name + "_conv3_2")
+
+        # 14x14
+        input = self.depthwise_separable(
+            input,
+            num_filters1=256,
+            num_filters2=256,
+            num_groups=256,
+            stride=1,
+            scale=scale,
+            name=self.name + "_conv4_1")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=256,
+            num_filters2=512,
+            num_groups=256,
+            stride=2,
+            scale=scale,
+            name=self.name + "_conv4_2")
+
+        # 14x14
+        for i in range(5):
+            input = self.depthwise_separable(
+                input,
+                num_filters1=512,
+                num_filters2=512,
+                num_groups=512,
+                stride=1,
+                scale=scale,
+                name=self.name + "_conv5" + "_" + str(i + 1))
+        # 7x7
+        input = self.depthwise_separable(
+            input,
+            num_filters1=512,
+            num_filters2=1024,
+            num_groups=512,
+            stride=2,
+            scale=scale,
+            name=self.name + "_conv5_6")
+
+        input = self.depthwise_separable(
+            input,
+            num_filters1=1024,
+            num_filters2=1024,
+            num_groups=1024,
+            stride=1,
+            scale=scale,
+            name=self.name + "_conv6")
+
+        input = fluid.layers.pool2d(
+            input=input,
+            pool_size=0,
+            pool_stride=1,
+            pool_type='avg',
+            global_pooling=True)
+
+        output = fluid.layers.fc(
+            input=input,
+            size=class_dim,
+            act='softmax',
+            param_attr=ParamAttr(
+                initializer=MSRA(), name=self.name + "_fc7_weights"),
+            bias_attr=ParamAttr(name=self.name + "_fc7_offset"),
+            name=self.name)
+        return output
+
+    def conv_bn_layer(self,
+                      input,
+                      filter_size,
+                      num_filters,
+                      stride,
+                      padding,
+                      channels=None,
+                      num_groups=1,
+                      act='relu',
+                      use_cudnn=True,
+                      name=None):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            act=None,
+            use_cudnn=use_cudnn,
+            param_attr=ParamAttr(
+                initializer=MSRA(), name=name + "_weights"),
+            name=name,
+            bias_attr=False)
+        bn_name = name + "_bn"
+        return fluid.layers.batch_norm(
+            input=conv,
+            act=act,
+            name=name,
+            param_attr=ParamAttr(name=bn_name + "_scale"),
+            bias_attr=ParamAttr(name=bn_name + "_offset"),
+            moving_mean_name=bn_name + '_mean',
+            moving_variance_name=bn_name + '_variance')
+
+    def depthwise_separable(self,
+                            input,
+                            num_filters1,
+                            num_filters2,
+                            num_groups,
+                            stride,
+                            scale,
+                            name=None):
+        depthwise_conv = self.conv_bn_layer(
+            input=input,
+            filter_size=3,
+            num_filters=int(num_filters1 * scale),
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups * scale),
+            use_cudnn=False,
+            name=name + "_dw")
+
+        pointwise_conv = self.conv_bn_layer(
+            input=depthwise_conv,
+            filter_size=1,
+            num_filters=int(num_filters2 * scale),
+            stride=1,
+            padding=0,
+            name=name + "_sep")
+        return pointwise_conv
diff --git a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f29eb53f88d22d87b61f82279b676af5ec1ef497
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml
@@ -0,0 +1,48 @@
+#start_epoch(int): The epoch to insert quantization operators. default: 0
+#
+#end_epoch(int): The epoch to save inferecne model. default: 0
+#
+#float_model_save_path(str): The path to save model with float weights.
+#                None means it doesn't save float model. defalut: None.
+#
+#mobile_model_save_path(str): The path to save model for paddle-mobile execution.
+#                None means it doesn't save mobile model. defalut: None.
+#
+#int8_model_save_path(str): The path to save model with int8_t weight.
+#                None means it doesn't save int8 model. defalut: None.
+#
+#activation_bits(int): quantization bit number for activation. default: 8.
+#
+#weight_bits(int): quantization bit number for weights. The bias is not quantized.
+#                  default: 8.
+#
+#activation_quantize_type(str): quantization type for activation,
+#    now support 'abs_max', 'range_abs_max' and 'moving_average_abs_max'.
+#    If use 'abs_max' mode, the quantization scale will be calculated
+#    dynamically each step in both training and testing period. If use
+#    'range_abs_max', a static quantization scale will be calculated
+#    during training and used in inference.
+#
+#save_in_nodes(list<str>): A list of variable names used to prune graph
+#                          for saving inference model.
+#
+#save_out_nodes(list<str>): A list of variable names used to prune graph
+#                                      for saving inference model.
+version: 1.0
+strategies:
+    quantization_strategy:
+        class: 'QuantizationStrategy'
+        start_epoch: 0
+        end_epoch: 0
+        float_model_save_path: './output/float'
+        weight_bits: 8
+        activation_bits: 8
+        weight_quantize_type: 'abs_max'
+        activation_quantize_type: 'abs_max'
+        save_in_nodes: ['image']
+        save_out_nodes: ['quan.tmp_2']
+compressor:
+    epoch: 1
+    checkpoint_path: './checkpoints_quan/'
+    strategies:
+        - quantization_strategy
diff --git a/python/paddle/fluid/contrib/slim/tests/test_distillation_strategy.py b/python/paddle/fluid/contrib/slim/tests/test_distillation_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b967c0ac7d2bfdab23d4557ef0b7d28f4118ff7
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_distillation_strategy.py
@@ -0,0 +1,94 @@
+#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+import paddle
+import unittest
+import paddle.fluid as fluid
+from mobilenet import MobileNet
+from paddle.fluid.contrib.slim.core import Compressor
+from paddle.fluid.contrib.slim.graph import GraphWrapper
+
+
+class TestDistillationStrategy(unittest.TestCase):
+    """
+    Test API of distillation strategy.
+    """
+
+    def test_compression(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        class_dim = 10
+        image_shape = [1, 28, 28]
+        image = fluid.layers.data(
+            name='image', shape=image_shape, dtype='float32')
+        image.stop_gradient = False
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        out = MobileNet(name="student").net(input=image, class_dim=class_dim)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+        val_program = fluid.default_main_program().clone(for_test=False)
+
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        optimizer = fluid.optimizer.Momentum(
+            momentum=0.9,
+            learning_rate=0.01,
+            regularization=fluid.regularizer.L2Decay(4e-5))
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
+
+        val_feed_list = [('img', image.name), ('label', label.name)]
+        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
+                                                        acc_top5.name)]
+
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=128)
+        train_feed_list = [('img', image.name), ('label', label.name)]
+        train_fetch_list = [('loss', avg_cost.name)]
+
+        # define teacher program
+        teacher_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(teacher_program, startup_program):
+            img = teacher_program.global_block()._clone_variable(
+                image, force_persistable=False)
+            predict = MobileNet(name="teacher").net(input=img,
+                                                    class_dim=class_dim)
+
+        exe.run(startup_program)
+
+        com_pass = Compressor(
+            place,
+            fluid.global_scope(),
+            fluid.default_main_program(),
+            train_reader=train_reader,
+            train_feed_list=train_feed_list,
+            train_fetch_list=train_fetch_list,
+            eval_program=val_program,
+            eval_reader=val_reader,
+            eval_feed_list=val_feed_list,
+            eval_fetch_list=val_fetch_list,
+            teacher_programs=[teacher_program.clone(for_test=True)],
+            train_optimizer=optimizer,
+            distiller_optimizer=optimizer)
+        com_pass.config('./distillation/compress.yaml')
+        eval_graph = com_pass.run()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_factory.py b/python/paddle/fluid/contrib/slim/tests/test_factory.py
index 2fc72b6475e6bdd977dafb57696046a1100d0087..90eb8bd4b3caa44880f6df21c7f9f6d460655a8c 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_factory.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_factory.py
@@ -12,29 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.fluid.contrib.slim import ConfigFactory
+from paddle.fluid.contrib.slim.core import ConfigFactory
 import unittest
 
 
 class TestFactory(unittest.TestCase):
-    def test_parse(self):
-        factory = ConfigFactory('./configs/config.yaml')
+    def test_parse_pruning(self):
+        factory = ConfigFactory('./configs/filter_pruning.yaml')
 
-        pruner = factory.instance('pruner_1')
-        self.assertEquals(pruner.ratios['conv1_1.w'], 0.3)
+        pruner_1 = factory.instance('pruner_1')
+        self.assertEquals(pruner_1.pruning_axis['*'], 0)
+        self.assertEquals(pruner_1.criterions['*'], 'l1_norm')
 
-        pruner = factory.instance('pruner_2')
-        self.assertEquals(pruner.ratios['*'], 0.7)
+        strategy = factory.instance('sensitive_pruning_strategy')
+        pruner_1 = strategy.pruner
+        self.assertEquals(pruner_1.criterions['*'], 'l1_norm')
 
-        strategy = factory.instance('strategy_1')
-        pruner = strategy.pruner
-        self.assertEquals(pruner.ratios['*'], 0.7)
-
-        compress_pass = factory.get_compress_pass()
-        self.assertEquals(compress_pass.epoch, 100)
-
-        strategy = compress_pass.strategies[0]
-        self.assertEquals(strategy.delta_rate, 0.2)
+        self.assertEquals(strategy.start_epoch, 0)
+        self.assertEquals(strategy.sensitivities_file,
+                          'mobilenet_acc_top1_sensitive.data')
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py b/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1763039b3a962a43f2fe3a22c05cb32cba596ed
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py
@@ -0,0 +1,89 @@
+#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+import paddle
+import unittest
+import paddle.fluid as fluid
+from mobilenet import MobileNet
+from paddle.fluid.contrib.slim.core import Compressor
+from paddle.fluid.contrib.slim.graph import GraphWrapper
+
+
+class TestFilterPruning(unittest.TestCase):
+    def test_compression(self):
+        """
+        Model: mobilenet_v1
+        data: mnist
+        step1: Training one epoch
+        step2: pruning flops
+        step3: fine-tune one epoch
+        step4: check top1_acc.
+        """
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        class_dim = 10
+        image_shape = [1, 28, 28]
+        image = fluid.layers.data(
+            name='image', shape=image_shape, dtype='float32')
+        image.stop_gradient = False
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        out = MobileNet().net(input=image, class_dim=class_dim)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+        val_program = fluid.default_main_program().clone(for_test=False)
+
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        optimizer = fluid.optimizer.Momentum(
+            momentum=0.9,
+            learning_rate=0.01,
+            regularization=fluid.regularizer.L2Decay(4e-5))
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
+
+        val_feed_list = [('img', image.name), ('label', label.name)]
+        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
+                                                        acc_top5.name)]
+
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=128)
+        train_feed_list = [('img', image.name), ('label', label.name)]
+        train_fetch_list = [('loss', avg_cost.name)]
+
+        com_pass = Compressor(
+            place,
+            fluid.global_scope(),
+            fluid.default_main_program(),
+            train_reader=train_reader,
+            train_feed_list=train_feed_list,
+            train_fetch_list=train_fetch_list,
+            eval_program=val_program,
+            eval_reader=val_reader,
+            eval_feed_list=val_feed_list,
+            eval_fetch_list=val_fetch_list,
+            train_optimizer=optimizer)
+        com_pass.config('./filter_pruning/compress.yaml')
+        eval_graph = com_pass.run()
+        self.assertTrue(
+            abs((com_pass.context.eval_results['acc_top1'][-1] - 0.969) / 0.969)
+            < 0.02)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad82aa941183d72353dae19527b21286d6473a63
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py
@@ -0,0 +1,140 @@
+#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+from __future__ import print_function
+import unittest
+import paddle.fluid as fluid
+import six
+import numpy as np
+from paddle.fluid.contrib.slim.graph import GraphWrapper
+from paddle.fluid import core
+
+
+def residual_block(num):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=bias_attr)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    data = fluid.layers.data(name='image', shape=[1, 8, 8], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    data.stop_gradinet = False
+    hidden = data
+    for _ in six.moves.xrange(num):
+        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
+        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
+        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
+    fc = fluid.layers.fc(input=hidden, size=10)
+
+    loss = fluid.layers.cross_entropy(input=fc, label=label)
+    loss = fluid.layers.mean(loss)
+    return data, label, loss
+
+
+class TestGraphWrapper(unittest.TestCase):
+    def build_program(self):
+        place = fluid.CPUPlace()
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            image, label, self.loss = residual_block(2)
+            eval_program = main.clone()
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
+            opt.minimize(self.loss)
+        self.scope = core.Scope()
+        exe = fluid.Executor(place)
+        exe.run(startup, scope=self.scope)
+        self.eval_graph = GraphWrapper(
+            program=eval_program,
+            in_nodes={'image': image.name,
+                      'label': label.name},
+            out_nodes={'loss': self.loss.name})
+        self.train_graph = GraphWrapper(
+            program=main,
+            in_nodes={'image': image.name,
+                      'label': label.name},
+            out_nodes={'loss': self.loss.name})
+
+    def test_all_parameters(self):
+        self.build_program()
+        self.assertEquals(len(self.train_graph.all_parameters()), 24)
+
+    def test_all_vars(self):
+        self.build_program()
+        self.assertEquals(len(self.train_graph.vars()), 90)
+
+    def test_numel_params(self):
+        self.build_program()
+        self.assertEquals(self.train_graph.numel_params(), 13258)
+
+    def test_compile(self):
+        self.build_program()
+        place = fluid.CPUPlace()
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        self.train_graph.compile()
+        exe.run(self.train_graph.compiled_graph,
+                scope=self.scope,
+                feed={
+                    'image':
+                    np.random.randint(0, 40, [16, 1, 8, 8]).astype('float32'),
+                    'label': np.random.randint(0, 10, [16, 1]).astype('int64')
+                })
+
+    def test_pre_and_next_ops(self):
+        self.build_program()
+        for op in self.train_graph.ops():
+            for next_op in self.train_graph.next_ops(op):
+                self.assertTrue(op in self.train_graph.pre_ops(next_op))
+
+    def test_get_optimize_graph(self):
+        self.build_program()
+        place = fluid.CPUPlace()
+        if fluid.core.is_compiled_with_cuda():
+            place = fluid.CUDAPlace(0)
+        opt = fluid.optimizer.SGD(learning_rate=0.001)
+        train_graph = self.eval_graph.get_optimize_graph(
+            opt, place, self.scope, no_grad_var_names=['image'])
+        self.assertEquals(len(self.train_graph.ops()), len(train_graph.ops()))
+        exe = fluid.Executor(place)
+        train_graph.compile()
+        image = np.random.randint(0, 225, [16, 1, 8, 8]).astype('float32')
+        label = np.random.randint(0, 10, [16, 1]).astype('int64')
+        exe.run(train_graph.compiled_graph,
+                scope=self.scope,
+                feed={'image': image,
+                      'label': label})
+
+    def test_flops(self):
+        self.build_program()
+        self.assertEquals(self.train_graph.flops(), 354624)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
index 0b4b2a285f5de2596b5d30c6b2a6213762a64e7a..c7feca0b82606cdba9a05fb6de821aa6d347d4e6 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
@@ -127,7 +127,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
                             arg_name.endswith('.quantized.dequantized'))
                         self.assertTrue(arg_name in quantized_ops)
 
-    def linear_fc_quant(self, quant_type, for_ci=False):
+    def linear_fc_quant(self, activation_quant_type, for_ci=False):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
@@ -140,14 +140,15 @@ class TestQuantizationTransformPass(unittest.TestCase):
         transform_pass = QuantizationTransformPass(
             scope=fluid.global_scope(),
             place=place,
-            activation_quantize_type=quant_type)
+            activation_quantize_type=activation_quant_type)
         transform_pass.apply(graph)
         if not for_ci:
             marked_nodes = set()
             for op in graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            graph.draw('.', 'quantize_fc_' + quant_type, marked_nodes)
+            graph.draw('.', 'quantize_fc_' + activation_quant_type,
+                       marked_nodes)
         program = graph.to_program()
         self.check_program(transform_pass, program)
         val_graph = IrGraph(core.Graph(program.desc), for_test=False)
@@ -156,7 +157,8 @@ class TestQuantizationTransformPass(unittest.TestCase):
             for op in val_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     val_marked_nodes.add(op)
-            val_graph.draw('.', 'val_fc_' + quant_type, val_marked_nodes)
+            val_graph.draw('.', 'val_fc_' + activation_quant_type,
+                           val_marked_nodes)
 
     def test_linear_fc_quant_abs_max(self):
         self.linear_fc_quant('abs_max', for_ci=True)
@@ -167,7 +169,7 @@ class TestQuantizationTransformPass(unittest.TestCase):
     def test_linear_fc_quant_moving_average_abs_max(self):
         self.linear_fc_quant('moving_average_abs_max', for_ci=True)
 
-    def residual_block_quant(self, quant_type, for_ci=False):
+    def residual_block_quant(self, activation_quant_type, for_ci=False):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
@@ -180,14 +182,15 @@ class TestQuantizationTransformPass(unittest.TestCase):
         transform_pass = QuantizationTransformPass(
             scope=fluid.global_scope(),
             place=place,
-            activation_quantize_type=quant_type)
+            activation_quantize_type=activation_quant_type)
         transform_pass.apply(graph)
         if not for_ci:
             marked_nodes = set()
             for op in graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            graph.draw('.', 'quantize_residual_' + quant_type, marked_nodes)
+            graph.draw('.', 'quantize_residual_' + activation_quant_type,
+                       marked_nodes)
         program = graph.to_program()
         self.check_program(transform_pass, program)
         val_graph = IrGraph(core.Graph(program.desc), for_test=False)
@@ -196,7 +199,8 @@ class TestQuantizationTransformPass(unittest.TestCase):
             for op in val_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     val_marked_nodes.add(op)
-            val_graph.draw('.', 'val_residual_' + quant_type, val_marked_nodes)
+            val_graph.draw('.', 'val_residual_' + activation_quant_type,
+                           val_marked_nodes)
 
     def test_residual_block_abs_max(self):
         self.residual_block_quant('abs_max', for_ci=True)
@@ -209,7 +213,12 @@ class TestQuantizationTransformPass(unittest.TestCase):
 
 
 class TestQuantizationFreezePass(unittest.TestCase):
-    def freeze_graph(self, use_cuda, seed, quant_type, for_ci=False):
+    def freeze_graph(self,
+                     use_cuda,
+                     seed,
+                     activation_quant_type,
+                     weight_quant_type='abs_max',
+                     for_ci=False):
         def build_program(main, startup, is_test):
             main.random_seed = seed
             startup.random_seed = seed
@@ -243,7 +252,12 @@ class TestQuantizationFreezePass(unittest.TestCase):
         with fluid.scope_guard(scope):
             exe.run(startup)
         transform_pass = QuantizationTransformPass(
-            scope=scope, place=place, activation_quantize_type=quant_type)
+            scope=scope,
+            place=place,
+            activation_quantize_type=activation_quant_type,
+            weight_quantize_type=weight_quant_type)
+        #transform_pass = QuantizationTransformPass(
+        #    scope=scope, place=place, activation_quantize_type=activation_quant_type)
         transform_pass.apply(main_graph)
         transform_pass.apply(test_graph)
         dev_name = '_gpu_' if use_cuda else '_cpu_'
@@ -252,12 +266,14 @@ class TestQuantizationFreezePass(unittest.TestCase):
             for op in main_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            main_graph.draw('.', 'main' + dev_name + quant_type, marked_nodes)
+            main_graph.draw('.', 'main' + dev_name + activation_quant_type + '_'
+                            + weight_quant_type, marked_nodes)
             marked_nodes = set()
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test' + dev_name + quant_type, marked_nodes)
+            test_graph.draw('.', 'test' + dev_name + activation_quant_type + '_'
+                            + weight_quant_type, marked_nodes)
 
         build_strategy = fluid.BuildStrategy()
         build_strategy.memory_optimize = False
@@ -282,8 +298,9 @@ class TestQuantizationFreezePass(unittest.TestCase):
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
                 if not for_ci:
-                    print('{}: {}'.format('loss' + dev_name + quant_type,
-                                          loss_v))
+                    print('{}: {}'.format('loss' + dev_name +
+                                          activation_quant_type + '_' +
+                                          weight_quant_type, loss_v))
 
         test_data = next(test_reader())
         with fluid.program_guard(quantized_test_program):
@@ -296,14 +313,17 @@ class TestQuantizationFreezePass(unittest.TestCase):
                                           fetch_list=[loss, w_var])
 
         # Freeze graph for inference, but the weight of fc/conv is still float type.
-        freeze_pass = QuantizationFreezePass(scope=scope, place=place)
+        freeze_pass = QuantizationFreezePass(
+            scope=scope, place=place, weight_quantize_type=weight_quant_type)
+        #freeze_pass = QuantizationFreezePass(scope=scope, place=place)
         freeze_pass.apply(test_graph)
         if not for_ci:
             marked_nodes = set()
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test_freeze' + dev_name + quant_type,
+            test_graph.draw('.', 'test_freeze' + dev_name +
+                            activation_quant_type + '_' + weight_quant_type,
                             marked_nodes)
 
         server_program = test_graph.to_program()
@@ -313,18 +333,20 @@ class TestQuantizationFreezePass(unittest.TestCase):
                                   fetch_list=[loss])
         self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
         if not for_ci:
-            print('{}: {}'.format('test_loss1' + dev_name + quant_type,
-                                  test_loss1))
-            print('{}: {}'.format('test_loss2' + dev_name + quant_type,
-                                  test_loss2))
+            print(
+                '{}: {}'.format('test_loss1' + dev_name + activation_quant_type
+                                + '_' + weight_quant_type, test_loss1))
+            print(
+                '{}: {}'.format('test_loss2' + dev_name + activation_quant_type
+                                + '_' + weight_quant_type, test_loss2))
         w_freeze = np.array(scope.find_var('conv2d_1.w_0').get_tensor())
         # Maybe failed, this is due to the calculation precision
         # self.assertAlmostEqual(np.sum(w_freeze), np.sum(w_quant))
         if not for_ci:
-            print('{}: {}'.format('w_freeze' + dev_name + quant_type,
-                                  np.sum(w_freeze)))
-            print('{}: {}'.format('w_quant' + dev_name + quant_type,
-                                  np.sum(w_quant)))
+            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
+                                  + '_' + weight_quant_type, np.sum(w_freeze)))
+            print('{}: {}'.format('w_quant' + dev_name + activation_quant_type +
+                                  '_' + weight_quant_type, np.sum(w_quant)))
 
         # Convert parameter to 8-bit.
         convert_int8_pass = ConvertToInt8Pass(scope=scope, place=place)
@@ -334,26 +356,28 @@ class TestQuantizationFreezePass(unittest.TestCase):
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test_int8' + dev_name + quant_type,
-                            marked_nodes)
+            test_graph.draw('.', 'test_int8' + dev_name + activation_quant_type
+                            + '_' + weight_quant_type, marked_nodes)
         server_program_int8 = test_graph.to_program()
         # Save the 8-bit parameter and model file.
         with fluid.scope_guard(scope):
-            fluid.io.save_inference_model('server_int8' + dev_name + quant_type,
-                                          ['image', 'label'], [loss], exe,
-                                          server_program_int8)
+            fluid.io.save_inference_model(
+                'server_int8' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, ['image', 'label'], [loss], exe,
+                server_program_int8)
             # Test whether the 8-bit parameter and model file can be loaded successfully.
             [infer, feed, fetch] = fluid.io.load_inference_model(
-                'server_int8' + dev_name + quant_type, exe)
+                'server_int8' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, exe)
         # Check the loaded 8-bit weight.
         w_8bit = np.array(scope.find_var('conv2d_1.w_0.int8').get_tensor())
         self.assertEqual(w_8bit.dtype, np.int8)
         self.assertEqual(np.sum(w_8bit), np.sum(w_freeze))
         if not for_ci:
-            print('{}: {}'.format('w_8bit' + dev_name + quant_type,
-                                  np.sum(w_8bit)))
-            print('{}: {}'.format('w_freeze' + dev_name + quant_type,
-                                  np.sum(w_freeze)))
+            print('{}: {}'.format('w_8bit' + dev_name + activation_quant_type +
+                                  '_' + weight_quant_type, np.sum(w_8bit)))
+            print('{}: {}'.format('w_freeze' + dev_name + activation_quant_type
+                                  + '_' + weight_quant_type, np.sum(w_freeze)))
 
         mobile_pass = TransformForMobilePass()
         mobile_pass.apply(test_graph)
@@ -362,42 +386,103 @@ class TestQuantizationFreezePass(unittest.TestCase):
             for op in test_graph.all_op_nodes():
                 if op.name().find('quantize') > -1:
                     marked_nodes.add(op)
-            test_graph.draw('.', 'test_mobile' + dev_name + quant_type,
+            test_graph.draw('.', 'test_mobile' + dev_name +
+                            activation_quant_type + '_' + weight_quant_type,
                             marked_nodes)
 
         mobile_program = test_graph.to_program()
         with fluid.scope_guard(scope):
-            fluid.io.save_inference_model('mobile_int8' + dev_name + quant_type,
-                                          ['image', 'label'], [loss], exe,
-                                          mobile_program)
+            fluid.io.save_inference_model(
+                'mobile_int8' + dev_name + activation_quant_type + '_' +
+                weight_quant_type, ['image', 'label'], [loss], exe,
+                mobile_program)
 
     def test_freeze_graph_cuda_dynamic(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
                 self.freeze_graph(
-                    True, seed=1, quant_type='abs_max', for_ci=True)
+                    True,
+                    seed=1,
+                    activation_quant_type='abs_max',
+                    weight_quant_type='abs_max',
+                    for_ci=True)
+            with fluid.unique_name.guard():
+                self.freeze_graph(
+                    True,
+                    seed=1,
+                    activation_quant_type='abs_max',
+                    weight_quant_type='channel_wise_abs_max',
+                    for_ci=True)
 
     def test_freeze_graph_cpu_dynamic(self):
         with fluid.unique_name.guard():
-            self.freeze_graph(False, seed=2, quant_type='abs_max', for_ci=True)
+            self.freeze_graph(
+                False,
+                seed=2,
+                activation_quant_type='abs_max',
+                weight_quant_type='abs_max',
+                for_ci=True)
+            self.freeze_graph(
+                False,
+                seed=2,
+                activation_quant_type='abs_max',
+                weight_quant_type='channel_wise_abs_max',
+                for_ci=True)
 
     def test_freeze_graph_cuda_static(self):
         if fluid.core.is_compiled_with_cuda():
             with fluid.unique_name.guard():
                 self.freeze_graph(
-                    True, seed=1, quant_type='range_abs_max', for_ci=True)
+                    True,
+                    seed=1,
+                    activation_quant_type='range_abs_max',
+                    weight_quant_type='abs_max',
+                    for_ci=True)
+                self.freeze_graph(
+                    True,
+                    seed=1,
+                    activation_quant_type='moving_average_abs_max',
+                    weight_quant_type='abs_max',
+                    for_ci=True)
                 self.freeze_graph(
                     True,
                     seed=1,
-                    quant_type='moving_average_abs_max',
+                    activation_quant_type='range_abs_max',
+                    weight_quant_type='channel_wise_abs_max',
+                    for_ci=True)
+                self.freeze_graph(
+                    True,
+                    seed=1,
+                    activation_quant_type='moving_average_abs_max',
+                    weight_quant_type='channel_wise_abs_max',
                     for_ci=True)
 
     def test_freeze_graph_cpu_static(self):
         with fluid.unique_name.guard():
             self.freeze_graph(
-                False, seed=2, quant_type='range_abs_max', for_ci=True)
+                False,
+                seed=2,
+                activation_quant_type='range_abs_max',
+                weight_quant_type='abs_max',
+                for_ci=True)
+            self.freeze_graph(
+                False,
+                seed=2,
+                activation_quant_type='moving_average_abs_max',
+                weight_quant_type='abs_max',
+                for_ci=True)
+            self.freeze_graph(
+                False,
+                seed=2,
+                activation_quant_type='range_abs_max',
+                weight_quant_type='channel_wise_abs_max',
+                for_ci=True)
             self.freeze_graph(
-                False, seed=2, quant_type='moving_average_abs_max', for_ci=True)
+                False,
+                seed=2,
+                activation_quant_type='moving_average_abs_max',
+                weight_quant_type='channel_wise_abs_max',
+                for_ci=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..92afd892afed86e69266c9ab9c97d90daebb86d5
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py
@@ -0,0 +1,82 @@
+#   copyright (c) 2019 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+import paddle
+import unittest
+import paddle.fluid as fluid
+from mobilenet import MobileNet
+from paddle.fluid.contrib.slim.core import Compressor
+from paddle.fluid.contrib.slim.graph import GraphWrapper
+
+
+class TestQuantizationStrategy(unittest.TestCase):
+    """
+    Test API of quantization strategy.
+    """
+
+    def test_compression(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        class_dim = 10
+        image_shape = [1, 28, 28]
+        image = fluid.layers.data(
+            name='image', shape=image_shape, dtype='float32')
+        image.stop_gradient = False
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        out = MobileNet(name='quan').net(input=image, class_dim=class_dim)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+        val_program = fluid.default_main_program().clone(for_test=False)
+
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        optimizer = fluid.optimizer.Momentum(
+            momentum=0.9,
+            learning_rate=0.01,
+            regularization=fluid.regularizer.L2Decay(4e-5))
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
+
+        val_feed_list = [('img', image.name), ('label', label.name)]
+        val_fetch_list = [('acc_top1', acc_top1.name), ('acc_top5',
+                                                        acc_top5.name)]
+
+        train_reader = paddle.batch(
+            paddle.dataset.mnist.train(), batch_size=128)
+        train_feed_list = [('img', image.name), ('label', label.name)]
+        train_fetch_list = [('loss', avg_cost.name)]
+
+        com_pass = Compressor(
+            place,
+            fluid.global_scope(),
+            fluid.default_main_program(),
+            train_reader=train_reader,
+            train_feed_list=train_feed_list,
+            train_fetch_list=train_fetch_list,
+            eval_program=val_program,
+            eval_reader=val_reader,
+            eval_feed_list=val_feed_list,
+            eval_fetch_list=val_fetch_list,
+            train_optimizer=optimizer)
+        com_pass.config('./quantization/compress.yaml')
+        eval_graph = com_pass.run()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py
index b9f938bebed71dc9611df8d743a066858ea38bca..00885eb5d6057b4a7738705007a9334da6aea9d0 100644
--- a/python/paddle/fluid/contrib/tests/test_calibration.py
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
@@ -136,7 +136,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
                                                         "full_data", False)
         else:
             data_urls.append(
-                'http://paddle-inference-dist.cdn.bcebos.com/int8/calibration_test_data.tar.gz'
+                'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz'
             )
             data_md5s.append('1b6c1c434172cca1bf9ba1e4d7a3157d')
             self.data_cache_folder = self.download_data(data_urls, data_md5s,
@@ -189,7 +189,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
     def download_model(self):
         # resnet50 fp32 data
         data_urls = [
-            'http://paddle-inference-dist.cdn.bcebos.com/int8/resnet50_int8_model.tar.gz'
+            'http://paddle-inference-dist.bj.bcebos.com/int8/resnet50_int8_model.tar.gz'
         ]
         data_md5s = ['4a5194524823d9b76da6e738e1367881']
         self.model_cache_folder = self.download_data(data_urls, data_md5s,
@@ -290,7 +290,7 @@ class TestCalibrationForResnet50(unittest.TestCase):
             self.model, self.infer_iterations)
         (int8_throughput, int8_latency,
          int8_acc1) = self.run_program("calibration_out")
-        delta_value = np.abs(fp32_acc1 - int8_acc1)
+        delta_value = fp32_acc1 - int8_acc1
         self.assertLess(delta_value, 0.01)
         print(
             "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}".
@@ -307,7 +307,7 @@ class TestCalibrationForMobilenetv1(TestCalibrationForResnet50):
     def download_model(self):
         # mobilenetv1 fp32 data
         data_urls = [
-            'http://paddle-inference-dist.cdn.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
+            'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz'
         ]
         data_md5s = ['13892b0716d26443a8cdea15b3c6438b']
         self.model_cache_folder = self.download_data(data_urls, data_md5s,
diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py
index 3dac41ce43d61c02f3e11087aef98e2fc454556b..00c4e5691a23a9864ed3e8964f4cafaf9588c665 100644
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -26,6 +26,24 @@ from .framework import Variable, default_main_program
 __all__ = ['DataFeeder']
 
 
+def convert_dtype(dtype):
+    if dtype == core.VarDesc.VarType.FP32:
+        return 'float32'
+    elif dtype == core.VarDesc.VarType.INT64:
+        return 'int64'
+    elif dtype == core.VarDesc.VarType.FP64:
+        return 'float64'
+    elif dtype == core.VarDesc.VarType.FP16:
+        return 'float16'
+    elif dtype == core.VarDesc.VarType.INT32:
+        return 'int32'
+    elif dtype == core.VarDesc.VarType.UINT8:
+        return 'uint8'
+    else:
+        raise ValueError("dtype must be any of [int32, float32, int64, "
+                         "float64, uint8]")
+
+
 class DataToLoDTensorConverter(object):
     def __init__(self, place, lod_level, shape, dtype):
         self.place = place
@@ -38,27 +56,12 @@ class DataToLoDTensorConverter(object):
             if negtive_count > 1:
                 self.shape = None
                 break
-        if dtype == core.VarDesc.VarType.FP32:
-            self.dtype = 'float32'
-        elif dtype == core.VarDesc.VarType.INT64:
-            self.dtype = 'int64'
-        elif dtype == core.VarDesc.VarType.FP64:
-            self.dtype = 'float64'
-        elif dtype == core.VarDesc.VarType.FP16:
-            self.dtype = 'float16'
-        elif dtype == core.VarDesc.VarType.INT32:
-            self.dtype = 'int32'
-        elif dtype == core.VarDesc.VarType.UINT8:
-            self.dtype = 'uint8'
-        else:
-            raise ValueError("dtype must be any of [int32, float32, int64, "
-                             "float64, uint8]")
+        self.dtype = convert_dtype(dtype)
+        self._reset()
 
+    def _reset(self):
         self.data = []
-        self.lod = []
-
-        for i in six.moves.range(lod_level):
-            self.lod.append([])
+        self.lod = [[] for _ in six.moves.range(self.lod_level)]
 
     def feed(self, data):
         self._feed_impl_(data, self.lod, self.lod_level)
@@ -88,15 +91,52 @@ class DataToLoDTensorConverter(object):
                     raise ValueError(
                         "Reshape error. What is defined in data layer is {}, but receive {}"
                         .format(self.shape, arr.shape))
-            #else:
-            #    self._check_shape(arr.shape)
         t = core.LoDTensor()
         t.set(arr, self.place)
         if self.lod_level > 0:
             t.set_recursive_sequence_lengths(self.lod)
+        self._reset()
         return t
 
 
+class BatchedTensorProvider(object):
+    def __init__(self, feed_list, place, batch_size, generator, drop_last):
+        self.place = place
+        self.batch_size = batch_size
+        self.generator = generator
+        self.converters = []
+        self.drop_last = drop_last
+
+        for var in feed_list:
+            assert var.lod_level == 0, "lod_level must be 0"
+            self.converters.append(
+                DataToLoDTensorConverter(
+                    place=self.place,
+                    lod_level=0,
+                    shape=var.shape,
+                    dtype=var.dtype))
+
+    def _done(self):
+        return [c.done() for c in self.converters]
+
+    def __call__(self):
+        idx = 0
+        for each_sample in self.generator():
+            for each_slot, each_converter in six.moves.zip(each_sample,
+                                                           self.converters):
+                each_converter.data.append(each_slot)
+
+            idx += 1
+            if idx == self.batch_size:
+                idx = 0
+                yield self._done()
+
+        if not self.drop_last and idx > 0:
+            yield self._done()
+        else:
+            [c._reset() for c in self.converters]
+
+
 class DataFeeder(object):
     """
     DataFeeder converts the data that returned by a reader into a data
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 03aa9917f3201e690a7072442cf11ac2284b03c5..018e38cbb3f2676ac05f1a27e9e92b6e0f16efb0 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -564,6 +564,10 @@ class Executor(object):
 
         if feed is None:
             feed = {}
+        elif isinstance(feed, (list, tuple)):
+            assert len(feed) == 1, "Not compiled with data parallel"
+            feed = feed[0]
+
         if not isinstance(feed, dict):
             raise TypeError(
                 "feed requires dict as its Parameter. But you passed in %s" %
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 556ce71ee585fd24bc983b4fcedc2fbdfb016889..b25d9441e0098ffaa7801cb9029d786587e74c25 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -26,6 +26,7 @@ import six
 
 import numpy as np
 import subprocess
+import multiprocessing
 
 from .. import compat as cpt
 from .proto import framework_pb2
@@ -63,6 +64,9 @@ __all__ = [
     'default_main_program',
     'program_guard',
     'name_scope',
+    'cuda_places',
+    'cpu_places',
+    'cuda_pinned_places',
 ]
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -87,6 +91,87 @@ def _current_expected_place():
     return _imperative_current_expected_place_
 
 
+def _cpu_num():
+    return int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
+
+
+def cuda_places(device_ids=None):
+    '''
+    Create a list of :code:`fluid.CUDAPlace` objects.
+
+    If :code:`device_ids` is None, environment variable of
+    :code:`FLAGS_selected_gpus` would be checked first. If
+    :code:`FLAGS_selected_gpus=0,1,2`, the returned list would
+    be [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
+    If :code:`FLAGS_selected_gpus` is not set, all visible
+    gpu places would be returned.  
+
+    If :code:`device_ids` is not None, it should be the device
+    ids of gpus. For example, if :code:`device_ids=[0,1,2]`, 
+    the returned list would be 
+    [fluid.CUDAPlace(0), fluid.CUDAPlace(1), fluid.CUDAPlace(2)].
+    
+    Args: 
+        device_ids (None|list(int)|tuple(int)): gpu device id list.
+
+    Returns:
+        out (list(fluid.CUDAPlace)): gpu place list.
+    '''
+    assert core.is_compiled_with_cuda(), \
+        "Not compiled with CUDA"
+    if device_ids is None:
+        gpus_env = os.getenv("FLAGS_selected_gpus")
+        if gpus_env:
+            device_ids = [int(s) for s in gpus_env.split(",")]
+        else:
+            device_ids = six.moves.range(core.get_cuda_device_count())
+    elif not isinstance(device_ids, (list, tuple)):
+        device_ids = [device_ids]
+    return [core.CUDAPlace(dev_id) for dev_id in device_ids]
+
+
+def cpu_places(device_count=None):
+    '''
+    Create a list of :code:`fluid.CPUPlace` objects.
+    
+    If :code:`device_count` is None, the device count would
+    be determined by environment variable :code:`CPU_NUM`. 
+    If :code:`CPU_NUM` is not set, the device count would
+    be determined by :code:`multiprocessing.cpu_count()`. 
+
+    Args:
+        device_count (None|int): device number.
+
+    Returns:
+        out (list(fluid.CPUPlace)): cpu place list.
+    '''
+    if device_count is None:
+        device_count = _cpu_num()
+    return [core.CPUPlace()] * device_count
+
+
+def cuda_pinned_places(device_count=None):
+    '''
+    Create a list of :code:`fluid.CUDAPinnedPlace` objects.
+
+    If :code:`device_count` is None, the device count would
+    be determined by environment variable :code:`CPU_NUM`. 
+    If :code:`CPU_NUM` is not set, the device count would
+    be determined by :code:`multiprocessing.cpu_count()`. 
+
+    Args:
+        device_count (None|int): device number.
+
+    Returns:
+        out (list(fluid.CUDAPinnedPlace)): cuda pinned place list.
+    '''
+    assert core.is_compiled_with_cuda(), \
+        "Not compiled with CUDA"
+    if device_count is None:
+        device_count = _cpu_num()
+    return [core.cuda_pinned_places()] * device_count
+
+
 class NameScope(object):
     def __init__(self, name="", parent=None):
         self._children = dict()
@@ -318,8 +403,8 @@ class Variable(object):
                 self._ivar = core.VarBase(
                     name, dtype if dtype else core.VarDesc.VarType.FP32,
                     list(shape) if shape else [],
-                    _current_expected_place(), True
-                    if persistable else False, stop_gradient)
+                    _current_expected_place(), stop_gradient, True
+                    if persistable else False)
             if persistable:
                 _imperative_tracer().trace_var(name, self)
         else:
@@ -644,10 +729,9 @@ class Operator(object):
                                 outputs={"Out": [var1]})
     """
     OP_WITHOUT_KERNEL_SET = {
-        'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
-        'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
-        'listen_and_serv', 'save_combine', 'load_combine', 'ncclInit', 'select',
-        'checkpoint_notify', 'gen_nccl_id'
+        'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad',
+        'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
+        'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
     }
 
     def __init__(self,
@@ -1560,12 +1644,15 @@ class Block(object):
                 name=v.name)
             self.vars[new_p.name] = new_p
 
-    def _clone_variable(self, var):
+    def _clone_variable(self, var, force_persistable=True):
         """
         Clone a variable into current block.
 
         Args:
             var: the variable to be cloned.
+            force_persistable(bool): True means setting the result variable to being persistable.
+                                     False means setting the persistable the same with that of input var.
+                                     default: True.
 
         Returns:
             Variable: the new  variable cloned from 'var' in current block.
@@ -1585,7 +1672,7 @@ class Block(object):
                 shape=var.shape,
                 dtype=var.dtype,
                 type=var.type,
-                persistable=True,
+                persistable=True if force_persistable else var.persistable,
                 is_data=var.is_data)
         else:
             ret_var = self.create_var(
@@ -1594,7 +1681,7 @@ class Block(object):
                 dtype=var.dtype,
                 type=var.type,
                 lod_level=var.lod_level,
-                persistable=True,
+                persistable=True if force_persistable else var.persistable,
                 is_data=var.is_data)
         return ret_var
 
diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py
index 7f31ca1b9b70a05d22eca325b38fe2cb5ff15b03..7281b3ea4b961a14126023a14a2ba2f03c7d1387 100644
--- a/python/paddle/fluid/imperative/__init__.py
+++ b/python/paddle/fluid/imperative/__init__.py
@@ -29,9 +29,13 @@ from .tracer import *
 from . import profiler
 from .profiler import *
 
+from . import checkpoint
+from .checkpoint import *
+
 __all__ = []
 __all__ += layers.__all__
 __all__ += base.__all__
 __all__ += nn.__all__
 __all__ += tracer.__all__
 __all__ += profiler.__all__
+__all__ += checkpoint.__all__
diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py
index 174f138bfa2d3cfaa433c3235c2b0f9a5650e756..d619c09b1bdd704700af219856148524d9d0d8db 100644
--- a/python/paddle/fluid/imperative/base.py
+++ b/python/paddle/fluid/imperative/base.py
@@ -44,7 +44,7 @@ def guard(place=None):
                     yield
 
 
-def to_variable(value, block=None):
+def to_variable(value, block=None, name=None):
     if isinstance(value, np.ndarray):
         assert enabled(), "to_variable could only be called in imperative mode"
 
@@ -53,7 +53,7 @@ def to_variable(value, block=None):
         py_var = framework.Variable(
             block,
             type=core.VarDesc.VarType.LOD_TENSOR,
-            name=None,
+            name=name,
             shape=value.shape,
             dtype=value.dtype)
         var = py_var._ivar.value()
diff --git a/python/paddle/fluid/imperative/checkpoint.py b/python/paddle/fluid/imperative/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..37c43f29d2ae9214058238e4f834dbbcd9e42df1
--- /dev/null
+++ b/python/paddle/fluid/imperative/checkpoint.py
@@ -0,0 +1,187 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import collections
+from .. import core
+from ..framework import Variable, default_main_program
+
+__all__ = ['save_persistables', 'load_persistables']
+
+
+def save_persistables(vardict, dirname, filename=None):
+    """
+    This function filters out all variables in layer.parameters from the
+    give `layer` and then trys to load these variables from the folder
+    `dirname` or the file `filename`.
+
+    Use the `dirname` to specify the folder where persistable variables were
+    saved. If variables were saved in separate files, set `filename` None;
+    if all variables were saved in a single file, use `filename` to specify
+    the file name.
+
+    Args:
+        vardict(dict of Parameters): The parameters will
+                                    be saved. If it is None, nothing
+                                    will be deal.
+        dirname(str): The directory path.
+        filename(str|None): The file which saved all variables. If variables were
+                            saved in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+
+    Examples:
+        .. code-block:: python
+            ptb_model = PtbModel(
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            x_data = np.arange(12).reshape(4, 3).astype('int64')
+            y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+            x_data = x_data.reshape((-1, num_steps, 1))
+            y_data = y_data.reshape((-1, 1))
+            init_hidden_data = np.zeros(
+                (num_layers, batch_size, hidden_size), dtype='float32')
+            init_cell_data = np.zeros(
+                (num_layers, batch_size, hidden_size), dtype='float32')
+            x = to_variable(x_data)
+            y = to_variable(y_data)
+            init_hidden = to_variable(init_hidden_data)
+            init_cell = to_variable(init_cell_data)
+            dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
+                                                        init_cell)
+            param_path = "./my_paddle_model"
+            fluid.imperative.checkpoint.save_persistables(ptb_model.state_dict(), dirname=param_path,
+                                       layer=ptb_model)
+    """
+    if isinstance(vardict, collections.OrderedDict):
+        _save_var_to_file(vardict, dirname, filename)
+
+
+def load_persistables(vardict, dirname, filename=None):
+    """
+    This function trys to load persistable variables from the folder
+    `dirname` or the file `filename`.
+
+    Use the `dirname` to specify the folder where persistable variables were
+    saved. If variables were saved in separate files, set `filename` None;
+    if all variables were saved in a single file, use `filename` to specify
+    the file name.
+
+    Args:
+        vardict(dict of Parameters): The parameters will be loaded.
+        dirname(str): The directory path.
+        filename(str|None): The file which saved all variables, this file path should be end with '.npz'. If variables were
+                            saved in differnet files, set it to None.
+                            Default: None
+
+    Returns:
+        dict: The parameter-dict resumed from file
+
+    Examples:
+        .. code-block:: python
+            my_layer = layer(fluid.imperative.Layer)
+            param_path = "./my_paddle_model"
+
+            param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.parameters(), param_path)
+            param_1 = param_dict['PtbModel_0.w_1']
+
+            or:
+            my_layer = layer(fluid.imperative.Layer)
+            param_path = "./my_paddle_model"
+            filename = "model.file"
+            param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.state_dict(), param_path,
+                                                                       filename=filename)
+            param_1 = param_dict['PtbModel_0.w_1']
+
+        """
+    if isinstance(vardict, collections.OrderedDict):
+        return _load_var_from_file(vardict, dirname, filename)
+
+    return {}
+
+
+def _save_var_to_file(stat_dict, file_dir, file_name):
+    save_block = default_main_program().global_block()
+    save_var_map = {}
+    for each_var in stat_dict.items():
+        save_var_map[each_var.name] = each_var
+        if file_name is None:
+            save_block.append_op(
+                type='save',
+                inputs={'X': [each_var]},
+                outputs={},
+                attrs={'file_path': os.path.join(file_dir, each_var.name)})
+
+    if file_name is not None:
+        save_var_list = []
+        for name in sorted(save_var_map.keys()):
+            save_var_list.append(save_var_map[name])
+
+        save_block.append_op(
+            type='save_combine',
+            inputs={'X': save_var_list},
+            outputs={},
+            attrs={'file_path': os.path.join(file_dir, file_name)})
+
+
+def _load_var_from_file(stat_dict, file_dir, file_name):
+    load_block = default_main_program().global_block()
+    load_var_map = {}
+
+    for each_var in stat_dict.items():
+        assert isinstance(each_var, Variable)
+        if each_var.type == core.VarDesc.VarType.RAW:
+            continue
+        new_var = _clone_var_in_block_(load_block, each_var)
+        if file_name is None:
+            load_block.append_op(
+                type='load',
+                inputs={},
+                outputs={'Out': [new_var]},
+                attrs={'file_path': os.path.join(file_dir, each_var.name)})
+
+        load_var_map[new_var.name] = new_var
+
+    if file_name is not None:
+        load_var_list = []
+        for name in sorted(load_var_map.keys()):
+            load_var_list.append(load_var_map[name])
+
+        load_block.append_op(
+            type='load_combine',
+            inputs={},
+            outputs={"Out": load_var_list},
+            attrs={'file_path': os.path.join(file_dir, file_name)})
+        for res_var in load_var_list:
+            load_var_map[res_var.name] = res_var
+
+    return load_var_map
+
+
+def _clone_var_in_block_(block, var):
+    assert isinstance(var, Variable)
+    return block.create_var(
+        name=var.name,
+        shape=var.shape,
+        dtype=var.dtype,
+        type=var.type,
+        lod_level=var.lod_level,
+        persistable=True)
diff --git a/python/paddle/fluid/imperative/layer_object_helper.py b/python/paddle/fluid/imperative/layer_object_helper.py
index 6afffe3636dd79d124a5b0e9d9eccb02630f5b8c..0dac99a49183614b080c02278fd8aa4e9a70cb01 100644
--- a/python/paddle/fluid/imperative/layer_object_helper.py
+++ b/python/paddle/fluid/imperative/layer_object_helper.py
@@ -105,6 +105,7 @@ class LayerObjectHelper(LayerHelperBase):
 
         Returns dtype of the input
         """
+        inputs_in = inputs_in if (inputs_in is not None) else []
         inputs = self._multiple_input(inputs_in)
         dtype = None
         for each in inputs:
diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index 71d169a7dc36d5b2bd90e513f10c179006f89382..e64667f7f467d0d0a3c07d14ce22c3f231e82eb6 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -17,10 +17,12 @@ import contextlib
 import sys
 import numpy as np
 import collections
+import six
 from .. import unique_name
 from paddle.fluid import core
 from .layer_object_helper import LayerObjectHelper
 from paddle.fluid import framework
+from ..param_attr import ParamAttr
 
 __all__ = ['Layer', 'PyLayer']
 
@@ -72,6 +74,10 @@ class Layer(core.Layer):
 
         Returns created parameter Variable.
         """
+        if isinstance(attr, ParamAttr) and (attr.name is not None):
+            attr.name = ".".join([self._full_name, attr.name])
+        elif isinstance(attr, six.string_types):
+            attr = ".".join([self._full_name, attr])
         return self._helper.create_parameter(attr, shape, dtype, is_bias,
                                              default_initializer)
 
@@ -164,6 +170,7 @@ class Layer(core.Layer):
             the sublayer passed in.
         """
         assert isinstance(sublayer, core.Layer)
+
         self._sub_layers[name] = sublayer
         return sublayer
 
@@ -212,6 +219,34 @@ class Layer(core.Layer):
         else:
             object.__delattr__(self, name)
 
+    def state_dict(self, destination=None, prefix='', include_sublayers=True):
+        if destination is None:
+            destination = collections.OrderedDict()
+        for name, data in self._parameters.items():
+            if data is not None:
+                destination[prefix + name] = data
+
+        if include_sublayers:
+            for layer_name, layer_item in self._sub_layers.items():
+                if layer_item is not None:
+                    destination_temp = destination.copy()
+                    destination_temp.update(
+                        layer_item.state_dict(destination_temp, prefix +
+                                              layer_name + ".",
+                                              include_sublayers))
+                    destination = destination_temp
+        return destination
+
+    def load_dict(self, stat_dict, include_sublayers=True):
+        for name, item in self.__dict__.get('_parameters', None).items():
+            if item.name in stat_dict:
+                self.__setattr__(name, stat_dict[item.name])
+
+        if include_sublayers:
+            for layer_name, layer_item in self._sub_layers.items():
+                if layer_item is not None:
+                    layer_item.load_dict(stat_dict)
+
 
 class PyLayer(core.PyLayer):
     """Layers composed of user-defined python codes."""
diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 604ff753491925be8194522b3efdb77a7e9c3dfa..9cbfc3e3eb7f56dd6fdd1a8ad3c65d1ba03c20fd 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -20,10 +20,12 @@ from .. import core
 from ..layers import utils
 from . import layers
 from ..framework import Variable, OpProtoHolder
+from ..layers import layer_function_generator
 from ..param_attr import ParamAttr
 from ..initializer import Normal, Constant
-
-__all__ = ['Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit']
+__all__ = [
+    'Conv2D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm'
+]
 
 
 class Conv2D(layers.Layer):
@@ -438,7 +440,6 @@ class Embedding(layers.Layer):
         self._size = size
         self._is_sparse = is_sparse
         self._is_distributed = is_distributed
-
         self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
             size[0] + padding_idx)
 
@@ -471,6 +472,131 @@ class Embedding(layers.Layer):
         return out
 
 
+class LayerNorm(layers.Layer):
+    def __init__(self,
+                 name_scope,
+                 scale=True,
+                 shift=True,
+                 begin_norm_axis=1,
+                 epsilon=1e-05,
+                 param_attr=None,
+                 bias_attr=None,
+                 act=None):
+        """
+        ${comment}
+
+        The formula is as follows:
+
+        ..  math::
+
+            \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
+
+            \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
+
+            h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
+
+        * :math:`a`: the vector representation of the summed inputs to the neurons
+        in that layer.
+
+        * :math:`H`: the number of hidden units in a layers
+
+        * :math:`g`: the trainable scale parameter.
+
+        * :math:`b`: the trainable bias parameter.
+
+        Args:
+            input(Variable): The input tensor variable.
+            scale(bool): Whether to learn the adaptive gain :math:`g` after
+                normalization. Default True.
+            shift(bool): Whether to learn the adaptive bias :math:`b` after
+                normalization. Default True.
+            begin_norm_axis(int): The normalization will be performed along
+                dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
+                Default 1.
+            epsilon(float): The small value added to the variance to prevent
+                division by zero. Default 1e-05.
+            param_attr(ParamAttr|None): The parameter attribute for the learnable
+                gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
+                omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
+                a default :code:`ParamAttr` would be added as scale. The
+                :attr:`param_attr` is initialized as 1 if it is added. Default None.
+            bias_attr(ParamAttr|None): The parameter attribute for the learnable
+                bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
+                omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
+                a default :code:`ParamAttr` would be added as bias. The
+                :attr:`bias_attr` is initialized as 0 if it is added. Default None.
+            act(str): Activation to be applied to the output of layer normalizaiton.
+                      Default None.
+        Returns:
+            ${y_comment}
+
+        Examples:
+
+            >>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
+            >>>                          dtype='float32')
+            >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
+        """
+
+        super(LayerNorm, self).__init__(name_scope)
+        self._scale = scale
+        self._shift = shift
+        self._begin_norm_axis = begin_norm_axis
+        self._epsilon = epsilon
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._act = act
+
+    def _build_once(self, input):
+        self._dtype = self._helper.input_dtype(input)
+        input_shape = input.shape
+        param_shape = [
+            reduce(lambda x, y: x * y, input_shape[self._begin_norm_axis:])
+        ]
+        if self._scale:
+            self._scale_w = self.create_parameter(
+                attr=self._param_attr,
+                shape=param_shape,
+                dtype=self._dtype,
+                default_initializer=Constant(1.0))
+        if self._shift:
+            assert self._bias_attr is not False
+            self._bias_w = self.create_parameter(
+                attr=self._bias_attr,
+                shape=param_shape,
+                dtype=self._dtype,
+                is_bias=True)
+
+    def forward(self, input):
+        inputs = dict()
+        inputs['X'] = input
+        if self._scale:
+            inputs['Scale'] = self._scale_w
+        if self._shift:
+            inputs['Bias'] = self._bias_w
+        # create output
+        mean_out = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        variance_out = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype, stop_gradient=True)
+        layer_norm_out = self._helper.create_variable_for_type_inference(
+            self._dtype)
+
+        self._helper.append_op(
+            type="layer_norm",
+            inputs=inputs,
+            outputs={
+                "Y": layer_norm_out,
+                "Mean": mean_out,
+                "Variance": variance_out,
+            },
+            attrs={
+                "epsilon": self._epsilon,
+                "begin_norm_axis": self._begin_norm_axis
+            })
+
+        return self._helper.append_activation(layer_norm_out)
+
+
 class GRUUnit(layers.Layer):
     """
     **GRU unit layer**
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 326a84d82b5718dad898620a6d9e0490f7519448..4d5523627218601d00021c72a8777b4b6413880e 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -26,12 +26,14 @@ from paddle.fluid import layers
 from paddle.fluid.executor import Executor
 from paddle.fluid.evaluator import Evaluator
 from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, program_guard
+from . import reader
+from .reader import *
 from . import core
 
 __all__ = [
     'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
     'load_persistables', 'save_inference_model', 'load_inference_model'
-]
+] + reader.__all__
 
 
 def is_parameter(var):
diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py
index 3504cb7935178f28369914ecbd93c24b82622b11..a68160d797bcaca8cff849c560960d6a8823de53 100644
--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
@@ -268,11 +268,9 @@ class LayerHelperBase(object):
         """
         # Deepcopy the attr so that parameters can be shared in program
         attr = copy.deepcopy(attr)
-        if attr is None:
-            attr = ParamAttr._to_attr(attr)
+        attr = ParamAttr._to_attr(attr)
         if not attr:
             return None
-
         assert isinstance(attr, ParamAttr)
         suffix = 'b' if is_bias else 'w'
         if attr.name is None:
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index a9b391fd53a98dc05ee2d909a38dcf82cd5880ea..94fd9f3ea5a41a542da0115a66a52a5cd7f26748 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -563,22 +563,26 @@ def _py_reader(capacity,
 
     def start_provide_thread(func):
         def __provider_thread__():
-            for tensors in func():
-                array = core.LoDTensorArray()
-                for item in tensors:
-                    if not isinstance(item, core.LoDTensor):
-                        tmp = core.LoDTensor()
-                        tmp.set(item, core.CPUPlace())
-                        item = tmp
-
-                    array.append(item)
-
-                if reader.exited:
-                    break
-                feed_queue.push(array)
-                if reader.exited:
-                    break
-            feed_queue.close()
+            try:
+                for tensors in func():
+                    array = core.LoDTensorArray()
+                    for item in tensors:
+                        if not isinstance(item, core.LoDTensor):
+                            tmp = core.LoDTensor()
+                            tmp.set(item, core.CPUPlace())
+                            item = tmp
+
+                        array.append(item)
+
+                    if reader.exited:
+                        break
+                    feed_queue.push(array)
+                    if reader.exited:
+                        break
+                feed_queue.close()
+            except Exception as ex:
+                feed_queue.close()
+                raise ex
 
         reader.thread = threading.Thread(target=__provider_thread__)
         reader.thread.daemon = True
@@ -628,6 +632,9 @@ def _py_reader(capacity,
     reader.reset = __reset__
     reader.decorate_tensor_provider = __set_tensor_provider__
     reader.decorate_paddle_reader = __set_paddle_reader__
+
+    reader.decorate_batch_generator = __set_tensor_provider__
+    reader.decorate_sample_list_generator = __set_paddle_reader__
     reader.start = __start__
 
     return reader
@@ -692,6 +699,11 @@ def py_reader(capacity,
         >>>             exe.run(fetch_list=[loss.name])
         >>>     except fluid.core.EOFException:
         >>>         reader.reset()
+        >>>
+        >>> ...
+        >>>
+        >>> fluid.io.save_inference_model(dirname='./model', feeded_var_names=[img, label],
+        >>>     target_vars=[loss], executor=fluid.Executor(fluid.CUDAPlace(0)))  
 
         2. When training and testing are both performed, two different
         :code:`py_reader` should be created with different names, e.g.:
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index a458cebfb194a068d040a8919fd4abcb4b4bea80..734383655cf6a85015750ab432c0f6697dd6a9b8 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -174,6 +174,8 @@ def monkey_patch_variable():
         ("__rtruediv__", "elementwise_div", True),
         ("__pow__", "elementwise_pow", False),
         ("__rpow__", "elementwise_pow", True),
+        ("__floordiv__", "elementwise_floordiv", False),
+        ("__mod__", "elementwise_mod", False),
             # for logical compare
         ("__eq__", "equal", False),
         ("__ne__", "not_equal", False),
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 31e12f9b27feb134a027d8e150348c1d1da058b3..713aedb81e412b63ed32f162c649d1a1617feb96 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -189,6 +189,7 @@ __all__ = [
     'huber_loss',
     'tree_conv',
     'npair_loss',
+    'fsp_matrix',
 ]
 
 kIgnoreIndex = -100
@@ -6205,7 +6206,8 @@ def one_hot(input, depth):
         type="one_hot",
         inputs={'X': input},
         attrs={'depth': depth},
-        outputs={'Out': one_hot_out})
+        outputs={'Out': one_hot_out},
+        stop_gradient=True)
     return one_hot_out
 
 
@@ -9231,9 +9233,24 @@ def elementwise_pow(x, y, axis=-1, act=None, name=None):
     return _elementwise_op(LayerHelper('elementwise_pow', **locals()))
 
 
+def elementwise_mod(x, y, axis=-1, act=None, name=None):
+    return _elementwise_op(LayerHelper('elementwise_mod', **locals()))
+
+
+def elementwise_floordiv(x, y, axis=-1, act=None, name=None):
+    return _elementwise_op(LayerHelper('elementwise_floordiv', **locals()))
+
+
 for func in [
-        elementwise_add, elementwise_div, elementwise_sub, elementwise_mul,
-        elementwise_max, elementwise_min, elementwise_pow
+        elementwise_add,
+        elementwise_div,
+        elementwise_sub,
+        elementwise_mul,
+        elementwise_max,
+        elementwise_min,
+        elementwise_pow,
+        elementwise_mod,
+        elementwise_floordiv,
 ]:
     op_proto = OpProtoHolder.instance().get_op_proto(func.__name__)
     func.__doc__ = _generate_doc_string_(
@@ -9753,7 +9770,7 @@ def affine_channel(x,
                 'Bias': bias},
         attrs={"data_layout": data_layout},
         outputs={"Out": out})
-    return helper.append_activation(pre_activation)
+    return helper.append_activation(out)
 
 
 def similarity_focus(input, axis, indexes, name=None):
@@ -10775,3 +10792,46 @@ def npair_loss(anchor, positive, labels, l2_reg=0.002):
     celoss = reduce_mean(cross_entropy)
 
     return l2loss + celoss
+
+
+def fsp_matrix(x, y):
+    """
+
+    **FSP matrix op**
+
+    This op is used to calculate the flow of solution procedure (FSP) matrix of two feature maps.
+    Given feature map x with shape [x_channel, h, w] and feature map y with shape
+    [y_channel, h, w], we can get the fsp matrix of x and y in two steps:
+
+    1. reshape x into matrix with shape [x_channel, h * w] and reshape and
+       transpose y into matrix with shape [h * w, y_channel].
+    2. multiply x and y to get fsp matrix with shape [x_channel, y_channel].
+
+    The output is a batch of fsp matrices.
+
+    Args:
+
+        x (Variable): A feature map with shape [batch_size, x_channel, height, width].
+        y (Variable): A feature map with shape [batch_size, y_channel, height, width].
+                      The y_channel can be different with the x_channel of Input(X)
+                      while the other dimensions must be the same with Input(X)'s.
+
+    Returns:
+
+        fsp matrix (Variable): The output of FSP op with shape [batch_size, x_channel, y_channel].
+        The x_channel is the channel of x and the y_channel is the channel of y.
+
+    Examples:
+
+        .. code-block:: python
+
+            feature_map_0 = fluid.layers.conv2d(x)
+            feature_map_1 = fluid.layers.conv2d(feature_map_0)
+            loss = fluid.layers.fsp_matrix(feature_map_0, feature_map_1)
+
+    """
+    helper = LayerHelper('fsp_matrix', **locals())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype(
+        input_param_name='x'))
+    helper.append_op(type='fsp', inputs={'X': x, 'Y': y}, outputs={'Out': out})
+    return out
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index cb973986988c2909f5ef1e15dd32db3e83b1d269..a18e5b6a9c3fe69ee0bcadc150f07b72227df85e 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -25,10 +25,26 @@ from .layer_function_generator import templatedoc
 import numpy
 
 __all__ = [
-    'create_tensor', 'create_parameter', 'create_global_var', 'cast',
-    'tensor_array_to_tensor', 'concat', 'sums', 'assign',
-    'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
-    'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite'
+    'create_tensor',
+    'create_parameter',
+    'create_global_var',
+    'cast',
+    'tensor_array_to_tensor',
+    'concat',
+    'sums',
+    'assign',
+    'fill_constant_batch_size_like',
+    'fill_constant',
+    'argmin',
+    'argmax',
+    'argsort',
+    'ones',
+    'zeros',
+    'reverse',
+    'has_inf',
+    'has_nan',
+    'isfinite',
+    'range',
 ]
 
 
@@ -764,3 +780,50 @@ def isfinite(x):
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
     helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out})
     return out
+
+
+def range(start, end, step, dtype):
+    """
+    Return evenly spaced values within a given interval.
+
+    Values are generated within the half-open interval [start, stop) (in other words,
+    the interval including start but excluding stop).
+
+    args:
+        start(int|float|Variable): Start of interval. The interval includes this value.
+        end(int|float|Variable): End of interval. The interval does not include this
+                                 value, except in some cases where step is not an integer
+                                 and floating point round-off affects the length of out. 
+        step(int|float|Variable): Spacing between values. For any output out, this is the
+                                  distance between two adjacent values, out[i+1] - out[i].
+                                  The default step size is 1.
+        dtype(string): 'float32'|'int32'|..., the data type of the output tensor.
+
+    returns:
+        Evenly spaced values within a given interval.
+
+    examples:
+
+        .. code-block:: python
+
+             data = fluid.layers.range(0, 10, 2, 'int32')
+
+    """
+    helper = LayerHelper("range", **locals())
+
+    if not isinstance(start, Variable):
+        start = fill_constant([1], dtype, start)
+    if not isinstance(end, Variable):
+        end = fill_constant([1], dtype, end)
+    if not isinstance(step, Variable):
+        step = fill_constant([1], dtype, step)
+
+    out = helper.create_variable_for_type_inference(dtype=start.dtype)
+
+    helper.append_op(
+        type='range',
+        inputs={'Start': start,
+                'End': end,
+                'Step': step},
+        outputs={'Out': [out]})
+    return out
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index d501d02bd41349d57bdd9362bad44056075fb315..505d9572a64e8c6f096764c4947a1fa554527e65 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -70,6 +70,10 @@ class Optimizer(object):
         # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
         self._accumulators = defaultdict(lambda: dict())
         self.helper = None
+        self._opti_name_list = []
+
+    def get_opti_var_name_list(self):
+        return self._opti_name_list
 
     def _create_global_learning_rate(self):
         lr = self._global_learning_rate()
@@ -166,8 +170,13 @@ class Optimizer(object):
         if shape == None:
             shape = param.shape
         assert isinstance(self.helper, LayerHelper)
+
+        var_name = param.name + "_" + name
+        var_name = unique_name.generate(var_name)
+        self._opti_name_list.append(var_name)
+
         var = self.helper.create_global_variable(
-            name=unique_name.generate(name),
+            name=var_name,
             persistable=True,
             dtype=dtype or param.dtype,
             type=param.type,
@@ -388,13 +397,14 @@ class Optimizer(object):
             for param in parameters:
                 if not param.trainable:
                     continue
-                # create gradient variable
-                grad_var = Variable(
-                    block=loss.block,
-                    name=param._ivar._grad_name(),
-                    stop_gradient=True,
-                    ivar=param._ivar._grad_ivar())
-                params_grads.append((param, grad_var))
+                if param._ivar._grad_ivar() is not None:
+                    # create gradient variable
+                    grad_var = Variable(
+                        block=loss.block,
+                        name=param._ivar._grad_name(),
+                        stop_gradient=True,
+                        ivar=param._ivar._grad_ivar())
+                    params_grads.append((param, grad_var))
             with program_guard(framework.default_main_program(),
                                framework.default_startup_program()):
                 optimize_ops = self._create_optimization_pass(params_grads)
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 517418da1cf2f745ee5578e3c2b118394db7fae7..6702fc808b121d80fe555412e2cc7f673d6d8389 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -99,7 +99,8 @@ class ParallelExecutor(object):
         build_strategy.num_trainers = num_trainers
         build_strategy.trainer_id = trainer_id
 
-        self._places = compiler.get_available_places(use_cuda)
+        self._places = framework.cuda_places(
+        ) if use_cuda else framework.cpu_places()
         self._scope = scope if scope is not None else executor.global_scope()
 
         main_program = main_program if main_program is not None \
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..74ee2828deb6ecd51ff36b878e97254a62ad1cb6
--- /dev/null
+++ b/python/paddle/fluid/reader.py
@@ -0,0 +1,373 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import core
+import six
+import threading
+from .framework import Program, Variable, program_guard, default_main_program, default_startup_program
+from .executor import global_scope
+from .data_feeder import DataFeeder, BatchedTensorProvider
+from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer
+from .unique_name import UniqueNameGenerator
+
+__all__ = ['PyReader']
+
+
+def _convert_places(places):
+    if not isinstance(places, (list, tuple)):
+        places = [places]
+
+    ret = []
+    for p in places:
+        if not isinstance(p, core.Place):
+            tmp = core.Place()
+            tmp.set_place(p)
+            p = tmp
+
+        ret.append(p)
+    return ret
+
+
+class PyReader(object):
+    """
+    Create a reader object for data feeding in Python. 
+    Data would be prefetched using Python thread and be pushed
+    into a queue asynchronously. Data in the queue would be extracted 
+    automatically when `Executor.run(...)` is called.
+
+    Args:  
+        feed_list (list(Variable)|tuple(Variable)): feed variable list.
+            The variables should be created by :code:`fluid.layers.data()`. 
+        capacity (int): capacity of the queue maintained in PyReader object. 
+        use_double_buffer (bool): whether to use double_buffer_reader to 
+            speed up data feeding. 
+        iterable (bool): whether the created reader object is iterable.   
+
+    Returns:
+        reader (Reader): the created reader object.
+
+    Examples:
+        1. If iterable = False, the created PyReader object is almost the
+           same as :code:`fluid.layers.py_reader()`. Operators would be 
+           inserted into the program. User should call :code:`start()` 
+           before each epoch and catch :code:`fluid.core.EOFException`
+           thrown by :code:`Executor.run()` when epoch ends. Once the 
+           exception is caught, user should call :code:`reset()` to reset 
+           the reader manually.
+
+        .. code-block:: python
+            
+            image = fluid.layers.data(
+                        name='image', shape=[784], dtype='float32')
+            label = fluid.layers.data(
+                        name='label', shape=[1], dtype='int64')
+            
+            reader = fluid.io.PyReader(feed_list=[image, label], 
+                        capacity=4, iterable=False)
+            reader.decorate_sample_list_generator(user_defined_reader)
+            ... # definition of network is omitted
+            executor.run(fluid.default_main_program())
+            for _ in range(EPOCH_NUM):
+                reader.start()
+                while True:
+                    try:
+                        executor.run(feed=None, ...)
+                    except fluid.core.EOFException:
+                        reader.reset()
+                        break
+                    
+        2. If iterable=True, the created PyReader object is decoupled with
+           the program. No operator would be inserted into the program. 
+           In this case, the created reader is a Python generator, which 
+           is iterable. User should feed the data yielded from PyReader 
+           object into :code:`Executor.run(feed=...)`.  
+
+        .. code-block:: python
+
+            image = fluid.layers.data(
+                        name='image', shape=[784], dtype='float32')
+            label = fluid.layers.data(
+                        name='label', shape=[1], dtype='int64')
+
+            reader = fluid.io.PyReader(feed_list=[image, label], 
+                        capacity=4, iterable=True)
+            reader.decorate_sample_list_generator(user_defined_reader, 
+                        places=fluid.cuda_places())
+            ... # definition of network is omitted
+            executor.run(fluid.default_main_program())
+            for _ in range(EPOCH_NUM):
+                for data in reader():
+                    executor.run(feed=data, ...)
+    """
+
+    unique_name_generator = UniqueNameGenerator()
+
+    def __init__(self,
+                 feed_list,
+                 capacity,
+                 use_double_buffer=True,
+                 iterable=False):
+        self._tensor_reader = None
+        self._thread = None
+        self._iterable = iterable
+        self._use_double_buffer = use_double_buffer
+        self._capacity = capacity
+        self._feed_list = feed_list
+        if not self._iterable:
+            self._init_non_iterable()
+
+    def _init_iterable(self, places):
+        self._var_names = [v.name for v in self._feed_list]
+        self._places = _convert_places(places)
+        self._queue = core.init_lod_tensor_blocking_queue(core.Variable(),
+                                                          self._capacity)
+        self._reader = core.create_py_reader(
+            self.queue, self._var_names, self._places, self._use_double_buffer)
+
+    def _init_non_iterable(self):
+        lod_levels = []
+        dtypes = []
+        shape_concat = []
+        ranks = []
+        shapes = []
+
+        for feed_data in self._feed_list:
+            dtypes.append(feed_data.dtype)
+            shape_concat.extend(feed_data.shape)
+            ranks.append(len(feed_data.shape))
+            shapes.append(feed_data.shape)
+            lod_levels.append(feed_data.lod_level)
+
+        queue_name = PyReader.unique_name_generator('lod_tensor_blocking_queue')
+        reader_name = PyReader.unique_name_generator('create_py_reader')
+        double_buffer_name = PyReader.unique_name_generator('double_buffer')
+
+        var = global_scope().var(queue_name)
+        self._queue = core.init_lod_tensor_blocking_queue(var, self._capacity)
+
+        startup_blk = default_startup_program().current_block()
+        startup_var = startup_blk.create_var(name=reader_name)
+
+        startup_blk.append_op(
+            type='create_py_reader',
+            inputs={'blocking_queue': [queue_name]},
+            outputs={'Out': [startup_var]},
+            attrs={
+                'shape_concat': shape_concat,
+                'lod_levels': lod_levels,
+                'ranks': ranks
+            })
+
+        startup_var.desc.set_dtypes(dtypes)
+        startup_var.persistable = True
+
+        main_prog_var = _copy_reader_var_(
+            default_main_program().current_block(), startup_var)
+
+        main_prog_var.stop_gradient = True
+        main_prog_var.persistable = True
+
+        reader = monkey_patch_reader_methods(main_prog_var)
+        if self._use_double_buffer:
+            double_buffer_reader = double_buffer(
+                reader, name=double_buffer_name)
+            # we return a double buffer reader. However, the reset method comes from
+            # py_reader.
+            double_buffer_reader.reset = reader.reset
+            reader = double_buffer_reader
+
+        self._reader = reader
+
+        default_main_program().current_block().append_op(
+            type='read',
+            inputs={'Reader': [self._reader]},
+            outputs={'Out': self._feed_list})
+
+    @property
+    def queue(self):
+        return self._queue
+
+    @property
+    def iterable(self):
+        return self._iterable
+
+    def __call__(self):
+        assert self.iterable, "PyReader is not iterable"
+        assert self._tensor_reader is not None, \
+            "Data source of PyReader has not set yet"
+
+        class Iterator(object):
+            def __init__(self, reader):
+                self._reader = reader._reader
+                self._reset = reader._reset
+
+            def __iter__(self):
+                return self
+
+            def __next__(self):
+                return self.next()
+
+            def next(self):
+                ret = self._reader.read_next()
+                if ret:
+                    return ret
+                else:
+                    self._reset()
+                    raise StopIteration
+
+        self._start()
+        return Iterator(self)
+
+    def _reset(self):
+        self._reader.reset()
+        self._thread.join()
+
+    def start(self):
+        '''
+        Start the data feeding thread. 
+        Can only call when the reader object is not iterable.  
+        '''
+        assert not self._iterable, "start() cannot be called when PyReader is iterable"
+        self._start()
+
+    def reset(self):
+        '''
+        Reset the reader object when :code:`fluid.core.EOFException` raises. 
+        Can only call when the reader object is not iterable.
+        '''
+        assert not self._iterable, "reset() cannot be called when PyReader is iterable"
+        self._reset()
+
+    def _start(self):
+        def __thread_main__():
+            try:
+                for tensors in self._tensor_reader():
+                    array = core.LoDTensorArray()
+                    for item in tensors:
+                        if not isinstance(item, core.LoDTensor):
+                            tmp = core.LoDTensor()
+                            tmp.set(item, core.CPUPlace())
+                            item = tmp
+
+                        array.append(item)
+
+                    if not self._queue.push(array):
+                        break
+
+                self._queue.close()
+            except Exception as ex:
+                self._queue.close()
+                raise ex
+
+        self._thread = threading.Thread(target=__thread_main__)
+        self._thread.daemon = True
+        self._thread.start()
+
+    def decorate_sample_generator(self,
+                                  sample_generator,
+                                  batch_size,
+                                  drop_last=True,
+                                  places=None):
+        '''
+        Set the data source of the PyReader object.
+        
+        The provided :code:`sample_generator` should be a Python generator,
+        which yields numpy.ndarray typed data of each sample.
+
+        :code:`places` must be set when the PyReader object is iterable.
+
+        If all inputs have no lods, this method is faster than 
+        :code:`decorate_sample_list_generator(paddle.batch(sample_generator, ...))` .
+
+        Args:
+            sample_generator (generator): Python generator that yields
+                numpy.ndarray-typed sample data.
+            batch_size (int): batch size. Must be larger than 0.
+            drop_last (bool): Whether to drop the last batch when sample number
+                is less than batch_size. 
+            places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
+                be provided when PyReader is iterable.
+        '''
+        assert batch_size > 0, "batch_size must be larger than 0"
+        has_lod = False
+        for f in self._feed_list:
+            if f.lod_level != 0:
+                has_lod = True
+                break
+
+        if has_lod:
+            self.decorate_sample_list_generator(
+                paddle.batch(
+                    sample_generator,
+                    batch_size=batch_size,
+                    drop_last=drop_last),
+                places=places)
+        else:
+            reader = BatchedTensorProvider(
+                feed_list=self._feed_list,
+                place=core.CPUPlace(),
+                batch_size=batch_size,
+                generator=sample_generator,
+                drop_last=drop_last)
+            self.decorate_batch_generator(reader, places=places)
+
+    def decorate_sample_list_generator(self, reader, places=None):
+        '''
+        Set the data source of the PyReader object. 
+
+        The provided :code:`reader` should be a Python generator,
+        which yields list(numpy.ndarray) typed batched data. 
+        
+        :code:`places` must be set when the PyReader object is iterable.
+
+        Args:
+            reader (generator): Python generator that yields 
+                list(numpy.ndarray)-typed batched data. 
+            places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
+                be provided when PyReader is iterable.
+        '''
+        assert self._tensor_reader is None, \
+            "Cannot reset the data source of PyReader"
+        with program_guard(Program(), Program()):
+            feeder = DataFeeder(
+                feed_list=self._feed_list, place=core.CPUPlace())
+            paddle_reader = feeder.decorate_reader(reader, multi_devices=False)
+
+        def __tensor_reader_impl__():
+            for slots in paddle_reader():
+                yield [slots[var.name] for var in self._feed_list]
+
+        self.decorate_batch_generator(__tensor_reader_impl__, places)
+
+    def decorate_batch_generator(self, reader, places=None):
+        '''
+        Set the data source of the PyReader object.
+
+        The provided :code:`reader` should be a Python generator,
+        which yields numpy.ndarray-typed or LoDTensor-typed batched data.
+
+        :code:`places` must be set when the PyReader object is iterable.
+
+        Args:
+            reader (generator): Python generator that yields LoDTensor-typed
+                batched data.
+            places (None|list(CUDAPlace)|list(CPUPlace)): place list. Must
+                be provided when PyReader is iterable.
+        '''
+        assert self._tensor_reader is None, \
+            "Cannot reset the data source of PyReader"
+        self._tensor_reader = reader
+        if self._iterable:
+            assert places is not None, "Places cannot be None when py_reader is iterable"
+            self._init_iterable(places)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index b61ef706ba2460a12c4fe659984917b627e20906..cefa2b491970c380faeabe43c0cce54c36069eb9 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -105,7 +105,7 @@ if(WITH_DISTRIBUTE)
         # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
         set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE)
     endif(NOT APPLE)
-    py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
+    # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
@@ -118,8 +118,8 @@ if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
 endif()
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # change the timeout from 600 to 1200, because in debug mode, this test need more time.
-    set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 1200)
+    # change the timeout from 600 to 2200, because in debug mode, this test need more time.
+    set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 2200)
 endif()
 
 if (WITH_NGRAPH)
diff --git a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
index 95e39d891f7e6a3dcb57540bd96fe70027443cda..48a4768782c1b4aa8ff6cfdbda9c8e8eb717d08f 100644
--- a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
@@ -20,7 +20,7 @@ logging.basicConfig()
 logger = logging.getLogger("paddle")
 logger.setLevel(logging.INFO)
 
-DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz"
+DATA_URL = "http://paddle-ctr-data.bj.bcebos.com/avazu_ctr_data.tgz"
 DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e"
 """
 avazu_ctr_data/train.txt
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b6556746cd91676d153d862126dd48661fa281d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py
@@ -0,0 +1,124 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from paddle.fluid.tests.unittests.op_test import OpTest
+
+
+class TestConcatOp(OpTest):
+    def setUp(self):
+        self.op_type = "concat"
+        self.use_mkldnn = True
+        self._cpu_only = True
+        self.init_axis()
+        self.init_shape()
+        self.init_test_data()
+        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
+        self.attrs = {'axis': self.axis, 'use_mkldnn': True}
+
+        self.output = np.concatenate(
+            (self.x0, self.x1, self.x2), axis=self.axis).astype('int')
+
+        self.outputs = {'Out': self.output}
+
+    def test_check_output(self):
+        self.check_output()
+
+#--------------------test concat s8 in with axis 0--------------------
+
+    def init_test_data(self):
+        self.x0 = (np.random.randint(0, 100, self.x0_shape) - 50).astype('int8')
+        self.x1 = (np.random.randint(0, 80, self.x1_shape) - 30).astype('int8')
+        self.x2 = (np.random.randint(0, 110, self.x2_shape) - 80).astype('int8')
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_shape(self):
+        self.x0_shape = [2, 2, 1, 2]
+        self.x1_shape = [1, 2, 1, 2]
+        self.x2_shape = [3, 2, 1, 2]
+
+
+#--------------------test concat u8 in with axis 0--------------------
+
+
+class TestConcatOp2(TestConcatOp):
+    def init_test_data(self):
+        self.x0 = (np.random.randint(0, 100, self.x0_shape)).astype('uint8')
+        self.x1 = (np.random.randint(0, 50, self.x1_shape)).astype('uint8')
+        self.x2 = (np.random.randint(0, 80, self.x2_shape)).astype('uint8')
+
+    def init_axis(self):
+        self.axis = 0
+
+    def init_shape(self):
+        self.x0_shape = [2, 1, 5, 5]
+        self.x1_shape = [1, 1, 5, 5]
+        self.x2_shape = [3, 1, 5, 5]
+
+
+def create_test_int8_class(parent):
+
+    #--------------------test concat s8/u8 in with axis 1--------------------
+
+    class TestAxis1Case(parent):
+        def init_axis(self):
+            self.axis = 1
+
+        def init_shape(self):
+            self.x0_shape = [1, 1, 5, 5]
+            self.x1_shape = [1, 2, 5, 5]
+            self.x2_shape = [1, 3, 5, 5]
+
+#--------------------test concat s8/u8 in with axis 2--------------------
+
+    class TestAxis2Case(parent):
+        def init_axis(self):
+            self.axis = 2
+
+        def init_shape(self):
+            self.x0_shape = [2, 3, 4, 5]
+            self.x1_shape = [2, 3, 5, 5]
+            self.x2_shape = [2, 3, 6, 5]
+
+#--------------------test concat s8/u8 in with axis 3--------------------
+
+    class TestAxis3Case(parent):
+        def init_axis(self):
+            self.axis = 3
+
+        def init_shape(self):
+            self.x0_shape = [2, 3, 5, 5]
+            self.x1_shape = [2, 3, 5, 6]
+            self.x2_shape = [2, 3, 5, 7]
+
+    cls_name_1 = "{0}_axis_{1}".format(parent.__name__, "1")
+    cls_name_2 = "{0}_axis_{1}".format(parent.__name__, "2")
+    cls_name_3 = "{0}_axis_{1}".format(parent.__name__, "3")
+    TestAxis1Case.__name__ = cls_name_1
+    TestAxis2Case.__name__ = cls_name_2
+    TestAxis3Case.__name__ = cls_name_3
+    globals()[cls_name_1] = TestAxis1Case
+    globals()[cls_name_2] = TestAxis2Case
+    globals()[cls_name_3] = TestAxis3Case
+
+create_test_int8_class(TestConcatOp)
+create_test_int8_class(TestConcatOp2)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..377014510b55633f697ef7bf2f5f597281e5f5a5
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import numpy as np
+import time
+import six
+import unittest
+
+EPOCH_NUM = 60
+BATCH_SIZE = 32
+CLASS_NUM = 10
+
+
+def random_reader():
+    np.random.seed(1)
+    for i in range(BATCH_SIZE * 40):
+        image = np.random.random([784])
+        label = np.random.random_integers(low=0, high=CLASS_NUM - 1)
+        yield image, label
+
+
+def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
+    startup_prog = fluid.Program()
+    main_prog = fluid.Program()
+    startup_prog.random_seed = 1
+    main_prog.random_seed = 1
+
+    with fluid.unique_name.guard():
+        with fluid.program_guard(main_prog, startup_prog):
+            image = fluid.layers.data(
+                name='image', shape=[784], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            py_reader = fluid.io.PyReader(
+                feed_list=[image, label],
+                capacity=4,
+                iterable=not use_legacy_py_reader,
+                use_double_buffer=use_double_buffer)
+            hidden = image
+            for hidden_size in [10, 20, 30]:
+                hidden = fluid.layers.fc(
+                    hidden,
+                    size=hidden_size,
+                    act='tanh',
+                    bias_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(value=1.0)))
+
+            predict_label = fluid.layers.fc(hidden,
+                                            size=CLASS_NUM,
+                                            act='softmax')
+            loss = fluid.layers.mean(
+                fluid.layers.cross_entropy(
+                    input=predict_label, label=label))
+
+            optimizer = fluid.optimizer.Adam()
+            optimizer.minimize(loss)
+    return startup_prog, main_prog, py_reader, loss
+
+
+class TestBase(unittest.TestCase):
+    def run_main(self, use_legacy_py_reader, with_data_parallel, places,
+                 use_double_buffer):
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            startup_prog, main_prog, py_reader, loss = simple_fc_net(
+                places, use_legacy_py_reader, use_double_buffer)
+
+            reader = paddle.batch(random_reader, batch_size=BATCH_SIZE)
+
+            ps = places if use_double_buffer else fluid.cpu_places(len(places))
+
+            py_reader.decorate_sample_list_generator(
+                reader, places=ps if py_reader.iterable else None)
+
+            exe = fluid.Executor(place=places[0])
+            exe.run(startup_prog)
+
+            prog = fluid.CompiledProgram(main_prog)
+            if with_data_parallel:
+                prog = prog.with_data_parallel(
+                    loss_name=loss.name, places=places)
+
+            step = 0
+            step_list = []
+            loss_list = []
+            start_t = time.time()
+            if not py_reader.iterable:
+                for _ in six.moves.range(EPOCH_NUM):
+                    step = 0
+                    py_reader.start()
+                    while True:
+                        try:
+                            L, = exe.run(program=prog,
+                                         fetch_list=[loss],
+                                         use_program_cache=True)
+                            loss_list.append(np.mean(L))
+                            step += 1
+                        except fluid.core.EOFException:
+                            py_reader.reset()
+                            break
+                    step_list.append(step)
+            else:
+                for _ in six.moves.range(EPOCH_NUM):
+                    step = 0
+                    for d in py_reader():
+                        assert len(d) == len(places)
+                        for i, item in enumerate(d):
+                            image = item['image']
+                            label = item['label']
+                            assert image.shape() == [BATCH_SIZE, 784]
+                            assert label.shape() == [BATCH_SIZE, 1]
+                            assert image._place()._equals(ps[i])
+                            assert label._place()._equals(ps[i])
+                        L, = exe.run(program=prog,
+                                     feed=d,
+                                     fetch_list=[loss],
+                                     use_program_cache=True)
+                        loss_list.append(np.mean(L))
+                        step += 1
+                    step_list.append(step)
+            end_t = time.time()
+            ret = {
+                "time": end_t - start_t,
+                "step": step_list,
+                "loss": np.array(loss_list)
+            }
+            return ret
+
+    def prepare_places(self, with_data_parallel, with_cpu=True, with_gpu=True):
+        places = []
+        if with_cpu:
+            places.append([fluid.CPUPlace()])
+            if with_data_parallel:
+                places.append([fluid.CPUPlace()] * 2)
+
+        if with_gpu and fluid.core.is_compiled_with_cuda():
+            tmp = fluid.cuda_places()
+            assert len(tmp) > 0, "no gpu detected"
+            if with_data_parallel:
+                places.append(tmp)
+            places.append([tmp[0]])
+        return places
+
+    def test_main(self):
+        for with_data_parallel in [True, False]:
+            for p in self.prepare_places(with_data_parallel):
+                for use_double_buffer in [False, True]:
+                    results = []
+                    for use_legacy_py_reader in [False, True]:
+                        ret = self.run_main(
+                            use_legacy_py_reader=use_legacy_py_reader,
+                            with_data_parallel=with_data_parallel,
+                            places=p,
+                            use_double_buffer=use_double_buffer)
+                        results.append(ret)
+                    if not use_double_buffer:
+                        diff = np.max(
+                            np.abs(results[0]['loss'] - results[1]['loss']))
+                        self.assertLess(diff, 1e-3)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
index 25dcccc28d710695d4c5e08c17816669d0fae5d8..3307caa8b2d62d5a31a7eeb36bb207b31d749b55 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -21,7 +21,7 @@ from test_dist_base import TestDistBase
 
 
 def download_files():
-    url_prefix = 'http://paddle-unittest-data.cdn.bcebos.com/dist_transformer/'
+    url_prefix = 'http://paddle-unittest-data.bj.bcebos.com/dist_transformer/'
     vocab_url = url_prefix + 'vocab.bpe.32000'
     vocab_md5 = 'a86d345ca6e27f6591d0dccb1b9be853'
     paddle.dataset.common.download(vocab_url, 'test_dist_transformer',
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..104e896b6e440f5657a90e0ce741b49f72ba75c6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py
@@ -0,0 +1,69 @@
+#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+import random
+
+
+class TestElementwiseModOp(OpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
+    def setUp(self):
+        self.op_type = "elementwise_floordiv"
+        self.dtype = np.int32
+        self.axis = -1
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': self.out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
+
+    def init_dtype(self):
+        pass
+
+    def init_axis(self):
+        pass
+
+
+class TestElementwiseModOp_scalar(TestElementwiseModOp):
+    def init_input_output(self):
+        scale_x = random.randint(0, 100000000)
+        scale_y = random.randint(1, 100000000)
+        self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype)
+        self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype)
+        self.out = np.floor_divide(self.x, self.y)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..a354ba0177ae70ba4f3a1565360f96a55edd33b6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py
@@ -0,0 +1,69 @@
+#  Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from op_test import OpTest
+
+import random
+
+
+class TestElementwiseModOp(OpTest):
+    def init_kernel_type(self):
+        self.use_mkldnn = False
+
+    def setUp(self):
+        self.op_type = "elementwise_mod"
+        self.dtype = np.int32
+        self.axis = -1
+        self.init_dtype()
+        self.init_input_output()
+        self.init_kernel_type()
+        self.init_axis()
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
+            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
+        }
+        self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn}
+        self.outputs = {'Out': self.out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_input_output(self):
+        self.x = np.random.uniform(0, 10000, [10, 10]).astype(self.dtype)
+        self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype)
+        self.out = np.mod(self.x, self.y)
+
+    def init_dtype(self):
+        pass
+
+    def init_axis(self):
+        pass
+
+
+class TestElementwiseModOp_scalar(TestElementwiseModOp):
+    def init_input_output(self):
+        scale_x = random.randint(0, 100000000)
+        scale_y = random.randint(1, 100000000)
+        self.x = (np.random.rand(2, 3, 4) * scale_x).astype(self.dtype)
+        self.y = (np.random.rand(1) * scale_y + 1).astype(self.dtype)
+        self.out = np.mod(self.x, self.y)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
index 32cb23cbfa9bdef4728e85d0014123652e4aefea..0812b02b47db7fa2d43e1d3bbd0a3f7b59911326 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
@@ -31,15 +31,27 @@ def dequantize_max_abs(x, scale, max_range):
     return y
 
 
-def channel_wise_quantize_max_abs(x, quant_bit=8):
+def channel_wise_quantize_max_abs(x, quant_bit=8, use_second_dim=False):
     scales = []
-    for i in range(x.shape[0]):
-        scales.append(np.max(np.abs(x[i])).astype("float32"))
-
-    y = x.copy()
-    max_range = math.pow(2, quant_bit - 1) - 1
-    for i, scale in enumerate(scales):
-        y[i] = np.round(y[i] / scale * max_range)
+    if not use_second_dim:
+        for i in range(x.shape[0]):
+            scales.append(np.max(np.abs(x[i])).astype("float32"))
+        y = x.copy()
+        max_range = math.pow(2, quant_bit - 1) - 1
+        for i, scale in enumerate(scales):
+            y[i] = np.round(x[i] / scale * max_range)
+    else:
+        for i in range(x.shape[0]):
+            s = []
+            for j in range(x.shape[1]):
+                s.append(np.max(np.abs(x[i][j])).astype("float32"))
+            scales.append(s)
+        scales = np.amax(np.array(scales), axis=0)
+        y = x.copy()
+        max_range = math.pow(2, quant_bit - 1) - 1
+        for i in range(x.shape[0]):
+            for j, scale in enumerate(scales):
+                y[i][j] = np.round(x[i][j] / scale * max_range)
     return y, scales
 
 
@@ -47,10 +59,16 @@ def channel_wise_dequantize_max_abs(x,
                                     scales,
                                     quant_bits,
                                     activation_scale=None):
-    y = x.copy()
-    for i in range(x.shape[0]):
-        y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * y[i]
-    if activation_scale is not None:
+    if activation_scale is None:
+        y = x.copy()
+        for i in range(x.shape[0]):
+            y[i] = (scales[i] / (math.pow(2, quant_bits[0] - 1) - 1)) * x[i]
+    else:
+        y = x.copy()
+        for i in range(x.shape[0]):
+            for j in range(x.shape[1]):
+                y[i][j] = (scales[j] /
+                           (math.pow(2, quant_bits[0] - 1) - 1)) * x[i][j]
         y *= activation_scale / (math.pow(2, quant_bits[1] - 1) - 1)
     return y
 
@@ -65,7 +83,8 @@ class TestFakeChannelWiseDequantizeMaxAbsOpTwoScales(OpTest):
         self.set_args()
         self.op_type = "fake_channel_wise_dequantize_max_abs"
         x = np.random.randn(4, 3, 64, 64).astype(self.data_type)
-        yq, scales = channel_wise_quantize_max_abs(x, self.quant_bits[0])
+        yq, scales = channel_wise_quantize_max_abs(
+            x, self.quant_bits[0], use_second_dim=True)
         ydq = channel_wise_dequantize_max_abs(yq, scales, self.quant_bits,
                                               self.activation_scale)
 
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index cf8f01edb9a6a2b6d91080248553491c54e7707b..07038b0441d0dc37a42cbf2058c1b5f41b47a5da 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -53,7 +53,7 @@ class TestFakeChannelWiseQuantizeOp(OpTest):
 
         self.outputs = {
             'Out': outputs,
-            'OutScales': np.array(scales).astype("float32"),
+            'OutScale': np.array(scales).astype("float32"),
         }
 
     def test_check_output(self):
diff --git a/python/paddle/fluid/tests/unittests/test_fsp_op.py b/python/paddle/fluid/tests/unittests/test_fsp_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ad7418447b4bac5e6a6034f94540091590fa189
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fsp_op.py
@@ -0,0 +1,60 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def fsp_matrix(a, b):
+    batch = a.shape[0]
+    a_channel = a.shape[1]
+    b_channel = b.shape[1]
+    h = a.shape[2]
+    w = a.shape[3]
+    a_t = a.transpose([0, 2, 3, 1])
+    a_t = a_t.reshape([batch, h * w, a_channel])
+    b_t = b.transpose([0, 2, 3, 1]).reshape([batch, h * w, b_channel])
+    a_r = a_t.repeat(
+        b_channel, axis=1).reshape(
+            [batch, h * w, b_channel, a_channel]).transpose([0, 1, 3, 2])
+    b_r = b_t.repeat(
+        a_channel, axis=1).reshape([batch, h * w, a_channel, b_channel])
+    return np.mean(a_r * b_r, axis=1)
+
+
+class TestFSPOp(OpTest):
+    def setUp(self):
+        self.op_type = "fsp"
+        self.initTestCase()
+
+        feature_map_0 = np.random.uniform(0, 10, self.a_shape).astype('float32')
+        feature_map_1 = np.random.uniform(0, 10, self.b_shape).astype('float32')
+
+        self.inputs = {'X': feature_map_0, 'Y': feature_map_1}
+        self.outputs = {'Out': fsp_matrix(feature_map_0, feature_map_1)}
+
+    def initTestCase(self):
+        self.a_shape = (2, 16, 32, 31)
+        self.b_shape = (2, 28, 32, 31)
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..62c25f734598e35b7c668d1ec1b89b5c57449f73
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC
+from paddle.fluid.imperative.base import to_variable
+
+
+class SimpleImgConvPool(fluid.imperative.Layer):
+    def __init__(self,
+                 name_scope,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 pool_size,
+                 pool_stride,
+                 pool_padding=0,
+                 pool_type='max',
+                 global_pooling=False,
+                 conv_stride=1,
+                 conv_padding=0,
+                 conv_dilation=1,
+                 conv_groups=1,
+                 act=None,
+                 use_cudnn=False,
+                 param_attr=None,
+                 bias_attr=None):
+        super(SimpleImgConvPool, self).__init__(name_scope)
+
+        self._conv2d = Conv2D(
+            self.full_name(),
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=conv_stride,
+            padding=conv_padding,
+            dilation=conv_dilation,
+            groups=conv_groups,
+            param_attr=None,
+            bias_attr=None,
+            use_cudnn=use_cudnn)
+
+        self._pool2d = Pool2D(
+            self.full_name(),
+            pool_size=pool_size,
+            pool_type=pool_type,
+            pool_stride=pool_stride,
+            pool_padding=pool_padding,
+            global_pooling=global_pooling,
+            use_cudnn=use_cudnn)
+
+    def forward(self, inputs):
+        x = self._conv2d(inputs)
+        x = self._pool2d(x)
+        return x
+
+
+class MNIST(fluid.imperative.Layer):
+    def __init__(self, name_scope):
+        super(MNIST, self).__init__(name_scope)
+
+        self._simple_img_conv_pool_1 = SimpleImgConvPool(
+            self.full_name(), 1, 20, 5, 2, 2, act="relu")
+
+        self._simple_img_conv_pool_2 = SimpleImgConvPool(
+            self.full_name(), 20, 50, 5, 2, 2, act="relu")
+
+        pool_2_shape = 50 * 4 * 4
+        SIZE = 10
+        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
+        self._fc = FC(self.full_name(),
+                      10,
+                      param_attr=fluid.param_attr.ParamAttr(
+                          initializer=fluid.initializer.NormalInitializer(
+                              loc=0.0, scale=scale)),
+                      act="softmax")
+
+    def forward(self, inputs):
+        x = self._simple_img_conv_pool_1(inputs)
+        x = self._simple_img_conv_pool_2(x)
+        x = self._fc(x)
+        return x
+
+
+class TestImperativeCheckpoint(unittest.TestCase):
+    def save_load_persistables(self):
+        seed = 90
+        epoch_num = 1
+
+        with fluid.imperative.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            mnist = MNIST("mnist")
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+
+            dy_param_init_value = {}
+
+            step = 0
+            for epoch in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    dy_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(128, 1)
+
+                    img = to_variable(dy_x_data)
+                    label = to_variable(y_data)
+                    label._stop_gradient = True
+
+                    cost = mnist(img)
+                    loss = fluid.layers.cross_entropy(cost, label)
+                    avg_loss = fluid.layers.mean(loss)
+
+                    dy_out = avg_loss._numpy()
+
+                    avg_loss._backward()
+                    sgd.minimize(avg_loss)
+                    fluid.imperative.save_persistables(mnist, "save_dir")
+                    mnist.clear_gradients()
+
+                    for param in mnist.parameters():
+                        dy_param_init_value[param.name] = param._numpy()
+
+                    mnist.load_dict(
+                        fluid.imperative.load_persistables(mnist, "save_dir"))
+
+                    restore = mnist.parameters()
+
+                    self.assertEqual(len(dy_param_init_value), len(restore))
+                    for value in restore:
+                        self.assertTrue(
+                            np.allclose(value, dy_param_init_value[value.name]))
+                        self.assertTrue(np.isfinite(value.all()))
+                        self.assertFalse(np.isnan(value.any()))
+
+                    step += 1
+
+                    if step > 20:
+                        break
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
new file mode 100644
index 0000000000000000000000000000000000000000..af80ca6ce77a4ec187dd52863c2fe2ba278d5023
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import random
+import sys
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from test_imperative_base import new_program_scope
+from paddle.fluid.imperative.base import to_variable
+
+NUM_USERS = 100
+NUM_ITEMS = 1000
+
+BATCH_SIZE = 32
+NUM_BATCHES = 2
+
+
+class MLP(fluid.imperative.Layer):
+    def __init__(self, name_scope):
+        super(MLP, self).__init__(name_scope)
+        self._user_latent = fluid.imperative.FC(self.full_name(), 256)
+        self._item_latent = fluid.imperative.FC(self.full_name(), 256)
+
+        self._user_layers = []
+        self._item_layers = []
+        self._hid_sizes = [128, 64]
+        for i in range(len(self._hid_sizes)):
+            self._user_layers.append(
+                self.add_sublayer(
+                    'user_layer_%d' % i,
+                    fluid.imperative.FC(
+                        self.full_name(), self._hid_sizes[i], act='relu')))
+            self._item_layers.append(
+                self.add_sublayer(
+                    'item_layer_%d' % i,
+                    fluid.imperative.FC(
+                        self.full_name(), self._hid_sizes[i], act='relu')))
+
+    def forward(self, users, items):
+        users = self._user_latent(users)
+        items = self._item_latent(items)
+
+        for ul, il in zip(self._user_layers, self._item_layers):
+            users = ul(users)
+            items = il(items)
+        return fluid.layers.elementwise_mul(users, items)
+
+
+class DMF(fluid.imperative.Layer):
+    def __init__(self, name_scope):
+        super(DMF, self).__init__(name_scope)
+        self._user_latent = fluid.imperative.FC(self.full_name(), 256)
+        self._item_latent = fluid.imperative.FC(self.full_name(), 256)
+        self._match_layers = []
+        self._hid_sizes = [128, 64]
+        for i in range(len(self._hid_sizes)):
+            self._match_layers.append(
+                self.add_sublayer(
+                    'match_layer_%d' % i,
+                    fluid.imperative.FC(
+                        self.full_name(), self._hid_sizes[i], act='relu')))
+        self._mat
+
+    def forward(self, users, items):
+        users = self._user_latent(users)
+        items = self._item_latent(items)
+        match_vec = fluid.layers.concat(
+            [users, items], axis=len(users.shape) - 1)
+        for l in self._match_layers:
+            match_vec = l(match_vec)
+        return match_vec
+
+
+class DeepCF(fluid.imperative.Layer):
+    def __init__(self, name_scope):
+        super(DeepCF, self).__init__(name_scope)
+
+        self._user_emb = fluid.imperative.Embedding(self.full_name(),
+                                                    [NUM_USERS, 256])
+        self._item_emb = fluid.imperative.Embedding(self.full_name(),
+                                                    [NUM_ITEMS, 256])
+
+        self._mlp = MLP(self.full_name())
+        self._dmf = DMF(self.full_name())
+        self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid')
+
+    def forward(self, users, items):
+        users_emb = self._user_emb(users)
+        items_emb = self._item_emb(items)
+
+        mlp_predictive = self._mlp(users_emb, items_emb)
+        dmf_predictive = self._dmf(users_emb, items_emb)
+        predictive = fluid.layers.concat(
+            [mlp_predictive, dmf_predictive],
+            axis=len(mlp_predictive.shape) - 1)
+        prediction = self._match_fc(predictive)
+        return prediction
+
+
+def get_data():
+    user_ids = []
+    item_ids = []
+    labels = []
+    for uid in range(NUM_USERS):
+        for iid in range(NUM_ITEMS):
+            # 10% positive
+            label = float(random.randint(1, 10) == 1)
+            user_ids.append(uid)
+            item_ids.append(iid)
+            labels.append(label)
+    indices = np.arange(NUM_USERS * NUM_ITEMS)
+    np.random.shuffle(indices)
+    users_np = np.array(user_ids, dtype=np.int64)[indices]
+    items_np = np.array(item_ids, dtype=np.int64)[indices]
+    labels_np = np.array(labels, dtype=np.float32)[indices]
+    return np.expand_dims(users_np, -1), \
+           np.expand_dims(items_np, -1), \
+           np.expand_dims(labels_np, -1)
+
+
+class TestImperativeDeepCF(unittest.TestCase):
+    def test_gan_float32(self):
+        seed = 90
+        users_np, items_np, labels_np = get_data()
+
+        startup = fluid.Program()
+        startup.random_seed = seed
+        main = fluid.Program()
+        main.random_seed = seed
+
+        scope = fluid.core.Scope()
+        with new_program_scope(main=main, startup=startup, scope=scope):
+            users = fluid.layers.data('users', [1], dtype='int64')
+            items = fluid.layers.data('items', [1], dtype='int64')
+            labels = fluid.layers.data('labels', [1], dtype='float32')
+
+            deepcf = DeepCF('deepcf')
+            prediction = deepcf(users, items)
+            loss = fluid.layers.reduce_sum(
+                fluid.layers.log_loss(prediction, labels))
+            adam = fluid.optimizer.AdamOptimizer(0.01)
+            adam.minimize(loss)
+
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+            exe.run(startup)
+            for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
+                static_loss = exe.run(
+                    main,
+                    feed={
+                        users.name: users_np[slice:slice + BATCH_SIZE],
+                        items.name: items_np[slice:slice + BATCH_SIZE],
+                        labels.name: labels_np[slice:slice + BATCH_SIZE]
+                    },
+                    fetch_list=[loss])[0]
+                sys.stderr.write('static loss %s\n' % static_loss)
+
+        with fluid.imperative.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            deepcf = DeepCF('deepcf')
+            for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
+                prediction = deepcf(
+                    to_variable(users_np[slice:slice + BATCH_SIZE]),
+                    to_variable(items_np[slice:slice + BATCH_SIZE]))
+                loss = fluid.layers.reduce_sum(
+                    fluid.layers.log_loss(prediction,
+                                          to_variable(labels_np[slice:slice +
+                                                                BATCH_SIZE])))
+                loss._backward()
+                adam = fluid.optimizer.AdamOptimizer(0.01)
+                adam.minimize(loss)
+                deepcf.clear_gradients()
+                dy_loss = loss._numpy()
+
+        self.assertEqual(static_loss, dy_loss)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index a80202d6dddacaa4cb6fa3efd3c3dfd5b0ab4400..6024fb5f816d10cedad36272e353704797526676 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer):
         return self._fc3(x)
 
 
-class TestImperativeMnist(unittest.TestCase):
+class TestImperativeGAN(unittest.TestCase):
     def test_gan_float32(self):
         seed = 90
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index 3b602303ae9a183c7b66f5613321f58898fdfcc2..460ba65a48c863315cda4847aee1b4e2366bba96 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -59,7 +59,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 dtype="float32",
                 default_initializer=fluid.initializer.UniformInitializer(
                     low=-self._init_scale, high=self._init_scale))
-            self.weight_1_arr.append(weight_1)
+            self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
             bias_1 = self.create_parameter(
                 attr=fluid.ParamAttr(
                     initializer=fluid.initializer.UniformInitializer(
@@ -67,7 +67,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 shape=[self._hidden_size * 4],
                 dtype="float32",
                 default_initializer=fluid.initializer.Constant(0.0))
-            self.bias_arr.append(bias_1)
+            self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))
 
     def forward(self, input_embedding, init_hidden=None, init_cell=None):
         self.cell_array = []
@@ -242,7 +242,7 @@ class TestImperativePtbRnn(unittest.TestCase):
             dy_loss = None
             last_hidden = None
             last_cell = None
-            batch_num = 50
+            batch_num = 200
 
             for i in range(batch_num):
                 x_data = np.arange(12).reshape(4, 3).astype('int64')
@@ -264,8 +264,10 @@ class TestImperativePtbRnn(unittest.TestCase):
                         dy_param_init[param.name] = param._numpy()
                 dy_loss._backward()
                 sgd.minimize(dy_loss)
-                for param in ptb_model.parameters():
-                    dy_param_updated[param.name] = param._numpy()
+                ptb_model.clear_gradients()
+                if i == batch_num - 1:
+                    for param in ptb_model.parameters():
+                        dy_param_updated[param.name] = param._numpy()
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
@@ -323,25 +325,28 @@ class TestImperativePtbRnn(unittest.TestCase):
                               },
                               fetch_list=fetch_list)
                 static_loss_value = out[0]
-                static_last_cell_value = out[1]
-                static_last_hidden_value = out[2]
-                for k in range(3, len(out)):
-                    static_param_updated[static_param_name_list[k - 3]] = out[k]
+                static_last_hidden_value = out[1]
+                static_last_cell_value = out[2]
 
+                if i == batch_num - 1:
+                    for k in range(3, len(out)):
+                        static_param_updated[static_param_name_list[k -
+                                                                    3]] = out[k]
+
+        self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy()))
+        self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy()))
+        self.assertTrue(
+            np.allclose(static_last_hidden_value, last_hidden._numpy()))
+        for key, value in six.iteritems(static_param_init):
+            # print("static_init name: {}, value {}".format(key, value))
+            # print("dy_init name: {}, value {}".format(key, dy_param_init[key]))
+            self.assertTrue(np.allclose(value, dy_param_init[key], atol=1e-5))
+        for key, value in six.iteritems(static_param_updated):
+            # print("static name: {}, value {}".format(key, value))
+            # print("dy name: {}, value {}".format(key, dy_param_updated[key]))
             self.assertTrue(
-                np.allclose(static_loss_value.all(), dy_loss._numpy().all()))
-            self.assertTrue(
-                np.allclose(static_last_cell_value.all(),
-                            last_cell._numpy().all()))
-            self.assertTrue(
-                np.allclose(static_last_hidden_value.all(),
-                            last_hidden._numpy().all()))
-            for key, value in six.iteritems(static_param_init):
-                self.assertTrue(
-                    np.allclose(value.all(), dy_param_init[key].all()))
-            for key, value in six.iteritems(static_param_updated):
-                self.assertTrue(
-                    np.allclose(value.all(), dy_param_updated[key].all()))
+                np.allclose(
+                    value, dy_param_updated[key], atol=1e-5))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b06d3e8894072943b06456340f928cda260763c3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
@@ -0,0 +1,1096 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle.fluid as fluid
+from paddle.fluid.imperative import Embedding, LayerNorm, FC, to_variable, Layer, guard
+from test_imperative_base import new_program_scope
+from paddle.fluid import core
+import numpy as np
+import six
+np.set_printoptions(suppress=True)
+
+
+# Copy from models
+class TrainTaskConfig(object):
+    # support both CPU and GPU now.
+    use_gpu = True
+    # the epoch number to train.
+    pass_num = 30
+    # the number of sequences contained in a mini-batch.
+    # deprecated, set batch_size in args.
+    batch_size = 32
+    # the hyper parameters for Adam optimizer.
+    # This static learning_rate will be multiplied to the LearningRateScheduler
+    # derived learning rate the to get the final learning rate.
+    learning_rate = 2.0
+    beta1 = 0.9
+    beta2 = 0.997
+    eps = 1e-9
+    # the parameters for learning rate scheduling.
+    warmup_steps = 8000
+    # the weight used to mix up the ground-truth distribution and the fixed
+    # uniform distribution in label smoothing when training.
+    # Set this as zero if label smoothing is not wanted.
+    label_smooth_eps = 0.1
+    # the directory for saving trained models.
+    model_dir = "trained_models"
+    # the directory for saving checkpoints.
+    ckpt_dir = "trained_ckpts"
+    # the directory for loading checkpoint.
+    # If provided, continue training from the checkpoint.
+    ckpt_path = None
+    # the parameter to initialize the learning rate scheduler.
+    # It should be provided if use checkpoints, since the checkpoint doesn't
+    # include the training step counter currently.
+    start_step = 0
+    # the frequency to save trained models.
+    save_freq = 10000
+
+
+class InferTaskConfig(object):
+    use_gpu = True
+    # the number of examples in one run for sequence generation.
+    batch_size = 10
+    # the parameters for beam search.
+    beam_size = 5
+    max_out_len = 256
+    # the number of decoded sentences to output.
+    n_best = 1
+    # the flags indicating whether to output the special tokens.
+    output_bos = False
+    output_eos = False
+    output_unk = True
+    # the directory for loading the trained model.
+    model_path = "trained_models/pass_1.infer.model"
+
+
+class ModelHyperParams(object):
+    # These following five vocabularies related configurations will be set
+    # automatically according to the passed vocabulary path and special tokens.
+    # size of source word dictionary.
+    src_vocab_size = 10000
+    # size of target word dictionay
+    trg_vocab_size = 10000
+    # index for <bos> token
+    bos_idx = 0
+    # index for <eos> token
+    eos_idx = 1
+    # index for <unk> token
+    unk_idx = 2
+    # max length of sequences deciding the size of position encoding table.
+    max_length = 4
+    # the dimension for word embeddings, which is also the last dimension of
+    # the input and output of multi-head attention, position-wise feed-forward
+    # networks, encoder and decoder.
+    d_model = 512
+    # size of the hidden layer in position-wise feed-forward networks.
+    d_inner_hid = 2048
+    # the dimension that keys are projected to for dot-product attention.
+    d_key = 64
+    # the dimension that values are projected to for dot-product attention.
+    d_value = 64
+    # number of head used in multi-head attention.
+    n_head = 8
+    # number of sub-layers to be stacked in the encoder and decoder.
+    n_layer = 6
+    # dropout rates of different modules.
+    prepostprocess_dropout = 0.1
+    attention_dropout = 0.1
+    relu_dropout = 0.1
+    # to process before each sub-layer
+    preprocess_cmd = "n"  # layer normalization
+    # to process after each sub-layer
+    postprocess_cmd = "da"  # dropout + residual connection
+    # random seed used in dropout for CE.
+    dropout_seed = 1
+    # the flag indicating whether to share embedding and softmax weights.
+    # vocabularies in source and target should be same for weight sharing.
+    weight_sharing = True
+
+
+def merge_cfg_from_list(cfg_list, g_cfgs):
+    """
+    Set the above global configurations using the cfg_list.
+    """
+    assert len(cfg_list) % 2 == 0
+    for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
+        for g_cfg in g_cfgs:
+            if hasattr(g_cfg, key):
+                try:
+                    value = eval(value)
+                except Exception:  # for file path
+                    pass
+                setattr(g_cfg, key, value)
+                break
+
+
+def position_encoding_init(n_position, d_pos_vec):
+    """
+    Generate the initial values for the sinusoid position encoding table.
+    """
+    channels = d_pos_vec
+    position = np.arange(n_position)
+    num_timescales = channels // 2
+    log_timescale_increment = (np.log(float(1e4) / float(1)) /
+                               (num_timescales - 1))
+    inv_timescales = np.exp(np.arange(
+        num_timescales)) * -log_timescale_increment
+    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
+                                                               0)
+    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
+    signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
+    position_enc = signal
+    return position_enc.astype("float32")
+
+
+def create_data(is_static=False):
+    if is_static:
+        return [
+            src_word_np, src_pos_np, src_slf_attn_bias_np, trg_word_np,
+            trg_pos_np, trg_slf_attn_bias_np, trg_src_attn_bias_np, lbl_word_np,
+            lbl_weight_np
+        ]
+    else:
+        enc_inputs = [
+            to_variable(src_word_np), to_variable(src_pos_np),
+            to_variable(src_slf_attn_bias_np)
+        ]
+        dec_inputs = [
+            to_variable(trg_word_np), to_variable(trg_pos_np),
+            to_variable(trg_slf_attn_bias_np), to_variable(trg_src_attn_bias_np)
+        ]
+        label = to_variable(lbl_word_np)
+        weight = to_variable(lbl_weight_np)
+        return enc_inputs, dec_inputs, label, weight
+
+
+def create_feed_dict_list(data, init=False):
+    if init:
+        data_input_names = encoder_data_input_fields + \
+                           decoder_data_input_fields[:-1] + label_data_input_fields + pos_enc_param_names
+    else:
+        data_input_names = encoder_data_input_fields + \
+                           decoder_data_input_fields[:-1] + label_data_input_fields
+    feed_dict_list = dict()
+    for i in range(len(data_input_names)):
+        feed_dict_list[data_input_names[i]] = data[i]
+    return feed_dict_list
+
+
+def make_all_inputs(input_fields):
+    """
+    Define the input data layers for the transformer model.
+    """
+    inputs = []
+    for input_field in input_fields:
+        input_var = fluid.layers.data(
+            name=input_field,
+            shape=input_descs[input_field][0],
+            dtype=input_descs[input_field][1],
+            lod_level=input_descs[input_field][2]
+            if len(input_descs[input_field]) == 3 else 0,
+            append_batch_size=False)
+        inputs.append(input_var)
+    return inputs
+
+
+# The placeholder for batch_size in compile time. Must be -1 currently to be
+# consistent with some ops' infer-shape output in compile time, such as the
+# sequence_expand op used in beamsearch decoder.
+batch_size = 32
+# The placeholder for squence length in compile time.
+seq_len = ModelHyperParams.max_length
+# Here list the data shapes and data types of all inputs.
+# The shapes here act as placeholder and are set to pass the infer-shape in
+# compile time.
+input_descs = {
+    # The actual data shape of src_word is:
+    # [batch_size, max_src_len_in_batch, 1]
+    "src_word": [(batch_size, seq_len, 1), "int64", 2],
+    # The actual data shape of src_pos is:
+    # [batch_size, max_src_len_in_batch, 1]
+    "src_pos": [(batch_size, seq_len, 1), "int64"],
+    # This input is used to remove attention weights on paddings in the
+    # encoder.
+    # The actual data shape of src_slf_attn_bias is:
+    # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
+    "src_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
+                           seq_len), "float32"],
+    # The actual data shape of trg_word is:
+    # [batch_size, max_trg_len_in_batch, 1]
+    "trg_word": [(batch_size, seq_len, 1), "int64",
+                 2],  # lod_level is only used in fast decoder.
+    # The actual data shape of trg_pos is:
+    # [batch_size, max_trg_len_in_batch, 1]
+    "trg_pos": [(batch_size, seq_len, 1), "int64"],
+    # This input is used to remove attention weights on paddings and
+    # subsequent words in the decoder.
+    # The actual data shape of trg_slf_attn_bias is:
+    # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
+    "trg_slf_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
+                           seq_len), "float32"],
+    # This input is used to remove attention weights on paddings of the source
+    # input in the encoder-decoder attention.
+    # The actual data shape of trg_src_attn_bias is:
+    # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
+    "trg_src_attn_bias": [(batch_size, ModelHyperParams.n_head, seq_len,
+                           seq_len), "float32"],
+    # This input is used in independent decoder program for inference.
+    # The actual data shape of enc_output is:
+    # [batch_size, max_src_len_in_batch, d_model]
+    "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
+    # The actual data shape of label_word is:
+    # [batch_size * max_trg_len_in_batch, 1]
+    "lbl_word": [(batch_size * seq_len, 1), "int64"],
+    # This input is used to mask out the loss of paddding tokens.
+    # The actual data shape of label_weight is:
+    # [batch_size * max_trg_len_in_batch, 1]
+    "lbl_weight": [(batch_size * seq_len, 1), "float32"],
+    # This input is used in beam-search decoder.
+    "init_score": [(batch_size, 1), "float32", 2],
+    # This input is used in beam-search decoder for the first gather
+    # (cell states updation)
+    "init_idx": [(batch_size, ), "int32"],
+}
+
+# Names of word embedding table which might be reused for weight sharing.
+word_emb_param_names = (
+    "src_word_emb_table",
+    "trg_word_emb_table", )
+# Names of position encoding table which will be initialized externally.
+pos_enc_param_names = (
+    "src_pos_enc_table",
+    "trg_pos_enc_table", )
+# separated inputs for different usages.
+encoder_data_input_fields = (
+    "src_word",
+    "src_pos",
+    "src_slf_attn_bias", )
+decoder_data_input_fields = (
+    "trg_word",
+    "trg_pos",
+    "trg_slf_attn_bias",
+    "trg_src_attn_bias",
+    "enc_output", )
+label_data_input_fields = (
+    "lbl_word",
+    "lbl_weight", )
+# In fast decoder, trg_pos (only containing the current time step) is generated
+# by ops and trg_slf_attn_bias is not needed.
+fast_decoder_data_input_fields = (
+    "trg_word",
+    "init_score",
+    "init_idx",
+    "trg_src_attn_bias", )
+# if we use py_reader
+use_py_reader = False
+
+# if we run sync mode
+sync = False
+
+# how many batches we use
+batch_num = 2
+
+np.random.seed = 1
+src_word_np = np.random.randint(
+    1,
+    ModelHyperParams.src_vocab_size - 1,
+    size=(batch_size, seq_len, 1),
+    dtype='int64')
+src_pos_np = np.random.randint(
+    1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
+src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+                                       seq_len, seq_len).astype('float32')
+
+trg_word_np = np.random.randint(
+    1,
+    ModelHyperParams.src_vocab_size - 1,
+    size=(batch_size, seq_len, 1),
+    dtype='int64')
+trg_pos_np = np.random.randint(
+    1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
+trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+                                       seq_len, seq_len).astype('float32')
+trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+                                       seq_len, seq_len).astype('float32')
+
+lbl_word_np = np.random.randint(
+    1,
+    ModelHyperParams.src_vocab_size - 1,
+    size=(batch_size * seq_len, 1),
+    dtype='int64')
+lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
+
+# np.random.seed = 1
+# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
+# src_pos_np = np.random.randint(
+#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
+# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+#                                        seq_len, seq_len).astype('float32')
+#
+# trg_word_np =  np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
+# trg_pos_np = np.random.randint(
+#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
+# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+#                                        seq_len, seq_len).astype('float32')
+# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
+#                                        seq_len, seq_len).astype('float32')
+#
+# lbl_word_np =  np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64')
+# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
+#
+pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
+                                  ModelHyperParams.d_model)
+pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
+                                  ModelHyperParams.d_model)
+
+
+class PrePostProcessLayer(Layer):
+    def __init__(self, name_scope, process_cmd, shape_len=None):
+        super(PrePostProcessLayer, self).__init__(name_scope)
+        for cmd in process_cmd:
+            if cmd == "n":
+                self._layer_norm = LayerNorm(
+                    name_scope=self.full_name(),
+                    begin_norm_axis=shape_len - 1,
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(1.)),
+                    bias_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Constant(0.)))
+
+    def forward(self, prev_out, out, process_cmd, dropout_rate=0.):
+        for cmd in process_cmd:
+            if cmd == "a":  # add residual connection
+                out = out + prev_out if prev_out else out
+            elif cmd == "n":  # add layer normalization
+                out = self._layer_norm(out)
+            elif cmd == "d":  # add dropout
+                if dropout_rate:
+                    out = fluid.layers.dropout(
+                        out,
+                        dropout_prob=dropout_rate,
+                        seed=ModelHyperParams.dropout_seed,
+                        is_test=False)
+        return out
+
+
+class PositionwiseFeedForwardLayer(Layer):
+    def __init__(self, name_scope, d_inner_hid, d_hid, dropout_rate):
+        super(PositionwiseFeedForwardLayer, self).__init__(name_scope)
+        self._i2h = FC(name_scope=self.full_name(),
+                       size=d_inner_hid,
+                       num_flatten_dims=2,
+                       act="relu")
+        self._h2o = FC(name_scope=self.full_name(),
+                       size=d_hid,
+                       num_flatten_dims=2)
+        self._dropout_rate = dropout_rate
+
+    def forward(self, x):
+        hidden = self._i2h(x)
+        if self._dropout_rate:
+            hidden = fluid.layers.dropout(
+                hidden,
+                dropout_prob=self._dropout_rate,
+                seed=ModelHyperParams.dropout_seed,
+                is_test=False)
+        out = self._h2o(hidden)
+        return out
+
+
+class MultiHeadAttentionLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 d_key,
+                 d_value,
+                 d_model,
+                 n_head=1,
+                 dropout_rate=0.,
+                 cache=None,
+                 gather_idx=None,
+                 static_kv=False):
+        super(MultiHeadAttentionLayer, self).__init__(name_scope)
+        self._n_head = n_head
+        self._d_key = d_key
+        self._d_value = d_value
+        self._d_model = d_model
+        self._dropout_rate = dropout_rate
+        self._q_fc = FC(name_scope=self.full_name(),
+                        size=d_key * n_head,
+                        bias_attr=False,
+                        num_flatten_dims=2)
+        self._k_fc = FC(name_scope=self.full_name(),
+                        size=d_key * n_head,
+                        bias_attr=False,
+                        num_flatten_dims=2)
+        self._v_fc = FC(name_scope=self.full_name(),
+                        size=d_value * n_head,
+                        bias_attr=False,
+                        num_flatten_dims=2)
+        self._proj_fc = FC(name_scope=self.full_name(),
+                           size=self._d_model,
+                           bias_attr=False,
+                           num_flatten_dims=2)
+
+    def forward(self, queries, keys, values, attn_bias):
+        # compute q ,k ,v
+        keys = queries if keys is None else keys
+        values = keys if values is None else values
+
+        q = self._q_fc(queries)
+        k = self._k_fc(keys)
+        v = self._v_fc(values)
+
+        # split head
+        reshaped_q = fluid.layers.reshape(
+            x=q, shape=[0, 0, self._n_head, self._d_key], inplace=False)
+        transpose_q = fluid.layers.transpose(x=reshaped_q, perm=[0, 2, 1, 3])
+        reshaped_k = fluid.layers.reshape(
+            x=k, shape=[0, 0, self._n_head, self._d_key], inplace=False)
+        transpose_k = fluid.layers.transpose(x=reshaped_k, perm=[0, 2, 1, 3])
+        reshaped_v = fluid.layers.reshape(
+            x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False)
+        transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
+
+        #scale dot product attention
+        product = fluid.layers.matmul(
+            x=transpose_q,
+            y=transpose_k,
+            transpose_y=True,
+            alpha=self._d_model**-0.5)
+        if attn_bias:
+            product += attn_bias
+        weights = fluid.layers.softmax(product)
+        if self._dropout_rate:
+            weights_droped = fluid.layers.dropout(
+                weights,
+                dropout_prob=self._dropout_rate,
+                seed=ModelHyperParams.dropout_seed,
+                is_test=False)
+            out = fluid.layers.matmul(weights_droped, transpose_v)
+        else:
+            out = fluid.layers.matmul(weights, transpose_v)
+
+        # combine heads
+        if len(out.shape) != 4:
+            raise ValueError("Input(x) should be a 4-D Tensor.")
+        trans_x = fluid.layers.transpose(out, perm=[0, 2, 1, 3])
+        final_out = fluid.layers.reshape(
+            x=trans_x,
+            shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]],
+            inplace=False)
+
+        # fc to output
+        proj_out = self._proj_fc(final_out)
+        return proj_out
+
+
+class EncoderSubLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd="n",
+                 postprocess_cmd="da"):
+
+        super(EncoderSubLayer, self).__init__(name_scope)
+        self._preprocess_cmd = preprocess_cmd
+        self._postprocess_cmd = postprocess_cmd
+        self._prepostprocess_dropout = prepostprocess_dropout
+
+        self._preprocess_layer = PrePostProcessLayer(self.full_name(),
+                                                     self._preprocess_cmd, 3)
+        self._multihead_attention_layer = MultiHeadAttentionLayer(
+            self.full_name(), d_key, d_value, d_model, n_head,
+            attention_dropout)
+        self._postprocess_layer = PrePostProcessLayer(
+            self.full_name(), self._postprocess_cmd, None)
+        self._preprocess_layer2 = PrePostProcessLayer(self.full_name(),
+                                                      self._preprocess_cmd, 3)
+        self._positionwise_feed_forward = PositionwiseFeedForwardLayer(
+            self.full_name(), d_inner_hid, d_model, relu_dropout)
+        self._postprocess_layer2 = PrePostProcessLayer(
+            self.full_name(), self._postprocess_cmd, None)
+
+    def forward(self, enc_input, attn_bias):
+        pre_process_multihead = self._preprocess_layer(
+            None, enc_input, self._preprocess_cmd, self._prepostprocess_dropout)
+        attn_output = self._multihead_attention_layer(pre_process_multihead,
+                                                      None, None, attn_bias)
+        attn_output = self._postprocess_layer(enc_input, attn_output,
+                                              self._postprocess_cmd,
+                                              self._prepostprocess_dropout)
+        pre_process2_output = self._preprocess_layer2(
+            None, attn_output, self._preprocess_cmd,
+            self._prepostprocess_dropout)
+        ffd_output = self._positionwise_feed_forward(pre_process2_output)
+        return self._postprocess_layer2(attn_output, ffd_output,
+                                        self._postprocess_cmd,
+                                        self._prepostprocess_dropout)
+
+
+class EncoderLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd="n",
+                 postprocess_cmd="da"):
+
+        super(EncoderLayer, self).__init__(name_scope)
+        self._preprocess_cmd = preprocess_cmd
+        self._encoder_sublayers = list()
+        self._prepostprocess_dropout = prepostprocess_dropout
+        self._n_layer = n_layer
+        self._preprocess_layer = PrePostProcessLayer(self.full_name(),
+                                                     self._preprocess_cmd, 3)
+        for i in range(n_layer):
+            self._encoder_sublayers.append(
+                self.add_sublayer(
+                    'esl_%d' % i,
+                    EncoderSubLayer(
+                        self.full_name(), n_head, d_key, d_value, d_model,
+                        d_inner_hid, prepostprocess_dropout, attention_dropout,
+                        relu_dropout, preprocess_cmd, postprocess_cmd)))
+
+    def forward(self, enc_input, attn_bias):
+        for i in range(self._n_layer):
+            enc_output = self._encoder_sublayers[i](enc_input, attn_bias)
+            enc_input = enc_output
+
+        return self._preprocess_layer(None, enc_output, self._preprocess_cmd,
+                                      self._prepostprocess_dropout)
+
+
+class PrepareEncoderDecoderLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 src_vocab_size,
+                 src_emb_dim,
+                 src_max_len,
+                 dropout_rate,
+                 word_emb_param_name=None,
+                 pos_enc_param_name=None):
+        super(PrepareEncoderDecoderLayer, self).__init__(name_scope)
+        self._src_max_len = src_max_len
+        self._src_emb_dim = src_emb_dim
+        self._src_vocab_size = src_vocab_size
+        self._dropout_rate = dropout_rate
+        self._input_emb = Embedding(
+            name_scope=self.full_name(),
+            size=[src_vocab_size, src_emb_dim],
+            padding_idx=0,
+            param_attr=fluid.ParamAttr(
+                name=word_emb_param_name,
+                initializer=fluid.initializer.Normal(0., src_emb_dim**-0.5)))
+
+        if pos_enc_param_name is pos_enc_param_names[0]:
+            pos_inp = pos_inp1
+        else:
+            pos_inp = pos_inp2
+        self._pos_emb = Embedding(
+            name_scope=self.full_name(),
+            size=[self._src_max_len, src_emb_dim],
+            param_attr=fluid.ParamAttr(
+                name=pos_enc_param_name,
+                initializer=fluid.initializer.NumpyArrayInitializer(pos_inp),
+                trainable=False))
+
+        # use in imperative_mode to fit different length batch
+        # self._pos_emb._w = to_variable(
+        #     position_encoding_init(self._src_max_len, self._src_emb_dim))
+
+    def forward(self, src_word, src_pos):
+        src_word_emb = self._input_emb(src_word)
+        src_word_emb = fluid.layers.scale(
+            x=src_word_emb, scale=self._src_emb_dim**0.5)
+        # # TODO change this to fit dynamic length input
+        src_pos_emb = self._pos_emb(src_pos)
+        src_pos_emb.stop_gradient = True
+        enc_input = src_word_emb + src_pos_emb
+        return fluid.layers.dropout(
+            enc_input,
+            dropout_prob=self._dropout_rate,
+            seed=ModelHyperParams.dropout_seed,
+            is_test=False) if self._dropout_rate else enc_input
+
+
+class WrapEncoderLayer(Layer):
+    def __init__(self, name_cope, src_vocab_size, max_length, n_layer, n_head,
+                 d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
+                 attention_dropout, relu_dropout, preprocess_cmd,
+                 postprocess_cmd, weight_sharing):
+        """
+        The wrapper assembles together all needed layers for the encoder.
+        """
+        super(WrapEncoderLayer, self).__init__(name_cope)
+
+        self._prepare_encoder_layer = PrepareEncoderDecoderLayer(
+            self.full_name(),
+            src_vocab_size,
+            d_model,
+            max_length,
+            prepostprocess_dropout,
+            word_emb_param_name=word_emb_param_names[0],
+            pos_enc_param_name=pos_enc_param_names[0])
+        self._encoder = EncoderLayer(
+            self.full_name(), n_layer, n_head, d_key, d_value, d_model,
+            d_inner_hid, prepostprocess_dropout, attention_dropout,
+            relu_dropout, preprocess_cmd, postprocess_cmd)
+
+    def forward(self, enc_inputs):
+        src_word, src_pos, src_slf_attn_bias = enc_inputs
+        enc_input = self._prepare_encoder_layer(src_word, src_pos)
+        enc_output = self._encoder(enc_input, src_slf_attn_bias)
+        return enc_output
+
+
+class DecoderSubLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd,
+                 postprocess_cmd,
+                 cache=None,
+                 gather_idx=None):
+        super(DecoderSubLayer, self).__init__(name_scope)
+        self._postprocess_cmd = postprocess_cmd
+        self._preprocess_cmd = preprocess_cmd
+        self._prepostprcess_dropout = prepostprocess_dropout
+        self._pre_process_layer = PrePostProcessLayer(self.full_name(),
+                                                      preprocess_cmd, 3)
+        self._multihead_attention_layer = MultiHeadAttentionLayer(
+            self.full_name(),
+            d_key,
+            d_value,
+            d_model,
+            n_head,
+            attention_dropout,
+            cache=cache,
+            gather_idx=gather_idx)
+        self._post_process_layer = PrePostProcessLayer(self.full_name(),
+                                                       postprocess_cmd, None)
+        self._pre_process_layer2 = PrePostProcessLayer(self.full_name(),
+                                                       preprocess_cmd, 3)
+        self._multihead_attention_layer2 = MultiHeadAttentionLayer(
+            self.full_name(),
+            d_key,
+            d_value,
+            d_model,
+            n_head,
+            attention_dropout,
+            cache=cache,
+            gather_idx=gather_idx,
+            static_kv=True)
+        self._post_process_layer2 = PrePostProcessLayer(self.full_name(),
+                                                        postprocess_cmd, None)
+        self._pre_process_layer3 = PrePostProcessLayer(self.full_name(),
+                                                       preprocess_cmd, 3)
+        self._positionwise_feed_forward_layer = PositionwiseFeedForwardLayer(
+            self.full_name(), d_inner_hid, d_model, relu_dropout)
+        self._post_process_layer3 = PrePostProcessLayer(self.full_name(),
+                                                        postprocess_cmd, None)
+
+    def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
+        pre_process_rlt = self._pre_process_layer(
+            None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout)
+        slf_attn_output = self._multihead_attention_layer(pre_process_rlt, None,
+                                                          None, slf_attn_bias)
+        slf_attn_output_pp = self._post_process_layer(
+            dec_input, slf_attn_output, self._postprocess_cmd,
+            self._prepostprcess_dropout)
+        pre_process_rlt2 = self._pre_process_layer2(None, slf_attn_output_pp,
+                                                    self._preprocess_cmd,
+                                                    self._prepostprcess_dropout)
+        enc_attn_output_pp = self._multihead_attention_layer2(
+            pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
+        enc_attn_output = self._post_process_layer2(
+            slf_attn_output, enc_attn_output_pp, self._postprocess_cmd,
+            self._prepostprcess_dropout)
+        pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
+                                                    self._preprocess_cmd,
+                                                    self._prepostprcess_dropout)
+        ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3)
+        dec_output = self._post_process_layer3(enc_attn_output, ffd_output,
+                                               self._postprocess_cmd,
+                                               self._prepostprcess_dropout)
+        return dec_output
+
+
+class DecoderLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd,
+                 postprocess_cmd,
+                 caches=None,
+                 gather_idx=None):
+        super(DecoderLayer, self).__init__(name_scope)
+        self._pre_process_layer = PrePostProcessLayer(self.full_name(),
+                                                      preprocess_cmd, 3)
+        self._decoder_sub_layers = list()
+        self._n_layer = n_layer
+        self._preprocess_cmd = preprocess_cmd
+        self._prepostprocess_dropout = prepostprocess_dropout
+        for i in range(n_layer):
+            self._decoder_sub_layers.append(
+                self.add_sublayer(
+                    'dsl_%d' % i,
+                    DecoderSubLayer(
+                        self.full_name(),
+                        n_head,
+                        d_key,
+                        d_value,
+                        d_model,
+                        d_inner_hid,
+                        prepostprocess_dropout,
+                        attention_dropout,
+                        relu_dropout,
+                        preprocess_cmd,
+                        postprocess_cmd,
+                        cache=None if caches is None else caches[i],
+                        gather_idx=gather_idx)))
+
+    def forward(self, dec_input, enc_output, dec_slf_attn_bias,
+                dec_enc_attn_bias):
+        for i in range(self._n_layer):
+            tmp_dec_output = self._decoder_sub_layers[i](
+                dec_input, enc_output, dec_slf_attn_bias, dec_enc_attn_bias)
+            dec_input = tmp_dec_output
+
+        dec_output = self._pre_process_layer(None, tmp_dec_output,
+                                             self._preprocess_cmd,
+                                             self._prepostprocess_dropout)
+        return dec_output
+
+
+class WrapDecoderLayer(Layer):
+    def __init__(self,
+                 name_scope,
+                 trg_vocab_size,
+                 max_length,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd,
+                 postprocess_cmd,
+                 weight_sharing,
+                 caches=None,
+                 gather_idx=None):
+        """
+        The wrapper assembles together all needed layers for the encoder.
+        """
+        super(WrapDecoderLayer, self).__init__(name_scope)
+
+        self._prepare_decoder_layer = PrepareEncoderDecoderLayer(
+            self.full_name(),
+            trg_vocab_size,
+            d_model,
+            max_length,
+            prepostprocess_dropout,
+            word_emb_param_name=word_emb_param_names[1],
+            pos_enc_param_name=pos_enc_param_names[1])
+        self._decoder_layer = DecoderLayer(
+            self.full_name(),
+            n_layer,
+            n_head,
+            d_key,
+            d_value,
+            d_model,
+            d_inner_hid,
+            prepostprocess_dropout,
+            attention_dropout,
+            relu_dropout,
+            preprocess_cmd,
+            postprocess_cmd,
+            caches=caches,
+            gather_idx=gather_idx)
+        self._weight_sharing = weight_sharing
+        if not weight_sharing:
+            self._fc = FC(self.full_name(),
+                          size=trg_vocab_size,
+                          bias_attr=False)
+
+    def forward(self, dec_inputs=None, enc_output=None):
+        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
+        dec_input = self._prepare_decoder_layer(trg_word, trg_pos)
+        dec_output = self._decoder_layer(dec_input, enc_output,
+                                         trg_slf_attn_bias, trg_src_attn_bias)
+
+        dec_output_reshape = fluid.layers.reshape(
+            dec_output, shape=[-1, dec_output.shape[-1]], inplace=False)
+
+        if self._weight_sharing:
+            predict = fluid.layers.matmul(
+                x=dec_output_reshape,
+                y=self._prepare_decoder_layer._input_emb._w,
+                transpose_y=True)
+        else:
+            predict = self._fc(dec_output_reshape)
+
+        if dec_inputs is None:
+            # Return probs for independent decoder program.
+            predict_out = fluid.layers.softmax(predict)
+            return predict_out
+        return predict
+
+
+class TransFormer(Layer):
+    def __init__(self,
+                 name_scope,
+                 src_vocab_size,
+                 trg_vocab_size,
+                 max_length,
+                 n_layer,
+                 n_head,
+                 d_key,
+                 d_value,
+                 d_model,
+                 d_inner_hid,
+                 prepostprocess_dropout,
+                 attention_dropout,
+                 relu_dropout,
+                 preprocess_cmd,
+                 postprocess_cmd,
+                 weight_sharing,
+                 label_smooth_eps,
+                 use_py_reader=False,
+                 is_test=False):
+        super(TransFormer, self).__init__(name_scope)
+        self._label_smooth_eps = label_smooth_eps
+        self._trg_vocab_size = trg_vocab_size
+        if weight_sharing:
+            assert src_vocab_size == trg_vocab_size, (
+                "Vocabularies in source and target should be same for weight sharing."
+            )
+        self._wrap_encoder_layer = WrapEncoderLayer(
+            self.full_name(), src_vocab_size, max_length, n_layer, n_head,
+            d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
+            attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
+            weight_sharing)
+        self._wrap_decoder_layer = WrapDecoderLayer(
+            self.full_name(), trg_vocab_size, max_length, n_layer, n_head,
+            d_key, d_value, d_model, d_inner_hid, prepostprocess_dropout,
+            attention_dropout, relu_dropout, preprocess_cmd, postprocess_cmd,
+            weight_sharing)
+
+        if weight_sharing:
+            self._wrap_decoder_layer._prepare_decoder_layer._input_emb._w = self._wrap_encoder_layer._prepare_encoder_layer._input_emb._w
+
+    def forward(self, enc_inputs, dec_inputs, label, weights):
+        enc_output = self._wrap_encoder_layer(enc_inputs)
+        predict = self._wrap_decoder_layer(dec_inputs, enc_output)
+        if self._label_smooth_eps:
+            label_out = fluid.layers.label_smooth(
+                label=fluid.layers.one_hot(
+                    input=label, depth=self._trg_vocab_size),
+                epsilon=self._label_smooth_eps)
+
+        cost = fluid.layers.softmax_with_cross_entropy(
+            logits=predict,
+            label=label_out,
+            soft_label=True if self._label_smooth_eps else False)
+        weighted_cost = cost * weights
+        sum_cost = fluid.layers.reduce_sum(weighted_cost)
+        token_num = fluid.layers.reduce_sum(weights)
+        token_num.stop_gradient = True
+        avg_cost = sum_cost / token_num
+        return sum_cost, avg_cost, predict, token_num
+
+
+class TestImperativeTransformer(unittest.TestCase):
+    def test_transformer_float32(self):
+        seed = 90
+        with guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            transformer = TransFormer(
+                'transformer',
+                ModelHyperParams.src_vocab_size,
+                ModelHyperParams.trg_vocab_size,
+                ModelHyperParams.max_length + 1,
+                ModelHyperParams.n_layer,
+                ModelHyperParams.n_head,
+                ModelHyperParams.d_key,
+                ModelHyperParams.d_value,
+                ModelHyperParams.d_model,
+                ModelHyperParams.d_inner_hid,
+                ModelHyperParams.prepostprocess_dropout,
+                ModelHyperParams.attention_dropout,
+                ModelHyperParams.relu_dropout,
+                ModelHyperParams.preprocess_cmd,
+                ModelHyperParams.postprocess_cmd,
+                ModelHyperParams.weight_sharing,
+                TrainTaskConfig.label_smooth_eps,
+                use_py_reader=use_py_reader,
+                is_test=False)
+            if sync:
+                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
+                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
+                with fluid.default_main_program()._lr_schedule_guard():
+                    learning_rate = lr_decay * TrainTaskConfig.learning_rate
+                optimizer = fluid.optimizer.Adam(
+                    learning_rate=learning_rate,
+                    beta1=TrainTaskConfig.beta1,
+                    beta2=TrainTaskConfig.beta2,
+                    epsilon=TrainTaskConfig.eps)
+            else:
+                optimizer = fluid.optimizer.SGD(learning_rate=0.003)
+            dy_param_init = dict()
+            dy_param_updated = dict()
+            for i in range(batch_num):
+                enc_inputs, dec_inputs, label, weights = create_data()
+                dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer(
+                    enc_inputs, dec_inputs, label, weights)
+                if i == 0:
+                    for param in transformer.parameters():
+                        dy_param_init[param.name] = param._numpy()
+
+                dy_avg_cost._backward()
+                optimizer.minimize(dy_avg_cost)
+                transformer.clear_gradients()
+                if i == batch_num - 1:
+                    for param in transformer.parameters():
+                        dy_param_updated[param.name] = param._numpy()
+
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            transformer = TransFormer(
+                'transformer',
+                ModelHyperParams.src_vocab_size,
+                ModelHyperParams.trg_vocab_size,
+                ModelHyperParams.max_length + 1,
+                ModelHyperParams.n_layer,
+                ModelHyperParams.n_head,
+                ModelHyperParams.d_key,
+                ModelHyperParams.d_value,
+                ModelHyperParams.d_model,
+                ModelHyperParams.d_inner_hid,
+                ModelHyperParams.prepostprocess_dropout,
+                ModelHyperParams.attention_dropout,
+                ModelHyperParams.relu_dropout,
+                ModelHyperParams.preprocess_cmd,
+                ModelHyperParams.postprocess_cmd,
+                ModelHyperParams.weight_sharing,
+                TrainTaskConfig.label_smooth_eps,
+                use_py_reader=use_py_reader,
+                is_test=False)
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+            optimizer = fluid.optimizer.SGD(learning_rate=0.003)
+
+            data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
+                                                                                     -1] + label_data_input_fields
+            all_inputs = make_all_inputs(data_input_names)
+            enc_inputs_len = len(encoder_data_input_fields)
+            dec_inputs_len = len(decoder_data_input_fields[:-1])
+            enc_inputs = all_inputs[0:enc_inputs_len]
+            dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len +
+                                    dec_inputs_len]
+            label = all_inputs[-2]
+            weights = all_inputs[-1]
+            static_param_updated = dict()
+            static_param_init = dict()
+            static_param_name_list = list()
+            static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer(
+                enc_inputs, dec_inputs, label, weights)
+
+            optimizer.minimize(static_avg_cost)
+            for param in transformer.parameters():
+                static_param_name_list.append(param.name)
+            out = exe.run(fluid.default_startup_program(),
+                          fetch_list=static_param_name_list)
+            for i in range(len(static_param_name_list)):
+                static_param_init[static_param_name_list[i]] = out[i]
+            static_sum_cost_value = None
+            static_avg_cost_value = None
+            static_predict_value = None
+            static_token_num_value = None
+            for i in range(batch_num):
+                feed_dict = create_feed_dict_list(create_data(True))
+                fetch_list = [
+                    static_sum_cost, static_avg_cost, static_predict,
+                    static_token_num
+                ]
+                fetch_list.extend(static_param_name_list)
+
+                out = exe.run(fluid.default_main_program(),
+                              feed=feed_dict,
+                              fetch_list=fetch_list)
+                static_sum_cost_value = out[0]
+                static_avg_cost_value = out[1]
+                static_predict_value = out[2]
+                static_token_num_value = out[3]
+                if i == batch_num - 1:
+                    for k in range(4, len(out)):
+                        static_param_updated[static_param_name_list[k -
+                                                                    4]] = out[k]
+
+        self.assertTrue(
+            np.allclose(static_avg_cost_value, dy_avg_cost._numpy()))
+        self.assertTrue(
+            np.allclose(static_sum_cost_value, dy_sum_cost._numpy()))
+        self.assertTrue(
+            np.allclose(
+                static_predict_value, dy_predict._numpy(), atol=1e-5))
+        self.assertTrue(
+            np.allclose(static_token_num_value, dy_token_num._numpy()))
+        for key, value in six.iteritems(static_param_init):
+            self.assertTrue(np.allclose(value, dy_param_init[key]))
+        for key, value in six.iteritems(static_param_updated):
+            self.assertTrue(
+                np.allclose(
+                    value, dy_param_updated[key], atol=1e-4))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 885ee170e8032ef865ebfdd646fed1e995e9e60b..1429d24107ebfc487186a25a7207f8055edad213 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -70,6 +70,34 @@ class LayerTest(unittest.TestCase):
 
 
 class TestLayer(LayerTest):
+    def test_layer_norm(self):
+        inp = np.ones([3, 32, 32], dtype='float32')
+        with self.static_graph():
+            t = layers.data(
+                name='data',
+                shape=[3, 32, 32],
+                dtype='float32',
+                append_batch_size=False)
+            ret = layers.layer_norm(t)
+            static_ret = self.get_static_graph_result(
+                feed={'data': inp}, fetch_list=[ret])[0]
+        with self.static_graph():
+            t = layers.data(
+                name='data',
+                shape=[3, 32, 32],
+                dtype='float32',
+                append_batch_size=False)
+            lm = nn.LayerNorm('layer_norm')
+            ret = lm(t)
+            static_ret2 = self.get_static_graph_result(
+                feed={'data': inp}, fetch_list=[ret])[0]
+        with self.dynamic_graph():
+            lm = nn.LayerNorm('layer_norm')
+            dy_ret = lm(base.to_variable(inp))
+
+        self.assertTrue(np.allclose(static_ret, static_ret2))
+        self.assertTrue(np.allclose(dy_ret._numpy(), static_ret2))
+
     def test_relu(self):
         with self.static_graph():
             t = layers.data(name='t', shape=[3, 3], dtype='float32')
@@ -1240,6 +1268,14 @@ class TestBook(unittest.TestCase):
 
         print(str(program))
 
+    def test_range(self):
+        program = Program()
+        with program_guard(program):
+            layers.range(0, 10, 2, 'int32')
+            layers.range(0.1, 10.0, 0.2, 'float32')
+
+        print(str(program))
+
     def test_spectral_norm(self):
         program = Program()
         with program_guard(program):
@@ -1261,6 +1297,15 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(out)
         print(str(program))
 
+    def test_fsp(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="X", shape=[16, 4, 4], dtype="float32")
+            y = layers.data(name="Y", shape=[8, 4, 4], dtype="float32")
+            out = layers.fsp_matrix(x, y)
+            self.assertIsNotNone(out)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py b/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py
new file mode 100644
index 0000000000000000000000000000000000000000..4efca5e2aafd9c370ccc37791a9900b18f2705f6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.fluid as fluid
+import math
+import unittest
+import numpy as np
+import os
+
+os.environ['CPU_NUM'] = '1'
+
+
+def random_reader(sample_num):
+    def __impl__():
+        for _ in range(sample_num):
+            yield np.random.random(
+                size=[784]).astype('float32'), np.random.random_integers(
+                    low=0, high=9, size=[1]).astype('int64')
+
+    return paddle.reader.cache(__impl__)
+
+
+class TestCaseBase(unittest.TestCase):
+    def setUp(self):
+        self.batch_size = 32
+        self.epoch_num = 2
+        self.sample_num = 165
+
+    def generate_all_data(self, reader):
+        ret = []
+        for d in reader():
+            slots = [[], []]
+            for item in d:
+                slots[0].append(item[0])
+                slots[1].append(item[1])
+            slots = [np.array(slot) for slot in slots]
+            ret.append(slots)
+        return ret
+
+    def run_main(self, reader, use_sample_generator, iterable, drop_last):
+        image = fluid.layers.data(name='image', dtype='float32', shape=[784])
+        label = fluid.layers.data(name='label', dtype='int64', shape=[1])
+        py_reader = fluid.io.PyReader(
+            feed_list=[image, label],
+            capacity=16,
+            iterable=iterable,
+            use_double_buffer=False)
+
+        batch_reader = paddle.batch(reader, self.batch_size, drop_last)
+        all_datas = self.generate_all_data(batch_reader)
+
+        if not use_sample_generator:
+            py_reader.decorate_sample_list_generator(
+                batch_reader, places=fluid.cpu_places())
+        else:
+            py_reader.decorate_sample_generator(
+                reader, self.batch_size, drop_last, places=fluid.cpu_places())
+
+        if drop_last:
+            batch_num = int(self.sample_num / self.batch_size)
+        else:
+            batch_num = math.ceil(float(self.sample_num) / self.batch_size)
+
+        exe = fluid.Executor(fluid.CPUPlace())
+        exe.run(fluid.default_startup_program())
+        for _ in range(self.epoch_num):
+            if py_reader.iterable:
+                step = 0
+                for data in py_reader():
+                    img, lbl = exe.run(feed=data, fetch_list=[image, label])
+                    self.assertArrayEqual(img, all_datas[step][0])
+                    self.assertArrayEqual(lbl, all_datas[step][1])
+                    step += 1
+                self.assertEqual(step, len(all_datas))
+            else:
+                step = 0
+                try:
+                    py_reader.start()
+                    while True:
+                        img, lbl = exe.run(fetch_list=[image, label])
+                        self.assertArrayEqual(img, all_datas[step][0])
+                        self.assertArrayEqual(lbl, all_datas[step][1])
+                        step += 1
+                except fluid.core.EOFException:
+                    py_reader.reset()
+                    self.assertEqual(step, len(all_datas))
+                    break
+
+    def assertArrayEqual(self, arr1, arr2):
+        self.assertEqual(arr1.shape, arr2.shape)
+        self.assertTrue((arr1 == arr2).all())
+
+    def test_main(self):
+        reader = random_reader(self.sample_num)
+        for use_sample_generator in [False, True]:
+            for iterable in [False, True]:
+                for drop_last in [False, True]:
+                    with fluid.program_guard(fluid.Program(), fluid.Program()):
+                        self.run_main(reader, use_sample_generator, iterable,
+                                      drop_last)
+
+
+class TestCase1(TestCaseBase):
+    def setUp(self):
+        self.batch_size = 32
+        self.epoch_num = 10
+        self.sample_num = 160
+
+
+class TestCase2(TestCaseBase):
+    def setUp(self):
+        self.batch_size = 32
+        self.epoch_num = 2
+        self.sample_num = 200
+
+
+class TestCase3(TestCaseBase):
+    def setUp(self):
+        self.batch_size = 32
+        self.epoch_num = 2
+        self.sample_num = 159
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_range.py b/python/paddle/fluid/tests/unittests/test_range.py
new file mode 100644
index 0000000000000000000000000000000000000000..f129ae78cbf7e2ccd5d974de265b8e95d1391df8
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_range.py
@@ -0,0 +1,70 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestRangeOp(OpTest):
+    def setUp(self):
+        self.op_type = "range"
+        self.init_config()
+        self.inputs = {
+            'Start': np.array([self.case[0]]).astype(self.dtype),
+            'End': np.array([self.case[1]]).astype(self.dtype),
+            'Step': np.array([self.case[2]]).astype(self.dtype)
+        }
+
+        self.outputs = {
+            'Out': np.arange(self.case[0], self.case[1],
+                             self.case[2]).astype(self.dtype)
+        }
+
+    def init_config(self):
+        self.dtype = np.float32
+        self.case = (0, 1, 0.2)
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFloatRangeOpCase0(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.float32
+        self.case = (0, 5, 1)
+
+
+class TestInt32RangeOpCase0(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.int32
+        self.case = (0, 5, 2)
+
+
+class TestInt32RangeOpCase1(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.int32
+        self.case = (10, 1, -2)
+
+
+class TestInt32RangeOpCase2(TestRangeOp):
+    def init_config(self):
+        self.dtype = np.int32
+        self.case = (-1, -10, -2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index 685d08b9e0b2127fbe8f8b55f8c329ce0002bbe7..f8c5ae0eaf45fd3ab43652c16b4954d622787702 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 __all__ = [
-    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
+    'cache', 'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
     'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader',
     'multiprocess_reader', 'Fake'
 ]
@@ -33,6 +33,30 @@ import zlib
 import paddle.compat as cpt
 
 
+def cache(reader):
+    """
+    Cache the reader data into memory. 
+
+    Be careful that this method may take long time to process, 
+    and consume lots of memory. :code:`reader()` would only 
+    call once. 
+
+    Args:
+        reader (generator): a reader object which yields 
+            data each time.
+
+    Returns:
+        generator: a decorated reader object which yields data from cached memory.
+    """
+    all_data = tuple(reader())
+
+    def __impl__():
+        for item in all_data:
+            yield item
+
+    return __impl__
+
+
 def map_readers(func, *readers):
     """
     Creates a data reader that outputs return value of function using
diff --git a/python/requirements.txt b/python/requirements.txt
index 36bd5d4261cc7aa78d26b8c8ddfd87abd4f4e2e2..ce56462fac9c69df79c3c542202d21c0c67a91b8 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -12,3 +12,4 @@ six
 funcsigs
 pyyaml
 decorator
+prettytable
diff --git a/python/setup.py.in b/python/setup.py.in
index a7c1e91f9c3a9597d799659a0abe3c9f56e54a57..9f87f5644fc969f3f55fd08689f3e2bbaf36dc39 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -117,6 +117,7 @@ packages=['paddle',
           'paddle.fluid.contrib.slim.graph',
           'paddle.fluid.contrib.slim.prune',
           'paddle.fluid.contrib.slim.quantization',
+          'paddle.fluid.contrib.slim.distillation',
           'paddle.fluid.contrib.utils',
           'paddle.fluid.transpiler',
           'paddle.fluid.transpiler.details']
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
index c2fd743f62f536ab7443ca215d100478021d8f7c..c37a9a92e654e2d0c7d1b3decca0a34a3f34863b 100644
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@@ -52,7 +52,7 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o
     LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
     LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
 
-RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
+RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
 
 CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]