From 84109c641a2476d2df4a3b55189dadfd64896486 Mon Sep 17 00:00:00 2001
From: sangoly
Date: Fri, 9 Aug 2019 17:01:05 +0800
Subject: [PATCH] [sangoly] paddle-lite step rnn new (#19100)

* step rnn

* disable ci
---
 .travis.yml | 4 +-
 CMakeLists.txt | 86 +-
 Dockerfile | 11 +-
 README.md | 6 +-
 README_cn.md | 6 +-
 cmake/anakin_subgraph.cmake | 11 +-
 cmake/configure.cmake | 31 +-
 cmake/cross_compiling/android.cmake | 63 +-
 cmake/cross_compiling/armlinux.cmake | 26 +-
 cmake/cross_compiling/findar.cmake | 33 +
 cmake/cross_compiling/postproject.cmake | 56 +
 cmake/cross_compiling/preproject.cmake | 58 +
 cmake/cuda.cmake | 6 +-
 cmake/cudnn.cmake | 2 +-
 cmake/external/dgc.cmake | 2 +
 cmake/external/eigen.cmake | 11 +-
 cmake/external/gflags.cmake | 39 +-
 cmake/external/glog.cmake | 37 +-
 cmake/external/gtest.cmake | 29 +-
 cmake/external/mklml.cmake | 1 -
 cmake/external/ngraph.cmake | 2 +-
 cmake/external/opencl-clhpp.cmake | 36 +
 cmake/external/opencl-headers.cmake | 33 +
 cmake/external/protobuf.cmake | 51 +-
 cmake/external/pslib.cmake | 6 +-
 cmake/external/snappy.cmake | 7 +-
 cmake/external/warpctc.cmake | 7 +-
 cmake/external/xxhash.cmake | 7 +-
 cmake/external/zlib.cmake | 7 +-
 cmake/generic.cmake | 75 +-
 cmake/lite.cmake | 79 +
 cmake/operators.cmake | 2 +-
 cmake/version.cmake | 17 +-
 paddle/CMakeLists.txt | 9 +-
 paddle/fluid/API.spec | 477 ++-
 paddle/fluid/CMakeLists.txt | 9 +-
 paddle/fluid/framework/CMakeLists.txt | 21 +-
 paddle/fluid/framework/async_executor.cc | 5 +-
 paddle/fluid/framework/blocking_queue.h | 5 -
 paddle/fluid/framework/data_feed.cc | 316 +-
 paddle/fluid/framework/data_feed.h | 105 +-
 paddle/fluid/framework/data_feed_factory.cc | 3 -
 .../fluid/framework/data_layout_transform.cc | 22 +-
 paddle/fluid/framework/data_set.cc | 25 +-
 paddle/fluid/framework/data_set.h | 6 -
 paddle/fluid/framework/details/CMakeLists.txt | 2 +-
 .../framework/details/all_reduce_op_handle.cc | 32 +-
 .../framework/details/all_reduce_op_handle.h | 17 +-
 .../details/async_ssa_graph_executor.cc | 66 +-
 .../fluid/framework/details/build_strategy.cc | 95 +-
 .../fluid/framework/details/build_strategy.h | 18 +-
 .../details/eager_deletion_op_handle.cc | 2 -
 .../fast_threaded_ssa_graph_executor.cc | 163 +-
 .../fast_threaded_ssa_graph_executor.h | 18 -
 .../details/fused_all_reduce_op_handle.cc | 24 +-
 .../details/fused_all_reduce_op_handle.h | 13 +-
 .../framework/details/multi_devices_helper.h | 1 -
 .../fluid/framework/details/op_handle_base.cc | 7 +-
 .../details/parallel_ssa_graph_executor.cc | 1 -
 .../fluid/framework/details/rpc_op_handle.cc | 3 -
 .../details/scale_loss_grad_op_handle.cc | 3 +-
 .../scope_buffered_ssa_graph_executor.cc | 56 +-
 .../scope_buffered_ssa_graph_executor.h | 5 +-
 .../details/sparse_all_reduce_op_handle.cc | 5 +-
 .../details/sparse_all_reduce_op_handle.h | 2 +-
 .../framework/details/ssa_graph_executor.cc | 5 +-
 .../framework/details/ssa_graph_executor.h | 2 +-
 .../details/threaded_ssa_graph_executor.cc | 148 +-
 .../details/threaded_ssa_graph_executor.h | 13 +-
 paddle/fluid/framework/device_worker.h | 109 +-
 .../fluid/framework/device_worker_factory.cc | 3 -
 paddle/fluid/framework/downpour_worker.cc | 40 +-
 paddle/fluid/framework/executor.cc | 15 +-
 paddle/fluid/framework/executor.h | 24 +-
 paddle/fluid/framework/fleet/fleet_wrapper.cc | 131 +-
 paddle/fluid/framework/fleet/fleet_wrapper.h | 22 +-
 paddle/fluid/framework/framework.proto | 2 +-
 paddle/fluid/framework/hogwild_worker.cc | 3 +-
 paddle/fluid/framework/ir/CMakeLists.txt | 17 +-
 .../alloc_continuous_space_for_grad_pass.cc | 120 +-
 .../ir/alloc_continuous_space_for_grad_pass.h | 4 +-
 .../framework/ir/attention_lstm_fuse_pass.cc | 40 +-
 .../ir/embedding_fc_lstm_fuse_pass.cc | 4 +-
 paddle/fluid/framework/ir/fc_fuse_pass.cc | 7 -
 paddle/fluid/framework/ir/fc_gru_fuse_pass.cc | 11 +-
 .../fluid/framework/ir/fc_lstm_fuse_pass.cc | 4 +-
 .../framework/ir/fuse_elewise_add_act_pass.cc | 2 +-
 paddle/fluid/framework/ir/fuse_pass_base.cc | 3 +-
 paddle/fluid/framework/ir/graph.cc | 2 -
 .../framework/ir/graph_pattern_detector.cc | 205 +-
 .../framework/ir/graph_pattern_detector.h | 105 +-
 paddle/fluid/framework/ir/graph_viz_pass.cc | 4 +-
 .../framework/ir/lock_free_optimize_pass.h | 5 +-
 .../memory_optimize_pass/inplace_op_pass.cc | 9 +
 .../memory_optimize_pass.cc | 100 +
 .../record_skip_memory_opt_vars_pass.cc | 6 +-
 .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc | 12 +-
 .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.h | 9 +-
 .../conv_bias_mkldnn_fuse_pass_tester.cc | 12 +-
 .../framework/ir/mkldnn/cpu_quantize_pass.cc | 91 -
 .../framework/ir/mkldnn/cpu_quantize_pass.h | 7 -
 .../ir/mkldnn/cpu_quantize_pass_tester.cc | 105 +-
 .../ir/mkldnn/cpu_quantize_squash_pass.cc | 11 +-
 .../mkldnn/cpu_quantize_squash_pass_tester.cc | 5 +-
 .../ir/mkldnn/mkldnn_placement_pass.cc | 4 -
 .../multi_devices_graph_pass/CMakeLists.txt | 1 -
 .../all_reduce_deps_pass.cc | 40 +-
 .../fuse_all_reduce_op_pass.cc | 15 +-
 .../multi_devices_graph_pass.cc | 12 +-
 .../multi_devices_graph_pass.h | 5 +-
 .../ir/quant_conv2d_dequant_fuse_pass.cc | 73 +-
 .../ir/seqconv_eltadd_relu_fuse_pass.cc | 4 +-
 paddle/fluid/framework/lod_tensor.cc | 28 +-
 paddle/fluid/framework/lod_tensor_test.cc | 2 -
 paddle/fluid/framework/op_desc.cc | 1 +
 paddle/fluid/framework/op_desc.h | 3 +-
 paddle/fluid/framework/operator.cc | 31 +-
 paddle/fluid/framework/operator.h | 38 +-
 .../fluid/framework/operator_kernel_configs.h | 2 +-
 paddle/fluid/framework/parallel_executor.cc | 217 +-
 paddle/fluid/framework/parallel_executor.h | 4 +
 paddle/fluid/framework/pull_dense_worker.cc | 34 +-
 paddle/fluid/framework/tensor.cc | 8 +-
 paddle/fluid/framework/tensor.h | 15 +-
 paddle/fluid/framework/tensor_impl.h | 9 +-
 paddle/fluid/framework/tensor_util.cc | 46 -
 paddle/fluid/framework/tensor_util.h | 2 -
 paddle/fluid/framework/trainer.h | 53 -
 paddle/fluid/framework/trainer_desc.proto | 28 -
 paddle/fluid/framework/trainer_factory.cc | 3 -
 paddle/fluid/framework/var_type_traits.cc | 3 +-
 paddle/fluid/framework/var_type_traits.h | 3 +-
 .../fluid/framework/var_type_traits_test.cc | 1 -
 paddle/fluid/imperative/CMakeLists.txt | 7 +-
 paddle/fluid/imperative/layer.cc | 455 ++-
 paddle/fluid/imperative/layer.h | 204 +-
 paddle/fluid/imperative/nccl_context.cc | 1 -
 paddle/fluid/imperative/tracer.cc | 130 +-
 paddle/fluid/imperative/tracer.h | 16 +-
 paddle/fluid/imperative/type_defs.h | 15 +-
 paddle/fluid/inference/CMakeLists.txt | 8 +-
 .../inference/anakin/convert/CMakeLists.txt | 32 +-
 .../fluid/inference/anakin/convert/conv2d.cc | 7 +-
 .../inference/anakin/convert/conv2d_fusion.cc | 7 +-
 .../inference/anakin/convert/elementwise.cc | 2 +-
 paddle/fluid/inference/anakin/convert/fc.cc | 7 +-
 .../inference/anakin/convert/op_converter.h | 27 +-
 .../anakin/convert/test_activation_op.cc | 32 +
 .../anakin/convert/test_affine_channel_op.cc | 7 +-
 .../anakin/convert/test_batch_norm_op.cc | 8 +-
 .../anakin/convert/test_concat_op.cc | 8 +-
 .../anakin/convert/test_conv2d_op.cc | 8 +-
 .../anakin/convert/test_dropout_op.cc | 7 +-
 .../anakin/convert/test_elementwise_op.cc | 10 +-
 .../inference/anakin/convert/test_fc_op.cc | 7 +-
 .../anakin/convert/test_flatten_op.cc | 7 +-
 .../anakin/convert/test_pool2d_op.cc | 8 +-
 .../inference/anakin/convert/test_relu_op.cc | 5 +
 .../anakin/convert/test_reshape_op.cc | 8 +-
 .../anakin/convert/test_softmax_op.cc | 8 +-
 .../inference/anakin/convert/test_split_op.cc | 7 +-
 .../inference/anakin/convert/test_sum_op.cc | 7 +-
 .../anakin/convert/test_transpose_op.cc | 7 +-
 .../inference/anakin/convert/ut_helper.h | 8 +-
 paddle/fluid/inference/anakin/engine.cc | 27 +-
 paddle/fluid/inference/anakin/engine.h | 9 +-
 .../inference/anakin/test_anakin_engine.cc | 1 +
 .../fluid/inference/analysis/CMakeLists.txt | 38 +-
 paddle/fluid/inference/analysis/argument.h | 11 -
 paddle/fluid/inference/analysis/dot.h | 13 +-
 paddle/fluid/inference/analysis/helper.cc | 12 -
 .../inference/analysis/ir_pass_manager.cc | 14 +-
 .../analysis/ir_passes/CMakeLists.txt | 2 +-
 .../ir_passes/anakin_subgraph_pass.cc | 6 +-
 .../analysis/ir_passes/subgraph_detector.cc | 2 +-
 .../analysis/ir_passes/subgraph_detector.h | 7 +-
 .../analysis/ir_passes/subgraph_util.cc | 4 +-
 .../analysis/ir_passes/subgraph_util.h | 2 +-
 .../ir_passes/tensorrt_subgraph_pass.cc | 44 +-
 .../inference/analysis/passes/CMakeLists.txt | 2 +-
 .../analysis/passes/ir_graph_build_pass.cc | 7 +-
 .../ir_params_sync_among_devices_pass.cc | 2 +-
 .../analysis/passes/memory_optimize_pass.cc | 136 +-
 .../analysis/passes/memory_optimize_pass.h | 2 -
 .../{passes.cc => paddle_use_passes.cc} | 2 +-
 paddle/fluid/inference/api/CMakeLists.txt | 48 +-
 paddle/fluid/inference/api/analysis_config.cc | 64 +-
 .../fluid/inference/api/analysis_predictor.cc | 24 +-
 .../api/analysis_predictor_tester.cc | 4 +-
 .../fluid/inference/api/api_anakin_engine.cc | 446 +--
 .../fluid/inference/api/api_anakin_engine.h | 60 +-
 .../inference/api/demo_ci/CMakeLists.txt | 9 +-
 paddle/fluid/inference/api/helper.h | 49 +-
 .../fluid/inference/api/mkldnn_quantizer.cc | 69 +-
 paddle/fluid/inference/api/mkldnn_quantizer.h | 7 +-
 .../inference/api/mkldnn_quantizer_config.cc | 7 +-
 .../inference/api/paddle_anakin_config.h | 20 +-
 .../inference/api/paddle_analysis_config.h | 26 +-
 .../inference/api/paddle_inference_api.h | 2 +-
 .../inference/api/paddle_pass_builder.cc | 63 +-
 .../fluid/inference/api/paddle_pass_builder.h | 10 -
 .../inference/tensorrt/convert/CMakeLists.txt | 66 +-
 .../tensorrt/convert/activation_op.cc | 13 +-
 .../tensorrt/convert/batch_norm_op.cc | 8 +-
 .../inference/tensorrt/convert/concat_op.cc | 8 +-
 .../inference/tensorrt/convert/conv2d_op.cc | 47 +-
 .../inference/tensorrt/convert/dropout_op.cc | 7 +-
 .../tensorrt/convert/elementwise_op.cc | 55 +-
 .../fluid/inference/tensorrt/convert/fc_op.cc | 71 +-
 .../tensorrt/convert/leaky_relu_op.cc | 10 +-
 .../inference/tensorrt/convert/op_converter.h | 16 -
 .../inference/tensorrt/convert/pad_op.cc | 8 +-
 .../inference/tensorrt/convert/pool2d_op.cc | 12 +-
 .../inference/tensorrt/convert/prelu_op.cc | 9 +-
 .../inference/tensorrt/convert/softmax_op.cc | 10 +-
 .../inference/tensorrt/convert/ut_helper.h | 6 +-
 paddle/fluid/inference/tensorrt/engine.cc | 79 +-
 paddle/fluid/inference/tensorrt/engine.h | 53 -
 paddle/fluid/inference/tensorrt/op_teller.cc | 5 +-
 paddle/fluid/inference/tensorrt/op_teller.h | 1 -
 .../fluid/inference/tests/api/CMakeLists.txt | 180 +-
 .../tests/api/anakin_mobilenet_tester.cc | 4 +-
 .../inference/tests/api/anakin_rnn1_tester.cc | 4 +-
 .../tests/api/analyzer_bert_tester.cc | 30 +-
 .../tests/api/analyzer_dam_tester.cc | 5 +-
 .../tests/api/analyzer_mm_dnn_tester.cc | 2 -
 .../tests/api/analyzer_pyramid_dnn_tester.cc | 4 -
 .../tests/api/analyzer_rnn1_tester.cc | 4 -
 .../tests/api/analyzer_save_model_tester.cc | 18 +-
 .../tests/api/analyzer_seq_pool1_tester.cc | 5 -
 .../analyzer_text_classification_tester.cc | 9 +-
 .../tests/api/analyzer_transformer_tester.cc | 2 -
 .../tests/api/analyzer_vis_tester.cc | 2 -
 .../inference/tests/api/config_printer.h | 2 -
 .../fluid/inference/tests/api/tester_helper.h | 67 +-
 paddle/fluid/inference/tests/test.cmake | 28 +-
 paddle/fluid/lite/CMakeLists.txt | 177 +-
 paddle/fluid/lite/api/CMakeLists.txt | 166 +-
 paddle/fluid/lite/api/android/.gitignore | 2 +
 paddle/fluid/lite/api/android/CMakeLists.txt | 5 +
 paddle/fluid/lite/api/android/jni/.gitignore | 3 +
 .../fluid/lite/api/android/jni/CMakeLists.txt | 52 +
 .../api/android/jni/native/CMakeLists.txt | 18 +
 .../api/android/jni/native/convert_util_jni.h | 183 +
 .../api/android/jni/native/paddle_init_jni.cc | 83 +
 .../api/android/jni/native/paddle_init_jni.h | 42 +
 .../api/android/jni/native/paddle_lite_jni.cc | 158 +
 .../api/android/jni/native/paddle_lite_jni.h | 109 +
 .../lite/api/android/jni/native/tensor_jni.cc | 168 +
 .../lite/api/android/jni/native/tensor_jni.h | 91 +
 .../jni/src/com/baidu/paddle/lite/.gitignore | 2 +
 .../src/com/baidu/paddle/lite/ConfigBase.java | 31 +
 .../src/com/baidu/paddle/lite/CxxConfig.java | 39 +
 .../com/baidu/paddle/lite/MobileConfig.java | 22 +
 .../paddle/lite/PaddleLiteInitializer.java | 26 +
 .../baidu/paddle/lite/PaddlePredictor.java | 94 +
 .../jni/src/com/baidu/paddle/lite/Place.java | 119 +
 .../com/baidu/paddle/lite/PrecisionType.java | 11 +
 .../jni/src/com/baidu/paddle/lite/Tensor.java | 57 +
 .../paddle/lite/PaddlePredictorTest.java | 48 +
 paddle/fluid/lite/api/apis_test.cc | 110 +
 paddle/fluid/lite/api/cxx_api.cc | 71 +-
 paddle/fluid/lite/api/cxx_api.h | 106 +-
 paddle/fluid/lite/api/cxx_api_bin.cc | 74 +-
 paddle/fluid/lite/api/cxx_api_bin_int8.cc | 85 +
 paddle/fluid/lite/api/cxx_api_impl.cc | 87 +
 paddle/fluid/lite/api/cxx_api_test.cc | 104 +-
 paddle/fluid/lite/api/inceptionv4_test.cc | 90 +
 paddle/fluid/lite/api/light_api.cc | 64 +
 paddle/fluid/lite/api/light_api.h | 67 +-
 paddle/fluid/lite/api/light_api_impl.cc | 69 +
 paddle/fluid/lite/api/light_api_test.cc | 38 +-
 paddle/fluid/lite/api/lite_api_test_helper.cc | 60 +
 paddle/fluid/lite/api/lite_api_test_helper.h | 31 +
 paddle/fluid/lite/api/mobilenetv1_test.cc | 102 +
 paddle/fluid/lite/api/mobilenetv2_test.cc | 86 +
 paddle/fluid/lite/api/model_optimize_tool.cc | 76 +
 paddle/fluid/lite/api/model_test.cc | 143 +
 paddle/fluid/lite/api/paddle_api.cc | 69 +
 paddle/fluid/lite/api/paddle_api.h | 110 +
 paddle/fluid/lite/api/paddle_api_test.cc | 86 +
 .../lite/api/paddle_lite_factory_helper.h | 37 +
 paddle/fluid/lite/api/paddle_place.cc | 93 +
 paddle/fluid/lite/api/paddle_place.h | 118 +
 paddle/fluid/lite/api/paddle_use_kernels.h | 84 +
 paddle/fluid/lite/api/paddle_use_ops.h | 51 +
 .../mir/passes.h => api/paddle_use_passes.h} | 22 +-
 paddle/fluid/lite/api/resnet50_test.cc | 86 +
 paddle/fluid/lite/api/test_googlenet_lite.cc | 80 +
 paddle/fluid/lite/api/test_helper.h | 37 +
 .../lite/api/test_inceptionv4_lite_x86.cc | 105 +
 .../lite/api/test_mobilenetv1_lite_x86.cc | 102 +
 .../lite/api/test_mobilenetv2_lite_x86.cc | 105 +
 .../fluid/lite/api/test_step_rnn_lite_x86.cc | 119 +
 paddle/fluid/lite/arm/CMakeLists.txt | 1 +
 paddle/fluid/lite/arm/math/CMakeLists.txt | 75 +-
 paddle/fluid/lite/arm/math/elementwise.cc | 70 -
 paddle/fluid/lite/arm/math/funcs.cc | 155 -
 paddle/fluid/lite/arm/math/funcs.h | 336 --
 paddle/fluid/lite/arm/math/packed_sgemm.cc | 3049 -----------------
 paddle/fluid/lite/arm/math/packed_sgemm.h | 60 -
 paddle/fluid/lite/arm/math/scale.cc | 64 -
 paddle/fluid/lite/arm/math/softmax.cc | 601 ----
 paddle/fluid/lite/arm/math/softmax.h | 52 -
 paddle/fluid/lite/core/CMakeLists.txt | 38 +-
 paddle/fluid/lite/core/context.cc | 312 +-
 paddle/fluid/lite/core/context.h | 116 +-
 paddle/fluid/lite/core/cpu_info.cc | 1188 ++++---
 paddle/fluid/lite/core/cpu_info.h | 113 +-
 paddle/fluid/lite/core/hvy_tensor.h | 7 +
 paddle/fluid/lite/core/kernel.cc | 31 +
 paddle/fluid/lite/core/kernel.h | 26 +-
 paddle/fluid/lite/core/lite_tensor.h | 18 +
 paddle/fluid/lite/core/memory.h | 2 +
 paddle/fluid/lite/core/mir/CMakeLists.txt | 94 +-
 .../lite/core/mir/elimination/CMakeLists.txt | 7 +
 .../identity_scale_eliminate_pass.cc | 72 +
 .../identity_scale_eliminate_pass_test.cc | 93 +
 .../fluid/lite/core/mir/fusion/CMakeLists.txt | 22 +-
 .../mir/{ => fusion}/conv_bn_fuse_pass.cc | 2 +-
 .../core/mir/{ => fusion}/conv_bn_fuse_pass.h | 0
 .../core/mir/fusion/conv_bn_fuse_pass_test.cc | 7 +-
 .../lite/core/mir/fusion/conv_bn_fuser.cc | 8 +-
 ...nv_elementwise_add_activation_fuse_pass.cc | 40 +
 ...onv_elementwise_add_activation_fuse_pass.h | 32 +
 ...ementwise_add_activation_fuse_pass_test.cc | 153 +
 ... conv_elementwise_add_activation_fuser.cc} | 28 +-
 .../conv_elementwise_add_activation_fuser.h | 47 +
 .../conv_elementwise_add_relu_fuse_pass.cc | 2 +-
 .../conv_elementwise_add_relu_fuse_pass.h | 0
 ...onv_elementwise_add_relu_fuse_pass_test.cc | 2 +-
 .../elementwise_add_activation_fuse_pass.cc | 36 +
 .../elementwise_add_activation_fuse_pass.h} | 16 +-
 ...ementwise_add_activation_fuse_pass_test.cc | 117 +
 .../elementwise_add_activation_fuser.cc | 87 +
 ...r.h => elementwise_add_activation_fuser.h} | 8 +-
 .../core/mir/{ => fusion}/fc_fuse_pass.cc | 2 +-
 .../lite/core/mir/{ => fusion}/fc_fuse_pass.h | 0
 .../mir/{ => fusion}/fc_fuse_pass_test.cc | 8 +-
 paddle/fluid/lite/core/mir/fusion/fc_fuser.cc | 4 +-
 .../mir/fusion/quant_dequant_fuse_pass.cc | 45 +
 .../core/mir/fusion/quant_dequant_fuse_pass.h | 33 +
 .../core/mir/fusion/quant_dequant_op_fuser.cc | 175 +
 .../core/mir/fusion/quant_dequant_op_fuser.h | 58 +
 .../lite/core/mir/generate_program_pass.cc | 6 +-
 .../lite/core/mir/graph_visualize_pass.cc | 3 +-
 .../lite/core/mir/io_copy_kernel_pick_pass.cc | 4 +-
 paddle/fluid/lite/core/mir/node.cc | 59 +
 paddle/fluid/lite/core/mir/node.h | 66 +-
 paddle/fluid/lite/core/mir/pass_registry.h | 8 +-
 paddle/fluid/lite/core/mir/pattern_matcher.cc | 13 +-
 paddle/fluid/lite/core/mir/pattern_matcher.h | 8 +-
 .../lite/core/mir/pattern_matcher_high_api.cc | 8 +-
 .../lite/core/mir/pattern_matcher_high_api.h | 8 +-
 .../core/mir/pattern_matcher_high_api_test.cc | 4 +-
 .../lite/core/mir/pattern_matcher_test.cc | 26 +-
 .../lite/core/mir/pattern_matcher_tester.cc | 233 ++
 paddle/fluid/lite/core/mir/ssa_graph.cc | 12 +-
 paddle/fluid/lite/core/mir/ssa_graph.h | 5 +
 paddle/fluid/lite/core/mir/ssa_graph_test.cc | 6 +-
 .../lite/core/mir/static_kernel_pick_pass.cc | 66 +-
 .../fluid/lite/core/mir/trans_weigths_pass.cc | 171 +
 .../fluid/lite/core/mir/trans_weigths_pass.h | 85 +
 .../lite/core/mir/type_precision_cast_pass.cc | 166 +
 .../lite/core/mir/type_precision_cast_pass.h | 61 +
 ...sform_pass.cc => type_target_cast_pass.cc} | 23 +-
 ...ansform_pass.h => type_target_cast_pass.h} | 0
 .../core/mir/variable_place_inference_pass.h | 10 +-
 .../mir/variable_place_inference_pass_test.cc | 4 +-
 paddle/fluid/lite/core/naive_test_model.py | 12 +-
 paddle/fluid/lite/core/op_lite.cc | 5 +-
 paddle/fluid/lite/core/op_lite.h | 22 +-
 paddle/fluid/lite/core/op_registry.cc | 11 +
 paddle/fluid/lite/core/op_registry.h | 31 +-
 paddle/fluid/lite/core/optimizer.cc | 2 +-
 paddle/fluid/lite/core/optimizer.h | 56 +-
 paddle/fluid/lite/core/optimizer_test.cc | 2 +-
 paddle/fluid/lite/core/profile/CMakeLists.txt | 1 +
 .../fluid/lite/core/profile/basic_profiler.cc | 2 +-
 .../fluid/lite/core/profile/basic_profiler.h | 37 +-
 paddle/fluid/lite/core/program.cc | 2 +
 paddle/fluid/lite/core/program.h | 5 +-
 paddle/fluid/lite/core/scope.h | 6 +
 paddle/fluid/lite/core/target_wrapper.cc | 28 +-
 paddle/fluid/lite/core/target_wrapper.h | 126 +-
 paddle/fluid/lite/core/tensor.h | 21 +
 paddle/fluid/lite/core/type_system.h | 13 +-
 paddle/fluid/lite/core/types.h | 4 +-
 paddle/fluid/lite/core/variable.h | 7 +-
 paddle/fluid/lite/cuda/CMakeLists.txt | 1 +
 paddle/fluid/lite/demo/cxx/Makefile.def | 37 +
 paddle/fluid/lite/demo/cxx/README.md | 42 +
 .../mobile_full/Makefile.android.armv7 | 22 +
 .../mobile_full/Makefile.android.armv8 | 22 +
 .../mobile_light/Makefile.android.armv7 | 22 +
 .../mobile_light/Makefile.android.armv8 | 22 +
 .../cxx/mobile_full/mobilenetv1_full_api.cc | 74 +
 .../cxx/mobile_light/mobilenetv1_light_api.cc | 68 +
 paddle/fluid/lite/demo/java/README.md | 79 +
 .../java/android/PaddlePredictor/.gitignore | 13 +
 .../android/PaddlePredictor/app/.gitignore | 1 +
 .../android/PaddlePredictor/app/build.gradle | 28 +
 .../PaddlePredictor/app/proguard-rules.pro | 21 +
 .../paddle/lite/ExampleInstrumentedTest.java | 26 +
 .../app/src/main/AndroidManifest.xml | 21 +
 .../app/src/main/assets/README.txt | 7 +
 .../com/baidu/paddle/lite/MainActivity.java | 244 ++
 .../drawable-v24/ic_launcher_foreground.xml | 34 +
 .../res/drawable/ic_launcher_background.xml | 170 +
 .../app/src/main/res/layout/activity_main.xml | 19 +
 .../res/mipmap-anydpi-v26/ic_launcher.xml | 5 +
 .../mipmap-anydpi-v26/ic_launcher_round.xml | 5 +
 .../src/main/res/mipmap-hdpi/ic_launcher.png | Bin 0 -> 2963 bytes
 .../res/mipmap-hdpi/ic_launcher_round.png | Bin 0 -> 4905 bytes
 .../src/main/res/mipmap-mdpi/ic_launcher.png | Bin 0 -> 2060 bytes
 .../res/mipmap-mdpi/ic_launcher_round.png | Bin 0 -> 2783 bytes
 .../src/main/res/mipmap-xhdpi/ic_launcher.png | Bin 0 -> 4490 bytes
 .../res/mipmap-xhdpi/ic_launcher_round.png | Bin 0 -> 6895 bytes
 .../main/res/mipmap-xxhdpi/ic_launcher.png | Bin 0 -> 6387 bytes
 .../res/mipmap-xxhdpi/ic_launcher_round.png | Bin 0 -> 10413 bytes
 .../main/res/mipmap-xxxhdpi/ic_launcher.png | Bin 0 -> 9128 bytes
 .../res/mipmap-xxxhdpi/ic_launcher_round.png | Bin 0 -> 15132 bytes
 .../app/src/main/res/values/colors.xml | 6 +
 .../app/src/main/res/values/strings.xml | 3 +
 .../app/src/main/res/values/styles.xml | 11 +
 .../baidu/paddle/lite/ExampleUnitTest.java | 17 +
 .../java/android/PaddlePredictor/build.gradle | 27 +
 .../android/PaddlePredictor/gradle.properties | 13 +
 .../gradle/wrapper/gradle-wrapper.jar | Bin 0 -> 54329 bytes
 .../gradle/wrapper/gradle-wrapper.properties | 6 +
 .../demo/java/android/PaddlePredictor/gradlew | 172 +
 .../java/android/PaddlePredictor/gradlew.bat | 84 +
 .../android/PaddlePredictor/settings.gradle | 1 +
 paddle/fluid/lite/gen_code/CMakeLists.txt | 42 +-
 paddle/fluid/lite/gen_code/gen_code.cc | 15 +
 paddle/fluid/lite/gen_code/gen_code.h | 4 +-
 paddle/fluid/lite/gen_code/gen_code_test.cc | 34 +-
 .../lite/gen_code/generated_code_test.cc | 36 +-
 .../lite/gen_code/paddle_code_generator.cc | 49 +
 paddle/fluid/lite/gen_code/paddle_infer.cc | 6 +
 paddle/fluid/lite/gen_code/paddle_infer.h | 2 +
 paddle/fluid/lite/host/CMakeLists.txt | 3 +-
 paddle/fluid/lite/host/target_wrapper.cc | 18 +-
 paddle/fluid/lite/kernels/CMakeLists.txt | 5 +-
 paddle/fluid/lite/kernels/arm/CMakeLists.txt | 45 +-
 ...e_add_compute.cc => activation_compute.cc} | 22 +-
 .../lite/kernels/arm/activation_compute.h | 37 +
 .../kernels/arm/activation_compute_test.cc | 100 +
 .../lite/kernels/arm/batch_norm_compute.cc | 114 +
 .../lite/kernels/arm/batch_norm_compute.h | 42 +
 .../kernels/arm/batch_norm_compute_test.cc | 221 ++
 .../fluid/lite/kernels/arm/calib_compute.cc | 64 +
 paddle/fluid/lite/kernels/arm/calib_compute.h | 51 +
 .../lite/kernels/arm/calib_compute_test.cc | 150 +
 .../fluid/lite/kernels/arm/concat_compute.cc | 87 +
 .../fluid/lite/kernels/arm/concat_compute.h | 37 +
 .../lite/kernels/arm/concat_compute_test.cc | 235 ++
 paddle/fluid/lite/kernels/arm/conv_compute.cc | 214 ++
 paddle/fluid/lite/kernels/arm/conv_compute.h | 67 +
 .../lite/kernels/arm/conv_compute_test.cc | 979 ++++++
 .../fluid/lite/kernels/arm/dropout_compute.cc | 47 +
 ...ntwise_add_compute.h => dropout_compute.h} | 5 +-
 .../lite/kernels/arm/dropout_compute_test.cc | 106 +
 .../arm/elementwise_add_compute_test.cc | 86 -
 .../lite/kernels/arm/elementwise_compute.cc | 111 +
 .../lite/kernels/arm/elementwise_compute.h | 44 +
 .../kernels/arm/elementwise_compute_test.cc | 292 ++
 paddle/fluid/lite/kernels/arm/fc_compute.cc | 178 +-
 paddle/fluid/lite/kernels/arm/fc_compute.h | 36 +-
 .../fluid/lite/kernels/arm/fc_compute_test.cc | 248 +-
 paddle/fluid/lite/kernels/arm/mul_compute.cc | 80 +-
 paddle/fluid/lite/kernels/arm/mul_compute.h | 42 +
 .../lite/kernels/arm/mul_compute_test.cc | 175 +
 paddle/fluid/lite/kernels/arm/pool_compute.cc | 150 +
 paddle/fluid/lite/kernels/arm/pool_compute.h | 38 +
 .../lite/kernels/arm/pool_compute_test.cc | 284 ++
 .../lite/kernels/arm/scale_compute_test.cc | 11 +
 .../lite/kernels/arm/softmax_compute_test.cc | 9 +-
 .../fluid/lite/kernels/arm/split_compute.cc | 46 +
 .../arm/split_compute.h} | 20 +-
 .../lite/kernels/arm/split_compute_test.cc | 176 +
 .../lite/kernels/arm/transpose_compute.cc | 174 +
 .../lite/kernels/arm/transpose_compute.h | 48 +
 .../kernels/arm/transpose_compute_test.cc | 205 ++
 paddle/fluid/lite/kernels/arm/use_kernels.h | 23 -
 paddle/fluid/lite/kernels/cuda/CMakeLists.txt | 3 +-
 paddle/fluid/lite/kernels/host/CMakeLists.txt | 11 +-
 .../fluid/lite/kernels/host/feed_compute.cc | 4 +-
 .../fluid/lite/kernels/opencl/CMakeLists.txt | 25 +
 .../kernels/opencl/elementwise_add_compute.cc | 53 +
 .../opencl/elementwise_add_compute_test.cc | 66 +
 .../fluid/lite/kernels/opencl/pool_compute.cc | 67 +
 .../lite/kernels/opencl/pool_compute_test.cc | 123 +
 paddle/fluid/lite/kernels/x86/CMakeLists.txt | 57 +-
 .../lite/kernels/x86/activation_compute.cc | 25 +
 .../lite/kernels/x86/batch_norm_compute.cc | 30 +
 .../lite/kernels/x86/batch_norm_compute.h | 159 +
 .../kernels/x86/batch_norm_compute_test.cc | 139 +
 .../fluid/lite/kernels/x86/concat_compute.cc | 85 +-
 .../fluid/lite/kernels/x86/concat_compute.h | 72 +
 .../lite/kernels/x86/concat_compute_test.cc | 83 +
 paddle/fluid/lite/kernels/x86/conv_compute.cc | 139 +-
 paddle/fluid/lite/kernels/x86/conv_compute.h | 153 +
 .../lite/kernels/x86/conv_compute_test.cc | 92 +
 .../fluid/lite/kernels/x86/dropout_compute.cc | 67 +-
 .../fluid/lite/kernels/x86/dropout_compute.h | 81 +
 .../lite/kernels/x86/dropout_compute_test.cc | 78 +
 .../lite/kernels/x86/elementwise_compute.cc | 130 +-
 .../lite/kernels/x86/elementwise_compute.h | 126 +
 .../kernels/x86/elementwise_compute_test.cc | 88 +
 paddle/fluid/lite/kernels/x86/fc_compute.cc | 84 +-
 paddle/fluid/lite/kernels/x86/fc_compute.h | 104 +
 .../fluid/lite/kernels/x86/fc_compute.h.bak | 98 +
 .../fluid/lite/kernels/x86/fc_compute_test.cc | 100 +
 paddle/fluid/lite/kernels/x86/gru_compute.cc | 27 +
 paddle/fluid/lite/kernels/x86/gru_compute.h | 150 +
 .../lite/kernels/x86/lookup_table_compute.cc | 30 +
 .../lite/kernels/x86/lookup_table_compute.h | 84 +
 paddle/fluid/lite/kernels/x86/mul_compute.cc | 117 +-
 paddle/fluid/lite/kernels/x86/mul_compute.h | 149 +
 .../fluid/lite/kernels/x86/mul_compute.h.bak | 143 +
 .../lite/kernels/x86/mul_compute_test.cc | 86 +
 paddle/fluid/lite/kernels/x86/pool_compute.cc | 61 +-
 paddle/fluid/lite/kernels/x86/pool_compute.h | 75 +
 .../lite/kernels/x86/pool_compute_test.cc | 79 +
 .../fluid/lite/kernels/x86/reduce_compute.cc | 21 +
 .../fluid/lite/kernels/x86/reduce_compute.h | 86 +
 paddle/fluid/lite/kernels/x86/relu_compute.cc | 37 +-
 .../lite/kernels/{arm => x86}/relu_compute.h | 32 +-
 .../lite/kernels/x86/relu_compute_test.cc | 75 +
 .../fluid/lite/kernels/x86/reshape_compute.cc | 27 +
 .../fluid/lite/kernels/x86/reshape_compute.h | 61 +
 .../fluid/lite/kernels/x86/scale_compute.cc | 43 +-
 paddle/fluid/lite/kernels/x86/scale_compute.h | 57 +
 .../lite/kernels/x86/scale_compute_test.cc | 76 +
 .../kernels/x86/sequence_reshape_compute.cc | 22 +
 .../kernels/x86/sequence_reshape_compute.h | 82 +
 paddle/fluid/lite/kernels/x86/sgd_compute.cc | 1 +
 .../fluid/lite/kernels/x86/softmax_compute.cc | 71 +-
 .../fluid/lite/kernels/x86/softmax_compute.h | 87 +
 .../lite/kernels/x86/softmax_compute_test.cc | 74 +
 .../kernels/x86/uniform_random_compute.cc | 67 +
 paddle/fluid/lite/model_parser/CMakeLists.txt | 23 +-
 .../fluid/lite/model_parser/compatible_pb.cc | 7 +
 .../lite/model_parser/cpp/CMakeLists.txt | 2 +-
 paddle/fluid/lite/model_parser/cpp/op_desc.cc | 3 +
 paddle/fluid/lite/model_parser/cpp/op_desc.h | 11 +
 paddle/fluid/lite/model_parser/desc_apis.h | 22 +
 .../fluid/lite/model_parser/model_parser.cc | 6 +-
 .../fluid/lite/model_parser/pb/CMakeLists.txt | 5 +-
 paddle/fluid/lite/model_parser/pb/op_desc.cc | 3 +
 paddle/fluid/lite/model_parser/pb/op_desc.h | 2 +
 paddle/fluid/lite/opencl/CMakeLists.txt | 16 +
 paddle/fluid/lite/opencl/cl_caller.cc | 156 +
 paddle/fluid/lite/opencl/cl_caller.h | 40 +
 paddle/fluid/lite/opencl/cl_context.cc | 73 +
 paddle/fluid/lite/opencl/cl_context.h | 43 +
 paddle/fluid/lite/opencl/cl_engine.cc | 170 +
 paddle/fluid/lite/opencl/cl_engine.h | 96 +
 paddle/fluid/lite/opencl/cl_helper.cc | 91 +
 paddle/fluid/lite/opencl/cl_helper.h | 59 +
 paddle/fluid/lite/opencl/cl_image.cc | 163 +
 paddle/fluid/lite/opencl/cl_image.h | 118 +
 .../fluid/lite/opencl/cl_image_converter.cc | 449 +++
 paddle/fluid/lite/opencl/cl_image_converter.h | 119 +
 paddle/fluid/lite/opencl/cl_include.h | 21 +
 .../opencl/cl_kernel/channel_add_kernel.cl | 29 +
 .../fluid/lite/opencl/cl_kernel/cl_common.h | 32 +
 .../cl_kernel/elementwise_add_kernel.cl | 26 +
 .../lite/opencl/cl_kernel/pool_kernel.cl | 90 +
 paddle/fluid/lite/opencl/cl_test.cc | 290 ++
 paddle/fluid/lite/opencl/cl_tool.cc | 84 +
 paddle/fluid/lite/opencl/cl_tool.h | 32 +
 paddle/fluid/lite/opencl/cl_wrapper.cxx | 964 ++++++
 paddle/fluid/lite/operators/CMakeLists.txt | 114 +-
 paddle/fluid/lite/operators/activation_ops.cc | 17 +
 paddle/fluid/lite/operators/batch_norm.h | 87 -
 paddle/fluid/lite/operators/batch_norm_op.cc | 110 +
 paddle/fluid/lite/operators/batch_norm_op.h | 46 +
 .../lite/operators/batch_norm_op_test.cc | 139 +
 paddle/fluid/lite/operators/calib_op.cc | 52 +
 paddle/fluid/lite/operators/calib_op.h | 59 +
 paddle/fluid/lite/operators/calib_op_test.cc | 62 +
 paddle/fluid/lite/operators/concat_op.cc | 6 +-
 paddle/fluid/lite/operators/conv_op.cc | 52 +-
 paddle/fluid/lite/operators/conv_op.h | 64 +-
 paddle/fluid/lite/operators/dropout_op.cc | 11 +-
 .../fluid/lite/operators/elementwise_ops.cc | 130 +-
 paddle/fluid/lite/operators/elementwise_ops.h | 66 +
 .../fake_dequantize_max_abs.cc} | 15 +-
 .../lite/operators/fake_dequantize_max_abs.h | 64 +
 .../fake_quantize_moving_avg_max_abs.cc | 25 +
 .../fake_quantize_moving_avg_max_abs.h | 69 +
 paddle/fluid/lite/operators/fc_op.cc | 2 +-
 paddle/fluid/lite/operators/fc_op.h | 11 +
 .../fluid/lite/operators/fill_constant_op.cc | 2 +-
 .../fusion_elementwise_activation_ops.cc | 99 +
 .../fusion_elementwise_activation_ops.h | 71 +
 .../fusion_elementwise_activation_ops_test.cc | 63 +
 paddle/fluid/lite/operators/gru_op.cc | 102 +
 paddle/fluid/lite/operators/gru_op.h | 46 +
 .../fluid/lite/operators/lookup_table_op.cc | 79 +
 paddle/fluid/lite/operators/lookup_table_op.h | 46 +
 paddle/fluid/lite/operators/mean_op.cc | 4 +-
 paddle/fluid/lite/operators/mul_op.cc | 49 +-
 paddle/fluid/lite/operators/mul_op.h | 10 +-
 paddle/fluid/lite/operators/op_params.h | 183 +-
 paddle/fluid/lite/operators/pool_op.cc | 52 +-
 paddle/fluid/lite/operators/pool_op.h | 34 +-
 paddle/fluid/lite/operators/pool_op_test.cc | 90 +
 paddle/fluid/lite/operators/reduce_ops.cc | 89 +
 paddle/fluid/lite/operators/reduce_ops.h | 46 +
 paddle/fluid/lite/operators/relu_op.cc | 16 +-
 paddle/fluid/lite/operators/relu_op.h | 2 +-
 paddle/fluid/lite/operators/reshape_op.cc | 12 +-
 paddle/fluid/lite/operators/scale_op.cc | 1 +
 .../lite/operators/sequence_reshape_op.cc | 54 +
 .../lite/operators/sequence_reshape_op.h | 46 +
 paddle/fluid/lite/operators/sgd_op.cc | 7 +-
 paddle/fluid/lite/operators/sgd_op.h | 2 +-
 paddle/fluid/lite/operators/softmax_op.cc | 8 +-
 paddle/fluid/lite/operators/split_op.cc | 82 +
 paddle/fluid/lite/operators/split_op.h | 46 +
 paddle/fluid/lite/operators/transpose_op.cc | 165 +
 paddle/fluid/lite/operators/transpose_op.h | 66 +
 .../fluid/lite/operators/transpose_op_test.cc | 93 +
 .../fluid/lite/operators/uniform_random_op.cc | 45 +
 .../fluid/lite/operators/uniform_random_op.h | 50 +
 paddle/fluid/lite/python/lite_test.py | 103 +
 paddle/fluid/lite/tools/CMakeLists.txt | 1 +
 paddle/fluid/lite/tools/Dockerfile.mobile | 8 +-
 paddle/fluid/lite/tools/build.sh | 664 +++-
 paddle/fluid/lite/tools/debug/CMakeLists.txt | 13 +
 .../fluid/lite/tools/debug/analysis_tool.py | 401 +++
 paddle/fluid/lite/tools/debug/check_model.sh | 182 +
 .../debug/debug_utils.cc} | 2 +-
 paddle/fluid/lite/tools/debug/debug_utils.h | 329 ++
 .../lite/tools/debug/model_debug_tool.cc | 94 +
 paddle/fluid/lite/tools/mobile_readme.md | 13 +-
 paddle/fluid/lite/utils/CMakeLists.txt | 5 +-
 paddle/fluid/lite/utils/any.h | 9 +-
 paddle/fluid/lite/utils/io.h | 13 +-
 paddle/fluid/lite/utils/string.h | 21 +-
 paddle/fluid/lite/utils/varient.h | 7 +-
 paddle/fluid/lite/x86/CMakeLists.txt | 3 +-
 paddle/fluid/memory/CMakeLists.txt | 3 +-
 paddle/fluid/memory/allocation/CMakeLists.txt | 23 +-
 .../memory/allocation/aligned_allocator.h | 7 +-
 paddle/fluid/memory/allocation/allocator.cc | 19 +-
 paddle/fluid/memory/allocation/allocator.h | 158 +-
 .../memory/allocation/allocator_facade.cc | 128 +-
 .../memory/allocation/allocator_facade.h | 8 +-
 .../memory/allocation/allocator_strategy.cc | 14 +-
 .../allocation/auto_increment_allocator.cc | 9 +-
 .../allocation/auto_increment_allocator.h | 2 +-
 .../memory/allocation/best_fit_allocator.cc | 4 +-
 .../memory/allocation/best_fit_allocator.h | 4 +-
 .../allocation/best_fit_allocator_test.cc | 25 +-
 .../allocation/best_fit_allocator_test.cu | 7 +-
 .../memory/allocation/buffered_allocator.cc | 25 +-
 .../memory/allocation/buffered_allocator.h | 8 +-
 .../allocation/buffered_allocator_test.cc | 21 +-
 .../allocation/conditional_allocator.cc | 10 +-
 .../memory/allocation/conditional_allocator.h | 13 +-
 .../fluid/memory/allocation/cpu_allocator.cc | 30 +-
 .../fluid/memory/allocation/cpu_allocator.h | 12 +-
 .../fluid/memory/allocation/cuda_allocator.cc | 12 +-
 .../fluid/memory/allocation/cuda_allocator.h | 11 +-
 .../memory/allocation/legacy_allocator.cc | 12 +-
 .../memory/allocation/legacy_allocator.h | 4 +-
 .../memory/allocation/locked_allocator.cc | 22 +-
 .../memory/allocation/locked_allocator.h | 8 +-
 .../memory/allocation/pinned_allocator.cc | 12 +-
 .../memory/allocation/pinned_allocator.h | 10 +-
 .../memory/allocation/retry_allocator.cc | 21 +-
 .../fluid/memory/allocation/retry_allocator.h | 25 +-
 .../memory/allocation/retry_allocator_test.cc | 2 +-
 paddle/fluid/memory/malloc.cc | 9 +-
 paddle/fluid/memory/malloc.h | 8 +-
 paddle/fluid/memory/memcpy.cc | 13 -
 .../fluid/op_use_default_grad_op_maker.spec | 1 +
 paddle/fluid/operators/CMakeLists.txt | 8 +-
 paddle/fluid/operators/activation_op.cc | 158 +-
 paddle/fluid/operators/activation_op.cu | 31 -
 paddle/fluid/operators/activation_op.h | 166 +-
 .../operators/add_position_encoding_op.cc | 15 -
 .../operators/alloc_continuous_space_op.cc | 18 +-
 .../fluid/operators/anakin/anakin_engine_op.h | 4 -
 paddle/fluid/operators/batch_norm_op.cc | 1 -
 paddle/fluid/operators/benchmark/op_tester.cc | 27 +-
 paddle/fluid/operators/benchmark/op_tester.h | 2 +-
 .../operators/benchmark/op_tester_config.cc | 5 +-
 .../operators/benchmark/op_tester_config.h | 3 +-
 paddle/fluid/operators/concat_op.cc | 33 +-
 paddle/fluid/operators/concat_op.h | 16 +-
 paddle/fluid/operators/conv_cudnn_op.cu.cc | 184 +-
 paddle/fluid/operators/conv_op.cc | 25 +-
 paddle/fluid/operators/cross_entropy_op.h | 4 +-
 paddle/fluid/operators/cvm_op.h | 6 +-
 .../fluid/operators/detection/CMakeLists.txt | 4 -
 paddle/fluid/operators/detection/bbox_util.h | 8 +-
 .../detection/distribute_fpn_proposals_op.cu | 3 +-
 .../detection/generate_mask_labels_op.cc | 4 -
 .../detection/generate_proposal_labels_op.cc | 166 +-
 .../detection/generate_proposals_op.cu | 3 +-
 .../detection/rpn_target_assign_op.cc | 469 +--
 .../operators/distributed/CMakeLists.txt | 8 +-
 .../operators/distributed/communicator.cc | 116 +-
 .../operators/distributed/communicator.h | 25 +-
 .../distributed/request_handler_impl.cc | 2 +-
 .../operators/distributed/sendrecvop_utils.cc | 3 +-
 .../operators/distributed_ops/allreduce_op.h | 3 +-
 .../distributed_ops/gen_nccl_id_op.cc | 197 +-
 .../operators/distributed_ops/recv_op.cc | 4 +-
 .../elementwise/elementwise_add_op.cc | 52 +-
 .../elementwise/elementwise_add_op.cu | 7 -
 .../elementwise/elementwise_add_op.h | 53 +-
 .../elementwise/elementwise_div_op.cc | 40 +-
 .../elementwise/elementwise_div_op.cu | 10 -
 .../elementwise/elementwise_div_op.h | 116 -
 .../elementwise/elementwise_mul_op.cc | 38 +-
 .../elementwise/elementwise_mul_op.cu | 7 -
 .../elementwise/elementwise_mul_op.h | 75 +-
 .../operators/elementwise/elementwise_op.h | 78 -
 .../elementwise/elementwise_op_function.h | 42 +-
 .../elementwise/elementwise_sub_op.cc | 50 +-
 .../elementwise/elementwise_sub_op.cu | 10 -
 .../elementwise/elementwise_sub_op.h | 28 -
 .../mkldnn/elementwise_add_mkldnn_op.cc | 37 +-
 paddle/fluid/operators/expand_op.cc | 64 +-
 paddle/fluid/operators/expand_op.h | 37 +-
 paddle/fluid/operators/fake_quantize_op.cc | 70 +-
 paddle/fluid/operators/fake_quantize_op.cu | 45 +-
 paddle/fluid/operators/fake_quantize_op.h | 44 +-
 .../fused/fused_elemwise_activation_op.cc | 5 +-
 .../fused/fused_elemwise_activation_op.h | 53 -
 paddle/fluid/operators/gather.cu.h | 20 +-
 paddle/fluid/operators/gather.h | 8 +-
 paddle/fluid/operators/gather_op.cc | 7 -
 paddle/fluid/operators/gather_op.cu | 35 +-
 paddle/fluid/operators/gather_op.h | 43 +-
 paddle/fluid/operators/gru_op.h | 2 +-
 paddle/fluid/operators/im2sequence_op.cc | 2 +-
 paddle/fluid/operators/im2sequence_op.h | 7 +-
 paddle/fluid/operators/interpolate_op.h | 72 +-
 paddle/fluid/operators/load_combine_op.h | 5 +-
 .../fluid/operators/lod_tensor_to_array_op.cc | 4 +-
 .../fluid/operators/math/compound_functors.h | 12 -
 .../fluid/operators/math/concat_and_split.cu | 278 +-
 paddle/fluid/operators/math/concat_test.cc | 224 +-
 paddle/fluid/operators/math/context_project.h | 12 +-
 paddle/fluid/operators/math/cpu_vec.h | 156 +-
 paddle/fluid/operators/math/cpu_vec_test.cc | 125 +-
 paddle/fluid/operators/math/functors.h | 42 -
 paddle/fluid/operators/math/gru_compute.cu | 44 +-
 .../fluid/operators/math/sequence_padding.cc | 21 +-
 .../fluid/operators/math/sequence_pooling.cc | 88 +-
 .../fluid/operators/math/sequence_pooling.cu | 129 +-
 .../fluid/operators/math/sequence_pooling.h | 5 +-
 paddle/fluid/operators/math/softmax.h | 2 +-
 paddle/fluid/operators/math/softmax_impl.h | 115 +-
 .../operators/mkldnn/batch_norm_mkldnn_op.cc | 36 +-
 .../operators/mkldnn/concat_mkldnn_op.cc | 2 -
 .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 182 +-
 .../mkldnn/conv_transpose_mkldnn_op.cc | 9 +-
 paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 431 +--
 .../fluid/operators/mkldnn/pool_mkldnn_op.cc | 16 +-
 .../operators/mkldnn/softmax_mkldnn_op.cc | 34 +-
 .../operators/mkldnn/transpose_mkldnn_op.cc | 3 +-
 paddle/fluid/operators/mul_op.cc | 58 +-
 paddle/fluid/operators/mul_op.cu.cc | 4 -
 paddle/fluid/operators/mul_op.h | 91 -
 .../fluid/operators/ngraph/ngraph_bridge.cc | 31 -
 paddle/fluid/operators/ngraph/ngraph_bridge.h | 2 -
 .../fluid/operators/ngraph/ngraph_engine.cc | 236 +-
 paddle/fluid/operators/ngraph/ngraph_engine.h | 15 +-
 .../operators/ngraph/ops/activation_op.h | 59 -
 .../operators/ngraph/ops/binary_unary_op.h | 5 -
 paddle/fluid/operators/ngraph/ops/conv2d_op.h | 1 -
 .../operators/ngraph/ops/cross_entropy_op.h | 209 +-
 .../ops/elementwise_binary_prepare_node.h | 3 +-
 .../operators/ngraph/ops/elementwise_node.h | 15 +-
 .../operators/ngraph/ops/fill_constant_op.h | 17 +-
 paddle/fluid/operators/ngraph/ops/mul_op.h | 11 +-
 paddle/fluid/operators/ngraph/ops/pool2d_op.h | 23 +-
 paddle/fluid/operators/one_hot_op.cc | 33 +-
 paddle/fluid/operators/one_hot_op.cu | 19 +-
 paddle/fluid/operators/one_hot_op.h | 10 -
 paddle/fluid/operators/optimizers/adam_op.cc | 115 +-
 paddle/fluid/operators/optimizers/adam_op.h | 9 -
 paddle/fluid/operators/print_op.cc | 125 +-
 .../fluid/operators/reader/buffered_reader.cc | 4 +-
 paddle/fluid/operators/reader/ctr_reader.cc | 20 +-
 .../fluid/operators/reader/ctr_reader_test.cc | 20 +-
 paddle/fluid/operators/recurrent_op.cc | 75 +-
 .../operators/reduce_ops/reduce_mean_op.cc | 61 +-
 paddle/fluid/operators/reduce_ops/reduce_op.h | 4 -
 paddle/fluid/operators/reshape_op.cc | 107 +-
 paddle/fluid/operators/scatter.cu.h | 58 +-
 paddle/fluid/operators/scatter.h | 90 +-
 paddle/fluid/operators/scatter_op.cc | 8 -
 paddle/fluid/operators/scatter_op.cu | 4 +-
 paddle/fluid/operators/scatter_op.h | 24 +-
 .../sequence_ops/sequence_pool_op.cc | 5 -
 .../operators/sequence_ops/sequence_pool_op.h | 5 +-
 .../sequence_ops/sequence_reverse_op.h | 23 +-
 .../sequence_ops/sequence_slice_op.h | 3 +-
 .../sequence_ops/sequence_unpad_op.h | 4 +-
 paddle/fluid/operators/slice_op.cc | 38 +-
 .../softmax_with_cross_entropy_op.cc | 1 +
 paddle/fluid/operators/sum_op.cc | 2 +-
 paddle/fluid/operators/sum_op.cu | 10 +-
 .../operators/tensorrt/tensorrt_engine_op.h | 41 +-
 .../tensorrt/tensorrt_engine_op_test.cc | 4 -
 paddle/fluid/operators/unpool_op.h | 8 +-
 paddle/fluid/operators/warpctc_op.cc | 9 -
 paddle/fluid/platform/CMakeLists.txt | 2 +-
 paddle/fluid/platform/cudnn_desc.h | 6 +-
 paddle/fluid/platform/device_context.cc | 18 +-
 paddle/fluid/platform/dynload/cudnn.h | 23 +-
 .../fluid/platform/dynload/dynamic_loader.cc | 3 +-
 paddle/fluid/platform/init.cc | 3 +
 paddle/fluid/platform/lodtensor_printer.cc | 43 +-
 paddle/fluid/platform/mkldnn_reuse.h | 261 +-
 paddle/fluid/platform/nccl_helper.h | 158 +-
 paddle/fluid/platform/ngraph_helper.h | 30 +-
 paddle/fluid/platform/temporary_allocator.cc | 35 +-
 paddle/fluid/platform/temporary_allocator.h | 17 +-
 paddle/fluid/platform/timer.h | 2 +-
 paddle/fluid/pybind/CMakeLists.txt | 35 +-
 paddle/fluid/pybind/data_set_py.cc | 4 +-
 paddle/fluid/pybind/executor_lite.cc | 189 +
 paddle/fluid/pybind/executor_lite.h | 26 +
 paddle/fluid/pybind/fleet_wrapper_py.cc | 7 +-
 paddle/fluid/pybind/imperative.cc | 325 +-
 paddle/fluid/pybind/imperative.h | 23 +
 paddle/fluid/pybind/inference_api.cc | 43 +-
 paddle/fluid/pybind/ir.cc | 9 -
 paddle/fluid/pybind/pybind.cc | 398 +--
 paddle/fluid/pybind/reader_py.cc | 18 -
 paddle/fluid/string/printf.h | 6 +-
 paddle/scripts/README.md | 8 -
 paddle/scripts/paddle_build.sh | 129 +-
 python/CMakeLists.txt | 49 +-
 python/paddle/dataset/flowers.py | 34 +-
 python/paddle/dataset/mnist.py | 9 +-
 python/paddle/dataset/wmt16.py | 3 +-
 python/paddle/distributed/launch.py | 268 +-
 python/paddle/fluid/__init__.py | 16 +-
 python/paddle/fluid/backward.py | 144 +-
 python/paddle/fluid/clip.py | 44 +-
 python/paddle/fluid/compiler.py | 9 +-
 python/paddle/fluid/contrib/__init__.py | 3 -
 .../fluid/contrib/int8_inference/README.md | 15 +-
 .../contrib/mixed_precision/decorator.py | 87 +-
 .../contrib/mixed_precision/fp16_utils.py | 82 +-
 python/paddle/fluid/contrib/reader/README.md | 10 -
 .../paddle/fluid/contrib/reader/__init__.py | 5 +-
 .../fluid/contrib/slim/core/compressor.py | 148 +-
 .../paddle/fluid/contrib/slim/core/config.py | 14 +-
 .../distillation/distillation_strategy.py | 6 +-
 .../fluid/contrib/slim/graph/executor.py | 1 -
 .../fluid/contrib/slim/graph/graph_wrapper.py | 21 +-
 .../fluid/contrib/slim/prune/__init__.py | 3 -
 .../contrib/slim/prune/prune_strategy.py | 135 +-
 .../contrib/slim/quantization/__init__.py | 6 -
 .../slim/quantization/quantization_pass.py | 141 +-
 .../quantization/quantization_strategy.py | 6 +-
 .../fluid/contrib/slim/tests/CMakeLists.txt | 136 -
 .../contrib/slim/tests/test_graph_wrapper.py | 2 -
 .../tests/test_quantization_scale_pass.py | 8 -
 .../tests/test_calibration_mobilenetv1.py | 12 +-
 .../tests/test_calibration_resnet50.py | 14 +-
 .../tests/test_image_classification_fp16.py | 6 +-
 .../paddle/fluid/contrib/utils/hdfs_utils.py | 6 +-
 .../fluid/contrib/utils/lookup_table_utils.py | 8 +-
 python/paddle/fluid/cxx_trainer.py | 163 +
 python/paddle/fluid/data_feed_desc.py | 157 +-
 python/paddle/fluid/data_feeder.py | 178 +-
 python/paddle/fluid/dataset.py | 328 +-
 python/paddle/fluid/device_worker.py | 68 +-
 python/paddle/fluid/dygraph/__init__.py | 4 -
 python/paddle/fluid/dygraph/base.py | 139 +-
 python/paddle/fluid/dygraph/checkpoint.py | 96 +-
 python/paddle/fluid/dygraph/layers.py | 110 +-
 .../fluid/dygraph/learning_rate_scheduler.py | 268 +-
 python/paddle/fluid/dygraph/nn.py | 544 +--
 python/paddle/fluid/dygraph/parallel.py | 156 +-
 python/paddle/fluid/dygraph/tracer.py | 69 +-
 python/paddle/fluid/evaluator.py | 5 +-
 python/paddle/fluid/executor.py | 99 +-
 python/paddle/fluid/framework.py | 473 +--
 .../fluid/incubate/fleet/base/fleet_base.py | 50 +-
 .../fluid/incubate/fleet/base/role_maker.py | 138 +-
 .../incubate/fleet/collective/__init__.py | 15 +-
 .../fleet/parameter_server/pslib/__init__.py | 134 +-
 .../fleet/parameter_server/pslib/node.py | 51 +-
 .../pslib/optimizer_factory.py | 17 +-
 .../fleet/parameter_server/pslib/ps_pb2.py | 72 +-
 python/paddle/fluid/initializer.py | 63 +-
 python/paddle/fluid/install_check.py | 2 +-
 python/paddle/fluid/io.py | 119 +-
 python/paddle/fluid/layer_helper_base.py | 17 +-
 python/paddle/fluid/layers/collective.py | 14 +-
 python/paddle/fluid/layers/control_flow.py | 350 +-
 python/paddle/fluid/layers/detection.py | 558 +--
 python/paddle/fluid/layers/device.py | 2 +-
 python/paddle/fluid/layers/io.py | 48 +-
 .../fluid/layers/learning_rate_scheduler.py | 37 +-
 python/paddle/fluid/layers/math_op_patch.py | 9 +-
 python/paddle/fluid/layers/metric_op.py | 13 +-
 python/paddle/fluid/layers/nn.py | 1297 +------
 python/paddle/fluid/layers/ops.py | 9 +-
 python/paddle/fluid/layers/tensor.py | 81 +-
 python/paddle/fluid/metrics.py | 326 +-
 python/paddle/fluid/net_drawer.py | 4 +-
 python/paddle/fluid/nets.py | 4 +-
 python/paddle/fluid/optimizer.py | 922 +----
 python/paddle/fluid/parallel_executor.py | 7 +-
 python/paddle/fluid/param_attr.py | 5 +-
 python/paddle/fluid/reader.py | 180 +-
 python/paddle/fluid/regularizer.py | 38 +-
 python/paddle/fluid/tests/CMakeLists.txt | 4 -
 python/paddle/fluid/tests/demo/pyreader.py | 14 +-
 python/paddle/fluid/tests/test_detection.py | 29 +-
 python/paddle/fluid/tests/test_lod_tensor.py | 18 -
 .../fluid/tests/unittests/CMakeLists.txt | 69 +-
 .../fluid/tests/unittests/dist_ctr_reader.py | 6 +-
 .../fluid/tests/unittests/gradient_checker.py | 23 +-
 .../unittests/mkldnn/test_conv2d_mkldnn_op.py | 25 -
 .../unittests/mkldnn/test_fc_mkldnn_op.py | 45 +-
 .../ngraph/test_activation_ngraph_op.py | 2 +-
 .../unittests/ngraph/test_conv2d_ngraph_op.py | 35 +-
 .../tests/unittests/parallel_dygraph_mnist.py | 16 +-
 .../test_async_ssa_graph_executor_mnist.py | 6 -
 .../fluid/tests/unittests/test_concat_op.py | 16 +-
 .../fluid/tests/unittests/test_dist_base.py | 126 +-
 .../fluid/tests/unittests/test_dist_mnist.py | 36 +
 .../unittests/test_dygraph_multi_forward.py | 1 +
 .../fluid/tests/unittests/test_expand_op.py | 56 -
 .../tests/unittests/test_fake_quantize_op.py | 97 +-
 .../fluid/tests/unittests/test_gather_op.py | 66 +-
 .../test_generate_proposal_labels_op.py | 109 +-
 .../tests/unittests/test_imperative_basic.py | 141 +-
 .../unittests/test_imperative_checkpoint.py | 47 +-
 .../tests/unittests/test_imperative_deepcf.py | 28 -
 .../tests/unittests/test_imperative_gan.py | 48 -
 .../tests/unittests/test_imperative_gnn.py | 46 +-
 .../tests/unittests/test_imperative_mnist.py | 52 +-
 .../unittests/test_imperative_optimizer.py | 32 +-
 .../unittests/test_imperative_ptb_rnn.py | 5 +-
 .../tests/unittests/test_imperative_resnet.py | 54 +-
 .../unittests/test_imperative_se_resnext.py | 53 +-
 .../fluid/tests/unittests/test_infer_shape.py | 1 -
 .../tests/unittests/test_install_check.py | 4 -
 .../tests/unittests/test_layer_norm_op.py | 6 +-
 .../fluid/tests/unittests/test_layers.py | 179 +-
 .../fluid/tests/unittests/test_nn_grad.py | 65 +-
 .../fluid/tests/unittests/test_one_hot_op.py | 54 +-
 .../tests/unittests/test_operator_desc.py | 2 +-
 .../unittests/test_parallel_dygraph_mnist.py | 17 +-
 .../test_parallel_executor_dry_run.py | 2 -
 .../test_parallel_executor_fetch_feed.py | 29 +-
 .../fluid/tests/unittests/test_print_op.py | 53 -
 .../test_py_reader_using_executor.py | 1 -
 .../tests/unittests/test_recordio_reader.py | 4 -
 .../fluid/tests/unittests/test_reshape_op.py | 47 -
 .../unittests/test_rpn_target_assign_op.py | 159 -
 .../fluid/tests/unittests/test_scatter_op.py | 94 -
 .../fluid/tests/unittests/test_seq_pool.py | 244 +-
 .../unittests/test_sync_batch_norm_op.py | 1 -
 .../fluid/tests/unittests/test_tensor.py | 20 -
 .../fluid/tests/unittests/test_variable.py | 20 +-
 .../fluid/tests/unittests/test_version.py | 10 +-
 .../fluid/tests/unittests/test_warpctc_op.py | 28 +-
 python/paddle/fluid/trainer_desc.py | 26 +-
 python/paddle/fluid/trainer_factory.py | 10 +-
 .../fluid/transpiler/distribute_transpiler.py | 143 +-
 .../fluid/transpiler/inference_transpiler.py | 93 -
 .../memory_optimization_transpiler.py | 62 +-
 .../paddle/fluid/transpiler/ps_dispatcher.py | 20 -
 python/paddle/fluid/unique_name.py | 91 -
 python/requirements.txt | 12 +-
 python/setup.py.in | 25 +-
 run.md | 4 +
 tools/document_preview.sh | 8 +-
 tools/manylinux1/Dockerfile.x64 | 6 -
 tools/manylinux1/build_all.sh | 2 +-
 .../manylinux1/build_scripts/install_nccl2.sh | 8 +-
 tools/print_signatures.py | 4 +-
 990 files changed, 32742 insertions(+), 28161 deletions(-)
 create mode 100644 cmake/cross_compiling/findar.cmake
 create mode 100644 cmake/cross_compiling/postproject.cmake
 create mode 100644 cmake/cross_compiling/preproject.cmake
 create mode 100644 cmake/external/opencl-clhpp.cmake
 create mode 100644 cmake/external/opencl-headers.cmake
 create mode 100644 cmake/lite.cmake
 rename paddle/fluid/inference/analysis/passes/{passes.cc => paddle_use_passes.cc} (100%)
 create mode 100644 paddle/fluid/lite/api/android/.gitignore
 create mode 100644 paddle/fluid/lite/api/android/CMakeLists.txt
 create mode 100644 paddle/fluid/lite/api/android/jni/.gitignore
 create mode 100644 paddle/fluid/lite/api/android/jni/CMakeLists.txt
 create mode 100644 paddle/fluid/lite/api/android/jni/native/CMakeLists.txt
 create mode 100644 paddle/fluid/lite/api/android/jni/native/convert_util_jni.h
 create mode 100644 paddle/fluid/lite/api/android/jni/native/paddle_init_jni.cc
 create mode 100644 paddle/fluid/lite/api/android/jni/native/paddle_init_jni.h
 create mode 100644 paddle/fluid/lite/api/android/jni/native/paddle_lite_jni.cc
 create mode 100644 paddle/fluid/lite/api/android/jni/native/paddle_lite_jni.h
 create mode 100644 paddle/fluid/lite/api/android/jni/native/tensor_jni.cc
 create mode 100644 paddle/fluid/lite/api/android/jni/native/tensor_jni.h
 create mode 100644 paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore
 create mode 100644 paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java
 create mode 100644 paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java
 create mode 100644 paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java
 create mode 100644 paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java
 create mode 100644 paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java
 create mode 100644 paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java
 create mode 100644 paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PrecisionType.java
 create mode 100644 paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java
 create mode 100644 paddle/fluid/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java
 create mode 100644 paddle/fluid/lite/api/apis_test.cc
 create mode 100644 paddle/fluid/lite/api/cxx_api_bin_int8.cc
 create mode 100644 paddle/fluid/lite/api/cxx_api_impl.cc
 create mode 100644 paddle/fluid/lite/api/inceptionv4_test.cc
 create mode 100644 paddle/fluid/lite/api/light_api_impl.cc
 create mode 100644 paddle/fluid/lite/api/lite_api_test_helper.cc
 create mode 100644 paddle/fluid/lite/api/lite_api_test_helper.h
 create mode 100644 paddle/fluid/lite/api/mobilenetv1_test.cc
 create mode 100644 paddle/fluid/lite/api/mobilenetv2_test.cc
 create mode 100644 paddle/fluid/lite/api/model_optimize_tool.cc
 create mode 100644 paddle/fluid/lite/api/model_test.cc
 create mode 100644 paddle/fluid/lite/api/paddle_api.cc
 create mode 100644 paddle/fluid/lite/api/paddle_api.h
 create mode 100644 paddle/fluid/lite/api/paddle_api_test.cc
 create mode 100644 paddle/fluid/lite/api/paddle_lite_factory_helper.h
 create mode 100644 paddle/fluid/lite/api/paddle_place.cc
 create mode 100644 paddle/fluid/lite/api/paddle_place.h
 create mode 100644 paddle/fluid/lite/api/paddle_use_kernels.h
 create mode 100644 paddle/fluid/lite/api/paddle_use_ops.h
 rename paddle/fluid/lite/{core/mir/passes.h => api/paddle_use_passes.h} (73%)
 create mode 100644 paddle/fluid/lite/api/resnet50_test.cc
 create mode 100644 paddle/fluid/lite/api/test_googlenet_lite.cc
 create mode 100644 paddle/fluid/lite/api/test_helper.h
 create mode 100644 paddle/fluid/lite/api/test_inceptionv4_lite_x86.cc
 create mode 100644 paddle/fluid/lite/api/test_mobilenetv1_lite_x86.cc
 create mode 100644 paddle/fluid/lite/api/test_mobilenetv2_lite_x86.cc
 create mode 100644 paddle/fluid/lite/api/test_step_rnn_lite_x86.cc
 delete mode 100644 paddle/fluid/lite/arm/math/elementwise.cc
 delete mode 100644 paddle/fluid/lite/arm/math/funcs.cc
 delete mode 100644 paddle/fluid/lite/arm/math/funcs.h
 delete mode 100644 paddle/fluid/lite/arm/math/packed_sgemm.cc
 delete mode 100644 paddle/fluid/lite/arm/math/packed_sgemm.h
 delete mode 100644 paddle/fluid/lite/arm/math/scale.cc
 delete mode 100644 paddle/fluid/lite/arm/math/softmax.cc
 delete mode 100644 paddle/fluid/lite/arm/math/softmax.h
 create mode 100644 paddle/fluid/lite/core/mir/elimination/CMakeLists.txt
 create mode 100644 paddle/fluid/lite/core/mir/elimination/identity_scale_eliminate_pass.cc
 create mode 100644 paddle/fluid/lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc
 rename paddle/fluid/lite/core/mir/{ => fusion}/conv_bn_fuse_pass.cc (94%)
 rename paddle/fluid/lite/core/mir/{ => fusion}/conv_bn_fuse_pass.h (100%)
 create mode 100644 paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.cc
 create mode 100644 paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.h
 create mode 100644 paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc
 rename paddle/fluid/lite/core/mir/fusion/{conv_elementwise_add_relu_fuser.cc => conv_elementwise_add_activation_fuser.cc} (84%)
 create mode 100644 paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h
 rename paddle/fluid/lite/core/mir/{ => fusion}/conv_elementwise_add_relu_fuse_pass.cc (94%)
 rename paddle/fluid/lite/core/mir/{ => fusion}/conv_elementwise_add_relu_fuse_pass.h (100%)
 rename paddle/fluid/lite/core/mir/{ => fusion}/conv_elementwise_add_relu_fuse_pass_test.cc (98%)
 create mode 100644 paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc
 rename paddle/fluid/lite/{arm/math/elementwise.h => core/mir/fusion/elementwise_add_activation_fuse_pass.h} (73%)
 create mode 100644 paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc
 create mode 100644 paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.cc
 rename paddle/fluid/lite/core/mir/fusion/{conv_elementwise_add_relu_fuser.h => elementwise_add_activation_fuser.h} (85%)
 rename paddle/fluid/lite/core/mir/{ => fusion}/fc_fuse_pass.cc (94%)
 rename paddle/fluid/lite/core/mir/{ => fusion}/fc_fuse_pass.h (100%)
 rename paddle/fluid/lite/core/mir/{ => fusion}/fc_fuse_pass_test.cc (95%)
 create mode 100644 paddle/fluid/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
 create mode 100644 paddle/fluid/lite/core/mir/fusion/quant_dequant_fuse_pass.h
 create mode 100644 paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.cc
 create mode 100644 paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h
 create mode 100644 paddle/fluid/lite/core/mir/pattern_matcher_tester.cc
 create mode 100644 paddle/fluid/lite/core/mir/trans_weigths_pass.cc
 create mode 100644 paddle/fluid/lite/core/mir/trans_weigths_pass.h
 create mode 100644 paddle/fluid/lite/core/mir/type_precision_cast_pass.cc
 create mode 100644 paddle/fluid/lite/core/mir/type_precision_cast_pass.h
 rename paddle/fluid/lite/core/mir/{type_target_transform_pass.cc => type_target_cast_pass.cc} (85%)
 rename paddle/fluid/lite/core/mir/{type_target_transform_pass.h => type_target_cast_pass.h} (100%)
 create mode 100644 paddle/fluid/lite/demo/cxx/Makefile.def
 create mode 100644 paddle/fluid/lite/demo/cxx/README.md
 create mode 100644 paddle/fluid/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7
 create mode 100644 paddle/fluid/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8
 create mode 100644 paddle/fluid/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7
 create mode 100644 paddle/fluid/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8
 create mode 100644 paddle/fluid/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
 create mode 100644 paddle/fluid/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
 create mode 100644 paddle/fluid/lite/demo/java/README.md
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/.gitignore
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/.gitignore
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/build.gradle
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher.png
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher_round.png
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-mdpi/ic_launcher.png
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-mdpi/ic_launcher_round.png
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xhdpi/ic_launcher.png
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher.png
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/build.gradle
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/gradle.properties
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.jar
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties
 create mode 100755 paddle/fluid/lite/demo/java/android/PaddlePredictor/gradlew
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/gradlew.bat
 create mode 100644 paddle/fluid/lite/demo/java/android/PaddlePredictor/settings.gradle
 create mode 100644 paddle/fluid/lite/gen_code/paddle_code_generator.cc
 rename paddle/fluid/lite/kernels/arm/{elementwise_add_compute.cc => activation_compute.cc} (61%)
 create mode 100644 paddle/fluid/lite/kernels/arm/activation_compute.h
 create mode 100644 paddle/fluid/lite/kernels/arm/activation_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/batch_norm_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/batch_norm_compute.h
 create mode 100644 paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/calib_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/calib_compute.h
 create mode 100644 paddle/fluid/lite/kernels/arm/calib_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/concat_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/concat_compute.h
 create mode 100644 paddle/fluid/lite/kernels/arm/concat_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/conv_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/conv_compute.h
 create mode 100644 paddle/fluid/lite/kernels/arm/conv_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/dropout_compute.cc
 rename paddle/fluid/lite/kernels/arm/{elementwise_add_compute.h => dropout_compute.h} (87%)
 create mode 100644 paddle/fluid/lite/kernels/arm/dropout_compute_test.cc
 delete mode 100644 paddle/fluid/lite/kernels/arm/elementwise_add_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/elementwise_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/elementwise_compute.h
 create mode 100644 paddle/fluid/lite/kernels/arm/elementwise_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/mul_compute.h
 create mode 100644 paddle/fluid/lite/kernels/arm/mul_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/pool_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/pool_compute.h
 create mode 100644 paddle/fluid/lite/kernels/arm/pool_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/split_compute.cc
 rename paddle/fluid/lite/{operators/batch_norm.cc => kernels/arm/split_compute.h} (71%)
 create mode 100644 paddle/fluid/lite/kernels/arm/split_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/transpose_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/arm/transpose_compute.h
 create mode 100644 paddle/fluid/lite/kernels/arm/transpose_compute_test.cc
 delete mode 100644 paddle/fluid/lite/kernels/arm/use_kernels.h
 create mode 100644 paddle/fluid/lite/kernels/opencl/CMakeLists.txt
 create mode 100644 paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/opencl/pool_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/opencl/pool_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/batch_norm_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/batch_norm_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/concat_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/concat_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/conv_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/conv_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/dropout_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/dropout_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/elementwise_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/elementwise_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/fc_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/fc_compute.h.bak
 create mode 100644 paddle/fluid/lite/kernels/x86/fc_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/gru_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/gru_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/lookup_table_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/lookup_table_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/mul_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/mul_compute.h.bak
 create mode 100644 paddle/fluid/lite/kernels/x86/mul_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/pool_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/pool_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/reduce_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/reduce_compute.h
 rename paddle/fluid/lite/kernels/{arm => x86}/relu_compute.h (60%)
 create mode 100644 paddle/fluid/lite/kernels/x86/relu_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/reshape_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/reshape_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/scale_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/scale_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/sequence_reshape_compute.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/sequence_reshape_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/softmax_compute.h
 create mode 100644 paddle/fluid/lite/kernels/x86/softmax_compute_test.cc
 create mode 100644 paddle/fluid/lite/kernels/x86/uniform_random_compute.cc
 create mode 100644 paddle/fluid/lite/opencl/CMakeLists.txt
 create mode 100644 paddle/fluid/lite/opencl/cl_caller.cc
 create mode 100644 paddle/fluid/lite/opencl/cl_caller.h
 create mode 100644 paddle/fluid/lite/opencl/cl_context.cc
 create mode 100644 paddle/fluid/lite/opencl/cl_context.h
 create mode 100644 paddle/fluid/lite/opencl/cl_engine.cc
 create mode 100644 paddle/fluid/lite/opencl/cl_engine.h
 create mode 100644 paddle/fluid/lite/opencl/cl_helper.cc
 create mode 100644 paddle/fluid/lite/opencl/cl_helper.h
 create mode 100644 paddle/fluid/lite/opencl/cl_image.cc
 create mode 100644 paddle/fluid/lite/opencl/cl_image.h
 create mode 100644 paddle/fluid/lite/opencl/cl_image_converter.cc
 create mode 100644 paddle/fluid/lite/opencl/cl_image_converter.h
 create mode 100644 paddle/fluid/lite/opencl/cl_include.h
 create mode 100644 paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl
 create mode 100644 paddle/fluid/lite/opencl/cl_kernel/cl_common.h
 create mode 100644 paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl
 create mode 100644 paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl
 create mode 100644 paddle/fluid/lite/opencl/cl_test.cc
 create mode 100644 paddle/fluid/lite/opencl/cl_tool.cc
 create mode 100644 paddle/fluid/lite/opencl/cl_tool.h
 create mode 100644 paddle/fluid/lite/opencl/cl_wrapper.cxx
 delete mode 100644 paddle/fluid/lite/operators/batch_norm.h
 create mode 100644 paddle/fluid/lite/operators/batch_norm_op.cc
 create mode 100644 paddle/fluid/lite/operators/batch_norm_op.h
 create mode 100644 paddle/fluid/lite/operators/batch_norm_op_test.cc
 create mode 100644 paddle/fluid/lite/operators/calib_op.cc
 create mode 100644 paddle/fluid/lite/operators/calib_op.h
 create mode 100644 paddle/fluid/lite/operators/calib_op_test.cc
 create mode 100644 paddle/fluid/lite/operators/elementwise_ops.h
 rename paddle/fluid/lite/{arm/math/scale.h => operators/fake_dequantize_max_abs.cc} (72%)
 create mode 100644 paddle/fluid/lite/operators/fake_dequantize_max_abs.h
 create mode 100644 paddle/fluid/lite/operators/fake_quantize_moving_avg_max_abs.cc
 create mode 100644 paddle/fluid/lite/operators/fake_quantize_moving_avg_max_abs.h
 create mode 100644 paddle/fluid/lite/operators/fusion_elementwise_activation_ops.cc
 create mode 100644 paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h
 create mode 100644 paddle/fluid/lite/operators/fusion_elementwise_activation_ops_test.cc
 create mode 100644
paddle/fluid/lite/operators/gru_op.cc create mode 100644 paddle/fluid/lite/operators/gru_op.h create mode 100644 paddle/fluid/lite/operators/lookup_table_op.cc create mode 100644 paddle/fluid/lite/operators/lookup_table_op.h create mode 100644 paddle/fluid/lite/operators/pool_op_test.cc create mode 100644 paddle/fluid/lite/operators/reduce_ops.cc create mode 100644 paddle/fluid/lite/operators/reduce_ops.h create mode 100644 paddle/fluid/lite/operators/sequence_reshape_op.cc create mode 100644 paddle/fluid/lite/operators/sequence_reshape_op.h create mode 100644 paddle/fluid/lite/operators/split_op.cc create mode 100644 paddle/fluid/lite/operators/split_op.h create mode 100644 paddle/fluid/lite/operators/transpose_op.cc create mode 100644 paddle/fluid/lite/operators/transpose_op.h create mode 100644 paddle/fluid/lite/operators/transpose_op_test.cc create mode 100644 paddle/fluid/lite/operators/uniform_random_op.cc create mode 100644 paddle/fluid/lite/operators/uniform_random_op.h create mode 100644 paddle/fluid/lite/python/lite_test.py create mode 100644 paddle/fluid/lite/tools/CMakeLists.txt create mode 100644 paddle/fluid/lite/tools/debug/CMakeLists.txt create mode 100644 paddle/fluid/lite/tools/debug/analysis_tool.py create mode 100755 paddle/fluid/lite/tools/debug/check_model.sh rename paddle/fluid/lite/{kernels/arm/relu_compute.cc => tools/debug/debug_utils.cc} (91%) create mode 100644 paddle/fluid/lite/tools/debug/debug_utils.h create mode 100644 paddle/fluid/lite/tools/debug/model_debug_tool.cc create mode 100644 paddle/fluid/pybind/executor_lite.cc create mode 100644 paddle/fluid/pybind/executor_lite.h create mode 100644 python/paddle/fluid/cxx_trainer.py create mode 100644 run.md diff --git a/.travis.yml b/.travis.yml index 87de895ddad..bd355310796 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,9 @@ before_install: script: - | # 43min timeout - paddle/scripts/paddle_docker_build.sh ${JOB} + #paddle/scripts/paddle_docker_build.sh ${JOB} + ### + echo 0; if [ $? -eq 0 ] || [ $? 
-eq 142 ]; then true; else exit 1; fi; notifications: email: diff --git a/CMakeLists.txt b/CMakeLists.txt index bd113a9ec8a..cde01fd3896 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,8 +16,10 @@ cmake_minimum_required(VERSION 3.0) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) - +#add_compile_options(-D_GLIBCXX_USE_CXX11_ABI=0) +add_definitions("-Wall -g") include(system) +include(cross_compiling/preproject) project(paddle CXX C) message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " @@ -41,7 +43,9 @@ if(WIN32) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") endif(WIN32) -find_package(CUDA QUIET) +if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + find_package(CUDA QUIET) +endif() find_package(Git REQUIRED) find_package(Threads REQUIRED) @@ -79,19 +83,79 @@ option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VER option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ON) -# PY_VERSION -if(NOT PY_VERSION) - set(PY_VERSION 2.7) +if(ANDROID OR IOS OR ARMLINUX) + set(WITH_GPU OFF CACHE STRING + "Disable GPU when cross-compiling for Android and iOS" FORCE) + set(WITH_DSO OFF CACHE STRING + "Disable DSO when cross-compiling for Android and iOS" FORCE) + set(WITH_AVX OFF CACHE STRING + "Disable AVX when cross-compiling for Android and iOS" FORCE) + set(WITH_PYTHON OFF CACHE STRING + "Disable PYTHON when cross-compiling for Android and iOS" FORCE) + set(WITH_RDMA OFF CACHE STRING + "Disable RDMA when cross-compiling for Android and iOS" FORCE) + set(WITH_MKL OFF CACHE STRING + "Disable MKL when cross-compiling for Android and iOS" FORCE) endif() -set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) + +# for lite, both server and mobile framework. 
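The option block that follows is the public switch surface for the lite build. For orientation, a cache-preload script for an ARM Android configure might look like the sketch below; every value is illustrative rather than taken from this patch, and the file would be passed to cmake with -C:

# lite_android_preload.cmake, a hypothetical preset for a lite ARM/Android build
set(WITH_LITE ON CACHE BOOL "" FORCE)
set(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK ON CACHE BOOL "" FORCE)
set(LITE_WITH_ARM ON CACHE BOOL "" FORCE)
set(LITE_WITH_X86 OFF CACHE BOOL "" FORCE)
# consumed by cmake/cross_compiling/preproject.cmake further down
set(ARM_TARGET_OS "android" CACHE STRING "" FORCE)
set(ARM_TARGET_ARCH_ABI "armv8" CACHE STRING "" FORCE)
set(ARM_TARGET_LANG "gcc" CACHE STRING "" FORCE)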
+option(WITH_LITE "Enable lite framework" OFF) +option(LITE_WITH_JAVA "Enable Java JNI lib in lite mode" OFF) +option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) +option(LITE_WITH_X86 "Enable X86 in lite mode" ON) +option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) +option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) +option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) +option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF) +option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF) + + +set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING + "A path setting third party libraries download & build directories.") # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING - "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" - FORCE) + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) endif() +include_directories("${PADDLE_SOURCE_DIR}") + +# for mobile +if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + message(STATUS "Building the mobile framework") + include(cross_compiling/postproject) + + # include the necessary thirdparty dependencies + include(external/gflags) # download, build, install gflags + include(external/glog) # download, build, install glog + include(external/gtest) # download, build, install gtest + #include(external/zlib) # download, build, install gtest + include(external/protobuf) # download, build, install protobuf + include(external/eigen) # download eigen3 + include(ccache) # set ccache for compilation + + # for opencl + if (LITE_WITH_OPENCL) + include(external/opencl-headers) + include(external/opencl-clhpp) + endif() + + include(generic) # simplify cmake module + include(configure) # add paddle env configuration + + add_subdirectory(paddle) + return() +endif() + + +# PY_VERSION +if(NOT PY_VERSION) + set(PY_VERSION 2.7) +endif() +set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) + if (APPLE) set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) @@ -102,16 +166,12 @@ if (WIN32) "Disable DISTRIBUTE when compiling for Windows" FORCE) endif() -set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING - "A path setting third party libraries download & build directories.") - set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING "A path setting fluid shared and static libraries") set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING "A path setting fluid inference shared and static libraries") -set(THIRD_PARTY_BUILD_TYPE Release) set(WITH_MKLML ${WITH_MKL}) if (NOT DEFINED WITH_MKLDNN) @@ -185,7 +245,6 @@ if(WITH_BRPC_RDMA) endif() endif() - include(external/threadpool) include(flags) # set paddle compile flags include(cudnn) # set cudnn libraries, must before configure @@ -234,7 +293,6 @@ include(coveralls) # set code coverage include(inference_lib) # add paddle fluid inference libraries -include_directories("${PADDLE_SOURCE_DIR}") if(WITH_AMD_GPU) find_package(HIP) diff --git a/Dockerfile b/Dockerfile index 0247d1d19ce..c248ac119ca 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ # A image for building paddle binaries # Use cuda devel base image for both cpu and gpu environment # When you modify it, please be aware of cudnn-runtime version +# and libcudnn.so.x in paddle/scripts/docker/build.sh FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04 MAINTAINER PaddlePaddle Authors @@ -75,7 +76,7 @@ RUN curl -s -q 
https://glide.sh/get | sh # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -RUN wget -q https://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \ +RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \ tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \ cp -rf /usr/local/TensorRT/include /usr && \ cp -rf /usr/local/TensorRT/lib /usr @@ -92,17 +93,17 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ +RUN pip3 --no-cache-dir install -U wheel && \ pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip3.6 --no-cache-dir install -U wheel && \ pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip3.7 --no-cache-dir install -U wheel && \ pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ easy_install -U pip && \ - pip --no-cache-dir install -U pip setuptools wheel py-cpuinfo==5.0.0 && \ + pip --no-cache-dir install -U pip setuptools wheel && \ pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark diff --git a/README.md b/README.md index fea320db978..faf8c8ee27b 100644 --- a/README.md +++ b/README.md @@ -98,11 +98,9 @@ We provide [English](http://www.paddlepaddle.org/documentation/docs/en/1.4/begin We appreciate your contributions! -## Communication +## Ask Questions -- [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc. -- QQ discussion group: 432676488 (PaddlePaddle). -- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc. +You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). diff --git a/README_cn.md b/README_cn.md index 6b224ee8c51..17f61c70aac 100644 --- a/README_cn.md +++ b/README_cn.md @@ -80,11 +80,9 @@ pip install paddlepaddle-gpu==1.4.1.post85 欢迎您的贡献! 
-## 交流与反馈 +## 答疑 -- 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议 -- QQ群: 432676488 (PaddlePaddle) -- [论坛](http://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围 +欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交 ## 版权和许可证 PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/cmake/anakin_subgraph.cmake b/cmake/anakin_subgraph.cmake index eb7bce9f3b7..b5437e776d3 100644 --- a/cmake/anakin_subgraph.cmake +++ b/cmake/anakin_subgraph.cmake @@ -1,3 +1,7 @@ +if(NOT WITH_GPU) + return() +endif() + set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT") find_path(ANAKIN_INCLUDE_DIR anakin_config.h PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include @@ -12,7 +16,9 @@ find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so DOC "Path to ANAKIN library.") if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY) + if(WITH_DSO) set(ANAKIN_FOUND ON) + endif(WITH_DSO) else() set(ANAKIN_FOUND OFF) endif() @@ -25,8 +31,3 @@ if(ANAKIN_FOUND) link_directories(${ANAKIN_ROOT}) add_definitions(-DPADDLE_WITH_ANAKIN) endif() - -if(ANAKIN_FOUND AND WITH_GPU AND WITH_DSO) - message(STATUS "Compile with anakin subgraph.") - set(ANAKIN_SUBGRAPH ON) -endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 279f1eba3f5..95ae0be6384 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -30,7 +30,6 @@ endif(NOT WITH_PROFILER) if(WITH_AVX AND AVX_FOUND) set(SIMD_FLAG ${AVX_FLAG}) - add_definitions(-DPADDLE_WITH_AVX) elseif(SSE3_FOUND) set(SIMD_FLAG ${SSE3_FLAG}) endif() @@ -158,3 +157,33 @@ endif(WITH_BRPC_RDMA) if(ON_INFER) add_definitions(-DPADDLE_ON_INFERENCE) endif(ON_INFER) + +if(WITH_WBAES) + add_definitions(-DPADDLE_WITH_WBAES) +endif(WITH_WBAES) + +# for lite +# TODO(Superjomn) not work fine with the option +if (LITE_WITH_CUDA) +add_definitions("-DLITE_WITH_CUDA") +endif() + +if (LITE_WITH_X86) + add_definitions("-DLITE_WITH_X86") +endif() + +if (LITE_WITH_ARM) + add_definitions("-DLITE_WITH_ARM") +endif() + +if (LITE_WITH_OPENCL) + add_definitions("-DLITE_WITH_OPENCL") +endif() + +if (LITE_WITH_PROFILE) + add_definitions("-DLITE_WITH_PROFILE") +endif() + +if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + add_definitions("-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK") +endif() diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index e57f32aae7c..76b4a5179d2 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -26,54 +26,59 @@ if(NOT DEFINED ANDROID_NDK) endif() endif() +if(ARM_TARGET_LANG STREQUAL "gcc") + # gcc do not need set lang on android + set(ARM_TARGET_LANG "") +endif() if(NOT DEFINED ANDROID_API_LEVEL) set(ANDROID_API_LEVEL "22") endif() -if(NOT DEFINED ANDROID_STL_TYPE) - set(ANDROID_STL_TYPE "c++_static" CACHE STRING "stl type") -endif() - -# TODO(TJ): enable me -if(ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a-hf") - message(FATAL_ERROR "Not supported building android armeabi-v7a-hf yet") +# then check input arm abi +if(ARM_TARGET_ARCH_ABI STREQUAL "armv7hf") + message(FATAL_ERROR "ANDROID does not support hardfp on v7 use armv7 instead.") endif() set(ANDROID_ARCH_ABI ${ARM_TARGET_ARCH_ABI} CACHE STRING "Choose Android Arch ABI") +if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") + set(ANDROID_ARCH_ABI "arm64-v8a") +endif() -if(ANDROID_ARCH_ABI STREQUAL "armeabi-v7a-softfp") +if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") set(ANDROID_ARCH_ABI "armeabi-v7a") endif() -set(ANDROID_ARCH_ABI_LIST "arm64-v8a" "armeabi-v7a" "armeabi-v6" "armeabi" 
- "mips" "mips64" "x86" "x86_64" "armeabi-v7a-hf") -set_property(CACHE ANDROID_ARCH_ABI PROPERTY STRINGS ${ANDROID_ARCH_ABI_LIST}) -if(NOT ANDROID_ARCH_ABI IN_LIST ANDROID_ARCH_ABI_LIST) - message(FATAL_ERROR "ANDROID_ARCH_ABI must be in one of ${ANDROID_ARCH_ABI_LIST}") -endif() +check_input_var(ANDROID_ARCH_ABI DEFAULT ${ANDROID_ARCH_ABI} LIST "arm64-v8a" "armeabi-v7a" + "armeabi-v6" "armeabi" "mips" "mips64" "x86" "x86_64") +check_input_var(ANDROID_STL_TYPE DEFAULT "c++_static" LIST "c++_static" "gnustl_static") if(ANDROID_ARCH_ABI STREQUAL "armeabi-v7a") - message(STATUS "armeabi-v7a default use softfp") + message(STATUS "armeabi-v7a use softfp by default.") set(CMAKE_ANDROID_ARM_NEON ON) - message(STATUS "NEON is enabled on arm-v7a with softfp") -endif() - -if(ANDROID_ARCH_ABI STREQUAL "armeabi-v7a-hf") - set(ANDROID_ARCH_ABI "armeabi-v7a") - set(CMAKE_CXX_FLAGS "-std=c++11 -march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}" ) - set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" ) - message(STATUS "NEON is enabled on arm-v7a with hard float") -endif() - -set(ANDROID_STL_TYPE_LITS "gnustl_static" "c++_static") -set_property(CACHE ANDROID_STL_TYPE PROPERTY STRINGS ${ANDROID_STL_TYPE_LITS}) -if (NOT ANDROID_STL_TYPE IN_LIST ANDROID_STL_TYPE_LITS) - message(FATAL_ERROR "ANDROID_STL_TYPE must be in one of ${ANDROID_STL_TYPE_LITS}") + message(STATUS "NEON is enabled on arm-v7a with softfp.") endif() set(CMAKE_SYSTEM_NAME Android) set(CMAKE_SYSTEM_VERSION ${ANDROID_API_LEVEL}) set(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ARCH_ABI}) set(CMAKE_ANDROID_NDK ${ANDROID_NDK}) +set(CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION ${ARM_TARGET_LANG}) set(CMAKE_ANDROID_STL_TYPE ${ANDROID_STL_TYPE}) + +if (ARM_TARGET_LANG STREQUAL "clang") + if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") + set(triple aarch64-v8a-linux-android) + elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7") + set(triple arm-v7a-linux-android) + else() + message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7") + endif() + + set(CMAKE_C_COMPILER clang) + set(CMAKE_C_COMPILER_TARGET ${triple}) + set(CMAKE_CXX_COMPILER clang++) + set(CMAKE_CXX_COMPILER_TARGET ${triple}) + + message(STATUS "CMAKE_CXX_COMPILER_TARGET: ${CMAKE_CXX_COMPILER_TARGET}") +endif() diff --git a/cmake/cross_compiling/armlinux.cmake b/cmake/cross_compiling/armlinux.cmake index 1d752075cca..98f23d43005 100644 --- a/cmake/cross_compiling/armlinux.cmake +++ b/cmake/cross_compiling/armlinux.cmake @@ -20,38 +20,22 @@ set(ARMLINUX TRUE) add_definitions(-DLITE_WITH_LINUX) set(CMAKE_SYSTEM_NAME Linux) -if(ARM_TARGET_ARCH_ABI STREQUAL "arm64-v8a") +check_input_var(ARMLINUX_ARCH_ABI DEFAULT ${ARM_TARGET_ARCH_ABI} LIST "armv8" "armv7" "armv7hf") + +if(ARMLINUX_ARCH_ABI STREQUAL "armv8") set(CMAKE_SYSTEM_PROCESSOR aarch64) set(CMAKE_C_COMPILER "aarch64-linux-gnu-gcc") set(CMAKE_CXX_COMPILER "aarch64-linux-gnu-g++") - - set(CMAKE_CXX_FLAGS "-march=armv8-a ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS "-march=armv8-a ${CMAKE_C_FLAGS}") - message(STATUS "NEON is enabled on arm64-v8a") -endif() - -if(ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a" - OR ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a-hf") - message(FATAL_ERROR "Not supported building arm linux arm-v7 yet") endif() -# TODO(TJ): make sure v7 works -if(ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a") +if(ARMLINUX_ARCH_ABI STREQUAL "armv7") set(CMAKE_SYSTEM_PROCESSOR arm) set(CMAKE_C_COMPILER "arm-linux-gnueabi-gcc") set(CMAKE_CXX_COMPILER "arm-linux-gnueabi-g++") - - 
set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}") - message(STATUS "NEON is enabled on arm-v7a with softfp") endif() -if(ARM_TARGET_ARCH_ABI STREQUAL "armeabi-v7a-hf") +if(ARMLINUX_ARCH_ABI STREQUAL "armv7hf") set(CMAKE_SYSTEM_PROCESSOR arm) set(CMAKE_C_COMPILER "arm-linux-gnueabihf-gcc") set(CMAKE_CXX_COMPILER "arm-linux-gnueabihf-g++") - - set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" ) - message(STATUS "NEON is enabled on arm-v7a with hard float") endif() diff --git a/cmake/cross_compiling/findar.cmake b/cmake/cross_compiling/findar.cmake new file mode 100644 index 00000000000..bcb0dc70fd8 --- /dev/null +++ b/cmake/cross_compiling/findar.cmake @@ -0,0 +1,33 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT ARM_TARGET_LANG STREQUAL "clang") + # only clang need find ar tool + return() +endif() + +if(NOT EXISTS "${CMAKE_CXX_COMPILER}") + message(ERROR "Can not find CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}") +endif() + +get_filename_component(AR_PATH ${CMAKE_CXX_COMPILER} PATH) + +find_file(AR_TOOL NAMES llvm-ar PATHS ${AR_PATH}) + +if(NOT AR_TOOL) + message(ERROR "Failed to find AR_TOOL in ${AR_PATH}") +else() + set(CMAKE_AR ${AR_TOOL}) + message(STATUS "Found CMAKE_AR : " ${CMAKE_AR}) +endif() diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake new file mode 100644 index 00000000000..2e7ab512b98 --- /dev/null +++ b/cmake/cross_compiling/postproject.cmake @@ -0,0 +1,56 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if (ANDROID) + include(cross_compiling/findar) +endif() + +if(ARMLINUX) + if(ARMLINUX_ARCH_ABI STREQUAL "armv8") + set(CMAKE_CXX_FLAGS "-march=armv8-a ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "-march=armv8-a ${CMAKE_C_FLAGS}") + message(STATUS "NEON is enabled on arm64-v8a") + endif() + + if(ARMLINUX_ARCH_ABI STREQUAL "armv7") + set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}") + message(STATUS "NEON is enabled on arm-v7a with softfp") + endif() + + if(ARMLINUX_ARCH_ABI STREQUAL "armv7hf") + set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" ) + message(STATUS "NEON is enabled on arm-v7a with hard float") + endif() +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + +if(LITE_WITH_OPENMP) + find_package(OpenMP REQUIRED) + if(OPENMP_FOUND OR OpenMP_CXX_FOUND) + add_definitions(-DARM_WITH_OMP) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + message(STATUS "Found OpenMP ${OpenMP_VERSION} ${OpenMP_CXX_VERSION}") + message(STATUS "OpenMP C flags: ${OpenMP_C_FLAGS}") + message(STATUS "OpenMP CXX flags: ${OpenMP_CXX_FLAGS}") + message(STATUS "OpenMP OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}") + message(STATUS "OpenMP OpenMP_CXX_LIBRARIES: ${OpenMP_CXX_LIBRARIES}") + else() + message(FATAL_ERROR "Could not find OpenMP!") + endif() +endif() + diff --git a/cmake/cross_compiling/preproject.cmake b/cmake/cross_compiling/preproject.cmake new file mode 100644 index 00000000000..3aa636b59ea --- /dev/null +++ b/cmake/cross_compiling/preproject.cmake @@ -0,0 +1,58 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
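One note on the OpenMP detection above: it splices OpenMP_C_FLAGS/OpenMP_CXX_FLAGS into the global flag variables. Since CMake 3.9, find_package(OpenMP) also defines imported targets, so a per-target wiring is possible; a minimal sketch, with my_lite_kernels as a placeholder target name:

find_package(OpenMP REQUIRED)
if(TARGET OpenMP::OpenMP_CXX)
  # propagates the compile flag and the runtime library to this one target only
  target_link_libraries(my_lite_kernels OpenMP::OpenMP_CXX)
endif()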
+ +if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + return() +endif() + +cmake_minimum_required(VERSION 3.10) + +# define check function +function(check_input_var VAR_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs DEFAULT LIST) + cmake_parse_arguments(check_input_var "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(var_out "") + if(NOT DEFINED ${VAR_NAME}) + set(var_out ${check_input_var_DEFAULT}) + else() + set(var_out ${${VAR_NAME}}) + endif() + + if(NOT var_out IN_LIST check_input_var_LIST) + message(FATAL_ERROR "${VAR_NAME}:${var_out} must be in one of ${check_input_var_LIST}") + endif() + set(${VAR_NAME} ${var_out} PARENT_SCOPE) +endfunction(check_input_var) + +check_input_var(ARM_TARGET_OS DEFAULT "android" LIST "android" "armlinux") +check_input_var(ARM_TARGET_ARCH_ABI DEFAULT "armv8" LIST "armv8" "armv7" "armv7hf" "arm64-v8a" "armeabi-v7a") +check_input_var(ARM_TARGET_LANG DEFAULT "gcc" LIST "gcc" "clang") +check_input_var(ARM_TARGET_LIB_TYPE DEFAULT "static" LIST "static" "shared") +message(STATUS "Lite ARM Compile ${ARM_TARGET_OS} with ${ARM_TARGET_ARCH_ABI} ${ARM_TARGET_LANG}") + +include(cross_compiling/host) +include(cross_compiling/armlinux) +include(cross_compiling/android) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Default use Release in android" FORCE) +endif() + +if(NOT THIRD_PARTY_BUILD_TYPE) + set(THIRD_PARTY_BUILD_TYPE "MinSizeRel" CACHE STRING "Default use MinSizeRel in android" FORCE) +endif() + diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index b9c72c046e7..735846db1db 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -141,10 +141,12 @@ endfunction() message(STATUS "CUDA detected: " ${CUDA_VERSION}) if (${CUDA_VERSION} LESS 7.0) set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) + add_definitions("-DPADDLE_CUDA_BINVER=\"60\"") elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"70\"") elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") @@ -152,16 +154,18 @@ elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x # CUDA 8 may complain that sm_20 is no longer supported. Suppress the # warning for now. list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") + add_definitions("-DPADDLE_CUDA_BINVER=\"80\"") elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"90\"") elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + add_definitions("-DPADDLE_CUDA_BINVER=\"100\"") endif() -add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"") include_directories(${CUDA_INCLUDE_DIRS}) if(NOT WITH_DSO) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 98466d44fc0..fff1980637d 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -96,7 +96,7 @@ if(CUDNN_FOUND) endif() message(STATUS "Current cuDNN header is ${CUDNN_INCLUDE_DIR}/cudnn.h. " - "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. 
") + "Current cuDNN version is v${CUDNN_MAJOR_VERSION}. ") endif() endif() diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake index 05e63bfe3fe..a58b8c68d77 100644 --- a/cmake/external/dgc.cmake +++ b/cmake/external/dgc.cmake @@ -38,3 +38,5 @@ ADD_LIBRARY(dgc STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES}) ADD_DEPENDENCIES(dgc extern_dgc) +LIST(APPEND external_project_dependencies dgc) + diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index dfe81d8f9bf..72441160f89 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -12,13 +12,6 @@ if(NOT WITH_FAST_MATH) add_definitions(-DEIGEN_FAST_MATH=0) endif() -if(WIN32) - set(EIGEN_GIT_REPOSITORY https://github.com/wopeizl/eigen-git-mirror) - set(EIGEN_GIT_TAG support_cuda9_win) -else() - set(EIGEN_GIT_REPOSITORY https://github.com/eigenteam/eigen-git-mirror) - set(EIGEN_GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c) -endif() if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 @@ -36,10 +29,10 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "${EIGEN_GIT_REPOSITORY}" + GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG ${EIGEN_GIT_TAG} + GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c PREFIX ${EIGEN_SOURCE_DIR} DOWNLOAD_NAME "eigen" UPDATE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 343b7544788..256e1bbebf0 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -18,13 +18,32 @@ SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags) SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." 
FORCE) IF(WIN32) - set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) + set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ELSE(WIN32) set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) +SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}") + +if(ANDROID) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} + "-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}" + "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}" + "-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}" + "-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}" + "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}" + "-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}" ) +endif() + ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} @@ -32,24 +51,24 @@ ExternalProject_Add( GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DBUILD_STATIC_LIBS=ON + CMAKE_ARGS -DBUILD_STATIC_LIBS=ON -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${OPTIONAL_ARGS} ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +IF(WIN32) + IF(NOT EXISTS "${GFLAGS_INSTALL_DIR}/lib/libgflags.lib") + add_custom_command(TARGET extern_gflags POST_BUILD + COMMAND cmake -E copy ${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib ${GFLAGS_INSTALL_DIR}/lib/libgflags.lib + ) + ENDIF() +ENDIF(WIN32) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) ADD_DEPENDENCIES(gflags extern_gflags) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index ac6294048cf..80abc2350ca 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -19,7 +19,7 @@ SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE) IF(WIN32) - SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/glog.lib" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE) SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") ELSE(WIN32) SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." 
FORCE) @@ -31,6 +31,25 @@ INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) SET(GLOG_REPOSITORY "https://github.com/google/glog.git") SET(GLOG_TAG "v0.3.5") +SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}") + +if(ANDROID) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} + "-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}" + "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}" + "-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}" + "-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}" + "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}" + "-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}") +endif() + ExternalProject_Add( extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} @@ -39,14 +58,7 @@ ExternalProject_Add( GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + CMAKE_ARGS ${OPTIONAL_ARGS} -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE=ON @@ -60,6 +72,13 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +IF(WIN32) + IF(NOT EXISTS "${GLOG_INSTALL_DIR}/lib/libglog.lib") + add_custom_command(TARGET extern_glog POST_BUILD + COMMAND cmake -E copy ${GLOG_INSTALL_DIR}/lib/glog.lib ${GLOG_INSTALL_DIR}/lib/libglog.lib + ) + ENDIF() +ENDIF(WIN32) ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index e459526583b..57fd6812879 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -43,6 +43,26 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) SET(GTEST_DEPENDS ${MKLML_PROJECT}) ENDIF() + SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}") + + if(ANDROID) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} + "-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}" + "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}" + "-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}" + "-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}" + "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}" + "-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}" + ) + endif() + ExternalProject_Add( extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} @@ -51,14 +71,7 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - 
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + CMAKE_ARGS ${OPTIONAL_ARGS} -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_GMOCK=ON diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 066811296e1..142fce816de 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -38,7 +38,6 @@ IF(WIN32) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) - SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) ELSE() #TODO(intel-huying): diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index cdcbdd46a8d..d00195b08d2 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs) INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") -SET(NGRAPH_GIT_TAG "4ec94acc11084a5d53418f565529310fa584899a") +SET(NGRAPH_GIT_TAG "127e0dedfaac8c6f2b148cc03bf5f67ac5fbe6fe") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) diff --git a/cmake/external/opencl-clhpp.cmake b/cmake/external/opencl-clhpp.cmake new file mode 100644 index 00000000000..ea724860d9b --- /dev/null +++ b/cmake/external/opencl-clhpp.cmake @@ -0,0 +1,36 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(OPENCL_CLHPP_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-clhpp) +SET(OPENCL_CLHPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/opencl-clhpp) +SET(OPENCL_CLHPP_INCLUDE_DIR "${OPENCL_CLHPP_INSTALL_DIR}" CACHE PATH "opencl-clhpp include directory." FORCE) + +INCLUDE_DIRECTORIES(${OPENCL_CLHPP_INCLUDE_DIR}) + +ExternalProject_Add( + opencl_clhpp + GIT_REPOSITORY "https://github.com/KhronosGroup/OpenCL-CLHPP.git" + GIT_TAG "v2.0.10" + PREFIX "${OPENCL_CLHPP_SRCS_DIR}" + CMAKE_ARGS -DBUILD_DOCS=OFF + -DBUILD_EXAMPLES=OFF + -DBUILD_TESTS=OFF + -DCMAKE_INSTALL_PREFIX=${OPENCL_CLHPP_INSTALL_DIR} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${OPENCL_CLHPP_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) + +ADD_DEPENDENCIES(opencl_clhpp opencl_headers) diff --git a/cmake/external/opencl-headers.cmake b/cmake/external/opencl-headers.cmake new file mode 100644 index 00000000000..68c9c5251cf --- /dev/null +++ b/cmake/external/opencl-headers.cmake @@ -0,0 +1,33 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
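At this point the same compiler and flag arguments plus the six CMAKE_ANDROID_* toolchain arguments have been repeated verbatim for gflags, glog, and gtest, and a variant appears again in protobuf.cmake below. A sketch of a helper that could centralize the block follows; the function name is an assumption, nothing like it exists in this patch:

function(lite_cross_compile_args out_var)
  set(args
    "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
    "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
    "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
    "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}")
  if(ANDROID)
    # forward the toolchain so ExternalProject children cross-compile identically
    list(APPEND args
      "-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}"
      "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}"
      "-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}"
      "-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}"
      "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}"
      "-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}")
  endif()
  set(${out_var} ${args} PARENT_SCOPE)
endfunction()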
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(OPENCL_HEADERS_SRCS_DIR ${THIRD_PARTY_PATH}/opencl-headers) +SET(OPENCL_HEADERS_INCLUDE_DIR "${OPENCL_HEADERS_SRCS_DIR}/src/opencl_headers" CACHE PATH "opencl-headers include directory." FORCE) + +INCLUDE_DIRECTORIES(${OPENCL_HEADERS_INCLUDE_DIR}) + +ExternalProject_Add( + opencl_headers + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/KhronosGroup/OpenCL-Headers.git" + GIT_TAG "c5a4bbeabb10d8ed3d1c651b93aa31737bc473dd" + PREFIX ${OPENCL_HEADERS_SRCS_DIR} + DOWNLOAD_NAME "OpenCL-Headers" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 09eb437aede..6d2136223d3 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -142,7 +142,6 @@ IF (WIN32) ENDIF(WIN32) if (NOT "${PROTOBUF_ROOT}" STREQUAL "") - find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) @@ -178,12 +177,29 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}" PARENT_SCOPE) + SET(PROTOBUF_REPO "https://github.com/protocolbuffers/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") SET(OPTIONAL_CACHE_ARGS "") SET(OPTIONAL_ARGS "") + IF(BUILD_FOR_HOST) - SET(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF") - ELSE() SET(OPTIONAL_ARGS + "-DCMAKE_C_COMPILER=${HOST_C_COMPILER}" + "-DCMAKE_CXX_COMPILER=${HOST_CXX_COMPILER}" + "-Dprotobuf_WITH_ZLIB=OFF" + "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}") + SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") + ELSE() + # protobuf has a compile issue when the Android STL c++_static is used + SET(PROTOBUF_REPO "https://github.com/tensor-tang/protobuf.git") + SET(PROTOBUF_TAG "mobile") + SET(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF" + "-DCMAKE_SYSTEM_NAME=${CMAKE_SYSTEM_NAME}" + "-DCMAKE_SYSTEM_VERSION=${CMAKE_SYSTEM_VERSION}" + "-DCMAKE_ANDROID_ARCH_ABI=${CMAKE_ANDROID_ARCH_ABI}" + "-DCMAKE_ANDROID_NDK=${CMAKE_ANDROID_NDK}" + "-DCMAKE_ANDROID_STL_TYPE=${CMAKE_ANDROID_STL_TYPE}" + "-DCMAKE_ANDROID_NDK_TOOLCHAIN_VERSION=${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}" "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" @@ -191,25 +207,18 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}" "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" - "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" - "-Dprotobuf_WITH_ZLIB=ON" - "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" - ${EXTERNAL_OPTIONAL_ARGS}) - SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") + "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}") ENDIF() IF(WIN32) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") ENDIF() - SET(PROTOBUF_REPO
"https://github.com/protocolbuffers/protobuf.git") - SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") - ExternalProject_Add( ${TARGET_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PROTOBUF_SOURCES_DIR} UPDATE_COMMAND "" - DEPENDS zlib + #DEPENDS zlib GIT_REPOSITORY ${PROTOBUF_REPO} GIT_TAG ${PROTOBUF_TAG} CONFIGURE_COMMAND @@ -233,6 +242,13 @@ ENDFUNCTION() SET(PROTOBUF_VERSION 3.1.0) +IF(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + build_protobuf(protobuf_host TRUE) + LIST(APPEND external_project_dependencies protobuf_host) + SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_host_PROTOC_EXECUTABLE} + CACHE FILEPATH "protobuf executable." FORCE) +ENDIF() + IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) @@ -245,7 +261,12 @@ IF(NOT PROTOBUF_FOUND) SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} - CACHE FILEPATH "protobuf executable." FORCE) - PROMPT_PROTOBUF_LIB(extern_protobuf) + IF(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf) + ELSE() + SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} + CACHE FILEPATH "protobuf executable." FORCE) + PROMPT_PROTOBUF_LIB(extern_protobuf) + ENDIF() + ENDIF(NOT PROTOBUF_FOUND) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index b7159d14c11..0287e5cf2a8 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -29,9 +29,9 @@ INCLUDE(ExternalProject) SET(PSLIB_PROJECT "extern_pslib") IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) MESSAGE(STATUS "use pre defined download url") - SET(PSLIB_VER "0.1.1" CACHE STRING "" FORCE) - SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE) - SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/ps/${PSLIB_NAME}.tar.gz" CACHE STRING "" FORCE) + SET(PSLIB_VER "0.1.0" CACHE STRING "" FORCE) + SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE) + SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/${PSLIB_NAME}.tar.gz" CACHE STRING "" FORCE) ENDIF() MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}") SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib") diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index 3fb6b49f472..1e01057aa60 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -53,7 +53,12 @@ ExternalProject_Add( -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) IF(WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib") + IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") + add_custom_command(TARGET extern_snappy POST_BUILD + COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib + ) + ENDIF() + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") else(WIN32) set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") endif (WIN32) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 5fc46ae8eb8..012283c6ea7 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -64,7 +64,12 @@ ExternalProject_Add( -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) IF(WIN32) - SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}") + add_custom_command(TARGET extern_warpctc POST_BUILD + COMMAND cmake -E copy 
${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + ENDIF() + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) else(WIN32) SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 262d47f6fd4..23b1e021086 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -56,7 +56,12 @@ else() endif() if (WIN32) - set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") + IF(NOT EXISTS "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") + add_custom_command(TARGET extern_xxhash POST_BUILD + COMMAND cmake -E copy ${XXHASH_INSTALL_DIR}/lib/xxhash.lib ${XXHASH_INSTALL_DIR}/lib/libxxhash.lib + ) + ENDIF() + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") else() set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") endif () diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 58881ac2206..5569fefe992 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -44,7 +44,12 @@ ExternalProject_Add( -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) IF(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) + IF(NOT EXISTS "${ZLIB_INSTALL_DIR}/lib/libz.lib") + add_custom_command(TARGET extern_zlib POST_BUILD + COMMAND cmake -E copy ${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib ${ZLIB_INSTALL_DIR}/lib/libz.lib + ) + ENDIF() + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.lib" CACHE FILEPATH "zlib library." FORCE) ELSE(WIN32) SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." 
FORCE) ENDIF(WIN32) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 3e3a5ba66c8..a028dcbd6be 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -93,7 +93,10 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) if(NOT APPLE) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) - set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") + set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl") + if (NOT ANDROID) + set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -lrt") + endif() endif(NOT APPLE) set_property(GLOBAL PROPERTY FLUID_MODULES "") @@ -363,10 +366,11 @@ function(cc_binary TARGET_NAME) target_link_libraries(${TARGET_NAME} ${os_dependency_modules}) endfunction(cc_binary) -function(cc_test_build TARGET_NAME) +function(cc_test TARGET_NAME) if(WITH_TESTING) + set(options SERIAL) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) if(WIN32) @@ -379,18 +383,12 @@ function(cc_test_build TARGET_NAME) target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) common_link(${TARGET_NAME}) - endif() -endfunction() - -function(cc_test_run TARGET_NAME) - if(WITH_TESTING) - set(oneValueArgs "") - set(multiValueArgs COMMAND ARGS) - cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND ${cc_test_COMMAND} - ARGS ${cc_test_ARGS} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + COMMAND ${TARGET_NAME} ${cc_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if (${cc_test_SERIAL}) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + endif() set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G @@ -398,21 +396,46 @@ function(cc_test_run TARGET_NAME) # No unit test should exceed 10 minutes. 
     set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
   endif()
-endfunction()
+endfunction(cc_test)
 
-function(cc_test TARGET_NAME)
+# cc_test without default dependencies
+function(raw_cc_test TARGET_NAME)
   if(WITH_TESTING)
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cc_test_build(${TARGET_NAME}
-      SRCS ${cc_test_SRCS}
-      DEPS ${cc_test_DEPS})
-    cc_test_run(${TARGET_NAME}
-      COMMAND ${TARGET_NAME}
-      ARGS ${cc_test_ARGS})
+    add_executable(${TARGET_NAME} ${cc_test_SRCS})
+    if(WIN32)
+      if("${cc_test_DEPS};" MATCHES "python;")
+        list(REMOVE_ITEM cc_test_DEPS python)
+        target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES})
+      endif()
+    endif(WIN32)
+    get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} lite_gtest_main gtest gflags glog)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} lite_gtest_main gtest gflags glog)
+    common_link(${TARGET_NAME})
+    add_test(NAME ${TARGET_NAME}
+             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if (${cc_test_SERIAL})
+      set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+    endif()
+    # No unit test should exceed 10 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
   endif()
-endfunction(cc_test)
+endfunction(raw_cc_test)
+
+function(_lite_cc_test args)
+  if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+    message(STATUS "building lite raw test: ${args}")
+    raw_cc_test(${args} ${ARGN})
+  else()
+    message(STATUS "building lite heavy test: ${args}")
+    cc_test(${args} ${ARGN})
+  endif()
+endfunction()
 
 function(nv_library TARGET_NAME)
   if (WITH_GPU)
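The merged cc_test above builds, links, and registers a test in a single call, parsing SERIAL and ARGS through cmake_parse_arguments; raw_cc_test does the same but links lite_gtest_main instead of the fluid-heavy paddle_gtest_main/lod_tensor/memory stack, and _lite_cc_test picks between the two based on LITE_WITH_LIGHT_WEIGHT_FRAMEWORK. A minimal usage sketch; the target, source, and dependency names below are illustrative, not taken from this patch:

    # Usage sketch only (illustrative names).
    cc_test(concat_op_test
            SRCS concat_op_test.cc
            DEPS concat_op
            ARGS --gtest_filter=ConcatOpTest.*
            SERIAL)                     # registers the test with RUN_SERIAL 1

    _lite_cc_test(test_lite_kernels     # dispatches to raw_cc_test when
            SRCS lite_kernels_test.cc   # LITE_WITH_LIGHT_WEIGHT_FRAMEWORK is ON,
            DEPS lite_kernels)          # and to the full cc_test otherwise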
@@ -465,6 +488,7 @@ endfunction(nv_binary)
 
 function(nv_test TARGET_NAME)
   if (WITH_GPU AND WITH_TESTING)
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -474,6 +498,9 @@ function(nv_test TARGET_NAME)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     common_link(${TARGET_NAME})
     add_test(${TARGET_NAME} ${TARGET_NAME})
+    if (nv_test_SERIAL)
+      set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
+    endif()
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G
@@ -716,7 +743,7 @@ function(py_proto_compile TARGET_NAME)
   cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   set(py_srcs)
   protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
-  add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs} protobuf)
+  add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
 endfunction()
 
 function(py_test TARGET_NAME)
diff --git a/cmake/lite.cmake b/cmake/lite.cmake
new file mode 100644
index 00000000000..35d5f7e2871
--- /dev/null
+++ b/cmake/lite.cmake
@@ -0,0 +1,79 @@
+# Bundle several static libraries into one.
+function(bundle_static_library tgt_name bundled_tgt_name fake_target)
+  list(APPEND static_libs ${tgt_name})
+
+  function(_recursively_collect_dependencies input_target)
+    set(_input_link_libraries LINK_LIBRARIES)
+    get_target_property(_input_type ${input_target} TYPE)
+    if (${_input_type} STREQUAL "INTERFACE_LIBRARY")
+      set(_input_link_libraries INTERFACE_LINK_LIBRARIES)
+    endif()
+    get_target_property(public_dependencies ${input_target} ${_input_link_libraries})
+    foreach(dependency IN LISTS public_dependencies)
+      if(TARGET ${dependency})
+        get_target_property(alias ${dependency} ALIASED_TARGET)
+        if (TARGET ${alias})
+          set(dependency ${alias})
+        endif()
+        get_target_property(_type ${dependency} TYPE)
+        if (${_type} STREQUAL "STATIC_LIBRARY")
+          list(APPEND static_libs ${dependency})
+        endif()
+
+        get_property(library_already_added
+          GLOBAL PROPERTY _${tgt_name}_static_bundle_${dependency})
+        if (NOT library_already_added)
+          set_property(GLOBAL PROPERTY _${tgt_name}_static_bundle_${dependency} ON)
+          _recursively_collect_dependencies(${dependency})
+        endif()
+      endif()
+    endforeach()
+    set(static_libs ${static_libs} PARENT_SCOPE)
+  endfunction()
+
+  _recursively_collect_dependencies(${tgt_name})
+
+  list(REMOVE_DUPLICATES static_libs)
+
+  set(bundled_tgt_full_name
+    ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX})
+
+  message(STATUS "+++++ bundled_tgt_full_name: ${bundled_tgt_full_name}")
+
+  file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
+    "CREATE ${bundled_tgt_full_name}\n" )
+
+  foreach(tgt IN LISTS static_libs)
+    file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
+      "ADDLIB $<TARGET_FILE:${tgt}>\n")
+  endforeach()
+
+  file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in "SAVE\n")
+  file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in "END\n")
+
+  file(GENERATE
+    OUTPUT ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar
+    INPUT ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in)
+
+  set(ar_tool ${CMAKE_AR})
+  if (CMAKE_INTERPROCEDURAL_OPTIMIZATION)
+    set(ar_tool ${CMAKE_CXX_COMPILER_AR})
+  endif()
+
+  add_custom_command(
+    COMMAND ${ar_tool} -M < ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar
+    OUTPUT ${bundled_tgt_full_name}
+    COMMENT "Bundling ${bundled_tgt_name}"
+    VERBATIM)
+
+  add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_full_name})
+  add_dependencies(${fake_target} ${tgt_name})
+
+  add_library(${bundled_tgt_name} STATIC IMPORTED)
+  set_target_properties(${bundled_tgt_name}
+    PROPERTIES
+    IMPORTED_LOCATION ${bundled_tgt_full_name}
+    INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:${tgt_name},INTERFACE_INCLUDE_DIRECTORIES>)
+  add_dependencies(${bundled_tgt_name} ${fake_target})
+
+endfunction()
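bundle_static_library collects the transitive STATIC_LIBRARY dependencies of tgt_name (resolving ALIAS targets and deduplicating through a global property), writes them into an ar MRI script (CREATE/ADDLIB/SAVE/END), and runs `ar -M` on that script in one custom command, so the whole dependency closure lands in a single archive exposed as an IMPORTED target; under CMAKE_INTERPROCEDURAL_OPTIMIZATION it switches to CMAKE_CXX_COMPILER_AR so LTO object files stay linkable. A minimal usage sketch, assuming an existing static target; all names here are illustrative, not taken from this patch:

    # Usage sketch only (illustrative names).
    bundle_static_library(paddle_lite_api     # existing target whose static deps get bundled
                          paddle_api_bundled  # IMPORTED target exposing the combined archive
                          bundle_lite_api)    # helper target that actually runs ar -M

    add_executable(demo demo.cc)
    add_dependencies(demo bundle_lite_api)    # make sure the bundled archive is built first
    target_link_libraries(demo paddle_api_bundled)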
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 134c894392a..c17e718f427 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -110,7 +110,7 @@ function(op_library TARGET)
     # Define operators that don't need pybind here.
     foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op"
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
-"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "deformable_conv_op" "dgc_op")
+"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op")
       if ("${TARGET}" STREQUAL "${manual_pybind_op}")
         set(pybind_flag 1)
       endif()
diff --git a/cmake/version.cmake b/cmake/version.cmake
index dd57d4ab996..8bcc4ffe725 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -3,6 +3,9 @@ set(PADDLE_VERSION $ENV{PADDLE_VERSION})
 set(tmp_version "HEAD")
 set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
 set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
+# set(LATEST_PADDLE_VERSION "latest")
+set(LATEST_PADDLE_VERSION "0.0.0")
+
 while ("${PADDLE_VERSION}" STREQUAL "")
   # Check current branch name
   execute_process(
@@ -23,8 +26,8 @@ while ("${PADDLE_VERSION}" STREQUAL "")
   if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
     # Check the tag is a correct version
     if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
-      # if no tag was found, set PADDLE_VERSION to 0.0.0 to represent latest
-      set(PADDLE_VERSION "0.0.0")
+      # if no tag was found, set PADDLE_VERSION to "latest"
+      set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
     elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
       string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
     else() # otherwise, get the previous git tag name.
@@ -42,19 +45,19 @@ while ("${PADDLE_VERSION}" STREQUAL "")
       if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
         string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME})
       else()
-        set(PADDLE_VERSION "0.0.0")
+        set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
       endif()
     else()
-      # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest
-      set(PADDLE_VERSION "0.0.0")
+      # otherwise, we always set PADDLE_VERSION to "latest"
+      set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
     endif()
   endif()
 else()
-  set(PADDLE_VERSION "0.0.0")
+  set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
   message(WARNING "Cannot add paddle version from git tag")
 endif()
 else()
-  set(PADDLE_VERSION "0.0.0")
+  set(PADDLE_VERSION "${LATEST_PADDLE_VERSION}")
   message(WARNING "Cannot add paddle version for wrong git branch result")
 endif()
 endwhile()
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index c0c04d47595..7eefaa12dfc 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,4 +1,7 @@
-add_subdirectory(scripts)
-add_subdirectory(testing)
-set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
+# to limit the mobile dependencies
+if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+  add_subdirectory(scripts)
+  add_subdirectory(testing)
+  set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
+endif()
 add_subdirectory(fluid)
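With the guard above, a light-weight (mobile) configuration skips the host-only subtrees (build scripts, the gtest helper in paddle/testing, and the Python test directory), and only paddle/fluid is descended into unconditionally. The same pattern applies to any further host-only component; a minimal sketch, where `tools` is a purely illustrative directory name, not one added by this patch:

    # Sketch only: `tools` is illustrative.
    if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
      add_subdirectory(tools)
    endif()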
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 052816abbb6..fd9567dd651 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -1,14 +1,14 @@
 paddle.fluid.Program.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.Program.block (ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None), ('document', '86cd9499e226be661a3d686260ee1150'))
-paddle.fluid.Program.clone (ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)), ('document', '17d059efb24c81dde6166c6b0b93e9d0'))
-paddle.fluid.Program.current_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd601c7719e425e3d9cf862ea4ad194ca'))
-paddle.fluid.Program.global_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd64ea1dc96e9f674499ea3006d470aa4'))
-paddle.fluid.Program.list_vars (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '32c14b0f12baae4b352200fa09b5e789'))
+paddle.fluid.Program.block (ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None), ('document', 'af5346376065ff4cf6832a8ac0ae0945'))
+paddle.fluid.Program.clone (ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ebb7765b2962bd2be041d19720e49d0f'))
+paddle.fluid.Program.current_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5e162d3bf8dd625703463d9e4be36adb'))
+paddle.fluid.Program.global_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'cfb7e05a002b2e64650778cabde7301c'))
+paddle.fluid.Program.list_vars (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '1c8647b14fe57c7824b1c9562394dd3c'))
 paddle.fluid.Program.parse_from_string (ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None), ('document', 'b6a7ffb239a30bf2ce58cfaca8d8b8d5'))
-paddle.fluid.Program.to_string (ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)), ('document', '89acca639baf00f3ad08b9d827e81706'))
-paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'ba609cb02e4e55e8d626723567ef1778'))
-paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '06a5a8f649dfb8496c1f683c909db375'))
-paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', '78fb5c7f70ef76bcf4a1862c3f6b8191'))
+paddle.fluid.Program.to_string (ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)), ('document', 'faec17e5a04af28e3776160e34504d15'))
+paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '99e5d53d92d82797093332719c9e3ccd'))
+paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5430f54ab4895f9f47db6bebbaf71659'))
+paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ae5f806f082cfaeaa5194cacc253a5e4'))
 paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '61660461e1f44e0480ca22fa8a482c41'))
 paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7f3068b82fc427bfa04b1af953610992'))
 paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '8b674e9a7ac7944c27fd853b675c2cb2'))
@@ -22,41 +22,40 @@ paddle.fluid.Executor.train_from_dataset (ArgSpec(args=['self', 'program', 'data
 paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f65788d9ead293ada47551339df12203'))
 paddle.fluid.scope_guard (ArgSpec(args=['scope'],
varargs=None, keywords=None, defaults=None), ('document', '6e19f92e2f185320a3a86b77e85eb3b3')) paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'b1951949c6d21698290aa8ac69afee32')) -paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'c89fc350f975ef827f5448d68af388cf')) -paddle.fluid.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '90a40b80e0106f69262cc08b861c3e39')) -paddle.fluid.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '0e47f020304e2b824e87ff03475c17cd')) -paddle.fluid.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '418c7e8b268e9be4104f2809e654c2f7')) -paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)), ('document', '2348247f684bfd5bb9466470f35be064')) -paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd38c5b8b2b2e0bb19bcf1b581a80a7e4')) +paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680')) +paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8')) +paddle.fluid.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'd796fc0c8d51503b556fcf6dc15c4f0c')) +paddle.fluid.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '736330e31a7a54abccc0c7fd9119d9ff')) +paddle.fluid.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '06ce55338dfe96311ad1078235ab3bf4')) +paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)), ('document', 'eda17d0f1639bc6ca215cecf87f588a4')) +paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ac4114d3df16264f1946deb3a8434a6f')) paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], 
varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.ParallelExecutor.drop_local_exe_scopes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '77c739744ea5708b80fb1b37cc89db40')) +paddle.fluid.ParallelExecutor.drop_local_exe_scopes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '80d857dc626612e2b2460d0154551e95')) paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '33ce6ec50f8eeb05d340e6b114b026fd')) paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', 'b82ea20e2dc5ff2372e0643169ca47ff')) paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '74dc6d23185d90a7a50fbac19f5b65fb')) paddle.fluid.DataFeedDesc.__init__ (ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '75283b5f03ec7b6f74bfca9881a37428')) -paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '68df53d3ea0f24063bf7689e82c2b82e')) -paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'd5a78553cd94fe64148399797055d8ad')) -paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', '88d229ea9f892ce8d2922cf028c8bb3a')) +paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '4294493e31c4bc9fc4bd48753044235f')) +paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8d9f44601e0a99dd431f14fd9250cd21')) +paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'eb894b464bbcd1b4bc8038398954f766')) +paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', '415c56600ce4e198c071cad01409a690')) paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', '0e17773521634ef798fddd7d2ea3ef96')) paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8')) -paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.ExecutionStrategy) -> None -paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None 
-paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None -paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy) -> None -paddle.fluid.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b')) -paddle.fluid.io.save_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '869104f47e6fd21d897c3fcc426aa942')) -paddle.fluid.io.save_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07ffd5351b30cf47172ccfd61bd0de6f')) +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None +paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None +paddle.fluid.io.save_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b55d6193a1d4198d45b013fc5779e1f2')) +paddle.fluid.io.save_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '3a7a99abac3e1bf898871fe609354218')) paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9141bb5f32caf7975eb3fd88c8a1b2da')) -paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '1bb9454cf09d71f190bb51550c5a3ac9')) -paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '944291120d37bdb037a689d2c86d0a6e')) +paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '0a5308f496632ab1ec3ba1f1377e6f95')) +paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '41779819cef32f2246e83aebc5a002e2')) paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '28df5bfe26ca7a077f91156abb0fe6d2')) -paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment', 'program_only'], varargs=None, keywords=None, defaults=(None, None, None, True, False)), ('document', 'fc82bfd137a9b1ab8ebd1651bd35b6e5')) -paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 
'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '2f54d7c206b62f8c10f4f9d78c731cfd')) -paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable', 'return_list'], varargs=None, keywords=None, defaults=(None, None, True, True, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)), ('document', 'af82e1b5fe5764029905a191b987f63d')) +paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '648f64d8fd81572eef34a69e533459ef')) +paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable'], varargs=None, keywords=None, defaults=(True, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.io.PyReader.decorate_batch_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', '4a072de39998ee4e0de33fcec11325a6')) paddle.fluid.io.PyReader.decorate_sample_generator (ArgSpec(args=['self', 'sample_generator', 'batch_size', 'drop_last', 'places'], varargs=None, keywords=None, defaults=(True, None)), ('document', '3db4b24d33fe4f711e303f9673dc5c6a')) paddle.fluid.io.PyReader.decorate_sample_list_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', '94adc0fb71c4b2ae6c3c74886c9cb898')) @@ -69,137 +68,137 @@ paddle.fluid.initializer.TruncatedNormalInitializer.__init__ (ArgSpec(args=['sel paddle.fluid.initializer.XavierInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.initializer.BilinearInitializer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd389912dc079cbef432335a00017cec0')) paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)), ('document', '53c757bed9345f2ad3361902531e7cf5')) -paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '2c6748c1dd1d85f800462869ea7a747f')) -paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '280b581f5a77e746e47decbc57a7b30a')) +paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee')) +paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29')) paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', 
'424e898365195e3ccbc2e7dc8b63605e')) -paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '6f9f96d2a1517cd1affebc960c3526f7')) -paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', '8e35ca26adbe44eb631d71045c8d64d5')) +paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a')) +paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6')) paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d')) -paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3')) -paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '33974b9bfa69f2f1eb85e6f956dff04e')) -paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '34f96be41684b0959897a9e735997e20')) +paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '4ec4845fd7d991bcac822f8b0dfc101f')) +paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', 'e0e2439f7af069b57badca18a6ba60b8')) +paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '7c49ef4bbf0adfd4b9a1d98e2e5f3fea')) paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', '462ddf2435e3392334e0c05ae57a01c4')) 
paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', 'cefab7c23ee5582727e8b22dffbafac8')) paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', '535f1f6213dd7ca0fe5ed7cb4718c0e3')) -paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6263dfdeb6c670fa0922c9cbc8fb1bf4')) +paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '30add751a0f99347a6257634c03ff254')) paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'f273bb26833ee88b349c4b8083e1dc67')) -paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)), ('document', '5aa25d023acea1fb49a0de56be86990b')) -paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)), ('document', '3d8e8f3e0e1cf520156be37605e83ccd')) +paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ee152a7ba3036e7b9ede9184545179b4')) +paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)), ('document', 'b6543768e1afaa2ecb869709d6e9c7e2')) paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '8ca6121acd6d23cd8806a93f493c2e17')) -paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', 'd2990494eaf531fb584321b7edfb5104')) -paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test', 'pad_value'], varargs=None, keywords=None, defaults=(False, 0.0)), ('document', 'e90a93251c52dc4e6fb34fb3991b3f82')) +paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '37042620f9bd3a2da6e5d3138b2f724b')) +paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'a194fb80614023f543df3949fbd0d0b8')) paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '19ef6f9cdd27feac8a1ae060f19c10b4')) -paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', 
'cee673c79e3ff4582656a24e04f841e5')) +paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', '59b1c6bf2f0fa9dc649c85fef3a3b2ea')) paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'bbd84e855e660cd1084bb71a2fd0cdaa')) paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '043de7333b79ee0ac55053c14ed81625')) paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '859b887174d06f361658f69cb7c06d95')) -paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '55db6ae7275fb9678a6814aebab81a9c')) -paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '9cf79315d3423dddba0404e8f85a89b8')) -paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', '2460b30fb87037555208fa8ac6fc1787')) -paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '83e08f21af41ac8bac37aeab1f86fdd0')) +paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '120f4323a3d7ed9c0916f15a59f0e497')) +paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '581f9f99cd7f4b0cab9e0aad5fa0ea24')) +paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', 'e45e09e65a2658e07cad987222f0d9ab')) +paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 
'b0b8d53821716cd50c42e09b593f3feb')) paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '03993955ab1e6d3044c44e6f17fc85e9')) paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', 'ec113c6a3686ac94f8fccd1a7953d445')) -paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', 'e91c4b68cc4d8e9f7787b76032a85e75')) -paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ecf8d24cf4fd5c035406ee46afccfa0')) +paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '79c375214fa427faac504043d162dae9')) +paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d2611f84ab364c5da545e6a82f1770a')) paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6a1adf3067b20f6e4bcb354d71c19184')) paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd12803c903c99aa36ec03aaac5f0cc5b')) -paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', 'fe126c58e4339410e875ab1eba246d21')) -paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'dd5f06fb7cf39ca06cbab4abd03e6893')) -paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'a3024789eba11a70c2ef27c358173400')) -paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '10023caec4d7f78c3b901f023a1feaa7')) -paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '1a1c91625ce3c32646f69ca10d4d1da7')) -paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'b386471f0476c80c61d8c8672278063d')) +paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', '027723966f3ef0d7bc598f22287a96cc')) +paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'b69998ce3ff4980fb21da0df05565f1b')) +paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 
'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd4d80dd98a1a5839f41eeb3a0f85f370')) +paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '66a622db727551761ce4eb73eaa7f6a4')) +paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd50ac552b5d131468ed466d08bb2d38c')) +paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'fcd8301a0ce15f219c7a4bcd0c1e8eca')) paddle.fluid.layers.reduce_all (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '646ca4d4a2cc16084f59de44b6927eca')) paddle.fluid.layers.reduce_any (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'f36661060aeeaf6c6b1331e41b3726fa')) paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3')) paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce')) paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6')) paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', 'f1dd22f7351f7f9853212958e0d8aa7a')) -paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '59b28903ce8fb6a7e3861ff355592eb4')) +paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6')) paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2bc3a59efa9d52b628a6255422d9f0e8')) -paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', 'f2c252aa2f83f8e503ffaf79668eaa28')) +paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571')) paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '35c6a241bcc1a1fc89508860d82ad62b')) -paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'aa27ca4405e70c6a733cb9806a76af30')) -paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2a1e9ea041ff4d6a9948bb8d03b743ea')) -paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 
'4aa9df890b47eb67d5442f04aaf9eeec')) -paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'f568714a876425004aca4ea2d4a27701')) -paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8e72db173d4c082e27cb11f31d8c9bfa')) -paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', '33134416fc27dd65a767e5f15116ee16')) +paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9')) +paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32')) +paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab')) +paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'a10ab9bf88d4a7e328882d411abb6fd1')) +paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a1feac48b843d679db82312dc85885f4')) +paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', '3ce01160ede80b1c26f776f8fef9340f')) paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '32b3c442da0f3df682b5fcac10468116')) -paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', '4521da36af223d5a95bb8f190b5c7add')) -paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', 'b83e7dfa81059b39bb137922dc914f50')) -paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '1270395ce97a4e1b556104abbb14f096')) +paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', '5db30b8a74e8c93687943a3e8d221da0')) +paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 
'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', '80641ee6810b1cdc3fd6e14fc89ecc9d')) +paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', 'b350b9a30a18e7efd7e1bb740eef6996')) paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e')) paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23')) paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', 'de6a906950bae9f3c245cb744d22b94e')) paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '419c3a24a83cc89219a029cf4092788b')) -paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '9461e67095a6fc5d568fb2ce8fef66ff')) +paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b')) paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax', 'axis'], varargs=None, keywords=None, defaults=(False, -100, True, False, -1)), ('document', '8b074f9c56b4233a2b65d03254eb309e')) paddle.fluid.layers.smooth_l1 (ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c6b175d253c55baf4b9c0eca9b1dda88')) paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None), ('document', '960fc799549c202da1e85d626cb2c962')) paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '67afefa80b6cc38801bd5b631fed8a4a')) paddle.fluid.layers.reshape (ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '323c019f257e55ddea4a824a362de62f')) -paddle.fluid.layers.squeeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '06588973f613e9dcd592724322864589')) -paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b9bd3129d36a70e7c4385df51ff71c62')) -paddle.fluid.layers.lod_reset (ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)), ('document', '395e6ba041ccfacfe1d534c3e107fd66')) +paddle.fluid.layers.squeeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3229d06517f794e86ca3da14c38b1465')) 
+paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbd62da391b1df984a1909d069a759b2'))
+paddle.fluid.layers.lod_reset (ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'f122194c562bd674f6ecdccf33785f99'))
 paddle.fluid.layers.lrn (ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)), ('document', '330241f0bc57e9d16973ec322a6aef71'))
 paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '2f189f8ef61f1c23779e1593b78755c0'))
 paddle.fluid.layers.pad_constant_like (ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '95aa1972983f30fe9b5a3713e523e20f'))
-paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '9060f4cab873c4ab2deed5211080698e'))
-paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'ceedc8c22752c623d6e1ea2e8df0f43f'))
+paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '70c113658102a11cc5d8e3d45145737a'))
+paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'c317aa595deb31649083c8faa91cdb97'))
 paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '3d8f4891c1d5e890a4e574371027dd35'))
 paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '7e8e4bf1f0f8612961ed113e8af8f0c5'))
 paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', 'f1bc5eb7198175d2b79197a681d98b43'))
 paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', '099b9f051e6247ae661e4a7b4fd3f89a'))
 paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', '746bf58fdb1bd475f8c5f996b05b0e52'))
 paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '9baf9288c862161ff850d45228047a5e'))
-paddle.fluid.layers.gather (ArgSpec(args=['input', 'index', 'overwrite'], varargs=None, keywords=None, defaults=(True,)), ('document', '3569a6002a96c7f6b5e5bcfdc402df13'))
-paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name', 'overwrite'], varargs=None, keywords=None, defaults=(None, True)), ('document', '69b22affd4a6326502af166f04c095ab'))
-paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '71df5136cf03b06c65027b692fe78f1a'))
+paddle.fluid.layers.gather (ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None), ('document', '01a198d6fff38d5f0d8180a40b228085'))
+paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65f8e9d8ddfd0b412f940579c4faa342'))
+paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '15b522457dfef103f0c20ca9d397678b'))
 paddle.fluid.layers.random_crop (ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c9ab9e460ef0a1823249935a30e82c66'))
-paddle.fluid.layers.mean_iou (ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None), ('document', 'e3b6630ba43cb13dfeeb1601cb64d671'))
+paddle.fluid.layers.mean_iou (ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None), ('document', '35cbbdfa585d027bb490707c95a176b9'))
 paddle.fluid.layers.relu (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bf1676268df8ef100b8ab01d51336b25'))
-paddle.fluid.layers.selu (ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'f93c61f5b0bf933cd425a64dca2c4fdd'))
-paddle.fluid.layers.log (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '056af2c0e6e22d94e8df7fc39677707f'))
+paddle.fluid.layers.selu (ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '9044c7fe667b76cb2d9264f2db11f417'))
+paddle.fluid.layers.log (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '98247c59d1c9b40af6730001b2aea73d'))
 paddle.fluid.layers.crop (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'ddf9837ee83e549119210a3d714d5f44'))
-paddle.fluid.layers.rank_loss (ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1098b7a70c7696cc7437d6d57b5d89ed'))
+paddle.fluid.layers.rank_loss (ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c542e39ac6add24a6bef6e79bf5617e2'))
 paddle.fluid.layers.margin_rank_loss (ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)), ('document', '99b3fee0daee04911d2bee8871b26435'))
 paddle.fluid.layers.elu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '463258ee9f8b60760eb1e26357cc9bfa'))
 paddle.fluid.layers.relu6 (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)), ('document', '6f367339caf6c7124bc262fe1475df70'))
-paddle.fluid.layers.pow (ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '35fa2b79b1ae6968d4a69788051c1d27'))
+paddle.fluid.layers.pow (ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'a5117c1eb84aca2ac0b0abab337a4799'))
 paddle.fluid.layers.stanh (ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None)), ('document', '959936a477efc6c1447a9c8bf8ce94bb'))
-paddle.fluid.layers.hard_sigmoid (ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None)), ('document', '607d79ca873bee40eed1c79a96611591'))
+paddle.fluid.layers.hard_sigmoid (ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None)), ('document', 'c82059b6fea1aa730f9aac911807b756'))
 paddle.fluid.layers.swish (ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'ef745e55a48763ee7b46b21a81dc7e84'))
-paddle.fluid.layers.prelu (ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '2da40e447716338affebfe058d05d9a9'))
+paddle.fluid.layers.prelu (ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'f6acef7ff7d887e49ff499fbb1dad4a9'))
 paddle.fluid.layers.brelu (ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None)), ('document', '3db337c195e156e6ef2b8b4a57113600'))
 paddle.fluid.layers.leaky_relu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None)), ('document', 'f878486c82b576938151daad0de995a0'))
-paddle.fluid.layers.soft_relu (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(40.0, None)), ('document', '3490ed5c9835ae039a82979daf3918a4'))
+paddle.fluid.layers.soft_relu (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(40.0, None)), ('document', '869adce548c342d6cc1bd88a948d83c9'))
 paddle.fluid.layers.flatten (ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'cb295c13cb957db85cd9609269d7784d'))
-paddle.fluid.layers.sequence_mask (ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)), ('document', '767cea598dee8e2b94f04110fa6b7e67'))
-paddle.fluid.layers.stack (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', 'e8d86c47e92bcb878ff8022b6f66cec2'))
-paddle.fluid.layers.pad2d (ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)), ('document', '3f3abdb795a5c2aad8c2312249551ce5'))
-paddle.fluid.layers.unstack (ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b0c4ca08d4eb295189e1b107c920d093'))
-paddle.fluid.layers.sequence_enumerate (ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', '2e49e97069beb57ee89d54ed088ae2da'))
+paddle.fluid.layers.sequence_mask (ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)), ('document', 'f0dd6eddd3bff015a3c05269d82fcbd8'))
+paddle.fluid.layers.stack (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '367cfbb642839beacb5d117e2d2b4041'))
+paddle.fluid.layers.pad2d (ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)), ('document', '7f4d46320cc077ca2e8db600c35f4030'))
+paddle.fluid.layers.unstack (ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)), ('document', '98eb9d633116efcfc6f90c114bd44fd6'))
+paddle.fluid.layers.sequence_enumerate (ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'f6028537085dc296103bbbd85fa7763d'))
 paddle.fluid.layers.expand (ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '117d3607d1ffa0571835bbaebc7857ff'))
-paddle.fluid.layers.sequence_concat (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b992616c1afbd6b0c2a897ac23036381'))
-paddle.fluid.layers.scale (ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)), ('document', '463e4713806e5adaa4d20a41e2218453'))
-paddle.fluid.layers.elementwise_add (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '5c0fb7298aec32525f96d451ae4c2851'))
-paddle.fluid.layers.elementwise_div (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '1da49b7cda887dd84087ef8c060fcf6a'))
-paddle.fluid.layers.elementwise_sub (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '992559c8327c61babd2ed25fc9047fbf'))
-paddle.fluid.layers.elementwise_mul (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '213db11a61dcb0f31159d343cc35e2f5'))
-paddle.fluid.layers.elementwise_max (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '409167a1409ec31b0d3a2f8852a7943f'))
-paddle.fluid.layers.elementwise_min (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '4e1322836eb69473d5606bfe346c5375'))
-paddle.fluid.layers.elementwise_pow (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'b9e7e9fa1ca28d8b6f07cc59eadb4a02'))
-paddle.fluid.layers.elementwise_mod (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '614984304f810f3ddae6b489ec01296b'))
-paddle.fluid.layers.elementwise_floordiv (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'a8c4b26d899246378e878f169582c7a4'))
-paddle.fluid.layers.uniform_random_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)), ('document', 'c8c7518358cfbb3822a019e6b5fbea52'))
-paddle.fluid.layers.gaussian_random (ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '8c78ccb77e291e4a0f0673d34823ce4b'))
+paddle.fluid.layers.sequence_concat (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a1d155dd1bf6e72a0a3e3e1519591d1'))
+paddle.fluid.layers.scale (ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)), ('document', '30190413b2fa442e7466d6cf2ce5ea07'))
+paddle.fluid.layers.elementwise_add (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '210ee7e597f429f836a21b298991ef85'))
+paddle.fluid.layers.elementwise_div (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '9ce91719cf4a05de9a817e9ff2387ee8'))
+paddle.fluid.layers.elementwise_sub (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'c66c50b550bc547b6c61d15c1f3ee2ab'))
+paddle.fluid.layers.elementwise_mul (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'e6919013e5369c7b0d486b8604da6b2f'))
+paddle.fluid.layers.elementwise_max (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'f839de1318c794f26b9f5aafcd2ad92f'))
+paddle.fluid.layers.elementwise_min (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'c37aa719815585f2c20623f92e738d54'))
+paddle.fluid.layers.elementwise_pow (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '984e0e72db2a3b4241a694499f8d76c8'))
+paddle.fluid.layers.elementwise_mod (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '4aa6b682b8676a2f3adf9f58790e327d'))
+paddle.fluid.layers.elementwise_floordiv (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '638ca44932743bda05caf3fcc15f1f0d'))
+paddle.fluid.layers.uniform_random_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)), ('document', '129e0a3257f1d532a948eedf9d5bf671'))
+paddle.fluid.layers.gaussian_random (ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '389dafe36e099841b6a7fb18d11f1b4c'))
 paddle.fluid.layers.sampling_id (ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', '35428949368cad5121dd37f8522ef8b0'))
 paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')), ('document', '9e520987168f8ddb7dd71ffd68aa352c'))
-paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '4527fd90e222f67b5f7451fb0cf7c845'))
-paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '3ca6a761570d86e303e473afba99bb49'))
-paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'bf61c8f79d795a8371bdb3b5468aa82b'))
+paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'a418e3ccb5e2ac21bd60f5cc221d5860'))
+paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '01dbb91e7c74cb11336cd531013de51a'))
+paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '17db0f814eb7bb5a3fac1ca6e60e16d8'))
 paddle.fluid.layers.rank (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'ee1386c42ecc8f424fe3fb21862fefc2'))
 paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cdcf20c494c92060d10feb9374532f42'))
 paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0eae3f726a4afe590757552fa3ced012'))
@@ -207,48 +206,44 @@ paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs
 paddle.fluid.layers.logical_not (ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cd1c8cf31e040427d4e05711044caeb6'))
 paddle.fluid.layers.clip (ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ce33756573c572da67302499455dbcd'))
 paddle.fluid.layers.clip_by_norm (ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a1ea0bc5a926f427458c4254ca022749'))
-paddle.fluid.layers.mean (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9562845452b0455fa23ab64334415417'))
-paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)), ('document', '784b7e36cea88493f9e37a41b10fbf4d'))
+paddle.fluid.layers.mean (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd638d915195ce86a8d7963b81110d4c8'))
+paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)), ('document', 'ccd37fa6b53f074adbfb732d738c4c2d'))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits (ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)), ('document', '180c284317ea45ef89a460d8d79c0b72'))
 paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '71426e02d240d0daedae81a02ca1c191'))
-paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'aba90d0cbb43185216000b82fd231734'))
+paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a9221eaef53884a00654e028551b78e2'))
 paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'f85b263b7b6698d000977529a28f202b'))
-paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65c8362e48810b8226e311c5d046db51'))
-paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', '9f303c67538e468a36c5904a0a3aa110'))
+paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '77a6d80aa5551ca70324fc975c44507f'))
+paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', 'ab84fdc6dc60f3ad9aa397e6007e3bf9'))
 paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6f90d6ff76bf4f5e592332c1ef28494e'))
-paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'da621ba1363e8f5fe7b702526bbae18f'))
-paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5d16663e096d7f04954c70ce1cc5e195'))
+paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '97bf4353bb046a5629308a38f98ac204'))
+paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd256cba1c41a5ed92ce3f31e24a2ca6d'))
 paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', 'af541e9263be61ce0e40df58d1b69294'))
-paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e399f9436fed5f7ff480d8532e42c937'))
+paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '4b9c2e8af5817937d831820874b5aa77'))
 paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'cd0bd55ef1e1762aca25ec972d34d378'))
-paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c2a9c00d5c22e156d92ffa2e8736adf3'))
-paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3e60aec040a6f740a130353323580bff'))
-paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', '4e513cbd7c8d0d64e426dbbc94cb72b7'))
+paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'dc63315b84f591ac79ecca0c3632027a'))
+paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7ffc849e71f31dfe29030ff94e662de6'))
+paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', 'd5e6c494ac35100e2ed4d4bd9a1ed932'))
 paddle.fluid.layers.shuffle_channel (ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2fa6782d43d02ae64482d21235a82949'))
 paddle.fluid.layers.temporal_shift (ArgSpec(args=['x', 'seg_num', 'shift_ratio', 'name'], varargs=None, keywords=None, defaults=(0.25, None)), ('document', 'fe4481fb31363b09cfdd228fc6776ddf'))
 paddle.fluid.layers.py_func (ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)), ('document', '8404e472ac12b4a30a505d3d3a3e5fdb'))
 paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '42d5155374f69786300d90d751956998'))
-paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '07cb0d95a646dba1b9cc7cdce89e59f0'))
-paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '11bb8e62cc9256958eff3991fe4834da'))
+paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '2f6ff96864054a31aa4bb659c6722c99'))
+paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '431a4301c35032166ec029f7432c80a7'))
 paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
 paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '2985a372ac897ea4e13aced7f930d6f8'))
 paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
 paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '132b6e74ff642a392bd6b14c10aedc65'))
-paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', '20992b20d19c2e5983f366150827b4a6'))
-paddle.fluid.layers.continuous_value_model (ArgSpec(args=['input', 'cvm', 'use_cvm'], varargs=None, keywords=None, defaults=(True,)), ('document', '94e2819b7c9715ea71b62e9c78f36b29'))
+paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
+paddle.fluid.layers.continuous_value_model (ArgSpec(args=['input', 'cvm', 'use_cvm'], varargs=None, keywords=None, defaults=(True,)), ('document', 'a07a44c2bacdcd09c1f5f35a96a0514e'))
 paddle.fluid.layers.where (ArgSpec(args=['condition'], varargs=None, keywords=None, defaults=None), ('document', '3126e3039e752ce26077f1efaca355c6'))
-paddle.fluid.layers.sign (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'ccf6bb7912afd2818d24bc45461e807a'))
-paddle.fluid.layers.deformable_conv (ArgSpec(args=['input', 'offset', 'mask', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'deformable_groups', 'im2col_step', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, None, None, None)), ('document', 'c896b66265a60bd3c5510f66e6e02919'))
-paddle.fluid.layers.unfold (ArgSpec(args=['x', 'kernel_sizes', 'strides', 'paddings', 'dilations', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None)), ('document', '3f884662ad443d9ecc2b3734b4f61ad6'))
-paddle.fluid.layers.deformable_roi_pooling (ArgSpec(args=['input', 'rois', 'trans', 'no_trans', 'spatial_scale', 'group_size', 'pooled_height', 'pooled_width', 'part_size', 'sample_per_part', 'trans_std', 'position_sensitive', 'name'], varargs=None, keywords=None, defaults=(False, 1.0, [1, 1], 1, 1, None, 1, 0.1, False, None)), ('document', '65b8dbe13e00c4dc8224652f6ff89540'))
-paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '9e87163ba32003f21d2c9d8c6a605ada'))
-paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'cccb6eb5410c822e5307c947aca2c899'))
-paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '32181f6037e387fb6e68a5beaafe33b6'))
-paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None), ('document', 'f29d7d159e114f73fc988d9a86805841'))
+paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', 'adf285346e23316097f7789b572491e9'))
+paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'cf12066a3139026119f97f9d4381a1bd'))
+paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e'))
+paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None), ('document', 'f967a73426db26f970bc70bfb03cffca'))
 paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'fcb24383c6eef2ca040ee824c26e22fd'))
-paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'c13b8a8521bea5f8123b925ae2a5d5db'))
+paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '07e5b796674796eb1ef3fee9c10d24e3'))
 paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '9b7f0f86ec24bbc97643cadcb6499cff'))
-paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '0a77c56dff556b5ae4c5630d9a0764ef'))
+paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '5c54493d96c7e0760dc6758af1c8dd72'))
 paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'b42332b894e1e0962c6a43f0151c2640'))
 paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -256,24 +251,24 @@ paddle.fluid.layers.Preprocessor.inputs (ArgSpec(args=['self'], varargs=None, ke
 paddle.fluid.layers.Preprocessor.outputs (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d1a4bc97bbce9fa1d4f7a4200a771ff'))
 paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae'))
-paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'b6fe28cffff32d15e45c411bcf815cb7'))
-paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', '90eb79e0d1261ec2bac7c775ee4f459b'))
+paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8'))
+paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4'))
 paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '992eb42590fc1c380841a6db72ce78b3'))
-paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '764c095ba4562ae740f979e970152d6e'))
+paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb'))
 paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'f9e905b48123914c78055a45fe23106a'))
-paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '5df743d578638cd2bbb9369499b44af4'))
-paddle.fluid.layers.assign (ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)), ('document', '8bd94aef4e123986d9a8c29f67b5532b'))
-paddle.fluid.layers.fill_constant_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0)), ('document', 'baf63a2f3b647a2d5da6ba8afb6135ac'))
-paddle.fluid.layers.fill_constant (ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'd6b76c7d2c7129f8d713ca74f1c2c287'))
+paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816'))
+paddle.fluid.layers.assign (ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b690184f3537df5501e4d9d8f31152a5'))
+paddle.fluid.layers.fill_constant_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0)), ('document', 'd4059a2f5763036b07018d76429f9acb'))
+paddle.fluid.layers.fill_constant (ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None)), ('document', '1d8b14729639fa38509c79b9784740fa'))
 paddle.fluid.layers.argmin (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '677c09cc0fd7381974bfc845c4d9f0f2'))
 paddle.fluid.layers.argmax (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', 'ef64ee883998e7e246a854a845e11e2c'))
 paddle.fluid.layers.argsort (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '0a85a9a145d2e24e05958a3f1322d68a'))
-paddle.fluid.layers.ones (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', '812c623ed52610b9773f9fc05413bc34'))
-paddle.fluid.layers.zeros (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', '95379f9288c2d05356ec0e2375c6bc57'))
-paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None), ('document', '628135603692137d52bcf5a8d8d6816d'))
-paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '51a0fa1cfaf2507c00a215adacdb8a63'))
-paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '129cf426e71452fe8276d616a6dc21ae'))
-paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '548a0ae317105e6dbfed321d7e37c03d'))
+paddle.fluid.layers.ones (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', 'b402489c62e668df42e7daceb63c142b'))
+paddle.fluid.layers.zeros (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', 'c155e2efc56ffa5ed4658cca0272e491'))
+paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None), ('document', '8ee7cb6ca639e7460e825f953b65d94d'))
+paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '8f8c0306117ea441f20dcbbdba1f0ecc'))
+paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8'))
+paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292'))
 paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb'))
 paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '495e21e9a848c2d075a102802fc67756'))
 paddle.fluid.layers.zeros_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c7e4cfffc93ae89c8f6f53b6d650f923'))
@@ -281,19 +276,19 @@ paddle.fluid.layers.diag (ArgSpec(args=['diagonal'], varargs=None, keywords=None
 paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.Switch.case (ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', 'f88b5787bb80ae6b8bf513a70dabbdc1'))
-paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '3f913b5069ad40bd85d89b33e4aa5939'))
+paddle.fluid.layers.Switch.case (ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None), ('document', 'f7c7160014c1b46cfeda9dd5808d1789'))
+paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '50853ae884df03d9c36703bb46d9ef07'))
+paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', '73bb96ec4783ec1a11e760e8851b0e77'))
+paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713'))
 paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a'))
-paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'eb41e5993f705fcfa354024054a75f5f'))
+paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
 paddle.fluid.layers.less_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd6b173ae1a149e0bdfe7b8bf69285957'))
 paddle.fluid.layers.greater_than (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '2c9bd414caa6c615539018d27001b44c'))
 paddle.fluid.layers.greater_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '62c667d24e7b07e166b47a53b61b2ff4'))
-paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '788aa651e8b9fec79d16931ef3a33e90'))
+paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77'))
 paddle.fluid.layers.not_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '56148fb1024687a08e96af79bdc5c929'))
-paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', 'caf0d94349cdc28e1bda3b8a19411ac0'))
-paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', '6f24a9b872027634ad758ea2826c9727'))
+paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', 'dd68bead34dfbaf6b0a163fc1cc3c385'))
+paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2'))
 paddle.fluid.layers.IfElse.__init__ (ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.IfElse.false_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.IfElse.input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -301,21 +296,21 @@ paddle.fluid.layers.IfElse.output (ArgSpec(args=['self'], varargs='outs', keywor
 paddle.fluid.layers.IfElse.true_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.DynamicRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.DynamicRNN.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7'))
-paddle.fluid.layers.DynamicRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')), ('document', '57cdd0a63747f4c670cdb9d250ceb7e1'))
+paddle.fluid.layers.DynamicRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')), ('document', 'b9174d4e91505b0c8ecc193eb51e248d'))
 paddle.fluid.layers.DynamicRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'b439a176a3328de8a75bdc5c08eece4a'))
-paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '55ab9c562edd7dabec0bd6fd6c1a28cc'))
-paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '4b300851b5201891d0e11c406e4c7d07'))
+paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'f29ad2478b6b2ad4f413d2936a331ea0'))
+paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655'))
 paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f'))
 paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'f1b60dc4194d0bb714d6c6f5921b227f'))
+paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', '72530f299d6451a567cf4a12dc3fb1ff'))
 paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'df6ceab6e6c9bd31e97914d7e7538137'))
 paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7'))
 paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '903387ec11f3d0bf46821d31a68cffa5'))
 paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837'))
 paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f'))
-paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '5b552a1f0f7eb4dacb768a975ba15d08'))
-paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', 'ee6c70867d317b0a87094ed23546215f'))
-paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '3011dc695f490afdf504dc24f628319a'))
+paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a'))
+paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732'))
+paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
 paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a4e395ab004e7da34e94a0a1f9eee183'))
 paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5f2508c52e0a797bb9bd5e29d79ede78'))
 paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '41c976b68542f4cbee178640f765d845'))
@@ -337,41 +332,38 @@ paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywor
 paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fcc0d8ec2d2983f5d2ae0196fa83916b'))
 paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a9bef6674dc20af1ae901656ed041cdf'))
 paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5c1e9c619db82d6392826d0c2908ea55'))
-paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', '6de6775d9e9ed885056e764982130cfd'))
+paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', 'a8c4e972b7d6742c838a37abf407ed9a'))
 paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c142f5884f3255e0d6075c286bbd531e'))
 paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '944d7c03057f5fc88bc78acd4d82f926'))
 paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '90566ea449ea4c681435546e2f70610a'))
 paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', 'a00d43a08ec664454e8e685bc54e9e78'))
 paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', '7e62e12ce8b127f2c7ce8db79299c3c3'))
-paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'fd58078fdfffd899b91f992ba224628f'))
+paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'fe9afaee481dd09f28866df22756466f'))
 paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '3ddb9b966f193900193a95a3df77c3c1'))
-paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'e9685f32d21bec8c013626c0254502c5'))
-paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'efae414c1137c7944d6174dd08c5347a'))
+paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c0b334f917828f95056f6ebe10907b1c'))
+paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'c33093a82a46e3091e789e5572588db1'))
 paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '6d5028fd09d01ab82d296adc0ea95aee'))
-paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '1e164a56fe9376e18a56d22563d9f801'))
-paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', 'fa1d1c9d5e0111684c0db705f86a2595'))
-paddle.fluid.layers.sigmoid_focal_loss (ArgSpec(args=['x', 'label', 'fg_num', 'gamma', 'alpha'], varargs=None, keywords=None, defaults=(2, 0.25)), ('document', 'aeac6aae100173b3fc7f102cf3023a3d'))
+paddle.fluid.layers.detection_map (ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral')), ('document', '1467d91b50c22cd52103b4aa1ee9d0a1'))
+paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '1dddef3eb4b3cbd4df8e03ac480dbf97'))
 paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', '82b2aefeeb1b706bc4afec70928a259a'))
-paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', 'd1ddc75629fedee46f82e631e22c79dc'))
-paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random', 'is_cls_agnostic', 'is_cascade_rcnn'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True, False, False)), ('document', 'c0d00acf724691ff3480d4207036a722'))
-paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', 'b7d707822b6af2a586bce608040235b1'))
-paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None), ('document', 'b319b10ddaf17fb4ddf03518685a17ef'))
-paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '72fca4a39ccf82d5c746ae62d1868a99'))
+paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', '9307c12b1d4e554279b9708f787cd019'))
+paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True)), ('document', '87863717edeb7fe87a1268976cbc015d'))
+paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', '57ab49f3f324f310b7eed322e7c1057a'))
+paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None), ('document', 'f73706a65468e9ca3e0bee4a31521b0a'))
+paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1'))
 paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
-paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e308ce1661cb722b220a6f482f85b9e4'))
+paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
 paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', 'eb62b1ff7cc981f3483a62321a491f2e'))
 paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'f332fb8c5bb581bd1a6b5be450a99990'))
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '04384378ff00a42ade8fabd52e27cbc5'))
 paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
-paddle.fluid.layers.retinanet_detection_output (ArgSpec(args=['bboxes', 'scores', 'anchors', 'im_info', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0.05, 1000, 100, 0.3, 1.0)), ('document', '078d28607ce261a0cba2b965a79f6bb8'))
 paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d'))
 paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'dfc953994fd8fef35c49dd9c6eea37a5'))
-paddle.fluid.layers.collect_fpn_proposals (ArgSpec(args=['multi_rois', 'multi_scores', 'min_level', 'max_level', 'post_nms_top_n', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '82ffd896ecc3c005ae1cad40854dcace'))
-paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', 'ef799022a6040597462ae2b3d2f1c407'))
-paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', '300537e259bba86fdefa13a133a0587d'))
-paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'eaf430c5a0380fb11bfe9a8922cd6295'))
-paddle.fluid.layers.natural_exp_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '63a9e96d446d7de1289f30b832bce36a'))
-paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ea37a3a8a0b3ce2254e7bc49a0951dbe'))
+paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd'))
+paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47'))
+paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '98a5050bee8522fcea81aa795adaba51'))
+paddle.fluid.layers.natural_exp_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '676a7bc2a218691db50bca233903d21e'))
+paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'd07e767d59c4a5e6c930f3e6756d3f82'))
 paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', 'a343254c36c2e89512cd8cd8a1960ead'))
 paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'd9f654117542c6b702963dda107a247f'))
 paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'f96805b1a64f9a12f4627497e5fcb920'))
@@ -406,9 +398,8 @@ paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self'
 paddle.fluid.contrib.Calibrator.__init__ (ArgSpec(args=['self'], varargs='args', keywords='kwargs', defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.Calibrator.sample_data (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3b8c85ca1e2cf753cc8c90a6c6992958'))
 paddle.fluid.contrib.Calibrator.save_int8_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.contrib.distributed_sampler (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '9a271cd9700deb6d837ed724ba094315'))
 paddle.fluid.contrib.reader.ctr_reader.ctr_reader (ArgSpec(args=['feed_dict', 'file_type', 'file_format', 'dense_slot_index', 'sparse_slot_index', 'capacity', 'thread_num', 'batch_size', 'file_list', 'slots', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2ebf3de2a6ef1af2c3b88d2db7591ab'))
-paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer', 'search_space'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], None, None, None, None)), ('document', 'c195b3bba26169cff9439e8c467557c0'))
+paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], './checkpoints', None, None)), ('document', '31ae143830c9bf6b43547dd546c5ba80'))
 paddle.fluid.contrib.Compressor.config (ArgSpec(args=['self', 'config_file'], varargs=None, keywords=None, defaults=None), ('document', '780d9c007276ccbb95b292400d7807b0'))
 paddle.fluid.contrib.Compressor.run (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'c6e43d6a078d307672283c1f36e04fe9'))
 paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67'))
@@ -428,16 +419,15 @@ paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'loca
 paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a'))
 paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a'))
 paddle.fluid.contrib.extend_with_decoupled_weight_decay (ArgSpec(args=['base_optimizer'], varargs=None, keywords=None, defaults=None), ('document', 'a1095dfd4ec725747f662d69cd7659d4'))
-paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'init_loss_scaling', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'incr_ratio', 'decr_ratio', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(1.0, 1000, 2, 2.0, 0.8, False)), ('document', 'bdb8f9dbb0d94b3957272c53eeee9818'))
-paddle.fluid.contrib.fused_elemwise_activation (ArgSpec(args=['x', 'y', 'functor_list', 'axis', 'scale', 'save_intermediate_out'], varargs=None, keywords=None, defaults=(-1, 0.0, True)), ('document', '1c4b247a2858cea8d9d8750693688270'))
+paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'init_loss_scaling', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(1.0, False)), ('document', '67e9bf14f345b38da169beb1ebb276eb'))
 paddle.fluid.transpiler.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'b1951949c6d21698290aa8ac69afee32'))
-paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'c89fc350f975ef827f5448d68af388cf'))
-paddle.fluid.transpiler.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '90a40b80e0106f69262cc08b861c3e39'))
-paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '0e47f020304e2b824e87ff03475c17cd'))
-paddle.fluid.transpiler.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '418c7e8b268e9be4104f2809e654c2f7'))
-paddle.fluid.transpiler.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)), ('document', '2348247f684bfd5bb9466470f35be064'))
-paddle.fluid.transpiler.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd38c5b8b2b2e0bb19bcf1b581a80a7e4'))
+paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '292ab72977afbe58e6a3bde175452680'))
+paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', '78f4949aedf317666a89ca74b3748ba8'))
+paddle.fluid.transpiler.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'd796fc0c8d51503b556fcf6dc15c4f0c'))
+paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '736330e31a7a54abccc0c7fd9119d9ff'))
+paddle.fluid.transpiler.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '06ce55338dfe96311ad1078235ab3bf4'))
+paddle.fluid.transpiler.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)), ('document', 'eda17d0f1639bc6ca215cecf87f588a4'))
+paddle.fluid.transpiler.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ac4114d3df16264f1946deb3a8434a6f'))
 paddle.fluid.transpiler.HashName.__init__ (ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.transpiler.HashName.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.transpiler.HashName.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -448,154 +438,119 @@ paddle.fluid.transpiler.DistributeTranspilerConfig.__init__
 paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', '13f01ff80e8dfbd3427d90cf49bc62eb'))
 paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', 'd6a1e527b53f5cc15594fee307dfc5cf'))
 paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', '6486b2595300fc3305b5a1f0ac363dce'))
-paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', 'b1a07a0000eb9103e3a143ca8c13de5b'))
+paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', '921714c9bfb351b41403418265393203'))
 paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', '5178bc1b4d302192597a5efbae13d902'))
 paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
 paddle.fluid.optimizer.SGDOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
 paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.optimizer.SGDOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.MomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.MomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 
'6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.AdagradOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdamOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.AdamOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdamaxOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], 
varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.AdamaxOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.DecayedAdagradOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.FtrlOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) 
paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.FtrlOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.RMSPropOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.RMSPropOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.AdadeltaOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), 
('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.AdadeltaOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '648010d0ac1fa707dac0b89f74b0e35c')) +paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '46234a5470590feb336346f70a3db715')) paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.ModelAverage.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.ModelAverage.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) -paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '5f14ea4adda2791e1c3b37ff327f6a83')) +paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) +paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '18db9c70be9c4dd466f9844457b21bfe')) paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 
'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.LarsMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.LarsMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) paddle.fluid.optimizer.DGCMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.DGCMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) -paddle.fluid.optimizer.LambOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'lamb_weight_decay', 'beta1', 
'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.01, 0.9, 0.999, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.LambOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871')) -paddle.fluid.optimizer.LambOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) -paddle.fluid.optimizer.LambOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) -paddle.fluid.optimizer.LambOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.LambOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) -paddle.fluid.optimizer.LambOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) -paddle.fluid.optimizer.ExponentialMovingAverage.__init__ (ArgSpec(args=['self', 'decay', 'thres_steps', 'name'], varargs=None, keywords=None, defaults=(0.999, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.ExponentialMovingAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '30f494752ac8921dc5835a63637f453a')) -paddle.fluid.optimizer.ExponentialMovingAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '8c8a1791608b02a1ede53d6dd3a4fcec')) -paddle.fluid.optimizer.ExponentialMovingAverage.update (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ea10f08af6d7aac3b7974aa976e4085f')) -paddle.fluid.optimizer.PipelineOptimizer.__init__ (ArgSpec(args=['self', 'optimizer', 'cut_list', 'place_list', 'concurrency_list', 'queue_size', 'sync_steps', 'start_cpu_core_id'], varargs=None, keywords=None, defaults=(None, None, None, 30, 1, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.PipelineOptimizer.create_vars (ArgSpec(args=['self', 'block', 'main_program'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.PipelineOptimizer.extract_section_ops (ArgSpec(args=['self', 'ops', 'cut_point_name'], varargs=None, keywords=None, defaults=None), ('document', '4a29be77da04b5c30dd7202f44c79b70')) -paddle.fluid.optimizer.PipelineOptimizer.extract_section_opt_ops (ArgSpec(args=['self', 'ops', 'cut_point_name'], varargs=None, keywords=None, defaults=None), ('document', '99e0f641222c1ce4dd0d7194c3b2c653')) -paddle.fluid.optimizer.PipelineOptimizer.find_input_output (ArgSpec(args=['self', 'ops', 'name', 'is_forward'], varargs=None, keywords=None, defaults=(True,)), ('document', '92d77fb262766b352746f09cca81db93')) -paddle.fluid.optimizer.PipelineOptimizer.find_persistable_vars (ArgSpec(args=['self', 'ops', 'whole_parameters'], varargs=None, keywords=None, defaults=None), ('document', 
'877b7cc290f0647455e5e4409e825923')) -paddle.fluid.optimizer.PipelineOptimizer.find_section_opt (ArgSpec(args=['self', 'ops', 'params'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.optimizer.PipelineOptimizer.split_program (ArgSpec(args=['self', 'main_program', 'cut_list'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea')) paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '08a5dd9f6f376ff3d55e0b1d92115cbd')) -paddle.fluid.backward.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b')) paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core_avx.LoDTensor, arg0: List[List[int]]) -> None 2. __init__(self: paddle.fluid.core_avx.LoDTensor) -> None -paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core_avx.LoDTensor) -> bool -paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core_avx.LoDTensor) -> List[List[int]] -paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core_avx.LoDTensor) -> List[List[int]] -paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core_avx.LoDTensor, lod: List[List[int]]) -> None -paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core_avx.LoDTensor, recursive_sequence_lengths: List[List[int]]) -> None -paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core_avx.Tensor) -> List[int] -paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core_avx.LoDTensorArray) -> None -paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core_avx.LoDTensorArray, tensor: paddle.fluid.core_avx.LoDTensor) -> None -paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core_avx.CPUPlace) -> None -paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPlace, arg0: int) -> None -paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPinnedPlace) -> None +paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core.LoDTensor, arg0: List[List[int]]) -> None 2. 
__init__(self: paddle.fluid.core.LoDTensor) -> None +paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> bool +paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core.LoDTensor) -> List[List[int]] +paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor) -> List[List[int]] +paddle.fluid.LoDTensor.set 1. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CPUPlace) -> None 2. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CPUPlace) -> None 3. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CPUPlace) -> None 4. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CPUPlace) -> None 5. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CPUPlace) -> None 6. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CPUPlace) -> None 7. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CPUPlace) -> None 8. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CPUPlace) -> None 9. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPlace) -> None 10. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPlace) -> None 11. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPlace) -> None 12. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPlace) -> None 13. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPlace) -> None 14. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPlace) -> None 15. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPlace) -> None 16. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPlace) -> None 17. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float32], arg1: paddle::platform::CUDAPinnedPlace) -> None 18. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int32], arg1: paddle::platform::CUDAPinnedPlace) -> None 19. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[float64], arg1: paddle::platform::CUDAPinnedPlace) -> None 20. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int64], arg1: paddle::platform::CUDAPinnedPlace) -> None 21. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[bool], arg1: paddle::platform::CUDAPinnedPlace) -> None 22. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint16], arg1: paddle::platform::CUDAPinnedPlace) -> None 23. set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[uint8], arg1: paddle::platform::CUDAPinnedPlace) -> None 24. 
set(self: paddle.fluid.core.Tensor, arg0: numpy.ndarray[int8], arg1: paddle::platform::CUDAPinnedPlace) -> None +paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core.LoDTensor, lod: List[List[int]]) -> None +paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core.LoDTensor, recursive_sequence_lengths: List[List[int]]) -> None +paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core.Tensor) -> List[int] +paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core.LoDTensorArray) -> None +paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core.LoDTensorArray, tensor: paddle.fluid.core.LoDTensor) -> None +paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> None +paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None +paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'be47d7e07824b4281da77472846955ac')) -paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', 'ce65fe1d81dcd7067d5092a5667f35cc')) -paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c312743c910dda1c3a9c6637ac30187f')) +paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'f8f3df23c5633c614db781a91b81fb62')) +paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '459e316301279dfd82001b46f0b8ffca')) +paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '543863d1f9d4853758adb613b8659e85')) paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.clip.GradientClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.clip.GradientClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.clip.GradientClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'clip_norm', 'group_name'], 
varargs=None, keywords=None, defaults=('default_group',)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph_grad_clip.GradClipByValue.__init__ (ArgSpec(args=['self', 'min_value', 'max_value'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph_grad_clip.GradClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'max_global_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '49f5db5da13cfd8c069754dd11be3901')) paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'd33483b1781e47c4c5d5fefa7b7debcb')) paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'd8db46bf9a579bec476d09dea80eb23d')) paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '88da8fb6dbebaee2f7520188a09574f9')) paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a7500e39dd033f1e64f562e909333a8a')) -paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '4d68cde4c4df8f1b8018620b4dc19b42')) -paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '695a6e91afbcdbafac69a069038811be')) -paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ead717d6d440a1eb11971695cd1727f4')) +paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310')) paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7')) -paddle.fluid.Scope Scope() -> paddle.fluid.core_avx._Scope +paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope paddle.fluid.install_check.run_check (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '66b7c84a17ed32fec2df9628367be2b9')) paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 
'1676886070eb607cb608f7ba47be0d3c'))
 paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d'))
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index 595454e90b9..c212d579921 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -1,3 +1,7 @@
+if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) # for mobile
+  add_subdirectory(lite)
+  return()
+endif()
 add_subdirectory(memory)
 add_subdirectory(platform)
 add_subdirectory(framework)
@@ -6,7 +10,8 @@ add_subdirectory(operators)
 add_subdirectory(string)
 add_subdirectory(recordio)
 add_subdirectory(pybind)
-
+add_subdirectory(train)
 # NOTE: please add subdirectory inference at last.
 add_subdirectory(inference)
-add_subdirectory(train)
+
+add_subdirectory(lite)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 65367a21209..ce33a70c549 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -29,8 +29,7 @@ add_subdirectory(io)
 proto_library(framework_proto SRCS framework.proto)
 proto_library(data_feed_proto SRCS data_feed.proto)
 proto_library(async_executor_proto SRCS data_feed.proto)
-proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
-  data_feed_proto)
+proto_library(trainer_desc_proto SRCS trainer_desc.proto data_feed.proto)
 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
@@ -125,7 +124,7 @@ cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_co
 cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
 cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-  shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type)
+  shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type data_feed_proto)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
@@ -174,20 +173,20 @@ endif()
 cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector)
 if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
+  cc_library(executor SRCS executor.cc multi_trainer.cc dataset_factory.cc
   dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc data_feed.cc
   device_worker.cc hogwild_worker.cc downpour_worker.cc
-  pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
+  pull_dense_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
   device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper lodtensor_printer lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper data_feed_proto ${NGRAPH_EXE_DEPS} timer)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
+  cc_library(executor SRCS executor.cc multi_trainer.cc dataset_factory.cc
   dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc data_feed.cc
   device_worker.cc hogwild_worker.cc downpour_worker.cc
-  pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
+  pull_dense_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
   device_context scope framework_proto data_feed_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper lodtensor_printer feed_fetch_method graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS} timer data_feed_proto)
@@ -202,10 +201,10 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS
   fast_threaded_ssa_graph_executor variable_helper)
 cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc
-  executor_thread_worker.cc multi_trainer.cc dist_multi_trainer.cc pipeline_trainer.cc
+  executor_thread_worker.cc multi_trainer.cc dist_multi_trainer.cc
   trainer_factory.cc trainer.cc device_worker.cc hogwild_worker.cc
-  downpour_worker.cc pull_dense_worker.cc section_worker.cc
-  device_worker_factory.cc data_set.cc dataset_factory.cc
+  downpour_worker.cc pull_dense_worker.cc device_worker_factory.cc
+  data_set.cc dataset_factory.cc
   DEPS op_registry device_context scope framework_proto trainer_desc_proto glog lod_rank_table fleet_wrapper lodtensor_printer feed_fetch_method graph_to_program_pass data_feed_proto
@@ -226,8 +225,6 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
-cc_test(inlined_vector_test SRCS inlined_vector_test.cc)
-
 if (NOT WIN32)
 cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)
diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc
index 7eb80a4617a..89153d82d07 100644
--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@@ -85,9 +85,8 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
   }
   DataFeedDesc data_feed_desc;
-  bool success = data_feed_desc.ParseFromString(data_feed_desc_str);
-  PADDLE_ENFORCE(success, "Fail to parse DataFeedDesc from string:\n%s",
-                 data_feed_desc_str.c_str());
+  google::protobuf::TextFormat::ParseFromString(data_feed_desc_str,
+                                                &data_feed_desc);
   actual_thread_num_ = thread_num;
   int file_cnt = filelist.size();
diff --git a/paddle/fluid/framework/blocking_queue.h b/paddle/fluid/framework/blocking_queue.h
index 4f35da402f3..cc5b4e8c4b8 100644
--- a/paddle/fluid/framework/blocking_queue.h
+++ b/paddle/fluid/framework/blocking_queue.h
@@ -95,11 +95,6 @@ class BlockingQueue {
     return q_.size();
   }
-  void Clear() {
-    std::lock_guard lock(mutex_);
-    std::deque().swap(q_);
-  }
-
  private:
   std::mutex mutex_;
   std::condition_variable cv_;
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index e89f3f1a4e0..02e467e853e 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -20,9 +20,6 @@ limitations under the License.
 */
 #include "paddle/fluid/framework/data_feed.h"
 #ifdef _LINUX
 #include
-#include
-#include
-#include
 #endif
 #include
 #include "gflags/gflags.h"
@@ -90,13 +87,6 @@ void DataFeed::CheckStart() {
   PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet.");
 }
-void DataFeed::AssignFeedVar(const Scope& scope) {
-  CheckInit();
-  for (size_t i = 0; i < use_slots_.size(); ++i) {
-    feed_vec_[i] = scope.FindVar(use_slots_[i])->GetMutable();
-  }
-}
-
 template
 void PrivateQueueDataFeed::SetQueueSize(int queue_size) {
   PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
@@ -168,7 +158,6 @@ InMemoryDataFeed::InMemoryDataFeed() {
   mutex_for_update_memory_data_ = nullptr;
   this->file_idx_ = nullptr;
   this->mutex_for_pick_file_ = nullptr;
-  fleet_send_sleep_seconds_ = 2;
 }
 template
@@ -377,7 +366,7 @@ void InMemoryDataFeed::GlobalShuffle() {
   auto fleet_ptr = FleetWrapper::GetInstance();
   std::vector> send_vec(trainer_num_);
   std::vector send_index(trainer_num_);
-  uint64_t reserve_len = fleet_send_batch_size_ / trainer_num_ + 1;
+  uint64_t reserve_len = fleet_send_batch_size_ / trainer_num_;
   for (auto& vec : send_vec) {
     vec.reserve(reserve_len);
   }
@@ -388,33 +377,46 @@ void InMemoryDataFeed::GlobalShuffle() {
   auto interval = GetMemoryDataInterval();
   VLOG(3) << "global shuffle data from [" << interval.first << ", "
           << interval.second << "), thread_id=" << thread_id_;
-
-  for (int64_t i = interval.first; i < interval.second;
-       i += fleet_send_batch_size_) {
-    for (int64_t j = 0; j < fleet_send_batch_size_ && i + j < interval.second;
-         ++j) {
-      int64_t random_num = fleet_ptr->LocalRandomEngine()();
-      int64_t node_id = random_num % trainer_num_;
-      send_vec[node_id].push_back(&((*memory_data_)[i + j]));
-    }
-    total_status.clear();
-    std::shuffle(send_index.begin(), send_index.end(),
-                 fleet_ptr->LocalRandomEngine());
-    for (int index = 0; index < send_index.size(); ++index) {
-      int j = send_index[index];
-      if (send_vec[j].size() == 0) {
-        continue;
+  for (int64_t i = interval.first; i < interval.second; ++i) {
+    // if get ins id, can also use hash
+    // std::string ins_id = memory_data_[i].ins_id;
+    int64_t random_num = rand_r(&rand_seed);
+    int64_t node_id = random_num % trainer_num_;
+    send_vec[node_id].push_back(&((*memory_data_)[i]));
+    if (i % fleet_send_batch_size_ == 0 && i != 0) {
+      // shuffle the sequence of sending to avoid network timeout error
+      std::random_shuffle(send_index.begin(), send_index.end());
+      for (int index = 0; index < send_index.size(); ++index) {
+        int j = send_index[index];
+        std::string send_str;
+        SerializeIns(send_vec[j], &send_str);
+        VLOG(3) << "send str_length=" << send_str.length()
+                << ", ins num=" << send_vec[j].size() << " to node_id=" << j
+                << ", thread_id=" << thread_id_;
+        auto ret = fleet_ptr->SendClientToClientMsg(0, j, send_str);
+        VLOG(3) << "end send, thread_id=" << thread_id_;
+        send_vec[j].clear();
+        total_status.push_back(std::move(ret));
      }
+    }
+  }
+  // shuffle the sequence of sending to avoid network timeout error
+  std::random_shuffle(send_index.begin(), send_index.end());
+  for (int index = 0; index < send_index.size(); ++index) {
+    int j = send_index[index];
+    if (send_vec[j].size() != 0) {
      std::string send_str;
      SerializeIns(send_vec[j], &send_str);
+      VLOG(3) << "send str_length=" << send_str.length() << " to node_id=" << j
+              << ", thread_id=" << thread_id_;
      auto ret = fleet_ptr->SendClientToClientMsg(0, j, send_str);
+      VLOG(3) << "end send, thread_id=" << thread_id_;
      total_status.push_back(std::move(ret));
-      send_vec[j].clear();
-    }
- for (auto& t : total_status) { - t.wait(); } - sleep(fleet_send_sleep_seconds_); + std::vector().swap(send_vec[j]); + } + for (auto& t : total_status) { + t.wait(); } VLOG(3) << "GlobalShuffle() end, thread_id=" << thread_id_; #endif @@ -434,24 +436,6 @@ std::pair InMemoryDataFeed::GetMemoryDataInterval() { return std::make_pair(start, end); } -template -int64_t InMemoryDataFeed::GetChannelDataSize() { - if (cur_channel_ == 0) { - return shuffled_ins_->Size(); - } else { - return shuffled_ins_out_->Size(); - } -} - -template -void InMemoryDataFeed::ReleaseChannelData() { - if (cur_channel_ == 0) { - shuffled_ins_->Clear(); - } else { - shuffled_ins_out_->Clear(); - } -} - // explicit instantiation template class InMemoryDataFeed>; @@ -487,17 +471,17 @@ void MultiSlotDataFeed::Init( use_slots_is_dense_.push_back(slot.is_dense()); std::vector local_shape; if (slot.is_dense()) { - for (size_t j = 0; j < slot.shape_size(); ++j) { - if (slot.shape(j) > 0) { - total_dims_without_inductive_[i] *= slot.shape(j); + for (size_t i = 0; i < slot.shape_size(); ++i) { + if (slot.shape(i) > 0) { + total_dims_without_inductive_[i] *= slot.shape(i); } - if (slot.shape(j) == -1) { - inductive_shape_index_[i] = j; + if (slot.shape(i) == -1) { + inductive_shape_index_[i] = i; } } } - for (size_t j = 0; j < slot.shape_size(); ++j) { - local_shape.push_back(slot.shape(j)); + for (size_t i = 0; i < slot.shape_size(); ++i) { + local_shape.push_back(slot.shape(i)); } use_slots_shape_.push_back(local_shape); } @@ -821,24 +805,22 @@ void MultiSlotInMemoryDataFeed::Init( all_slots_[i] = slot.name(); all_slots_type_[i] = slot.type(); use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1; - total_dims_without_inductive_[i] = 1; - inductive_shape_index_[i] = -1; if (slot.is_used()) { use_slots_.push_back(all_slots_[i]); use_slots_is_dense_.push_back(slot.is_dense()); std::vector local_shape; if (slot.is_dense()) { - for (size_t j = 0; j < slot.shape_size(); ++j) { - if (slot.shape(j) > 0) { - total_dims_without_inductive_[i] *= slot.shape(j); + for (size_t i = 0; i < slot.shape_size(); ++i) { + if (slot.shape(i) > 0) { + total_dims_without_inductive_[i] *= slot.shape(i); } - if (slot.shape(j) == -1) { - inductive_shape_index_[i] = j; + if (slot.shape(i) == -1) { + inductive_shape_index_[i] = i; } } } - for (size_t j = 0; j < slot.shape_size(); ++j) { - local_shape.push_back(slot.shape(j)); + for (size_t i = 0; i < slot.shape_size(); ++i) { + local_shape.push_back(slot.shape(i)); } use_slots_shape_.push_back(local_shape); } @@ -1019,205 +1001,5 @@ void MultiSlotInMemoryDataFeed::DeserializeIns( fleet_ptr->Deserialize(ins, str); } -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -template -void PrivateInstantDataFeed::PutToFeedVec() { - for (size_t i = 0; i < use_slots_.size(); ++i) { - const auto& type = ins_vec_[i].GetType(); - const auto& offset = ins_vec_[i].GetOffset(); - int total_instance = static_cast(offset.back()); - - if (type[0] == 'f') { // float - const auto& feasign = ins_vec_[i].GetFloatData(); - float* tensor_ptr = feed_vec_[i]->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); - } else if (type[0] == 'u') { // uint64 - // no uint64_t type in paddlepaddle - const auto& feasign = ins_vec_[i].GetUint64Data(); - int64_t* tensor_ptr = feed_vec_[i]->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); - } - - LoD data_lod{offset}; - 
feed_vec_[i]->set_lod(data_lod); - if (use_slots_is_dense_[i]) { - int64_t total_dims = 1; - for (const auto e : use_slots_shape_[i]) { - total_dims *= e; - } - PADDLE_ENFORCE( - total_dims == total_instance, - "The actual data size of slot[%s] doesn't match its declaration", - use_slots_[i].c_str()); - feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i])); - } - } -} - -template -int PrivateInstantDataFeed::Next() { - if (ParseOneMiniBatch()) { - PutToFeedVec(); - return ins_vec_[0].GetBatchSize(); - } - Postprocess(); - - std::string filename; - if (!PickOneFile(&filename)) { - return -1; - } - if (!Preprocess(filename)) { - return -1; - } - - PADDLE_ENFORCE(true == ParseOneMiniBatch(), "Fail to parse mini-batch data"); - PutToFeedVec(); - return ins_vec_[0].GetBatchSize(); -} - -template -void PrivateInstantDataFeed::Init(const DataFeedDesc& data_feed_desc) { - finish_init_ = false; - finish_set_filelist_ = false; - finish_start_ = false; - - PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), - "Multi_slot_desc has not been set."); - paddle::framework::MultiSlotDesc multi_slot_desc = - data_feed_desc.multi_slot_desc(); - SetBatchSize(data_feed_desc.batch_size()); - size_t all_slot_num = multi_slot_desc.slots_size(); - all_slots_.resize(all_slot_num); - all_slots_type_.resize(all_slot_num); - use_slots_index_.resize(all_slot_num); - multi_inductive_shape_index_.resize(all_slot_num); - use_slots_.clear(); - use_slots_is_dense_.clear(); - for (size_t i = 0; i < all_slot_num; ++i) { - const auto& slot = multi_slot_desc.slots(i); - all_slots_[i] = slot.name(); - all_slots_type_[i] = slot.type(); - use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1; - if (slot.is_used()) { - use_slots_.push_back(all_slots_[i]); - use_slots_is_dense_.push_back(slot.is_dense()); - std::vector local_shape; - if (slot.is_dense()) { - for (size_t j = 0; j < slot.shape_size(); ++j) { - if (slot.shape(j) == -1) { - multi_inductive_shape_index_[i].push_back(j); - } - } - } - for (size_t j = 0; j < slot.shape_size(); ++j) { - local_shape.push_back(slot.shape(j)); - } - use_slots_shape_.push_back(local_shape); - } - } - feed_vec_.resize(use_slots_.size()); - ins_vec_.resize(use_slots_.size()); - - finish_init_ = true; -} - -template class PrivateInstantDataFeed>; - -bool MultiSlotFileInstantDataFeed::Preprocess(const std::string& filename) { - fd_ = open(filename.c_str(), O_RDONLY); - PADDLE_ENFORCE(fd_ != -1, "Fail to open file: %s", filename.c_str()); - - struct stat sb; - fstat(fd_, &sb); - end_ = static_cast(sb.st_size); - - buffer_ = - reinterpret_cast(mmap(NULL, end_, PROT_READ, MAP_PRIVATE, fd_, 0)); - PADDLE_ENFORCE(buffer_ != MAP_FAILED, strerror(errno)); - - offset_ = 0; - return true; -} - -bool MultiSlotFileInstantDataFeed::Postprocess() { - if (buffer_ != nullptr) { - munmap(buffer_, end_); - buffer_ = nullptr; - } - if (fd_ != -1) { - close(fd_); - fd_ = -1; - end_ = 0; - offset_ = 0; - } - return true; -} - -bool MultiSlotFileInstantDataFeed::ParseOneMiniBatch() { - if (offset_ == end_) { - return false; - } - - batch_size_ = 0; - while (batch_size_ < default_batch_size_ && offset_ < end_) { - for (size_t i = 0; i < use_slots_index_.size(); ++i) { - int idx = use_slots_index_[i]; - char type = all_slots_type_[i][0]; - - uint16_t num = *reinterpret_cast(buffer_ + offset_); - PADDLE_ENFORCE( - num, - "The number of ids can not be zero, you need padding " - "it in data generator; or if there is something wrong with " - "the data, please check if the data contains unresolvable " - 
"characters."); - offset_ += sizeof(uint16_t); - - if (idx != -1) { - int inductive_size = multi_inductive_shape_index_[i].size(); - if (UNLIKELY(batch_size_ == 0)) { - ins_vec_[idx].Init(all_slots_type_[i], default_batch_size_ * num); - ins_vec_[idx].InitOffset(default_batch_size_); - uint64_t* inductive_shape = - reinterpret_cast(buffer_ + offset_); - for (int inductive_id = 0; inductive_id < inductive_size; - ++inductive_id) { - use_slots_shape_[i][multi_inductive_shape_index_[i][inductive_id]] = - static_cast(*(inductive_shape + inductive_id)); - } - } - num -= inductive_size; - offset_ += sizeof(uint64_t) * inductive_size; - - if (type == 'f') { - ins_vec_[idx].AppendValues( - reinterpret_cast(buffer_ + offset_), num); - offset_ += num * sizeof(float); - } else if (type == 'u') { - ins_vec_[idx].AppendValues( - reinterpret_cast(buffer_ + offset_), num); - offset_ += num * sizeof(uint64_t); - } - } else { - if (type == 'f') { - offset_ += num * sizeof(float); - } else if (type == 'u') { - offset_ += num * sizeof(uint64_t); - } - } - } - ++batch_size_; - // OPTIMIZE: It is better to insert check codes between instances for format - // checking - } - - PADDLE_ENFORCE(batch_size_ == default_batch_size_ || offset_ == end_, - "offset_ != end_"); - return true; -} -#endif - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 7fea85601c4..c141059a6d6 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -59,7 +59,7 @@ class DataFeed { file_idx_ = nullptr; } virtual ~DataFeed() {} - virtual void Init(const DataFeedDesc& data_feed_desc) = 0; + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0; virtual bool CheckFile(const char* filename) { PADDLE_THROW("This function(CheckFile) is not implemented."); } @@ -84,9 +84,6 @@ class DataFeed { // This function is used for binding feed_vec memory virtual void AddFeedVar(Variable* var, const std::string& name); - // This function is used for binding feed_vec memory in a given scope - virtual void AssignFeedVar(const Scope& scope); - // This function will do nothing at default virtual void SetMemoryData(void* memory_data) {} // This function will do nothing at default @@ -118,9 +115,6 @@ class DataFeed { virtual void FillChannelToMemoryData() {} // This function will do nothing at default virtual void PutInsToChannel(const std::string& ins_str) {} - virtual int64_t GetChannelDataSize() { return 0; } - // This function will do nothing at default - virtual void ReleaseChannelData() {} protected: // The following three functions are used to check if it is executed in this @@ -151,8 +145,6 @@ class DataFeed { std::vector> use_slots_shape_; std::vector inductive_shape_index_; std::vector total_dims_without_inductive_; - // For the inductive shape passed within data - std::vector> multi_inductive_shape_index_; std::vector use_slots_index_; // -1: not used; >=0: the index of use_slots_ @@ -178,6 +170,7 @@ class PrivateQueueDataFeed : public DataFeed { public: PrivateQueueDataFeed() {} virtual ~PrivateQueueDataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0; virtual bool Start(); virtual int Next(); @@ -216,7 +209,7 @@ class InMemoryDataFeed : public PrivateQueueDataFeed { public: InMemoryDataFeed(); virtual ~InMemoryDataFeed() {} - virtual void Init(const DataFeedDesc& data_feed_desc) = 0; + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0; 
virtual bool Start(); virtual int Next(); virtual void SetMemoryData(void* memory_data); @@ -231,8 +224,6 @@ class InMemoryDataFeed : public PrivateQueueDataFeed { virtual void LoadIntoMemory(); virtual void LocalShuffle(); virtual void GlobalShuffle(); - virtual int64_t GetChannelDataSize(); - virtual void ReleaseChannelData(); protected: virtual void AddInstanceToInsVec(T* vec_ins, const T& instance, @@ -257,9 +248,6 @@ class InMemoryDataFeed : public PrivateQueueDataFeed { std::shared_ptr> shuffled_ins_; std::shared_ptr> shuffled_ins_out_; int64_t fleet_send_batch_size_; - // sleep after send is to slow down sending data, but it's trick, - // should be removed later. - int64_t fleet_send_sleep_seconds_; }; // This class define the data type of instance(ins_vec) in MultiSlotDataFeed @@ -267,25 +255,16 @@ class MultiSlotType { public: MultiSlotType() {} ~MultiSlotType() {} - void Init(const std::string& type, size_t reserved_size = 0) { + void Init(const std::string& type) { CheckType(type); if (type_[0] == 'f') { float_feasign_.clear(); - if (reserved_size) { - float_feasign_.reserve(reserved_size); - } } else if (type_[0] == 'u') { uint64_feasign_.clear(); - if (reserved_size) { - uint64_feasign_.reserve(reserved_size); - } } type_ = type; } - void InitOffset(size_t max_batch_size = 0) { - if (max_batch_size > 0) { - offset_.reserve(max_batch_size + 1); - } + void InitOffset() { offset_.resize(1); // LoDTensor' lod is counted from 0, the size of lod // is one size larger than the size of data. @@ -301,16 +280,6 @@ class MultiSlotType { CheckUint64(); uint64_feasign_.push_back(v); } - void CopyValues(const float* input, size_t size) { - CheckFloat(); - float_feasign_.resize(size); - memcpy(float_feasign_.data(), input, size * sizeof(float)); - } - void CopyValues(const uint64_t* input, size_t size) { - CheckUint64(); - uint64_feasign_.resize(size); - memcpy(uint64_feasign_.data(), input, size * sizeof(uint64_t)); - } void AddIns(const MultiSlotType& ins) { if (ins.GetType()[0] == 'f') { // float CheckFloat(); @@ -324,22 +293,11 @@ class MultiSlotType { uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end()); } } - void AppendValues(const uint64_t* input, size_t size) { - CheckUint64(); - offset_.push_back(offset_.back() + size); - uint64_feasign_.insert(uint64_feasign_.end(), input, input + size); - } - void AppendValues(const float* input, size_t size) { - CheckFloat(); - offset_.push_back(offset_.back() + size); - float_feasign_.insert(float_feasign_.end(), input, input + size); - } const std::vector& GetFloatData() const { return float_feasign_; } std::vector& MutableFloatData() { return float_feasign_; } const std::vector& GetUint64Data() const { return uint64_feasign_; } std::vector& MutableUint64Data() { return uint64_feasign_; } const std::string& GetType() const { return type_; } - size_t GetBatchSize() { return offset_.size() - 1; } std::string& MutableType() { return type_; } std::string DebugString() { @@ -389,7 +347,7 @@ class MultiSlotDataFeed public: MultiSlotDataFeed() {} virtual ~MultiSlotDataFeed() {} - virtual void Init(const DataFeedDesc& data_feed_desc); + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc); virtual bool CheckFile(const char* filename); // virtual void ReadThread(); @@ -408,7 +366,7 @@ class MultiSlotInMemoryDataFeed public: MultiSlotInMemoryDataFeed() {} virtual ~MultiSlotInMemoryDataFeed() {} - virtual void Init(const DataFeedDesc& data_feed_desc); + virtual void Init(const paddle::framework::DataFeedDesc& 
data_feed_desc); protected: virtual void AddInstanceToInsVec(std::vector* vec_ins, @@ -423,54 +381,5 @@ class MultiSlotInMemoryDataFeed const std::string& str); }; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -template -class PrivateInstantDataFeed : public DataFeed { - public: - PrivateInstantDataFeed() {} - virtual ~PrivateInstantDataFeed() {} - void Init(const DataFeedDesc& data_feed_desc) override; - bool Start() override { return true; } - int Next() override; - - protected: - // The batched data buffer - std::vector ins_vec_; - - // This function is used to preprocess with a given filename, e.g. open it or - // mmap - virtual bool Preprocess(const std::string& filename) = 0; - - // This function is used to postprocess system resource such as closing file - // NOTICE: Ensure that it is safe to call before Preprocess - virtual bool Postprocess() = 0; - - // The reading and parsing method. - virtual bool ParseOneMiniBatch() = 0; - - // This function is used to put ins_vec to feed_vec - virtual void PutToFeedVec(); -}; - -class MultiSlotFileInstantDataFeed - : public PrivateInstantDataFeed> { - public: - MultiSlotFileInstantDataFeed() {} - virtual ~MultiSlotFileInstantDataFeed() {} - - protected: - int fd_{-1}; - char* buffer_{nullptr}; - size_t end_{0}; - size_t offset_{0}; - - bool Preprocess(const std::string& filename) override; - - bool Postprocess() override; - - bool ParseOneMiniBatch() override; -}; -#endif - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index ec1acad99bc..201d6c0d0b9 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -64,8 +64,5 @@ std::shared_ptr DataFeedFactory::CreateDataFeed( REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); -#endif } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index bbcd34260e3..72c50518af0 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -13,13 +13,11 @@ // limitations under the License. 
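// The MultiSlotFileInstantDataFeed::Preprocess/Postprocess pair deleted from
// data_feed.cc above maps a whole input file into memory and then walks it
// with a raw offset instead of issuing buffered reads. A minimal sketch of
// that mmap-then-scan lifecycle under POSIX; MappedFile is an illustrative
// name and error handling is reduced to early returns:
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#include <cstddef>

struct MappedFile {
  int fd = -1;
  char* data = nullptr;
  size_t size = 0;

  bool Open(const char* path) {  // mirrors Preprocess()
    fd = open(path, O_RDONLY);
    if (fd == -1) return false;
    struct stat sb;
    if (fstat(fd, &sb) == -1) return false;
    size = static_cast<size_t>(sb.st_size);
    void* p = mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (p == MAP_FAILED) return false;
    data = static_cast<char*>(p);
    return true;
  }
  void Close() {  // mirrors Postprocess(): unmap first, then close the fd
    if (data != nullptr) munmap(data, size);
    if (fd != -1) close(fd);
    data = nullptr;
    fd = -1;
    size = 0;
  }
};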
#include "paddle/fluid/framework/data_layout_transform.h" -#include #include #include "paddle/fluid/operators/math/math_function.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" #endif namespace paddle { @@ -147,6 +145,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, memory::data_type in_type = ToMKLDNNDataType(in.type()); PADDLE_ENFORCE(in_type != memory::data_type::data_undef, "Input tensor type is not supported: %s", in.type()); + memory::data_type out_type = in_type; auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); auto out_format = @@ -157,21 +156,14 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, if (in_format != out_format) { void* in_data = GetDataFromTensor(in, in_type); - const std::string key = platform::ReorderMKLDNNHandler::GetHash( - in_tz, in_format, out_format, std::to_string(in_type)); + auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, *dev_ctx, - cpu_engine, key); + auto in_memory = + memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); + auto out_memory = + memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); - auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data); - auto reorder_dst_memory_p = - handler.AcquireDstMemory(out, out_format, expected_kernel_type.place_); - auto reorder_p = - handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); - - std::vector pipeline; - pipeline.push_back(*reorder_p); - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + platform::Reorder(in_memory, out_memory); } else { out->ShareDataWith(in); } diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 1b3edeed103..a3b7b1e454e 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -141,9 +141,6 @@ template void DatasetImpl::ReleaseMemory() { VLOG(3) << "DatasetImpl::ReleaseMemory() begin"; std::vector().swap(memory_data_); - for (int i = 0; i < readers_.size(); ++i) { - readers_[i]->ReleaseChannelData(); - } VLOG(3) << "DatasetImpl::ReleaseMemory() end"; } @@ -181,10 +178,8 @@ void DatasetImpl::GlobalShuffle() { if (readers_.size() == 0) { CreateReaders(); } - auto fleet_ptr = FleetWrapper::GetInstance(); - // local shuffle all data before global shuffle - std::shuffle(memory_data_.begin(), memory_data_.end(), - fleet_ptr->LocalRandomEngine()); + // if it is not InMemory, memory_data_ is empty + std::random_shuffle(memory_data_.begin(), memory_data_.end()); VLOG(3) << "start global shuffle threads"; std::vector global_shuffle_threads; for (int i = 0; i < thread_num_; ++i) { @@ -265,20 +260,6 @@ void DatasetImpl::DestroyReaders() { } } -template -int64_t DatasetImpl::GetMemoryDataSize() { - return memory_data_.size(); -} - -template -int64_t DatasetImpl::GetShuffleDataSize() { - int64_t sum = 0; - for (int i = 0; i < readers_.size(); ++i) { - sum += readers_[i]->GetChannelDataSize(); - } - return sum; -} - template int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, const std::string& msg) { @@ -286,7 +267,7 @@ int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, VLOG(3) << "ReceiveFromClient msg_type=" << msg_type << ", client_id=" << client_id << ", msg length=" << msg.length(); auto fleet_ptr = FleetWrapper::GetInstance(); - int64_t index = fleet_ptr->LocalRandomEngine()() % 
thread_num_; + int64_t index = rand_r(&rand_seed) % thread_num_; VLOG(3) << "ramdom index=" << index; readers_[index]->PutInsToChannel(msg); #endif diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index ffbc7bfd95b..bbe0f937abf 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -85,10 +85,6 @@ class Dataset { virtual void CreateReaders() = 0; // destroy readers virtual void DestroyReaders() = 0; - // get memory data size - virtual int64_t GetMemoryDataSize() = 0; - // get shuffle data size - virtual int64_t GetShuffleDataSize() = 0; protected: virtual int ReceiveFromClient(int msg_type, int client_id, @@ -131,8 +127,6 @@ class DatasetImpl : public Dataset { virtual void GlobalShuffle(); virtual void CreateReaders(); virtual void DestroyReaders(); - virtual int64_t GetMemoryDataSize(); - virtual int64_t GetShuffleDataSize(); protected: virtual int ReceiveFromClient(int msg_type, int client_id, diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 4eba8177c56..615cfaa4f31 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -93,6 +93,6 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS fuse_elewise_add_act_pass multi_batch_merge_pass fuse_relu_depthwise_conv_pass memory_optimize_pass lock_free_optimize_pass - alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass + alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass record_skip_memory_opt_vars_pass) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 2f001e54d4f..c9f06c64e44 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -35,9 +35,16 @@ namespace details { AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, - const platform::NCCLCommunicator *ctxs) - : NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) { - PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + const platform::NCCLContextMap *ctxs) + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + nccl_ctxs_(ctxs) { + if (nccl_ctxs_) { + for (auto &p : places_) { + this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); + } + } } #else AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, @@ -64,9 +71,7 @@ void AllReduceOpHandle::RunAllReduceFuncs( if (FLAGS_sync_nccl_allreduce) { for (auto &p : places_) { int dev_id = boost::get(p).device; - auto *nccl_ctxs = - nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_); - auto &nccl_ctx = nccl_ctxs->at(dev_id); + auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); cudaError_t e_sync = cudaStreamSynchronize(stream); if (e_sync != 0) { @@ -129,12 +134,21 @@ void AllReduceOpHandle::RunImpl() { numel = static_cast(lod_tensor.numel()); } + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + VLOG(10) << "before all reduce buffer:" << buffer << ", numel:" << numel + << ", dev_id:" << dev_id << ", dtype:" << dtype + << ", place:" << p; + all_reduce_calls.emplace_back([=] { - NCCLAllReduce(p, buffer, buffer, numel, - static_cast(dtype), ncclSum); + 
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(dtype), ncclSum, + comm, stream)); }); } - VLOG(10) << "allreduce size:" << numel * SizeOfType(lod_tensors[0]->type()); RunAllReduceFuncs(all_reduce_calls); #else PADDLE_THROW("Not compiled with CUDA"); diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index f206f5fea5c..3effd0a8517 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -21,7 +21,6 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -#include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -29,15 +28,13 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -class AllReduceOpHandle : public NCCLOpHandleBase { +class AllReduceOpHandle : public OpHandleBase { public: +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, - const platform::NCCLCommunicator *ctxs); + const platform::NCCLContextMap *ctxs); #else -class AllReduceOpHandle : public OpHandleBase { - public: AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places); #endif @@ -49,17 +46,13 @@ class AllReduceOpHandle : public OpHandleBase { protected: void RunImpl() override; - std::vector local_scopes_; -#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) - // NCCLOpHandleBase already have these attributes. - // Will polish it by class inheritance framework. + std::vector local_scopes_; std::vector places_; -#endif - #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) void RunAllReduceFuncs( const std::vector> &all_reduce_calls); + const platform::NCCLContextMap *nccl_ctxs_; #endif }; diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index ce7849cb419..7f63c07b18f 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -51,39 +51,45 @@ void ProcessGraph(std::vector graphs, Scope *scope) { VLOG(3) << "ProcessGraph"; RpcCtxMap send_varname_to_ctx; RpcCtxMap recv_varname_to_ctx; - for (auto &node : graphs[0]->Nodes()) { - VLOG(3) << "node name " << node->Name(); - if (node && node->IsOp()) { - if (node->Name() == "send") { - auto send_var_name = node->Op()->Input("X")[0]; - auto send_varnames = boost::get>( - node->Op()->GetNullableAttr("send_varnames")); - auto epmap = boost::get>( - node->Op()->GetNullableAttr("epmap")); - auto height_section = boost::get>( - node->Op()->GetNullableAttr("sections")); - auto trainer_id = - boost::get(node->Op()->GetNullableAttr("trainer_id")); - send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext( - send_var_name, send_varnames, epmap, height_section, trainer_id); - VLOG(3) << "find and init an send op: " - << send_varname_to_ctx[send_var_name]; - } else if (node->Name() == "recv") { - auto recv_var_name = node->Op()->Output("Out")[0]; - auto recv_varnames = boost::get>( - node->Op()->GetNullableAttr("recv_varnames")); - auto epmap = boost::get>( - node->Op()->GetNullableAttr("epmap")); - auto trainer_id = - boost::get(node->Op()->GetNullableAttr("trainer_id")); - 
recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext( - recv_var_name, recv_varnames, epmap, {}, trainer_id); - VLOG(3) << "find and remove an recv op: " - << recv_varname_to_ctx[recv_var_name]; + for (auto i = 0; i < graphs.size(); ++i) { + std::vector nodes_to_delete; + for (auto &node : graphs[i]->Nodes()) { + VLOG(3) << "node name " << node->Name(); + if (node && node->IsOp()) { + if (node->Name() == "send") { + auto send_var_name = node->Op()->Input("X")[0]; + auto send_varnames = boost::get>( + node->Op()->GetNullableAttr("send_varnames")); + auto epmap = boost::get>( + node->Op()->GetNullableAttr("epmap")); + auto height_section = boost::get>( + node->Op()->GetNullableAttr("sections")); + auto trainer_id = + boost::get(node->Op()->GetNullableAttr("trainer_id")); + send_varname_to_ctx[send_var_name] = + operators::distributed::RpcContext(send_var_name, send_varnames, + epmap, height_section, + trainer_id); + VLOG(3) << "find and init an send op: " + << send_varname_to_ctx[send_var_name]; + } else if (node->Name() == "recv") { + auto recv_var_name = node->Op()->Output("Out")[0]; + auto recv_varnames = boost::get>( + node->Op()->GetNullableAttr("recv_varnames")); + auto epmap = boost::get>( + node->Op()->GetNullableAttr("epmap")); + auto trainer_id = + boost::get(node->Op()->GetNullableAttr("trainer_id")); + recv_varname_to_ctx[recv_var_name] = + operators::distributed::RpcContext(recv_var_name, recv_varnames, + epmap, {}, trainer_id); + nodes_to_delete.push_back(node); + VLOG(3) << "find and remove an recv op: " + << recv_varname_to_ctx[recv_var_name]; + } } } } - // init communicator here if (send_varname_to_ctx.size() > 0) { VLOG(3) << "this is distribute mode, will use communicator"; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 3b57a099c8a..845fdf511e4 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include -#include #include #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/ir/graph.h" @@ -27,8 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h" -DECLARE_bool(use_mkldnn); - namespace paddle { namespace framework { namespace details { @@ -49,7 +46,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { : ir::PassBuilder(), strategy_(strategy) { // Add a graph viz pass to record a graph. if (!strategy_.debug_graphviz_path_.empty()) { - VLOG(1) << "Add graph_viz_pass"; auto viz_pass = AppendPass("graph_viz_pass"); const std::string graph_path = string::Sprintf( "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph"); @@ -57,27 +53,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } // Note(zcd): record_skip_memory_opt_vars_pass should be the first pass. - VLOG(1) << "Add record_skip_memory_opt_vars_pass"; AppendPass("record_skip_memory_opt_vars_pass"); -#ifdef PADDLE_WITH_MKLDNN - if (FLAGS_use_mkldnn) { - VLOG(1) << "Add mkldnn_placement_pass"; - AppendPass("mkldnn_placement_pass"); - } else if (!strategy_.mkldnn_enabled_op_types_.empty()) { - LOG(WARNING) - << "mkldnn_enabled_op_types specify the operator type list to " - "use MKLDNN acceleration. 
It is null in default, means " - "that all the operators supported by MKLDNN will be " - "accelerated. And it should not be set when " - "FLAGS_use_mkldnn=false."; - } -#else - PADDLE_ENFORCE(!FLAGS_use_mkldnn, - "Please compile with MKLDNN first to use MKLDNN"); -#endif if (strategy_.enable_sequential_execution_) { - VLOG(1) << "Add sequential_execution_pass"; + VLOG(5) << "Add sequential_execution_pass"; AppendPass("sequential_execution_pass"); } @@ -88,7 +67,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add op fusion. if (strategy.fuse_relu_depthwise_conv_) { - VLOG(1) << "Add fuse_relu_depthwise_conv_pass"; + VLOG(5) << "Add fuse_relu_depthwise_conv_pass"; AppendPass("fuse_relu_depthwise_conv_pass"); } @@ -100,19 +79,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add automatically inplace. if (strategy_.enable_inplace_) { - VLOG(1) << "Add inplace_pass"; + VLOG(5) << "Add inplace_pass"; AppendPass("inplace_pass"); } if (strategy_.fuse_elewise_add_act_ops_) { - VLOG(1) << "Add fuse_elewise_add_act_pass"; + VLOG(5) << "Add fuse_elewise_add_act_pass"; AppendPass("fuse_elewise_add_act_pass"); } // for single card training, fuse_all_reduce_ops is unnecessary. // alloc_continuous_space_for_grad_pass should be before of MultiDevPass. if (strategy_.fuse_all_reduce_ops_) { - VLOG(1) << "Add alloc_continuous_space_for_grad_pass"; + VLOG(5) << "Add alloc_continuous_space_for_grad_pass"; AppendPass("alloc_continuous_space_for_grad_pass"); } @@ -127,11 +106,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // NOTE: fuse_all_xx_ops will count the number of xx operator first, // if the number is zero, fuse_all_reduce_ops will do nothing. // Currently, only one type of optimization algorithm can be fused. - VLOG(1) << "Add fuse_adam_op_pass"; + VLOG(5) << "Add fuse_adam_op_pass"; AppendPass("fuse_adam_op_pass"); - VLOG(1) << "Add fuse_sgd_op_pass"; + VLOG(5) << "Add fuse_sgd_op_pass"; AppendPass("fuse_sgd_op_pass"); - VLOG(1) << "Add fuse_momentum_op_pass"; + VLOG(5) << "Add fuse_momentum_op_pass"; AppendPass("fuse_momentum_op_pass"); } } @@ -161,7 +140,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // A side-effect of that, memory optimize cannot forsee the fetched vars // , so fetchlist should be set persistable before call the Run interface. if (strategy_.memory_optimize_) { - VLOG(1) << "Add memory_optimize_pass"; + VLOG(5) << "Add memory_optimize_pass"; AppendPass("memory_optimize_pass"); } @@ -169,22 +148,26 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // all original and fused operators. But no operators can be enabled this // attr if putting it after MultiDevPass. if (strategy_.cache_runtime_context_) { - VLOG(1) << "Add runtime_context_cache_pass"; + VLOG(5) << "Add runtime_context_cache_pass"; AppendPass("runtime_context_cache_pass"); } + if (strategy_.cache_expected_kernel_) { + VLOG(10) << "Add expected_kernel_cache_pass"; + AppendPass("expected_kernel_cache_pass"); + } + AppendMultiDevPass(strategy_); if (strategy_.fuse_all_reduce_ops_) { // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator // first, if the number is zero, fuse_all_reduce_ops will do nothing. - VLOG(1) << "Add fuse_all_reduce_op_pass"; + VLOG(5) << "Add fuse_all_reduce_op_pass"; AppendPass("fuse_all_reduce_op_pass"); } // Add a graph print pass to record a graph with device info. 
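// ParallelExecutorPassBuilder above is a straight-line builder: each strategy
// flag conditionally appends a named pass, and the order of the AppendPass
// calls is the order the passes later run in. The pattern in isolation;
// Graph, Pass and PassPipeline are stand-ins, not the real ir::PassBuilder
// API:
#include <functional>
#include <string>
#include <utility>
#include <vector>

struct Graph {};
using Pass = std::function<void(Graph*)>;

class PassPipeline {
 public:
  void Append(std::string name, Pass pass) {
    passes_.emplace_back(std::move(name), std::move(pass));
  }
  void Run(Graph* g) {
    for (auto& p : passes_) p.second(g);  // fixed registration order
  }

 private:
  std::vector<std::pair<std::string, Pass>> passes_;
};

// Usage mirroring the builder: gate each Append on a strategy flag.
void Build(PassPipeline* p, bool fuse_elewise_add_act) {
  if (fuse_elewise_add_act) p->Append("fuse_elewise_add_act_pass", [](Graph*) {});
  p->Append("multi_devices_check_pass", [](Graph*) {});
}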
if (!strategy_.debug_graphviz_path_.empty()) { - VLOG(1) << "Add multi_devices_print_pass"; auto multi_devices_print_pass = AppendPass("multi_devices_print_pass"); const std::string graph_path = string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(), @@ -200,22 +183,16 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (!strategy_.enable_parallel_graph_ && (SeqOnlyAllReduceOps(strategy_) || strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) { - VLOG(1) << "Add all_reduce_deps_pass"; + VLOG(5) << "Add all_reduce_deps_pass"; AppendPass("all_reduce_deps_pass"); } - if (strategy_.enable_backward_optimizer_op_deps_) { - VLOG(1) << "Add backward_op_deps_pass"; - AppendPass("backward_optimizer_op_deps_pass"); - } - if (strategy_.remove_unnecessary_lock_) { - VLOG(1) << "Add modify_op_lock_and_record_event_pass"; + VLOG(5) << "Add modify_op_lock_and_record_event_pass"; AppendPass("modify_op_lock_and_record_event_pass"); } // Verify that the graph is correct for multi-device executor. - VLOG(1) << "Add multi_devices_check_pass"; AppendPass("multi_devices_check_pass"); } @@ -224,19 +201,18 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ir::Pass *multi_devices_pass = nullptr; if (strategy_.async_mode_) { - VLOG(1) << "Add async_multi_devices_pass"; multi_devices_pass = AppendPass("async_multi_devices_pass").get(); } else if (strategy_.is_distribution_) { - VLOG(1) + VLOG(5) << "Add dist_multi_devices_pass, multi device parameter server mode"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - VLOG(1) << "Add all_reduce_mode_multi_devices_pass"; + VLOG(5) << "Add all_reduce_mode_multi_devices_pass"; multi_devices_pass = AppendPass("all_reduce_mode_multi_devices_pass").get(); } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { - VLOG(1) << "Add reduce_mode_multi_devices_pass"; + VLOG(5) << "Add reduce_mode_multi_devices_pass"; multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); } else { PADDLE_THROW("Unknown reduce strategy."); @@ -273,7 +249,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, - platform::NCCLCommunicator *nccl_ctxs) const { + platform::NCCLContextMap *nccl_ctxs) const { #else const bool use_cuda) const { #endif @@ -295,9 +271,9 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Set(ir::kNRanks, new size_t(nranks)); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; + platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); - pass->SetNotOwned(kNCCLCtxs, nctx); + pass->SetNotOwned(kNCCLCtxs, nctx); #endif } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" || pass->Type() == "fuse_adam_op_pass" || @@ -311,12 +287,9 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, &local_scopes); if (pass->Type() == "fuse_all_reduce_op_pass") { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; + platform::NCCLContextMap *nctx = use_cuda ? 
nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); - pass->SetNotOwned(kNCCLCtxs, nctx); - pass->Erase(kUseHierarchicalAllReduce); - pass->Set(kUseHierarchicalAllReduce, - new bool(use_hierarchical_allreduce_)); + pass->SetNotOwned(kNCCLCtxs, nctx); #endif } } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") { @@ -329,14 +302,6 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; } else if (pass->Type() == "all_reduce_deps_pass") { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; - pass->Erase(kNCCLCtxs); - pass->SetNotOwned(kNCCLCtxs, nctx); - pass->Erase(kUseHierarchicalAllReduce); - pass->Set(kUseHierarchicalAllReduce, - new bool(use_hierarchical_allreduce_)); -#endif LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) << ", num_trainers:" << num_trainers_; } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { @@ -348,9 +313,6 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, } else if (pass->Type() == "inplace_pass") { pass->Erase(ir::kUseCuda); pass->Set(ir::kUseCuda, new bool(use_cuda)); - } else if (pass->Type() == "mkldnn_placement_pass") { - pass->Set("mkldnn_enabled_op_types", - new std::unordered_set(mkldnn_enabled_op_types_)); } VLOG(3) << "Start Apply Pass " << pass->Type(); graph = pass->Apply(graph); @@ -377,7 +339,6 @@ USE_PASS(multi_devices_print_pass); USE_PASS(memory_optimize_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); -USE_PASS(backward_optimizer_op_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); @@ -388,7 +349,5 @@ USE_PASS(fuse_sgd_op_pass); USE_PASS(fuse_momentum_op_pass); USE_PASS(fuse_all_reduce_op_pass); USE_PASS(runtime_context_cache_pass); +USE_PASS(expected_kernel_cache_pass); USE_PASS(record_skip_memory_opt_vars_pass); -#ifdef PADDLE_WITH_MKLDNN -USE_PASS(mkldnn_placement_pass); -#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 8eaace17bb1..b1601cfbcd5 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -16,7 +16,6 @@ #include #include -#include #include #include #include "paddle/fluid/framework/ir/pass_builder.h" @@ -80,8 +79,6 @@ struct BuildStrategy { bool fuse_all_reduce_ops_{false}; - bool enable_backward_optimizer_op_deps_{false}; - bool fuse_relu_depthwise_conv_{false}; bool sync_batch_norm_{false}; @@ -111,18 +108,7 @@ struct BuildStrategy { bool remove_unnecessary_lock_{true}; bool cache_runtime_context_{false}; - std::unordered_set mkldnn_enabled_op_types_; - - size_t nccl_comm_num_{1}; - // The picture is here: - // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396 - bool use_hierarchical_allreduce_{false}; - // Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu - // cards' number in most cases. - size_t hierarchical_allreduce_inter_nranks_{0}; - // Nccl ranks bewteen nodes when use hierarchical allreduce, it's setted to - // nodes number. 
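// The hierarchical_allreduce_* knobs being removed above configure a
// two-level collective: ranks first reduce inside a node, one leader per
// node then all-reduces across nodes, and the result is broadcast back
// within each node. A CPU sketch of what that computes; each inner vector
// stands for one GPU's buffer and gpus_per_node splits them into nodes
// (illustrative only, not the NCCL implementation):
#include <cstddef>
#include <vector>

void HierarchicalAllReduceSum(std::vector<std::vector<float>>* bufs,
                              size_t gpus_per_node) {
  if (bufs->empty() || gpus_per_node == 0) return;
  const size_t numel = (*bufs)[0].size();
  const size_t nodes = bufs->size() / gpus_per_node;
  // Level 1: intra-node reduce into each node's first rank.
  for (size_t n = 0; n < nodes; ++n)
    for (size_t g = 1; g < gpus_per_node; ++g)
      for (size_t i = 0; i < numel; ++i)
        (*bufs)[n * gpus_per_node][i] += (*bufs)[n * gpus_per_node + g][i];
  // Level 2: inter-node all-reduce among the node leaders.
  std::vector<float> total(numel, 0.f);
  for (size_t n = 0; n < nodes; ++n)
    for (size_t i = 0; i < numel; ++i) total[i] += (*bufs)[n * gpus_per_node][i];
  // Level 3: intra-node broadcast of the global sum.
  for (auto& b : *bufs) b = total;
}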
- size_t hierarchical_allreduce_exter_nranks_{0}; + bool cache_expected_kernel_{true}; // NOTE: // Before you add new options, think if it's a general strategy that works @@ -149,7 +135,7 @@ struct BuildStrategy { const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, - platform::NCCLCommunicator *nccl_ctxs) const; + platform::NCCLContextMap *nccl_ctxs) const; #else const bool use_cuda) const; #endif diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index f8723fe75f8..c8e27c7275f 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -20,7 +20,6 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -66,7 +65,6 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } void EagerDeletionOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); Scope *exec_scope = nullptr; std::deque> garbages; for (auto &name : var_names_) { diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index b33162edd2b..c69f148297a 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -20,7 +20,6 @@ #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -44,97 +43,35 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( bootstrap_ops_.emplace_back(op); } } - PADDLE_ENFORCE_GT(op_deps_.size(), 0, "The graph doesn't have operators."); + PrepareAtomicOpDeps(); } FeedFetchList FastThreadedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { - VLOG(3) << "enter FastThreadedSSAGraphExecutor Run"; - std::unique_ptr event( - new platform::RecordEvent("FastThreadedSSAGraphExecutorPrepare")); std::unique_ptr>> op_deps = atomic_op_deps_.get(); PrepareAtomicOpDeps(); - size_t num_ops = op_deps->size(); paddle::framework::FeedFetchList fetches; fetches.resize(fetch_tensors.size()); std::unordered_map> fetched_vars; - std::vector fetch_ops; + std::vector fetch_ops; std::vector ready_fetch_ops; - exception_.Clear(); - - InsertFetchOps(fetch_tensors, &fetches, &fetched_vars, op_deps.get(), - &fetch_ops, &ready_fetch_ops); - event.reset(nullptr); - if (strategy_.num_threads_ == 1 && traced_ops_.size() == num_ops) { - // If the num_threads is 1, we can record the order of operator's - // execution in the first iteration, and in subsequent iterations, - // run the recorded operators directly. This strategy could make the - // execution faster. 
- VLOG(3) << "Run the traced ops."; - RunTracedOps(traced_ops_); - RunTracedOps(fetch_ops); - if (exception_.IsCaught()) { - ExecutionFinal(&fetch_ops); - } - } else { - traced_ops_.clear(); - remaining_ = 0; - auto complete_q = std::make_shared>(); - for (auto op : bootstrap_ops_) { - RunOpAsync(op_deps.get(), op, complete_q); - } - for (auto op : ready_fetch_ops) { - RunOpAsync(op_deps.get(), op, complete_q); - } - - size_t num_complete = 0; - while (num_complete != op_deps->size()) { - size_t num_comp = complete_q->Pop(); - if (num_comp == -1UL) { - int remaining = 0; - while (true) { - remaining = remaining_; - if (remaining == 0) { - break; - } - for (int i = 0; i < remaining; ++i) { - complete_q->Pop(); - } - } - if (exception_.IsCaught()) { - ExecutionFinal(&fetch_ops); - } - } - num_complete += num_comp; - } - } - // Wait FetchOps. - ClearFetchOp(graph_, &fetch_ops); - return fetches; -} -void FastThreadedSSAGraphExecutor::InsertFetchOps( - const std::vector &fetch_tensors, FeedFetchList *fetches, - std::unordered_map> *fetched_vars, - std::unordered_map> *op_deps, - std::vector *fetch_ops, - std::vector *ready_fetch_ops) { for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_->Get(kGraphVars)) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { - (*fetched_vars)[fetch_var_name].push_back(*it->second.rbegin()); + fetched_vars[fetch_var_name].push_back(*it->second.rbegin()); } } } for (size_t i = 0; i < fetch_tensors.size(); ++i) { - auto &var_name = fetch_tensors.at(i); - auto fetched_var_it = fetched_vars->find(var_name); - PADDLE_ENFORCE(fetched_var_it != fetched_vars->end(), + auto &var_name = fetch_tensors[i]; + auto fetched_var_it = fetched_vars.find(var_name); + PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), "Cannot find fetched variable(%s).(Perhaps the main_program " "is not set to ParallelExecutor)", var_name); @@ -143,8 +80,8 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps( ir::Node *fetch_node = graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); - auto *op = new FetchOpHandle(fetch_node, fetches, i, &local_scopes_); - fetch_ops->emplace_back(op); + auto *op = new FetchOpHandle(fetch_node, &fetches, i, &local_scopes_); + fetch_ops.emplace_back(op); for (auto &p : places_) { op->SetDeviceContext(p, fetch_ctxs_.Get(p)); @@ -157,22 +94,55 @@ void FastThreadedSSAGraphExecutor::InsertFetchOps( int dep = static_cast(op->NotReadyInputSize()); (*op_deps)[op] = dep; if (dep == 0) { - ready_fetch_ops->emplace_back(op); + ready_fetch_ops.emplace_back(op); + } + } + + size_t num_complete = 0; + remaining_ = 0; + auto complete_q = std::make_shared>(); + for (auto op : bootstrap_ops_) { + RunOpAsync(op_deps.get(), op, complete_q); + } + for (auto op : ready_fetch_ops) { + RunOpAsync(op_deps.get(), op, complete_q); + } + while (num_complete != op_deps->size()) { + size_t num_comp = complete_q->Pop(); + if (num_comp == -1UL) { + int remaining = 0; + while (true) { + remaining = remaining_; + if (remaining == 0) { + break; + } + for (int i = 0; i < remaining; ++i) { + complete_q->Pop(); + } + } + if (exception_.IsCaught()) { + ClearFetchOp(graph_, &fetch_ops); + exception_.ReThrow(); + } } + num_complete += num_comp; } + // Wait FetchOps. 
+ ClearFetchOp(graph_, &fetch_ops); + return fetches; } bool FastThreadedSSAGraphExecutor::RunOp( OpHandleBase *op, const std::shared_ptr> &complete_q, size_t *complete) { - RunOpSync(op); - if (LIKELY(!exception_.IsCaught())) { + try { if (LIKELY(!strategy_.dry_run_)) { - RecordOps(op); + op->Run(strategy_.use_cuda_); } ++(*complete); return true; - } else { + } catch (...) { + exception_.Catch(std::current_exception()); --remaining_; complete_q->Push(-1UL); return false; @@ -224,7 +194,6 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( complete_q->Push(complete); }); } - void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { atomic_op_deps_ = prepare_pool_.enqueue([&] { auto *op_deps = new std::unordered_map>; @@ -237,44 +206,6 @@ void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { } const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; } - -void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) { - if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) { - traced_ops_.emplace_back(op); - } -} - -void FastThreadedSSAGraphExecutor::ExecutionFinal( - std::vector *fetch_ops) { - VLOG(3) << "caught exception " << exception_.Type() << ", rethrow it"; - ClearFetchOp(graph_, fetch_ops); - exception_.ReThrow(); -} - -void FastThreadedSSAGraphExecutor::RunTracedOps( - const std::vector &traced_ops) { - for (auto &op : traced_ops) { - if (exception_.IsCaught()) { - return; - } - RunOpSync(op); - } -} - -void FastThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) { - try { - if (VLOG_IS_ON(10)) { - VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); - } - if (LIKELY(!strategy_.dry_run_)) { - op->Run(strategy_.use_cuda_); - } - VLOG(10) << op << " " << op->Name() << " Done "; - } catch (...) { - exception_.Catch(std::current_exception()); - } -} - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index d88e5bbaa97..234da5b9254 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -60,8 +60,6 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { ::ThreadPool pool_; ::ThreadPool prepare_pool_; - std::vector traced_ops_; - bool RunOp(OpHandleBase *op, const std::shared_ptr> &complete_q, size_t *complete); @@ -71,22 +69,6 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { const std::shared_ptr> &complete_q); void PrepareAtomicOpDeps(); - - inline void RecordOps(OpHandleBase *op); - - inline void ExecutionFinal(std::vector *fetch_ops); - - inline void RunOpSync(OpHandleBase *op); - - void RunTracedOps(const std::vector &traced_ops); - - void InsertFetchOps( - const std::vector &fetch_tensors, FeedFetchList *fetches, - std::unordered_map> - *fetched_vars, - std::unordered_map> *op_deps, - std::vector *fetch_ops, - std::vector *ready_fetch_ops); }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 4d96d820a1d..a57d670f118 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -44,10 +44,17 @@ typedef std::vector>> FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, const std::vector 
&places, const size_t num_of_all_reduce, - const platform::NCCLCommunicator *ctxs) - : NCCLOpHandleBase(node, places, ctxs), + const platform::NCCLContextMap *ctxs) + : OpHandleBase(node), local_scopes_(local_scopes), - num_of_all_reduce_(num_of_all_reduce) { + places_(places), + num_of_all_reduce_(num_of_all_reduce), + nccl_ctxs_(ctxs) { + if (nccl_ctxs_) { + for (auto &p : places_) { + this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); + } + } PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); } #else @@ -160,14 +167,17 @@ void FusedAllReduceOpHandle::RunImpl() { auto &p = places_[i]; void *buffer = const_cast(lod_tensor_data.at(i)); + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; all_reduce_calls.emplace_back([=] { - NCCLAllReduce(p, buffer, buffer, numel, - static_cast(nccl_dtype), ncclSum); + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(nccl_dtype), + ncclSum, comm, stream)); }); } - VLOG(10) << "fusedallreduce size:" << numel * SizeOfType(dtype); - this->RunAndRecordEvent([&] { if (all_reduce_calls.size() == 1UL) { // Do not use NCCLGroup when manage NCCL by per thread per device diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index e0b9123c5b7..79772c61f8c 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -21,7 +21,6 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -#include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -29,15 +28,14 @@ namespace paddle { namespace framework { namespace details { +struct FusedAllReduceOpHandle : public OpHandleBase { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -struct FusedAllReduceOpHandle : public NCCLOpHandleBase { FusedAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const size_t num_of_all_reduce, - const platform::NCCLCommunicator *ctxs); + const platform::NCCLContextMap *ctxs); #else -struct FusedAllReduceOpHandle : public OpHandleBase { FusedAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -54,12 +52,11 @@ struct FusedAllReduceOpHandle : public OpHandleBase { private: std::vector local_scopes_; -#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) - // NCCLOpHandleBase already have these attributes. - // Will polish it by class inheritance framework. std::vector places_; -#endif size_t num_of_all_reduce_; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + const platform::NCCLContextMap *nccl_ctxs_; +#endif // Check the dtype of the input void GetDTypeAndNumel( diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index e97e5f439d1..6e6ef074db3 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -45,7 +45,6 @@ constexpr char kGraphVars[] = "vars"; constexpr char kPlaces[] = "places"; constexpr char kLocalScopes[] = "local_scopes"; constexpr char kNCCLCtxs[] = "nccl_ctxs"; -constexpr char kUseHierarchicalAllReduce[] = "use_hierarchical_allreduce"; // aux variables to represent dependency. Useful to resolve data hazard. 
typedef std::unordered_set GraphDepVars; diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index b0e6a87bdde..69cd84ebf2d 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -20,7 +20,7 @@ namespace framework { namespace details { std::string OpHandleBase::DebugString() const { std::stringstream ss; - ss << Name() << "("; + ss << "("; for (auto *var : inputs_) { ss << var->DebugString() << ", "; } @@ -187,11 +187,6 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { std::function method = callback; for (auto &p : dev_ctxes_) { method = [method, p, this]() { - VLOG(10) << "cudadevicecontext:" - << static_cast(p.second) - << ", dev_id:" - << boost::get(p.first).device; - static_cast(p.second)->RecordEvent( events_.at(boost::get(p.first).device), method); diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 68be353e346..1bd27263f7d 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -95,7 +95,6 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( auto seq_allreduce_pass = ir::PassRegistry::Instance().Get("all_reduce_deps_pass"); - seq_allreduce_pass->Set(kUseHierarchicalAllReduce, new bool(false)); for (size_t i = 0; i < graphs_.size(); ++i) { graphs_[i].reset(seq_allreduce_pass->Apply(graphs_[i].release())); } diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index a87b03451bb..3e082f247ad 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -14,7 +14,6 @@ #include "paddle/fluid/framework/details/rpc_op_handle.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -30,8 +29,6 @@ RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc, place_(place) {} void RPCOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); - for (auto *in : inputs_) { auto &p = static_cast(in)->place(); if (ir::IsControlDepVar(*in->Node())) { diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 67b4fed0d30..6924549f36d 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -13,8 +13,8 @@ // limitations under the License. 
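// OpHandleBase::RunAndRecordEvent above wraps the callback once per device
// context, building a chain of lambdas so the innermost call does the work
// and every outer layer records an event on its device afterwards. The
// wrapping trick in isolation; DeviceCtx::RecordEvent is a stand-in for the
// CUDA event recording done by the real device context:
#include <functional>
#include <utility>
#include <vector>

struct DeviceCtx {
  void RecordEvent(const std::function<void()>& work) {
    work();  // real impl: run `work`, then record an event on this stream
  }
};

void RunAndRecord(const std::vector<DeviceCtx*>& ctxs,
                  std::function<void()> callback) {
  std::function<void()> method = std::move(callback);
  for (auto* ctx : ctxs) {
    // Capture the previous chain by value and nest it one level deeper.
    method = [method, ctx] { ctx->RecordEvent(method); };
  }
  method();
}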
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" + #include -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -67,7 +67,6 @@ struct ScaleLossGradFunctor { }; void ScaleLossGradOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); // Doesn't wait any event std::string var_name = static_cast(this->outputs_[0])->name(); auto &local_scope = *scope_->FindVar(kLocalExecScopeName)->Get(); diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 5bbbf07e6d9..247d7847934 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -36,10 +36,26 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( FeedFetchList ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { if (drop_scope_counter_ == 0) { - platform::RecordEvent e("InitLocalExeScopes"); - PrepareLocalExeScopes(); - } + // Create local scopes. + for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { + auto &scope = *it; + Scope &local_scope = scope->NewScope(); + *scope->Var(details::kLocalExecScopeName)->GetMutable() = + &local_scope; + + for (auto &info : var_infos_) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); + } + } + } + } std::vector fetch_data; std::exception_ptr eptr = nullptr; try { @@ -48,7 +64,9 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( eptr = std::current_exception(); } + platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun"); ++drop_scope_counter_; + if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { DropLocalExeScopes(); } @@ -60,40 +78,16 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() { - platform::RecordEvent drop_scope_event("DropLocalExeScopes"); drop_scope_counter_ = 0; for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } for (auto &scope : local_scopes_) { - auto *local_scope_var = scope->FindLocalVar(details::kLocalExecScopeName); - if (local_scope_var != nullptr) { - auto &local_scope = *local_scope_var->GetMutable(); - scope->DeleteScope(local_scope); - scope->EraseVars({std::string(details::kLocalExecScopeName)}); - VLOG(3) << "Drop local execution scope: " << local_scope; - } - } -} - -void ScopeBufferedSSAGraphExecutor::PrepareLocalExeScopes() { - // Create local scopes. 
- for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { - auto &scope = *it; - Scope &local_scope = scope->NewScope(); - *scope->Var(kLocalExecScopeName)->GetMutable() = &local_scope; - - for (auto &info : var_infos_) { - if (scope->FindVar(info.name_) != nullptr) { - continue; - } - if (info.persistable_) { // Persistable - InitializeVariable(scope->Var(info.name_), info.type_); - } else { - InitializeVariable(local_scope.Var(info.name_), info.type_); - } - } + auto &local_scope = + *scope->Var(details::kLocalExecScopeName)->GetMutable(); + scope->DeleteScope(local_scope); + VLOG(3) << "Drop local execution scope: " << local_scope; } } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index e0388be305f..030777cad89 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -13,8 +13,7 @@ // limitations under the License. #pragma once -#include -#include + #include #include #include @@ -52,8 +51,6 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { bool NeedCreateLocalExeScope(); - void PrepareLocalExeScopes(); - private: size_t drop_scope_counter_{0}; ExecutionStrategy strategy_; diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index cc3493d849e..1bdd33fd535 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -30,7 +30,7 @@ namespace details { SparseAllReduceOpHandle::SparseAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, const std::vector &places, - const platform::NCCLCommunicator *ctxs, bool is_encoded, int nranks) + const platform::NCCLContextMap *ctxs, bool is_encoded, int nranks) : AllReduceOpHandle(node, local_scopes, places, ctxs), is_encoded_(is_encoded), nranks_(nranks) { @@ -102,8 +102,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { out_numel = (out_numel == 0) ? 
      static_cast<size_t>(out.numel()) : out_numel;
   int dev_id = boost::get<platform::CUDAPlace>(place).device;
-  auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false);
-  auto &nccl_ctx = nccl_ctxs->at(dev_id);
+  auto &nccl_ctx = nccl_ctxs_->at(dev_id);
   auto stream = nccl_ctx.stream();
   auto comm = nccl_ctx.comm_;
diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
index 9802f8dba7e..ed6be65a2c8 100644
--- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
@@ -32,7 +32,7 @@ class SparseAllReduceOpHandle : public AllReduceOpHandle {
   SparseAllReduceOpHandle(ir::Node *node,
                           const std::vector<Scope *> &local_scopes,
                           const std::vector<platform::Place> &places,
-                          const platform::NCCLCommunicator *ctxs,
+                          const platform::NCCLContextMap *ctxs,
                           bool is_encoded = false, int nranks = -1);
   std::string Name() const override;
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc
index 4f1e44ca26c..af2cbd5c876 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/ssa_graph_executor.cc
@@ -19,13 +19,10 @@ namespace framework {
 namespace details {
 SSAGraphExecutor::~SSAGraphExecutor() {}
-void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
+void ClearFetchOp(ir::Graph* graph, std::vector<FetchOpHandle*>* fetch_ops) {
   if (fetch_ops->empty()) return;
   for (auto& op : *fetch_ops) {
-    PADDLE_ENFORCE_NOT_NULL(
-        dynamic_cast<FetchOpHandle*>(op),
-        "The input ops of ClearFetchOp function should be FetchOpHandle.");
     for (auto& out_var : op->Node()->outputs) {
       graph->RemoveNode(out_var);
     }
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
index 2454ec2b27d..860eaa25b58 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -38,7 +38,7 @@ class SSAGraphExecutor {
   virtual FeedFetchList Run(const std::vector<std::string>& fetch_tensors) = 0;
 };
-void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops);
+void ClearFetchOp(ir::Graph* graph, std::vector<FetchOpHandle*>* fetch_ops);
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index ac62f1dd833..67246a4dd44 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -53,84 +53,74 @@ inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl(
       new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
   std::unique_ptr<OpDependentData> op_deps = op_deps_futures_.get();
   CopyOpDeps();
-  VLOG(10) << "ThreadedSSAGraphExecutor::Run";
   std::shared_ptr<BlockingQueue<VarHandleBase *>> ready_vars(
       new BlockingQueue<VarHandleBase *>);
   auto &pending_ops = op_deps->pending_ops_;
   auto &pending_vars = op_deps->pending_vars_;
   auto &ready_ops = op_deps->ready_ops_;
-  size_t num_ops = op_deps->num_ops_;
+
+  // For ops (e.g. nccl_all_reduce) that need to coordinate multiple
+  // streams from multiple GPUs, it's faster to buffer them and schedule
+  // together since we currently cannot overlap computation and memcpy streams.
+  // Should revisit it if overlapping is available.
+  std::unordered_set<OpHandleBase *> delayed_ops;
   // Step 2. Insert FetchOps
-  std::vector<OpHandleBase *> fetch_ops;
+  std::vector<FetchOpHandle *> fetch_ops;
   std::unordered_set<VarHandleBase *> fetch_dependencies;
   FeedFetchList fetch_data(fetch_tensors.size());
   InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &ready_ops,
                  &pending_ops, &pending_vars, &fetch_data);
+  auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
+    for (auto *op : set) {
+      RunOp(ready_vars, op);
+    }
+    set.clear();
+  };
+  // Clean run context
+  run_op_futures_.clear();
   exception_holder_.Clear();
   event.reset(nullptr);
-  // Step 3. Execution
-  if (strategy_.num_threads_ == 1 && traced_ops_.size() == num_ops) {
-    // If the num_threads is 1, we can record the order of operator's
-    // execution in the first iteration, and in subsequent iterations,
-    // run the recorded operators directly. This strategy could make the
-    // execution faster.
-    VLOG(3) << "Run the traced ops.";
-    RunTracedOps(traced_ops_);
-    RunTracedOps(fetch_ops);
-    if (exception_holder_.IsCaught()) {
-      ExecutionFinal(&fetch_ops);
-    }
-  } else {
-    traced_ops_.clear();
-    auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
-      for (auto *op : set) {
-        RunOp(ready_vars, op);
-      }
-      set.clear();
-    };
-    // Clean run context
-    run_op_futures_.clear();
-
-    while (!pending_vars.empty()) {
-      // 1. Run All Ready ops
-      // Keep loop until all vars are ready.
-      run_all_ops(ready_ops);
-
-      // 2. Find ready variable
-      bool timeout;
-      auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
-      if (timeout) {
+  while (!pending_vars.empty()) {
+    // 1. Run All Ready ops
+    // Keep loop until all vars are ready.
+    run_all_ops(ready_ops);
+
+    // 2. Find ready variable
+    bool timeout;
+    auto cur_ready_vars = ready_vars->PopAll(1, &timeout);
+    if (timeout) {
+      if (exception_holder_.IsCaught()) {
+        VLOG(3) << "caught exception " << exception_holder_.Type()
+                << ", rethrow it";
         for (auto &run_op_future : run_op_futures_) {
           run_op_future.wait();
         }
-        if (exception_holder_.IsCaught()) {
-          ExecutionFinal(&fetch_ops);
-        } else {
-          continue;
-        }
+        ClearFetchOp(graph_, &fetch_ops);
+        exception_holder_.ReThrow();
+      } else {
+        continue;
       }
+    }
-      // 3. Remove the dependency of ready_var.
-      // Find the ready_ops after the ready_var.
-      for (auto ready_var : cur_ready_vars) {
-        pending_vars.erase(ready_var);
-        for (auto *op : ready_var->PendingOps()) {
-          auto &deps = pending_ops[op];
-          --deps;
-          if (deps == 0) {
-            ready_ops.insert(op);
-          }
+    // 3. Remove the dependency of ready_var.
+    // Find the ready_ops after the ready_var.
+    for (auto ready_var : cur_ready_vars) {
+      pending_vars.erase(ready_var);
+      for (auto *op : ready_var->PendingOps()) {
+        auto &deps = pending_ops[op];
+        --deps;
+        if (deps == 0) {
+          ready_ops.insert(op);
        }
      }
    }
-    PADDLE_ENFORCE(ready_ops.empty());
  }
-
+  PADDLE_ENFORCE(ready_ops.empty());
  // Wait FetchOps.
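// ---------------------------------------------------------------------------
// [editorial aside] The RunImpl() loop restored above is a dependency-counting
// scheduler: each op tracks how many inputs are not yet ready, finished ops
// publish their output vars through a blocking queue, and consuming a var
// decrements the counters of the ops waiting on it. A minimal standalone
// sketch of that idea follows; MiniOp/RunGraph are hypothetical names, not
// Paddle's classes, and the actual "run" step is elided to keep it compact.
#include <cstddef>
#include <queue>
#include <unordered_map>
#include <vector>

struct MiniOp {
  std::vector<int> outputs;  // ids of the vars this op produces
  int pending_inputs;        // count of inputs that are not ready yet
};

void RunGraph(std::vector<MiniOp>* ops,
              std::unordered_map<int, std::vector<std::size_t>>* consumers) {
  std::queue<int> ready_vars;
  for (auto& op : *ops) {
    if (op.pending_inputs == 0) {  // source ops can "run" immediately
      for (int v : op.outputs) ready_vars.push(v);
    }
  }
  while (!ready_vars.empty()) {  // mirrors `while (!pending_vars.empty())`
    int var = ready_vars.front();
    ready_vars.pop();
    for (std::size_t op_id : (*consumers)[var]) {
      if (--(*ops)[op_id].pending_inputs == 0) {  // all inputs ready: run it
        for (int v : (*ops)[op_id].outputs) ready_vars.push(v);
      }
    }
  }
}

int main() {
  // op0 -> var0 -> op1 -> var1: op1 becomes ready once var0 is produced.
  std::vector<MiniOp> ops = {{{0}, 0}, {{1}, 1}};
  std::unordered_map<int, std::vector<std::size_t>> consumers = {{0, {1}}};
  RunGraph(&ops, &consumers);
  return 0;
}
// ---------------------------------------------------------------------------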
ClearFetchOp(graph_, &fetch_ops); @@ -147,7 +137,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( void ThreadedSSAGraphExecutor::InsertFetchOps( const std::vector &fetch_tensors, - std::vector *fetch_ops, + std::vector *fetch_ops, std::unordered_set *fetch_dependencies, std::unordered_set *ready_ops, std::unordered_map *pending_ops, @@ -253,9 +243,6 @@ void ThreadedSSAGraphExecutor::PrepareOpDeps() { InsertPendingOp(&pending_ops, op); } } - op_deps_->num_ops_ = ready_ops.size() + pending_ops.size(); - PADDLE_ENFORCE_GT(op_deps_->num_ops_, 0, "The graph doesn't have operators."); - for (auto ready_var : ready_vars) { pending_vars.erase(ready_var); for (auto *op : ready_var->PendingOps()) { @@ -277,7 +264,6 @@ void ThreadedSSAGraphExecutor::CopyOpDeps() { op_deps_->pending_vars_.end()); op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(), op_deps_->ready_ops_.end()); - op_deps->num_ops_ = op_deps_->num_ops_; return std::unique_ptr(op_deps); }); } @@ -286,59 +272,25 @@ void ThreadedSSAGraphExecutor::RunOp( const std::shared_ptr> &ready_var_q, details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { - RunOpSync(op); try { + if (VLOG_IS_ON(10)) { + VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); + } + if (LIKELY(!strategy_.dry_run_)) { + op->Run(strategy_.use_cuda_); + } + VLOG(10) << op << " " << op->Name() << " Done "; ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << " Signal posted"; } catch (...) { exception_holder_.Catch(std::current_exception()); } }; - if (pool_) { run_op_futures_.emplace_back(pool_->enqueue(op_run)); } else { op_run(); } - - RecordOps(op); -} - -void ThreadedSSAGraphExecutor::RunTracedOps( - const std::vector &traced_ops) { - for (auto &op : traced_ops) { - if (exception_holder_.IsCaught()) { - return; - } - RunOpSync(op); - } -} - -void ThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) { - try { - if (VLOG_IS_ON(10)) { - VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); - } - if (LIKELY(!strategy_.dry_run_)) { - op->Run(strategy_.use_cuda_); - } - VLOG(10) << op << " " << op->Name() << " Done "; - } catch (...) 
{ - exception_holder_.Catch(std::current_exception()); - } -} - -void ThreadedSSAGraphExecutor::ExecutionFinal( - std::vector *fetch_ops) { - VLOG(3) << "caught exception " << exception_holder_.Type() << ", rethrow it"; - ClearFetchOp(graph_, fetch_ops); - exception_holder_.ReThrow(); -} - -void ThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) { - if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) { - traced_ops_.emplace_back(op); - } } } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 6c1fb1c6c0a..8c026057b48 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -44,7 +44,6 @@ struct OpDependentData { std::unordered_map pending_ops_; std::unordered_set pending_vars_; std::unordered_set ready_ops_; - size_t num_ops_{0}; }; class ThreadedSSAGraphExecutor : public SSAGraphExecutor { @@ -81,7 +80,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::list> run_op_futures_; ::ThreadPool prepare_pool_; std::unique_ptr<::ThreadPool> pool_; - std::vector traced_ops_; void InsertPendingOp(std::unordered_map *pending_ops, OpHandleBase *op_instance) const; @@ -91,7 +89,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { VarHandleBase *var) const; void InsertFetchOps(const std::vector &fetch_tensors, - std::vector *fetch_ops, + std::vector *fetch_ops, std::unordered_set *fetch_dependencies, std::unordered_set *ready_ops, std::unordered_map *pending_ops, @@ -99,16 +97,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { FeedFetchList *fetch_data); void PrepareOpDeps(); - void CopyOpDeps(); - - inline void RecordOps(OpHandleBase *op); - - inline void ExecutionFinal(std::vector *fetch_ops); - - inline void RunOpSync(OpHandleBase *op); - - void RunTracedOps(const std::vector &traced_ops); }; } // namespace details diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index be5f663e1c9..a7a8663ec3b 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include #include #include #include @@ -36,17 +35,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/timer.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -#include "paddle/fluid/platform/nccl_helper.h" -#endif - namespace paddle { namespace framework { -#define SEC_LOG \ - VLOG(3) << "[s" << section_id_ << "p" << pipeline_id_ << "t" << thread_id_ \ - << "]: " - class PullDenseWorker { public: virtual ~PullDenseWorker() {} @@ -57,7 +48,6 @@ class PullDenseWorker { void IncreaseThreadVersion(int thread_id, uint64_t table_id); void ResetThreadVersion(uint64_t table_id); void Wait(std::vector<::std::future>* status_vec); - void PullDense(bool force_update = false); static std::shared_ptr GetInstance() { if (NULL == s_instance_) { s_instance_.reset(new paddle::framework::PullDenseWorker()); @@ -102,7 +92,7 @@ class PullDenseWorker { // should incorporate different type of device class DeviceWorker { public: - DeviceWorker() { use_cvm_ = false; } + DeviceWorker() {} virtual ~DeviceWorker() {} virtual void Initialize(const TrainerDesc& desc) = 0; virtual void SetDeviceIndex(int tid) = 0; @@ -124,7 +114,6 @@ class DeviceWorker { std::shared_ptr device_reader_; int64_t batch_num_; FetchConfig fetch_config_; - bool use_cvm_; }; class CPUWorkerBase : public DeviceWorker { @@ -205,101 +194,5 @@ class DownpourWorker : public HogwildWorker { std::vector<::std::future> push_dense_status_; }; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -using ScopeQueue = operators::reader::BlockingQueue; - -class SyncFunctor { - public: - SyncFunctor(int rank_id, int rank_num, int sync_steps); - virtual ~SyncFunctor() {} - - void SetSyncParam(const std::vector& sync_param) { - sync_param_ = &sync_param; - } - void SetNcclCtxMap(platform::NCCLContextMap* nccl_ctx_map) { - nccl_ctx_map_ = nccl_ctx_map; - } - - int operator()(Scope* scope); - static std::vector pipeline_scopes_; - static uint64_t sync_flag_; - - protected: - const int rank_id_; - const int rank_num_; - const std::vector* sync_param_ = nullptr; - platform::NCCLContextMap* nccl_ctx_map_ = nullptr; - - uint64_t sync_signal_; - const int sync_steps_; - int counter_; - - void Synchronize(); -}; - -class SectionWorker : public DeviceWorker { - public: - SectionWorker() {} - ~SectionWorker() override {} - - void Initialize(const TrainerDesc& desc) override; - - void BindingDataFeedMemory() override {} - void CreateDeviceResource(const ProgramDesc& main_prog) override{}; - - void TrainFiles() override; - void TrainFilesWithProfiler() override; - - void PrintFetchVars() override {} - - const platform::Place& place() const { return place_; } - - void SetSectionIndex(int section_id) { section_id_ = section_id; } - void SetDeviceIndex(int tid) override { pipeline_id_ = tid; } - void SetThreadIndex(int thread_id) { thread_id_ = thread_id; } - void SetVarNames(const std::vector& in_var_names, - const std::vector& out_var_names) { - in_var_names_ = &in_var_names; - out_var_names_ = &out_var_names; - } - void SetScopeQueue(ScopeQueue* in_scope_queue, ScopeQueue* out_scope_queue) { - in_scope_queue_ = in_scope_queue; - out_scope_queue_ = out_scope_queue; - } - void SetCountMutex(std::mutex* mutex) { worker_count_mutex_ = mutex; } - void SetWorkerCount(int* worker_count) { worker_count_ = worker_count; } - void SetSectionNum(int section_num) { section_num_ = section_num; } - void SetPipelineNum(int pipeline_num) { pipeline_num_ = pipeline_num; } - void SetNextSectionPlace(const paddle::platform::Place& place) { - next_section_place_ = place; - } - SyncFunctor* sync_func_ = nullptr; - 
void SetSyncFunctor(SyncFunctor* sync_func) { sync_func_ = sync_func; } - - static std::atomic cpu_id_; - - protected: - void AutoSetCPUAffinity(bool reuse); - int section_id_; - int pipeline_id_; - int section_num_; - int pipeline_num_; - int thread_id_; - - // This worker will consume scope from in_scope_queue_ - // and produce scope to out_scope_queue_ - ScopeQueue* in_scope_queue_ = nullptr; - ScopeQueue* out_scope_queue_ = nullptr; - const std::vector* in_var_names_ = nullptr; - const std::vector* out_var_names_ = nullptr; - std::mutex* worker_count_mutex_ = nullptr; - int* worker_count_ = nullptr; - paddle::platform::Place next_section_place_; - - std::vector> ops_; - - platform::DeviceContext* dev_ctx_ = nullptr; -}; -#endif } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index dc85941f57d..2a7b368145c 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -61,8 +61,5 @@ std::shared_ptr DeviceWorkerFactory::CreateDeviceWorker( REGISTER_DEVICE_WORKER_CLASS(HogwildWorker); REGISTER_DEVICE_WORKER_CLASS(DownpourWorker); -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -REGISTER_DEVICE_WORKER_CLASS(SectionWorker); -#endif } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 0b4e959f571..8e184e5d3cb 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -63,7 +63,6 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) { fleet_ptr_ = FleetWrapper::GetInstance(); fetch_config_ = desc.fetch_config(); - use_cvm_ = desc.use_cvm(); } void DownpourWorker::CollectLabelInfo(size_t table_idx) { @@ -140,25 +139,14 @@ void DownpourWorker::FillSparseValue(size_t table_idx) { LoD data_lod{tensor_lod}; tensor_emb->set_lod(data_lod); for (int index = 0; index < len; ++index) { - if (use_cvm_) { - if (ids[index] == 0u) { - memcpy(ptr + table.emb_dim() * index, init_value.data(), - sizeof(float) * table.emb_dim()); - continue; - } - memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data(), - sizeof(float) * table.emb_dim()); - fea_idx++; - } else { - if (ids[index] == 0u) { - memcpy(ptr + table.emb_dim() * index, init_value.data() + 2, - sizeof(float) * table.emb_dim()); - continue; - } - memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2, + if (ids[index] == 0u) { + memcpy(ptr + table.emb_dim() * index, init_value.data() + 2, sizeof(float) * table.emb_dim()); - fea_idx++; + continue; } + memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2, + sizeof(float) * table.emb_dim()); + fea_idx++; } } } @@ -209,9 +197,9 @@ void DownpourWorker::TrainFilesWithProfiler() { uint64_t tid = static_cast( param_.program_config(0).pull_sparse_table_id(i)); TableParameter table; - for (auto j : param_.sparse_table()) { - if (j.table_id() == tid) { - table = j; + for (auto i : param_.sparse_table()) { + if (i.table_id() == tid) { + table = i; break; } } @@ -271,7 +259,7 @@ void DownpourWorker::TrainFilesWithProfiler() { fleet_ptr_->PushSparseVarsWithLabelAsync( *thread_scope_, tid, features_[tid], feature_labels_[tid], sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), - &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_); + &feature_grads_[tid], &push_sparse_status_); timeline.Pause(); push_sparse_time += timeline.ElapsedSec(); 
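// ---------------------------------------------------------------------------
// [editorial aside] FillSparseValue above copies from `data() + 2` because the
// pulled feature value is assumed to carry two accumulator slots (the CVM
// show/click statistics this patch stops special-casing) in front of the
// embedding itself. A hypothetical illustration of that layout:
#include <cstring>
#include <vector>

int main() {
  const int emb_dim = 4;
  // assumed layout: [show, click, emb_0 .. emb_{emb_dim-1}]
  std::vector<float> fea_value = {1.0f, 0.0f, 0.1f, 0.2f, 0.3f, 0.4f};
  std::vector<float> tensor_row(emb_dim);
  std::memcpy(tensor_row.data(), fea_value.data() + 2,  // skip the two stats
              sizeof(float) * emb_dim);
  return 0;
}
// ---------------------------------------------------------------------------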
total_time += timeline.ElapsedSec(); @@ -379,9 +367,9 @@ void DownpourWorker::TrainFiles() { uint64_t tid = static_cast( param_.program_config(0).pull_sparse_table_id(i)); TableParameter table; - for (auto j : param_.sparse_table()) { - if (j.table_id() == tid) { - table = j; + for (auto i : param_.sparse_table()) { + if (i.table_id() == tid) { + table = i; break; } } @@ -423,7 +411,7 @@ void DownpourWorker::TrainFiles() { fleet_ptr_->PushSparseVarsWithLabelAsync( *thread_scope_, tid, features_[tid], feature_labels_[tid], sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), - &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_); + &feature_grads_[tid], &push_sparse_status_); } } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index e36871e8d82..239a3ce0a84 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -122,9 +122,8 @@ void Executor::RunFromDataset(const ProgramDesc& main_program, Scope* scope, const std::string& trainer_desc_str) { VLOG(3) << "Start to RunFromDataset in executor"; TrainerDesc trainer_desc; - bool success = trainer_desc.ParseFromString(trainer_desc_str); - PADDLE_ENFORCE(success, "Fail to parse TrainerDesc from string:\n%s", - trainer_desc_str.c_str()); + google::protobuf::TextFormat::ParseFromString(trainer_desc_str, + &trainer_desc); VLOG(3) << "Going to create trainer, trainer class is " << trainer_desc.class_name(); std::shared_ptr trainer; @@ -245,12 +244,6 @@ static bool has_fetch_operators( return fetch_count > 0; } -std::unique_ptr Executor::PrepareCtxCache( - const ProgramDesc& program, int block_id, - const std::vector& skip_ref_cnt_vars, bool force_disable_gc) { - return Prepare(program, block_id, skip_ref_cnt_vars, force_disable_gc); -} - void Executor::Run(const ProgramDesc& program, Scope* scope, std::map* feed_targets, std::map* fetch_targets, @@ -335,7 +328,7 @@ std::unique_ptr Executor::Prepare( ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } #ifdef PADDLE_WITH_NGRAPH - if (FLAGS_use_ngraph && ctx->block_id_ == 0) { + if (FLAGS_use_ngraph) { paddle::operators::NgraphEngine::FuseNgraphOps( ctx->prog_.Block(ctx->block_id_), &ctx->ops_); } @@ -375,7 +368,6 @@ std::vector> Executor::Prepare( void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, bool create_local_scope, bool create_vars, bool keep_kids) { - platform::RecordBlock b(kProgramId); PADDLE_ENFORCE_NOT_NULL(scope); Scope* local_scope = scope; if (create_vars) { @@ -415,6 +407,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, for (auto& op : ctx->ops_) { op->Run(*local_scope, place_); + if (gc) { DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get()); } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index d0d12b30720..6eeeb1efc61 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -83,21 +83,6 @@ class Executor { const std::string& feed_holder_name = "feed", const std::string& fetch_holder_name = "fetch"); - // This API is very slow. 
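// ---------------------------------------------------------------------------
// [editorial aside] Note the parsing swap in RunFromDataset above: the binary
// ParseFromString is replaced by TextFormat::ParseFromString, so the trainer
// descriptor must now arrive as *text-format* protobuf (e.g.
// `class_name: "MultiTrainer" thread_num: 4`), not serialized bytes; the two
// formats are not interchangeable. Generic sketch over any generated message;
// the helper name is hypothetical:
#include <google/protobuf/message.h>
#include <google/protobuf/text_format.h>
#include <string>

bool ParseTextProto(const std::string& text, google::protobuf::Message* msg) {
  // Returns false on malformed input; the patched call above drops the old
  // PADDLE_ENFORCE and so silently ignores parse failures.
  return google::protobuf::TextFormat::ParseFromString(text, msg);
}
// ---------------------------------------------------------------------------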
- void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, - std::map* feed_targets, - std::map* fetch_targets, - bool create_local_scope = true, - bool create_vars = true, - const std::string& feed_holder_name = "feed", - const std::string& fetch_holder_name = "fetch"); - - std::unique_ptr PrepareCtxCache( - const ProgramDesc& program, int block_id, - const std::vector& skip_ref_cnt_vars = - std::vector(), - bool force_disable_gc = false); - static std::unique_ptr Prepare( const ProgramDesc& program, int block_id, const std::vector& skip_ref_cnt_vars = @@ -116,6 +101,15 @@ class Executor { bool create_local_scope = true, bool create_vars = true, bool keep_kids = false); + // This API is very slow. + void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, + std::map* feed_targets, + std::map* fetch_targets, + bool create_local_scope = true, + bool create_vars = true, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch"); + void EnableMKLDNN(const ProgramDesc& program); void RunFromDataset(const ProgramDesc& main_program, Scope* scope, diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index fd77cdeb7cb..394ff24c466 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -281,16 +281,9 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( const std::vector& sparse_key_names, const std::vector& sparse_grad_names, const int emb_dim, std::vector>* push_values, - std::vector<::std::future>* push_sparse_status, - const int batch_size, const bool use_cvm) { + std::vector<::std::future>* push_sparse_status) { #ifdef PADDLE_WITH_PSLIB int offset = 2; - int grad_dim = emb_dim; - if (use_cvm) { - offset = 0; - grad_dim = emb_dim - 2; - } - CHECK_GE(grad_dim, 0); uint64_t fea_idx = 0u; for (size_t i = 0; i < sparse_key_names.size(); ++i) { Variable* g_var = scope.FindVar(sparse_grad_names[i]); @@ -314,13 +307,7 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( for (auto& t : *push_values) { t.resize(emb_dim + offset); } - if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) { - int dim = emb_dim + offset; - Eigen::Map< - Eigen::Matrix> - g_mat(g, g_tensor->numel() / dim, dim); - g_mat.rightCols(grad_dim) *= batch_size; - } + for (auto id_idx = 0u; id_idx < len; ++id_idx) { if (ids[id_idx] == 0) { g += emb_dim; @@ -328,15 +315,10 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( } CHECK(fea_idx < (*push_values).size()); CHECK(fea_idx < fea_labels.size()); - if (use_cvm) { - memcpy((*push_values)[fea_idx].data() + offset, g, - sizeof(float) * emb_dim); - } else { - memcpy((*push_values)[fea_idx].data() + offset, g, - sizeof(float) * emb_dim); - (*push_values)[fea_idx][0] = 1.0f; - (*push_values)[fea_idx][1] = static_cast(fea_labels[fea_idx]); - } + memcpy((*push_values)[fea_idx].data() + offset, g, + sizeof(float) * emb_dim); + (*push_values)[fea_idx][0] = 1.0f; + (*push_values)[fea_idx][1] = static_cast(fea_labels[fea_idx]); g += emb_dim; fea_idx++; } @@ -355,89 +337,6 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( #endif } -void FleetWrapper::LoadModel(const std::string& path, const int mode) { -#ifdef PADDLE_WITH_PSLIB - auto ret = pslib_ptr_->_worker_ptr->load(path, std::to_string(mode)); - ret.wait(); - if (ret.get() != 0) { - LOG(ERROR) << "load model from path:" << path << " failed"; - exit(-1); - } -#else - VLOG(0) << "FleetWrapper::LoadModel does nothing when no pslib"; -#endif -} - -void 
FleetWrapper::SaveModel(const std::string& path, const int mode) { -#ifdef PADDLE_WITH_PSLIB - auto ret = pslib_ptr_->_worker_ptr->save(path, std::to_string(mode)); - ret.wait(); - int32_t feasign_cnt = ret.get(); - if (feasign_cnt == -1) { - LOG(ERROR) << "save model failed"; - exit(-1); - } -#else - VLOG(0) << "FleetWrapper::SaveModel does nothing when no pslib"; -#endif -} - -void FleetWrapper::ShrinkSparseTable(int table_id) { -#ifdef PADDLE_WITH_PSLIB - auto ret = pslib_ptr_->_worker_ptr->shrink(table_id); - ret.wait(); -#else - VLOG(0) << "FleetWrapper::ShrinkSparseTable does nothing when no pslib"; -#endif -} - -void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope, - std::vector var_list, - float decay) { -#ifdef PADDLE_WITH_PSLIB - std::vector regions; - for (std::string& name : var_list) { - if (name.find("batch_sum") != std::string::npos) { - Variable* var = scope->FindVar(name); - CHECK(var != nullptr) << "var[" << name << "] not found"; - VLOG(3) << "prepare shrink dense batch_sum"; - LoDTensor* tensor = var->GetMutable(); - float* g = tensor->data(); - Eigen::Map mat(g, 1, tensor->numel()); - mat *= decay; - paddle::ps::Region reg(g, tensor->numel()); - regions.emplace_back(std::move(reg)); - } else { - Variable* var = scope->FindVar(name); - CHECK(var != nullptr) << "var[" << name << "] not found"; - LoDTensor* tensor = var->GetMutable(); - float* g = tensor->data(); - paddle::ps::Region reg(g, tensor->numel()); - regions.emplace_back(std::move(reg)); - } - } - auto push_status = pslib_ptr_->_worker_ptr->push_dense_param( - regions.data(), regions.size(), table_id); - push_status.wait(); - auto status = push_status.get(); - if (status != 0) { - LOG(FATAL) << "push shrink dense param failed, status[" << status << "]"; - exit(-1); - } -#else - VLOG(0) << "FleetWrapper::ShrinkSparseTable does nothing when no pslib"; -#endif -} - -void FleetWrapper::ClientFlush() { -#ifdef PADDLE_WITH_PSLIB - auto ret = pslib_ptr_->_worker_ptr->flush(); - ret.wait(); -#else - VLOG(0) << "FleetWrapper::ServerFlush does nothing when no pslib"; -#endif -} - int FleetWrapper::RegisterClientToClientMsgHandler(int msg_type, MsgHandlerFunc handler) { #ifdef PADDLE_WITH_PSLIB @@ -499,24 +398,6 @@ void FleetWrapper::Deserialize(std::vector* t, const std::string& str) { #endif } -std::default_random_engine& FleetWrapper::LocalRandomEngine() { - struct engine_wrapper_t { - std::default_random_engine engine; -#ifdef PADDLE_WITH_PSLIB - engine_wrapper_t() { - struct timespec tp; - clock_gettime(CLOCK_REALTIME, &tp); - double cur_time = tp.tv_sec + tp.tv_nsec * 1e-9; - static std::atomic x(0); - std::seed_seq sseq = {x++, x++, x++, (uint64_t)(cur_time * 1000)}; - engine.seed(sseq); - } -#endif - }; - thread_local engine_wrapper_t r; - return r.engine; -} - template void FleetWrapper::Serialize>( const std::vector*>&, std::string*); template void FleetWrapper::Deserialize>( diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index b62270a488e..386e711ff71 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -55,7 +55,7 @@ namespace framework { class FleetWrapper { public: virtual ~FleetWrapper() {} - FleetWrapper() { scale_sparse_gradient_with_batch_size_ = true; } + FleetWrapper() {} // Pull sparse variables from server in Sync mode // Param: scope, table_id, var_names, fea_keys // Param: fea_values @@ -99,8 +99,7 @@ class FleetWrapper { const std::vector& sparse_key_names, const 
std::vector& sparse_grad_names, const int emb_dim, std::vector>* push_values, - std::vector<::std::future>* push_sparse_status, - const int batch_size, const bool use_cvm); + std::vector<::std::future>* push_sparse_status); // Push sparse variables to server in Async mode // Param: scope, table_id, fea_keys, sparse_grad_names @@ -129,19 +128,6 @@ class FleetWrapper { // create client to client connection void CreateClient2ClientConnection(); - // flush all push requests - void ClientFlush(); - // mode = 0, load all feature - // mode = 1, laod delta feature, which means load diff - void LoadModel(const std::string& path, const int mode); - // mode = 0, save all feature - // mode = 1, save delta feature, which means save diff - void SaveModel(const std::string& path, const int mode); - - void ShrinkSparseTable(int table_id); - void ShrinkDenseTable(int table_id, Scope* scope, - std::vector var_list, float decay); - // register client to client communication typedef std::function MsgHandlerFunc; int RegisterClientToClientMsgHandler(int msg_type, MsgHandlerFunc handler); @@ -160,9 +146,6 @@ class FleetWrapper { return s_instance_; } - // this performs better than rand_r, especially large data - std::default_random_engine& LocalRandomEngine(); - #ifdef PADDLE_WITH_PSLIB static std::shared_ptr pslib_ptr_; #endif @@ -175,7 +158,6 @@ class FleetWrapper { protected: static bool is_initialized_; - bool scale_sparse_gradient_with_batch_size_; DISABLE_COPY_AND_ASSIGN(FleetWrapper); }; diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index efdabffb9b3..6c60a041a19 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ syntax = "proto2"; -option optimize_for = LITE_RUNTIME; +// option optimize_for = LITE_RUNTIME; package paddle.framework.proto; // Any incompatible changes to ProgramDesc and its dependencies should diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index f02828ebaee..75c985d10f3 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -24,10 +24,9 @@ void HogwildWorker::Initialize(const TrainerDesc& desc) { fetch_config_ = desc.fetch_config(); param_ = desc.hogwild_param(); skip_ops_.resize(param_.skip_ops_size()); - for (int i = 0; i < param_.skip_ops_size(); ++i) { + for (size_t i = 0; i < param_.skip_ops_size(); ++i) { skip_ops_[i] = param_.skip_ops(i); } - use_cvm_ = desc.use_cvm(); } void HogwildWorker::CreateThreadOperators(const ProgramDesc& program) { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 5228840c960..032fcbedf49 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -72,12 +72,12 @@ pass_library(transpose_flatten_concat_fuse_pass inference) pass_library(identity_scale_op_clean_pass base) pass_library(sync_batch_norm_pass base) pass_library(runtime_context_cache_pass base) +pass_library(expected_kernel_cache_pass base) pass_library(quant_conv2d_dequant_fuse_pass inference) pass_library(fillconstant_elementwisemul_fuse inference) pass_library(shuffle_channel_detect_pass inference) -pass_library(delete_quant_dequant_op_pass inference) -if(ANAKIN_SUBGRAPH) +if(ANAKIN_FOUND) pass_library(simplify_anakin_priorbox_detection_out_pass inference) endif() @@ -86,23 +86,12 @@ if(WITH_MKLDNN) pass_library(depthwise_conv_mkldnn_pass base mkldnn) pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn) pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn) - pass_library(conv_brelu_mkldnn_fuse_pass inference mkldnn) - pass_library(conv_concat_relu_mkldnn_fuse_pass inference mkldnn) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn) - pass_library(fc_mkldnn_pass inference mkldnn) pass_library(cpu_quantize_placement_pass base mkldnn) pass_library(cpu_quantize_pass inference mkldnn) pass_library(cpu_quantize_squash_pass inference mkldnn) endif() -if(WITH_NGRAPH) - cc_library(ngraph_subgraph_pass SRCS ngraph_subgraph_pass.cc DEPS ngraph_bridge - analysis_helper subgraph_detector graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) - set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) - file(APPEND ${pass_file} "USE_PASS(ngraph_subgraph_pass);\n") - set(INFER_IR_PASSES ${INFER_IR_PASSES} ngraph_subgraph_pass CACHE INTERNAL "") -endif() - cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) @@ -126,8 +115,6 @@ if (WITH_MKLDNN) cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) - cc_test(test_conv_brelu_mkldnn_fuse_pass SRCS mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc DEPS conv_brelu_mkldnn_fuse_pass) - 
cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass) cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass) diff --git a/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc index 715ca97f371..12d5ad7ed8c 100644 --- a/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc +++ b/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.cc @@ -23,16 +23,15 @@ #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" -DEFINE_double(fuse_parameter_memory_size, -1.0, // MBytes - "fuse_parameter_memory_size is up limited memory size(MB)" +DEFINE_uint64(fuse_parameter_memory_size, 0, // 0 KB + "fuse_parameter_memory_size is up limited memory size " "of one group parameters' gradient which is the input " "of communication calling(e.g NCCLAllReduce). " "The default value is 0, it means that " "not set group according to memory_size."); DEFINE_int32( - fuse_parameter_groups_size, 1, - "fuse_parameter_groups_size is the up limited size of one group " - "parameters' gradient. " + fuse_parameter_groups_size, 3, + "fuse_parameter_groups_size is the size of one group parameters' gradient. " "The default value is a experimental result. If the " "fuse_parameter_groups_size is 1, it means that the groups size is " "the number of parameters' gradient. If the fuse_parameter_groups_size is " @@ -42,9 +41,6 @@ DEFINE_int32( namespace paddle { namespace framework { namespace ir { -// unit of the FLAGS_fuse_parameter_memory_size. -static constexpr double kMB = 1048576.0; - // SetFuseParameterGroupsSize and SetFuseParameterMemorySize are used in unit // test, because it is invalid that seting 'FLAGS_fuse_parameter_memory_size' // and 'FLAGS_fuse_parameter_groups_size' in unit test. 
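// ---------------------------------------------------------------------------
// [editorial aside] The two flags above cap a fused group of gradients by
// total byte size (fuse_parameter_memory_size) and by member count
// (fuse_parameter_groups_size). A standalone sketch of the byte-budget half of
// that policy; GroupByBudget and Grad are hypothetical names, not the pass's
// real code:
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

using Grad = std::pair<std::string, uint64_t>;  // {gradient name, bytes}

std::vector<std::vector<Grad>> GroupByBudget(const std::vector<Grad>& grads,
                                             uint64_t byte_budget) {
  std::vector<std::vector<Grad>> groups;
  uint64_t acc = 0;
  for (const auto& g : grads) {
    // Start a new group at the beginning or once the budget would overflow;
    // a budget of 0 means "no size-based grouping", as the flag doc says.
    if (groups.empty() || (byte_budget > 0 && acc + g.second > byte_budget)) {
      groups.emplace_back();
      acc = 0;
    }
    groups.back().push_back(g);
    acc += g.second;
  }
  return groups;
}
// ---------------------------------------------------------------------------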
@@ -54,12 +50,15 @@ void SetFuseParameterGroupsSize(int group_size) { int GetFuseParameterGroupsSize() { return FLAGS_fuse_parameter_groups_size; } -void SetFuseParameterMemorySize(double memory_size) { +void SetFuseParameterMemorySize(uint64_t memory_size) { FLAGS_fuse_parameter_memory_size = memory_size; } -double GetFuseParameterMemorySize() { return FLAGS_fuse_parameter_memory_size; } +uint64_t GetFuseParameterMemorySize() { + return FLAGS_fuse_parameter_memory_size; +} +static const char kUnKnow[] = "@UNKNOW@"; static framework::proto::VarType::Type kDefaultDtype = framework::proto::VarType::Type::VarType_Type_BOOL; @@ -84,7 +83,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { } if (params_grads.size() == 0) { - LOG(WARNING) << "Doesn't find gradients"; + VLOG(10) << "Doesn't find gradients"; return; } @@ -170,6 +169,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { details::GroupGradsAndParams *group_grads_params) const { SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params); SetGroupAccordingToMemorySize(var_nodes, group_grads_params); + SetGroupAccordingToGroupSize(var_nodes, group_grads_params); } void SetGroupAccordingToLayers( @@ -181,7 +181,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { for (size_t i = 0; i < params_grads.size(); ++i) { auto pos = params_grads[i].first.find_first_of("."); if (pos == std::string::npos) { - layer_params[params_grads[i].first].emplace_back(i); + layer_params[std::string(kUnKnow)].emplace_back(i); } else { layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i); } @@ -190,7 +190,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { group_grads_params->reserve(layer_params.size()); for (size_t i = 0; i < params_grads.size(); ++i) { auto pos = params_grads[i].first.find_first_of("."); - std::string key = params_grads[i].first; + std::string key = kUnKnow; if (pos != std::string::npos) { key = params_grads[i].first.substr(0, pos); } @@ -207,40 +207,21 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { } VLOG(10) << "SetGroupAccordingToLayers: "; - if (VLOG_IS_ON(10)) { - PrintGroupInfo(var_nodes, group_grads_params); - } - } - - void PrintGroupInfo( - const std::unordered_map &var_nodes, - details::GroupGradsAndParams *group_grads_params) const { for (size_t i = 0; i < group_grads_params->size(); ++i) { VLOG(10) << "group " << i; std::stringstream out; - size_t gps_size = 0; - for (auto &g_p : group_grads_params->at(i)) { - auto iter = var_nodes.find(g_p.second); - PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second); - auto shape = iter->second->Var()->GetShape(); - size_t size = framework::SizeOfType(iter->second->Var()->GetDataType()); - std::for_each(shape.begin(), shape.end(), - [&size](const int64_t &n) { size *= n; }); - gps_size += size; - out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first); + for (auto &p_g : group_grads_params->at(i)) { + out << "(" << p_g.second << ", " << p_g.first << "), "; } - VLOG(10) << out.str() - << ", group size:" << group_grads_params->at(i).size() - << ", group memory size:" << static_cast(gps_size) / kMB - << "(MB)"; + VLOG(10) << out.str(); } } void SetGroupAccordingToMemorySize( const std::unordered_map &var_nodes, details::GroupGradsAndParams *group_grads_params) const { - const double group_memory_size = GetFuseParameterMemorySize(); - if (group_memory_size <= 0.0) { + const uint64_t group_memory_size = GetFuseParameterMemorySize(); + if (group_memory_size == 0) { return; } 
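// ---------------------------------------------------------------------------
// [editorial aside] SetGroupAccordingToGroupSize below sizes the merged
// groups with `(n + group_size - 1) / group_size`, the usual integer
// ceil-division. Quick worked check (hypothetical standalone snippet):
#include <cassert>
#include <cstddef>

int main() {
  const std::size_t n = 10, k = 3;  // 10 gradient groups merged 3 at a time
  assert((n + k - 1) / k == 4);     // ceil(10 / 3) == 4
  return 0;
}
// ---------------------------------------------------------------------------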
details::GroupGradsAndParams local_group_grads_params; @@ -267,14 +248,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(), group_grads_params->at(j).end()); ++j; - if (GetFuseParameterGroupsSize() > 1 && - group_p_g.size() > - static_cast(GetFuseParameterGroupsSize())) { - break; - } - - if (static_cast(local_group_memory_size) / kMB >= - group_memory_size) { + if (local_group_memory_size >= group_memory_size) { break; } } @@ -283,10 +257,60 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { std::swap(*group_grads_params, local_group_grads_params); VLOG(10) << string::Sprintf( - "SetGroupAccordingToMemorySize(memory_size: %f):", group_memory_size); + "SetGroupAccordingToMemorySize(memory_size: %d):", group_memory_size); + for (size_t i = 0; i < group_grads_params->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + for (auto &g_p : group_grads_params->at(i)) { + auto iter = var_nodes.find(g_p.second); + PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second); + auto shape = iter->second->Var()->GetShape(); + size_t size = framework::SizeOfType(iter->second->Var()->GetDataType()); + std::for_each(shape.begin(), shape.end(), + [&size](const int64_t &n) { size *= n; }); + out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first); + } + VLOG(10) << out.str(); + } + } + + void SetGroupAccordingToGroupSize( + const std::unordered_map &var_nodes, + details::GroupGradsAndParams *group_grads_params) const { + if (GetFuseParameterGroupsSize() == 1) { + return; + } + const int group_size = GetFuseParameterGroupsSize() == -1 + ? static_cast(group_grads_params->size()) + : GetFuseParameterGroupsSize(); + PADDLE_ENFORCE_GT(group_size, 1); + size_t groups = (group_grads_params->size() + group_size - 1) / group_size; + details::GroupGradsAndParams local_group_grads_params; + local_group_grads_params.reserve(groups); + + size_t j = 0; + for (size_t i = 0; i < groups; ++i) { + local_group_grads_params.emplace_back(); + auto &group_p_g = local_group_grads_params.back(); + group_p_g.reserve(group_size); + while (j < group_grads_params->size()) { + group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(), + group_grads_params->at(j).end()); + ++j; + if (j % group_size == 0) break; + } + } + std::swap(*group_grads_params, local_group_grads_params); - if (VLOG_IS_ON(10)) { - PrintGroupInfo(var_nodes, group_grads_params); + VLOG(10) << string::Sprintf("SetGroupAccordingToGroupSize(group_size: %d):", + group_size); + for (size_t i = 0; i < group_grads_params->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + for (auto &p_g : group_grads_params->at(i)) { + out << "(" << p_g.second << ", " << p_g.first << "), "; + } + VLOG(10) << out.str(); } } diff --git a/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h b/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h index 38dc4c99fc2..b20eda96f0f 100644 --- a/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h +++ b/paddle/fluid/framework/ir/alloc_continuous_space_for_grad_pass.h @@ -21,8 +21,8 @@ namespace ir { void SetFuseParameterGroupsSize(int group_size); int GetFuseParameterGroupsSize(); -void SetFuseParameterMemorySize(double memory_size); -double GetFuseParameterMemorySize(); +void SetFuseParameterMemorySize(uint64_t memory_size); +uint64_t GetFuseParameterMemorySize(); } // namespace ir } // namespace framework diff --git 
a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index c4ffb2a9de4..5a82d7927f4 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -136,22 +136,22 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
 void PrepareParameters(Graph* graph, const Param& param) {
   // Check parameters
   PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
-  auto& scope = graph->Get<framework::Scope>(kParamScopeAttr);
+  auto* scope = graph->Get<framework::Scope*>(kParamScopeAttr);
   // Create new parameters.
-  scope.Var(param.LSTMWeight)->GetMutable<LoDTensor>();
-  scope.Var(param.LSTMBias)->GetMutable<LoDTensor>();
-  scope.Var(param.Hidden)->GetMutable<LoDTensor>();
-  scope.Var(param.Cell)->GetMutable<LoDTensor>();
-  scope.Var(param.AttentionedX)->GetMutable<LoDTensor>();
-  scope.Var(param.AttentionFCOut)->GetMutable<LoDTensor>();
-  scope.Var(param.LSTMX)->GetMutable<LoDTensor>();
-  scope.Var(param.LSTMOUT)->GetMutable<LoDTensor>();
+  scope->Var(param.LSTMWeight)->GetMutable<LoDTensor>();
+  scope->Var(param.LSTMBias)->GetMutable<LoDTensor>();
+  scope->Var(param.Hidden)->GetMutable<LoDTensor>();
+  scope->Var(param.Cell)->GetMutable<LoDTensor>();
+  scope->Var(param.AttentionedX)->GetMutable<LoDTensor>();
+  scope->Var(param.AttentionFCOut)->GetMutable<LoDTensor>();
+  scope->Var(param.LSTMX)->GetMutable<LoDTensor>();
+  scope->Var(param.LSTMOUT)->GetMutable<LoDTensor>();
 #define GATE_W(name__)                                     \
-  auto* W_##name__##_w0 = scope.FindVar(#name__ ".w_0");   \
-  auto* W_##name__##_w1 = scope.FindVar(#name__ ".w_1");   \
-  auto* W_##name__##_b0 = scope.FindVar(#name__ ".b_0");   \
+  auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0");  \
+  auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1");  \
+  auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0");  \
   CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0);        \
   VLOG(4) << #name__ "_w0"                                            \
           << " shape: " << W_##name__##_w0->Get<LoDTensor>().dims();  \
@@ -169,26 +169,26 @@ void PrepareParameters(Graph* graph, const Param& param) {
   GATE_W(c);
 #undef GATE_W
-  auto* attention_fc_w = scope.FindVar("attention_fc.w_0");
-  auto* attention_fc_b = scope.FindVar("attention_fc.b_0");
-  auto* attention_output_w = scope.FindVar("attention_output.w_0");
-  auto* attention_output_b = scope.FindVar("attention_output.b_0");
+  auto* attention_fc_w = scope->FindVar("attention_fc.w_0");
+  auto* attention_fc_b = scope->FindVar("attention_fc.b_0");
+  auto* attention_output_w = scope->FindVar("attention_output.w_0");
+  auto* attention_output_b = scope->FindVar("attention_output.b_0");
   CHECK_P4(attention_fc_w, attention_fc_b, attention_output_w,
            attention_output_b);
-  auto* lstm_weight = scope.Var(param.LSTMWeight);
+  auto* lstm_weight = scope->Var(param.LSTMWeight);
   auto* lstm_weight_t = lstm_weight->GetMutable<LoDTensor>();
-  auto* lstm_bias = scope.Var(param.LSTMBias);
+  auto* lstm_bias = scope->Var(param.LSTMBias);
   auto* lstm_bias_t = lstm_bias->GetMutable<LoDTensor>();
   // reshape attention_bias
   auto* attention_bias_t =
-      scope.FindVar(param.AttentionBias)->GetMutable<LoDTensor>();
+      scope->FindVar(param.AttentionBias)->GetMutable<LoDTensor>();
   PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1);
   attention_bias_t->Resize(make_ddim({1, attention_bias_t->dims()[0]}));
   auto* attention_scalar_bias_t =
-      scope.FindVar(param.AttentionScalarBias)->GetMutable<LoDTensor>();
+      scope->FindVar(param.AttentionScalarBias)->GetMutable<LoDTensor>();
   attention_scalar_bias_t->Resize(
       make_ddim({1, attention_scalar_bias_t->dims()[0]}));
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
index 6462e7bf4c0..3a6bbe65b36 100644
---
a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -151,11 +151,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, op_desc.SetAttr("use_seq", true); PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); - auto& scope = graph->Get(kParamScopeAttr); + auto* scope = graph->Get(kParamScopeAttr); #define OP_SET_OUT(x) \ const std::string x = patterns::UniqueKey(#x); \ op_desc.SetOutput(#x, {x}); \ - scope.Var(x)->GetMutable() + scope->Var(x)->GetMutable() OP_SET_OUT(BatchedCell); OP_SET_OUT(BatchedHidden); OP_SET_OUT(ReorderedH0); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 102fd388658..cd8030519cc 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fc_fuse_pass.h" -#include #include #include #include @@ -78,15 +77,9 @@ void FCFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", base_op_desc->GetAttr("enable_int8")); desc.SetAttr("input_scale", base_op_desc->GetAttr("input_scale")); desc.SetAttr("weight_scale", base_op_desc->GetAttr("weight_scale")); - if (base_op_desc->HasAttr("out_scale")) - desc.SetAttr("out_scale", base_op_desc->GetAttr("out_scale")); - auto elementwise_desc = elementwise_add->Op(); - if (elementwise_desc->HasAttr("out_scale")) - desc.SetAttr("out_scale", elementwise_desc->GetAttr("out_scale")); } desc.SetType("fc"); - auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out}); diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index 10cbe319ac8..5f660c6d366 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -69,15 +69,16 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, auto* op = graph->CreateOpNode(&op_desc); PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); - auto& scope = graph->Get(kParamScopeAttr); + auto* scope = graph->Get(kParamScopeAttr); + PADDLE_ENFORCE(scope); if (with_fc_bias) { // Fusion GRU bias = fcbias + grubias - auto* fusion_bias_var = scope.Var(NEW_NAME(bias) + bias->Name()); + auto* fusion_bias_var = scope->Var(NEW_NAME(bias) + bias->Name()); auto* out_bias_tensor = fusion_bias_var->GetMutable(); PADDLE_ENFORCE(fusion_bias_var); - auto* gru_bias_var = scope.FindVar(bias->Name()); - auto* fc_bias_var = scope.FindVar(fc_bias->Name()); + auto* gru_bias_var = scope->FindVar(bias->Name()); + auto* fc_bias_var = scope->FindVar(fc_bias->Name()); PADDLE_ENFORCE(gru_bias_var); PADDLE_ENFORCE(fc_bias_var); const auto& gru_bias_tenosr = gru_bias_var->Get(); @@ -93,7 +94,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, #undef GET_NODE #define NEW_IMTERMEDIATE_OUT(key) \ - scope.Var(NEW_NAME(key))->GetMutable() + scope->Var(NEW_NAME(key))->GetMutable() NEW_IMTERMEDIATE_OUT(ReorderedH0); NEW_IMTERMEDIATE_OUT(XX); NEW_IMTERMEDIATE_OUT(BatchedInput); diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index 6858a98be39..babeba96149 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -100,11 +100,11 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, op_desc.SetAttr("use_seq", true); 
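// ---------------------------------------------------------------------------
// [editorial aside] The fc_gru fusion above folds the FC bias into the GRU
// bias ("Fusion GRU bias = fcbias + grubias") so the fused kernel reads a
// single bias tensor. Element-wise illustration on plain vectors (FuseBias is
// a hypothetical name; the pass does this on LoDTensor data):
#include <cstddef>
#include <vector>

std::vector<float> FuseBias(const std::vector<float>& fc_bias,
                            const std::vector<float>& gru_bias) {
  std::vector<float> fused(fc_bias.size());
  for (std::size_t i = 0; i < fused.size(); ++i) {
    fused[i] = fc_bias[i] + gru_bias[i];  // shapes assumed identical
  }
  return fused;
}
// ---------------------------------------------------------------------------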
PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); - auto& scope = graph->Get(kParamScopeAttr); + auto* scope = graph->Get(kParamScopeAttr); #define OP_SET_OUT(x) \ const std::string x = patterns::UniqueKey(#x); \ op_desc.SetOutput(#x, {x}); \ - scope.Var(x)->GetMutable() + scope->Var(x)->GetMutable() OP_SET_OUT(BatchedCell); OP_SET_OUT(BatchedHidden); OP_SET_OUT(ReorderedH0); diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc index 7f9eccf2fdd..bd496731683 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc @@ -26,7 +26,7 @@ namespace framework { namespace ir { void FuseElewiseAddActPass::ApplyImpl(ir::Graph *graph) const { - std::unordered_set act_types = {"relu", "scale", "tanh"}; + std::unordered_set act_types = {"relu", "scale"}; graph = FuseActElewiseAdd(graph, act_types); graph = FuseElewiseAddAct(graph, act_types); // backward diff --git a/paddle/fluid/framework/ir/fuse_pass_base.cc b/paddle/fluid/framework/ir/fuse_pass_base.cc index 5e2523607d6..5f3e2b4403b 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.cc +++ b/paddle/fluid/framework/ir/fuse_pass_base.cc @@ -26,8 +26,7 @@ void FusePassBase::Init(const std::string& repr, Graph* graph) const { Scope* FusePassBase::param_scope() const { PADDLE_ENFORCE(graph_->Has(kParamScopeAttr)); - auto& scope = graph_->Get(kParamScopeAttr); - return &scope; + return graph_->Get(kParamScopeAttr); } void FusePassBase::AddStatis(int count_of_fused) const { diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8ba0e8b80b1..5eba32c4f3a 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -134,7 +134,6 @@ void Graph::ResolveHazard( ir::Node *dep_var = CreateControlDepVar(); write_op->inputs.push_back(dep_var); upstream_op->outputs.push_back(dep_var); - VLOG(10) << "add dep_var:" << dep_var->Name(); dep_var->outputs.push_back(write_op); dep_var->inputs.push_back(upstream_op); } @@ -158,7 +157,6 @@ void Graph::ResolveHazard( if (has_dep) continue; ir::Node *dep_var = CreateControlDepVar(); - VLOG(10) << "add dep_var:" << dep_var->Name(); read_op->outputs.push_back(dep_var); dep_var->inputs.push_back(read_op); write_op->inputs.push_back(dep_var); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 15b3429ef17..0dcf064902d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -14,10 +14,7 @@ #include #include -#include #include -#include -#include #include #include "paddle/fluid/framework/ir/graph_helper.h" @@ -788,33 +785,6 @@ PDNode *patterns::ConvReLU::operator()( return relu_out_var; } -PDNode *patterns::ConvBReLU::operator()( - paddle::framework::ir::PDNode *conv_input) { - // Create Operators - conv_input->assert_is_op_input("conv2d", "Input"); - auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); - auto *brelu_op = pattern->NewNode(brelu_repr())->assert_is_op("relu6"); - // Create variables - // Filter - auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("conv2d", "Filter"); - // intermediate variable, will be removed in the IR after fuse. 
- auto *conv_out_var = pattern->NewNode(conv_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op("conv2d") - ->assert_is_op_input("relu6"); - // output - auto *brelu_out_var = pattern->NewNode(brelu_out_repr()) - ->AsOutput() - ->assert_is_op_output("relu6"); - - conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); - brelu_op->LinksFrom({conv_out_var}).LinksTo({brelu_out_var}); - return brelu_out_var; -} - PDNode *patterns::SeqConvEltAddRelu::operator()( paddle::framework::ir::PDNode *seqconv_input) { // Create Operators @@ -899,33 +869,6 @@ PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, } } -PDNode *patterns::FCMKLDNN::operator()(paddle::framework::ir::PDNode *x, - bool with_bias) { - // Create shared nodes. - x->assert_is_op_input("fc", "Input"); - - auto *fc_op = pattern->NewNode(fc_repr())->assert_is_op("fc"); - // Create variables - // Filter - auto *fc_weight_var = pattern->NewNode(weights_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("fc", "W"); - // Bias - auto *fc_bias_var = pattern->NewNode(bias_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("fc", "Bias"); - // Output - auto *fc_out_var = pattern->NewNode(output_repr()) - ->AsOutput() - ->assert_is_op_output("fc", "Out") - ->assert_is_only_output_of_op("fc"); - - fc_op->LinksFrom({x, fc_weight_var, fc_bias_var}).LinksTo({fc_out_var}); - return fc_out_var; -} - PDNode *patterns::Embedding::operator()(PDNode *x) { x->assert_is_op_input("lookup_table", "Ids"); auto *lookup_table_op = @@ -1092,12 +1035,12 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } -// conv_type: conv2d, conv3d, conv2d_transpose PDNode *patterns::ConvBias::operator()( - paddle::framework::ir::PDNode *conv_input, std::string conv_type) { + paddle::framework::ir::PDNode *conv_input, bool is_conv3d) { + std::string type = is_conv3d ? "conv3d" : "conv2d"; // Create Operators - conv_input->assert_is_op_input(conv_type, "Input"); - auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(conv_type); + conv_input->assert_is_op_input(type, "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(type); auto *eltiwse_op = pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); // Create variables @@ -1105,11 +1048,11 @@ PDNode *patterns::ConvBias::operator()( auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) ->AsInput() ->assert_is_persistable_var() - ->assert_is_op_input(conv_type, "Filter"); + ->assert_is_op_input(type, "Filter"); // intermediate variable, will be removed in the IR after fuse. 
auto *conv_out_var = pattern->NewNode(conv_out_repr()) ->AsIntermediate() - ->assert_is_only_output_of_op(conv_type) + ->assert_is_only_output_of_op(type) ->assert_is_op_input("elementwise_add"); // Bias stored in elementwise_add auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr()) @@ -1214,57 +1157,6 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } -PDNode *patterns::Concat::operator()() { - auto concat_op = pattern->NewNode(concat_op_repr())->assert_is_op("concat"); - - auto output_var = pattern->NewNode(concat_out_repr()) - ->AsOutput() - ->assert_is_op_output("concat", "Out"); - - concat_op->LinksTo({output_var}); - return output_var; -} - -PDNode *patterns::ConcatReLU::operator()() { - auto concat_op = pattern->NewNode(concat_op_repr())->assert_is_op("concat"); - auto relu_op = pattern->NewNode(relu_op_repr())->assert_is_op("relu"); - - auto concat_out = - pattern->NewNode(concat_out_repr())->assert_is_op_output("concat", "Out"); - - auto relu_out = pattern->NewNode(relu_out_repr()) - ->AsOutput() - ->assert_is_op_output("relu", "Out"); - - concat_op->LinksTo({concat_out}); - relu_op->LinksFrom({concat_out}).LinksTo({relu_out}); - - return relu_out; -} - -PDNode *patterns::ConvConcatReLU::operator()() { - auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); - auto concat_op = pattern->NewNode(concat_op_repr())->assert_is_op("concat"); - auto relu_op = pattern->NewNode(relu_op_repr())->assert_is_op("relu"); - - auto conv_out = pattern->NewNode(conv_out_repr()) - ->assert_is_op_output("conv2d", "Output"); - - auto concat_out = pattern->NewNode(concat_out_repr()) - ->assert_is_op_output("concat", "Out") - ->assert_is_op_input("relu", "X"); - - auto relu_out = pattern->NewNode(relu_out_repr()) - ->AsOutput() - ->assert_is_op_output("relu", "Out"); - - conv_op->LinksTo({conv_out}); - concat_op->LinksFrom({conv_out}).LinksTo({concat_out}); - relu_op->LinksFrom({concat_out}).LinksTo({relu_out}); - - return relu_out; -} - std::unordered_set conv_act_set({"identity", "relu"}); PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { @@ -1749,16 +1641,13 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, const std::string &op_type, const std::string &weight_name, int times, - const std::string &quant_type, - const std::string &dequant_type) { - int kNumFields = 5; + const std::string &quant_type) { + const int kNumFields = 5; const int kQuantizedWeightOffset = 0; const int kQuantizedOpOffset = 1; const int kQuantizedOpOutOffset = 2; const int kDequantOpOffset = 3; const int kDequantOpOutOffset = 4; - const int kDequantOpWeightScaleOffset = 5; - // the quant op always be one. 
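// ---------------------------------------------------------------------------
// [editorial aside] The QuantDequantOpFuse pattern being edited here stitches
// fake-quantize ops to their matching fake-dequantize ops. Functionally, that
// pair is a round trip through an integer grid; a simplified 8-bit sketch,
// with the formula assumed from the op names rather than taken from the
// kernels:
#include <cmath>

float FakeQuantDequant(float x, float max_abs, int bits = 8) {
  const float range = static_cast<float>((1 << (bits - 1)) - 1);  // 127
  const float q = std::round(x / max_abs * range);  // snap to the int grid
  return q * max_abs / range;                       // map back to float
}
// ---------------------------------------------------------------------------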
auto quant_op_in_scale = pattern->NewNode(GetNodeName("quant_op_in_scale")) ->assert_is_op_input(quant_type, "InScale") @@ -1766,19 +1655,11 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, auto quant_op = pattern->NewNode(GetNodeName("quant_op"))->assert_is_op(quant_type); - PDNode *quant_op_out_scale = nullptr; - if (dequant_type == "fake_channel_wise_dequantize_max_abs") { - kNumFields += 1; - quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale")) - ->assert_is_op_output(quant_type, "OutScale") - ->assert_is_op_nth_input(dequant_type, "Scales", 1) - ->AsIntermediate(); - } else { - quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale")) - ->assert_is_op_output(quant_type, "OutScale") - ->assert_is_op_input(dequant_type, "Scale") - ->AsIntermediate(); - } + auto quant_op_out_scale = + pattern->NewNode(GetNodeName("quant_op_out_scale")) + ->assert_is_op_output(quant_type, "OutScale") + ->assert_is_op_input("fake_dequantize_max_abs", "Scale") + ->AsIntermediate(); auto quant_op_out = pattern->NewNode(GetNodeName("quant_op_out")) ->assert_is_op_output(quant_type, "Out") @@ -1799,25 +1680,16 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, nodes.push_back( pattern->NewNode(GetNodeName("quantized_op_out") + std::to_string(i)) ->assert_is_op_output(op_type) - ->assert_is_op_input(dequant_type, "X") + ->assert_is_op_input("fake_dequantize_max_abs", "X") ->AsIntermediate()); nodes.push_back( pattern->NewNode(GetNodeName("dequant_op") + std::to_string(i)) - ->assert_is_op(dequant_type)); - + ->assert_is_op("fake_dequantize_max_abs")); nodes.push_back( pattern->NewNode(GetNodeName("dequant_op_out") + std::to_string(i)) - ->assert_is_op_output(dequant_type, "Out") + ->assert_is_op_output("fake_dequantize_max_abs", "Out") ->AsOutput()); - - if (dequant_type == "fake_channel_wise_dequantize_max_abs") { - nodes.push_back(pattern - ->NewNode(GetNodeName("dequant_channel_scale") + - std::to_string(i)) - ->assert_is_op_nth_input(dequant_type, "Scales", 0) - ->AsInput()); - } } quant_op->LinksFrom({quant_op_input, quant_op_in_scale}); @@ -1827,14 +1699,8 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]}); nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom( {nodes[i * kNumFields + kQuantizedOpOffset]}); - if (dequant_type == "fake_channel_wise_dequantize_max_abs") { - nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( - {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale, - nodes[i * kNumFields + kDequantOpWeightScaleOffset]}); - } else { - nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( - {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale}); - } + nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( + {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale}); nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom( {nodes[i * kNumFields + kDequantOpOffset]}); } @@ -1871,41 +1737,6 @@ void patterns::ShuffleChannelPattern::operator()(PDNode *reshape1_in) { reshape2_out->LinksFrom({reshape2_op}); } -void patterns::DeleteQuantDequantOpPattern::operator()() { - auto any_op_out = - pattern->NewNode(any_op_out_repr()) - ->assert_is_op_input( - "fake_quantize_dequantize_moving_average_abs_max", "X") - ->AsInput(); - - auto quant_dequant_op_inscale = - pattern->NewNode(quant_dequant_op_inscale_repr()) - ->assert_is_op_input( - "fake_quantize_dequantize_moving_average_abs_max", "InScale") - 
->AsInput(); - auto quant_dequant_op = - pattern->NewNode(quant_dequant_op_repr()) - ->assert_is_op("fake_quantize_dequantize_moving_average_abs_max"); - - auto quant_dequant_out = - pattern->NewNode(quant_dequant_op_out_repr()) - ->assert_is_op_output( - "fake_quantize_dequantize_moving_average_abs_max", "Out") - ->AsIntermediate(); - - auto quant_dequant_op_outscale = - pattern->NewNode(quant_dequant_op_outscale_repr()) - ->assert_is_op_output( - "fake_quantize_dequantize_moving_average_abs_max", "OutScale") - ->AsOutput(); - auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput(); - - quant_dequant_op->LinksFrom({any_op_out, quant_dequant_op_inscale}); - quant_dequant_op_outscale->LinksFrom({quant_dequant_op}); - quant_dequant_out->LinksFrom({quant_dequant_op}); - any_op2->LinksFrom({quant_dequant_out}); -} - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 1c53b910522..907371b56b0 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -449,27 +449,6 @@ struct ConvReLU : public PatternBase { PATTERN_DECL_NODE(relu_out); }; -// CONV with ReLU6 -// op: conv + relu6 -// named nodes: -// conv_input, conv_weight, -// conv_out, conv, -// relu6_out, relu6 -struct ConvBReLU : public PatternBase { - ConvBReLU(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "conv_bounded_relu") {} - - PDNode* operator()(PDNode* conv_input); - - // declare operator node's name - PATTERN_DECL_NODE(conv); - PATTERN_DECL_NODE(brelu); - // declare variable node's name - PATTERN_DECL_NODE(conv_weight); - PATTERN_DECL_NODE(conv_out); - PATTERN_DECL_NODE(brelu_out); -}; - // SEQCONV with Elementwise_Add ReLU // op: seqconv + elementwise_add + relu // named nodes: @@ -517,25 +496,6 @@ struct FC : public PatternBase { PATTERN_DECL_NODE(Out); }; -// MKL-DNN's FC with bias -// op: fc -// named node: -// fc -// w, bias, output -struct FCMKLDNN : public PatternBase { - FCMKLDNN(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "fc_mkldnn") {} - - PDNode* operator()(PDNode* x, bool with_bias); - - // declare operator node's name - PATTERN_DECL_NODE(fc); - // declare variable node's name - PATTERN_DECL_NODE(weights); - PATTERN_DECL_NODE(bias); - PATTERN_DECL_NODE(output); -}; - // Embedding struct Embedding : public PatternBase { Embedding(PDPattern* pattern, const std::string& name_scope) @@ -669,7 +629,7 @@ struct ElewiseAddActInplaceGrad : public PatternBase { struct ConvBias : public PatternBase { ConvBias(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "conv_bias") {} - PDNode* operator()(PDNode* conv_input, std::string conv_type = "conv2d"); + PDNode* operator()(PDNode* conv_input, bool is_conv3d = false); // declare operator node's name PATTERN_DECL_NODE(conv); PATTERN_DECL_NODE(eltwise); @@ -747,52 +707,6 @@ struct ElementwiseAdd : public PatternBase { PATTERN_DECL_NODE(elementwise_add_out); }; -// Concat op -// Forward pass for concat. -// concat_out is a result of the operator. 
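// ---------------------------------------------------------------------------
// Illustrative sketch (hypothetical, not part of this patch): the general
// shape of these PatternBase subclasses. PATTERN_DECL_NODE(name) generates
// the per-node helper used to look the matched node up in a subgraph (via
// GET_IR_NODE_FROM_SUBGRAPH in the corresponding .cc handlers).
struct ExamplePattern : public PatternBase {
  ExamplePattern(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "example") {}

  PDNode* operator()();  // would create the PDNodes and link them

  PATTERN_DECL_NODE(example_op);
  PATTERN_DECL_NODE(example_out);
};
// ---------------------------------------------------------------------------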
-struct Concat : public PatternBase { - Concat(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "concat") {} - - PDNode* operator()(); - - PATTERN_DECL_NODE(concat_op); - PATTERN_DECL_NODE(concat_out); -}; - -// Concat + ReLU -// named nodes: -// concat_op, concat_out, relu_op, relu_out -struct ConcatReLU : public PatternBase { - ConcatReLU(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "concat_relu") {} - - PDNode* operator()(); - - PATTERN_DECL_NODE(concat_op); - PATTERN_DECL_NODE(concat_out); - PATTERN_DECL_NODE(relu_op); - PATTERN_DECL_NODE(relu_out); -}; - -// Conv + Concat + ReLU -// named nodes: -// conv_op, conv_out -// concat_op, concat_out, relu_op, relu_out -struct ConvConcatReLU : public PatternBase { - ConvConcatReLU(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "conv_concat_relu") {} - - PDNode* operator()(); - - PATTERN_DECL_NODE(conv_op); - PATTERN_DECL_NODE(conv_out); - PATTERN_DECL_NODE(concat_op); - PATTERN_DECL_NODE(concat_out); - PATTERN_DECL_NODE(relu_op); - PATTERN_DECL_NODE(relu_out); -}; - // Conv + ElementwiseAdd + an activation // This pattern can futher fuse the conv related ops after the conv+bn fusion. struct ConvElementwiseaddAct : public PatternBase { @@ -967,8 +881,7 @@ struct QuantDequantOpFuse : public PatternBase { void operator()(PDNode* quant_op_input, const std::string& op_name, const std::string& weight_name, int times, - const std::string& quant_type, - const std::string& dequant_type); + const std::string& quant_type); std::string GetNodeName(const std::string& op_type) { return PDNodeName(name_scope_, repr_, id_, op_type); @@ -994,20 +907,6 @@ struct ShuffleChannelPattern : public PatternBase { PATTERN_DECL_NODE(reshape2_out); }; -struct DeleteQuantDequantOpPattern : public PatternBase { - DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {} - - void operator()(); - - PATTERN_DECL_NODE(any_op_out); - PATTERN_DECL_NODE(quant_dequant_op_inscale); - PATTERN_DECL_NODE(quant_dequant_op); - PATTERN_DECL_NODE(quant_dequant_op_outscale); - PATTERN_DECL_NODE(quant_dequant_op_out); - PATTERN_DECL_NODE(any_op2); -}; - } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index f4df4cfeba6..3722aaab1fd 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/lite/utils/string.h" #include "paddle/fluid/string/printf.h" namespace paddle { @@ -84,7 +85,8 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const { auto marked_nodes = ConsumeMarkedNodes(graph); // Create nodes for (const Node* n : graph->Nodes()) { - std::string node_id = FormatName(n) + "(" + std::to_string(n->id()) + ")"; + std::string node_id = + lite::string_format("%s(%d)", FormatName(n).c_str(), n->id()); if (n->IsOp()) { decltype(op_attrs) attr = marked_nodes.count(n) ? 
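// ---------------------------------------------------------------------------
// Illustrative sketch (an assumption about the helper's behavior, not the
// actual paddle/fluid/lite implementation): a printf-style formatter like the
// lite::string_format call used above can be written with two vsnprintf
// passes, one to measure and one to fill the buffer.
#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

static std::string StringFormatSketch(const char* fmt, ...) {
  va_list args, args_copy;
  va_start(args, fmt);
  va_copy(args_copy, args);
  const int len = std::vsnprintf(nullptr, 0, fmt, args);  // measure only
  va_end(args);
  std::vector<char> buf(len > 0 ? static_cast<size_t>(len) + 1 : 1);
  std::vsnprintf(buf.data(), buf.size(), fmt, args_copy);  // actually format
  va_end(args_copy);
  return std::string(buf.data());
}
// e.g. StringFormatSketch("%s(%d)", FormatName(n).c_str(), n->id());
// ---------------------------------------------------------------------------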
marked_op_attrs : op_attrs;
diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
index 9c923480bac..d1718857a5d 100644
--- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h
+++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h
@@ -12,7 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#pragma once
+#ifndef PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
+#define PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_

 #include <string>
 #include <vector>
@@ -125,3 +126,5 @@ class LockFreeOptimizePass : public Pass {
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
+
+#endif  // PADDLE_FLUID_FRAMEWORK_IR_LOCK_FREE_OPTIMIZE_PASS_H_
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc
index f57e7bb2301..ed746ea988e 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc
@@ -48,6 +48,8 @@ DEFINE_bool(
     "Such as scale, elementwise_add"
     "By default, it's turned off");

+DECLARE_string(memory_optimize_debug);
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -459,6 +461,13 @@ void InplacePass::ApplyImpl(ir::Graph *graph) const {
         continue;
       }

+      // Debug interface: this output variable is skipped by the pass.
+      if (out_arg == FLAGS_memory_optimize_debug) {
+        VLOG(4) << "Skipped var by force. FLAGS_memory_optimize_debug="
+                << out_node->Name();
+        continue;
+      }
+
       VLOG(4) << "Rename " << out_node->Name() << " with " << in_node->Name()
               << " in " << op_type;
       RenameInOut(op_node, in_node, out_node);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc
index af3fbb2808b..8d5271b5081 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc
@@ -31,6 +31,15 @@
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"

+DEFINE_bool(enable_subgraph_optimize, false,
+            "SubGraph also reuses global graph variables; it reduces the "
+            "memory occupation but carries a higher risk of memory reuse "
+            "errors. Disabled by default.");
+DEFINE_string(memory_optimize_debug, "",
+              "Debug the operator output variable when doing variable reuse "
+              "in the memory reuse pass. "
+              "Only for debugging; disabled by default.");
+
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -48,6 +57,15 @@ void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const {
     auto* op_desc = op->Op();
     // some op in graph has no op desc
     if (op_desc == nullptr) continue;
+    if (OpHasSubBlock(op_desc)) {
+      if (FLAGS_enable_subgraph_optimize) {
+        SubGraphOptimize(op_desc);
+      } else {
+        VLOG(3) << op->Name()
+                << " has a sub-block, but subgraph optimize is disabled; skipped.";
+        continue;
+      }
+    }

     for (auto& var : op->outputs) {
       if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) {
@@ -64,6 +82,13 @@ void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const {
                 << "replace it again. Skip this candidate.";
         cache = pool_.FindNextBestFitNode(var, cache);
       }
+      if (var->Name() == FLAGS_memory_optimize_debug) {
+        VLOG(3) << "start match var " << DebugString(var) << " of op "
+                << op->Name();
+        VLOG(3) << pool_.ToString();
+        VLOG(3) << "matched in pool : "
+                << ((cache == nullptr) ?
"False" : "True"); + } if (cache != nullptr) { int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); @@ -103,6 +128,81 @@ void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const { graph->ResolveHazard(var_nodes_); } +void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { + // conditional block, while op and their grad op + auto* sub_block_desc = + AttrReader(op_desc->GetAttrMap()).Get("sub_block"); + + // create a mirror block to construct an IR Graph. + ProgramDesc prog; + auto* copy_block = prog.MutableBlock(0); + for (auto* op : sub_block_desc->AllOps()) { + auto* copy_op = copy_block->AppendOp(); + copy_op->CopyFrom(*op); + copy_op->Flush(); + } + + for (auto* var : sub_block_desc->AllVars()) { + auto* copy_var = copy_block->Var(var->Name()); + copy_var->SetDataType(var->GetDataType()); + // only lod tensor can be reused. So ignore the multiple dims case. + copy_var->SetType(var->GetType()); + copy_var->SetShape(var->GetShape()); + copy_var->SetPersistable(var->Persistable()); + } + + ir::Graph sub_graph(prog); + std::unordered_set sub_graph_all_ops; + FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) { + // sub_graph_all_ops.emplace(var); + if (var->IsVar() && !var->IsCtrlVar()) { + sub_graph_all_ops.emplace(var); + } + }); + int sub_reuse_id = 0; + // subgraph nodes is unordered, reuse need to follow the desc order. + // find the right op node through the descs + for (auto* sub_op_desc : sub_block_desc->AllOps()) { + ir::Node* sub_op = nullptr; + for (auto* node : sub_graph_all_ops) { + if (node->Op() == sub_op_desc) { + sub_op = node; + break; + } + } + PADDLE_ENFORCE(sub_op != nullptr); + for (auto* var : sub_op->outputs) { + if (NodeCanReused(var)) { + ir::Node* cache = pool_.FindBestFitNode(var); + if (cache != nullptr) { + if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { + continue; + } + int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(sub_reuse_id++), DebugString(var), + DebugString(cache), node_idx_in_pool, + static_cast(pool_.size())); + // NOTE(dzh): subblock is not in IR graph. Modify the block_desc + // immediately to make the subblock variable reuse strategy take + // effect. Because it is a single op in graph. No need to + // update the ir nodes. + // FIXME(liuwei1031): Graph is not aware of the existence of + // BlockDescs and ProgramDescs. + // The operations related to BlockDesc or ProgramDesc should perform + // on Graph or Node directly! + sub_op_desc->Rename(var->Name(), cache->Name()); + if (sub_op_desc->Block() != nullptr && + sub_op_desc->Block()->HasVar(var->Name())) { + sub_op_desc->Block()->RemoveVar(var->Name()); + } + } + } + } + } +} + void MemoryOptimizePass::CollectSkipVarsSet(ir::Graph* graph) const { // fill skip_set_ PADDLE_ENFORCE(graph->Has(kMemOptSkipVars)); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc index 040b769f89d..075a1955eb6 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc @@ -140,9 +140,9 @@ class RecordSkipMemoryOptVarsPass : public ir::Pass { // fail since "states" and "ex_states" cannot be found in main block. // When memory optimization is enabled, "states", "ex_states" and their // gradient should be skipped. 
- auto ex_states = + auto& ex_states = boost::get>(op_desc->GetAttr("ex_states")); - auto states = + auto& states = boost::get>(op_desc->GetAttr("states")); if (op_type == "recurrent") { UpdateSkipVarSet(skip_vars, {ex_states, states}); @@ -154,7 +154,7 @@ class RecordSkipMemoryOptVarsPass : public ir::Pass { UpdateSkipVarSet( skip_vars, {ToGradVarName(op_desc->Input("parameters")), - ToGradVarName(op_desc->Input("inputs")), ex_states, states, + ToGradVarName(op_desc->Input("input")), ex_states, states, ToGradVarName(ex_states), ToGradVarName(states)}); } } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index bbfc8c00558..8ef3993b065 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -45,14 +45,16 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { auto* scope = param_scope(); PADDLE_ENFORCE(scope); + std::string type = is_conv3d() ? "conv3d" : "conv2d"; + GraphPatternDetector gpd; auto* conv_input = gpd.mutable_pattern() ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) ->AsInput() - ->assert_is_op_input(type(), "Input"); + ->assert_is_op_input(type, "Input"); patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_); - conv_bias_pattern(conv_input, type()); + conv_bias_pattern(conv_input, is_conv3d()); int found_conv_bias_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -73,7 +75,7 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { // check if fuse can be done and if MKL-DNN should be used FuseOptions fuse_option = FindFuseOption(*conv, *eltwise); if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) { - VLOG(3) << "do not perform " + type() + "+bias fuse"; + VLOG(3) << "do not perform conv+bias fuse"; return; } @@ -108,7 +110,7 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetInput("Filter", std::vector({conv_weight->Name()})); desc.SetInput("Bias", std::vector({eltwise_bias->Name()})); desc.SetOutput("Output", std::vector({eltwise_out->Name()})); - desc.SetType(type()); + desc.SetType(type); for (auto& attr : conv->Op()->GetAttrMap()) { desc.SetAttr(attr.first, attr.second); @@ -133,7 +135,5 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { } // namespace paddle REGISTER_PASS(conv_bias_mkldnn_fuse_pass, paddle::framework::ir::ConvBiasFusePass); -REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass, - paddle::framework::ir::Conv2DTransposeBiasFusePass); REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, paddle::framework::ir::Conv3DBiasFusePass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h index 833fbc748eb..84106d0655d 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h @@ -26,7 +26,7 @@ namespace ir { class ConvBiasFusePass : public FusePassBase { public: virtual ~ConvBiasFusePass() {} - virtual std::string type() const { return "conv2d"; } + virtual bool is_conv3d() const { return false; } protected: void ApplyImpl(ir::Graph* graph) const override; @@ -35,14 +35,9 @@ class ConvBiasFusePass : public FusePassBase { /* * Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp. 
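 * Illustrative sketch (editorial note): ApplyImpl in the base pass derives
 * every type-dependent string from the single virtual predicate, e.g.
 *
 *   std::string type = is_conv3d() ? "conv3d" : "conv2d";
 *   conv_input->assert_is_op_input(type, "Input");
 *
 * so a variant pass like the one declared below only overrides is_conv3d().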
*/ -class Conv2DTransposeBiasFusePass : public ConvBiasFusePass { - public: - std::string type() const override { return "conv2d_transpose"; } -}; - class Conv3DBiasFusePass : public ConvBiasFusePass { public: - std::string type() const override { return "conv3d"; } + bool is_conv3d() const override { return true; } }; } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index 427d7bc9aeb..ff7f9190fde 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -81,7 +81,8 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, const char* var_name) { auto x = scope->Var(var_name); auto tensor = x->GetMutable(); - tensor->mutable_data(place, proto::VarType::FP32, 1); + tensor->mutable_data(place, proto::VarType::FP32, + ::paddle::memory::Allocator::kDefault, 1); } void MainTest(bool convWithExistingBias) { @@ -96,7 +97,7 @@ void MainTest(bool convWithExistingBias) { InitTensorHolder(&scope, place, "conv_bias"); InitTensorHolder(&scope, place, "eltwise_bias"); } - graph->SetNotOwned(kParamScopeAttr, &scope); + graph->Set(kParamScopeAttr, new framework::Scope*(&scope)); auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass"); @@ -140,12 +141,7 @@ TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); } TEST(ConvBiasFusePass, conv3d) { Conv3DBiasFusePass pass; - ASSERT_EQ(pass.type(), std::string("conv3d")); -} - -TEST(ConvBiasFusePass, conv2d_transpose) { - Conv2DTransposeBiasFusePass pass; - ASSERT_EQ(pass.type(), std::string("conv2d_transpose")); + ASSERT_TRUE(pass.is_conv3d()); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index dd3ee50e040..dff98e523ac 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" -#include #include #include #include "paddle/fluid/framework/eigen.h" @@ -73,53 +72,6 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); } -void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name, - VarQuantScale* scales, bool are_unsigned, - std::string scale_attr_name) const { - auto inputs = op->inputs; - PADDLE_ENFORCE_GE(inputs.size(), 1); - - // create a quantize op desc prototype - OpDesc q_desc; - q_desc.SetType("quantize"); - - std::vector quantize_out_nodes(inputs.size()); - std::vector quantize_out_node_names(inputs.size()); - - double scale_min = std::numeric_limits::max(); - for (const auto& input : inputs) { - double scale = (*scales)[input->Name()].second.data()[0]; - if (scale < scale_min) scale_min = scale; - } - unsigned max = are_unsigned ? 
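// ---------------------------------------------------------------------------
// Illustrative sketch (hypothetical helper mirroring the deleted
// QuantizeInputs logic around this point): inputs quantized together must
// share one scale, so the smallest per-input scale-to-one is stretched to the
// integer range; U8_MAX/S8_MAX are assumed to be 255/127, and the input list
// is assumed non-empty (the deleted code enforced inputs.size() >= 1).
#include <algorithm>
#include <vector>

static float SharedQuantScaleSketch(const std::vector<double>& scales_to_one,
                                    bool are_unsigned) {
  const double scale_min =
      *std::min_element(scales_to_one.begin(), scales_to_one.end());
  const unsigned max = are_unsigned ? 255u /*U8_MAX*/ : 127u /*S8_MAX*/;
  return static_cast<float>(scale_min * max);
}
// ---------------------------------------------------------------------------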
U8_MAX : S8_MAX; - float scale = scale_min * max; - - for (size_t i = 0; i < inputs.size(); i++) { - // Create quantize output variable - VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); - quantize_out_nodes[i] = g->CreateVarNode(&quantize_out_desc); - quantize_out_node_names[i] = quantize_out_nodes[i]->Name(); - - q_desc.SetAttr("Scale", scale); - q_desc.SetInput("Input", std::vector({inputs[i]->Name()})); - q_desc.SetOutput("Output", - std::vector({quantize_out_node_names[i]})); - q_desc.SetAttr("is_negative_input", !are_unsigned); - auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. - - // link quantize op - UnlinkNodes(inputs[i], op); - IR_NODE_LINK_TO(inputs[i], quantize_op); - IR_NODE_LINK_TO(quantize_op, quantize_out_nodes[i]); - IR_NODE_LINK_TO(quantize_out_nodes[i], op); - } - - // update op's input - op->Op()->SetInput(input_name, quantize_out_node_names); - - if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); -} - void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output, std::string output_name, double scale_to_one, bool is_unsigned, @@ -264,48 +216,6 @@ void CPUQuantizePass::QuantizePool(Graph* graph) const { PrettyLogDetail("--- quantized %d pool2d ops", quantize_pool_count); } -void CPUQuantizePass::QuantizeConcat(Graph* graph) const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); - patterns::Concat concat_pattern{pattern, name_scope_}; - concat_pattern(); - - int quantize_concat_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "Quantize concat op"; - GET_IR_NODE_FROM_SUBGRAPH(concat_op, concat_op, concat_pattern); - auto* concat_op_desc = concat_op->Op(); - - // skip if should not be quantized - if (!concat_op_desc->HasAttr("use_quantizer") || - !boost::get(concat_op_desc->GetAttr("use_quantizer"))) - return; - - GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, concat_pattern); - - // get scales calculated after warmup, they scale variables to MAX=1.0 - auto scales = Get("quant_var_scales"); - - // if all inputs were unsigned, then the output was set to unsigned - // during the scale calculation step - bool are_all_inputs_unsigned = scales[concat_out->Name()].first; - QuantizeInputs(g, concat_op, "X", &scales, are_all_inputs_unsigned); - - auto output_scale = scales[concat_out->Name()].second.data()[0]; - - DequantizeOutput(g, concat_op, concat_out, "Out", output_scale, - are_all_inputs_unsigned); - - ++quantize_concat_count; - }; - - gpd(graph, handler); - AddStatis(quantize_concat_count); - - PrettyLogDetail("--- quantized %d concat ops", quantize_concat_count); -} - void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Quantizing the graph."; PADDLE_ENFORCE(graph); @@ -316,7 +226,6 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeConv(graph, false /* with_residual_data */); QuantizeConv(graph, true /* with_residual_data */); QuantizePool(graph); - QuantizeConcat(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 61a28fd3131..a178c4dc363 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -48,17 +48,10 @@ class CPUQuantizePass : public FusePassBase { void QuantizePool(Graph* graph) const; - void QuantizeConcat(Graph* graph) const; - void QuantizeInput(Graph* g, Node* op, Node* input, std::string 
input_name, double scale_to_one, bool is_unsigned, std::string scale_attr_name = "") const; - // quantize all inputs of given name with the same (minimum) scale - void QuantizeInputs(Graph* g, Node* op, std::string input_name, - VarQuantScale* scales, bool are_unsigned, - std::string scale_attr_name = "") const; - void DequantizeOutput(Graph* g, Node* op, Node* output, std::string output_name, double scale_to_one, bool is_unsigned, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 0a689441867..8716a412e4d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -60,14 +60,9 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, if (inputs.size() > 1) op->SetInput("W", {inputs[1]}); if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); op->SetOutput("Out", {outputs[0]}); - } else if (type == "concat") { - op->SetInput("X", inputs); - op->SetOutput("Out", outputs); - op->SetAttr("use_quantizer", use_quantizer); } } -namespace { static const std::initializer_list variable_names{ "a", "w1", "c", "d", "w2", "e", "f", "g", "h", "w3", "b1", "i", "j", "w4", "b2"}; @@ -110,7 +105,8 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, const char* var_name) { auto x = scope->Var(var_name); auto tensor = x->GetMutable(); - tensor->mutable_data(place, proto::VarType::FP32, 1); + tensor->mutable_data(place, proto::VarType::FP32, + ::paddle::memory::Allocator::kDefault, 1); } void MainTest(const ProgramDesc& prog, int conv_count, int pool_count, @@ -136,7 +132,7 @@ void MainTest(const ProgramDesc& prog, int conv_count, int pool_count, (*scales)[v] = std::make_pair(false, std::move(tensor)); } - graph->SetNotOwned(kParamScopeAttr, &scope); + graph->Set(kParamScopeAttr, new framework::Scope*(&scope)); auto pass = PassRegistry::Instance().Get("cpu_quantize_pass"); pass->Set("quant_var_scales", scales); @@ -208,101 +204,6 @@ TEST(CpuQuantizePass, do_not_quantize) { 1.0f); } -} // namespace - -namespace { -static const std::initializer_list variable_names_concat = { - "a1", "b1", "a2", "b2", "c", "d"}; - -// a1->Pool1->b1 -// a2->Pool2->b2 -// (b1,b2)->Concat->c -// c->Pool3->d -ProgramDesc BuildProgramDescConcat() { - ProgramDesc prog; - - SetOp(&prog, "pool2d", "Pool1", {"a1"}, {"b1"}, true, false); - SetOp(&prog, "pool2d", "Pool2", {"a2"}, {"b2"}, true, false); - SetOp(&prog, "concat", "Concat", {"b1", "b2"}, {"c"}, true, true); - SetOp(&prog, "pool2d", "Pool3", {"c"}, {"d"}, true, false); - - return prog; -} - -void MainTestConcat(const ProgramDesc& prog, int pool_count, int concat_count, - int quant_count, int dequant_count, int added_nodes_count) { - std::unique_ptr graph(new ir::Graph(prog)); - - // Init scope, as it is used in pass - auto place = paddle::platform::CPUPlace(); - NaiveExecutor exe{place}; - Scope scope; - exe.CreateVariables(prog, 0, true, &scope); - - auto* scales = new VarQuantScale(); - - for (auto& v : variable_names_concat) { - InitTensorHolder(&scope, place, v.c_str()); - LoDTensor tensor; - tensor.Resize({1}); - auto* ptr = tensor.mutable_data(place); - ptr[0] = 2.0; - - (*scales)[v] = std::make_pair(false, std::move(tensor)); - } - - graph->SetNotOwned(kParamScopeAttr, &scope); - - auto pass = PassRegistry::Instance().Get("cpu_quantize_pass"); - pass->Set("quant_var_scales", scales); - - int original_nodes_num = graph->Nodes().size(); - - 
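// ---------------------------------------------------------------------------
// Illustrative sketch (hypothetical helper): the added_nodes_count checked by
// these testers follows from each quantized input adding a quantize op plus
// its output var, and each dequantized output adding a dequantize op plus its
// output var; e.g. 2 quantized inputs and 1 dequantized output give
// 2*2 + 2*1 == 6, matching the concat test below.
static int ExpectedAddedNodesSketch(int quantized_inputs,
                                    int dequantized_outputs) {
  return 2 * quantized_inputs + 2 * dequantized_outputs;
}
// ---------------------------------------------------------------------------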
graph.reset(pass->Apply(graph.release())); - - int current_nodes_num = graph->Nodes().size(); - - int quantize_nodes_count = 0; - int dequantize_nodes_count = 0; - int concat_nodes_count = 0; - int pool2d_nodes_count = 0; - for (auto* node : graph->Nodes()) { - if (node->IsOp()) { - auto* op = node->Op(); - if (op->Type() == "concat") { - concat_nodes_count++; - } else if (op->Type() == "pool2d") { - pool2d_nodes_count++; - } else if (op->Type() == "quantize") { - quantize_nodes_count++; - } else if (op->Type() == "dequantize") { - dequantize_nodes_count++; - } - } - } - EXPECT_EQ(concat_nodes_count, concat_count); - EXPECT_EQ(pool2d_nodes_count, pool_count); - EXPECT_EQ(quantize_nodes_count, quant_count); - EXPECT_EQ(dequantize_nodes_count, dequant_count); - EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); -} - -TEST(CpuQuantizePass, concat) { - // a1->Pool1->b1 - // a2->Pool2->b2 - // (b1->QUANT1->IN1, b2->QUANT2->IN2)->Concat->c - // c->OUT1->DEQUANT1->Pool3->d - int pool_count = 3; - int concat_count = 1; - int quant_count = 2; - int dequant_count = 1; - int added_nodes_count = 6; - MainTestConcat(BuildProgramDescConcat(), pool_count, concat_count, - quant_count, dequant_count, added_nodes_count); -} - -} // namespace - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 2270e2b5cc5..debbbd6440b 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -14,7 +14,6 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h" -#include #include #include #include "paddle/fluid/platform/enforce.h" @@ -82,10 +81,12 @@ void CPUQuantizeSquashPass::Squash( auto quant_out_var_name = quant_out->Name(); auto next_op_inputs = next_op_desc->InputNames(); for (const auto& name : next_op_inputs) { - auto input_names = next_op_desc->Input(name); - std::replace(input_names.begin(), input_names.end(), quant_out_var_name, - dequant_in->Name()); - next_op_desc->SetInput(name, input_names); + auto var_name = next_op_desc->Input(name)[0]; + if (var_name.compare(quant_out_var_name) == 0) { + next_op_desc->SetInput( + name, std::vector({dequant_in->Name()})); + break; + } } if (keep_dequant) diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index 057a790ccb3..fda337066f4 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -102,7 +102,8 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, const char* var_name) { auto x = scope->Var(var_name); auto tensor = x->GetMutable(); - tensor->mutable_data(place, proto::VarType::FP32, 1); + tensor->mutable_data(place, proto::VarType::FP32, + ::paddle::memory::Allocator::kDefault, 1); } void MainTest(const ProgramDesc& prog, int removed_nodes_num) { @@ -118,7 +119,7 @@ void MainTest(const ProgramDesc& prog, int removed_nodes_num) { InitTensorHolder(&scope, place, v.c_str()); } - graph->SetNotOwned(kParamScopeAttr, &scope); + graph->Set(kParamScopeAttr, new framework::Scope*(&scope)); auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass"); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc 
b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc index a2092a5059a..500419e4b78 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" -#include #include #include @@ -25,9 +24,6 @@ void MKLDNNPlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Applies MKL-DNN placement strategy."; const auto& op_types_list = Get>("mkldnn_enabled_op_types"); - if (!graph->Has("use_mkldnn")) { - graph->Set("use_mkldnn", new bool(true)); - } for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt index 4cdb6a7d308..096428e58ab 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt @@ -14,4 +14,3 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS all_reduce_op_handle graph graph_helper pass) -cc_library(backward_optimizer_op_deps_pass SRCS backward_optimizer_op_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc index 1019c4f8427..314f8c0424d 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc @@ -22,7 +22,6 @@ #include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/container_cast.h" -#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -36,20 +35,9 @@ namespace ir { class AllReduceDepsPass : public ir::Pass { protected: void ApplyImpl(ir::Graph* graph) const override { - std::vector all_reduce_op_handles = + std::vector all_reduce_op_handles = GetSortedAllReduceOps(*graph); -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - auto use_hierarchical_allreduce = - Get(details::kUseHierarchicalAllReduce); - for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) { - auto op_handle = - dynamic_cast(all_reduce_op_handles[i]); - PADDLE_ENFORCE(op_handle, "op_handle must be NCCLOpHandleBase"); - op_handle->SetRunEnv(i, use_hierarchical_allreduce); - } -#endif - for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) { auto* dep_var = new details::DummyVarHandle(graph->CreateControlDepVar()); graph->Get(details::kGraphDepVars) @@ -63,12 +51,13 @@ class AllReduceDepsPass : public ir::Pass { } } - std::vector GetSortedAllReduceOps( + std::vector GetSortedAllReduceOps( const ir::Graph& graph) const { - std::vector all_reduce_op_handles; + std::vector all_reduce_op_handles; std::unordered_map pending_ops; std::unordered_set ready_ops; std::unordered_set next_ready_ops; + auto op_handles = ir::FilterByNodeWrapper(graph); size_t num_of_ops = op_handles.size(); for 
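// ---------------------------------------------------------------------------
// Illustrative sketch (hypothetical, standard-library only): the
// GetSortedAllReduceOps walk below is a ready-set (Kahn-style) topological
// sweep; an op becomes ready once all of its predecessors have run.
#include <queue>
#include <vector>

static std::vector<int> TopoOrderSketch(
    const std::vector<std::vector<int>>& successors,
    std::vector<int> in_degree) {
  std::vector<int> order;
  std::queue<int> ready;
  for (int v = 0; v < static_cast<int>(in_degree.size()); ++v) {
    if (in_degree[v] == 0) ready.push(v);  // sources form the first level
  }
  while (!ready.empty()) {
    const int v = ready.front();
    ready.pop();
    order.push_back(v);
    for (const int s : successors[v]) {
      if (--in_degree[s] == 0) ready.push(s);  // s just became ready
    }
  }
  return order;  // analogous to the pass's ready_ops/next_ready_ops sets
}
// ---------------------------------------------------------------------------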
(details::OpHandleBase* op : op_handles) { @@ -106,16 +95,13 @@ class AllReduceDepsPass : public ir::Pass { void GetSortedAllReduceOps( const std::unordered_set& ready_ops, - std::vector* all_reduce_op_handles) const { - std::vector current_all_reduce_op_handles; + std::vector* all_reduce_op_handles) const { + std::vector current_all_reduce_op_handles; for (auto& op_handle : ready_ops) { auto all_reduce_op_handle = dynamic_cast(op_handle); - auto fused_all_reduce_op_handle = - dynamic_cast(op_handle); - - if (all_reduce_op_handle || fused_all_reduce_op_handle) { - current_all_reduce_op_handles.emplace_back(op_handle); + if (all_reduce_op_handle) { + current_all_reduce_op_handles.emplace_back(all_reduce_op_handle); } } @@ -124,8 +110,8 @@ class AllReduceDepsPass : public ir::Pass { // Sort the current_all_reduce_op_handles according to the name of input. sort(current_all_reduce_op_handles.begin(), current_all_reduce_op_handles.end(), - [](const details::OpHandleBase* left, - const details::OpHandleBase* right) -> bool { + [](const details::AllReduceOpHandle* left, + const details::AllReduceOpHandle* right) -> bool { auto left_in_vars = details::DynamicCast(left->Inputs()); auto right_in_vars = @@ -140,9 +126,9 @@ class AllReduceDepsPass : public ir::Pass { current_all_reduce_op_handles.end()); } - void DebugString( - const ir::Graph& graph, - const std::vector& all_reduce_op_handles) const { + void DebugString(const ir::Graph& graph, + const std::vector& + all_reduce_op_handles) const { // get vars order std::map> vars = GetSoredGradientsFromStaleProgram(graph); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc index abfaf1b8d20..a2b4c37ab4a 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -34,8 +34,7 @@ class FuseAllReduceOpPass : public ir::Pass { auto &places = Get>(details::kPlaces); auto &local_scopes = Get>(details::kLocalScopes); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - auto *multi_nccl_ctxs = - &Get(details::kNCCLCtxs); + auto *nccl_ctxs = &Get(details::kNCCLCtxs); #endif std::unordered_set grads; @@ -95,7 +94,7 @@ class FuseAllReduceOpPass : public ir::Pass { } #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) InsertFusedAllReduce(places, local_scopes, group_size, - group_all_reduce_ops, multi_nccl_ctxs, &result); + group_all_reduce_ops, nccl_ctxs, &result); #else InsertFusedAllReduce(places, local_scopes, group_size, group_all_reduce_ops, &result); @@ -108,7 +107,7 @@ class FuseAllReduceOpPass : public ir::Pass { const size_t num_of_all_reduce, const std::vector &all_reduce_ops, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - const platform::NCCLCommunicator *multi_nccl_ctxs, + const platform::NCCLContextMap *nccl_ctxs, #endif ir::Graph *result) const { std::vector inputs; @@ -136,7 +135,7 @@ class FuseAllReduceOpPass : public ir::Pass { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places, - local_scopes, multi_nccl_ctxs, result); + local_scopes, nccl_ctxs, result); #else CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places, local_scopes, result); @@ -151,13 +150,13 @@ class FuseAllReduceOpPass : public ir::Pass { const std::vector &places, const std::vector &local_scopes, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - const 
platform::NCCLCommunicator *multi_nccl_ctxs, + const platform::NCCLContextMap *nccl_ctxs, #endif ir::Graph *result) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new details::FusedAllReduceOpHandle( result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), - local_scopes, places, num_of_all_reduce, multi_nccl_ctxs); + local_scopes, places, num_of_all_reduce, nccl_ctxs); #else auto *op_handle = new details::FusedAllReduceOpHandle( result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), @@ -173,7 +172,7 @@ class FuseAllReduceOpPass : public ir::Pass { } #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - if (!multi_nccl_ctxs) { + if (!nccl_ctxs) { SetCommunicationContext(places, op_handle); } #else diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index d6d9c8bb891..a4cb0599ac4 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -157,11 +157,7 @@ void MultiDevSSAGraphBuilderBase::Init() const { local_scopes_ = Get>(details::kLocalScopes); strategy_ = Get(kStrategy); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - multi_nccl_ctxs_ = &Get(details::kNCCLCtxs); - nccl_ctxs_ = nullptr; - if (multi_nccl_ctxs_) { - nccl_ctxs_ = multi_nccl_ctxs_->DefaultFlatCtx(); - } + nccl_ctxs_ = &Get(details::kNCCLCtxs); #endif PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); } @@ -464,20 +460,20 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, result->Get(kGraphOps).emplace_back( new details::SparseAllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), - scopes, places, multi_nccl_ctxs_, is_encoded, + scopes, places, nccl_ctxs_, is_encoded, static_cast(strategy_.trainers_endpoints_.size()) * places_.size())); } else { result->Get(kGraphOps).emplace_back( new details::AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), - scopes, places, multi_nccl_ctxs_)); + scopes, places, nccl_ctxs_)); } #elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back( new details::AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), - scopes, places, multi_nccl_ctxs_)); + scopes, places, nccl_ctxs_)); #else result->Get(kGraphOps).emplace_back( new details::AllReduceOpHandle( diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h index 9b36d231081..3434d45f142 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h @@ -96,8 +96,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { size_t device_id) const; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - mutable platform::NCCLContextMap *nccl_ctxs_{nullptr}; - mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr}; + mutable platform::NCCLContextMap *nccl_ctxs_; #endif mutable std::string loss_var_name_; @@ -131,7 +130,7 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { if (node->Op()->Type() == "recv") { VLOG(1) << "set recv op do_not_run to true"; - 
node->Op()->SetAttr("do_not_run", 1); + node->Op()->SetAttr("do_not_run", true); node->Op()->Flush(); } else if (node->Name() == "lookup_table" || node->Name() == "nce" || node->Name() == "hierarchical_sigmoid") { diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 62fba440ed4..017e3ef234c 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -25,20 +25,16 @@ namespace framework { namespace ir { void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, - const std::string& op_type, const std::string& quant_type, - const std::string& dequant_type) { + const std::string& op_type, + const std::string& quant_type) { const std::string pattern_name = "quant_dequant_fuse"; - int kNumFields = 5; + // FusePassBase::Init(pattern_name, graph); + const int kNumFields = 5; const int kQuantizedWeightOffset = 0; const int kQuantizedOpOffset = 1; const int kQuantizedOpOutOffset = 2; const int kDequantOpOffset = 3; const int kDequantOpOutOffset = 4; - const int kDequantOpWeightScaleOffset = 5; - - if (dequant_type == "fake_channel_wise_dequantize_max_abs") { - kNumFields += 1; - } GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() @@ -46,14 +42,22 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, ->assert_is_op_input(quant_type, "X") ->AsInput(); - std::string quantized_op_type = op_type; + std::string quantized_op_type = ""; std::string weight_name = ""; - if (op_type == "conv2d" || op_type == "depthwise_conv2d" || - op_type == "conv2d_fusion") { + if (op_type == "conv2d") { + quantized_op_type = "conv2d"; + weight_name = "Filter"; + } else if (op_type == "depthwise_conv2d") { + quantized_op_type = "depthwise_conv2d"; + weight_name = "Filter"; + } else if (op_type == "conv2d_fusion") { + quantized_op_type = "conv2d_fusion"; weight_name = "Filter"; } else if (op_type == "mul") { + quantized_op_type = "mul"; weight_name = "Y"; } else if (op_type == "fc") { + quantized_op_type = "fc"; weight_name = "W"; } else { PADDLE_ENFORCE( @@ -62,7 +66,7 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, } patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name); - pattern(x, quantized_op_type, weight_name, times, quant_type, dequant_type); + pattern(x, quantized_op_type, weight_name, times, quant_type); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -87,10 +91,6 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, subgraph.at(pattern.GetPDNode("dequant_op" + std::to_string(i)))); nodes.push_back( subgraph.at(pattern.GetPDNode("dequant_op_out" + std::to_string(i)))); - if (dequant_type == "fake_channel_wise_dequantize_max_abs") { - nodes.push_back(subgraph.at( - pattern.GetPDNode("dequant_channel_scale" + std::to_string(i)))); - } } int bit_length = boost::get(quant_op->Op()->GetAttr("bit_length")); @@ -107,31 +107,10 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, std::unordered_set delete_nodes; for (int i = 0; i < times; i++) { - std::vector weight_scale; - - // Get weight scale from dequant op. 
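// ---------------------------------------------------------------------------
// Illustrative sketch (hypothetical helper; `range` is assumed to be derived
// from bit_length as in quant-aware training): for bit_length b the quantized
// range is (1 << (b - 1)) - 1 (127 for int8), and the pass recovers the
// weight scale from the dequant op's max_range attribute as
// (range * range) / max_range, matching the expression below.
static float RecoverWeightScaleSketch(int bit_length, float max_range) {
  const float range = static_cast<float>((1 << (bit_length - 1)) - 1);
  return range * range / max_range;
}
// ---------------------------------------------------------------------------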
- if (dequant_type == "fake_channel_wise_dequantize_max_abs") { - auto scales_name = - nodes[i * kNumFields + kDequantOpOffset]->Op()->Input("Scales"); - PADDLE_ENFORCE(scales_name.size() == 2); - const LoDTensor& channel_scale_tensor = - scope->FindVar(scales_name[0])->Get(); - PADDLE_ENFORCE( - paddle::platform::is_cpu_place(channel_scale_tensor.place())); - const float* channel_scale_data = channel_scale_tensor.data(); - for (int i = 0; i < channel_scale_tensor.numel(); i++) { - weight_scale.push_back(channel_scale_data[i]); - } - delete_nodes.insert( - nodes[i * kNumFields + kDequantOpWeightScaleOffset]); - } else { - float max_range = boost::get( - nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr( - "max_range")); - weight_scale.push_back((range * range) / max_range); - } + float max_range = boost::get( + nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr("max_range")); + float weight_scale = (range * range) / max_range; - // create new op_desc auto base_op_desc = *nodes[i * kNumFields + kQuantizedOpOffset]->Op()->Proto(); std::string new_input = input_node->Name(); @@ -162,7 +141,6 @@ void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, IR_NODE_LINK_TO(input_node, new_op); IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset], new_op); IR_NODE_LINK_TO(new_op, nodes[i * kNumFields + kDequantOpOutOffset]); - delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOffset]); delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOutOffset]); delete_nodes.insert(nodes[i * kNumFields + kDequantOpOffset]); @@ -182,19 +160,16 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "quant_dequant_fuse"; FusePassBase::Init(pattern_name, graph); - std::unordered_set dequant_types = { - "fake_dequantize_max_abs", "fake_channel_wise_dequantize_max_abs"}; std::unordered_set quant_types = { "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"}; + std::unordered_set quantized_op_types = {"conv2d", "mul", "depthwise_conv2d"}; auto* scope = param_scope(); - for (auto& dequant_type : dequant_types) { - for (auto& quant_type : quant_types) { - for (auto& op_type : quantized_op_types) { - for (int i = 6; i >= 1; i--) { - RunQuantDequant(graph, scope, i, op_type, quant_type, dequant_type); - } + for (auto& quant_type : quant_types) { + for (auto& op_type : quantized_op_types) { + for (int i = 6; i >= 1; i--) { + RunQuantDequant(graph, scope, i, op_type, quant_type); } } } diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc index 556d28a42ae..3fd368741fb 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -43,11 +43,11 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) { op_desc.SetAttr("contextStart", seqconv->Op()->GetAttr("contextStart")); op_desc.SetAttr("contextStride", seqconv->Op()->GetAttr("contextStride")); PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); - auto& scope = graph->Get(kParamScopeAttr); + auto* scope = graph->Get(kParamScopeAttr); const std::string ColMat = patterns::UniqueKey("SeqConvColMat"); op_desc.SetOutput("ColMat", {ColMat}); op_desc.SetOutput("Out", {relu_out->Name()}); - scope.Var(ColMat)->GetMutable(); + scope->Var(ColMat)->GetMutable(); auto* op = graph->CreateOpNode(&op_desc); IR_NODE_LINK_TO(input, op); diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc 
index 9883a194056..2b4683f9e77 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -53,8 +53,32 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { } std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { - os << "\tlod: " << t.lod() << "\n"; - os << static_cast(t) << "\n"; + if (!platform::is_cpu_place(t.place())) { + LoDTensor cpu_tensor; + cpu_tensor.set_lod(t.lod()); + framework::TensorCopy(t, platform::CPUPlace(), &cpu_tensor); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(t.place()); + dev_ctx.Wait(); + + os << cpu_tensor; + return os; + } + + os << "dim: " << t.dims() << "\n"; + os << "lod: " << t.lod() << "\n"; + + // only print first ten elements + int64_t size = t.numel() < 10 ? t.numel() : 10; + for (int64_t i = 0; i < size; ++i) { + if (t.type() == proto::VarType::FP32) { + os << t.data()[i] << " "; + } else if (t.type() == proto::VarType::INT64) { + os << t.data()[i] << " "; + } else { + PADDLE_THROW("LoDTensor data type not in [float, int64_t]"); + } + } return os; } diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index d1554113bc3..15928c18d38 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -28,14 +28,12 @@ namespace framework { TEST(LoD, PrintLoDTensor) { LoDTensor tensor1; - tensor1.Resize({2}); tensor1.mutable_data(platform::CPUPlace()); tensor1.data()[0] = 0.2; tensor1.data()[1] = 0.5; LOG(INFO) << tensor1; LoDTensor tensor2; - tensor2.Resize({2}); tensor2.mutable_data(platform::CPUPlace()); tensor2.data()[0] = 1; tensor2.data()[1] = 2; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 1ea93b7638a..9c955103ba7 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include // NOLINT #include #include +#include #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index dedaf243647..f06f67dcc3d 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -33,7 +33,7 @@ class OpDesc { OpDesc(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs); - OpDesc(const proto::OpDesc &desc, BlockDesc *block); + explicit OpDesc(const proto::OpDesc &desc, BlockDesc *block = nullptr); explicit OpDesc(BlockDesc *block) : block_(block) {} @@ -42,6 +42,7 @@ class OpDesc { void CopyFrom(const OpDesc &op_desc); proto::OpDesc *Proto(); + const proto::OpDesc &ReadonlyProto() const { return desc_; } std::string Type() const { return desc_.type(); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8d4623468b9..dab35bae4d5 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -39,6 +39,10 @@ DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op"); namespace paddle { namespace framework { +OpDuppy op_duppy; +Scope scope_duppy; +RuntimeContext runtime_context_duppy({}, {}); + std::vector> kKernelPriority = { std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), @@ -884,6 +888,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // result of HasAttr. if (!enable_cache_runtime_context && HasAttr(kEnableCacheRuntimeContext)) enable_cache_runtime_context = true; + if (!enable_cache_expected_kernel && HasAttr(kEnableCacheExpectedKernel)) + enable_cache_expected_kernel = true; if (!all_kernels_must_compute_runtime_shape && HasAttr(kAllKernelsMustComputeRuntimeShape)) all_kernels_must_compute_runtime_shape = true; @@ -892,12 +898,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, RunImpl(scope, place, &ctx); } else { const Scope* cur_scope = &scope; - if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) { - std::lock_guard lock(cache_update_mutex_); - if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) { - runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); - pre_scope_ = cur_scope; - } + if (!runtime_ctx_ || pre_scope_ != cur_scope) { + runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); + pre_scope_ = cur_scope; } RunImpl(scope, place, runtime_ctx_.get()); } @@ -909,7 +912,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); - if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { + if (!enable_cache_expected_kernel || !kernel_type_) { ChooseKernel(*runtime_ctx, scope, place); } @@ -997,11 +1000,8 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, KernelTypeToString(expected_kernel_key)); } - std::lock_guard lock(cache_update_mutex_); - if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { - kernel_type_.reset(new OpKernelType(expected_kernel_key)); - kernel_func_.reset(new OpKernelFunc(kernel_iter->second)); - } + kernel_type_.reset(new OpKernelType(expected_kernel_key)); + kernel_func_.reset(new OpKernelFunc(kernel_iter->second)); } void OperatorWithKernel::TransferInplaceVarsBack( @@ -1027,6 +1027,7 @@ Scope* OperatorWithKernel::PrepareData( std::vector* 
transfered_inplace_vars, RuntimeContext* ctx) const { Scope* new_scope = nullptr; + if (!need_prepare_data_) return new_scope; std::unordered_set<std::string> no_buffer_ins; if (info_) { @@ -1119,6 +1120,10 @@ Scope* OperatorWithKernel::PrepareData( SetTensorToVariable(*var, out, trans_var); } } + // If new_scope == nullptr, it means that no input of this Op required + // TransformData. Thus, PrepareData can be skipped in the remaining iterations + // of this Op's execution to save the elapsed time. + if (!new_scope) need_prepare_data_ = false; return new_scope; } @@ -1142,7 +1147,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get<SelectedRows>().value()); } if (t != nullptr) { - PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu) is not initialized", + PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu)is not initialized", input.first, i); proto::VarType::Type tmp = t->type(); PADDLE_ENFORCE( diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 8e158e93063..8f301c6ebce 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include -#include <mutex> // NOLINT #include #include #include @@ -71,6 +70,12 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; /// this Op's execution to save the elapsed time. constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@"; +/// If an Op has the attribute kEnableCacheExpectedKernel, it means that in the +/// same name scope and same place, since the expected kernel of this Op does not +/// change during execution, it can be recorded only at the first iteration of +/// this Op's execution to save the elapsed time. +constexpr char kEnableCacheExpectedKernel[] = "@ENABLE_CACHE_EXPECTED_KERNEL@"; + /// If an Op has this attribute, all its kernels should calculate output /// variable's shape in the corresponding Compute() function.
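For orientation, a sketch of how an op definition could opt into the two caching attributes above. OpDesc::SetAttr is the setter used elsewhere in this tree; MarkCacheable and the idea of flipping both flags together are only illustrative.

    // Sketch: flag an op so RunImpl() reuses its RuntimeContext and its
    // chosen kernel after the first iteration (see the checks added above).
    #include "paddle/fluid/framework/op_desc.h"
    #include "paddle/fluid/framework/operator.h"

    void MarkCacheable(paddle::framework::OpDesc* op) {  // hypothetical helper
      op->SetAttr(paddle::framework::kEnableCacheRuntimeContext, true);
      op->SetAttr(paddle::framework::kEnableCacheExpectedKernel, true);
    }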
And /// OperatorWithKernel::RunImpl() would skip call this Op's InferShape() @@ -227,6 +232,18 @@ using OpKernelConfigsMap = std::unordered_map, OpKernelType::Hash>; +class OpDuppy : public OperatorBase { + public: + OpDuppy() : OperatorBase("duppy", {}, {}, {}) {} + + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +extern OpDuppy op_duppy; +extern Scope scope_duppy; +extern RuntimeContext runtime_context_duppy; + class ExecutionContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, @@ -239,6 +256,13 @@ class ExecutionContext { ctx_(ctx), kernel_configs_(configs) {} + explicit ExecutionContext(const platform::DeviceContext& device_context) + : op_(op_duppy), + scope_(scope_duppy), + device_context_(device_context), + ctx_(runtime_context_duppy), + kernel_configs_(nullptr) {} + const OperatorBase& op() const { return op_; } const Scope& scope() const { return scope_; } @@ -366,6 +390,9 @@ class ExecutionContext { auto shared_allocation = std::shared_ptr( allocation_ptr, deleter); + PADDLE_ENFORCE( + dynamic_cast(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); PADDLE_ENFORCE_GE(allocation_ptr->size(), framework::product(dim) * sizeof(T)); @@ -377,12 +404,12 @@ class ExecutionContext { } template - T& GetKernelConfig(size_t idx) const { + T& GetKernelConfig(int idx) const { PADDLE_ENFORCE( kernel_configs_ && kernel_configs_->size() > static_cast(idx), - "%s selected kernel doesn't have kernel config %lu <= %lu", + "%s selected kernel doesn't have kernel config %lu <= %d", op_.Type().c_str(), kernel_configs_->size(), idx); - return *boost::get>((*kernel_configs_)[idx]); + return *boost::get>(kernel_configs_->at(idx)); } private: @@ -499,9 +526,10 @@ class OperatorWithKernel : public OperatorBase { mutable std::unique_ptr kernel_func_; mutable std::unique_ptr runtime_ctx_; mutable const Scope* pre_scope_ = nullptr; + mutable bool need_prepare_data_ = true; mutable bool enable_cache_runtime_context = false; + mutable bool enable_cache_expected_kernel = false; mutable bool all_kernels_must_compute_runtime_shape = false; - mutable std::mutex cache_update_mutex_; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h index a350b8957d9..c520c222350 100644 --- a/paddle/fluid/framework/operator_kernel_configs.h +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -103,7 +103,7 @@ TAlgorithm AlgorithmsCache::GetAlgorithm( ++search_times_; return algo; } - TAlgorithm algo{}; + TAlgorithm algo; int64_t min = static_cast(INT_MAX); for (const auto& m : hash_) { if (m.first < min) { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8853ee3bd18..f400e8a5cc0 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -94,113 +94,6 @@ class ParallelExecutorPrivate { } } -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) { - VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_ - << ", num_trainers:" << bst.num_trainers_ - << ", trainer_id:" << bst.trainer_id_; - - if (bst.use_hierarchical_allreduce_) { - VLOG(1) << ", use_hierarchical_allreduce:" - << bst.use_hierarchical_allreduce_ << ", inter_trainers_num:" - << bst.hierarchical_allreduce_inter_nranks_ - << ", exter_trainers_num:" - << 
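Stepping back to the ExecutionContext change above: the duppy globals exist so that a context can be fabricated from a DeviceContext alone. A minimal sketch under that declaration; RunWithBareContext is an invented name and the place is assumed to come from the caller.

    // Sketch: build an ExecutionContext without a real op/scope/RuntimeContext;
    // op(), scope() and the runtime context resolve to the duppy globals above.
    #include "paddle/fluid/framework/operator.h"
    #include "paddle/fluid/platform/device_context.h"

    void RunWithBareContext(const paddle::platform::Place& place) {
      auto& pool = paddle::platform::DeviceContextPool::Instance();
      paddle::framework::ExecutionContext ctx(*pool.Get(place));
      // e.g. hand ctx to helpers that only need ctx.device_context()
    }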
bst.hierarchical_allreduce_exter_nranks_; - } - - std::vector flat_nccl_ids; - if (nranks_ == 1) { - // FIXME(gongwb): need not to create ncclid when nranks==1 - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); - return; - } - - if (bst.enable_parallel_graph_) { - VLOG(1) << "use only one ncclid in pg model"; - - ncclUniqueId *nccl_id = nullptr; - - std::string var_name = platform::GetFlatNCCLVarName(0); - auto nccl_id_var = scope->FindVar(var_name); - if (nccl_id_var) { - nccl_id = nccl_id_var->GetMutable(); - } else { - nccl_id = new ncclUniqueId(); - PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id)); - } - - flat_nccl_ids.push_back(nccl_id); - - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); - VLOG(1) << "init bst nccl context complete!"; - return; - } - - // num_trainers ==1 && places > 1 - if (bst.num_trainers_ == 1) { - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); - return; - } - - for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { - std::string var_name = platform::GetFlatNCCLVarName(i); - auto nccl_id_var = scope->FindVar(var_name); - PADDLE_ENFORCE(nccl_id_var, "can't find %s nccl_id_var", var_name); - auto nccl_id = nccl_id_var->GetMutable(); - flat_nccl_ids.push_back(nccl_id); - } - - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); - - if (bst.use_hierarchical_allreduce_) { - std::vector inter_nccl_ids; - for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { - std::string var_name = platform::GetHierarchicalInterNCCLVarName(i); - auto nccl_id_var = scope->FindVar(var_name); - PADDLE_ENFORCE(nccl_id_var, "can't find %s nccl_id_var", var_name); - auto inter_nccl_id = nccl_id_var->GetMutable(); - inter_nccl_ids.push_back(inter_nccl_id); - } - - std::vector exter_nccl_ids; - for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { - std::string var_name = platform::GetHierarchicalExterNCCLVarName(i); - auto nccl_id_var = scope->FindVar(var_name); - PADDLE_ENFORCE(nccl_id_var, "can't find %s nccl_id_var", var_name); - auto nccl_id = nccl_id_var->GetMutable(); - exter_nccl_ids.push_back(nccl_id); - } - - nccl_ctxs_->InitHierarchicalCtxs( - places_, inter_nccl_ids, exter_nccl_ids, bst.num_trainers_, - bst.trainer_id_, bst.hierarchical_allreduce_inter_nranks_, - bst.hierarchical_allreduce_exter_nranks_); - } - } - - void InitOrGetNCCLCommunicator(framework::Scope *scope, - const BuildStrategy &bst) { - const std::string var_name = "NCCLCommunicator"; - auto var = scope->FindVar(var_name); - if (var != nullptr) { - PADDLE_ENFORCE(var->IsInitialized(), - "if %s exists, it must be initialized", var_name); - VLOG(1) << "find " << var_name - << " in scope, so use it and does not recreate!"; - nccl_ctxs_ = var->GetMutable(); - return; - } - - VLOG(1) << "not find " << var_name << " in scope, so recreate it!"; - nccl_ctxs_ = scope->Var(var_name)->GetMutable(); - InitNCCLCtxs(scope, bst); - } -#endif - BuildStrategy build_strategy_; std::vector places_; std::vector local_scopes_; @@ -208,7 +101,7 @@ class ParallelExecutorPrivate { std::unique_ptr executor_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - platform::NCCLCommunicator *nccl_ctxs_{nullptr}; + std::unique_ptr nccl_ctxs_; #endif bool own_local_scope_; bool use_cuda_; @@ -314,23 +207,12 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; 
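The hierarchical-allreduce machinery deleted here gives way to a single flat map restored further down in this file. Condensed, the surviving initialization has roughly this shape (a sketch of the code below, not new behavior):

    // Flat NCCL setup: one NCCLContextMap over all local places, optionally
    // seeded with a broadcast ncclUniqueId for multi-trainer jobs.
    ncclUniqueId* nccl_id = nullptr;
    auto* nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
    if (nccl_id_var != nullptr) {
      nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
    }
    member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
        member_->places_, nccl_id, build_strategy.num_trainers_,
        build_strategy.trainer_id_));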
member_->nranks_ = build_strategy.num_trainers_ * places.size(); -#if defined(PADDLE_WITH_CUDA) && defined(_WIN32) - if (member_->use_cuda_) { - PADDLE_ENFORCE(places.size() == 1, "Windows can support Single GPU only."); - } -#endif if (!member_->use_all_reduce_) { PADDLE_ENFORCE(places.size() > 1, "If you set build_strategy.reduce with 'Reduce'," "the number of places must be greater than 1."); } - LOG(WARNING) << string::Sprintf( - "The number of %s, which is used in ParallelExecutor, is %lu. And " - "the Program will be copied %lu copies", - (member_->use_cuda_ ? "CUDAPlace" : "CPUPlace"), places.size(), - places.size()); - // Step 1. Bcast the bcast_vars to devs. // Create local scopes if (local_scopes.empty()) { @@ -369,9 +251,27 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, "Execution which can get better performance," << "you can force it off by env FLAGS_enable_parallel_graph=0"; - if (member_->use_cuda_ && member_->nranks_ > 1) { + if (member_->use_cuda_) { +// Bcast Parameters to all GPUs #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - member_->InitOrGetNCCLCommunicator(scope, build_strategy); + ncclUniqueId *nccl_id = nullptr; + // gen_nccl_id operator can broadcast the ncclUniqueId for nccl2 collective + // distributed training + auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); + if (nccl_id_var != nullptr) { + nccl_id = nccl_id_var->GetMutable(); + } + if (build_strategy.enable_parallel_graph_ && member_->nranks_ > 1UL) { + if (nccl_id == nullptr) { + local_nccl_id_.reset(new ncclUniqueId()); + platform::dynload::ncclGetUniqueId(local_nccl_id_.get()); + nccl_id = local_nccl_id_.get(); + } + } + + member_->nccl_ctxs_.reset(new platform::NCCLContextMap( + member_->places_, nccl_id, build_strategy.num_trainers_, + build_strategy.trainer_id_)); // Initialize device context's nccl comm, will be used by normal // Operators like sync_batch_norm, and collective ops. @@ -380,16 +280,25 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, // NOTE: NCCL group-calls and non-group-calls can not use the same // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use // same communicators. - auto *nccl_ctxs = - member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); + std::unique_ptr dev_nccl_ctxs; + if (nccl_id == nullptr) { + dev_nccl_ctxs.reset(new platform::NCCLContextMap(member_->places_)); + } for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast( pool.Get(member_->places_[dev_id])); - auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_nccl_comm(nccl_ctx.comm()); + if (nccl_id != nullptr) { + auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } else { + auto &nccl_ctx = dev_nccl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } } +#else + PADDLE_THROW("Not compiled with CUDA"); #endif } // broadcast parameters from the 0th device to others: @@ -404,11 +313,10 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } return false; }; - // Bcast Parameters to all GPUs + if (need_broadcast()) { BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_); } - // Startup Program has been run. All local scopes has correct parameters. // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert @@ -419,18 +327,18 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, VLOG(3) << "use local async mode"; graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name, {member_->local_scopes_[0]}, 1, - member_->use_cuda_, member_->nccl_ctxs_); + member_->use_cuda_, member_->nccl_ctxs_.get()); for (size_t i = 1; i < member_->places_.size(); ++i) { graphs[i] = build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name, {member_->local_scopes_[i]}, 1, - member_->use_cuda_, member_->nccl_ctxs_); + member_->use_cuda_, member_->nccl_ctxs_.get()); async_graphs[i] = graphs[i]; } } else { graph = build_strategy.Apply(graph, member_->places_, loss_var_name, member_->local_scopes_, member_->nranks_, - member_->use_cuda_, member_->nccl_ctxs_); + member_->use_cuda_, member_->nccl_ctxs_.get()); } #else if (build_strategy.async_mode_) { @@ -563,15 +471,16 @@ void ParallelExecutor::BCastParamsToDevices( PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), "variables' buffer size to bcast NOT equal to places"); { - auto *nccl_ctxs = member_->nccl_ctxs_->DefaultFlatCtx(); platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { - auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]); + auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]); platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, nccl_ctx.comm_, nccl_ctx.stream()); } - nccl_ctxs->WaitAll(); + member_->nccl_ctxs_->WaitAll(); } +#else + PADDLE_THROW("Not compiled with CUDA"); #endif } else { platform::CPUPlace cpu; @@ -603,7 +512,6 @@ void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { - VLOG(3) << "enter ParallelExecutor Run"; #ifdef WITH_GPERFTOOLS if (gProfileStarted) { ProfilerFlush(); @@ -612,11 +520,8 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, platform::RecordBlock b(0); if (member_->HasGarbageCollectors()) { - platform::RecordEvent event("PrepareGarbageCollectors"); member_->ResetRuntimeReferenceCount(fetch_tensors, fetched_var_name); } - - VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; @@ -641,21 +546,11 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( const std::unordered_map &tensors) { for (auto pair : tensors) { auto lod_tensors = pair.second.SplitLoDTensor(member_->places_); - if (member_->places_.size() != lod_tensors.size()) { - bool is_cpu_place = platform::is_cpu_place(member_->places_.front()); - auto error_info = string::Sprintf( - "The number(%d) of samples of " - "current batch is less than the count(%d) of " - "devices(%s), currently, it is not allowed. ", - lod_tensors.size(), member_->places_.size(), - (is_cpu_place ? "CPU" : "GPU")); - if (is_cpu_place) { - error_info += - "You should set the environment variable CPU_NUM in the system " - "to determine the number of devices you need."; - } - PADDLE_THROW(error_info); - } + PADDLE_ENFORCE_EQ( + member_->places_.size(), lod_tensors.size(), + "The number of samples of current batch is less than the count of " + "devices, currently, it is not allowed. (%d vs %d)", + member_->places_.size(), lod_tensors.size()); for (size_t j = 0; j < member_->places_.size(); ++j) { // TODO(panxy0718): Do I need to delete this var? 
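Caller-visible effect of the tightened feed check above: a batch must split into exactly one piece per place. A hedged sketch; the Feed wrapper and the feed name "x" are invented.

    // Sketch: with N places, SplitLoDTensor must yield N pieces, otherwise
    // FeedAndSplitTensorIntoLocalScopes now fails via PADDLE_ENFORCE_EQ.
    #include <string>
    #include <unordered_map>
    #include "paddle/fluid/framework/parallel_executor.h"

    void Feed(paddle::framework::ParallelExecutor* pe,
              const paddle::framework::LoDTensor& batch) {
      std::unordered_map<std::string, paddle::framework::LoDTensor> feed;
      feed["x"] = batch;  // "x" is an illustrative feed name
      // With 4 places, a batch of 8 splits evenly and passes; a batch of 3
      // would split into 3 pieces and trigger the enforce above.
      pe->FeedAndSplitTensorIntoLocalScopes(feed);
    }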
auto t = @@ -676,9 +571,7 @@ ParallelExecutor::~ParallelExecutor() { bool ParallelExecutor::EnableParallelGraphExecution( const ir::Graph &graph, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const { - if (!FLAGS_enable_parallel_graph) { - return false; - } + if (!FLAGS_enable_parallel_graph) return false; bool enable_parallel_graph = true; @@ -698,19 +591,11 @@ bool ParallelExecutor::EnableParallelGraphExecution( } } - if (!member_->use_all_reduce_ || !member_->use_cuda_) { + if (!member_->use_all_reduce_ || !member_->use_cuda_) + if (build_strategy.enable_sequential_execution_ || - exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) { + exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) enable_parallel_graph = false; - } - } - -#ifdef WIN32 - VLOG(1) << "Windows has no support to parallel graph, enable_parallel_graph " - "would be forced to false."; - enable_parallel_graph = false; -#endif - return enable_parallel_graph; } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 6943fe62b91..2de6b7f73d2 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -87,6 +87,10 @@ class ParallelExecutor { ParallelExecutorPrivate *member_; std::vector> async_graphs_; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + std::unique_ptr local_nccl_id_; +#endif }; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 20d7f98e936..c48c7872ec2 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -83,34 +83,28 @@ void PullDenseWorker::Stop() { } } -void PullDenseWorker::PullDense(bool force_update) { - pull_dense_status_.resize(0); - for (size_t i = 0; - i < dwp_param_.program_config(0).pull_dense_table_id_size(); ++i) { - uint64_t tid = static_cast( - dwp_param_.program_config(0).pull_dense_table_id(i)); - if (force_update || CheckUpdateParam(tid)) { - fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid], - &pull_dense_status_); - ResetThreadVersion(tid); - } - } - if (pull_dense_status_.size() != 0) { - Wait(&pull_dense_status_); - } -} - int PullDenseWorker::Start() { running_ = true; - // before training, we can pull dense from pserver first. 
- PullDense(true); t_ = std::thread(&PullDenseWorker::Run, this); return 0; } void PullDenseWorker::Run() { while (running_) { - PullDense(false); + pull_dense_status_.resize(0); + for (size_t i = 0; + i < dwp_param_.program_config(0).pull_dense_table_id_size(); ++i) { + uint64_t tid = static_cast( + dwp_param_.program_config(0).pull_dense_table_id(i)); + if (CheckUpdateParam(tid)) { + fleet_ptr_->PullDenseVarsAsync( + *root_scope_, tid, dense_value_names_[tid], &pull_dense_status_); + ResetThreadVersion(tid); + } + } + if (pull_dense_status_.size() != 0) { + Wait(&pull_dense_status_); + } #ifndef _WIN32 usleep(sleep_time_ms_ * 1000); #endif diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 565b7d9d16c..ea7f8c496a9 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -35,6 +35,7 @@ size_t Tensor::memory_size() const { } void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type, + memory::Allocator::Attr attr, size_t requested_size) { type_ = type; PADDLE_ENFORCE_GE(numel(), 0, @@ -49,17 +50,18 @@ void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type, /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + offset_) { - holder_ = memory::AllocShared(place, size); + holder_ = memory::AllocShared(place, size, attr); offset_ = 0; } return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } -void* Tensor::mutable_data(platform::Place place, size_t requested_size) { +void* Tensor::mutable_data(platform::Place place, memory::Allocator::Attr attr, + size_t requested_size) { PADDLE_ENFORCE(this->holder_ != nullptr, "Cannot invoke mutable data if current hold nothing."); - return mutable_data(place, type_, requested_size); + return mutable_data(place, type_, attr, requested_size); } Tensor& Tensor::ShareDataWith(const Tensor& src) { diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 1ab75e33257..f83a1aa49d5 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -80,6 +80,8 @@ class Tensor { template const T* data() const; + const void* raw_data() const { return holder_->ptr(); } + inline bool IsInitialized() const; /** @@ -87,12 +89,17 @@ class Tensor { * @note If not exist, then allocation. */ template - T* mutable_data(platform::Place place, size_t requested_size = 0); + T* mutable_data(platform::Place place, + memory::Allocator::Attr attr = memory::Allocator::kDefault, + size_t requested_size = 0); void* mutable_data(platform::Place place, proto::VarType::Type type, + memory::Allocator::Attr attr = memory::Allocator::kDefault, size_t requested_size = 0); - void* mutable_data(platform::Place place, size_t requested_size = 0); + void* mutable_data(platform::Place place, + memory::Allocator::Attr attr = memory::Allocator::kDefault, + size_t requested_size = 0); /** * @brief Return a pointer to mutable memory block. @@ -104,7 +111,9 @@ class Tensor { * @note If not exist, then allocation. */ template - T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0); + T* mutable_data(DDim dims, platform::Place place, + memory::Allocator::Attr attr = memory::Allocator::kDefault, + size_t requested_size = 0); /*! Return the dimensions of the memory block. 
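Caller side of the widened mutable_data overloads above: the allocator attribute now slots in between place and requested_size, defaulting to kDefault. A minimal sketch; AllocFp32 is an invented name.

    // Sketch: allocate tensor storage with an explicit allocator attribute.
    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/platform/place.h"

    void AllocFp32(paddle::framework::Tensor* t) {
      t->Resize({2, 3});
      float* p = t->mutable_data<float>(paddle::platform::CPUPlace(),
                                        paddle::memory::Allocator::kDefault);
      p[0] = 1.0f;  // storage is live; dtype is recorded as FP32
    }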
*/ const DDim& dims() const; diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index a4b1457ad56..8dabecac8ab 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -49,17 +49,20 @@ inline T* Tensor::data() { template inline T* Tensor::mutable_data(DDim dims, platform::Place place, + memory::Allocator::Attr attr, size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); Resize(dims); - return mutable_data(place, requested_size); + return mutable_data(place, attr, requested_size); } template -inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) { +inline T* Tensor::mutable_data(platform::Place place, + memory::Allocator::Attr attr, + size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); return reinterpret_cast( - mutable_data(place, DataTypeTrait::DataType, requested_size)); + mutable_data(place, DataTypeTrait::DataType, attr, requested_size)); } inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 33ef3b91866..a7f09df4917 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -491,51 +491,5 @@ void TensorFromStream(std::istream& is, Tensor* tensor, } } -template -std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) { - auto inspect = tensor.data(); - auto element_num = tensor.numel(); - - os << "\tdata: ["; - if (element_num > 0) { - os << inspect[0]; - for (int j = 1; j < element_num; ++j) { - os << " " << inspect[j]; - } - } - os << "]"; - return os; -} - -std::ostream& operator<<(std::ostream& os, const Tensor& t) { - os << "\tdim: " << t.dims() << "\n"; - os << "\tlayout: " << DataLayoutToString(t.layout()) << "\n"; - - Tensor tensor; - tensor.Resize(t.dims()); - if (platform::is_cpu_place(t.place())) { - tensor.ShareDataWith(t); - } else { - platform::CPUPlace place; - framework::TensorCopy(t, place, &tensor); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& dev_ctx = *pool.Get(t.place()); - dev_ctx.Wait(); - } - -#define PrintTensorCallback(cpp_type, proto_type) \ - do { \ - if (tensor.type() == proto_type) { \ - os << "\tdtype: " << proto_type << "\n"; \ - print_tensor(os, tensor); \ - return os; \ - } \ - } while (0) - - _ForEachDataType_(PrintTensorCallback); - VLOG(1) << "PrintVar: unrecognized data type:" << t.type(); - return os; -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index e382f920399..1ffd357e62b 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -151,7 +151,5 @@ void TensorToVector(const Tensor& src, std::vector* dst) { memory::Copy(dst_place, dst_ptr, boost::get(src.place()), src_ptr, size); } - -std::ostream& operator<<(std::ostream& os, const Tensor& t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index b491725974c..b29736cfbbe 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -91,58 +91,5 @@ class DistMultiTrainer : public MultiTrainer { std::shared_ptr pull_dense_worker_; }; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -class PipelineTrainer : public TrainerBase { - public: - PipelineTrainer() {} - ~PipelineTrainer() override {} - void 
Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override; - void InitTrainerEnv(const ProgramDesc& main_program, - const platform::Place& place) override; - void InitOtherEnv(const ProgramDesc& main_program) override {} - void Run() override; - void Finalize() override; - - protected: - int section_num_; - int pipeline_num_; - int scope_queue_size_; - int sync_steps_; - - SectionWorkerParameter pipeline_config_; - - // The in/output var names for each section - std::vector>> in_var_names_; - std::vector>> out_var_names_; - - // Counter for the running thread - std::vector> worker_count_; - std::vector>> worker_count_mutex_; - - // worker: [section_id][pipeline_id][thread_id] - std::vector>>> - workers_; - std::vector section_threads_; - - // We use scope to maintain context info, and scopes - // will be deliverd between different sections. - std::vector>> scope_queues_; - std::vector pipeline_scopes_; - - // The parameters that should be syncronized between different cards using - // nccl all-reduce - std::shared_ptr> param_need_sync_; - std::vector> sync_functors_; - std::shared_ptr nccl_ctx_map_; - - std::vector> readers_; - - void InitFirstScopeQueue(ScopeQueue* scope_queue, int pipeline_id, - const ProgramDesc& main_program); - void CopyParameters(const Scope& root_scope, int pipeline_id); - void construct_sync_functor(); -}; -#endif } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 4910fb740c5..4fc05ccf5c9 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -13,9 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto2"; -option optimize_for = LITE_RUNTIME; import "data_feed.proto"; -import "framework.proto"; package paddle.framework; message TrainerDesc { @@ -32,13 +30,11 @@ message TrainerDesc { repeated string filelist = 5; optional bool debug = 6 [ default = false ]; optional FetchConfig fetch_config = 7; - optional bool use_cvm = 8 [ default = false ]; // device worker parameters optional HogwildWorkerParameter hogwild_param = 101; optional DownpourWorkerParameter downpour_param = 103; optional PullDenseWorkerParameter pull_dense_param = 102; - optional SectionWorkerParameter section_param = 104; // datafeed desc optional DataFeedDesc data_desc = 201; } @@ -54,30 +50,6 @@ message DownpourWorkerParameter { optional bool push_dense = 6 [ default = true ]; } -message SectionWorkerParameter { - repeated SectionConfig section_config = 1; - optional int32 queue_size = 2 [ default = 1 ]; - optional int64 sync_steps = 3 [ default = 1 ]; - optional int32 start_cpu_core_id = 4 [ default = 1 ]; - repeated string param_need_sync = 5; -} - -message SectionConfig { - enum Place { - CPUPlace = 0; - CUDAPlace = 1; - CUDAPinnedPlace = 2; - } - - // FIXME: How to use proto::ProgramDesc - // required string program_desc_str = 1; - optional proto.ProgramDesc program_desc = 1; - optional Place place = 2; - optional int32 concurrency = 3 [ default = 1 ]; - repeated string section_in_var_names = 4; - repeated string section_out_var_names = 5; -} - message FetchConfig { enum Method { PRINT = 0; } repeated string fetch_var_names = 1; diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index ce0eb5ec30c..6b4461c0c42 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ 
-63,8 +63,5 @@ std::shared_ptr TrainerFactory::CreateTrainer( REGISTER_TRAINER_CLASS(MultiTrainer); REGISTER_TRAINER_CLASS(DistMultiTrainer); -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -REGISTER_TRAINER_CLASS(PipelineTrainer); -#endif } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 7cc2b3b4225..a02b26e9c1f 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/framework/var_type_traits.h" -#include #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" @@ -23,12 +22,12 @@ #ifdef PADDLE_WITH_CUDA #ifndef _WIN32 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/platform/nccl_helper.h" #endif #include #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h" #endif +#include namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 7147f06233c..fa77b96a7bd 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -36,7 +36,6 @@ namespace platform { #ifdef PADDLE_WITH_CUDA #ifndef _WIN32 class Communicator; -class NCCLCommunicator; #endif #endif } // namespace platform @@ -141,7 +140,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< std::map, operators::reader::LoDTensorBlockingQueueHolder, #ifdef PADDLE_WITH_CUDA #ifndef _WIN32 - ncclUniqueId, platform::Communicator, platform::NCCLCommunicator, + ncclUniqueId, platform::Communicator, #endif operators::CudnnRNNCache, #endif diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 67dbfd740ed..a47275e1ca2 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -26,7 +26,6 @@ #ifdef PADDLE_WITH_CUDA #ifndef _WIN32 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" -#include "paddle/fluid/platform/nccl_helper.h" #endif #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h" diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 73c629fd227..e52a0283f72 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,10 +1,9 @@ -cc_library(imperative_flag SRCS flags.cc DEPS gflags) - if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler imperative_flag) -cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind profiler) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(engine SRCS engine.cc) cc_library(imperative_profiler SRCS profiler.cc) cc_library(nccl_context SRCS nccl_context.cc DEPS device_context) + cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) endif() diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index fb22d334902..aa739a8972e 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -14,7 +14,6 @@ #include "paddle/fluid/imperative/layer.h" -#include #include #include #include @@ -28,32 
+27,15 @@ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" namespace paddle { namespace imperative { -void ThreadSafeNameSet::Insert(const std::string& name) { - std::lock_guard guard(mtx_); - set_.insert(name); -} - -void ThreadSafeNameSet::Remove(const std::string& name) { - std::lock_guard guard(mtx_); - auto iter = set_.find(name); - PADDLE_ENFORCE(iter != set_.end(), "%s does not exist", name); - set_.erase(iter); -} - -std::vector ThreadSafeNameSet::Names() const { - std::lock_guard guard(mtx_); - return std::vector(set_.begin(), set_.end()); -} - -ThreadSafeNameSet VarBase::name_set_; +const char* PyLayer::kFwdInp = "X"; +const char* PyLayer::kFwdOut = "Out"; -std::vector VarBase::AliveVarNames() { return name_set_.Names(); } +std::map py_funcs_; using framework::Variable; @@ -99,106 +81,62 @@ class TensorAddToFunctor : public boost::static_visitor<> { } // namespace detail -void AddTo(std::shared_ptr src, std::shared_ptr dst, - platform::Place place, GradientRef* grad_ref) { - PADDLE_ENFORCE(grad_ref->find(dst.get()) != grad_ref->end(), - "gradient %s are not found in grad_ref", dst->Name()); - if ((*grad_ref)[dst.get()].second) { - PADDLE_ENFORCE(src->IsInitialize(), "Using uninitialized VarBase"); - dst->var_ = std::move(src->var_); - (*grad_ref)[dst.get()].second = false; - if (!dst->IsInitialize()) { - dst->SetInitialize(true); - } - return; - } else { - framework::Tensor* dst_tensor = - dst->var_->GetMutable(); - framework::Tensor* src_tensor = - src->var_->GetMutable(); - - // FIXME(minqiyang): loss_grad op will pass a zero grad of label - // ugly fix for it - if (src_tensor->numel() == 0) { - return; - } - - PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), - "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(), - src_tensor->numel()); +void AddTo(Variable* src, Variable* dst, platform::Place place) { + framework::Tensor* dst_tensor = dst->GetMutable(); + framework::Tensor* src_tensor = src->GetMutable(); - detail::TensorAddToFunctor func( - src_tensor->numel(), src_tensor->data(), - dst_tensor->mutable_data(place)); - boost::apply_visitor(func, place); + // FIXME(minqiyang): loss_grad op will pass a zero grad of label + // ugly fix for it + if (src_tensor->numel() == 0) { + return; } -} -void ZeroGrads(const std::shared_ptr vb, - const platform::Place& place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - auto grad_t = vb->var_->GetMutable(); - operators::math::set_constant(*dev_ctx, grad_t, 0.0); -} + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), + "dst_numel %lld vs. 
src_numel %lld", dst_tensor->numel(), + src_tensor->numel()); -void AddGradBySort(BackwardSumMap* bck_map, - std::shared_ptr target, - GradientRef* grad_ref) { - PADDLE_ENFORCE(bck_map->find(target.get()) != bck_map->end(), - "Can't find %s in backward grad map", target->Name()); - std::pair>>>& - current = bck_map->at(target.get()); - std::sort(current.second.begin(), current.second.end(), - [](const std::pair>& a, - const std::pair>& b) { - return a.first > b.first; - }); - for (auto& var_pair : current.second) { - VLOG(10) << "add origin_grad: " << target->Name(); - VLOG(10) << "added grad: " << var_pair.second->Name() - << " trace id is: " << var_pair.first; - AddTo(var_pair.second, target, current.first, grad_ref); - var_pair.second.reset(); - } + detail::TensorAddToFunctor func( + src_tensor->numel(), src_tensor->data(), + dst_tensor->mutable_data(place)); + boost::apply_visitor(func, place); } class Autograd { public: Autograd() {} - void RunBackward(VarBase* var, const detail::BackwardStrategy& bck_stratedy) { + void RunBackward(VarBase* var) { if (var->IsStopGradient()) { return; } - VLOG(2) << "start autograd"; - BackwardSumMap bck_map; + VLOG(3) << "start autograd"; + std::deque ready; ready.push_back(var->PreOp()); - std::map dep_counts = - ComputeDepCounts(var->PreOp(), bck_stratedy, &grad_ref); + std::map dep_counts = ComputeDepCounts(var->PreOp()); while (!ready.empty()) { OpBase* ready_op = ready.front(); ready.pop_front(); - std::vector grads_outputs = - ready_op->ApplyGrad(&bck_map, &grad_ref, bck_stratedy); - - for (const auto& map : grads_outputs) { - for (auto it = map.rbegin(); it != map.rend(); ++it) { - const std::vector>& grad_outs = it->second; - for (size_t i = 0; i < grad_outs.size(); ++i) { - if (!grad_outs[i] || grad_outs[i]->IsStopGradient()) continue; - OpBase* pre_op = grad_outs[i]->PreOp(); - if (!pre_op) continue; - dep_counts[pre_op] -= 1; - PADDLE_ENFORCE(dep_counts[pre_op] >= 0); - bool pre_op_ready = dep_counts[pre_op] == 0; - if (pre_op_ready) { - ready.push_back(pre_op); - } + std::map> input_grads = + ready_op->ApplyGrad(); + + for (auto it = input_grads.rbegin(); it != input_grads.rend(); ++it) { + const std::vector& ingrads = it->second; + for (size_t i = 0; i < ingrads.size(); ++i) { + if (!ingrads[i]) continue; + if (ready_op->input_vars_[it->first][i]->IsStopGradient()) { + continue; + } + OpBase* pre_op = ready_op->pre_ops_[it->first][i]; + if (!pre_op) continue; + + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + if (pre_op_ready) { + ready.push_back(pre_op); } } } @@ -208,14 +146,7 @@ class Autograd { } private: - std::map ComputeDepCounts( - OpBase* op, const detail::BackwardStrategy& bck_stratedy, - GradientRef* grad_ref) { - if (bck_stratedy.sorted_sum_gradient_) { - PADDLE_ENFORCE_NOT_NULL(grad_ref, - "grad_ref should not be null when " - "using sorted grad backward strategy"); - } + std::map ComputeDepCounts(OpBase* op) { std::map ret; std::deque queue; @@ -225,21 +156,10 @@ class Autograd { while (!queue.empty()) { OpBase* candidate = queue.front(); queue.pop_front(); - for (const auto& map : candidate->grad_output_vars_) { - for (const auto& it : map) { - for (const auto& vb : it.second) { - if (bck_stratedy.sorted_sum_gradient_) { - ++(*grad_ref)[vb.get()].first; - } - // init the state of the grad_ - (*grad_ref)[vb.get()].second = true; - } - } - } for (auto it : candidate->pre_ops_) { for (OpBase* pre_op : it.second) { if (!pre_op) continue; - VLOG(2) << "op dep 
" << candidate->Type() << " trace id " + VLOG(5) << "op dep " << candidate->Type() << " trace id " << candidate->trace_id_ << " <---- " << it.first << " <---- " << pre_op->Type() << " trace id " << pre_op->trace_id_; if (visited.find(pre_op) == visited.end()) { @@ -252,8 +172,6 @@ class Autograd { } return ret; } - - GradientRef grad_ref; }; std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, @@ -269,14 +187,16 @@ std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, new_var->var_->GetMutable(); tensor->set_lod(var_->Get().lod()); - const auto& src_tensor = var_->Get(); - framework::TensorCopy(src_tensor, dst_place, tensor); if (blocking) { - platform::DeviceContextPool::Instance().Get(dst_place)->Wait(); - auto src_place = src_tensor.place(); - if (!(src_place == dst_place)) { - platform::DeviceContextPool::Instance().Get(src_place)->Wait(); - } + platform::DeviceContext* dev_ctx = + platform::DeviceContextPool::Instance().Get(dst_place); + + framework::TensorCopySync(var_->Get(), dst_place, + tensor); + + dev_ctx->Wait(); + } else { + framework::TensorCopy(var_->Get(), dst_place, tensor); } if (platform::is_gpu_place(dst_place)) { @@ -293,94 +213,98 @@ framework::LoDTensor& VarBase::GradValue() { return *(grads_->var_->GetMutable()); } -std::vector OpBase::ApplyGrad( - BackwardSumMap* bck_map, GradientRef* grad_ref, - const detail::BackwardStrategy& bck_stratedy) { - PADDLE_ENFORCE(!grad_op_descs_.empty(), "%s has no backward implementation", - Type()); +std::map> OpBase::ApplyGrad() { + PADDLE_ENFORCE(!grad_op_descs_.empty() || backward_id_ > 0, + "%s has no backward implementation", Type()); + VLOG(3) << "apply op grad: " << Type(); std::vector tmp_grad_outputs; - const size_t grad_op_count = grad_op_descs_.size(); + if (backward_id_ > 0) { + VLOG(3) << "py_layer_grad"; + tmp_grad_outputs.resize(1); + tmp_grad_outputs[0][framework::GradVarName(PyLayer::kFwdOut)] = + PyLayer::ApplyGrad( + backward_id_, + grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]); + } else { + const size_t grad_op_count = grad_op_descs_.size(); + + tmp_grad_outputs.resize(grad_op_count); + for (size_t k = 0; k < grad_op_count; ++k) { + framework::OpDesc* grad_op_desc = grad_op_descs_[k]; + auto& grad_output_variable_map = grad_output_vars_[k]; + + VLOG(3) << "apply grad op " << grad_op_desc->Type(); + + // Allocate tmp grad output variable + for (const auto& it : grad_output_variable_map) { + auto& outputs = tmp_grad_outputs[k][it.first]; + outputs.reserve(it.second.size()); + for (size_t i = 0; i < it.second.size(); ++i) { + VarBase* origin_grad_var_base = it.second[i]; + + // Allocate a new variable + VarBase* tmp_grad_var_base = new VarBase( + string::Sprintf("%s@IGrad", origin_grad_var_base->Name()), + origin_grad_var_base->DataType(), origin_grad_var_base->Dims(), + place_, true, false); + outputs.emplace_back(tmp_grad_var_base); + } + } - tmp_grad_outputs.resize(grad_op_count); - for (size_t k = 0; k < grad_op_count; ++k) { - framework::OpDesc* grad_op_desc = grad_op_descs_[k]; - platform::RecordEvent record_event(grad_op_desc->Type()); - auto& grad_output_variable_map = grad_output_vars_[k]; - VLOG(3) << "apply grad op " << grad_op_desc->Type(); + // No need to do compile time infer shape here. 
+ // grad_op_desc_->InferShape(*block_); + // grad_op_desc->InferVarType(block_); - // Allocate tmp grad output variable - for (const auto& it : grad_output_variable_map) { - auto& outputs = tmp_grad_outputs[k][it.first]; - outputs.reserve(it.second.size()); - for (const std::shared_ptr& origin_grad_var_base : - it.second) { - // Allocate a new variable - std::shared_ptr tmp_grad_var_base(new VarBase( - string::Sprintf("%s@IGrad", origin_grad_var_base->Name()), - origin_grad_var_base->DataType(), origin_grad_var_base->Dims(), - place_, true, false)); - outputs.emplace_back(std::move(tmp_grad_var_base)); + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc); + + auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type()); + if (info.infer_var_type_) { + RuntimeInferVarTypeContext infer_var_type_ctx( + &grad_input_vars_[k], &tmp_grad_outputs[k], &attrs_); + info.infer_var_type_(&infer_var_type_ctx); } - } - // No need to do compile time infer shape here. - // grad_op_desc_->InferShape(*block_); - // grad_op_desc->InferVarType(block_); + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc); + // Run grad op + framework::VariableValueMap grad_invars_map; + framework::VariableValueMap grad_outvars_map; - auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type()); - if (info.infer_var_type_) { - RuntimeInferVarTypeContext infer_var_type_ctx( - &grad_input_vars_[k], &tmp_grad_outputs[k], &(opbase->Attrs())); - info.infer_var_type_(&infer_var_type_ctx); - } + for (const auto& it : grad_input_vars_[k]) { + auto& grad_invars = grad_invars_map[it.first]; + grad_invars.reserve(it.second.size()); + for (const VarBase* grad_inp : it.second) { + PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr", + grad_op_desc->Type(), grad_inp->Name()); - framework::OperatorWithKernel* op_kernel = - dynamic_cast(opbase.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - - // Run grad op - framework::VariableValueMap grad_invars_map; - framework::VariableValueMap grad_outvars_map; - - for (const auto& it : grad_input_vars_[k]) { - auto& grad_invars = grad_invars_map[it.first]; - grad_invars.reserve(it.second.size()); - for (const std::shared_ptr& grad_inp : it.second) { - PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr", - grad_op_desc->Type(), grad_inp->Name()); - if (!grad_inp->IsInitialize()) { - grad_inp->InitBuffer(); - ZeroGrads(grad_inp, place_); + grad_invars.emplace_back(grad_inp->var_); } - const std::shared_ptr& const_grad_inp = grad_inp; - grad_invars.emplace_back(const_grad_inp->var_.get()); } - } - for (const auto& it : tmp_grad_outputs[k]) { - auto& grad_outvars = grad_outvars_map[it.first]; - grad_outvars.reserve(it.second.size()); - for (const std::shared_ptr& grad_out : it.second) { - PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr", - grad_op_desc->Type(), grad_out->Name()); + for (const auto& it : tmp_grad_outputs[k]) { + auto& grad_outvars = grad_outvars_map[it.first]; + grad_outvars.reserve(it.second.size()); + for (VarBase* grad_out : it.second) { + PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr", + grad_op_desc->Type(), grad_out->Name()); - grad_outvars.emplace_back(grad_out->var_.get()); + grad_outvars.emplace_back(grad_out->var_); + } } - } - framework::RuntimeContext ctx(grad_invars_map, 
grad_outvars_map); - framework::Scope scope; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); - p.op.RuntimeInferShape(scope, place_, ctx); - p.func( - framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx, nullptr)); + framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map); + framework::Scope scope; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); + p.op.RuntimeInferShape(scope, place_, ctx); + p.func( + framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx, nullptr)); + } } - platform::RecordEvent record_event("merge_grads"); // Add tmp grad outputs to original grad vars for (size_t k = 0; k < grad_output_vars_.size(); ++k) { for (const auto& it : grad_output_vars_[k]) { @@ -389,50 +313,18 @@ std::vector OpBase::ApplyGrad( PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); for (size_t i = 0; i < outputs.size(); ++i) { - // track outputs used by sum - if (bck_stratedy.sorted_sum_gradient_) { - if (bck_map->find(origin_outputs[i].get()) != bck_map->end()) { - VLOG(10) << "add sub grad to " << origin_outputs[i]->Name(); - bck_map->at(origin_outputs[i].get()) - .second.emplace_back( - std::pair>( - this->trace_id_, std::move(outputs[i]))); - } else { - VLOG(10) << "insert new map for " << origin_outputs[i]->Name(); - std::pair>>> - tmp(place_, - {std::make_pair(this->trace_id_, std::move(outputs[i]))}); - bck_map->insert(std::make_pair(origin_outputs[i].get(), tmp)); - } - - PADDLE_ENFORCE( - grad_ref->find(origin_outputs[i].get()) != grad_ref->end(), - "Can't find %s in grad_reference count map", - origin_outputs[i]->Name()); - PADDLE_ENFORCE(grad_ref->at(origin_outputs[i].get()).first >= 1, - "Backward error when calculate grad reference"); - if (grad_ref->at(origin_outputs[i].get()).first > 1) { - VLOG(10) << "remove ref for " << origin_outputs[i]->Name(); - grad_ref->at(origin_outputs[i].get()).first--; - } else { - VLOG(10) << "Add grad for: " << origin_outputs[i]->Name(); - AddGradBySort(bck_map, origin_outputs[i], grad_ref); - grad_ref->at(origin_outputs[i].get()).first--; - } - } else { - VLOG(10) << "AddTo Called with orig_grad is: " - << origin_outputs[i]->name_ << " Grad to be added is " - << outputs[i]->name_; - AddTo(outputs[i], origin_outputs[i], place_, grad_ref); - outputs[i].reset(); - } + framework::Variable* grad = outputs[i]->var_; + framework::Variable* orig_grad = origin_outputs[i]->var_; + VLOG(3) << "AddTo Called with orig_grad is: " + << origin_outputs[i]->name_ << " Grad to be added is " + << outputs[i]->name_; + AddTo(grad, orig_grad, place_); + delete grad; } } } - return grad_output_vars_; + return input_vars_; } void OpBase::InvokeBackwardHooks() { @@ -444,25 +336,94 @@ void OpBase::InvokeBackwardHooks() { } } -void OpBase::RegisterBackwardHooks(const py::object& callable) { +void OpBase::RegisterBackwardHooks(const py::object& callable, bool front) { VLOG(3) << "Register backward hooks " << trace_id_; // TODO(minqiyang): check the callable format - backward_hooks_.push_back(callable); + if (front) { + backward_hooks_.insert(backward_hooks_.begin(), callable); + } else { + backward_hooks_.push_back(callable); + } } -void VarBase::RunBackward(const detail::BackwardStrategy& bck_stratedy) { +void VarBase::RunBackward() { if (!pre_op_) return; - platform::RecordEvent record_event("Imperative Backward"); + VLOG(3) << "start backward"; - grads_->InitBuffer(); auto grads_t = grads_->var_->GetMutable(); operators::math::set_constant( *(platform::DeviceContextPool::Instance().Get( var_->GetMutable()->place())), 
grads_t, 1.0); - Autograd().RunBackward(this, bck_stratedy); + PADDLE_ENFORCE( + grads_ == + pre_op_->output_vars_[pre_op_out_name_][pre_op_out_idx_]->grads_); + Autograd().RunBackward(this); +} + +void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { + py_funcs_[func_id] = py_func; +} + +int PyLayer::NumFuncs() { return py_funcs_.size(); } + +std::vector PyLayer::Apply( + int func_id, const std::vector& inputs) { + PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); + return CallPythonFunc(py_funcs_[func_id], inputs); +} + +std::vector PyLayer::ApplyGrad(int func_id, + const std::vector& inputs) { + PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); + auto rets = CallPythonFunc(py_funcs_[func_id], inputs); + + std::vector outs; + outs.reserve(rets.size()); + for (size_t i = 0U; i != rets.size(); ++i) { + outs.emplace_back(new VarBase( + string::Sprintf("%s_out_%d", framework::GradVarName(PyLayer::kFwdOut), + i), + rets[i], nullptr, true)); + } + + return outs; +} + +std::vector PyLayer::CallPythonFunc( + const py::object& callable, const std::vector& ins) { + py::gil_scoped_acquire guard; + py::tuple in_args(ins.size()); + for (size_t i = 0; i < ins.size(); ++i) { + const framework::LoDTensor& t = ins[i]->var_->Get(); + in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr); + } + VLOG(3) << "pyfunc in " << py::len(in_args); + + // TODO(panyx0718): Who owns the returned LoDTensor. + auto ret = callable(in_args); + auto ret_tuple = py::cast(ret); + size_t ret_num = py::len(ret_tuple); + std::vector outs; + outs.reserve(ret_num); + VLOG(3) << "pyfunc out " << ret_num; + for (size_t i = 0; i < ret_num; ++i) { + try { + auto* py_out_tensor = py::cast(ret_tuple[i]); + PADDLE_ENFORCE_NOT_NULL(py_out_tensor, + "Output tensor %d should not be nullptr", i); + auto* var = new framework::Variable(); + auto* tensor = var->GetMutable(); + tensor->ShareDataWith(*py_out_tensor); + tensor->set_lod(py_out_tensor->lod()); + outs.emplace_back(var); + } catch (py::cast_error&) { + PADDLE_THROW("The %d-th output must be LoDTensor", i); + } + } + return outs; } } // namespace imperative diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 2fbedd82ea5..37488d381ef 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -14,20 +14,16 @@ #pragma once -#include -#include // NOLINT -#include // NOLINT -#include // NOLINT -#include -#include // NOLINT -#include // NOLINT -#include -#include // NOLINT - // clang-format off #include "paddle/fluid/framework/python_headers.h" // clang-format on +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" @@ -35,9 +31,8 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/imperative/backward_strategy.h" + #include "paddle/fluid/imperative/type_defs.h" -#include "paddle/fluid/imperative/flags.h" namespace paddle { namespace imperative { @@ -112,19 +107,6 @@ class PreparedOp { class OpBase; -class ThreadSafeNameSet { - public: - void Insert(const std::string& name); - - void Remove(const std::string& name); - - std::vector Names() const; - - private: - std::multiset set_; - mutable std::mutex mtx_; -}; - /* The wrapper for Variable which holds a Variable and a VarBase of its * 
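The PyLayer surface restored above is small; from the C++ side its lifecycle is roughly the following sketch. UsePyLayer is invented, func_id 0 is arbitrary, and the return type follows the declarations in this revision.

    // Sketch: store a Python callable under an integer id, then apply it to
    // traced inputs; outputs are fresh Variables wrapping the LoDTensors the
    // callable returned (see CallPythonFunc above).
    void UsePyLayer(const py::object& py_func,
                    const std::vector<paddle::imperative::VarBase*>& inputs) {
      paddle::imperative::PyLayer::RegisterFunc(/*func_id=*/0, py_func);
      auto outs = paddle::imperative::PyLayer::Apply(/*func_id=*/0, inputs);
      (void)outs;  // each element wraps a LoDTensor produced by py_func
    }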
gradient. This object should be managed totally by Python intepreter. * @@ -132,17 +114,13 @@ class ThreadSafeNameSet { */ class VarBase { public: - static std::vector AliveVarNames(); - // Internal interface, create VarBase from exist variable - VarBase(const std::string& name, std::unique_ptr var, - VarBase* grad, bool stop_gradient) + VarBase(const std::string& name, framework::Variable* var, VarBase* grad, + bool stop_gradient) : VarBase(name, var->Get().type(), var->Get().dims(), - var->Get().place(), nullptr, grad, - stop_gradient, false, true) { - var_ = std::move(var); - } + var->Get().place(), var, grad, + stop_gradient, false) {} // Python interface VarBase(const std::string& name, const framework::proto::VarType::Type dtype, @@ -156,69 +134,52 @@ class VarBase { const framework::DDim& shape, const platform::Place& place, bool stop_gradient, bool persistable) : VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient, - persistable, true) {} - - // Grad used constructor - VarBase(const std::string& name, const framework::proto::VarType::Type dtype, - const std::vector& shape, const platform::Place& place, - bool stop_gradient, bool persistable, bool need_initialize) - : VarBase(name, dtype, framework::make_ddim(shape), place, nullptr, - nullptr, stop_gradient, persistable, need_initialize) {} + persistable) {} private: // TODO(minqiyang): need support SelectedRows VarBase(const std::string& name, framework::proto::VarType::Type dtype, const framework::DDim& shape, const platform::Place& place, - std::unique_ptr var, VarBase* grad, - bool stop_gradient, bool persistable, bool need_initialize) + framework::Variable* var, VarBase* grad, bool stop_gradient, + bool persistable) : name_(name), type_(framework::proto::VarType::LOD_TENSOR), - place_(place), - var_(std::move(var)), + var_(var), grads_(grad), - dtype_(dtype), stop_gradient_(stop_gradient), persistable_(persistable), pre_op_(nullptr), pre_op_out_name_(), pre_op_out_idx_(-1) { if (!var_) { - var_.reset(new framework::Variable()); + var_ = new framework::Variable(); } - auto tensor = var_->GetMutable(); tensor->Resize(shape); - if (need_initialize) { - tensor->mutable_data(place, dtype); - is_initialized_ = true; - VLOG(8) << "initialized varbase: " << name_ << " type: " << dtype - << " place: " << place; - } else { - is_initialized_ = false; - VLOG(8) << "not initialized varbase: " << name_; - } - VLOG(8) << "create varbase: " << name_ << " type: " << dtype - << " place: " << place << "Stop gradient: " << stop_gradient_; - - if (IsDebugEnabled()) { - name_set_.Insert(name_); - } + tensor->mutable_data(place, dtype); + VLOG(10) << "create varbase: " << name_ << " type: " << dtype + << " place: " << place; } public: virtual ~VarBase() { + if (var_) { + delete var_; + var_ = nullptr; + } + + if (grads_) { + delete grads_; + grads_ = nullptr; + } + pre_op_ = nullptr; pre_op_out_idx_ = -1; - VLOG(8) << "destruct varbase: " << name_; - if (IsDebugEnabled()) { - name_set_.Remove(name_); - } } inline void SetName(const std::string& name) { name_ = name; } inline std::string Name() const { return name_; } - inline bool IsInitialize() const { return is_initialized_; } - inline void SetInitialize(bool inited) { is_initialized_ = inited; } + inline std::vector Shape() const { if (var_->IsInitialized()) { return framework::vectorize(var_->Get().dims()); @@ -236,7 +197,10 @@ class VarBase { auto tensor = var_->GetMutable(); tensor->mutable_data(tensor->place(), type); } - inline framework::proto::VarType::Type DataType() const { 
return dtype_; } + inline framework::proto::VarType::Type DataType() const { + auto tensor = var_->Get(); + return tensor.type(); + } // tensor type. e.g.. LoDTensor inline void SetType(framework::proto::VarType::Type type) { type_ = type; } @@ -244,20 +208,16 @@ class VarBase { inline void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; - if (grads_) { - grads_->stop_gradient_ = stop_gradient; - } } inline bool IsStopGradient() const { return stop_gradient_; } inline void SetPersistable(bool persistable) { persistable_ = persistable; } inline bool IsPersistable() const { return persistable_; } - inline void SetPreOp(OpBase* op) { pre_op_ = op; } - inline platform::Place GetPlace() { return place_; } + inline OpBase* PreOp() const { return pre_op_; } inline int PreOpOutIdx() const { return pre_op_out_idx_; } - void RunBackward(const detail::BackwardStrategy& bck_stratedy); + void RunBackward(); inline void ResetPreOp(OpBase* op) { if (op == pre_op_) { @@ -267,17 +227,6 @@ class VarBase { } } - void InitBuffer() { - if (!is_initialized_) { - var_->GetMutable()->mutable_data(place_, dtype_); - is_initialized_ = true; - VLOG(8) << "initialized varbase: " << name_ << " type: " << dtype_ - << " place: " << place_; - } else { - VLOG(8) << "var: " << name_ << " has already been initialized "; - } - } - void TrackPreOp(OpBase* pre_op, const std::string& pre_op_out_name, int pre_op_out_idx, bool pre_op_stop_gradient) { pre_op_ = pre_op; @@ -312,20 +261,16 @@ class VarBase { framework::proto::VarType::Type type_; platform::Place place_; - std::unique_ptr var_; - std::shared_ptr grads_; + framework::Variable* var_; + VarBase* grads_; private: - framework::proto::VarType::Type dtype_; bool stop_gradient_; bool persistable_; - bool is_initialized_; + OpBase* pre_op_; std::string pre_op_out_name_; int pre_op_out_idx_; - - // A private flag to check memory leak - static ThreadSafeNameSet name_set_; }; /* The wrapper for OpDesc which holds a OpDesc and a OpDesc of its @@ -336,27 +281,28 @@ class PYBIND11_HIDDEN OpBase { OpBase(const std::string& type) : type_(type), trace_id_(-1), + forward_id_(-1), + backward_id_(-1), place_(platform::CPUPlace()), backward_hooks_() {} virtual ~OpBase() { - for (const auto& it : outputs_ref) { - auto vb = it.lock(); - if (vb) { - VLOG(3) << "Op reset by" << vb->name_; - vb->ResetPreOp(this); + // TODO(minqiyang): remove op_desc from block_desc in tracer + // + // reset all output vars' pre op + for (auto iter : output_vars_) { + for (VarBase* var : iter.second) { + var->ResetPreOp(this); } } - // TODO(minqiyang): remove op_desc from block_desc in tracer + // release resource for (framework::OpDesc* desc : grad_op_descs_) { delete desc; } } - std::vector ApplyGrad( - BackwardSumMap* bck_map, GradientRef* grad_ref, - const detail::BackwardStrategy& bck_stratedy); + std::map> ApplyGrad(); inline std::string Type() const { return type_; } inline std::string GradOpType(size_t index) const { @@ -364,17 +310,16 @@ class PYBIND11_HIDDEN OpBase { return grad_op_descs_[index]->Type(); } - void RegisterBackwardHooks(const py::object& callable); + void RegisterBackwardHooks(const py::object& callable, bool front = false); void InvokeBackwardHooks(); - void TrackPreOp( - const std::string& inp_name, - const std::vector>& inputs) { + void TrackPreOp(const std::string& inp_name, + const std::vector& inputs) { auto& pre_ops_list = pre_ops_[inp_name]; pre_ops_list.reserve(inputs.size()); auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name]; - for (std::shared_ptr 
inp_var : inputs) { + for (VarBase* inp_var : inputs) { if (inp_var->PreOp() && !inp_var->IsStopGradient()) { VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " << inp_name; @@ -390,17 +335,24 @@ class PYBIND11_HIDDEN OpBase { } std::string type_; + // One of `trace_id_` or `forward_id_` is set, not both. + // For pure python PyLayer, use `forward_id_`, otherwise, use trace_id_. int trace_id_; + int forward_id_; + // When has backward, one of `grad_op_descs_` or `backward_id_` is set, + // not both. // Note: each fwd op corresponds to a vector of bwd ops. std::vector grad_op_descs_; + int backward_id_; platform::Place place_; + VarBasePtrMap input_vars_; + VarBasePtrMap output_vars_; OpBasePtrMap pre_ops_; std::map> pre_ops_out_idx_; - VarBaseWeakPtrList outputs_ref; // Inputs to a vector of bwd ops. std::vector grad_input_vars_; // Outputs to a vector of bwd ops. @@ -415,13 +367,34 @@ class Layer { public: virtual ~Layer() {} - virtual std::vector> Forward( - const std::vector>& inputs) { - std::vector> vars; + virtual std::vector Forward(const std::vector& inputs) { + std::vector vars; return vars; } }; +class PyLayer { + public: + virtual ~PyLayer() {} + + static const char* kFwdInp; + static const char* kFwdOut; + + static void RegisterFunc(int func_id, const py::object& py_func); + + static int NumFuncs(); + + static std::vector Apply( + int func_id, const std::vector& inputs); + + static std::vector ApplyGrad(int func_id, + const std::vector& inputs); + + private: + static std::vector CallPythonFunc( + const py::object& callable, const std::vector& ins); +}; + // infer var type context for imperative mode class PYBIND11_HIDDEN RuntimeInferVarTypeContext : public framework::InferVarTypeContext { @@ -438,7 +411,7 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext var_set_() { input_names_.reserve(inputs_->size()); for (auto& it : *inputs_) { - for (std::shared_ptr var : it.second) { + for (imperative::VarBase* var : it.second) { input_names_[it.first].emplace_back(var->Name()); var_set_[var->Name()] = var; } @@ -446,7 +419,7 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext output_names_.reserve(outputs_->size()); for (auto& it : *outputs_) { - for (std::shared_ptr var : it.second) { + for (imperative::VarBase* var : it.second) { output_names_[it.first].emplace_back(var->Name()); var_set_[var->Name()] = var; } @@ -542,8 +515,7 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext const framework::AttributeMap* attrs_; std::unordered_map> input_names_; std::unordered_map> output_names_; - std::unordered_map> - var_set_; + std::unordered_map var_set_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index d9630bd66d5..f96c83936df 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -93,7 +93,6 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep, send(sock, buffer, NCCL_UNIQUE_ID_BYTES, 0); break; } - close(sock); } void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) { diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 682bea7d09b..7c495ddd682 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -18,13 +18,11 @@ #include #include #include -#include #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" 
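
The layer.h hunks above reduce the autograd bookkeeping to raw pointers: each VarBase keeps the OpBase that produced it (pre_op_) plus its output slot, and OpBase::TrackPreOp records, per input slot, which creator ops the backward pass must visit. A minimal sketch of that scheme, with stand-in types rather than Paddle's real classes:

#include <iostream>
#include <string>
#include <vector>

struct Op;

// Stand-in for VarBase: remembers the op that created it.
struct Var {
  std::string name;
  Op* pre_op = nullptr;     // creator op, analogous to pre_op_
  int pre_op_out_idx = -1;  // which output slot of that op
  bool stop_gradient = false;

  void TrackPreOp(Op* op, int out_idx, bool stop_grad) {
    pre_op = op;
    pre_op_out_idx = out_idx;
    stop_gradient = stop_grad;
  }
};

// Stand-in for OpBase: remembers which ops produced its inputs.
struct Op {
  std::string type;
  std::vector<Op*> pre_ops;  // analogous to pre_ops_

  void TrackPreOp(const std::vector<Var*>& inputs) {
    for (Var* in : inputs) {
      if (in->pre_op != nullptr && !in->stop_gradient) {
        pre_ops.push_back(in->pre_op);
      }
    }
  }
};

int main() {
  Op mul{"mul", {}};
  Var x{"x"}, y{"y"};
  y.TrackPreOp(&mul, /*out_idx=*/0, /*stop_grad=*/false);

  Op add{"add", {}};
  add.TrackPreOp({&y});  // backward from add reaches mul via y
  std::cout << add.pre_ops.front()->type << "\n";  // prints "mul"
}

Raw pointers keep the hot tracing path cheap, but ownership stays with the Python-side objects, which is why the OpBase destructor above has to reset its output vars' pre-op links by hand.
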
-#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace imperative { @@ -46,25 +44,25 @@ void CreateGradOp(const framework::OpDesc& op_desc, } } -void CreateNoBuffuerGrad(std::shared_ptr var, - platform::DeviceContext* dev_ctx) { +void InitGrad(VarBase* var, platform::DeviceContext* dev_ctx) { PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base"); PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device from forward op"); if (var->grads_ == nullptr) { auto& var_t = var->var_->Get(); - var->grads_ = std::shared_ptr( - new VarBase(var->GradName(), framework::proto::VarType::FP32, - framework::vectorize(var_t.dims()), dev_ctx->GetPlace(), - var->IsStopGradient(), false, false)); + var->grads_ = new VarBase(var->GradName(), framework::proto::VarType::FP32, + framework::vectorize(var_t.dims()), + dev_ctx->GetPlace(), true, false); + auto grad_t = var->grads_->var_->GetMutable(); + operators::math::set_constant(*dev_ctx, grad_t, 0.0); } } platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { platform::Place result = place; - for (const auto& it : inputs) { - for (const std::shared_ptr& var : it.second) { + for (auto it : inputs) { + for (VarBase* var : it.second) { platform::Place tmp_place = var->var_->Get().place(); if (!platform::is_same_place(tmp_place, result)) { @@ -98,7 +96,7 @@ framework::VariableNameMap CreateInputVarNameMap( auto var_vector = it->second; std::vector args; args.reserve(var_vector.size()); - for (std::shared_ptr var_base : var_vector) { + for (VarBase* var_base : var_vector) { args.emplace_back(var_base->Name()); } result[in.name()] = args; @@ -126,7 +124,7 @@ framework::VariableNameMap CreateOutputVarNameMap( auto var_vector = it->second; std::vector args; args.reserve(var_vector.size()); - for (const std::shared_ptr& var_base : var_vector) { + for (VarBase* var_base : var_vector) { args.emplace_back(var_base->Name()); } result[out.name()] = args; @@ -137,24 +135,25 @@ framework::VariableNameMap CreateOutputVarNameMap( Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} -void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, - VarBasePtrMap* outputs, framework::AttributeMap attrs_map, - const platform::Place expected_place, - const bool stop_gradient) { - platform::RecordEvent record_event(op->type_); +std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, + VarBasePtrMap* outputs, + framework::AttributeMap attrs_map, + const platform::Place expected_place, + const bool stop_gradient) { framework::VariableValueMap invars_map; framework::VariableValueMap outvars_map; // Construct input_vars_map and output_vars_map - std::map> current_vars_map; - for (auto it : inputs) { + std::map current_vars_map; + op->input_vars_ = inputs; + for (auto it : op->input_vars_) { auto& invars = invars_map[it.first]; invars.reserve(it.second.size()); - for (std::shared_ptr inp : it.second) { + for (VarBase* inp : it.second) { PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(), inp->Name()); - invars.emplace_back(inp->var_.get()); + invars.emplace_back(inp->var_); if (!stop_gradient) { current_vars_map[inp->Name()] = inp; } @@ -165,16 +164,14 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, op->TrackPreOp(it.first, it.second); } - for (const auto& it : *outputs) { + op->output_vars_ = *outputs; + for (auto it : op->output_vars_) { auto& outvars = outvars_map[it.first]; - const std::vector>& outputs_tmp = - it.second; - outvars.reserve(outputs_tmp.size()); - for 
(size_t i = 0U; i < outputs_tmp.size(); ++i) { - // Add weak_ptr to track outputs - op->outputs_ref.emplace_back(outputs_tmp[i]); - std::shared_ptr out = outputs_tmp[i]; - outvars.emplace_back(out->var_.get()); + const std::vector& outputs = it.second; + outvars.reserve(outputs.size()); + for (size_t i = 0U; i < outputs.size(); ++i) { + VarBase* out = outputs[i]; + outvars.emplace_back(out->var_); out->TrackPreOp(op, it.first, i, stop_gradient); if (!stop_gradient) { current_vars_map[out->Name()] = out; @@ -225,6 +222,8 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx, prepared_op.ctx, prepared_op.kernel_configs)); + // construct backward op + std::set vars_saved_for_backward; if (!stop_gradient) { VLOG(5) << "start construct backward op"; @@ -258,13 +257,13 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, // Forward inputs or outputs. grad_in_vars.emplace_back(fwd_var_it->second); } else { - std::shared_ptr var = - current_vars_map[var_it->second]; - CreateNoBuffuerGrad(var, prepared_op.GetDeviceContext()); + VarBase* var = current_vars_map[var_it->second]; + InitGrad(var, prepared_op.GetDeviceContext()); // Douts. - var->grads_->SetPreOp(var->PreOp()); grad_in_vars.emplace_back(var->grads_); } + + vars_saved_for_backward.insert(it.first); } } @@ -276,17 +275,70 @@ void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, "Could not found the grad op output var, should this " "operator %s's stop gradient be True", op->Type()); - - std::shared_ptr var = - current_vars_map[var_it->second]; - CreateNoBuffuerGrad(var, prepared_op.GetDeviceContext()); - var->grads_->SetPreOp(var->PreOp()); + VarBase* var = current_vars_map[var_it->second]; + InitGrad(var, prepared_op.GetDeviceContext()); grad_out_vars.push_back(var->grads_); VLOG(3) << "grads output var name: " << var->name_; } } } } + + return vars_saved_for_backward; } + +std::vector Tracer::PyTrace(OpBase* op, + const std::vector& inputs, + bool stop_gradient) { + VLOG(3) << "py_trace " << op->Type(); + + op->input_vars_[PyLayer::kFwdInp] = inputs; + + std::vector ret_vars = + PyLayer::Apply(op->forward_id_, inputs); + + op->TrackPreOp(PyLayer::kFwdInp, inputs); + + std::vector& outputs = op->output_vars_[PyLayer::kFwdOut]; + outputs.reserve(ret_vars.size()); + for (size_t i = 0U; i != ret_vars.size(); ++i) { + framework::Variable* v = ret_vars[i]; + VarBase* out = new VarBase(string::Sprintf("%s_out_%d", op->Type(), i), v, + nullptr, stop_gradient); + outputs.emplace_back(out); + out->TrackPreOp(op, PyLayer::kFwdOut, i, stop_gradient); + } + + if (!stop_gradient) { + VLOG(5) << "start construct backward op"; + op->grad_input_vars_.resize(1); + op->grad_output_vars_.resize(1); + auto& grad_input_vars = + op->grad_input_vars_[0][framework::GradVarName(PyLayer::kFwdInp)]; + auto& grad_output_vars = + op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)]; + + for (VarBase* inp : inputs) { + grad_input_vars.push_back(inp); + } + for (VarBase* out : outputs) { + grad_input_vars.push_back(out); + } + + // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now + platform::CPUPlace place; + for (VarBase* out : outputs) { + InitGrad(out, platform::DeviceContextPool::Instance().Get(place)); + grad_input_vars.push_back(out->grads_); + } + + for (VarBase* inp : inputs) { + InitGrad(inp, platform::DeviceContextPool::Instance().Get(place)); + grad_output_vars.push_back(inp->grads_); + } + } + return outputs; +} + } // namespace 
imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 02d90227410..a87f3b8009d 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -36,6 +36,9 @@ void CreateGradOp(const framework::OpDesc& op_desc, framework::OpDesc** grad_op_desc, std::unordered_map* grad_to_var); +void InitVar(const VarBase* var, framework::Variable* grad_var, + platform::DeviceContext* dev_ctx); + platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs); class Tracer { @@ -44,11 +47,14 @@ class Tracer { virtual ~Tracer() {} - void Trace(OpBase* op, const VarBasePtrMap& inputs, - VarBasePtrMap* outputs, // NOLINT - framework::AttributeMap attrs_map, - const platform::Place expected_place, - const bool stop_gradient = false); + std::set Trace(OpBase* op, const VarBasePtrMap& inputs, + VarBasePtrMap* outputs, // NOLINT + framework::AttributeMap attrs_map, + const platform::Place expected_place, + const bool stop_gradient = false); + + std::vector PyTrace(OpBase* op, const std::vector& inputs, + bool stop_gradient = false); private: platform::Place GetPlace(const VarBasePtrMap& inputs); diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h index fab8c2e6b91..c51ce931def 100644 --- a/paddle/fluid/imperative/type_defs.h +++ b/paddle/fluid/imperative/type_defs.h @@ -15,10 +15,7 @@ limitations under the License. */ #pragma once #include -#include #include -#include -#include #include namespace paddle { @@ -27,17 +24,9 @@ namespace imperative { class VarBase; class OpBase; -typedef std::map>> - VarBasePtrMap; -typedef std::vector> VarBaseWeakPtrList; +typedef std::map> VarBasePtrMap; +typedef std::map> ConstVarBasePtrMap; typedef std::map> OpBasePtrMap; -typedef std::unordered_map< - const VarBase*, - std::pair>>>> - BackwardSumMap; // var_grad -> {place, {id -> var_grad@rename}} -typedef std::unordered_map> GradientRef; -// var_grad -> {ref_times, is_first_to_be_accumulate} } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 44eaf90371d..5e0be5d445e 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -17,7 +17,7 @@ if (TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -if (ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) add_subdirectory(anakin) endif() @@ -43,15 +43,11 @@ if(WITH_MKLDNN) endif() set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor) -if (ANAKIN_FOUND) - set(ANAKIN_SHARED_INFERENCE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/api/api_anakin_engine.cc) -endif() set(SHARED_INFERENCE_SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc ${mkldnn_quantizer_src} - ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc - ${ANAKIN_SHARED_INFERENCE_SRCS}) + ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) if(WIN32) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt index 67194c9ff24..5d85525a652 100644 --- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt @@ -5,19 +5,19 @@ detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc affine_channel.cc roi_align.cc 
shuffle_channel.cc helper.cc DEPS anakin_engine framework_proto scope op_registry gtest)
-cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
-cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv)
-cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter)
-cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling)
-cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split)
-cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split)
-cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op)
-cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter)
-cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax)
-cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op)
-cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op)
-cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op)
-cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op)
-cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op)
-cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor)
-cc_test(test_anakin_affine_channel SRCS test_affine_channel_op.cc DEPS anakin_op_converter affine_channel_op)
+cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL)
+cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
+cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL)
+cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling SERIAL)
+cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split SERIAL)
+cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split SERIAL)
+cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op SERIAL)
+cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL)
+cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax SERIAL)
+cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op SERIAL)
+cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op SERIAL)
+cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op SERIAL)
+cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op SERIAL)
+cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op SERIAL)
+cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor SERIAL)
+cc_test(test_anakin_affine_channel SRCS test_affine_channel_op.cc DEPS anakin_op_converter affine_channel_op SERIAL)
diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc
index 26f78efa61c..70e0adf5ead 100644
---
a/paddle/fluid/inference/anakin/convert/conv2d.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d.cc @@ -70,8 +70,7 @@ void Conv2dOpConverter::operator()( if (enable_int8) { const float int8_range = 127.; float in_scale = boost::get(op_desc.GetAttr("input_scale")); - auto weight_scale = - boost::get>(op_desc.GetAttr("weight_scale")); + float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); PBlock *weight1 = new PBlock(anakin_shape, ::anakin::AK_INT8); this->engine_->RegistBlock(weight1); @@ -92,8 +91,8 @@ void Conv2dOpConverter::operator()( weight1->d_tensor().copy_from(weight1->h_tensor()); this->engine_->AddOpAttr(op_name, "weight_1", *weight1); this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); - this->engine_->Graph()->SetWeightsScale( - op_name, {weight_scale[0] / int8_range}, false); + this->engine_->Graph()->SetWeightsScale(op_name, + {weight_scale / int8_range}, false); this->engine_->AddTensorScale(input_name, in_scale / int8_range); } else { auto *weight1 = pblock_from_tensor( diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc index f2e6003aa68..a1568b8bdee 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc @@ -72,8 +72,7 @@ void Conv2dFusionOpConverter::operator()( if (enable_int8) { const float int8_range = 127.; float in_scale = boost::get(op_desc.GetAttr("input_scale")); - auto weight_scale = - boost::get>(op_desc.GetAttr("weight_scale")); + float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); PBlock *weight1 = new PBlock(anakin_shape, ::anakin::AK_INT8); this->engine_->RegistBlock(weight1); @@ -94,8 +93,8 @@ void Conv2dFusionOpConverter::operator()( weight1->d_tensor().copy_from(weight1->h_tensor()); this->engine_->AddOpAttr(op_name, "weight_1", *weight1); this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); - this->engine_->Graph()->SetWeightsScale( - op_name, {weight_scale[0] / int8_range}, false); + this->engine_->Graph()->SetWeightsScale(op_name, + {weight_scale / int8_range}, false); this->engine_->AddTensorScale(input_name, in_scale / int8_range); } else { auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); diff --git a/paddle/fluid/inference/anakin/convert/elementwise.cc b/paddle/fluid/inference/anakin/convert/elementwise.cc index d221f26e119..dd32baa0b90 100644 --- a/paddle/fluid/inference/anakin/convert/elementwise.cc +++ b/paddle/fluid/inference/anakin/convert/elementwise.cc @@ -60,7 +60,7 @@ void ElementwiseMulOpConverter::operator()( auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name}); - std::string elementwise_type = "Mul"; + std::string elementwise_type = "Prod"; this->engine_->template AddOpAttr(op_name, "type", elementwise_type); std::vector coeff = {1.0, 1.0}; diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc index b64d0b84fd4..0621e3377b3 100644 --- a/paddle/fluid/inference/anakin/convert/fc.cc +++ b/paddle/fluid/inference/anakin/convert/fc.cc @@ -76,8 +76,7 @@ void FcBaseOpConverter::operator()( ::anakin::saber::Shape anakin_shape(weight_shape); const float int8_range = 127.; float in_scale = boost::get(op_desc.GetAttr("input_scale")); - auto weight_scale = - boost::get>(op_desc.GetAttr("weight_scale")); + float weight_scale = boost::get(op_desc.GetAttr("weight_scale")); PBlock *weight1 = new 
PBlock(anakin_shape, ::anakin::AK_INT8); this->engine_->RegistBlock(weight1); @@ -96,8 +95,8 @@ void FcBaseOpConverter::operator()( weight1->d_tensor().copy_from(weight1->h_tensor()); this->engine_->AddOpAttr(op_name, "weight_1", *weight1); this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); - this->engine_->Graph()->SetWeightsScale( - op_name, {weight_scale[0] / int8_range}, false); + this->engine_->Graph()->SetWeightsScale(op_name, + {weight_scale / int8_range}, false); this->engine_->AddTensorScale(input_name, in_scale / int8_range); } else { auto *weight1 = pblock_from_vector(trans_weight_data, diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index 1058e744bca..a6ae51bd4b1 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -153,12 +153,11 @@ template class AnakinOpConverter<::anakin::saber::NV, ::anakin::Precision::FP32>; template class AnakinOpConverter<::anakin::saber::NV, ::anakin::Precision::INT8>; -#ifdef ANAKIN_X86_PLACE + template class AnakinOpConverter<::anakin::saber::X86, ::anakin::Precision::FP32>; template class AnakinOpConverter<::anakin::saber::X86, ::anakin::Precision::INT8>; -#endif } // namespace anakin } // namespace inference } // namespace paddle @@ -204,16 +203,16 @@ template class AnakinOpConverter<::anakin::saber::X86, CPU, ::anakin::saber::X86, precision_type__, \ ::anakin::Precision::precision_type__) -#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE) +#ifdef PADDLE_WITH_CUDA #define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8); \ REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8) -#elif defined(PADDLE_WITH_CUDA) -#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ - REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ - REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8) +#else +#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ + REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ + REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8) #endif #define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \ @@ -222,16 +221,12 @@ template class AnakinOpConverter<::anakin::saber::X86, __attribute__((unused)) = \ Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); -#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE) -#define USE_ANAKIN_CONVERTER(op_type__) \ - USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32) \ - USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32) -#define USE_INT8_ANAKIN_CONVERTER(op_type__) \ - USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8) \ - USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8) -#elif defined(PADDLE_WITH_CUDA) #define USE_ANAKIN_CONVERTER(op_type__) \ USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32) #define USE_INT8_ANAKIN_CONVERTER(op_type__) \ USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8) -#endif + +#define USE_CPU_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32) +#define USE_CPU_INT8_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8) diff --git a/paddle/fluid/inference/anakin/convert/test_activation_op.cc b/paddle/fluid/inference/anakin/convert/test_activation_op.cc index 
5ac8b45882f..4f898252d27 100644 --- a/paddle/fluid/inference/anakin/convert/test_activation_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc @@ -77,6 +77,32 @@ TEST(swish_op, gpu) { } #endif +/* +TEST(sigm_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_activation_op<::anakin::saber::X86>("sigmoid", ctx, false); +} + +TEST(tanh_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_activation_op<::anakin::saber::X86>("tanh", ctx, false); +} + +TEST(relu6_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_activation_op<::anakin::saber::X86>("relu6", ctx, false); +} + +TEST(swish_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_activation_op<::anakin::saber::X86>("swish", ctx, false); +} +*/ + } // namespace anakin } // namespace inference } // namespace paddle @@ -86,7 +112,13 @@ USE_OP(tanh); USE_OP(relu6); USE_OP(swish); +USE_CPU_ANAKIN_CONVERTER(sigmoid); +USE_CPU_ANAKIN_CONVERTER(tanh); +USE_CPU_ANAKIN_CONVERTER(relu6); +USE_CPU_ANAKIN_CONVERTER(swish); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(sigmoid); USE_ANAKIN_CONVERTER(tanh); USE_ANAKIN_CONVERTER(relu6); USE_ANAKIN_CONVERTER(swish); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc b/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc index 008537dc8a5..f6399387aa2 100644 --- a/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc @@ -57,16 +57,19 @@ TEST(affine_channel_op, gpu) { test_affine_channel_op<::anakin::saber::NV>(ctx, true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(affine_channel_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_affine_channel_op<::anakin::saber::X86>(ctx, false); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(affine_channel); +USE_CPU_ANAKIN_CONVERTER(affine_channel); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(affine_channel); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc index edba90235fa..c008ef1bd5e 100644 --- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc @@ -73,15 +73,19 @@ TEST(batch_norm_op, gpu) { test_batchnorm_op<::anakin::saber::NV>(ctx, true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(batch_norm_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_batchnorm_op<::anakin::saber::X86>(ctx, false); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(batch_norm); +USE_CPU_ANAKIN_CONVERTER(batch_norm); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(batch_norm); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc index 6870260c865..42dfbeb5cdc 100644 --- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc @@ -53,15 +53,19 @@ TEST(concat_op, gpu) { test_concat_op<::anakin::saber::NV>(ctx, true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(concat_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_concat_op<::anakin::saber::X86>(ctx, false); } -#endif + } // 
namespace anakin } // namespace inference } // namespace paddle USE_OP(concat); +USE_CPU_ANAKIN_CONVERTER(concat); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(concat); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc index 723a348b12e..e95e11c4f96 100644 --- a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc @@ -60,16 +60,20 @@ TEST(conv2d_op, gpu) { test_conv2d_op<::anakin::saber::NV>(ctx, true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(conv2d_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_conv2d_op<::anakin::saber::X86>(ctx, false); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(conv2d); +USE_CPU_ANAKIN_CONVERTER(conv2d); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(conv2d); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc index 83792676a00..ae27e27ded5 100644 --- a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc @@ -54,16 +54,19 @@ TEST(dropout_op, gpu) { test_dropout_op<::anakin::saber::NV>(ctx, true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(dropout_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_dropout_op<::anakin::saber::X86>(ctx, false); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(dropout); +USE_CPU_ANAKIN_CONVERTER(dropout); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(dropout); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc index ee128c1ec9a..bff75294908 100644 --- a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc @@ -59,23 +59,29 @@ TEST(elementwise_op, native_mul_gpu) { test_elementwise_op<::anakin::saber::NV>("elementwise_mul", ctx, true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(elementwise_op, native_add_cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_elementwise_op<::anakin::saber::X86>("elementwise_add", ctx, false); } + TEST(elementwise_op, native_mul_cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_elementwise_op<::anakin::saber::X86>("elementwise_mul", ctx, false); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(elementwise_add); USE_OP(elementwise_mul); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(elementwise_add); USE_ANAKIN_CONVERTER(elementwise_mul); +#endif + +USE_CPU_ANAKIN_CONVERTER(elementwise_add); +USE_CPU_ANAKIN_CONVERTER(elementwise_mul); diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc index 3e68d8fed6a..a24c809c022 100644 --- a/paddle/fluid/inference/anakin/convert/test_fc_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc @@ -49,16 +49,19 @@ TEST(mul_op, gpu) { test_mul_op<::anakin::saber::NV>(ctx, true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(mul_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_mul_op<::anakin::saber::X86>(ctx, false); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(mul); +USE_CPU_ANAKIN_CONVERTER(fc); 
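
These USE_* macros follow the usual static-registration "touch" idiom: referencing an extern symbol defined next to the registrar forces the linker to retain the translation unit that registers the converter, so the registry is populated before main runs. A generic sketch of the idiom (names are illustrative, not Paddle's actual macros):

#include <iostream>
#include <map>
#include <string>

// Registry keyed by op type; filled by static initializers.
std::map<std::string, int>& Registry() {
  static std::map<std::string, int> r;
  return r;
}

// In the converter's own .cc file: register and expose a "touch" symbol.
#define REGISTER_CONVERTER(op__)                   \
  static int reg_##op__ = (Registry()[#op__] = 1); \
  int Touch_##op__() { return 0; }

// In any file that needs the converter: reference the touch symbol so the
// linker cannot drop the registering translation unit.
#define USE_CONVERTER(op__)  \
  extern int Touch_##op__(); \
  static int use_##op__ = Touch_##op__();

REGISTER_CONVERTER(fc)
USE_CONVERTER(fc)

int main() { std::cout << Registry().count("fc") << "\n"; }  // prints 1

The #ifdef split right after this point registers the CUDA variants only when the build has GPU support, while the CPU variants are always linked.
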
+#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(fc); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc index 5e4cfdabfd7..5765f5ebd1f 100644 --- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc @@ -48,17 +48,20 @@ TEST(flatten_op, gpu) { test_flatten_op<::anakin::saber::NV>(ctx, true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(flatten_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_flatten_op<::anakin::saber::X86>(ctx, false); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(reshape); USE_OP_ITSELF(flatten); +USE_CPU_ANAKIN_CONVERTER(flatten); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(flatten); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc index 9b23b5b93df..90503b1fbba 100644 --- a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc @@ -87,7 +87,7 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d<::anakin::saber::NV>(ctx, true, false, true, "avg"); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(Pool2dOpConverter, normal_cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); @@ -110,10 +110,14 @@ TEST(Pool2dOpConverter, avg_ceil_test_cpu) { platform::CPUDeviceContext ctx(cpu_place); test_pool2d<::anakin::saber::X86>(ctx, false, false, true, "avg"); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(pool2d); +USE_CPU_ANAKIN_CONVERTER(pool2d); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(pool2d); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_relu_op.cc b/paddle/fluid/inference/anakin/convert/test_relu_op.cc index eb6429f3383..3f224796519 100644 --- a/paddle/fluid/inference/anakin/convert/test_relu_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc @@ -66,5 +66,10 @@ TEST(leaky_relu_op, gpu) { USE_OP(relu); USE_OP(leaky_relu); +USE_CPU_ANAKIN_CONVERTER(relu); +USE_CPU_ANAKIN_CONVERTER(leaky_relu); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(relu); USE_ANAKIN_CONVERTER(leaky_relu); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc index b1be42e542c..e102bd3ac3e 100644 --- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc @@ -81,7 +81,7 @@ TEST(reshape2_op, gpu) { test_reshape2_op<::anakin::saber::NV>(ctx, true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(reshape1_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); @@ -93,10 +93,14 @@ TEST(reshape2_op, cpu) { platform::CPUDeviceContext ctx(cpu_place); test_reshape2_op<::anakin::saber::X86>(ctx, false); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(reshape); +USE_CPU_ANAKIN_CONVERTER(reshape); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(reshape); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc index 1a324739d98..de0b18fdbfd 100644 --- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc @@ -48,16 +48,20 @@ TEST(softmax_op, gpu) { test_softmax_op<::anakin::saber::NV>(ctx, 
true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(relu_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_softmax_op<::anakin::saber::X86>(ctx, false); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(softmax); +USE_CPU_ANAKIN_CONVERTER(softmax); + +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(softmax); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_split_op.cc b/paddle/fluid/inference/anakin/convert/test_split_op.cc index f9ef54fdcac..9a42ffd853b 100644 --- a/paddle/fluid/inference/anakin/convert/test_split_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_split_op.cc @@ -92,7 +92,7 @@ TEST(split_op, test_different_shape_axis3_batch1) { platform::CUDADeviceContext ctx(gpu_place); AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 3}, {2, 1}); } -#ifdef ANAKIN_X86_PLACE + TEST(split_op, test_different_shape_axis1_batch1_cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); @@ -110,10 +110,13 @@ TEST(split_op, test_different_shape_axis3_batch1_cpu) { platform::CPUDeviceContext ctx(cpu_place); AnakinSliceTest<::anakin::saber::X86, 3>(ctx, false, {1, 3, 2, 4}, {2, 2}); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(split); +USE_CPU_ANAKIN_CONVERTER(split); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(split); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_sum_op.cc b/paddle/fluid/inference/anakin/convert/test_sum_op.cc index 9d26430ea68..65f67ebd129 100644 --- a/paddle/fluid/inference/anakin/convert/test_sum_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_sum_op.cc @@ -49,16 +49,19 @@ TEST(sum_op, gpu) { test_sum_op<::anakin::saber::NV>(ctx, true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(sum_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); test_sum_op<::anakin::saber::X86>(ctx, false); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(sum); +USE_CPU_ANAKIN_CONVERTER(sum); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(sum); +#endif diff --git a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc index 466e2f1a49f..51b69dfbb08 100644 --- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc @@ -79,7 +79,7 @@ TEST(transpose2_op, gpu) { test_transpose2_op<::anakin::saber::NV>(ctx, true); } #endif -#ifdef ANAKIN_X86_PLACE + TEST(transpose1_op, cpu) { platform::CPUPlace cpu_place; platform::CPUDeviceContext ctx(cpu_place); @@ -91,10 +91,13 @@ TEST(transpose2_op, cpu) { platform::CPUDeviceContext ctx(cpu_place); test_transpose2_op<::anakin::saber::X86>(ctx, false); } -#endif + } // namespace anakin } // namespace inference } // namespace paddle USE_OP(transpose); +USE_CPU_ANAKIN_CONVERTER(transpose); +#ifdef PADDLE_WITH_CUDA USE_ANAKIN_CONVERTER(transpose); +#endif diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h index 92441f2560f..2f8f953892c 100644 --- a/paddle/fluid/inference/anakin/convert/ut_helper.h +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -33,6 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" using anakin::Precision; +using anakin::saber::X86; namespace paddle { namespace inference { @@ -214,14 +215,13 @@ class AnakinConvertValidation { template class AnakinConvertValidation<::anakin::saber::NV, ::anakin::Precision::FP32>; -template class AnakinConvertValidation<::anakin::saber::NV, - ::anakin::Precision::INT8>; -#ifdef ANAKIN_X86_PLACE template class AnakinConvertValidation<::anakin::saber::X86, ::anakin::Precision::FP32>; + +template class AnakinConvertValidation<::anakin::saber::NV, + ::anakin::Precision::INT8>; template class AnakinConvertValidation<::anakin::saber::X86, ::anakin::Precision::INT8>; -#endif } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index 13f16c4c898..529a859458a 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -32,25 +32,18 @@ namespace paddle { namespace inference { namespace anakin { -template -extern std::once_flag - AnakinEngine::init_anakin_; - template AnakinEngine::AnakinEngine( bool need_summary, int device, int max_batch_size, std::map> max_input_shape, std::vector program_inputs, bool auto_config_layout) - : device_(device), - max_batch_size_(max_batch_size), - max_input_shape_(max_input_shape), - program_inputs_(program_inputs), - auto_config_layout_(auto_config_layout) { - ::anakin::TargetWrapper::set_device(device_); - std::call_once(init_anakin_, - [this]() { ::anakin::Env::env_init(); }); - graph_.reset(new AnakinGraphT()); - net_.reset(new AnakinNetT(need_summary)); + : graph_(new AnakinGraphT()), + net_(new AnakinNetT(need_summary)) { + device_ = device; + max_batch_size_ = max_batch_size; + max_input_shape_ = max_input_shape; + program_inputs_ = program_inputs; + auto_config_layout_ = auto_config_layout; } template @@ -109,7 +102,7 @@ void AnakinEngine::BindInput( anakin_input = net_->get_in(input.first); } anakin_input->reshape(fluid_input_shape); - ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), device_, + ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, fluid_input_shape); anakin_input->copy_from(tmp_anakin_tensor); } @@ -193,14 +186,14 @@ template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::INT8>; template class AnakinEngineManager<::anakin::saber::NV, ::anakin::Precision::INT8>; #endif -#ifdef ANAKIN_X86_PLACE + template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; template class AnakinEngineManager<::anakin::saber::X86, ::anakin::Precision::FP32>; template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::INT8>; template class AnakinEngineManager<::anakin::saber::X86, ::anakin::Precision::INT8>; -#endif + // template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; } // namespace anakin } // namespace inference diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h index e62bb82fd12..fb40f56511b 100644 --- a/paddle/fluid/inference/anakin/engine.h +++ b/paddle/fluid/inference/anakin/engine.h @@ -24,9 +24,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/utils/singleton.h" -#ifdef EXIT // NOLINT -#undef EXIT // NOLINT -#endif // NOLINT + #include "framework/core/net/net.h" #include "framework/core/types.h" #include "framework/graph/graph.h" @@ -116,13 +114,12 @@ class AnakinEngine { private: bool 
initialized_{false}; - int device_; int max_batch_size_; std::map> max_input_shape_; - std::vector program_inputs_; + int device_; std::unique_ptr graph_; std::unique_ptr net_; - static std::once_flag init_anakin_; + std::vector program_inputs_; std::unordered_map tensor_scales_; // Always be false in gpu mode but true in most cpu cases. bool auto_config_layout_; diff --git a/paddle/fluid/inference/anakin/test_anakin_engine.cc b/paddle/fluid/inference/anakin/test_anakin_engine.cc index 3c8a33ec60f..422f415a5db 100644 --- a/paddle/fluid/inference/anakin/test_anakin_engine.cc +++ b/paddle/fluid/inference/anakin/test_anakin_engine.cc @@ -22,6 +22,7 @@ limitations under the License. */ using anakin::AK_FLOAT; using anakin::Precision; using anakin::saber::NV; +using anakin::saber::X86; using anakin::saber::Shape; using anakin::PBlock; using anakin::PTuple; diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index d79fb529092..7a795bda820 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -23,46 +23,18 @@ cc_library(analysis SRCS cc_test(test_dot SRCS dot_tester.cc DEPS analysis) -function(inference_analysis_test_build TARGET) - if(WITH_TESTING) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS EXTRA_DEPS) - cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - inference_base_test_build(${TARGET} - SRCS ${analysis_test_SRCS} - DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}) - endif() -endfunction() - -function(inference_analysis_test_run TARGET) - if(WITH_TESTING) - set(options "") - set(oneValueArgs "") - set(multiValueArgs COMMAND ARGS) - cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - inference_base_test_run(${TARGET} - COMMAND ${analysis_test_COMMAND} - ARGS ${analysis_test_ARGS}) - endif() -endfunction() - function(inference_analysis_test TARGET) if(WITH_TESTING) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS ARGS EXTRA_DEPS) cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - inference_base_test_build(${TARGET} + inference_base_test(${TARGET} SRCS ${analysis_test_SRCS} - DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}) - inference_base_test_run(${TARGET} - COMMAND ${TARGET} - ARGS ${analysis_test_ARGS}) + DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} + ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS}) endif() endfunction(inference_analysis_test) -inference_analysis_test(test_analyzer - SRCS analyzer_tester.cc - EXTRA_DEPS reset_tensor_array paddle_inference_api - ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) +inference_analysis_test(test_analyzer SRCS analyzer_tester.cc + EXTRA_DEPS reset_tensor_array paddle_inference_api) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 7bcd1f01bfe..66e8d8b5287 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -63,16 +63,6 @@ struct Argument { using anakin_max_shape_t = std::map>; bool Has(const std::string& key) const { return valid_fields_.count(key); } - void PartiallyRelease() { - if (Has("model_program_path")) { - if (Has("model_from_memory") && model_from_memory()) { - model_program_path().clear(); - model_program_path().shrink_to_fit(); - model_params_path().clear(); - 
model_params_path().shrink_to_fit(); - } - } - } #define DECL_ARGUMENT_FIELD(field__, Field, type__) \ public: \ @@ -174,7 +164,6 @@ struct Argument { AnalysisConfig::Precision); DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, bool); - DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape, anakin_max_shape_t); diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h index 4693729cb43..62cce64223d 100644 --- a/paddle/fluid/inference/analysis/dot.h +++ b/paddle/fluid/inference/analysis/dot.h @@ -19,11 +19,14 @@ */ #pragma once -#include #include #include #include #include +// #include "paddle/fluid/lite/utils/logging.h" +// #ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +#include // NOLINT +// #endif namespace paddle { namespace inference { @@ -55,9 +58,11 @@ class Dot { std::vector attrs; Node(const std::string& name, const std::vector& attrs) - : name(name), - attrs(attrs), - id_("node_" + std::to_string(dot_node_counter++)) {} + : name(name), attrs(attrs) { + std::stringstream ss; + ss << "node_" << dot_node_counter++; + id_ = ss.str(); + } std::string id() const { return id_; } diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index 008608c14c7..4f5c50d0d6b 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -63,18 +63,6 @@ void SetAttr>(framework::proto::OpDesc *op, } } -template <> -void SetAttr>(framework::proto::OpDesc *op, - const std::string &name, - const std::vector &data) { - auto *attr = op->add_attrs(); - attr->set_name(name); - attr->set_type(paddle::framework::proto::AttrType::INTS); - for (const auto i : data) { - attr->add_ints(i); - } -} - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index f290e6fce49..4714c30507c 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -38,9 +38,9 @@ IRPassManager::IRPassManager(Argument *argument) { ARGUMENT_CHECK_FIELD(argument, main_program); graph_ = std::unique_ptr(new Graph(argument->main_program())); if (argument->Has("scope")) { - auto *scope_ptr = argument->scope_ptr(); - PADDLE_ENFORCE(scope_ptr); - graph_->SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); + graph_->Set(framework::ir::kParamScopeAttr, + new framework::Scope *( + const_cast(&argument->scope()))); } ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); @@ -87,10 +87,7 @@ void IRPassManager::CreatePasses(Argument *argument, bool enable_int8 = argument->tensorrt_precision_mode() == AnalysisConfig::Precision::kInt8; - pass->Set("predictor_id", new int(argument->predictor_id())); - bool use_calib_mode = argument->tensorrt_use_calib_mode(); pass->Set("enable_int8", new bool(enable_int8)); - pass->Set("use_calib_mode", new bool(use_calib_mode)); bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); @@ -113,10 +110,7 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("engine_opt_info", new std::map( argument->engine_opt_info())); } - if (pass_name == "ngraph_subgraph_pass") { - pass->Set("program", - new framework::ProgramDesc *(&argument->main_program())); - } + if (pass_name == "anakin_subgraph_pass") { pass->Set("program", new 
framework::ProgramDesc *(&argument->main_program())); diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index ddadbc6df4a..05a3d7ddfdb 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -15,7 +15,7 @@ if (WITH_GPU AND TENSORRT_FOUND) set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") endif() -if (ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller) set(analysis_deps ${analysis_deps} diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc index a6c6f33cf77..9586ce3e6b0 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -226,6 +226,7 @@ void AnakinSubgraphPass::CreateAnakinEngine( auto max_batch_size = Get("max_batch_size"); auto max_input_shape = Get>>("max_input_shape"); + bool auto_config_layout = Get("auto_config_layout"); if (use_gpu) { #ifdef PADDLE_WITH_CUDA inference::Singleton< @@ -234,14 +235,11 @@ void AnakinSubgraphPass::CreateAnakinEngine( max_input_shape, program_inputs, false, engine_key); #endif } else { -#ifdef ANAKIN_X86_PLACE - bool auto_config_layout = Get("auto_config_layout"); inference::Singleton< anakin::AnakinEngineManager<::anakin::saber::X86, PrecisionT>>::Global() .Create(true, Get("gpu_device_id"), max_batch_size, max_input_shape, program_inputs, auto_config_layout, engine_key); -#endif } auto *scope = param_scope(); @@ -260,7 +258,6 @@ void AnakinSubgraphPass::CreateAnakinEngine( param_set, output_mapping, anakin_engine); #endif } else { -#ifdef ANAKIN_X86_PLACE auto *anakin_engine = inference::Singleton>::Global() @@ -271,7 +268,6 @@ void AnakinSubgraphPass::CreateAnakinEngine( &block_desc_temp, scope, std::vector(input_names.begin(), input_names.end()), param_set, output_mapping, anakin_engine); -#endif } } diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc index 670335827b4..76b1671601e 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc @@ -420,7 +420,7 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() { // Node that contains this subgraph 2. Mark the nodes inside the sub-graph // as deleted. 3. Replace the deleted node with the new Block Node. framework::OpDesc empty_desc; - empty_desc.SetType(name_); + empty_desc.SetType("anakin_engine"); auto *block_node = graph_->CreateOpNode(&empty_desc); Agent(block_node).set_subgraph({}); auto io = ExtractInputAndOutputOfSubGraph(subgraph); diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h index 26201541f67..5d11c217b69 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h @@ -18,7 +18,6 @@ limitations under the License. 
*/ #pragma once -#include #include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_traits.h" @@ -75,11 +74,10 @@ class SubGraphFuser { using NodeInsideSubgraphTeller = SubgraphDetector::NodeInsideSubgraphTeller; SubGraphFuser(Graph *graph, const NodeInsideSubgraphTeller &teller, - int min_subgraph_size, std::string name = "anakin_engine") + int min_subgraph_size) : graph_(graph), node_inside_subgraph_teller_(teller), - min_subgraph_size_{min_subgraph_size}, - name_{name} {} + min_subgraph_size_{min_subgraph_size} {} // The main method which run all the logic. void operator()(); @@ -92,7 +90,6 @@ class SubGraphFuser { Graph *graph_; NodeInsideSubgraphTeller node_inside_subgraph_teller_; int min_subgraph_size_; - const std::string name_; }; struct NodeWrapper { diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index e16cce54c24..8f7c6ac7553 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -61,7 +61,7 @@ void RenameAndGetOutputs( std::set *output_names, std::unordered_map *output_name_map, const std::unordered_map &graph_var_map, - bool trt_and_not_int8) { + bool is_trt) { //// In the normal case, the paddle-trt exists bug when runing the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the // paddle-tensorrt will do the merging optimization, which fuse those conv @@ -121,7 +121,7 @@ void RenameAndGetOutputs( for (auto out_var : correspond_node->outputs) { var2id[out_var->Name()] = out_var->id(); } - if (op_desc.Type() == "conv2d" && trt_and_not_int8) { + if (op_desc.Type() == "conv2d" && is_trt) { auto input_var_name = op_desc.Input("Input").front(); auto filter_var_name = op_desc.Input("Filter").front(); auto out_var_name = op_desc.Output("Output").front(); diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h index 444e1984cf8..bb445027821 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -43,7 +43,7 @@ void RenameAndGetOutputs( std::set *output_names, std::unordered_map *output_name_map, const std::unordered_map &graph_var_map, - bool trt_and_not_int8 = false); + bool is_trt = true); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 37c3fc79554..67650a352d8 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -149,8 +149,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp( graph_var_map[node->Name()] = node; } } - auto enable_int8 = Get("enable_int8"); - auto use_calib_mode = Get("use_calib_mode"); auto &subgraph_nodes = *Agent(node).subgraph(); // The following procedure is used to rename all the intermediate @@ -167,7 +165,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // it is either an OP's input or an OP's output. 
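
For context on what ExtractInputAndOutputOfSubGraph (used above) computes: the fused engine node's inputs are the values the subgraph reads but does not produce, and its outputs are the values it produces for consumers outside the subgraph. A toy sketch of that extraction, with illustrative types only:

#include <set>
#include <string>
#include <vector>

struct Node {
  std::vector<std::string> inputs;   // var names read by this op
  std::vector<std::string> outputs;  // var names written by this op
};

void ExtractIO(const std::vector<Node>& subgraph,
               std::set<std::string>* in, std::set<std::string>* out) {
  std::set<std::string> produced;
  for (const Node& n : subgraph)
    for (const auto& o : n.outputs) produced.insert(o);
  for (const Node& n : subgraph)
    for (const auto& i : n.inputs)
      if (!produced.count(i)) in->insert(i);  // crosses the subgraph border
  *out = produced;  // a real pass would prune purely-internal vars
}
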
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, &output_names_with_id, &output_names, &output_name_map, - graph_var_map, !enable_int8); + graph_var_map); // When tensorrt engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor @@ -198,27 +196,22 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); SetAttr(op_desc->Proto(), "parameters", params); + auto enable_int8 = Get("enable_int8"); auto use_static_engine = Get("use_static_engine"); - // TODO(NHZlX) - // There are models with the same structure but the different parameters, - // when runing in the 'use_serialize' mode, there is a bug. auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, std::to_string(0)); - auto predictor_id = Get("predictor_id"); // Get "" when there is no cached calibration table data. bool load_from_memory = Get("model_from_memory"); std::string calibration_data = ""; - if (enable_int8 && use_calib_mode) { + if (enable_int8) { calibration_data = GetTrtCalibTableData( Get("model_opt_cache_dir"), engine_key, enable_int8); } SetAttr(op_desc->Proto(), "calibration_data", calibration_data); SetAttr(op_desc->Proto(), "enable_int8", enable_int8); - SetAttr(op_desc->Proto(), "use_calib_mode", use_calib_mode); SetAttr(op_desc->Proto(), "engine_key", engine_key); - SetAttr(op_desc->Proto(), "predictor_id", predictor_id); std::string trt_engine_serialized_data = ""; SetAttr(op_desc->Proto(), "engine_serialized_data", trt_engine_serialized_data); @@ -229,8 +222,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( } // When in int8 mode and calibration_mode, the program just produce the // calibration table data. - bool calibration_mode = - (enable_int8 && calibration_data.size() == 0 && use_calib_mode); + bool calibration_mode = (enable_int8 && calibration_data.size() == 0); if (calibration_mode) { // calibraion mode means generate int8 calibration table data process. return; @@ -238,20 +230,15 @@ void TensorRtSubgraphPass::CreateTensorRTOp( std::copy(params.begin(), params.end(), std::back_inserter(*repetitive_params)); - - tensorrt::TensorRTEngine *trt_engine = - inference::Singleton::Global() - .Create(engine_key + std::to_string(predictor_id), - Get("max_batch_size"), Get("workspace_size"), - enable_int8, calibrator.get(), Get("gpu_device_id")); - bool need_serialize = (use_static_engine && !load_from_memory); + if (need_serialize) { trt_engine_serialized_data = GetTrtEngineSerializedData( Get("model_opt_cache_dir"), engine_key); // we can load the engine info serialized before from the disk. if (!trt_engine_serialized_data.empty()) { - trt_engine->Deserialize(trt_engine_serialized_data); + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); LOG(INFO) << "Load TRT Optimized Info from " << GetTrtEngineSerializedPath( Get("model_opt_cache_dir"), engine_key); @@ -264,7 +251,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // 2. already load serialized trt engine info. LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " "kernel etc). 
This process may cost a lot of time."; - + std::unique_ptr trt_engine( + new tensorrt::TensorRTEngine( + Get("max_batch_size"), Get("workspace_size"), enable_int8, + calibrator.get(), Get("gpu_device_id"))); auto *scope = param_scope(); framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); std::unordered_set param_set(params.begin(), params.end()); @@ -272,18 +262,20 @@ void TensorRtSubgraphPass::CreateTensorRTOp( .ConvertBlockToTRTEngine( &block_desc_temp, *scope, std::vector(input_names.begin(), input_names.end()), - param_set, output_mapping, trt_engine); + param_set, output_mapping, trt_engine.get()); + nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); + trt_engine_serialized_data = + std::string((const char *)serialized_engine_data->data(), + serialized_engine_data->size()); if (need_serialize) { - nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); - trt_engine_serialized_data = - std::string((const char *)serialized_engine_data->data(), - serialized_engine_data->size()); SaveTrtEngineSerializedDataToFile( GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), engine_key), trt_engine_serialized_data); } + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index a8d0c69a54a..8af0a1aceac 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -5,7 +5,7 @@ cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_p cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) cc_library(adjust_cudnn_workspace_size_pass SRCS adjust_cudnn_workspace_size_pass.cc DEPS analysis_pass graph_to_program_pass) -cc_library(analysis_passes SRCS passes.cc DEPS +cc_library(analysis_passes SRCS paddle_use_passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 970ecdbbeb0..6ae7381f41d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -13,10 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include #include #include #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/enforce.h" @@ -57,9 +57,8 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { auto graph = std::unique_ptr(new Graph(argument->main_program())); argument->SetMainGraph(graph.release()); - auto *scope_ptr = argument->scope_ptr(); - PADDLE_ENFORCE(scope_ptr); - argument->main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); + argument->main_graph().Set(framework::ir::kParamScopeAttr, + new framework::Scope *(argument->scope_ptr())); } std::unique_ptr IrGraphBuildPass::LoadModel( diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index fedee3ff95f..1f27e80cf49 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -69,7 +69,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { // Copy the parameter data to a tmp tensor. TensorCopySync(*t, cpu_place, &temp_tensor); // Reallocation the space on GPU - t->clear(); + t->mutable_data(place); // Copy parameter data to newly allocated GPU space. TensorCopySync(temp_tensor, place, t); diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 1f4077eec8f..c27fe89e374 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -15,11 +15,12 @@ #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include #include -#include #include #include #include #include +#include +#include #include #include #include "paddle/fluid/framework/ir/graph_helper.h" @@ -39,14 +40,6 @@ using framework::ir::Node; using framework::ir::TopologyVarientSort; using space_table_t = MemoryOptimizePass::space_table_t; -typedef struct { - std::string name; - size_t size; - int cluster; - std::pair lifetime; - std::unordered_set adj; -} MemNode; - // Collect the lifecycles of the tensors. // Traverse the graph in topological order. // The traversal order also affect the lifecycles, so different sort_kind is @@ -105,89 +98,6 @@ int DataTypeToSpace(framework::proto::VarType_Type type) { } } -void MemoryOptimizePass::CollectVarMemorySize( - space_table_t* space_table) const { - const int fake_batch_size = 1; - // Collect tensors from graph. - for (auto* node : graph_->Nodes()) { - if (node->IsVar() && - node->Var()->GetType() == - framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { - // Parameters will not be reused. 
- if (node->Var()->Persistable()) continue; - auto shape = node->Var()->GetShape(); - for (auto& v : shape) { - if (v < 0) v = fake_batch_size; - } - - int size = std::accumulate(shape.begin(), shape.end(), 1, - std::multiplies()); - (*space_table)[node->Var()->Name()] = - size * DataTypeToSpace(node->Var()->GetDataType()); - } - } -} - -void MakeSimpleReusePlan( - const std::unordered_map>& lifecycles, - const std::unordered_map& space_table, - std::unordered_map* node2cluster, - std::unordered_map* cluster_size) { - std::vector mem_nodes; - for (auto& data : lifecycles) { - MemNode temp_node; - temp_node.name = data.first; - PADDLE_ENFORCE( - space_table.count(data.first), - "%s variable should be in the spacetable during memory optimize", - data.first); - temp_node.size = space_table.at(data.first); - temp_node.cluster = -1; - temp_node.lifetime = data.second; - mem_nodes.push_back(temp_node); - } - auto overlap = [](std::pair a, std::pair b) -> bool { - return b.second >= a.first && a.second >= b.first; - }; - // If the lifetime of two nodes is overwritten, we set them as adjacent nodes. - for (size_t i = 0; i < mem_nodes.size(); i++) { - for (size_t j = i + 1; j < mem_nodes.size(); j++) { - if (overlap(mem_nodes[i].lifetime, mem_nodes[j].lifetime)) { - mem_nodes[i].adj.insert(mem_nodes[j].name); - mem_nodes[j].adj.insert(mem_nodes[i].name); - } - } - } - - // Sort the nodes according to the node memory size. - auto sort_func = [](MemNode a, MemNode b) { return a.size > b.size; }; - std::sort(mem_nodes.begin(), mem_nodes.end(), sort_func); - - // Generating Memory Reuse Strategy Based on Greedy Way - for (size_t i = 0; i < mem_nodes.size(); i++) { - if (mem_nodes[i].cluster >= 0) continue; - int cluster_index = cluster_size->size(); - mem_nodes[i].cluster = cluster_index; - (*cluster_size)[mem_nodes[i].name] = mem_nodes[i].size; - (*node2cluster)[mem_nodes[i].name] = mem_nodes[i].name; - std::unordered_set cluster_adj = mem_nodes[i].adj; - for (size_t j = i + 1; j < mem_nodes.size(); j++) { - if (mem_nodes[j].cluster < 0 && - (cluster_adj.find(mem_nodes[j].name) == cluster_adj.end())) { - (*node2cluster)[mem_nodes[j].name] = mem_nodes[i].name; - mem_nodes[j].cluster = cluster_index; - for (auto& n : mem_nodes[j].adj) { - cluster_adj.insert(n); - } - } - } - } - for (auto& cluster : *cluster_size) { - LOG(INFO) << "Cluster name : " << cluster.first - << " size: " << cluster.second; - } -} - // Collect the memory size of the tensors. void MemoryOptimizePass::CollectVarMemorySize( const std::unordered_map& batch_var_ave_dim, @@ -469,17 +379,6 @@ void UpdateOpDescsByReuse( } } - // modify the graph - for (auto input_node : node->inputs) { - PADDLE_ENFORCE(input_node->IsVar()); - std::string input_node_name = input_node->Name(); - if (reuse_table.count(input_node_name) && - reuse_table.at(input_node_name) != input_node_name) { - auto name = reuse_table.at(input_node_name); - input_node->RenameVar(name); - } - } - for (auto argument : node->Op()->Outputs()) { for (const auto& x : argument.second) { auto name = x; @@ -491,17 +390,6 @@ void UpdateOpDescsByReuse( } } - // modify the graph - for (auto out_node : node->outputs) { - PADDLE_ENFORCE(out_node->IsVar()); - std::string out_node_name = out_node->Name(); - if (reuse_table.count(out_node_name) && - reuse_table.at(out_node_name) != out_node_name) { - auto name = reuse_table.at(out_node_name); - out_node->RenameVar(name); - } - } - // Update arguments. 
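The deleted MakeSimpleReusePlan is the most algorithmic piece in this hunk, so here it is restated compactly for reference (simplified types; the original also records a per-cluster size table and enforces that every variable has a space-table entry): two tensors may share a buffer only if their lifetimes do not overlap, and clusters are grown greedily from the largest tensor down.

    #include <algorithm>
    #include <string>
    #include <unordered_map>
    #include <unordered_set>
    #include <utility>
    #include <vector>

    struct MemNode {
      std::string name;
      size_t size = 0;
      int cluster = -1;                     // -1: not yet assigned
      std::pair<int, int> lifetime;         // [first use, last use]
      std::unordered_set<std::string> adj;  // lifetime-overlapping neighbors
    };

    void GreedyReusePlan(
        std::vector<MemNode> *nodes,
        std::unordered_map<std::string, std::string> *node2cluster) {
      auto overlap = [](std::pair<int, int> a, std::pair<int, int> b) {
        return b.second >= a.first && a.second >= b.first;
      };
      // Overlapping lifetimes -> the two tensors must not share memory.
      for (size_t i = 0; i < nodes->size(); ++i) {
        for (size_t j = i + 1; j < nodes->size(); ++j) {
          if (overlap((*nodes)[i].lifetime, (*nodes)[j].lifetime)) {
            (*nodes)[i].adj.insert((*nodes)[j].name);
            (*nodes)[j].adj.insert((*nodes)[i].name);
          }
        }
      }
      // Largest tensors first, so big buffers become cluster representatives.
      std::sort(nodes->begin(), nodes->end(),
                [](const MemNode &a, const MemNode &b) { return a.size > b.size; });
      for (size_t i = 0; i < nodes->size(); ++i) {
        if ((*nodes)[i].cluster >= 0) continue;
        (*nodes)[i].cluster = static_cast<int>(i);
        (*node2cluster)[(*nodes)[i].name] = (*nodes)[i].name;
        auto cluster_adj = (*nodes)[i].adj;
        for (size_t j = i + 1; j < nodes->size(); ++j) {
          // A node may join a cluster unless it overlaps any current member.
          if ((*nodes)[j].cluster < 0 && !cluster_adj.count((*nodes)[j].name)) {
            (*nodes)[j].cluster = static_cast<int>(i);
            (*node2cluster)[(*nodes)[j].name] = (*nodes)[i].name;
            cluster_adj.insert((*nodes)[j].adj.begin(), (*nodes)[j].adj.end());
          }
        }
      }
    }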
for (auto& arg : in_args) { node->Op()->SetInput(arg.first, arg.second); @@ -703,24 +591,12 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { VLOG(3) << "Load memory cache from " << path; std::vector>> batches; - if (!(argument->static_memory_optim() && inference::IsFileExists(path))) { - string::PrettyLogInfo("--- Performing dynamic memory optimize"); - // batches = FakeBatchVarShapes(argument->main_program()); - int sort_kind = 0; - std::unordered_map lifecycles; - space_table_t space_table; - std::unordered_map node2cluster; - std::unordered_map cluster_size; - - CollectLifeCycle(&lifecycles, sort_kind); - CollectVarMemorySize(&space_table); - MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size); - UpdateOpDescsByReuse(graph_, node2cluster, sort_kind); - return; - - } else { + if (argument->static_memory_optim() && inference::IsFileExists(path)) { string::PrettyLogInfo("--- Performing static memory optimize"); batches = DeseralizeBatchVarShapes(path); + } else { + string::PrettyLogInfo("--- Performing dynamic memory optimize"); + batches = FakeBatchVarShapes(argument->main_program()); } auto var_batch_ave_size = GetBatchAverageSize(batches); diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index 5a907303b4d..8bcc7c1ddd7 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -74,8 +74,6 @@ class MemoryOptimizePass : public AnalysisPass { std::unordered_map *lifecycles, int sort_kind) const; - void CollectVarMemorySize(space_table_t *space_table) const; - void CollectVarMemorySize( const std::unordered_map &batch_var_ave_dim, std::unordered_map *tensor_nodes, diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/paddle_use_passes.cc similarity index 100% rename from paddle/fluid/inference/analysis/passes/passes.cc rename to paddle/fluid/inference/analysis/passes/paddle_use_passes.cc index a55904ed536..76043a53b75 100644 --- a/paddle/fluid/inference/analysis/passes/passes.cc +++ b/paddle/fluid/inference/analysis/passes/paddle_use_passes.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
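Back in memory_optimize_pass.cc, with the graph-node renaming dropped, UpdateOpDescsByReuse reduces to rewriting argument names through the reuse table; schematically (simplified signature, not the pass's real one):

    #include <string>
    #include <unordered_map>
    #include <vector>

    void RewriteArgNames(
        std::vector<std::string> *args,
        const std::unordered_map<std::string, std::string> &reuse_table) {
      for (auto &name : *args) {
        auto it = reuse_table.find(name);
        // Rewrite only variables clustered onto another representative buffer.
        if (it != reuse_table.end() && it->second != name) {
          name = it->second;
        }
      }
    }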
-#include "paddle/fluid/inference/analysis/passes/passes.h" #include "paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/inference/analysis/passes/passes.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 1921e419383..8b0b76e6539 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -27,28 +27,20 @@ if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() -if (ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) set(inference_deps ${inference_deps} anakin_op_converter anakin_engine) endif() -if(WITH_NGRAPH) - set(inference_deps ${inference_deps} ngraph) -endif() - add_subdirectory(details) if(WITH_MKLDNN) - set(mkldnn_quantizer_src mkldnn_quantizer.cc) - set(mkldnn_quantizer_cfg mkldnn_quantizer_config) - cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder) + set(mkldnn_quantizer_src mkldnn_quantizer.cc) + set(mkldnn_quantizer_cfg mkldnn_quantizer_config) + cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder) endif() cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder) -if(WITH_NGRAPH) - cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc DEPS ngraph) -else(WITH_NGRAPH) - cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) -endif(WITH_NGRAPH) +cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS paddle_inference_api zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps}) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS @@ -56,7 +48,9 @@ cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS paddle_pass_builder zero_copy_tensor reset_tensor_array) -cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) +cc_test(test_paddle_inference_api + SRCS api_tester.cc + DEPS paddle_inference_api) if(WITH_TESTING) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} @@ -67,21 +61,13 @@ endif() cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) -if(ANAKIN_FOUND) - if (ANAKIN_MLU AND NOT WITH_GPU AND NOT ANAKIN_X86) - message(STATUS "Compile with anakin mlu place.") - add_definitions(-DANAKIN_MLU_PLACE) - elseif(ANAKIN_X86) - message(STATUS "Compile with anakin x86 place.") - add_definitions(-DANAKIN_X86_PLACE) - endif() - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc) - target_link_libraries(inference_anakin_api anakin anakin_saber_common) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc) - target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) - function(anakin_target target_name) - target_compile_options(${target_name} 
BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) - endfunction() - anakin_target(inference_anakin_api) - anakin_target(inference_anakin_api_shared) +if (WITH_ANAKIN AND WITH_MKL) # only needed in CI + # compile the libinference_anakin_api.a and anakin.so. + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml zero_copy_tensor_dummy device_context) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber zero_copy_tensor_dummy device_context) + function(anakin_target target_name) + target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + endfunction() + anakin_target(inference_anakin_api) + anakin_target(inference_anakin_api_shared) endif() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 890c90697bc..8b940b67e3f 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -21,7 +21,6 @@ #include "paddle/fluid/platform/gpu_info.h" namespace paddle { -extern const std::vector kTRTSubgraphPasses; extern const std::vector kAnakinSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { @@ -87,12 +86,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // Model related. CP_MEMBER(model_dir_); + CP_MEMBER(prog_file_); + CP_MEMBER(params_file_); CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and // params_file_ fields. - - prog_file_ = std::move(other.prog_file_); - params_file_ = std::move(other.params_file_); - // Gpu related. CP_MEMBER(use_gpu_); CP_MEMBER(device_id_); @@ -108,9 +105,6 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(tensorrt_min_subgraph_size_); CP_MEMBER(tensorrt_precision_mode_); CP_MEMBER(trt_use_static_engine_); - CP_MEMBER(trt_use_calib_mode_); - // NGRAPH related. - CP_MEMBER(use_ngraph_); // MKLDNN related. 
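A note on CP_MEMBER, which this copy constructor uses for every field: reconstructed from its usage here (the exact definition is an assumption, not quoted from the file), it is a one-line field-copy macro along these lines:

    // Assumed definition, inferred from usage in AnalysisConfig's copy ctor:
    #define CP_MEMBER(member__) member__ = other.member__;

    // So CP_MEMBER(model_dir_); expands to: model_dir_ = other.model_dir_;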
CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -174,26 +168,16 @@ void AnalysisConfig::EnableMkldnnQuantizer() { Update(); } -void AnalysisConfig::EnableNgraph() { -#ifdef PADDLE_WITH_NGRAPH - pass_builder()->EnableNgraph(); - use_ngraph_ = true; -#else - LOG(ERROR) << "Please compile with NGRAPH first to use NGRAPH"; - use_ngraph_ = false; -#endif -} - -MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const { +std::shared_ptr AnalysisConfig::mkldnn_quantizer_config() + const { PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_, "MkldnnQuantizer was not enabled yet."); - return mkldnn_quantizer_config_.get(); + return mkldnn_quantizer_config_; } void AnalysisConfig::EnableTensorRtEngine( int workspace_size, int max_batch_size, int min_subgraph_size, - AnalysisConfig::Precision precision_mode, bool use_static, - bool use_calib_mode) { + AnalysisConfig::Precision precision_mode, bool use_static) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -206,7 +190,6 @@ void AnalysisConfig::EnableTensorRtEngine( tensorrt_min_subgraph_size_ = min_subgraph_size; tensorrt_precision_mode_ = precision_mode; trt_use_static_engine_ = use_static; - trt_use_calib_mode_ = use_calib_mode; Update(); #else @@ -245,24 +228,14 @@ void AnalysisConfig::Update() { } if (use_tensorrt_) { - pass_builder()->ClearPasses(); - for (const auto &pass : kTRTSubgraphPasses) { - pass_builder()->AppendPass(pass); - } - } - - if (use_ngraph_) { - if (!enable_ir_optim_) { - LOG(ERROR) - << "EnableNgraph() only works when IR optimization is enabled."; + const auto &passes = pass_builder_->AllPasses(); + if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") == + std::end(passes)) { + // Append after the Affine_channel_conv_fuse pass. + pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } -#ifdef PADDLE_WITH_NGRAPH - pass_builder()->EnableNgraph(); - use_ngraph_ = true; -#else - LOG(ERROR) << "Please compile with NGRAPH first to use NGRAPH"; - use_ngraph_ = false; -#endif + pass_builder()->DeletePass("runtime_context_cache_pass"); + pass_builder()->DeletePass("expected_kernel_cache_pass"); } if (use_mkldnn_) { @@ -339,8 +312,6 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << static_memory_optim_; ss << static_memory_optim_force_update_; - ss << use_ngraph_; - ss << use_mkldnn_; for (auto &item : mkldnn_enabled_op_types_) ss << item; ss << ";"; @@ -371,7 +342,6 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { // Get the GPU memory details and calculate the fraction of memory for the // GPU memory pool. size_t gpu_used, gpu_available; - platform::SetDeviceId(device_id_); platform::GpuMemoryUsage(&gpu_used, &gpu_available); double total_gpu_memory = (gpu_used + gpu_available) / 1024. 
/ 1024.; float fraction_of_gpu_memory = @@ -442,12 +412,4 @@ void AnalysisConfig::EnableAnakinEngine( anakin_auto_config_layout_ = auto_config_layout; Update(); } - -void AnalysisConfig::PartiallyRelease() { - prog_file_.clear(); - prog_file_.shrink_to_fit(); - params_file_.clear(); - params_file_.shrink_to_fit(); -} - } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 5d9d5a3178a..e57d3a80456 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -202,7 +202,6 @@ bool AnalysisPredictor::Run(const std::vector &inputs, timer.tic(); // set feed variable framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); - PADDLE_ENFORCE_NOT_NULL(scope, "The scope should not be nullptr."); if (!SetFeed(inputs, scope)) { LOG(ERROR) << "fail to set feed"; return false; @@ -230,15 +229,8 @@ bool AnalysisPredictor::Run(const std::vector &inputs, // Here is a bugfix, collect all the container variables, and reset then to a // bool; the next time, the operator will call MutableData and construct a new // container again, so that the container will be empty for each batch. - if (sub_scope_) { - tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_); - } + tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_); tensor_array_batch_cleaner_.ResetNoTensorVars(); - - // recover the cpu_math_library_num_threads to 1, in order to avoid thread - // conflict when integrating it into deployment service. - paddle::platform::SetNumThreads(1); - return true; } @@ -393,7 +385,6 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); - argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_); } if (config_.anakin_engine_enabled()) { @@ -444,10 +435,6 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program); inference_program_.reset( new framework::ProgramDesc(argument_.ir_analyzed_program())); - // The config and argument take a lot of storage, - // when the predictor settings are complete, we release these stores. - argument_.PartiallyRelease(); - config_.PartiallyRelease(); LOG(INFO) << "== optimize end =="; } @@ -455,8 +442,6 @@ template <> std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; - PADDLE_ENFORCE(config.is_valid(), - "Note: Each config can only be used for one predictor."); if (config.use_gpu()) { // 1. GPU memory PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f); @@ -486,8 +471,6 @@ std::unique_ptr CreatePaddlePredictor< } std::unique_ptr predictor(new AnalysisPredictor(config)); - // Each config can only be used for one predictor. - config.SetInValid(); auto predictor_p = dynamic_cast(predictor.get()); if (!predictor_p->Init(nullptr)) { @@ -599,11 +582,6 @@ bool AnalysisPredictor::ZeroCopyRun() { // Fix TensorArray reuse not cleaned bug. tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_); tensor_array_batch_cleaner_.ResetTensorArray(); - - // recover the cpu_math_library_num_threads to 1, in order to avoid thread - // conflict when integrating it into deployment service. 
- paddle::platform::SetNumThreads(1); - return true; } diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 44b1b8071de..6bc892638c2 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -260,7 +260,7 @@ class MkldnnQuantizerTest : public testing::Test { predictor.reset(new AnalysisPredictor(config)); auto* predictor_p = static_cast(predictor.get()); - auto qconfig = new MkldnnQuantizerConfig(); + auto qconfig = std::make_shared(); mkldnn_quantizer.reset( new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig)); @@ -384,7 +384,7 @@ TEST_F(MkldnnQuantizerTest, histogram_empty) { // zero tensor framework::LoDTensor var_tensor; var_tensor.Resize({0}); - var_tensor.mutable_data(platform::CPUPlace()); + ASSERT_TRUE(var_tensor.mutable_data(platform::CPUPlace())); ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet); } diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index 63d23321ab4..7dca6310fc3 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,14 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/inference/api/api_anakin_engine.h" + +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#include +#include #include +#include #include #include #include -#include "paddle/fluid/inference/api/api_anakin_engine.h" -#include "paddle/fluid/inference/api/paddle_api.h" - #include "framework/core/net/net.h" #include "framework/operators/ops.h" #include "saber/funcs/timer.h" @@ -27,346 +33,209 @@ namespace paddle { using paddle::contrib::AnakinConfig; -template -extern std::mutex PaddleInferenceAnakinPredictor::mutex_; -template -extern std::once_flag PaddleInferenceAnakinPredictor::init_anakin_; -template -void PaddleInferenceAnakinPredictor::InitEnv() { - anakin::TargetWrapper::set_device(this->config_.device_id); - std::call_once(this->init_anakin_, [this]() { - anakin::Env::env_init(this->config_.max_stream); - }); -} -template -void PaddleInferenceAnakinPredictor::InitNet() { - std::unique_lock lock(this->mutex_); - this->executor_p_ = new anakin::Net(*this->graph_p_, true); +template +PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( + const contrib::AnakinConfig &config) { + CHECK(Init(config)); } -template -void PaddleInferenceAnakinPredictor::SetContext() { - this->ctx_p_ = std::make_shared>( - this->config_.device_id, this->config_.data_stream_id, - this->config_.compute_stream_id); +template <> +PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( + const contrib::AnakinConfig &config) { + omp_set_dynamic(0); + omp_set_num_threads(1); + mkl_set_num_threads(1); + CHECK(Init(config)); } -template -void PaddleInferenceAnakinPredictor::InitGraph() { - this->graph_p_ = - std::make_shared>(); - if (!(this->graph_p_->load(this->config_.model_file))) { - LOG(FATAL) << "fail to load graph from " << this->config_.model_file; +template +bool PaddleInferenceAnakinPredictor::Init( + const contrib::AnakinConfig 
&config) { + if (!(graph_.load(config.model_file))) { + VLOG(3) << "fail to load graph from " << config.model_file; + return false; } - auto inputs = this->graph_p_->get_ins(); + auto inputs = graph_.get_ins(); for (auto &input_str : inputs) { - if (this->config_.init_inputs_shape.find(input_str) == - this->config_.init_inputs_shape.end()) { - LOG(FATAL) << input_str << " is not implemented."; - } - std::vector shape = - this->config_.init_inputs_shape.find(input_str)->second; - this->graph_p_->Reshape(input_str, shape); + graph_.ResetBatchSize(input_str, config.max_batch_size); + max_batch_size_ = config.max_batch_size; } -} -template -void PaddleInferenceAnakinPredictor::OptimizeGraph() { - if (!this->graph_p_->Optimize()) { - LOG(FATAL) << "Graph optimization error."; + // optimization for graph + if (!(graph_.Optimize())) { + return false; } -} -template -void PaddleInferenceAnakinPredictor::InitPredictor() { - this->InitEnv(); - this->SetContext(); - this->InitGraph(); - this->OptimizeGraph(); - this->InitNet(); -} -template -void PaddleInferenceAnakinPredictor::Predict() { - anakin::TargetWrapper::device_sync(); - this->executor_p_->prediction(); - anakin::TargetWrapper::device_sync(); -} -template -bool PaddleInferenceAnakinPredictor::Run( - const std::vector &inputs, - std::vector *output_data, int batch_size) { - if (this->config_.re_allocable) { - return this->RunImpl(inputs, output_data); - } else { - // Run inputs data that exceeds batch size in batches. - // 1. Reassign the batch size. - if (batch_size == -1) { - if (!inputs[0].lod.empty()) { - batch_size = inputs[0].lod[0].size() - 1; - } else { - batch_size = inputs[0].shape[0]; - } - } - // 2. If the data don't need to be batched, run it directly. - if (batch_size <= this->config_.init_batch_size) { - return this->RunImpl(inputs, output_data); - } - // 3. Check the batch size and define temporary variables. - std::vector cur_inputs; - std::vector outputs_master; - std::vector> outputs_vec; - for (const auto &input : inputs) { - if (!input.lod.empty()) { - if (input.lod.size() != 1) { - return false; - } - if (input.lod[0].size() - 1 != batch_size) { - return false; - } - } else { - LOG(INFO) << "Non-lod mode to be implemented."; - return false; - } - PaddleTensor tensor; - tensor.name = input.name; - tensor.dtype = PaddleDType::FLOAT32; - cur_inputs.push_back(tensor); - } - for (auto output : *output_data) { - PaddleTensor tensor; - tensor.name = output.name; - outputs_master.push_back(tensor); - } - // 4. Batch execution. - for (size_t start_batch = 0; start_batch < batch_size;) { - auto end_batch = start_batch + this->config_.init_batch_size; - if (end_batch > batch_size) { - end_batch = batch_size; - } - auto cur_outputs = outputs_master; - for (size_t i = 0; i < inputs.size(); i++) { - auto start = inputs[i].lod[0][start_batch]; - auto end = inputs[i].lod[0][end_batch]; - std::vector offsets; - for (size_t j = start_batch; j <= end_batch; j++) { - offsets.push_back(inputs[i].lod[0][j] - - inputs[i].lod[0][start_batch]); - } - auto mem_start = static_cast(inputs[i].data.data()) + start; - cur_inputs[i].data = - PaddleBuf(mem_start, (end - start) * sizeof(float)); - cur_inputs[i].lod = std::vector>({offsets}); - cur_inputs[i].shape = - std::vector({static_cast(end - start), 1, 1, 1}); - } - if (!this->RunImpl(cur_inputs, &cur_outputs)) { - return false; - } - outputs_vec.push_back(cur_outputs); - start_batch = end_batch; - } - // 5. Copy the results to contiguous memory. 
- // Assume that each batch has the same final outputs size. - auto count = [](const std::vector &v) { - int cnt = 1; - for_each(v.begin(), v.end(), [&cnt](int n) { cnt *= n; }); - return cnt; - }; - for (size_t i = 0; i < output_data->size(); i++) { - std::vector shape = outputs_vec[i][0].shape; - shape[0] = batch_size; - int total_cnt = count(shape); - (*output_data)[i].shape = shape; - (*output_data)[i].data.Resize(total_cnt * sizeof(float)); - float *addr = static_cast((*output_data)[i].data.data()); - for (const auto &single_out : outputs_vec) { - int cnt = count(single_out[i].shape); - memcpy(addr, single_out[i].data.data(), cnt * sizeof(float)); - addr += cnt; - } - } + // construct executer + if (executor_p_ == nullptr) { + executor_p_ = new anakin::Net(graph_, true); } return true; } -template -bool PaddleInferenceAnakinPredictor::RunImpl( + +template +bool PaddleInferenceAnakinPredictor::Run( const std::vector &inputs, - std::vector *output_data) { + std::vector *output_data, int batch_size) { for (const auto &input : inputs) { if (input.dtype != PaddleDType::FLOAT32) { - LOG(FATAL) << "Only support float type inputs. " << input.name - << "'s type is not float"; + VLOG(3) << "Only support float type inputs. " << input.name + << "'s type is not float"; + return false; } - auto d_tensor_p = this->executor_p_->get_in(input.name); - auto net_shape = d_tensor_p->shape(); + auto d_tensor_in_p = executor_p_->get_in(input.name); + auto net_shape = d_tensor_in_p->shape(); if (net_shape.size() != input.shape.size()) { - LOG(FATAL) << " input " << input.name - << "'s shape size should be equal to that of net"; + VLOG(3) << " input " << input.name + << "'s shape size should be equal to that of net"; + return false; } int sum = 1; for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; }); if (sum > net_shape.count()) { - if (this->config_.re_allocable) { - this->graph_p_->Reshape(input.name, input.shape); - delete this->executor_p_; - this->InitNet(); - d_tensor_p = this->executor_p_->get_in(input.name); - } else { - LOG(FATAL) - << "Run failed because Anakin was expected not to reallocate " - "memory."; - } + graph_.Reshape(input.name, input.shape); + delete executor_p_; + executor_p_ = new anakin::Net(graph_, true); + d_tensor_in_p = executor_p_->get_in(input.name); } - std::vector tmp_shape; + + anakin::saber::Shape tmp_shape; for (auto s : input.shape) { tmp_shape.push_back(s); } - auto *data = static_cast(input.data.data()); - anakin::saber::Tensor::Host_type> - h_tensor(data, typename anakin::DefaultHostType::Host_type(), 0, - tmp_shape); - d_tensor_p->reshape(tmp_shape); + d_tensor_in_p->reshape(tmp_shape); if (input.lod.size() > 0) { if (input.lod.size() > 1) { - LOG(FATAL) << " input lod first dim should <=1, but you set " - << input.lod.size(); + VLOG(3) << " input lod first dim should <=1, but you set " + << input.lod.size(); + return false; } - std::vector lod(input.lod[0].begin(), input.lod[0].end()); - std::vector> offset({lod}); - d_tensor_p->set_seq_offset(offset); - VLOG(3) << "offset.size(): " << offset[0].size(); - for (int i = 0; i < offset[0].size(); i++) { - VLOG(3) << offset[0][i]; + std::vector offset(input.lod[0].begin(), input.lod[0].end()); + d_tensor_in_p->set_seq_offset(offset); + VLOG(3) << "offset.size(): " << offset.size(); + for (int i = 0; i < offset.size(); i++) { + VLOG(3) << offset[i]; + } + } + + float *d_data_p = d_tensor_in_p->mutable_data(); + +#ifdef PADDLE_WITH_CUDA + if (std::is_same::value) { + if (cudaMemcpy(d_data_p, 
static_cast(input.data.data()), + d_tensor_in_p->valid_size() * sizeof(float), + cudaMemcpyHostToDevice) != 0) { + VLOG(3) << "copy data from CPU to GPU error"; + return false; } } - d_tensor_p->copy_from(h_tensor); +#endif + if (std::is_same::value) { + memcpy(d_data_p, static_cast(input.data.data()), + d_tensor_in_p->valid_size() * sizeof(float)); + } } - this->Predict(); +#ifdef PADDLE_WITH_CUDA + cudaDeviceSynchronize(); + executor_p_->prediction(); + cudaDeviceSynchronize(); +#endif + if (output_data->empty()) { - LOG(FATAL) << "At least one output should be set with tensors' names."; + VLOG(3) << "At least one output should be set with tensors' names."; + return false; } for (auto &output : *output_data) { - auto *d_tensor_p = this->executor_p_->get_out(output.name); - output.shape = d_tensor_p->valid_shape(); - if (output.data.length() < d_tensor_p->valid_size() * sizeof(float)) { - output.data.Resize(d_tensor_p->valid_size() * sizeof(float)); + auto *tensor = executor_p_->get_out(output.name); + output.shape = tensor->valid_shape(); + if (output.data.length() < tensor->valid_size() * sizeof(float)) { + output.data.Resize(tensor->valid_size() * sizeof(float)); + } + +#if PADDLE_WITH_CUDA + if (std::is_same::value) { + // Copy data from GPU -> CPU + if (cudaMemcpy(output.data.data(), tensor->mutable_data(), + tensor->valid_size() * sizeof(float), + cudaMemcpyDeviceToHost) != 0) { + VLOG(3) << "copy data from GPU to CPU error"; + return false; + } + } +#endif + if (std::is_same::value) { + memcpy(output.data.data(), tensor->mutable_data(), + tensor->valid_size() * sizeof(float)); } - auto *data = static_cast(output.data.data()); - anakin::saber::Tensor::Host_type> - h_tensor(data, typename anakin::DefaultHostType::Host_type(), 0, - d_tensor_p->valid_shape()); - h_tensor.copy_from(*d_tensor_p); } return true; } -template -bool PaddleInferenceAnakinPredictor::ResetConfig( - const AnakinConfig &config) { - this->config_ = config; - return true; -} -template -anakin::Net &PaddleInferenceAnakinPredictor::ResetExecuter( - std::shared_ptr> graph_p) { - this->graph_p_ = graph_p; - this->ctx_p_ = std::make_shared>( - this->config_.device_id, this->config_.data_stream_id, - this->config_.compute_stream_id); - this->InitNet(); - return *this->executor_p_; + +template +anakin::Net + &PaddleInferenceAnakinPredictor::get_executer() { + return *executor_p_; } + // the cloned new Predictor of anakin share the same net weights from original // Predictor -template +template std::unique_ptr -PaddleInferenceAnakinPredictor::Clone() { +PaddleInferenceAnakinPredictor::Clone() { VLOG(3) << "Anakin Predictor::clone"; std::unique_ptr cls( - new PaddleInferenceAnakinPredictor()); + new PaddleInferenceAnakinPredictor()); // construct executer from other graph auto anakin_predictor_p = - dynamic_cast *>(cls.get()); + dynamic_cast *>(cls.get()); if (!anakin_predictor_p) { - LOG(FATAL) << "fail to call Init"; + VLOG(3) << "fail to call Init"; + return nullptr; } - anakin_predictor_p->ResetConfig(this->config_); - anakin_predictor_p->ResetExecuter(this->graph_p_); - return cls; -} + anakin_predictor_p->get_executer().init(graph_); -#ifdef ANAKIN_MLU_PLACE -template -void PaddleInferenceAnakinMLUPredictor::SetContext() { - this->ctx_p_ = std::make_shared>( - this->config_.device_id, this->config_.data_stream_id, - this->config_.compute_stream_id); - this->ctx_p_->set_model_parallel(this->config_.model_parallel); - this->ctx_p_->set_fusion(this->config_.op_fuse); + return std::move(cls); } -template -void 
PaddleInferenceAnakinMLUPredictor::OptimizeGraph() { - if (!this->graph_p_->fusion_optimize(this->config_.op_fuse)) { - LOG(FATAL) << "Graph optimization error."; - } -} -template -void PaddleInferenceAnakinMLUPredictor::InitNet() { - std::unique_lock lock(this->mutex_); - this->executor_p_ = new anakin::Net(); - this->executor_p_->fusion_init(*this->graph_p_, this->ctx_p_, true); -} -template -void PaddleInferenceAnakinMLUPredictor::Predict() { - anakin::TargetWrapper::device_sync(); - this->executor_p_->fusion_prediction(); - anakin::TargetWrapper::device_sync(); -} -#endif #ifdef PADDLE_WITH_CUDA -template class PaddleInferenceAnakinPredictor< - anakin::NV, anakin::Precision::FP32, ::anakin::OpRunType::ASYNC>; -#endif -#ifdef ANAKIN_X86_PLACE -template class PaddleInferenceAnakinPredictor< - anakin::X86, anakin::Precision::FP32, ::anakin::OpRunType::ASYNC>; -#endif -#ifdef ANAKIN_MLU_PLACE -template class PaddleInferenceAnakinMLUPredictor; +template class PaddleInferenceAnakinPredictor; #endif +template class PaddleInferenceAnakinPredictor; // A factory to help create difference predictor. template <> std::unique_ptr CreatePaddlePredictor( const contrib::AnakinConfig &config) { -#ifdef PADDLE_WITH_CUDA + VLOG(3) << "Anakin Predictor create."; if (config.target_type == contrib::AnakinConfig::NVGPU) { - return std::unique_ptr( - new PaddleInferenceAnakinPredictor(config)); - } -#endif -#ifdef ANAKIN_X86_PLACE - if (config.target_type == contrib::AnakinConfig::X86) { - return std::unique_ptr( - new PaddleInferenceAnakinPredictor(config)); - } +#ifdef PADDLE_WITH_CUDA + VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ]."; + std::unique_ptr x( + new PaddleInferenceAnakinPredictor(config)); + return x; +#else + LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment"; + return nullptr; #endif -#ifdef ANAKIN_MLU_PLACE - if (config.target_type == contrib::AnakinConfig::MLU) { - return std::unique_ptr( - new PaddleInferenceAnakinMLUPredictor( - config)); + } else if (config.target_type == contrib::AnakinConfig::X86) { + VLOG(3) << "Anakin Predictor create on [ Intel X86 ]."; + std::unique_ptr x( + new PaddleInferenceAnakinPredictor(config)); + return x; + } else { + VLOG(3) << "Anakin Predictor create on unknown platform."; + return nullptr; } -#endif - LOG(FATAL) << "Anakin Predictor create on unknown platform."; - return nullptr; } -template -void DisplayOpTimer(anakin::Net *net_executor, int epoch) { + #ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER +template +using executor_t = + anakin::Net; + +template +void DisplayOpTimer(executor_t *net_executor, int epoch) { std::vector op_time = net_executor->get_op_time(); auto exec_funcs = net_executor->get_exec_funcs(); auto op_param = net_executor->get_op_param(); @@ -386,13 +255,16 @@ void DisplayOpTimer(anakin::Net *net_executor, int epoch) { for (auto it = op_map.begin(); it != op_map.end(); ++it) { LOG(INFO) << it->first << " " << (it->second) / epoch << " ms"; } -#endif } -template -PaddleInferenceAnakinPredictor::~PaddleInferenceAnakinPredictor() { - DisplayOpTimer(this->executor_p_, this->config_.init_batch_size); - delete this->executor_p_; - this->executor_p_ = nullptr; +#endif + +template +PaddleInferenceAnakinPredictor::~PaddleInferenceAnakinPredictor() { +#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER + DisplayOpTimer(executor_p_, max_batch_size_); +#endif + delete executor_p_; + executor_p_ = nullptr; } } // namespace paddle diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h 
index 0f0d7febe2e..35994668dfb 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ limitations under the License. */ #include #include - #include "framework/core/net/net.h" #include "framework/graph/graph.h" #include "paddle/fluid/inference/api/paddle_anakin_config.h" @@ -31,18 +30,13 @@ limitations under the License. */ namespace paddle { using contrib::AnakinConfig; -using anakin::Precision; -using anakin::OpRunType; -template +template class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: - PaddleInferenceAnakinPredictor() = default; + PaddleInferenceAnakinPredictor() {} - explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config) - : config_(config) { - this->InitPredictor(); - } + explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config); // NOTE Unlike the native engine, the buffers of anakin engine's output_data // should be allocated first. @@ -51,45 +45,21 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { int batch_size = -1) override; std::unique_ptr Clone() override; - virtual bool ResetConfig(const AnakinConfig& config); - virtual anakin::Net& ResetExecuter( - std::shared_ptr> graph_p); - void InitPredictor(); - ~PaddleInferenceAnakinPredictor() override; + anakin::Net& + get_executer(); - protected: - void InitEnv(); - void InitGraph(); - virtual void OptimizeGraph(); - virtual void InitNet(); - virtual void SetContext(); - virtual void Predict(); - static std::mutex mutex_; - AnakinConfig config_; - std::shared_ptr> ctx_p_; - std::shared_ptr> graph_p_; - anakin::Net* executor_p_{nullptr}; + ~PaddleInferenceAnakinPredictor() override; private: - bool RunImpl(const std::vector& inputs, - std::vector* output_data); - static std::once_flag init_anakin_; -}; + bool Init(const AnakinConfig& config); -#ifdef ANAKIN_MLU_PLACE -template -class PaddleInferenceAnakinMLUPredictor final - : public PaddleInferenceAnakinPredictor { - public: - explicit PaddleInferenceAnakinMLUPredictor(const AnakinConfig& config) { - this->ResetConfig(config); - this->InitPredictor(); - } - void SetContext() override; - void OptimizeGraph() override; - void InitNet() override; - void Predict() override; + anakin::graph::Graph + graph_; + anakin::Net* + executor_p_{nullptr}; + AnakinConfig config_; + int max_batch_size_{0}; }; -#endif + } // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 8c4ce84fa61..19ef402d6fd 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -53,8 +53,10 @@ if (WIN32) safe_set_static_flag() add_definitions(-DSTATIC_LIB) endif() + set(CMAKE_STATIC_LIBRARY_PREFIX "lib") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + set(CMAKE_STATIC_LIBRARY_PREFIX "") endif() message("flags" ${CMAKE_CXX_FLAGS}) @@ -123,8 +125,11 @@ if (NOT WIN32) else() set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags_static protobuf snappy zlibstatic xxhash snappystream ${EXTERNAL_LIB}) - set(DEPS ${DEPS} libcmt shlwapi.lib) + ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf + 
${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash + snappystream ${EXTERNAL_LIB}) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + set(DEPS ${DEPS} libcmt ${os_dependency_modules}) endif(NOT WIN32) if(WITH_GPU) diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index e5820c3637b..ab7f5533748 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -21,7 +21,6 @@ #endif #include #include // NOLINT -#include #include #include #include @@ -64,12 +63,9 @@ static int GetUniqueId() { } static void split(const std::string &str, char sep, - std::vector *pieces, bool ignore_null = true) { + std::vector *pieces) { pieces->clear(); if (str.empty()) { - if (!ignore_null) { - pieces->push_back(str); - } return; } size_t pos = 0; @@ -83,63 +79,26 @@ static void split(const std::string &str, char sep, pieces->push_back(str.substr(pos)); } } - -template -static T convert(const std::string &item, - std::function func) { - T res; - try { - res = func(item); - } catch (std::invalid_argument &e) { - std::string message = - "invalid_argument exception when try to convert : " + item; - LOG(ERROR) << message; - PADDLE_THROW(message); - } catch (std::out_of_range &e) { - std::string message = - "out_of_range exception when try to convert : " + item; - LOG(ERROR) << message; - PADDLE_THROW(message); - } catch (...) { - std::string message = "unexpected exception when try to convert " + item; - LOG(ERROR) << message; - PADDLE_THROW(message); - } - return res; -} - static void split_to_float(const std::string &str, char sep, std::vector *fs) { std::vector pieces; split(str, sep, &pieces); std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs), - [](const std::string &v) { - return convert(v, [](const std::string &item) { - return std::stof(item); - }); - }); + [](const std::string &v) { return std::stof(v); }); } static void split_to_int64(const std::string &str, char sep, std::vector *is) { std::vector pieces; split(str, sep, &pieces); std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is), - [](const std::string &v) { - return convert(v, [](const std::string &item) { - return std::stoll(item); - }); - }); + [](const std::string &v) { return std::stoi(v); }); } static void split_to_int(const std::string &str, char sep, std::vector *is) { std::vector pieces; split(str, sep, &pieces); std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is), - [](const std::string &v) { - return convert(v, [](const std::string &item) { - return std::stoi(item); - }); - }); + [](const std::string &v) { return std::stoi(v); }); } template std::string to_string(const std::vector &vec) { diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 9d560ddd2e0..de75e884f53 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -50,48 +50,40 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { auto glambda = [&](const VariableNameMap& connections, bool is_output) { for (auto const& conn : connections) { - for (const auto& var_name : conn.second) { - // skip if scale already computed - if (scales_.find(var_name) != scales_.end()) return; - - auto* var = predictor_.sub_scope_->FindVar(var_name); - PADDLE_ENFORCE(var, "%s is not in the scope", var_name); - PADDLE_ENFORCE(var->IsType(), - "Only support lod tensor now."); - 
LoDTensor* var_tensor = var->GetMutable(); - - // force unsigned type if already know it - bool is_unsigned = false; - if (is_output && op->Type() == "conv2d") { - // output of conv2d with relu must be unsigned - is_unsigned = (op->HasAttr("fuse_relu") && - boost::get(op->GetAttr("fuse_relu"))) || - (op->HasAttr("fuse_brelu") && - boost::get(op->GetAttr("fuse_brelu"))); - } else if (is_output && op->Type() == "relu") { - is_unsigned = true; - } else if (is_output && - (op->Type() == "pool2d" || op->Type() == "transpose2" || - op->Type() == "reshape2" || op->Type() == "concat")) { - // output of ops with unsigned input must be unsigned - is_unsigned = true; - for (auto input_var_name : op->Input("X")) { - PADDLE_ENFORCE(scales_.find(input_var_name) != scales_.end(), - "Input scales must be calculated before the " - "output scales to infer if output is unsigned."); - is_unsigned = is_unsigned && scales_[input_var_name].first; - } + if (conn.second.size() == 0) continue; + auto& var_name = conn.second[0]; + + // skip if scale already computed + if (scales_.find(var_name) != scales_.end()) return; + + auto* var = predictor_.sub_scope_->FindVar(var_name); + PADDLE_ENFORCE(var, "%s is not in the scope", var_name); + PADDLE_ENFORCE(var->IsType(), + "Only support lod tensor now."); + LoDTensor* var_tensor = var->GetMutable(); + + // force unsigned type if already know it + bool is_unsigned = false; + if (is_output && op->Type() == "conv2d") { + // output of conv2d with relu must be unsigned + is_unsigned = op->HasAttr("fuse_relu") && + boost::get(op->GetAttr("fuse_relu")); + } else if (is_output && op->Type() == "pool2d") { + // output of pool2d with unsigned input must be unsigned + auto input_var_name = op->Input("X")[0]; + if (scales_.find(input_var_name) != scales_.end()) { + is_unsigned = scales_[input_var_name].first; } - - CalculateSingleScale(op->Type(), conn.first, var_name, *var_tensor, - is_unsigned); } + + CalculateSingleScale(op->Type(), conn.first, var_name, *var_tensor, + is_unsigned); } }; - // handle inputs first to let is_unsigned be inferred for the outputs - glambda(connections_in, false /* is_output */); + // handle outputs first so unsigned outputs could be inferred glambda(connections_out, true /* is_output */); + glambda(connections_in, false /* is_output */); } } @@ -361,9 +353,8 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { arg.SetMainProgramNotOwned(predictor_.inference_program_.get()); auto graph = std::unique_ptr(new Graph(arg.main_program())); arg.SetMainGraph(graph.release()); - auto* scope_ptr = arg.scope_ptr(); - PADDLE_ENFORCE(scope_ptr); - arg.main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); + arg.main_graph().Set(framework::ir::kParamScopeAttr, + new framework::Scope*(arg.scope_ptr())); auto* builder = predictor_.config_.pass_builder(); builder->SetPasses({ diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h index aea4a0ac93d..f4b0df5d742 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.h +++ b/paddle/fluid/inference/api/mkldnn_quantizer.h @@ -45,8 +45,9 @@ using VarQuantScale = class AnalysisPredictor::MkldnnQuantizer { public: - explicit MkldnnQuantizer(AnalysisPredictor& predictor, // NOLINT - const MkldnnQuantizerConfig* qconfig) + explicit MkldnnQuantizer( + AnalysisPredictor& predictor, // NOLINT + const std::shared_ptr& qconfig) : predictor_(predictor), qconfig_(qconfig) {} // Execute full quantization procedure. 
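To make the scale-calculation hunk above easier to follow: the quantizer must decide, per tensor, whether its quantization scale is unsigned, and the richer variant on the removed side of the hunk derives that from the producing op. A simplified restatement (the real code reads fuse_relu/fuse_brelu from the op desc and enforces that input scales are computed before output scales):

    #include <string>

    bool OutputIsUnsigned(const std::string &op_type, bool fuse_relu,
                          bool fuse_brelu, bool all_inputs_unsigned) {
      if (op_type == "relu") return true;  // relu output is non-negative
      if (op_type == "conv2d") return fuse_relu || fuse_brelu;
      if (op_type == "pool2d" || op_type == "transpose2" ||
          op_type == "reshape2" || op_type == "concat") {
        // Data-movement ops keep the signedness of their inputs.
        return all_inputs_unsigned;
      }
      return false;  // default: treat as signed
    }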
@@ -94,7 +95,7 @@ class AnalysisPredictor::MkldnnQuantizer { private: AnalysisPredictor& predictor_; - const MkldnnQuantizerConfig* qconfig_; + const std::shared_ptr qconfig_; // A map: variable name -> scale VarQuantScale scales_; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index a7cb785fe95..f9ff542d86d 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -22,13 +22,10 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["conv2d"]["Filter"] = ScaleAlgo::MAX_CH; rules_["conv2d"]["Bias"] = ScaleAlgo::NONE; // do not compute scale rules_["conv2d"]["ResidualData"] = ScaleAlgo::KL; - rules_["conv2d"]["Output"] = ScaleAlgo::KL; + rules_["conv2d"]["Output"] = ScaleAlgo::KL; // do not compute scale rules_["pool2d"]["X"] = ScaleAlgo::KL; - rules_["pool2d"]["Out"] = ScaleAlgo::KL; - - rules_["concat"]["X"] = ScaleAlgo::KL; - rules_["concat"]["Out"] = ScaleAlgo::KL; + rules_["pool2d"]["Out"] = ScaleAlgo::KL; // do not compute scale } ScaleAlgo MkldnnQuantizerConfig::scale_algo( diff --git a/paddle/fluid/inference/api/paddle_anakin_config.h b/paddle/fluid/inference/api/paddle_anakin_config.h index 7c0e2f06ff4..0e91c2624be 100644 --- a/paddle/fluid/inference/api/paddle_anakin_config.h +++ b/paddle/fluid/inference/api/paddle_anakin_config.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,7 +14,6 @@ #pragma once #include -#include #include #include #include @@ -25,22 +24,11 @@ namespace paddle { namespace contrib { // Configurations for Anakin engine. struct AnakinConfig : public PaddlePredictor::Config { - enum TargetType { NVGPU = 0, X86, MLU }; - int device_id{0}; + enum TargetType { NVGPU = 0, X86 }; + int device; std::string model_file; - std::map> init_inputs_shape; - int init_batch_size{-1}; - bool re_allocable{true}; - int max_stream{4}; - int data_stream_id{0}; - int compute_stream_id{0}; + int max_batch_size{-1}; TargetType target_type; -#ifdef ANAKIN_MLU_PLACE - int model_parallel{8}; - int data_parallel{1}; - bool op_fuse{false}; - bool sparse{false}; -#endif }; } // namespace contrib diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e3682d27054..ebe289322bd 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -142,8 +142,7 @@ struct AnalysisConfig { void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3, Precision precision = Precision::kFloat32, - bool use_static = false, - bool use_calib_mode = false); + bool use_static = false); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -169,13 +168,6 @@ struct AnalysisConfig { */ void SwitchIrDebug(int x = true); - /** Turn on NGRAPH. - */ - void EnableNgraph(); - /** A boolean state telling whether to use the NGRAPH. - */ - bool ngraph_enabled() const { return use_ngraph_; } - /** Turn on MKLDNN. 
*/ void EnableMKLDNN(); @@ -210,7 +202,7 @@ struct AnalysisConfig { */ bool mkldnn_quantizer_enabled() const { return use_mkldnn_quantizer_; } - MkldnnQuantizerConfig* mkldnn_quantizer_config() const; + std::shared_ptr mkldnn_quantizer_config() const; /** Specify the memory buffer of program and parameter * @param prog_buffer the memory buffer of program. @@ -232,8 +224,6 @@ struct AnalysisConfig { bool force_update_static_cache = false); /** Tell whether the memory optimization is activated. */ bool enable_memory_optim() const; - void SetInValid() const { is_valid_ = false; } - bool is_valid() const { return is_valid_; } friend class ::paddle::AnalysisPredictor; @@ -241,7 +231,6 @@ struct AnalysisConfig { * Get a pass builder for customize the passes in IR analysis phase. */ PassStrategy* pass_builder() const; - void PartiallyRelease(); protected: // Update the config. @@ -252,8 +241,8 @@ struct AnalysisConfig { protected: // Model pathes. std::string model_dir_; - mutable std::string prog_file_; - mutable std::string params_file_; + std::string prog_file_; + std::string params_file_; // GPU related. bool use_gpu_{false}; @@ -277,14 +266,12 @@ struct AnalysisConfig { int tensorrt_min_subgraph_size_{3}; Precision tensorrt_precision_mode_; bool trt_use_static_engine_; - bool trt_use_calib_mode_; // memory reuse related. bool enable_memory_optim_{false}; bool static_memory_optim_{false}; bool static_memory_optim_force_update_{false}; - bool use_ngraph_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; @@ -315,11 +302,6 @@ struct AnalysisConfig { bool use_mkldnn_quantizer_{false}; std::shared_ptr mkldnn_quantizer_config_; - // If the config is already used on a predictor, it becomes invalid. - mutable bool is_valid_{true}; - // Any config can only be used with one predictor. - // Variables held by config can take up a lot of memory in some cases. - // So we release the memory when the predictor is set up. }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 2906a4926f7..1785bd520a1 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -28,6 +28,6 @@ limitations under the License. 
*/ #include "paddle_analysis_config.h" // NOLINT #include "paddle_api.h" // NOLINT -#if (defined WITH_ANAKIN) || (defined PADDLE_WITH_ANAKIN) +#ifdef WITH_ANAKIN #include "paddle_anakin_config.h" // NOLINT #endif diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index bc2c0914728..2a7bd55a76e 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -70,24 +70,6 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } -const std::vector kTRTSubgraphPasses({ - "infer_clean_graph_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "quant_conv2d_dequant_fuse_pass", // - "delete_quant_dequant_op_pass", // - // "fc_fuse_pass", // - "tensorrt_subgraph_pass", // - "conv_bn_fuse_pass", // -#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be - // guaranteed at least v7 - "conv_elementwise_add_act_fuse_pass", // - "conv_elementwise_add2_act_fuse_pass", // - "conv_elementwise_add_fuse_pass", // -#endif // - "transpose_flatten_concat_fuse_pass", -}); - // The following passes works for Anakin sub-graph engine. const std::vector kAnakinSubgraphPasses({ "infer_clean_graph_pass", // @@ -109,7 +91,6 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "conv_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", // "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 "conv_elementwise_add_act_fuse_pass", // @@ -117,8 +98,9 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "conv_elementwise_add_fuse_pass", // #endif // "transpose_flatten_concat_fuse_pass", - // following pass should be located in the last, since it will + // following two passes should be located in the last, since they will // work on all fused ops. + "expected_kernel_cache_pass", // "runtime_context_cache_pass" }); @@ -133,10 +115,6 @@ void GpuPassStrategy::EnableMkldnnQuantizer() { LOG(ERROR) << "GPU not support MKL-DNN quantization"; } -void GpuPassStrategy::EnableNgraph() { - LOG(ERROR) << "GPU not support Ngraph yet"; -} - CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { // NOTE the large fusions should be located in the front, so that they will // not be damaged by smaller ones. @@ -156,8 +134,9 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // - // following pass should be located in the last, since - // it will work on all fused ops. + // following two passes should be located in the last, since + // they will work on all fused ops. 
+ "expected_kernel_cache_pass", // "runtime_context_cache_pass"}); use_gpu_ = false; @@ -169,20 +148,14 @@ void CpuPassStrategy::EnableMKLDNN() { if (!use_mkldnn_) { passes_.insert(passes_.begin(), "mkldnn_placement_pass"); - for (auto &pass : std::vector({ - "depthwise_conv_mkldnn_pass", // - "conv_bn_fuse_pass", // Execute BN passes again to - "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_bias_mkldnn_fuse_pass", // - "conv_transpose_bias_mkldnn_fuse_pass", - "conv3d_bias_mkldnn_fuse_pass", // - "conv_elementwise_add_mkldnn_fuse_pass", - "conv_concat_relu_mkldnn_fuse_pass", - "conv_relu_mkldnn_fuse_pass", // - "conv_brelu_mkldnn_fuse_pass", // - // Disabled due to topology-dependent speed-up - // "fc_mkldnn_pass" - })) { + for (auto &pass : std::vector( + {"depthwise_conv_mkldnn_pass", // + "conv_bn_fuse_pass", // Execute BN passes again to + "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order + "conv_bias_mkldnn_fuse_pass", // + "conv3d_bias_mkldnn_fuse_pass", // + "conv_elementwise_add_mkldnn_fuse_pass", + "conv_relu_mkldnn_fuse_pass"})) { passes_.push_back(pass); } } @@ -203,14 +176,4 @@ void CpuPassStrategy::EnableMkldnnQuantizer() { #endif } -void CpuPassStrategy::EnableNgraph() { -#ifdef PADDLE_WITH_NGRAPH - if (!use_ngraph_) { - passes_.insert(passes_.begin(), "ngraph_subgraph_pass"); - } - use_ngraph_ = true; -#else - use_ngraph_ = false; -#endif -} } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 4236399aa1a..057e7dc65d5 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -90,10 +90,6 @@ class PassStrategy : public PaddlePassBuilder { */ virtual void EnableMKLDNN() {} - /** Enable NGRAPH optimization - */ - virtual void EnableNgraph() {} - /** Enable MKLDNN quantize optimization */ virtual void EnableMkldnnQuantizer() {} @@ -103,7 +99,6 @@ class PassStrategy : public PaddlePassBuilder { virtual ~PassStrategy() = default; protected: - bool use_ngraph_{false}; bool use_gpu_{false}; bool use_mkldnn_{false}; }; @@ -117,19 +112,16 @@ class CpuPassStrategy : public PassStrategy { explicit CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.AllPasses()) { use_gpu_ = other.use_gpu_; - use_ngraph_ = other.use_ngraph_; use_mkldnn_ = other.use_mkldnn_; use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_; } virtual ~CpuPassStrategy() = default; - void EnableNgraph() override; void EnableMKLDNN() override; void EnableMkldnnQuantizer() override; protected: - bool use_ngraph_{false}; bool use_mkldnn_quantizer_{false}; }; @@ -144,14 +136,12 @@ class GpuPassStrategy : public PassStrategy { use_gpu_ = true; } - void EnableNgraph() override; void EnableMKLDNN() override; void EnableMkldnnQuantizer() override; virtual ~GpuPassStrategy() = default; }; -extern const std::vector kTRTSubgraphPasses; extern const std::vector kAnakinSubgraphPasses; } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 854007ce801..840abd26a75 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -8,37 +8,35 @@ nv_library(tensorrt_converter nv_test(test_op_converter SRCS test_op_converter.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter) -# TODO(xingzhaolong): fix the the following ci ut error. 
- -#nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) -#nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op) -#nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op) -#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op) -#nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op) -#nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op tensorrt_plugin) -#nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin -# elementwise_add_op elementwise_mul_op) -#nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op) -#nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine batch_norm_op) -#nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine concat_op) -#nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine dropout_op) -#nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op) -#nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin -# split_op concat_op) -#nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin -# prelu_op) -#nv_test(test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc -# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op) +nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) +nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op SERIAL) +nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op SERIAL) +nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op SERIAL) +nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op SERIAL) +nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op tensorrt_plugin SERIAL) +nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin + elementwise_add_op elementwise_mul_op SERIAL) +nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op SERIAL) +nv_test(test_trt_batch_norm_op 
SRCS test_batch_norm_op.cc batch_norm_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine batch_norm_op SERIAL) +nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine concat_op SERIAL) +nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine dropout_op SERIAL) +nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op SERIAL) +nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin + split_op concat_op SERIAL) +nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin + prelu_op SERIAL) +nv_test(test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op SERIAL) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 5c2454fa9a3..0b756534ec6 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -43,13 +43,12 @@ class ActivationOpConverter : public OpConverter { engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor), op_pair->second); auto output_name = op_desc.Output("Out")[0]; - - RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode); - if (op_desc.HasAttr("out_scale")) { -#if IS_TRT_VERSION_GE(5000) - float out_scale = boost::get<float>(op_desc.GetAttr("out_scale")); - engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); -#endif + layer->setName((op_type_ + " (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside.
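// (Condensed sketch of the registration idiom this comment describes, which
// every converter in this patch repeats:
//   layer->getOutput(0)->setName(output_name.c_str());
//   engine_->SetITensor(output_name, layer->getOutput(0));
//   if (test_mode) engine_->DeclareOutput(output_name);  // tests fetch by name
// )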
+ engine_->DeclareOutput(output_name); } } diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index d9488684644..d017bac66dd 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -116,12 +116,18 @@ class BatchNormOpConverter : public OpConverter { scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); + layer->setName(("batch_norm (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); engine_->weight_map[op_desc.Input("Bias").front()] = std::move(combile_bias_tensor); engine_->weight_map[op_desc.Input("Scale").front()] = std::move(combile_scale_tensor); - RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); + engine_->SetITensor(output_name, layer->getOutput(0)); + + if (test_mode) { + engine_->DeclareOutput(output_name); + } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index ec771850edf..525ba9dc341 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -42,7 +42,13 @@ class ConcatOpConverter : public OpConverter { axis = axis - 1; // Remove batch dim layer->setAxis(axis); auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "concat", {output_name}, test_mode); + layer->setName(("concat (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. 
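// (Worked example for the `axis = axis - 1` adjustment in the concat
// converter above: the TRT network built here uses an implicit batch
// dimension, so a fluid tensor shaped [N, C, H, W] appears to TRT as
// [C, H, W]; fluid's concat over axis=1 (channels) therefore becomes TRT
// axis=0.)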
+ engine_->DeclareOutput(output_name); + } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 73bfa800f09..39a99a21ea7 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -32,31 +32,25 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, PADDLE_ENFORCE(engine != nullptr); auto* X = engine->GetITensor(op_desc.Input("Input").front()); + + // Declare weights auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable<framework::LoDTensor>(); - float* weight_data = nullptr; - bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8")); - - if (enable_int8) { -#if IS_TRT_VERSION_GE(5000) - float in_scale = boost::get<float>(op_desc.GetAttr("input_scale")); - auto weight_scale = - boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale")); - weight_data = engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t, - true, weight_scale); - engine->SetTensorDynamicRange(X, in_scale); -#endif - } else { - weight_data = - engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t, false); - } - PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL); - const int n_output = Y_t->dims()[0]; - const int n_input = Y_t->dims()[1]; - const int filter_h = Y_t->dims()[2]; - const int filter_w = Y_t->dims()[3]; + platform::CPUPlace cpu_place; + std::unique_ptr<framework::LoDTensor> weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); + + auto* weight_data = weight_tensor->mutable_data<float>(cpu_place); + + PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); + const int n_output = weight_tensor->dims()[0]; + const int n_input = weight_tensor->dims()[1]; + const int filter_h = weight_tensor->dims()[2]; + const int filter_w = weight_tensor->dims()[3]; const int groups = boost::get<int>(op_desc.GetAttr("groups")); const std::vector<int> dilations = boost::get<std::vector<int>>(op_desc.GetAttr("dilations")); @@ -72,7 +66,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_data), - static_cast<size_t>(Y_t->numel())}; + static_cast<size_t>(weight_tensor->numel())}; TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input, @@ -86,16 +80,11 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, auto output_name = op_desc.Output("Output").front(); layer->setName((name + " (Output: " + output_name + ")").c_str()); + engine->weight_map[op_desc.Input("Filter").front()] = + std::move(weight_tensor); layer->getOutput(0)->setName(output_name.c_str()); engine->SetITensor(output_name, layer->getOutput(0)); -#if IS_TRT_VERSION_GE(5000) - if (enable_int8) { - float output_scale = boost::get<float>(op_desc.GetAttr("out_scale")); - engine->SetTensorDynamicRange(layer->getOutput(0), output_scale); - } -#endif - if (test_mode) { engine->DeclareOutput(output_name); } diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 71177e5e66d..ddbc724e3b2 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -55,8 +55,11 @@ class DropoutOpConverter : public OpConverter { engine_->weight_map[op_desc.Output("Out").front() + "_dropout"] = std::move(weight_tensor); auto output_name =
op_desc.Output("Out")[0]; - - RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode); + layer->setName(("dropout (Output: " + output_name + ")").c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index a888b0803df..0c5a1a6ef16 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -55,13 +55,17 @@ class ElementwiseWeightOpConverter : public OpConverter { auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); - float* weight_data = nullptr; - weight_data = - engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t, false); + platform::CPUPlace cpu_place; + std::unique_ptr weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - std::vector dims_y = framework::vectorize2int(Y_t->dims()); + std::vector dims_y = framework::vectorize2int(weight_tensor->dims()); if (static_cast(dims_y.size()) == dims_x.nbDims + 1) { if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); } @@ -88,9 +92,9 @@ class ElementwiseWeightOpConverter : public OpConverter { PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!"); } - TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(Y_t->numel())}; + TensorRTEngine::Weight shift_weights{ + nvinfer1::DataType::kFLOAT, static_cast(weight_data), + weight_tensor->memory_size() / sizeof(float)}; TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, @@ -108,13 +112,14 @@ class ElementwiseWeightOpConverter : public OpConverter { } auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name}, - test_mode); - if (op_desc.HasAttr("out_scale")) { -#if IS_TRT_VERSION_GE(5000) - float out_scale = boost::get(op_desc.GetAttr("out_scale")); - engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); -#endif + layer->setName( + ("elementwise_" + op_type_ + "(Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); } } @@ -133,7 +138,6 @@ class ElementwiseTensorOpConverter : public OpConverter { // Here the two nullptr looks strange, that's because the // framework::OpDesc's constructor is strange. 
framework::OpDesc op_desc(op, nullptr); - nvinfer1::ILayer* layer = nullptr; PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight @@ -149,11 +153,13 @@ class ElementwiseTensorOpConverter : public OpConverter { if (CheckDims(dims_x, dims_y)) { // The two input tensor should have the same dims VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; - nvinfer1::IElementWiseLayer* elet_layer = TRT_ENGINE_ADD_LAYER( + nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, ElementWise, *const_cast(X), *const_cast(Y), op_pair->second); - layer = elet_layer; + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); } else { VLOG(3) << "Convert a fluid elementwise op to TensorRT " "ElementWisePluginLayer"; @@ -162,18 +168,17 @@ class ElementwiseTensorOpConverter : public OpConverter { new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis); plugin->AddInput(X); plugin->AddInput(Y); - nvinfer1::IPluginLayer* plugin_layer = engine_->AddPlugin( + nvinfer1::IPluginLayer* layer = engine_->AddPlugin( const_cast(plugin->GetInputs().data()), 2, reinterpret_cast(plugin)); - layer = plugin_layer; + layer->setName(("elementwise (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); } - RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode); - if (op_desc.HasAttr("out_scale")) { -#if IS_TRT_VERSION_GE(5000) - float out_scale = boost::get(op_desc.GetAttr("out_scale")); - engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); -#endif + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); } } diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index fb7b89b189a..42dcd68e40e 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -53,47 +53,33 @@ class FcOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; - framework::OpDesc op_desc(op, nullptr); - auto input_names = op_desc.InputNames(); - bool with_bias = input_names.size() >= 3; - std::string w_name = "Y"; - std::string i_name = "X"; - if (with_bias) { - w_name = "W"; - i_name = "Input"; - } + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); // Declare inputs - auto* X = engine_->GetITensor(op_desc.Input(i_name).front()); + auto* X = engine_->GetITensor(op_desc.Input("X").front()); // Declare weights - auto* Y_v = scope.FindVar(op_desc.Input(w_name).front()); + auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); // This may trigger a GPU->CPU copy, because TRT's weight can only be // assigned from CPU memory, that can't be avoided. 
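// (The copy idiom used across these converters, as a compact sketch with
// illustrative names:
//   framework::LoDTensor cpu_w;
//   cpu_w.Resize(gpu_w->dims());
//   TensorCopySync(*gpu_w, platform::CPUPlace(), &cpu_w);
//   float* data = cpu_w.mutable_data<float>(platform::CPUPlace());
// The CPU tensor must outlive the engine build, which is why converters park
// it in engine_->weight_map.)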
- float* weight_data = nullptr; - bool enable_int8 = boost::get<bool>(op_desc.HasAttr("enable_int8")); - if (enable_int8) { -#if IS_TRT_VERSION_GE(5000) - float in_scale = boost::get<float>(op_desc.GetAttr("input_scale")); - auto weight_scale = - boost::get<std::vector<float>>(op_desc.GetAttr("weight_scale")); - weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(), - Y_t, true, weight_scale); - engine_->SetTensorDynamicRange(X, in_scale); -#endif - } else { - weight_data = - engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t, false); - } + platform::CPUPlace cpu_place; + framework::LoDTensor weight_tensor; + weight_tensor.Resize(Y_t->dims()); + TensorCopySync((*Y_t), cpu_place, &weight_tensor); - PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix - size_t n_output = Y_t->dims()[1]; + auto* weight_data = weight_tensor.mutable_data<float>(platform::CPUPlace()); + + PADDLE_ENFORCE_EQ(weight_tensor.dims().size(), 2UL); // a matrix + size_t n_output = weight_tensor.dims()[1]; std::unique_ptr<framework::LoDTensor> tmp(new framework::LoDTensor()); - tmp->Resize(Y_t->dims()); + tmp->Resize(weight_tensor.dims()); memcpy(tmp->mutable_data<float>(platform::CPUPlace()), weight_data, Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); @@ -114,32 +100,19 @@ class FcOpConverter : public OpConverter { // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just // handle `mul`, leave `add` as another layer. // DEBUG - float* bias_data = nullptr; - int bias_num = 0; - if (with_bias) { - auto* b_v = scope.FindVar(op_desc.Input("Bias").front()); - auto* b_t = b_v->GetMutable<framework::LoDTensor>(); - bias_data = - engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t, false); - bias_num = b_t->numel(); - } - TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, - static_cast<void*>(bias_data), - static_cast<size_t>(bias_num)}; + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *const_cast<nvinfer1::ITensor*>(X), n_output, tmp_weight.get(), bias.get()); - engine_->weight_map[op_desc.Input(w_name).front()] = std::move(tmp); auto output_name = op_desc.Output("Out").front(); - - RreplenishLayerAndOutput(layer, "fc", {output_name}, test_mode); - if (enable_int8) { -#if IS_TRT_VERSION_GE(5000) - float out_scale = boost::get<float>(op_desc.GetAttr("out_scale")); - engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); -#endif + layer->setName(("fc (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp); + if (test_mode) { + engine_->DeclareOutput(output_name); } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index 7753fda06cf..3f6ed04c46d 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -76,9 +76,15 @@ class LeakyReluOpConverter : public OpConverter { engine_->weight_map.end()); engine_->weight_map[alpha_name] = std::move(alpha_tensor); + std::string layer_name = "leaky_relu (Output: "; auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name}, - test_mode); + output_layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, output_layer->getOutput(0)); + layer_name += output_name; + if (test_mode) { + engine_->DeclareOutput(output_name); + } + output_layer->setName((layer_name + ")").c_str());
} }; diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index f89b0d7efe2..55515569ead 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -170,24 +170,8 @@ class OpConverter { engine->DeclareOutput(output); } engine->FreezeNetwork(); - engine->ClearWeights(); } - void RreplenishLayerAndOutput( - nvinfer1::ILayer* layer, const std::string& layer_type, - const std::vector& output_tensor_names, - bool test_mode = false) { - size_t num_out = output_tensor_names.size(); - for (size_t i = 0; i < num_out; i++) { - layer->getOutput(i)->setName(output_tensor_names[i].c_str()); - engine_->SetITensor(output_tensor_names[i], layer->getOutput(i)); - if (test_mode) { - engine_->DeclareOutput(output_tensor_names[i]); - } - } - layer->setName( - (layer_type + " (Output: " + output_tensor_names[0] + ")").c_str()); - } void SetEngine(TensorRTEngine* engine) { engine_ = engine; } virtual ~OpConverter() {} diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index bcd2166728b..4afcb0aecec 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -51,7 +51,13 @@ class PadOpConverter : public OpConverter { PADDLE_ENFORCE(layer != nullptr); auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "pad", {output_name}, test_mode); + engine_->SetITensor(output_name, layer->getOutput(0)); + layer->setName(("scale (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. 
+ engine_->DeclareOutput(output_name); + } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 1752c52c3f5..1d0d83d1f36 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -148,13 +148,11 @@ class Pool2dOpConverter : public OpConverter { } auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); - - if (op_desc.HasAttr("out_scale")) { -#if IS_TRT_VERSION_GE(5000) - float out_scale = boost::get(op_desc.GetAttr("out_scale")); - engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); -#endif + layer->setName(("pool2d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 01bcd03e522..2ae804106e5 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -58,8 +58,15 @@ class PReluOpConverter : public OpConverter { engine_->weight_map[op_desc.Input("Alpha")[0]] = std::move(alpha_tensor_temp); + std::string layer_name = "prelu (Output: "; auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + layer_name += output_name; + if (test_mode) { + engine_->DeclareOutput(output_name); + } + layer->setName((layer_name + ")").c_str()); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index b0ae1694127..80bfb2d190a 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -34,13 +34,9 @@ class SoftMaxOpConverter : public OpConverter { *const_cast(input1)); auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "softmax", {output_name}, test_mode); - - if (op_desc.HasAttr("out_scale")) { -#if IS_TRT_VERSION_GE(5000) - float out_scale = boost::get(op_desc.GetAttr("out_scale")); - engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); -#endif + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); } } }; diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 388d83d8345..2571abbf698 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -40,7 +40,8 @@ namespace tensorrt { * Get a random float value between [low, high] */ float random(float low, float high) { - static std::mt19937 mt(100); + static std::random_device rd; + static std::mt19937 mt(rd()); std::uniform_real_distribution dist(low, high); return dist(mt); } @@ -158,7 +159,7 @@ class TRTConvertValidation { PADDLE_ENFORCE_LE(batch_size, max_batch_size_); platform::CUDADeviceContext ctx(place_); op_->Run(scope_, place_); - cudaStreamSynchronize(stream_); + std::vector input_output_names; // Note: we need filter the parameter @@ -193,7 +194,6 @@ class TRTConvertValidation { // Execute TRT. 
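// (How a converter unit test drives this helper, sketched from the calls in
// this file; the exact signatures are assumptions:
//   TRTConvertValidation validator(batch_size, parameters, scope, workspace);
//   validator.DeclInputVar("x", dims);  // random inputs via random() above
//   validator.SetOp(op_desc.Proto());   // records the fluid op as reference
//   validator.Execute(batch_size);      // runs both paths and compares
// )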
engine_->Execute(batch_size, &buffers, stream_); - cudaStreamSynchronize(stream_); ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); int index = 0; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index c5ac6f38410..fddf5f11c28 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -53,40 +53,10 @@ void TensorRTEngine::FreezeNetwork() { infer_builder_->setMaxWorkspaceSize(max_workspace_); if (enable_int8_) { infer_builder_->setInt8Mode(true); - if (calibrator_) { - infer_builder_->setInt8Calibrator(calibrator_); - } else { - infer_builder_->setInt8Calibrator(nullptr); - -#if IS_TRT_VERSION_GE(5000) - infer_builder_->setStrictTypeConstraints(true); - for (auto &quant_range : quant_dynamic_range_) { - auto tensor = quant_range.first; - float range = quant_range.second; - tensor->setDynamicRange(-range, range); - } - - std::unordered_set all_t; - for (int i = 0; i < infer_network_->getNbLayers(); i++) { - auto layer = infer_network_->getLayer(i); - for (int j = 0; j < layer->getNbOutputs(); j++) { - all_t.insert(layer->getOutput(j)); - } - } - for (int i = 0; i < infer_network_->getNbInputs(); i++) { - all_t.insert(infer_network_->getInput(i)); - } - - for (auto &t : all_t) { - if (!quant_dynamic_range_.count(t)) { - LOG(WARNING) - << "We are in trt int8 mode(not calibration), scale not setted" - << " for tensor " << t->getName() - << ", this might be ok when trt does not need this range"; - } - } -#endif - } + PADDLE_ENFORCE( + calibrator_ != nullptr, + "The precision mode is 'INT8', the calibrator should not be nullptr"); + infer_builder_->setInt8Calibrator(calibrator_); } infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); @@ -163,47 +133,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { runtime_batch_ = batch_size; } -float *TensorRTEngine::GetWeightCPUData(const std::string &name, - framework::Tensor *weight_tensor, - bool enable_int8, - const std::vector &scale) { - auto w_dims = weight_tensor->dims(); - platform::CPUPlace cpu_place; - PADDLE_ENFORCE(!weight_map.count(name), - "During TRT Op converter: We set weight %s with the same name " - "twice into the weight_map", - name); - weight_map[name].reset(new framework::Tensor()); - weight_map[name]->Resize(weight_tensor->dims()); - TensorCopySync(*weight_tensor, cpu_place, weight_map[name].get()); - float *weight_data = weight_map[name]->mutable_data(cpu_place); - - if (enable_int8) { - // when the op is fc, scale's size should be 1 - // when the op is conv, the scale's size should be w_dims[0] - bool valid_scale_size = - (scale.size() == 1 || scale.size() == static_cast(w_dims[0])); - PADDLE_ENFORCE(valid_scale_size, "TRT int8 quant: invalid scale size"); - for (int i = 0; i < weight_tensor->numel(); i++) { - bool is_valid_int8 = - ((weight_data[i] >= -128) && (weight_data[i] <= 127)); - PADDLE_ENFORCE(is_valid_int8, - "We are in anakin subgraph int8 mode, the weight of conv " - "should be in range [-128, 127]"); - if (scale.size() == 1) { - weight_data[i] *= (scale[0] / 127); - } else { - PADDLE_ENFORCE(w_dims.size() == 4, - "TRT int8 quant : We only use the channel quant for " - "conv op, so the weight dims should be 4."); - int inner_size = w_dims[1] * w_dims[2] * w_dims[3]; - weight_data[i] *= (scale[i / inner_size] / 127); - } - } - } - return weight_data; -} - int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( diff --git 
a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 80af463d274..657dfd9355f 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -18,10 +18,8 @@ limitations under the License. */ #include <memory> #include <string> #include <unordered_map> -#include <unordered_set> #include <vector> #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -133,13 +131,6 @@ class TensorRTEngine { int GetDeviceId() { return device_id_; } nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, int num_inputs, plugin::PluginTensorRT*); - void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) { - quant_dynamic_range_[tensor] = range; - } - - float* GetWeightCPUData(const std::string& name, - framework::Tensor* weight_tensor, bool enable_int8, - const std::vector<float>& scale = {}); // A pointer to CPU memory is needed of the TRT weight. // Before TRT runs, fluid loads weight into GPU storage. @@ -149,12 +140,6 @@ class TensorRTEngine { std::unordered_map<std::string, std::unique_ptr<framework::Tensor>> weight_map; - void ClearWeights() { - for (auto& weight_pair : weight_map) { - weight_pair.second.reset(nullptr); - } - } - private: // Each ICudaEngine object is bound to a specific GPU when it is instantiated, // ensure that the thread is associated with the correct device by calling @@ -199,13 +184,8 @@ infer_ptr<nvinfer1::ICudaEngine> infer_engine_; infer_ptr<nvinfer1::IExecutionContext> infer_context_; infer_ptr<nvinfer1::IHostMemory> ihost_memory_; - std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_; }; // class TensorRTEngine -#define IS_TRT_VERSION_GE(version) \ - ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ - NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version) - // Add an layer__ into engine__ with args ARGS. // For example: // @@ -219,39 +199,6 @@ class TensorRTEngine { #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ engine__->network()->add##layer__(ARGS); -class TRTEngineManager { - public: - bool Empty() const { return engines_.size() == 0; } - bool Has(const std::string& name) const { - if (engines_.count(name) == 0) return false; - return engines_.at(name).get() != nullptr; - } - - TensorRTEngine* Get(const std::string& name) const { - return engines_.at(name).get(); - } - - TensorRTEngine* Create(std::string name, int max_batch, int max_workspace, - bool enable_int8 = false, - TRTInt8Calibrator* calibrator = nullptr, - int device_id = 0, - nvinfer1::ILogger& logger = NaiveLogger::Global()) { - auto* p = new TensorRTEngine(max_batch, max_workspace, enable_int8, - calibrator, device_id, logger); - engines_[name].reset(p); - return p; - } - - void DeleteAll() { - for (auto& item : engines_) { - item.second.reset(nullptr); - } - } - - private: - std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_; -}; - } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 170ca40d659..118019ef5a8 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -13,6 +13,7 @@ // limitations under the License.
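// (Usage sketch for the TRT_ENGINE_ADD_LAYER macro kept in engine.h above;
// the arguments mirror the fc converter earlier in this patch, and the call
// expands to engine_->network()->addFullyConnected(...):
//   auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
//                                      *const_cast<nvinfer1::ITensor*>(X),
//                                      n_output, tmp_weight.get(), bias.get());
// )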
#include "paddle/fluid/inference/tensorrt/op_teller.h" +#include namespace paddle { namespace inference { @@ -31,8 +32,8 @@ struct SimpleOpTypeSetTeller : public Teller { std::unordered_set teller_set{ {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "elementwise_mul", "dropout", "prelu", - "conv2d_transpose", "leaky_relu", "fc"}}; + "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", + "conv2d_transpose", "leaky_relu"}}; }; bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) { diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h index 3363d77af84..99da067d8ff 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.h +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -15,7 +15,6 @@ #pragma once #include #include -#include #include #include "paddle/fluid/framework/op_desc.h" diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 243f5cef008..c0854d4d0a7 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -4,15 +4,9 @@ if(WITH_GPU AND TENSORRT_FOUND) set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor) endif() -function(download_data install_dir data_file) +function(download_model install_dir model_name) if (NOT EXISTS ${install_dir}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file}) - endif() -endfunction() - -function(download_int8_data install_dir data_file) - if (NOT EXISTS ${install_dir}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name}) endif() endfunction() @@ -29,32 +23,22 @@ function(inference_analysis_api_test target install_dir filename) ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt) endfunction() -function(inference_analysis_api_int8_test_build TARGET_NAME filename) - inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark) -endfunction() - -function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir data_path) - inference_analysis_test_run(${TARGET_NAME} - COMMAND ${test_binary} +function(inference_analysis_api_int8_test target model_dir data_dir filename) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark ARGS --infer_model=${model_dir}/model - --infer_data=${data_path} + --infer_data=${data_dir}/data.bin --warmup_batch_size=100 --batch_size=50 --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} --iterations=2) endfunction() -function(inference_analysis_api_test_with_fake_data_build TARGET_NAME filename) - inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}) -endfunction() - -function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary model_dir disable_fc) - inference_analysis_test_run(${TARGET_NAME} - COMMAND ${test_binary} - ARGS --infer_model=${model_dir}/model - --disable_mkldnn_fc=${disable_fc}) +function(inference_analysis_api_test_with_fake_data target install_dir filename model_name) + download_model(${install_dir} ${model_name}) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS 
--infer_model=${install_dir}/model) endfunction() function(inference_analysis_api_test_with_refer_result target install_dir filename) @@ -68,12 +52,12 @@ if(NOT APPLE AND WITH_MKLML) # RNN1 set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") - inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) + inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc SERIAL) # seq_pool1 set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") - inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) + inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc SERIAL) else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS @@ -93,17 +77,17 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -#inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator) +#inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1) + ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL) -#save model -inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR} analyzer_save_model_tester.cc) +# save model +inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR} analyzer_save_model_tester.cc SERIAL) # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") @@ -118,7 +102,7 @@ inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_te # MM DNN set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn") download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc) +inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc SERIAL) # Pyramid DNN set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn") @@ -141,123 +125,63 @@ download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 - --paddle_num_threads=${CPU_NUM_THREADS_ON_CI}) + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} SERIAL) # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS 
${OCR_INSTALL_DIR}) inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") endif() -inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) +inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc SERIAL) # mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") endif() -inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) - -### Image classification tests with fake data -set(IMG_CLASS_TEST_APP "test_analyzer_image_classification") -set(IMG_CLASS_TEST_APP_SRC "analyzer_image_classification_tester.cc") - -# build test binary to be used in subsequent tests -inference_analysis_api_test_with_fake_data_build(${IMG_CLASS_TEST_APP} ${IMG_CLASS_TEST_APP_SRC}) +inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc SERIAL) # googlenet -set(GOOGLENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/googlenet") -download_data(${GOOGLENET_MODEL_DIR} "googlenet.tar.gz") -inference_analysis_api_test_with_fake_data_run(test_analyzer_googlenet ${IMG_CLASS_TEST_APP} - ${GOOGLENET_MODEL_DIR} false) +inference_analysis_api_test_with_fake_data(test_analyzer_googlenet + "${INFERENCE_DEMO_INSTALL_DIR}/googlenet" analyzer_resnet50_tester.cc "googlenet.tar.gz" SERIAL) # resnet50 -set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") -download_data(${RESNET50_MODEL_DIR} "resnet50_model.tar.gz") -inference_analysis_api_test_with_fake_data_run(test_analyzer_resnet50 ${IMG_CLASS_TEST_APP} - ${RESNET50_MODEL_DIR} true) +inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 + "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL) # mobilenet with depthwise_conv op -set(MOBILENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv") -download_data(${MOBILENET_MODEL_DIR} "mobilenet_model.tar.gz") -inference_analysis_api_test_with_fake_data_run(test_analyzer_mobilenet_depthwise_conv ${IMG_CLASS_TEST_APP} - ${MOBILENET_MODEL_DIR} false) +inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv + "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) -### INT8 tests +# int8 image classification tests if(WITH_MKLDNN) - set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2") + if (NOT EXISTS ${INT8_DATA_DIR}) + inference_download_and_uncompress(${INT8_DATA_DIR} "${INFERENCE_URL}/int8" "imagenet_val_100_tail.tar.gz") + endif() - ### Image classification tests - set(IMAGENET_DATA_PATH "${INT8_DATA_DIR}/data.bin") - set(INT8_IMG_CLASS_TEST_APP "test_analyzer_int8_image_classification") - set(INT8_IMG_CLASS_TEST_APP_SRC "analyzer_int8_image_classification_tester.cc") - - # download dataset if necessary - download_int8_data(${INT8_DATA_DIR} "imagenet_val_100_tail.tar.gz") - - # build test binary to be used in subsequent tests - inference_analysis_api_int8_test_build(${INT8_IMG_CLASS_TEST_APP} ${INT8_IMG_CLASS_TEST_APP_SRC}) - - # resnet50 int8 + #resnet50 int8 set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50") - 
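# The int8 model blocks being dropped here each followed the same three-step
# registration: pick a model dir, fetch the archive, run the shared test
# binary against it; schematically ("foo" is an illustrative model name):
#   set(INT8_FOO_MODEL_DIR "${INT8_DATA_DIR}/foo")
#   download_int8_data(${INT8_FOO_MODEL_DIR} "foo_int8_model.tar.gz")
#   inference_analysis_api_int8_test_run(test_analyzer_int8_foo
#     ${INT8_IMG_CLASS_TEST_APP} ${INT8_FOO_MODEL_DIR} ${IMAGENET_DATA_PATH})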
download_int8_data(${INT8_RESNET50_MODEL_DIR} "resnet50_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_resnet50 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH}) - - # mobilenetv1 int8 - set(INT8_MOBILENETV1_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv1") - download_int8_data(${INT8_MOBILENETV1_MODEL_DIR} "mobilenetv1_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv1 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH}) - - # mobilenetv2 int8 - set(INT8_MOBILENETV2_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv2") - download_int8_data(${INT8_MOBILENETV2_MODEL_DIR} "mobilenet_v2_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv2 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) - - # resnet101 int8 - set(INT8_RESNET101_MODEL_DIR "${INT8_DATA_DIR}/resnet101") - download_int8_data(${INT8_RESNET101_MODEL_DIR} "Res101_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_resnet101 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET101_MODEL_DIR} ${IMAGENET_DATA_PATH}) - - # vgg16 int8 - set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16") - download_int8_data(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) - - # vgg19 int8 - set(INT8_VGG19_MODEL_DIR "${INT8_DATA_DIR}/vgg19") - download_int8_data(${INT8_VGG19_MODEL_DIR} "VGG19_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_vgg19 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH}) - - # googlenet int8 - set(INT8_GOOGLENET_MODEL_DIR "${INT8_DATA_DIR}/googlenet") - download_int8_data(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH}) - - ### Object detection models - set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_data.bin") - set(INT8_OBJ_DETECT_TEST_APP "test_analyzer_int8_object_detection") - set(INT8_OBJ_DETECT_TEST_APP_SRC "analyzer_int8_object_detection_tester.cc") - - # download dataset if necessary - download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_100_head.tar.gz") - - # build test binary to be used in subsequent tests - inference_analysis_api_int8_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC}) - - # mobilenet-ssd int8 - set(INT8_MOBILENET_SSD_MODEL_DIR "${INT8_DATA_DIR}/mobilenet-ssd") - download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH}) - + if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR}) + inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "${INFERENCE_URL}/int8" "resnet50_int8_model.tar.gz" ) + endif() + inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL) + + #mobilenet int8 + set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet") + if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR}) + inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "${INFERENCE_URL}/int8" "mobilenetv1_int8_model.tar.gz" ) + endif() + inference_analysis_api_int8_test(test_analyzer_int8_mobilenet 
${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL) endif() # bert, max_len=20, embedding_dim=128 set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") -inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) +inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc SERIAL) # anakin -if (ANAKIN_FOUND AND WITH_MKL) # only needed in CI +if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # anakin rnn1 set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin") set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1") @@ -266,14 +190,14 @@ if (ANAKIN_FOUND AND WITH_MKL) # only needed in CI cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt - DEPS inference_anakin_api_shared) + DEPS inference_anakin_api_shared SERIAL) # anakin mobilenet if(WITH_GPU) set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet") inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin") cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin - DEPS inference_anakin_api_shared dynload_cuda) + DEPS inference_anakin_api_shared dynload_cuda SERIAL) endif() endif() @@ -282,13 +206,7 @@ if(WITH_GPU AND TENSORRT_FOUND) if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") endif() - inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models) - inference_analysis_test(trt_resnet50_test SRCS trt_resnet50_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models) - inference_analysis_test(trt_resnext_test SRCS trt_resnext_test.cc + inference_analysis_test(test_trt_models SRCS trt_models_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models) + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models SERIAL) endif() diff --git a/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc b/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc index 48689486af4..cf97f064bed 100644 --- a/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc @@ -27,8 +27,8 @@ contrib::AnakinConfig GetConfig() { // using AnakinConfig::X86 if you need to use cpu to do inference config.target_type = contrib::AnakinConfig::NVGPU; config.model_file = FLAGS_model; - config.device_id = 0; - config.init_batch_size = 1; + config.device = 0; + config.max_batch_size = 1; return config; } diff --git a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc index db01cfebcb2..da42688f29f 100644 --- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc @@ -100,8 +100,8 @@ contrib::AnakinConfig GetConfig() { // using AnakinConfig::X86 if you need to use cpu to do inference config.target_type = contrib::AnakinConfig::X86; config.model_file = FLAGS_model; - 
config.device_id = 0; - config.init_batch_size = 1000; // the max number of token + config.device = 0; + config.max_batch_size = 1000; // the max number of token return config; } diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index 45256234b83..9b2e74ec16e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -146,17 +146,12 @@ bool LoadInputData(std::vector> *inputs) { void SetConfig(AnalysisConfig *config) { config->SetModel(FLAGS_infer_model); } -void profile(bool use_mkldnn = false, bool use_ngraph = false) { +void profile(bool use_mkldnn = false) { AnalysisConfig config; SetConfig(&config); if (use_mkldnn) { config.EnableMKLDNN(); - config.pass_builder()->AppendPass("fc_mkldnn_pass"); - } - - if (use_ngraph) { - config.EnableNgraph(); } std::vector> outputs; @@ -168,11 +163,7 @@ void profile(bool use_mkldnn = false, bool use_ngraph = false) { TEST(Analyzer_bert, profile) { profile(); } #ifdef PADDLE_WITH_MKLDNN -TEST(Analyzer_bert, profile_mkldnn) { profile(true, false); } -#endif - -#ifdef PADDLE_WITH_NGRAPH -TEST(Analyzer_bert, profile_ngraph) { profile(false, true); } +TEST(Analyzer_bert, profile_mkldnn) { profile(true); } #endif // Check the fuse status @@ -187,16 +178,11 @@ TEST(Analyzer_bert, fuse_statis) { } // Compare result of NativeConfig and AnalysisConfig -void compare(bool use_mkldnn = false, bool use_ngraph = false) { +void compare(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); if (use_mkldnn) { cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); - } - - if (use_ngraph) { - cfg.EnableNgraph(); } std::vector> inputs; @@ -207,15 +193,7 @@ void compare(bool use_mkldnn = false, bool use_ngraph = false) { TEST(Analyzer_bert, compare) { compare(); } #ifdef PADDLE_WITH_MKLDNN -TEST(Analyzer_bert, compare_mkldnn) { - compare(true, false /* use_mkldnn, no use_ngraph */); -} -#endif - -#ifdef PADDLE_WITH_NGRAPH -TEST(Analyzer_bert, compare_ngraph) { - compare(false, true /* no use_mkldnn, use_ngraph */); -} +TEST(Analyzer_bert, compare_mkldnn) { compare(true /* use_mkldnn */); } #endif // Compare Deterministic result diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 83bf99ec8aa..a3eac7b200c 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -200,9 +200,8 @@ void profile(bool use_mkldnn = false) { cfg.EnableMKLDNN(); // Enable all the mkldnn supported ops except conv3d in dam std::unordered_set op_list = {"softmax", "elementwise_add", - "relu", "fc"}; + "relu"}; cfg.SetMKLDNNOp(op_list); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); } std::vector> outputs; @@ -252,7 +251,6 @@ void compare(bool use_mkldnn = false) { std::unordered_set op_list = {"softmax", "elementwise_add", "relu"}; cfg.SetMKLDNNOp(op_list); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); } std::vector> input_slots_all; @@ -323,6 +321,7 @@ TEST(Analyzer_dam, compare_determine) { CompareDeterministic(reinterpret_cast(&cfg), input_slots_all); } + // Save optim model TEST(Analyzer_dam, save_optim_model) { AnalysisConfig cfg; diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 17c670a68cc..2eb347a44b3 100644 --- 
a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -100,7 +100,6 @@ void profile(bool use_mkldnn = false) { if (use_mkldnn) { cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); } std::vector> input_slots_all; @@ -147,7 +146,6 @@ void compare(bool use_mkldnn = false) { if (use_mkldnn) { cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); } std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index 11a49ed2914..cc31ab9588d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -177,15 +177,11 @@ TEST(Analyzer_Pyramid_DNN, compare_zero_copy) { AnalysisConfig cfg; SetConfig(&cfg); - AnalysisConfig cfg1; - SetConfig(&cfg1); - std::vector> input_slots_all; SetInput(&input_slots_all); std::vector outputs_name; outputs_name.emplace_back("cos_sim_2.tmp_0"); CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), - reinterpret_cast(&cfg1), input_slots_all, outputs_name); } diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 620a1d1f7a3..54fd3a4a4ca 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -293,15 +293,11 @@ TEST(Analyzer_rnn1, compare_zero_copy) { AnalysisConfig cfg; SetConfig(&cfg); - AnalysisConfig cfg1; - SetConfig(&cfg1); - std::vector> input_slots_all; SetInput(&input_slots_all); std::vector outputs_name; outputs_name.emplace_back("final_output.tmp_1"); CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), - reinterpret_cast(&cfg1), input_slots_all, outputs_name); } diff --git a/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc b/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc index 977b2ec885d..578b420ea92 100644 --- a/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc @@ -34,22 +34,14 @@ TEST(Analyzer, save_model) { AnalysisConfig cfg; SetConfig(&cfg); cfg.SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); - // ensure the path being unique - std::string optimModelPath = FLAGS_infer_model + "/only_for_save_model_test"; + std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model"; mkdir(optimModelPath.c_str(), 0777); SaveOptimModel(&cfg, optimModelPath); - // Each config can only be applied to one predictor. 
- AnalysisConfig cfg2; - SetConfig(&cfg2); - cfg2.pass_builder()->ClearPasses(); - cfg2.SetModel(optimModelPath + "/model", optimModelPath + "/params"); - int origin_num_ops = GetNumOps(cfg2); - - AnalysisConfig cfg3; - SetConfig(&cfg3); - cfg3.SetModel(optimModelPath + "/model", optimModelPath + "/params"); - int fused_num_ops = GetNumOps(cfg3); + cfg.pass_builder()->ClearPasses(); + int origin_num_ops = GetNumOps(cfg); + cfg.SetModel(optimModelPath + "/model", optimModelPath + "/params"); + int fused_num_ops = GetNumOps(cfg); CHECK_LE(fused_num_ops, origin_num_ops); } diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index e6f2bfad68c..3cebf8e9698 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -149,7 +149,6 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { } if (use_mkldnn) { cfg->EnableMKLDNN(); - cfg->pass_builder()->AppendPass("fc_mkldnn_pass"); } // Enable seqpool_concat_fuse_pass, disabled by default since it takes much // time @@ -215,15 +214,11 @@ TEST(Analyzer_seq_pool1, compare_zero_copy) { AnalysisConfig cfg; SetConfig(&cfg); - AnalysisConfig cfg1; - SetConfig(&cfg1); - std::vector> input_slots_all; SetInput(&input_slots_all); std::vector outputs_name; outputs_name.emplace_back(out_var_name); CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), - reinterpret_cast(&cfg1), input_slots_all, outputs_name); } diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 78e500b2ed5..54492dbc238 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -36,8 +36,6 @@ struct DataReader { tensor.lod.front().push_back(data.size()); tensor.data.Resize(data.size() * sizeof(int64_t)); - CHECK(tensor.data.data() != nullptr); - CHECK(data.data() != nullptr); memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t)); tensor.shape.push_back(data.size()); tensor.shape.push_back(1); @@ -89,12 +87,7 @@ TEST(Analyzer_Text_Classification, profile) { CHECK_EQ(output.lod.size(), 0UL); LOG(INFO) << "output.dtype: " << output.dtype; std::stringstream ss; - int num_data = 1; - for (auto i : output.shape) { - num_data *= i; - } - - for (int i = 0; i < num_data; i++) { + for (int i = 0; i < 5; i++) { ss << static_cast(output.data.data())[i] << " "; } LOG(INFO) << "output.data summary: " << ss.str(); diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc index f2195966add..a23297f29cf 100644 --- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc @@ -189,7 +189,6 @@ void profile(bool use_mkldnn = false) { std::vector> outputs; if (use_mkldnn) { cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); } std::vector> input_slots_all; @@ -220,7 +219,6 @@ void compare(bool use_mkldnn = false) { SetConfig(&cfg); if (use_mkldnn) { cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); } std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 5f65229ecd5..fb47048cd0c 100644 --- 
a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -85,7 +85,6 @@ void profile(bool use_mkldnn = false) { SetConfig(&cfg); if (use_mkldnn) { cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); } // cfg.pass_builder()->TurnOnDebug(); std::vector> outputs; @@ -133,7 +132,6 @@ void compare(bool use_mkldnn = false) { SetConfig(&cfg); if (use_mkldnn) { cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); } std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index de938669c0b..b952b62f13e 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -78,8 +78,6 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) { << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n"; os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled() << "\n"; - os << GenSpaces(num_spaces) << "use_ngraph: " << config.ngraph_enabled() - << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index eb786196a88..a50810948ff 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -148,7 +148,7 @@ void CompareResult(const std::vector &outputs, case PaddleDType::INT64: { int64_t *pdata = static_cast(out.data.data()); int64_t *pdata_ref = ref_out.data(&place, &ref_size); - EXPECT_EQ(size, static_cast(ref_size)); + EXPECT_EQ(size, ref_size); for (size_t j = 0; j < size; ++j) { EXPECT_EQ(pdata_ref[j], pdata[j]); } @@ -320,8 +320,7 @@ void PredictionRun(PaddlePredictor *predictor, const std::vector> &inputs, std::vector> *outputs, int num_threads, int tid, - const VarType::Type data_type = VarType::FP32, - float *sample_latency = nullptr) { + const VarType::Type data_type = VarType::FP32) { int num_times = FLAGS_repeat; int iterations = inputs.size(); // process the whole dataset ... if (FLAGS_iterations > 0 && @@ -361,10 +360,6 @@ void PredictionRun(PaddlePredictor *predictor, auto batch_latency = elapsed_time / (iterations * num_times); PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency, iterations, data_type); - - if (sample_latency != nullptr) - *sample_latency = batch_latency / FLAGS_batch_size; - if (FLAGS_record_benchmark) { Benchmark benchmark; benchmark.SetName(FLAGS_model_name); @@ -378,14 +373,12 @@ void TestOneThreadPrediction( const PaddlePredictor::Config *config, const std::vector> &inputs, std::vector> *outputs, bool use_analysis = true, - const VarType::Type data_type = VarType::FP32, - float *sample_latency = nullptr) { + const VarType::Type data_type = VarType::FP32) { auto predictor = CreateTestPredictor(config, use_analysis); if (FLAGS_warmup) { PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0, data_type); } - PredictionRun(predictor.get(), inputs, outputs, 1, 0, data_type, - sample_latency); + PredictionRun(predictor.get(), inputs, outputs, 1, 0, data_type); } void TestMultiThreadPrediction( @@ -437,31 +430,6 @@ void TestPrediction(const PaddlePredictor::Config *config, } } -void SummarizeAccuracy(float avg_acc1_fp32, float avg_acc1_int8) { - LOG(INFO) << "--- Accuracy summary --- "; - LOG(INFO) << "Accepted top1 accuracy drop threshold: " - << FLAGS_quantized_accuracy - << ". 
(condition: (FP32_top1_acc - INT8_top1_acc) <= threshold)"; - LOG(INFO) << "FP32: avg top1 accuracy: " << std::fixed << std::setw(6) - << std::setprecision(4) << avg_acc1_fp32; - LOG(INFO) << "INT8: avg top1 accuracy: " << std::fixed << std::setw(6) - << std::setprecision(4) << avg_acc1_int8; -} - -void SummarizePerformance(float sample_latency_fp32, - float sample_latency_int8) { - // sample latency in ms - auto throughput_fp32 = 1000.0 / sample_latency_fp32; - auto throughput_int8 = 1000.0 / sample_latency_int8; - LOG(INFO) << "--- Performance summary --- "; - LOG(INFO) << "FP32: avg fps: " << std::fixed << std::setw(6) - << std::setprecision(4) << throughput_fp32 - << ", avg latency: " << sample_latency_fp32 << " ms"; - LOG(INFO) << "INT8: avg fps: " << std::fixed << std::setw(6) - << std::setprecision(4) << throughput_int8 - << ", avg latency: " << sample_latency_int8 << " ms"; -} - void CompareTopAccuracy( const std::vector> &output_slots_quant, const std::vector> &output_slots_ref) { @@ -491,10 +459,12 @@ void CompareTopAccuracy( float avg_acc1_quant = total_accs1_quant / output_slots_quant.size(); float avg_acc1_ref = total_accs1_ref / output_slots_ref.size(); - SummarizeAccuracy(avg_acc1_ref, avg_acc1_quant); - CHECK_GT(avg_acc1_ref, 0.0); - CHECK_GT(avg_acc1_quant, 0.0); - CHECK_LE(avg_acc1_ref - avg_acc1_quant, FLAGS_quantized_accuracy); + LOG(INFO) << "Avg top1 INT8 accuracy: " << std::fixed << std::setw(6) + << std::setprecision(4) << avg_acc1_quant; + LOG(INFO) << "Avg top1 FP32 accuracy: " << std::fixed << std::setw(6) + << std::setprecision(4) << avg_acc1_ref; + LOG(INFO) << "Accepted accuracy drop threshold: " << FLAGS_quantized_accuracy; + CHECK_LE(std::abs(avg_acc1_quant - avg_acc1_ref), FLAGS_quantized_accuracy); } void CompareDeterministic( @@ -540,19 +510,16 @@ void CompareQuantizedAndAnalysis( auto *cfg = reinterpret_cast(config); PrintConfig(cfg, true); std::vector> analysis_outputs; - float sample_latency_fp32{-1}; - TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32, - &sample_latency_fp32); + TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32); LOG(INFO) << "--- INT8 prediction start ---"; auto *qcfg = reinterpret_cast(qconfig); PrintConfig(qcfg, true); std::vector> quantized_outputs; - float sample_latency_int8{-1}; - TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true, VarType::INT8, - &sample_latency_int8); + TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true, + VarType::INT8); - SummarizePerformance(sample_latency_fp32, sample_latency_int8); + LOG(INFO) << "--- comparing outputs --- "; CompareTopAccuracy(quantized_outputs, analysis_outputs); } @@ -567,7 +534,7 @@ void CompareNativeAndAnalysis( } void CompareAnalysisAndZeroCopy( - PaddlePredictor::Config *config, PaddlePredictor::Config *config1, + PaddlePredictor::Config *config, const std::vector> &inputs, const std::vector &outputs_name) { int batch_size = FLAGS_batch_size; @@ -577,8 +544,8 @@ void CompareAnalysisAndZeroCopy( predictor->Run(inputs[0], &analysis_outputs, batch_size); // analysis + zero_copy std::vector zerocopy_outputs; - reinterpret_cast(config1)->SwitchUseFeedFetchOps(false); - predictor = CreateTestPredictor(config1, true); + reinterpret_cast(config)->SwitchUseFeedFetchOps(false); + predictor = CreateTestPredictor(config, true); ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]); predictor->ZeroCopyRun(); for (size_t i = 0; i < outputs_name.size(); i++) { diff --git 
a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 444bab1b33d..c93c9ef2f23 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -48,35 +48,13 @@ if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32) endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") -function (inference_base_test_build TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - cc_test_build(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS}) -endfunction() - -function (inference_base_test_run TARGET) +function (inference_base_test TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs COMMAND ARGS) + set(multiValueArgs SRCS ARGS DEPS) cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(WITH_GPU) set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") endif() - cc_test_run(${TARGET} COMMAND ${base_test_COMMAND} ARGS ${mem_opt} ${base_test_ARGS}) + cc_test(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS} ARGS ${mem_opt} ${base_test_ARGS}) endfunction() - -function (inference_base_test TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS ARGS DEPS) - cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - inference_base_test_build(${TARGET} - SRCS ${base_test_SRCS} - DEPS ${base_test_DEPS}) - inference_base_test_run(${TARGET} - COMMAND ${TARGET} - ARGS ${base_test_ARGS}) -endfunction() - diff --git a/paddle/fluid/lite/CMakeLists.txt b/paddle/fluid/lite/CMakeLists.txt index 269cc95b658..99b541a683a 100644 --- a/paddle/fluid/lite/CMakeLists.txt +++ b/paddle/fluid/lite/CMakeLists.txt @@ -2,6 +2,8 @@ if (NOT WITH_LITE) return() endif() +include(lite) + message(WARNING "Lite enabled!") message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}") message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}") @@ -10,6 +12,9 @@ message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install") + +set(LITE_ON_MOBILE ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}) + set(LITE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url") function(lite_download_and_uncompress INSTALL_DIR URL FILENAME) @@ -21,8 +26,7 @@ function(lite_download_and_uncompress INSTALL_DIR URL FILENAME) ${EXTERNAL_PROJECT_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${INSTALL_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && - ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} + DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} DOWNLOAD_DIR ${INSTALL_DIR} DOWNLOAD_NO_PROGRESS 1 CONFIGURE_COMMAND "" @@ -35,7 +39,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -76,14 +80,29 @@ function (lite_deps TARGET) endforeach(var) endif() - set(${TARGET} ${deps} PARENT_SCOPE) + if (LITE_WITH_OPENCL) + 
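+ # CL_DEPS take effect only when LITE_WITH_OPENCL is ON, matching the other conditional dependency groups handled above.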
foreach(var ${lite_deps_CL_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} PARENT_SCOPE) endfunction() + +# A fake target to include all the libraries and tests the lite module depends on. +add_custom_target(lite_compile_deps COMMAND echo 1) + # Add names for lite libraries for latter compile. We use this name list to avoid compiling # the whole fluid project to accelerate the compile speed. set(offline_lib_registry_file "${CMAKE_BINARY_DIR}/lite_libs.txt") file(WRITE ${offline_lib_registry_file} "") # clean + +set(__lite_cc_files "" ; "") +set(__lite_cc_files "${CMAKE_BINARY_DIR}/lite_cc_files.txt") +file(WRITE ${__lite_cc_files} "") # clean + + # cc_library with branch support. # The branches: # X86_DEPS: works only when LITE_WITH_X86 is ON. @@ -92,11 +111,12 @@ file(WRITE ${offline_lib_registry_file} "") # clean # PROFILE_DEPS: LITE_WITH_PROFILE # LIGHT_DEPS: LITE_WITH_LIGHT_WEIGHT_FRAMEWORK # HVY_DEPS: NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +# EXCLUDE_COMPILE_DEPS: TARGET will not be included in lite_compile_deps if this is set function(lite_cc_library TARGET) - set(options "") + set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS - HVY_DEPS ARGS) + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS + HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps "") @@ -104,13 +124,31 @@ function(lite_cc_library TARGET) DEPS ${args_DEPS} X86_DEPS ${args_X86_DEPS} CUDA_DEPS ${args_CUDA_DEPS} + CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} ) - cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS}) + if (args_SHARED OR args_shared) + cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS} SHARED) + elseif (args_MODULE OR args_module) + add_library(${TARGET} MODULE ${args_SRCS}) + add_dependencies(${TARGET} ${deps} ${args_DEPS}) + else() + cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS}) + endif() + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + + foreach(cc_file ${args_SRCS}) + file(APPEND ${__lite_cc_files} "${cc_file}\n") + endforeach() + + # collect the targets that lite needs to compile + if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS) + add_dependencies(lite_compile_deps ${TARGET}) + endif() # register a library name.
file(APPEND ${offline_lib_registry_file} "${TARGET}\n") @@ -119,8 +157,8 @@ endfunction() function(lite_cc_binary TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS ARGS) + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS PROFILE_DEPS + LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps "") @@ -128,12 +166,18 @@ function(lite_cc_binary TARGET) DEPS ${args_DEPS} X86_DEPS ${args_X86_DEPS} CUDA_DEPS ${args_CUDA_DEPS} + CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS}) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + # collect targets need to compile for lite + if (NOT args_EXCLUDE_COMPILE_DEPS) + add_dependencies(lite_compile_deps ${TARGET}) + endif() endfunction() # Add a unit-test name to file for latter offline manual test. @@ -143,8 +187,8 @@ file(WRITE ${offline_test_registry_file} "") # clean function(lite_cc_test TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS PROFILE_DEPS + LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -153,13 +197,20 @@ function(lite_cc_test TARGET) DEPS ${args_DEPS} X86_DEPS ${args_X86_DEPS} CUDA_DEPS ${args_CUDA_DEPS} + CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} ) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) file(APPEND ${offline_test_registry_file} "${TARGET}\n") + + # collect targets need to compile for lite + if (NOT args_EXCLUDE_COMPILE_DEPS) + add_dependencies(lite_compile_deps ${TARGET}) + endif() endfunction() add_subdirectory(operators) @@ -169,7 +220,109 @@ add_subdirectory(x86) add_subdirectory(arm) add_subdirectory(host) add_subdirectory(cuda) +add_subdirectory(opencl) add_subdirectory(model_parser) add_subdirectory(utils) add_subdirectory(api) add_subdirectory(gen_code) +add_subdirectory(tools) + +if (WITH_TESTING) + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz") + if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") + endif() + if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") + endif() +endif() + +if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) + # 
for publish + set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}") + if (LITE_WITH_OPENCL) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.opencl") + endif(LITE_WITH_OPENCL) + message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") + + # The final target for publish lite lib + add_custom_target(publish_inference_lite) + + # add cxx lib + add_custom_target(publish_inference_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/paddle/fluid/lite/api/model_optimize_tool" "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND cp "${CMAKE_BINARY_DIR}/paddle/fluid/lite/gen_code/paddle_code_generator" "${INFER_LITE_PUBLISH_ROOT}/bin" + ) + add_dependencies(publish_inference_cxx_lib model_optimize_tool) + add_dependencies(publish_inference_cxx_lib paddle_code_generator) + add_dependencies(publish_inference_cxx_lib bundle_full_api) + add_dependencies(publish_inference_cxx_lib bundle_light_api) + add_dependencies(publish_inference_lite publish_inference_cxx_lib) + + if (LITE_WITH_JAVA) + # add java lib + add_custom_target(publish_inference_java_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/java/so" + COMMAND cp "${CMAKE_BINARY_DIR}/paddle/fluid/lite/api/android/jni/native/libpaddle_lite_jni.so" "${INFER_LITE_PUBLISH_ROOT}/java/so" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/api/android/jni/src" "${INFER_LITE_PUBLISH_ROOT}/java" + ) + add_dependencies(publish_inference_java_lib paddle_lite_jni) + add_dependencies(publish_inference_lite publish_inference_java_lib) + endif() + + if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND + ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8))) + # copy + add_custom_target(publish_inference_android_cxx_demos ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/glog" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/gflags" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/demo/cxx/Makefile.def" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/demo/cxx/mobile_full" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" + ) + add_dependencies(publish_inference_android_cxx_demos glog gflags) + 
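+ # Hooking the demos into the cxx-lib publish step below means a single build of publish_inference_lite also stages the Android C++ demos.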
add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) + + if (LITE_WITH_JAVA) + # copy java mobile_light demo/lib + add_custom_target(publish_inference_android_java_demo ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/demo/java/android" "${INFER_LITE_PUBLISH_ROOT}/demo/java" + COMMAND cp "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/demo/java/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/java" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/arm7" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/arm8" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/arm64-v8a" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/armeabi-v7a" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/java/android/PaddlePredictor/app/src/main/jniLibs/x86" + ) + add_dependencies(publish_inference_java_lib publish_inference_android_java_demo) + endif() + endif() + + if (LITE_WITH_OPENCL) + add_custom_target(publish_inference_opencl ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/opencl" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/opencl/cl_kernel" "${INFER_LITE_PUBLISH_ROOT}/opencl" + ) + add_dependencies(publish_inference_cxx_lib publish_inference_opencl) + endif() +endif() diff --git a/paddle/fluid/lite/api/CMakeLists.txt b/paddle/fluid/lite/api/CMakeLists.txt index 78f85a8caeb..4d507bc0784 100644 --- a/paddle/fluid/lite/api/CMakeLists.txt +++ b/paddle/fluid/lite/api/CMakeLists.txt @@ -1,54 +1,184 @@ -set(cxx_api_lite_deps scope_lite optimizer_lite target_wrapper_host model_parser_lite) +set(cxx_api_lite_deps + scope_lite optimizer_lite target_wrapper_host model_parser_lite program_lite) if(LITE_WITH_CUDA) set(cxx_api_lite_deps ${cxx_api_lite_deps} kernels_cuda) - cc_library(cxx_api_lite_cuda SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} target_wrapper_cuda) + lite_cc_library(cxx_api_lite_cuda SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} target_wrapper_cuda) nv_test(test_cxx_api_lite_cuda SRCS cxx_api_test.cc DEPS cxx_api_lite_cuda) endif() -cc_library(cxx_api_lite SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} ${ops_lite} program_lite) +lite_cc_library(place_lite SRCS paddle_place.cc DEPS glog) + +lite_cc_library(lite_api_test_helper SRCS lite_api_test_helper.cc + DEPS scope_lite optimizer_lite target_wrapper_host model_parser_lite program_lite + ${ops_lite} ${host_kernels} + CUDA_DEPS kernels_cuda + X86_DEPS ${x86_kernels}) set(light_api_deps - scope_lite target_wrapper_host model_parser_lite) + scope_lite target_wrapper_host model_parser_lite program_lite) if(LITE_WITH_CUDA) set(light_api_deps ${light_api_deps} target_wrapper_cuda) endif() -cc_library(light_api_lite SRCS light_api.cc DEPS ${light_api_deps} ${ops_lite} ${host_kernels}) - message(STATUS "get ops ${ops_lite}") message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") +lite_cc_library(cxx_api_lite + SRCS cxx_api.cc + DEPS ${cxx_api_lite_deps} ${ops_lite} ${host_kernels} program_lite + X86_DEPS ${x86_kernels} operator + ARM_DEPS ${arm_kernels} + CL_DEPS ${opencl_kernels}) + +lite_cc_library(light_api_lite SRCS light_api.cc + DEPS scope_lite target_wrapper_host model_parser_lite + ${light_api_deps} ${ops_lite}
${host_kernels} program_lite + CUDA_DEPS target_wrapper_cuda + X86_DEPS ${x86_kernels} operator + ARM_DEPS ${arm_kernels} + CL_DEPS ${opencl_kernels}) + include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") -if((NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) AND WITH_TESTING) +if(WITH_TESTING) lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc - DEPS cxx_api_lite mir_passes - ${ops_lite} ${host_kernels} ${x86_kernels} + DEPS cxx_api_lite mir_passes lite_api_test_helper + ${ops_lite} ${host_kernels} + X86_DEPS ${x86_kernels} + ARM_DEPS ${arm_kernels} + EXCLUDE_COMPILE_DEPS "ON" ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) - - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz") add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz) + if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + lite_cc_test(test_googlenet_lite SRCS test_googlenet_lite.cc + DEPS cxx_api_lite mir_passes lite_api_test_helper + ${ops_lite} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/googlenet) + add_dependencies(test_googlenet_lite extern_lite_download_GoogleNet_inference_tar_gz) + lite_cc_test(test_mobilenetv1_lite_x86 SRCS test_mobilenetv1_lite_x86.cc + DEPS cxx_api_lite mir_passes lite_api_test_helper + ${ops_lite} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1) + add_dependencies(test_mobilenetv1_lite_x86 extern_lite_download_mobilenet_v1_tar_gz) + lite_cc_test(test_mobilenetv2_lite_x86 SRCS test_mobilenetv2_lite_x86.cc + DEPS cxx_api_lite mir_passes lite_api_test_helper + ${ops_lite} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2_relu) + add_dependencies(test_mobilenetv2_lite_x86 extern_lite_download_mobilenet_v2_relu_tar_gz) + lite_cc_test(test_inceptionv4_lite_x86 SRCS test_inceptionv4_lite_x86.cc + DEPS cxx_api_lite mir_passes lite_api_test_helper + ${ops_lite} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4_simple) + add_dependencies(test_inceptionv4_lite_x86 extern_lite_download_inception_v4_simple_tar_gz) + lite_cc_test(test_step_rnn_lite_x86 SRCS test_step_rnn_lite_x86.cc + DEPS cxx_api_lite mir_passes lite_api_test_helper + ${ops_lite} ${host_kernels} ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/test_onestep_no_switch/inference_models/0) + endif() endif() -if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) - add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz) +if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) + set(lite_model_test_DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${arm_kernels}) + + lite_cc_test(test_mobilenetv1_lite SRCS mobilenetv1_test.cc + DEPS ${lite_model_test_DEPS} + CL_DEPS ${opencl_kernels} + ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/opencl + --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL) + add_dependencies(test_mobilenetv1_lite extern_lite_download_mobilenet_v1_tar_gz) + + lite_cc_test(test_mobilenetv2_lite SRCS mobilenetv2_test.cc + DEPS ${lite_model_test_DEPS} + CL_DEPS ${opencl_kernels} + ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/opencl + --model_dir=${LITE_MODEL_DIR}/mobilenet_v2 SERIAL) + add_dependencies(test_mobilenetv2_lite extern_lite_download_mobilenet_v2_relu_tar_gz) + + lite_cc_test(test_resnet50_lite SRCS resnet50_test.cc + DEPS ${lite_model_test_DEPS} +
CL_DEPS ${opencl_kernels} + ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/opencl + --model_dir=${LITE_MODEL_DIR}/resnet50 SERIAL) + add_dependencies(test_resnet50_lite extern_lite_download_resnet50_tar_gz) + + lite_cc_test(test_inceptionv4_lite SRCS inceptionv4_test.cc + DEPS ${lite_model_test_DEPS} + CL_DEPS ${opencl_kernels} + ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/opencl + --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) + add_dependencies(test_inceptionv4_lite extern_lite_download_inception_v4_simple_tar_gz) endif() -# if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) -# lite_cc_test(test_light_api SRCS light_api_test.cc DEPS light_api_lite ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) -# endif() +# These tests need CLI arguments and are not supported in ARM CI. +# TODO(Superjomn) support later. +lite_cc_test(test_light_api_lite SRCS light_api_test.cc + DEPS light_api_lite program_lite mir_passes + CL_DEPS ${opencl_kernels} + ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) + +lite_cc_test(test_apis_lite SRCS apis_test.cc + DEPS cxx_api_lite light_api_lite ${ops_lite} + CL_DEPS ${opencl_kernels} + X86_DEPS ${x86_kernels} operator + ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model + --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) + +lite_cc_library(paddle_api_lite SRCS paddle_api.cc DEPS op_params_lite) + +#----------------------------------------------------------------------------------------------------- +# The final inference library for both CxxConfig and MobileConfig. +lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api_lite paddle_api_lite light_api_lite + ${ops_lite} + ARM_DEPS ${arm_kernels} + CL_DEPS ${opencl_kernels}) +# The final inference library for just MobileConfig.
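+# Unlike paddle_api_full above, it links only the light API pieces, so mobile deployments avoid pulling in the full CxxConfig compile stack.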
+lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api_lite paddle_api_lite mir_passes) + +bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) +bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api) +#----------------------------------------------------------------------------------------------------- + + +lite_cc_test(test_paddle_api_lite SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light + ${ops_lite} + ARM_DEPS ${arm_kernels} + CL_DEPS ${opencl_kernels} + X86_DEPS ${x86_kernels} + ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) + +lite_cc_test(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light + ${ops_lite} + ARM_DEPS ${arm_kernels} + CL_DEPS ${opencl_kernels} + X86_DEPS ${x86_kernels}) + +if (WITH_TESTING) + add_dependencies(test_paddle_api_lite extern_lite_download_lite_naive_model_tar_gz) +endif() + +if (LITE_WITH_JAVA AND LITE_WITH_ARM) + add_subdirectory(android) +endif() +#lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc + #X86_DEPS operator + #DEPS light_api_lite model_parser_lite target_wrapper_host mir_passes + #ARM_DEPS ${arm_kernels}) -lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc +lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin_int8.cc DEPS cxx_api_lite model_parser_lite target_wrapper_host mir_passes ${ops_lite} ${host_kernels} - ARM_DEPS ${arm_kernels}) + ARM_DEPS ${arm_kernels} + CL_DEPS ${opencl_kernels}) +lite_cc_binary(model_optimize_tool SRCS model_optimize_tool.cc + DEPS paddle_api_full + CL_DEPS ${opencl_kernels}) diff --git a/paddle/fluid/lite/api/android/.gitignore b/paddle/fluid/lite/api/android/.gitignore new file mode 100644 index 00000000000..a1d6334395d --- /dev/null +++ b/paddle/fluid/lite/api/android/.gitignore @@ -0,0 +1,2 @@ +/bin/ +.classpath diff --git a/paddle/fluid/lite/api/android/CMakeLists.txt b/paddle/fluid/lite/api/android/CMakeLists.txt new file mode 100644 index 00000000000..7f31f7e9479 --- /dev/null +++ b/paddle/fluid/lite/api/android/CMakeLists.txt @@ -0,0 +1,5 @@ +if ((NOT LITE_WITH_JAVA) OR (NOT LITE_WITH_ARM)) + return() +endif() + +add_subdirectory(jni) diff --git a/paddle/fluid/lite/api/android/jni/.gitignore b/paddle/fluid/lite/api/android/jni/.gitignore new file mode 100644 index 00000000000..1299d2738c0 --- /dev/null +++ b/paddle/fluid/lite/api/android/jni/.gitignore @@ -0,0 +1,3 @@ +/PaddleListTest.class +/PaddleLite.class +/bin/ diff --git a/paddle/fluid/lite/api/android/jni/CMakeLists.txt b/paddle/fluid/lite/api/android/jni/CMakeLists.txt new file mode 100644 index 00000000000..3d5497d51ef --- /dev/null +++ b/paddle/fluid/lite/api/android/jni/CMakeLists.txt @@ -0,0 +1,52 @@ +if ((NOT LITE_WITH_ARM) OR (NOT LITE_WITH_JAVA)) + return() +endif() + +include(UseJava) +find_package(Java REQUIRED) + +# We are only interested in finding jni.h: we do not care about extended JVM +# functionality or the AWT library. 
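+# Pre-setting these cache variables to a dummy value lets find_package(JNI) succeed with just the JNI headers present, without a full JDK/AWT install.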
+set(JAVA_AWT_LIBRARY NotNeeded) +set(JAVA_JVM_LIBRARY NotNeeded) +set(JAVA_INCLUDE_PATH2 NotNeeded) +set(JAVA_AWT_INCLUDE_PATH NotNeeded) +find_package(JNI REQUIRED) + +# Generate PaddlePredictor.jar +include_directories(${JNI_INCLUDE_DIRS}) +add_jar(PaddlePredictor + src/com/baidu/paddle/lite/ConfigBase.java + src/com/baidu/paddle/lite/CxxConfig.java + src/com/baidu/paddle/lite/MobileConfig.java + src/com/baidu/paddle/lite/PaddleLiteInitializer.java + src/com/baidu/paddle/lite/PaddlePredictor.java + src/com/baidu/paddle/lite/Place.java + src/com/baidu/paddle/lite/PrecisionType.java + src/com/baidu/paddle/lite/Tensor.java) +get_target_property(_jarFile PaddlePredictor JAR_FILE) +get_target_property(_classDir PaddlePredictor CLASSDIR) +set(_stubDir "${CMAKE_CURRENT_BINARY_DIR}") + +# Generate native headers +add_custom_target( + paddle_lite_jni_header ALL + COMMAND ${Java_JAVAH_EXECUTABLE} -verbose + -classpath ${_classDir} + -o "${CMAKE_BINARY_DIR}/paddle/fluid/lite/api/android/jni/native/paddle_lite_jni.h" + -jni + com.baidu.paddle.lite.PaddlePredictor + COMMAND ${Java_JAVAH_EXECUTABLE} -verbose + -classpath ${_classDir} + -o "${CMAKE_BINARY_DIR}/paddle/fluid/lite/api/android/jni/native/tensor_jni.h" + -jni + com.baidu.paddle.lite.Tensor + COMMAND ${Java_JAVAH_EXECUTABLE} -verbose + -classpath ${_classDir} + -o "${CMAKE_BINARY_DIR}/paddle/fluid/lite/api/android/jni/native/paddle_init_jni.h" + -jni + com.baidu.paddle.lite.PaddleLiteInitializer + DEPENDS PaddlePredictor +) + +add_subdirectory(native) diff --git a/paddle/fluid/lite/api/android/jni/native/CMakeLists.txt b/paddle/fluid/lite/api/android/jni/native/CMakeLists.txt new file mode 100644 index 00000000000..eeeef1bd193 --- /dev/null +++ b/paddle/fluid/lite/api/android/jni/native/CMakeLists.txt @@ -0,0 +1,18 @@ +# Generate paddle_lite_jni.so +include_directories(${JNI_INCLUDE_DIRS} ${_classDir} ${_stubDir}) +lite_cc_library(paddle_lite_jni MODULE + SRCS paddle_init_jni.cc paddle_lite_jni.cc tensor_jni.cc + DEPS light_api_lite cxx_api_lite + paddle_api_full paddle_api_lite paddle_api_light + ARM_DEPS ${arm_kernels}) +if (APPLE) + # macOS only accepts JNI libs ending with .jnilib or .dylib + set_target_properties(paddle_lite_jni PROPERTIES SUFFIX ".jnilib") +elseif (WIN32) + # Windows only accepts JNI libs ending with .dll + set_target_properties(paddle_lite_jni PROPERTIES SUFFIX ".dll") +endif (APPLE) +# Unlike a static library, a module library has to link its targets explicitly +# to work as a single .so lib. +target_link_libraries(paddle_lite_jni light_api_lite cxx_api_lite + paddle_api_full paddle_api_lite paddle_api_light ${arm_kernels}) diff --git a/paddle/fluid/lite/api/android/jni/native/convert_util_jni.h b/paddle/fluid/lite/api/android/jni/native/convert_util_jni.h new file mode 100644 index 00000000000..9b284ccf2b2 --- /dev/null +++ b/paddle/fluid/lite/api/android/jni/native/convert_util_jni.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include <jni.h> +#include <string> +#include <vector> + +#include "paddle/fluid/lite/api/light_api.h" +#include "paddle/fluid/lite/api/paddle_api.h" +#include "paddle/fluid/lite/api/paddle_place.h" + +#ifndef PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_CONVERT_UTIL_JNI_H_ +#define PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_CONVERT_UTIL_JNI_H_ + +namespace paddle { +namespace lite_api { + +inline std::string jstring_to_cpp_string(JNIEnv *env, jstring jstr) { + // In Java, a Unicode char is encoded with 2 bytes (UTF-16), so a jstring + // holds UTF-16 characters. std::string in C++ is essentially a string of + // bytes, not characters, so to pass a jstring from JNI to C++ we have to + // convert the UTF-16 data to bytes. + if (!jstr) { + return ""; + } + const jclass stringClass = env->GetObjectClass(jstr); + const jmethodID getBytes = + env->GetMethodID(stringClass, "getBytes", "(Ljava/lang/String;)[B"); + const jbyteArray stringJbytes = (jbyteArray)env->CallObjectMethod( + jstr, getBytes, env->NewStringUTF("UTF-8")); + + size_t length = (size_t)env->GetArrayLength(stringJbytes); + jbyte *pBytes = env->GetByteArrayElements(stringJbytes, NULL); + + std::string ret = std::string(reinterpret_cast<char *>(pBytes), length); + env->ReleaseByteArrayElements(stringJbytes, pBytes, JNI_ABORT); + + env->DeleteLocalRef(stringJbytes); + env->DeleteLocalRef(stringClass); + return ret; +} + +inline jfloatArray cpp_array_to_jfloatarray(JNIEnv *env, const float *buf, + int64_t len) { + jfloatArray result = env->NewFloatArray(len); + env->SetFloatArrayRegion(result, 0, len, buf); + return result; +} + +inline jintArray cpp_array_to_jintarray(JNIEnv *env, const int *buf, + int64_t len) { + jintArray result = env->NewIntArray(len); + env->SetIntArrayRegion(result, 0, len, buf); + return result; +} + +inline jbyteArray cpp_array_to_jbytearray(JNIEnv *env, const int8_t *buf, + int64_t len) { + jbyteArray result = env->NewByteArray(len); + env->SetByteArrayRegion(result, 0, len, buf); + return result; +} + +inline jlongArray int64_vector_to_jlongarray(JNIEnv *env, + const std::vector<int64_t> &vec) { + jlongArray result = env->NewLongArray(vec.size()); + jlong *buf = new jlong[vec.size()]; + for (size_t i = 0; i < vec.size(); ++i) { + buf[i] = (jlong)vec[i]; + } + env->SetLongArrayRegion(result, 0, vec.size(), buf); + delete[] buf; + return result; +} + +inline std::vector<int64_t> jlongarray_to_int64_vector(JNIEnv *env, + jlongArray dims) { + int dim_size = env->GetArrayLength(dims); + jlong *dim_nums = env->GetLongArrayElements(dims, nullptr); + std::vector<int64_t> dim_vec(dim_nums, dim_nums + dim_size); + env->ReleaseLongArrayElements(dims, dim_nums, 0); + return dim_vec; +} + +/** + * Converts Java com.baidu.paddle.lite.Place to C++ paddle::lite_api::Place.
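+ * Assumes the Java getters return integer codes that map one-to-one onto the + * C++ TargetType, PrecisionType and DataLayoutType enum values.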
+ */ +inline Place jplace_to_cpp_place(JNIEnv *env, jobject java_place) { + jclass place_jclazz = env->GetObjectClass(java_place); + + jmethodID target_method = + env->GetMethodID(place_jclazz, "getTargetInt", "()I"); + jmethodID precision_method = + env->GetMethodID(place_jclazz, "getPrecisionInt", "()I"); + jmethodID data_layout_method = + env->GetMethodID(place_jclazz, "getDataLayoutInt", "()I"); + jmethodID device_method = env->GetMethodID(place_jclazz, "getDevice", "()I"); + + int target = env->CallIntMethod(java_place, target_method); + int precision = env->CallIntMethod(java_place, precision_method); + int data_layout = env->CallIntMethod(java_place, data_layout_method); + int device = env->CallIntMethod(java_place, device_method); + + return Place(static_cast<TargetType>(target), + static_cast<PrecisionType>(precision), + static_cast<DataLayoutType>(data_layout), + device); +} + +inline CxxConfig jcxxconfig_to_cpp_cxxconfig(JNIEnv *env, jobject jcxxconfig) { + jclass cxxconfig_jclazz = env->GetObjectClass(jcxxconfig); + + jmethodID model_dir_method = + env->GetMethodID(cxxconfig_jclazz, "getModelDir", "()Ljava/lang/String;"); + jmethodID preferred_place_method = env->GetMethodID( + cxxconfig_jclazz, "getPreferredPlace", "()Lcom/baidu/paddle/lite/Place;"); + jmethodID valid_places_method = env->GetMethodID( + cxxconfig_jclazz, "getValidPlaces", "()[Lcom/baidu/paddle/lite/Place;"); + + CxxConfig config; + + jstring java_model_dir = + (jstring)env->CallObjectMethod(jcxxconfig, model_dir_method); + if (java_model_dir != nullptr) { + std::string cpp_model_dir = jstring_to_cpp_string(env, java_model_dir); + config.set_model_dir(cpp_model_dir); + } + + jobject java_preferred_place = + env->CallObjectMethod(jcxxconfig, preferred_place_method); + if (java_preferred_place != nullptr) { + Place cpp_preferred_place = jplace_to_cpp_place(env, java_preferred_place); + config.set_preferred_place(cpp_preferred_place); + } + + jobject object_valid_places = + env->CallObjectMethod(jcxxconfig, valid_places_method); + jobjectArray *java_valid_places = + reinterpret_cast<jobjectArray *>(&object_valid_places); + if (java_valid_places != nullptr) { + int valid_place_count = env->GetArrayLength(*java_valid_places); + std::vector<Place> cpp_valid_places; + for (int i = 0; i < valid_place_count; ++i) { + jobject jplace = env->GetObjectArrayElement(*java_valid_places, i); + cpp_valid_places.push_back(jplace_to_cpp_place(env, jplace)); + } + config.set_valid_places(cpp_valid_places); + } + + return config; +} + +inline MobileConfig jmobileconfig_to_cpp_mobileconfig(JNIEnv *env, + jobject jmobileconfig) { + jclass mobileconfig_jclazz = env->GetObjectClass(jmobileconfig); + + jmethodID model_dir_method = env->GetMethodID( + mobileconfig_jclazz, "getModelDir", "()Ljava/lang/String;"); + MobileConfig config; + + jstring java_model_dir = + (jstring)env->CallObjectMethod(jmobileconfig, model_dir_method); + if (java_model_dir != nullptr) { + std::string cpp_model_dir = jstring_to_cpp_string(env, java_model_dir); + config.set_model_dir(cpp_model_dir); + } + return config; +} + +} // namespace lite_api +} // namespace paddle + +#endif // PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_CONVERT_UTIL_JNI_H_ diff --git a/paddle/fluid/lite/api/android/jni/native/paddle_init_jni.cc b/paddle/fluid/lite/api/android/jni/native/paddle_init_jni.cc new file mode 100644 index 00000000000..33c43ff3269 --- /dev/null +++ b/paddle/fluid/lite/api/android/jni/native/paddle_init_jni.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/lite/api/android/jni/native/paddle_init_jni.h" + +#include + +#include "paddle/fluid/lite/api/paddle_lite_factory_helper.h" +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/kernels/arm/activation_compute.h" +#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h" +#include "paddle/fluid/lite/kernels/arm/calib_compute.h" +#include "paddle/fluid/lite/kernels/arm/concat_compute.h" +#include "paddle/fluid/lite/kernels/arm/conv_compute.h" +#include "paddle/fluid/lite/kernels/arm/dropout_compute.h" +#include "paddle/fluid/lite/kernels/arm/elementwise_compute.h" +#include "paddle/fluid/lite/kernels/arm/fc_compute.h" +#include "paddle/fluid/lite/kernels/arm/mul_compute.h" +#include "paddle/fluid/lite/kernels/arm/pool_compute.h" +#include "paddle/fluid/lite/kernels/arm/scale_compute.h" +#include "paddle/fluid/lite/kernels/arm/softmax_compute.h" +#include "paddle/fluid/lite/kernels/arm/split_compute.h" +#include "paddle/fluid/lite/kernels/arm/transpose_compute.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ARM_KERNEL_POINTER(kernel_class_name__) \ + std::unique_ptr<paddle::lite::kernels::arm::kernel_class_name__> \ + p##kernel_class_name__( \ + new paddle::lite::kernels::arm::kernel_class_name__); + +namespace paddle { +namespace lite_api { + +/** + * Not sure why, but we have to initialize a pointer for each kernel first; + * otherwise a null-pointer error is thrown during kernel registration + * (KernelRegistor). + */ +static void use_arm_kernels() { + ARM_KERNEL_POINTER(BatchNormCompute); + ARM_KERNEL_POINTER(CalibComputeFp32ToInt8); + ARM_KERNEL_POINTER(CalibComputeInt8ToFp32); + ARM_KERNEL_POINTER(ConvCompute); + ARM_KERNEL_POINTER(ConcatCompute); + ARM_KERNEL_POINTER(ElementwiseAddCompute); + ARM_KERNEL_POINTER(DropoutCompute); + ARM_KERNEL_POINTER(FcCompute); + ARM_KERNEL_POINTER(MulCompute); + ARM_KERNEL_POINTER(PoolCompute); + ARM_KERNEL_POINTER(ReluCompute); + ARM_KERNEL_POINTER(ScaleCompute); + ARM_KERNEL_POINTER(SoftmaxCompute); + ARM_KERNEL_POINTER(SplitCompute); + ARM_KERNEL_POINTER(TransposeCompute); + ARM_KERNEL_POINTER(Transpose2Compute); +} + +JNIEXPORT void JNICALL +Java_com_baidu_paddle_lite_PaddleLiteInitializer_initNative(JNIEnv *env, + jclass jclazz) { + use_arm_kernels(); +} + +} // namespace lite_api +} // namespace paddle + +#ifdef __cplusplus +} +#endif diff --git a/paddle/fluid/lite/api/android/jni/native/paddle_init_jni.h b/paddle/fluid/lite/api/android/jni/native/paddle_init_jni.h new file mode 100644 index 00000000000..4917cd576ce --- /dev/null +++ b/paddle/fluid/lite/api/android/jni/native/paddle_init_jni.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include +/* Header for class com_baidu_paddle_lite_PaddleLiteInitializer */ + +#ifndef PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_PADDLE_INIT_JNI_H_ +#define PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_PADDLE_INIT_JNI_H_ +#ifdef __cplusplus +extern "C" { +#endif + +namespace paddle { +namespace lite_api { + +/* + * Class: com_baidu_paddle_lite_PaddleLiteInitializer + * Method: initNative + * Signature: ()V + */ +JNIEXPORT void JNICALL +Java_com_baidu_paddle_lite_PaddleLiteInitializer_initNative(JNIEnv *, jclass); + +} // namespace lite_api +} // namespace paddle + +#ifdef __cplusplus +} +#endif +#endif // PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_PADDLE_INIT_JNI_H_ diff --git a/paddle/fluid/lite/api/android/jni/native/paddle_lite_jni.cc b/paddle/fluid/lite/api/android/jni/native/paddle_lite_jni.cc new file mode 100644 index 00000000000..b3a322c1cfd --- /dev/null +++ b/paddle/fluid/lite/api/android/jni/native/paddle_lite_jni.cc @@ -0,0 +1,158 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+#include "paddle/fluid/lite/api/android/jni/native/paddle_lite_jni.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/lite/api/android/jni/native/convert_util_jni.h"
+#include "paddle/fluid/lite/api/light_api.h"
+#include "paddle/fluid/lite/api/paddle_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+namespace paddle {
+namespace lite_api {
+
+inline static std::shared_ptr<PaddlePredictor> *getPaddlePredictorPointer(
+    JNIEnv *env, jobject jpaddle_predictor) {
+  jclass jclazz = env->GetObjectClass(jpaddle_predictor);
+  jfieldID jfield = env->GetFieldID(jclazz, "cppPaddlePredictorPointer", "J");
+  jlong java_pointer = env->GetLongField(jpaddle_predictor, jfield);
+  std::shared_ptr<PaddlePredictor> *ptr =
+      reinterpret_cast<std::shared_ptr<PaddlePredictor> *>(java_pointer);
+  return ptr;
+}
+
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_PaddlePredictor_run(
+    JNIEnv *env, jobject jpaddle_predictor) {
+  std::shared_ptr<PaddlePredictor> *predictor =
+      getPaddlePredictorPointer(env, jpaddle_predictor);
+  if (predictor == nullptr || (*predictor == nullptr)) {
+    return JNI_FALSE;
+  }
+  (*predictor)->Run();
+  return JNI_TRUE;
+}
+
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_saveOptimizedModel(
+    JNIEnv *env, jobject jpaddle_predictor, jstring model_dir) {
+  std::shared_ptr<PaddlePredictor> *predictor =
+      getPaddlePredictorPointer(env, jpaddle_predictor);
+  if (predictor == nullptr || (*predictor == nullptr)) {
+    return JNI_FALSE;
+  }
+  (*predictor)->SaveOptimizedModel(jstring_to_cpp_string(env, model_dir));
+  return JNI_TRUE;
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_getInputCppTensorPointer(
+    JNIEnv *env, jobject jpaddle_predictor, jint offset) {
+  std::shared_ptr<PaddlePredictor> *predictor =
+      getPaddlePredictorPointer(env, jpaddle_predictor);
+  if (predictor == nullptr || (*predictor == nullptr)) {
+    return 0;
+  }
+  std::unique_ptr<Tensor> tensor =
+      (*predictor)->GetInput(static_cast<int>(offset));
+  std::unique_ptr<Tensor> *cpp_tensor_pointer =
+      new std::unique_ptr<Tensor>(std::move(tensor));
+  return reinterpret_cast<jlong>(cpp_tensor_pointer);
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_getOutputCppTensorPointer(
+    JNIEnv *env, jobject jpaddle_predictor, jint offset) {
+  std::shared_ptr<PaddlePredictor> *predictor =
+      getPaddlePredictorPointer(env, jpaddle_predictor);
+  if (predictor == nullptr || (*predictor == nullptr)) {
+    return 0;
+  }
+  std::unique_ptr<const Tensor> tensor =
+      (*predictor)->GetOutput(static_cast<int>(offset));
+  std::unique_ptr<const Tensor> *cpp_tensor_pointer =
+      new std::unique_ptr<const Tensor>(std::move(tensor));
+  return reinterpret_cast<jlong>(cpp_tensor_pointer);
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_getCppTensorPointerByName(
+    JNIEnv *env, jobject jpaddle_predictor, jstring name) {
+  std::string cpp_name = jstring_to_cpp_string(env, name);
+  std::shared_ptr<PaddlePredictor> *predictor =
+      getPaddlePredictorPointer(env, jpaddle_predictor);
+  if (predictor == nullptr || (*predictor == nullptr)) {
+    return 0;
+  }
+  std::unique_ptr<const Tensor> tensor = (*predictor)->GetTensor(cpp_name);
+  std::unique_ptr<const Tensor> *cpp_tensor_pointer =
+      new std::unique_ptr<const Tensor>(std::move(tensor));
+  return reinterpret_cast<jlong>(cpp_tensor_pointer);
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\
+paddle_lite_CxxConfig_2(JNIEnv *env, jobject jpaddle_predictor,
+                        jobject jcxxconfig) {
+  CxxConfig config = jcxxconfig_to_cpp_cxxconfig(env, jcxxconfig);
+  std::shared_ptr<PaddlePredictor> predictor =
+      paddle::lite_api::CreatePaddlePredictor(config);
+  if (predictor == nullptr) {
+    return 0;
+  }
+  std::shared_ptr<PaddlePredictor> *predictor_pointer =
+      new std::shared_ptr<PaddlePredictor>(predictor);
+  return reinterpret_cast<jlong>(predictor_pointer);
+}
+
+JNIEXPORT jlong JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\
+paddle_lite_MobileConfig_2(JNIEnv *env, jobject jpaddle_predictor,
+                           jobject jmobileconfig) {
+  MobileConfig config = jmobileconfig_to_cpp_mobileconfig(env, jmobileconfig);
+  std::shared_ptr<PaddlePredictor> predictor =
+      paddle::lite_api::CreatePaddlePredictor(config);
+  if (predictor == nullptr) {
+    return 0;
+  }
+  std::shared_ptr<PaddlePredictor> *predictor_pointer =
+      new std::shared_ptr<PaddlePredictor>(predictor);
+  return reinterpret_cast<jlong>(predictor_pointer);
+}
+
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_deleteCppPaddlePredictor(
+    JNIEnv *env, jobject jpaddle_predictor, jlong java_pointer) {
+  if (java_pointer == 0) {
+    return JNI_FALSE;
+  }
+  std::shared_ptr<PaddlePredictor> *ptr =
+      reinterpret_cast<std::shared_ptr<PaddlePredictor> *>(java_pointer);
+  ptr->reset();
+  delete ptr;
+  return JNI_TRUE;
+}
+
+}  // namespace lite_api
+}  // namespace paddle
+
+#ifdef __cplusplus
+}
+#endif
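Each constructor above returns to Java a `jlong` holding the address of a heap-allocated `std::shared_ptr<PaddlePredictor>`, and `deleteCppPaddlePredictor` is the only call that frees it. A hedged sketch of that contract from the Java side, using the classes added later in this patch (the model path is a placeholder):

```java
import com.baidu.paddle.lite.MobileConfig;
import com.baidu.paddle.lite.PaddlePredictor;

public class LifecycleExample {  // hypothetical example class
    public static void main(String[] args) {
        MobileConfig config = new MobileConfig();
        config.setModelDir("/data/local/tmp/model_opt");  // placeholder path
        // Internally calls the MobileConfig overload of newCppPaddlePredictor
        // and stores the returned jlong in cppPaddlePredictorPointer.
        PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);
        if (predictor == null) {
            return;  // native construction failed: the pointer was 0
        }
        predictor.run();
        // deleteCppPaddlePredictor runs at finalization; the field is reset
        // to 0 so the native shared_ptr is released exactly once.
    }
}
```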
diff --git a/paddle/fluid/lite/api/android/jni/native/paddle_lite_jni.h b/paddle/fluid/lite/api/android/jni/native/paddle_lite_jni.h
new file mode 100644
index 00000000000..0e95043aae4
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/native/paddle_lite_jni.h
@@ -0,0 +1,109 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class com_baidu_paddle_lite_PaddlePredictor */
+
+#ifndef PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_PADDLE_LITE_JNI_H_
+#define PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_PADDLE_LITE_JNI_H_
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+namespace paddle {
+namespace lite_api {
+
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    run
+ * Signature: ()Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_run(JNIEnv *, jobject);
+
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    saveOptimizedModel
+ * Signature: (Ljava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_saveOptimizedModel(JNIEnv *, jobject,
+                                                              jstring);
+
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    getInputCppTensorPointer
+ * Signature: (I)J
+ */
+JNIEXPORT jlong JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_getInputCppTensorPointer(JNIEnv *,
+                                                                    jobject,
+                                                                    jint);
+
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    getOutputCppTensorPointer
+ * Signature: (I)J
+ */
+JNIEXPORT jlong JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_getOutputCppTensorPointer(JNIEnv *,
+                                                                     jobject,
+                                                                     jint);
+
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    getCppTensorPointerByName
+ * Signature: (Ljava/lang/String;)J
+ */
+JNIEXPORT jlong JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_getCppTensorPointerByName(JNIEnv *,
+                                                                     jobject,
+                                                                     jstring);
+
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    newCppPaddlePredictor
+ * Signature: (Lcom/baidu/paddle/lite/CxxConfig;)J
+ */
+JNIEXPORT jlong JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\
+paddle_lite_CxxConfig_2(JNIEnv *, jobject, jobject);
+
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    newCppPaddlePredictor
+ * Signature: (Lcom/baidu/paddle/lite/MobileConfig;)J
+ */
+JNIEXPORT jlong JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_newCppPaddlePredictor__Lcom_baidu_\
+paddle_lite_MobileConfig_2(JNIEnv *, jobject, jobject);
+
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    deleteCppPaddlePredictor
+ * Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_deleteCppPaddlePredictor(JNIEnv *,
+                                                                    jobject,
+                                                                    jlong);
+
+}  // namespace lite_api
+}  // namespace paddle
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_PADDLE_LITE_JNI_H_
diff --git a/paddle/fluid/lite/api/android/jni/native/tensor_jni.cc b/paddle/fluid/lite/api/android/jni/native/tensor_jni.cc
new file mode 100644
index 00000000000..4a1085735aa
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/native/tensor_jni.cc
@@ -0,0 +1,168 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "paddle/fluid/lite/api/android/jni/native/tensor_jni.h"
+
+#include <memory>
+#include <vector>
+
+#include "paddle/fluid/lite/api/android/jni/native/convert_util_jni.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+namespace paddle {
+namespace lite_api {
+
+inline static int64_t product(const std::vector<int64_t> &vec) {
+  if (vec.empty()) {
+    return 0;
+  }
+  int64_t result = 1;
+  for (int64_t d : vec) {
+    result *= d;
+  }
+  return result;
+}
+
+inline static bool is_const_tensor(JNIEnv *env, jobject jtensor) {
+  jclass jclazz = env->GetObjectClass(jtensor);
+  jfieldID jfield = env->GetFieldID(jclazz, "readOnly", "Z");
+  jboolean read_only = env->GetBooleanField(jtensor, jfield);
+  return static_cast<bool>(read_only);
+}
+
+inline static std::unique_ptr<Tensor> *get_writable_tensor_pointer(
+    JNIEnv *env, jobject jtensor) {
+  jclass jclazz = env->GetObjectClass(jtensor);
+  jfieldID jfield = env->GetFieldID(jclazz, "cppTensorPointer", "J");
+  jlong java_pointer = env->GetLongField(jtensor, jfield);
+  std::unique_ptr<Tensor> *ptr =
+      reinterpret_cast<std::unique_ptr<Tensor> *>(java_pointer);
+  return ptr;
+}
+
+inline static std::unique_ptr<const Tensor> *get_read_only_tensor_pointer(
+    JNIEnv *env, jobject jtensor) {
+  jclass jclazz = env->GetObjectClass(jtensor);
+  jfieldID jfield = env->GetFieldID(jclazz, "cppTensorPointer", "J");
+  jlong java_pointer = env->GetLongField(jtensor, jfield);
+  std::unique_ptr<const Tensor> *ptr =
+      reinterpret_cast<std::unique_ptr<const Tensor> *>(java_pointer);
+  return ptr;
+}
+
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_resize(
+    JNIEnv *env, jobject jtensor, jlongArray dims) {
+  std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
+  if (tensor == nullptr || (*tensor == nullptr)) {
+    return JNI_FALSE;
+  }
+  std::vector<int64_t> shape = jlongarray_to_int64_vector(env, dims);
+  (*tensor)->Resize(shape);
+  return JNI_TRUE;
+}
+
+JNIEXPORT jlongArray JNICALL
+Java_com_baidu_paddle_lite_Tensor_shape(JNIEnv *env, jobject jtensor) {
+  if (is_const_tensor(env, jtensor)) {
+    std::unique_ptr<const Tensor> *tensor =
+        get_read_only_tensor_pointer(env, jtensor);
+    std::vector<int64_t> shape = (*tensor)->shape();
+    return int64_vector_to_jlongarray(env, shape);
+  } else {
+    std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
+    std::vector<int64_t> shape = (*tensor)->shape();
+    return int64_vector_to_jlongarray(env, shape);
+  }
+}
+
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_setData___3F(
+    JNIEnv *env, jobject jtensor, jfloatArray buf) {
+  std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
+  if (tensor == nullptr || (*tensor == nullptr)) {
+    return JNI_FALSE;
+  }
+  int64_t buf_size = (int64_t)env->GetArrayLength(buf);
+  if (buf_size != product((*tensor)->shape())) {
+    return JNI_FALSE;
+  }
+
+  float *input = (*tensor)->mutable_data<float>();
+  env->GetFloatArrayRegion(buf, 0, buf_size, input);
+  return JNI_TRUE;
+}
+
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_setData___3B(
+    JNIEnv *env, jobject jtensor, jbyteArray buf) {
+  std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
+  if (tensor == nullptr || (*tensor == nullptr)) {
+    return JNI_FALSE;
+  }
+  int64_t buf_size = (int64_t)env->GetArrayLength(buf);
+  if (buf_size != product((*tensor)->shape())) {
+    return JNI_FALSE;
+  }
+
+  int8_t *input = (*tensor)->mutable_data<int8_t>();
+  env->GetByteArrayRegion(buf, 0, buf_size, input);
+  return JNI_TRUE;
+}
+
+JNIEXPORT jfloatArray JNICALL
+Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *env, jobject jtensor) {
+  if (is_const_tensor(env, jtensor)) {
+    std::unique_ptr<const Tensor> *tensor =
+        get_read_only_tensor_pointer(env, jtensor);
+    return cpp_array_to_jfloatarray(env, (*tensor)->data<float>(),
+                                    product((*tensor)->shape()));
+  } else {
+    std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
+    return cpp_array_to_jfloatarray(env, (*tensor)->data<float>(),
+                                    product((*tensor)->shape()));
+  }
+}
+
+JNIEXPORT jbyteArray JNICALL
+Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *env, jobject jtensor) {
+  if (is_const_tensor(env, jtensor)) {
+    std::unique_ptr<const Tensor> *tensor =
+        get_read_only_tensor_pointer(env, jtensor);
+    return cpp_array_to_jbytearray(env, (*tensor)->data<int8_t>(),
+                                   product((*tensor)->shape()));
+  } else {
+    std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
+    return cpp_array_to_jbytearray(env, (*tensor)->data<int8_t>(),
+                                   product((*tensor)->shape()));
+  }
+}
+
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_deleteCppTensor(
+    JNIEnv *env, jobject jtensor, jlong java_pointer) {
+  if (java_pointer == 0) {
+    return JNI_FALSE;
+  }
+  std::unique_ptr<Tensor> *ptr =
+      reinterpret_cast<std::unique_ptr<Tensor> *>(java_pointer);
+  ptr->reset();
+  delete ptr;
+  return JNI_TRUE;
+}
+
+}  // namespace lite_api
+}  // namespace paddle
+
+#ifdef __cplusplus
+}
+#endif
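The `readOnly` flag on the Java object decides whether the writable or the const pointer getter above runs. A hedged sketch of how this looks from Java, using the classes added later in this patch (the helper method and shapes are illustrative):

```java
import com.baidu.paddle.lite.PaddlePredictor;
import com.baidu.paddle.lite.Tensor;

public class TensorExample {  // hypothetical example class
    // Assumes an already-created predictor; see PaddlePredictor below.
    static float[] infer(PaddlePredictor predictor, float[] data, long[] dims) {
        Tensor input = predictor.getInput(0);    // readOnly == false
        input.resize(dims);                      // writable-pointer path
        input.setData(data);                     // length must equal product(dims)
        predictor.run();
        Tensor output = predictor.getOutput(0);  // readOnly == true
        return output.getFloatData();            // read-only pointer path
    }
}
```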
diff --git a/paddle/fluid/lite/api/android/jni/native/tensor_jni.h b/paddle/fluid/lite/api/android/jni/native/tensor_jni.h
new file mode 100644
index 00000000000..80bf71f1175
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/native/tensor_jni.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class com_baidu_paddle_lite_Tensor */
+
+#ifndef PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
+#define PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+namespace paddle {
+namespace lite_api {
+
+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    resize
+ * Signature: ([J)Z
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_resize(JNIEnv *,
+                                                                    jobject,
+                                                                    jlongArray);
+
+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    shape
+ * Signature: ()[J
+ */
+JNIEXPORT jlongArray JNICALL Java_com_baidu_paddle_lite_Tensor_shape(JNIEnv *,
+                                                                     jobject);
+
+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    setData
+ * Signature: ([F)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_Tensor_setData___3F(JNIEnv *, jobject, jfloatArray);
+
+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    setData
+ * Signature: ([B)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_Tensor_setData___3B(JNIEnv *, jobject, jbyteArray);
+
+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    getFloatData
+ * Signature: ()[F
+ */
+JNIEXPORT jfloatArray JNICALL
+Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *, jobject);
+
+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    getByteData
+ * Signature: ()[B
+ */
+JNIEXPORT jbyteArray JNICALL
+Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *, jobject);
+
+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    deleteCppTensor
+ * Signature: (J)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_Tensor_deleteCppTensor(JNIEnv *, jobject, jlong);
+
+}  // namespace lite_api
+}  // namespace paddle
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
diff --git a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore
new file mode 100644
index 00000000000..870ec275e82
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore
@@ -0,0 +1,2 @@
+/PaddleLite.class
+/PaddleLiteTest.class
diff --git a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java
new file mode 100644
index 00000000000..51115b30167
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/ConfigBase.java
@@ -0,0 +1,31 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+package com.baidu.paddle.lite;
+
+/**
+ * Base class for all configurations.
+ */
+public class ConfigBase {
+
+    protected String modelDir;
+
+    public String getModelDir() {
+        return modelDir;
+    }
+
+    public void setModelDir(String modelDir) {
+        this.modelDir = modelDir;
+    }
+
+}
diff --git a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java
new file mode 100644
index 00000000000..1498a3a492f
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/CxxConfig.java
@@ -0,0 +1,39 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+package com.baidu.paddle.lite;
+
+/**
+ * CxxConfig is the configuration for the full-featured predictor.
+ */
+public class CxxConfig extends ConfigBase {
+
+    Place preferredPlace;
+    Place[] validPlaces;
+
+    public Place getPreferredPlace() {
+        return preferredPlace;
+    }
+
+    public void setPreferredPlace(Place preferredPlace) {
+        this.preferredPlace = preferredPlace;
+    }
+
+    public Place[] getValidPlaces() {
+        return validPlaces;
+    }
+
+    public void setValidPlaces(Place[] validPlaces) {
+        this.validPlaces = validPlaces;
+    }
+}
diff --git a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java
new file mode 100644
index 00000000000..d1aabff0bef
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/MobileConfig.java
@@ -0,0 +1,22 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+package com.baidu.paddle.lite;
+
+/**
+ * MobileConfig is the config for the lightweight predictor; it skips IR
+ * optimization and other stages that are unnecessary at run time.
+ */
+public class MobileConfig extends ConfigBase {
+
+}
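The two configs map to the two execution paths: `CxxConfig` drives the full predictor (with optimization passes) and `MobileConfig` the light one, which expects an already-optimized model. A sketch of building each, using the `Place` and `PrecisionType` classes added later in this patch (paths are placeholders):

```java
import com.baidu.paddle.lite.CxxConfig;
import com.baidu.paddle.lite.MobileConfig;
import com.baidu.paddle.lite.Place;
import com.baidu.paddle.lite.PrecisionType;

public class ConfigExample {  // hypothetical example class
    static CxxConfig fullConfig(String modelDir) {
        CxxConfig config = new CxxConfig();
        config.setModelDir(modelDir);  // raw, unoptimized model
        config.setPreferredPlace(new Place(Place.TargetType.ARM, PrecisionType.FLOAT));
        config.setValidPlaces(new Place[] {
                new Place(Place.TargetType.HOST, PrecisionType.FLOAT),
                new Place(Place.TargetType.ARM, PrecisionType.FLOAT)});
        return config;
    }

    static MobileConfig lightConfig(String optimizedModelDir) {
        MobileConfig config = new MobileConfig();
        config.setModelDir(optimizedModelDir);  // must already be optimized
        return config;
    }
}
```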
diff --git a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java
new file mode 100644
index 00000000000..7a23891b7d6
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PaddleLiteInitializer.java
@@ -0,0 +1,26 @@
+package com.baidu.paddle.lite;
+
+/**
+ * Initializer for PaddleLite.
+ */
+public class PaddleLiteInitializer {
+
+    /** Name of the C++ JNI library. */
+    public final static String JNI_LIB_NAME = "paddle_lite_jni";
+
+    /**
+     * Loads the C++ JNI library.
+     *
+     * @return true if initialization succeeds.
+     */
+    public static boolean init() {
+        System.loadLibrary(JNI_LIB_NAME);
+        initNative();
+        return true;
+    }
+
+    private static native void initNative();
+
+    static {
+        init();
+    }
+}
diff --git a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java
new file mode 100644
index 00000000000..38549d6daf1
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java
@@ -0,0 +1,94 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+package com.baidu.paddle.lite;
+
+/** Java Native Interface (JNI) class for Paddle Lite APIs. */
+public class PaddlePredictor {
+
+    /**
+     * Java has no pointers, so we keep the underlying C++ PaddlePredictor
+     * object alive through a long value that stores its address.
+     */
+    private long cppPaddlePredictorPointer;
+
+    private PaddlePredictor(ConfigBase config) {
+        init(config);
+    }
+
+    public static PaddlePredictor createPaddlePredictor(ConfigBase config) {
+        PaddlePredictor predictor = new PaddlePredictor(config);
+        return predictor.cppPaddlePredictorPointer == 0L ? null : predictor;
+    }
+
+    public Tensor getInput(int offset) {
+        long cppTensorPointer = getInputCppTensorPointer(offset);
+        return cppTensorPointer == 0 ? null : new Tensor(cppTensorPointer, /* readOnly = */ false, this);
+    }
+
+    public Tensor getOutput(int offset) {
+        long cppTensorPointer = getOutputCppTensorPointer(offset);
+        return cppTensorPointer == 0 ? null : new Tensor(cppTensorPointer, /* readOnly = */ true, this);
+    }
+
+    public Tensor getTensor(String name) {
+        long cppTensorPointer = getCppTensorPointerByName(name);
+        return cppTensorPointer == 0 ? null : new Tensor(cppTensorPointer, /* readOnly = */ true, this);
+    }
+
+    public native boolean run();
+
+    public native boolean saveOptimizedModel(String modelDir);
+
+    @Override
+    protected void finalize() throws Throwable {
+        clear();
+        super.finalize();
+    }
+
+    protected boolean init(ConfigBase config) {
+        if (config instanceof CxxConfig) {
+            cppPaddlePredictorPointer = newCppPaddlePredictor((CxxConfig) config);
+        } else if (config instanceof MobileConfig) {
+            cppPaddlePredictorPointer = newCppPaddlePredictor((MobileConfig) config);
+        } else {
+            throw new IllegalArgumentException("Unsupported PaddleLite config type");
+        }
+        return cppPaddlePredictorPointer != 0L;
+    }
+
+    protected boolean clear() {
+        boolean result = false;
+        if (cppPaddlePredictorPointer != 0L) {
+            result = deleteCppPaddlePredictor(cppPaddlePredictorPointer);
+            cppPaddlePredictorPointer = 0L;
+        }
+        return result;
+    }
+
+    private native long getInputCppTensorPointer(int offset);
+
+    private native long getOutputCppTensorPointer(int offset);
+
+    private native long getCppTensorPointerByName(String name);
+
+    private native long newCppPaddlePredictor(CxxConfig config);
+
+    private native long newCppPaddlePredictor(MobileConfig config);
+
+    private native boolean deleteCppPaddlePredictor(long nativePointer);
+
+    static {
+        PaddleLiteInitializer.init();
+    }
+}
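Putting the pieces together, a minimal end-to-end sketch of the API defined above (the model path and shapes are placeholders; a real model dictates both):

```java
import com.baidu.paddle.lite.MobileConfig;
import com.baidu.paddle.lite.PaddlePredictor;
import com.baidu.paddle.lite.Tensor;

public class RunExample {  // hypothetical example class
    public static void main(String[] args) {
        MobileConfig config = new MobileConfig();
        config.setModelDir("/data/local/tmp/model_opt");  // placeholder path

        PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);
        if (predictor == null) {
            throw new RuntimeException("native predictor creation failed");
        }

        Tensor input = predictor.getInput(0);
        input.resize(new long[] {1, 100});  // placeholder shape
        input.setData(new float[100]);      // length must match the shape

        predictor.run();

        float[] result = predictor.getOutput(0).getFloatData();
        System.out.println("output length: " + result.length);
    }
}
```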
diff --git a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java
new file mode 100644
index 00000000000..e08b69da066
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java
@@ -0,0 +1,119 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+package com.baidu.paddle.lite;
+
+/**
+ * Place specifies the execution context of a kernel and of its input/output
+ * variables. It makes the analysis of the MIR clearer and more accurate.
+ */
+public class Place {
+    public enum TargetType {
+        UNKNOWN(0), HOST(1), X86(2), CUDA(3), ARM(4), OPEN_CL(5), ANY(6);
+
+        public final int value;
+
+        private TargetType(int value) {
+            this.value = value;
+        }
+    }
+
+    public enum DataLayoutType {
+        UNKNOWN(0), NCHW(1), ANY(2);
+
+        public final int value;
+
+        private DataLayoutType(int value) {
+            this.value = value;
+        }
+    }
+
+    private TargetType target;
+    private PrecisionType precision;
+    private DataLayoutType layout;
+    private int device;
+
+    public Place() {
+        target = TargetType.UNKNOWN;
+        precision = PrecisionType.UNKNOWN;
+        layout = DataLayoutType.UNKNOWN;
+        device = 0;
+    }
+
+    public Place(TargetType target) {
+        this(target, PrecisionType.FLOAT);
+    }
+
+    public Place(TargetType target, PrecisionType precision) {
+        this(target, precision, DataLayoutType.NCHW);
+    }
+
+    public Place(TargetType target, PrecisionType precision, DataLayoutType layout) {
+        this(target, precision, layout, 0);
+    }
+
+    public Place(TargetType target, PrecisionType precision, DataLayoutType layout, int device) {
+        this.target = target;
+        this.precision = precision;
+        this.layout = layout;
+        this.device = device;
+    }
+
+    public boolean isValid() {
+        return target != TargetType.UNKNOWN && precision != PrecisionType.UNKNOWN && layout != DataLayoutType.UNKNOWN;
+    }
+
+    public TargetType getTarget() {
+        return target;
+    }
+
+    public void setTarget(TargetType target) {
+        this.target = target;
+    }
+
+    public PrecisionType getPrecision() {
+        return precision;
+    }
+
+    public void setPrecision(PrecisionType precision) {
+        this.precision = precision;
+    }
+
+    public DataLayoutType getLayout() {
+        return layout;
+    }
+
+    public void setLayout(DataLayoutType layout) {
+        this.layout = layout;
+    }
+
+    public int getDevice() {
+        return device;
+    }
+
+    public void setDevice(int device) {
+        this.device = device;
+    }
+
+    public int getTargetInt() {
+        return target.value;
+    }
+
+    public int getPrecisionInt() {
+        return precision.value;
+    }
+
+    public int getDataLayoutInt() {
+        return layout.value;
+    }
+}
diff --git a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PrecisionType.java b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PrecisionType.java
new file mode 100644
index 00000000000..40312bda74b
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PrecisionType.java
@@ -0,0 +1,11 @@
+package com.baidu.paddle.lite;
+
+public enum PrecisionType {
+    UNKNOWN(0), FLOAT(1), INT8(2), INT32(3), ANY(4);
+
+    public final int value;
+
+    private PrecisionType(int value) {
+        this.value = value;
+    }
+}
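A short sketch of the `Place` semantics defined above: unspecified fields default to FLOAT precision and NCHW layout, and the `*Int` accessors expose the enum encodings shared with C++ across the JNI boundary (the example class name is hypothetical):

```java
import com.baidu.paddle.lite.Place;

public class PlaceExample {  // hypothetical example class
    public static void main(String[] args) {
        // A target alone already forms a valid Place: FLOAT + NCHW defaults.
        Place arm = new Place(Place.TargetType.ARM);
        System.out.println(arm.isValid());           // true
        System.out.println(new Place().isValid());   // false: everything UNKNOWN
        // Enum encodings match the C++ side: ARM == 4, FLOAT == 1, NCHW == 1.
        System.out.println(arm.getTargetInt() + "/" + arm.getPrecisionInt()
                + "/" + arm.getDataLayoutInt());     // prints 4/1/1
    }
}
```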
diff --git a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java
new file mode 100644
index 00000000000..e3828af9b72
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java
@@ -0,0 +1,57 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+package com.baidu.paddle.lite;
+
+public class Tensor {
+
+    private long cppTensorPointer;
+    private boolean readOnly;
+    private PaddlePredictor predictor;
+
+    /**
+     * Accessible within the package only, so that users cannot create a
+     * Tensor incorrectly: a Tensor can only be created by
+     * {@link com.baidu.paddle.lite.PaddlePredictor}.
+     */
+    protected Tensor(long cppTensorPointer, boolean readOnly, PaddlePredictor predictor) {
+        this.cppTensorPointer = cppTensorPointer;
+        this.readOnly = readOnly;
+        this.predictor = predictor;
+    }
+
+    protected void finalize() throws Throwable {
+        if (cppTensorPointer != 0L) {
+            deleteCppTensor(cppTensorPointer);
+            cppTensorPointer = 0L;
+        }
+        super.finalize();
+    }
+
+    public boolean isReadOnly() {
+        return readOnly;
+    }
+
+    public native boolean resize(long[] dims);
+
+    public native long[] shape();
+
+    public native boolean setData(float[] buf);
+
+    public native boolean setData(byte[] buf);
+
+    public native float[] getFloatData();
+
+    public native byte[] getByteData();
+
+    private native boolean deleteCppTensor(long nativePointer);
+}
\ No newline at end of file
diff --git a/paddle/fluid/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java b/paddle/fluid/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java
new file mode 100644
index 00000000000..a1f0778d33a
--- /dev/null
+++ b/paddle/fluid/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java
@@ -0,0 +1,48 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+package com.baidu.paddle.lite;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.Assert.assertEquals;
+
+class PaddlePredictorTest {
+
+    @Test
+    public void run_defaultModel() {
+        MobileConfig config = new MobileConfig();
+        config.setModelDir("");
+        PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(config);
+
+        float[] inputBuffer = new float[10000];
+        for (int i = 0; i < 10000; ++i) {
+            inputBuffer[i] = i;
+        }
+        long[] dims = { 100, 100 };
+
+        Tensor input = predictor.getInput(0);
+        input.resize(dims);
+        input.setData(inputBuffer);
+
+        predictor.run();
+
+        Tensor output = predictor.getOutput(0);
+        float[] outputBuffer = output.getFloatData();
+
+        assertEquals(outputBuffer.length, 50000);
+        assertEquals(outputBuffer[0], 50.2132f, 1e-3f);
+        assertEquals(outputBuffer[1], -28.8729f, 1e-3f);
+    }
+
+}
diff --git a/paddle/fluid/lite/api/apis_test.cc b/paddle/fluid/lite/api/apis_test.cc
new file mode 100644
index 00000000000..b3694be9053
--- /dev/null
+++ b/paddle/fluid/lite/api/apis_test.cc
@@ -0,0 +1,110 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * We test multiple APIs here.
+ */
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "paddle/fluid/lite/api/cxx_api.h"
+#include "paddle/fluid/lite/api/light_api.h"
+#include "paddle/fluid/lite/api/paddle_use_kernels.h"
+#include "paddle/fluid/lite/api/paddle_use_ops.h"
+#include "paddle/fluid/lite/api/paddle_use_passes.h"
+#include "paddle/fluid/lite/core/mir/pass_registry.h"
+
+DEFINE_string(model_dir, "", "");
+DEFINE_string(optimized_model, "", "");
+
+namespace paddle {
+namespace lite {
+
+void SetConstInput(lite::Tensor* x) {
+  x->Resize(DDim(std::vector<int64_t>({100, 100})));
+  auto* data = x->mutable_data<float>();
+  for (int i = 0; i < 100 * 100; i++) {
+    data[i] = i;
+  }
+}
+
+bool CompareTensors(const std::string& name, const Predictor& cxx_api,
+                    const LightPredictor& light_api) {
+  const auto* a = cxx_api.GetTensor(name);
+  const auto* b = light_api.GetTensor(name);
+  return TensorCompareWith(*a, *b);
+}
+
+TEST(CXXApi_LightApi, optim_model) {
+  lite::Predictor cxx_api;
+  std::vector<Place> valid_places({
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kX86), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},  // Both work on X86 and ARM
+  });
+  // On ARM devices the preferred X86 target does not work, but the
+  // predictor can still select ARM kernels.
+  cxx_api.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
+                valid_places);
+  cxx_api.SaveModel(FLAGS_optimized_model);
+}
+
+TEST(CXXApi_LightApi, save_and_load_model) {
+  lite::Predictor cxx_api;
+  lite::LightPredictor light_api(FLAGS_optimized_model);
+
+  // CXX API
+  {
+    std::vector<Place> valid_places({
+        Place{TARGET(kHost), PRECISION(kFloat)},
+        Place{TARGET(kX86), PRECISION(kFloat)},
+        Place{TARGET(kARM), PRECISION(kFloat)},  // Both work on X86 and ARM
+    });
+    // On ARM devices the preferred X86 target does not work, but the
+    // predictor can still select ARM kernels.
+    cxx_api.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
+                  valid_places);
+
+    auto* x = cxx_api.GetInput(0);
+    SetConstInput(x);
+
+    cxx_api.Run();
+
+    LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model;
+    cxx_api.SaveModel(FLAGS_optimized_model);
+  }
+
+  // LightApi
+  {
+    auto* x = light_api.GetInput(0);
+    SetConstInput(x);
+
+    light_api.Run();
+  }
+
+  const auto* cxx_out = cxx_api.GetOutput(0);
+  const auto* light_out = light_api.GetOutput(0);
+  ASSERT_TRUE(TensorCompareWith(*cxx_out, *light_out));
+
+  std::vector<std::string> tensors_with_order({
+      "a", "fc_0.w_0", "scale_0.tmp_0",
+  });
+
+  for (const auto& tensor_name : tensors_with_order) {
+    ASSERT_TRUE(CompareTensors(tensor_name, cxx_api, light_api));
+  }
+}
+
+}  // namespace lite
+}  // namespace paddle
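The optimize-then-deploy flow exercised by this test is the same one the Java API exposes. A hedged sketch of the Java mirror (directory names are placeholders):

```java
import com.baidu.paddle.lite.CxxConfig;
import com.baidu.paddle.lite.MobileConfig;
import com.baidu.paddle.lite.PaddlePredictor;
import com.baidu.paddle.lite.Place;
import com.baidu.paddle.lite.PrecisionType;

public class SaveLoadExample {  // hypothetical example class
    public static void main(String[] args) {
        // Full API: load the raw model, optimize, and save the result.
        CxxConfig cxx = new CxxConfig();
        cxx.setModelDir("/data/local/tmp/model");  // placeholder
        cxx.setPreferredPlace(new Place(Place.TargetType.ARM, PrecisionType.FLOAT));
        cxx.setValidPlaces(new Place[] {
                new Place(Place.TargetType.HOST, PrecisionType.FLOAT),
                new Place(Place.TargetType.ARM, PrecisionType.FLOAT)});
        PaddlePredictor full = PaddlePredictor.createPaddlePredictor(cxx);
        full.saveOptimizedModel("/data/local/tmp/model_opt");  // placeholder

        // Light API: reload the optimized model without the optimizer.
        MobileConfig mobile = new MobileConfig();
        mobile.setModelDir("/data/local/tmp/model_opt");
        PaddlePredictor light = PaddlePredictor.createPaddlePredictor(mobile);
        light.run();
    }
}
```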
diff --git a/paddle/fluid/lite/api/cxx_api.cc b/paddle/fluid/lite/api/cxx_api.cc
index 1ea8be2c0b5..afb25271a1e 100644
--- a/paddle/fluid/lite/api/cxx_api.cc
+++ b/paddle/fluid/lite/api/cxx_api.cc
@@ -17,17 +17,76 @@
 #include <memory>
 #include <string>
 #include <vector>
-#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
-#include "paddle/fluid/platform/port.h"
-#endif
+#include "paddle/fluid/lite/utils/io.h"
 
 namespace paddle {
 namespace lite {
 
-#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
-void ExecutorLite::SaveModel(const std::string &dir) {
-  MkDirRecursively(dir.c_str());
+void Predictor::SaveModel(const std::string &dir) {
+  MkDirRecur(dir);
   program_->PersistModel(dir, program_desc_);
+  LOG(INFO) << "Save model to " << dir;
+}
+
+lite::Tensor *Predictor::GetInput(size_t offset) {
+  auto *_feed_list = program_->exec_scope()->FindVar("feed");
+  CHECK(_feed_list) << "no feed variable in exec_scope";
+  auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
+  if (offset >= feed_list->size()) {
+    feed_list->resize(offset + 1);
+  }
+  return &feed_list->at(offset);
+}
+
+const lite::Tensor *Predictor::GetOutput(size_t offset) const {
+  auto *_fetch_list = program_->exec_scope()->FindVar("fetch");
+  CHECK(_fetch_list) << "no fetch variable in exec_scope";
+  auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
+  CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
+  return &fetch_list.at(offset);
+}
+
+void Predictor::Build(const std::string &model_path, const Place &prefer_place,
+                      const std::vector<Place> &valid_places,
+                      const std::vector<std::string> &passes) {
+  LoadModel(model_path, scope_.get(), &program_desc_);
+  Build(program_desc_, prefer_place, valid_places, passes);
+}
+
+const framework::proto::ProgramDesc &Predictor::program_desc() const {
+  return program_desc_;
+}
+
+const RuntimeProgram &Predictor::runtime_program() const { return *program_; }
+
+void Predictor::Build(const framework::proto::ProgramDesc &desc,
+                      const Place &prefer_place,
+                      const std::vector<Place> &valid_places,
+                      const std::vector<std::string> &passes) {
+  program_desc_ = desc;
+  Program program(desc, scope_, valid_places);
+
+  optimizer_.KernelPickPreferPlace(prefer_place);
+  core::KernelPickFactor factor;
+  factor.ConsiderTarget();
+  factor.ConsiderPrecision();
+  optimizer_.Run(std::move(program), valid_places, factor, passes);
+  program_ = optimizer_.GenRuntimeProgram();
+}
+
+const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
+  auto *var = program_->exec_scope()->FindVar(name);
+  return &var->Get<lite::Tensor>();
+}
+
+#ifdef LITE_WITH_X86
+void Predictor::FeedVars(const std::vector<framework::Tensor> &tensors) {
+  auto var = scope_->FindVar("feed");
+  auto &feed_list = *(var->GetMutable<std::vector<lite::Tensor>>());
+  feed_list.resize(tensors.size());
+
+  for (size_t i = 0; i < tensors.size(); ++i)
+    feed_list[i].ShareDataWith(tensors[i]);
 }
 #endif
diff --git a/paddle/fluid/lite/api/cxx_api.h b/paddle/fluid/lite/api/cxx_api.h
index 13679413958..5a3b6976ebe 100644
--- a/paddle/fluid/lite/api/cxx_api.h
+++ b/paddle/fluid/lite/api/cxx_api.h
@@ -17,73 +17,65 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include "paddle/fluid/lite/api/paddle_api.h"
 #include "paddle/fluid/lite/core/op_lite.h"
 #include "paddle/fluid/lite/core/optimizer.h"
 #include "paddle/fluid/lite/core/program.h"
 #include "paddle/fluid/lite/core/types.h"
 #include "paddle/fluid/lite/model_parser/model_parser.h"
 
+#ifdef LITE_WITH_X86
+#include "paddle/fluid/framework/program_desc.h"
+#endif
+
 namespace paddle {
 namespace lite {
 
-struct Config {};
-
-class ExecutorLite {
+/*
+ * Predictor for inference: given a model, it optimizes and executes it.
+ */
+class Predictor {
  public:
-  ExecutorLite() { scope_ = std::make_shared<Scope>(); }
-  explicit ExecutorLite(const std::shared_ptr<Scope>& root_scope) {
-    scope_ = root_scope;
-  }
+  // Create an empty predictor.
+  Predictor() { scope_ = std::make_shared<Scope>(); }
+  // Create a predictor with the weight variable scope set.
+  explicit Predictor(const std::shared_ptr<Scope>& root_scope)
+      : scope_(root_scope) {}
 
+  // Build from a model, with places set for hardware config.
   void Build(const std::string& model_path, const Place& prefer_place,
-             const std::vector<Place>& valid_places) {
-    LoadModel(model_path, scope_.get(), &program_desc_);
-    Build(program_desc_, prefer_place, valid_places);
-  }
+             const std::vector<Place>& valid_places,
+             const std::vector<std::string>& passes = {});
 
   void Build(const framework::proto::ProgramDesc& desc,
-             const Place& prefer_place,
-             const std::vector<Place>& valid_places) {
-    program_desc_ = desc;
-    Program program(desc, scope_, valid_places);
-
-    optimizer_.KernelPickPreferPlace(prefer_place);
-    core::KernelPickFactor factor;
-    factor.ConsiderTarget();
-    optimizer_.Run(std::move(program), valid_places, factor);
-    program_ = optimizer_.GenRuntimeProgram();
-  }
+             const Place& prefer_place, const std::vector<Place>& valid_places,
+             const std::vector<std::string>& passes = {});
 
-// This method is disabled in mobile, or unnecessary dependencies required.
-#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
-  void SaveModel(const std::string& dir);
-#endif
+  // Run the predictor for a single batch of data.
+  void Run() { program_->Run(); }
 
-  // Get offset-th col of feed.
-  lite::Tensor* GetInput(size_t offset) {
-    auto* _feed_list = program_->exec_scope()->FindVar("feed");
-    CHECK(_feed_list) << "no feed variable in exec_scope";
-    auto* feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
-    if (offset >= feed_list->size()) {
-      feed_list->resize(offset + 1);
-    }
-    return &feed_list->at(offset);
-  }
+  // Get offset-th col of feed inputs.
+  lite::Tensor* GetInput(size_t offset);
 
-  const lite::Tensor* GetOutput(size_t offset) {
-    auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
-    CHECK(_fetch_list) << "no fatch variable in exec_scope";
-    auto& fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
-    CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
-    return &fetch_list.at(offset);
-  }
+  // Get offset-th col of fetch results.
+  const lite::Tensor* GetOutput(size_t offset) const;
 
-  void Run() { program_->Run(); }
+  const framework::proto::ProgramDesc& program_desc() const;
+  const lite::Tensor* GetTensor(const std::string& name) const;
+  const RuntimeProgram& runtime_program() const;
+
+  // This method is disabled on mobile, where it would pull in unnecessary
+  // dependencies.
+  void SaveModel(const std::string& dir);
 
-  const framework::proto::ProgramDesc& program_desc() const {
-    return program_desc_;
+#ifdef LITE_WITH_X86
+  void Run(const std::vector<framework::Tensor>& tensors) {
+    FeedVars(tensors);
+    program_->Run();
   }
 
+  void FeedVars(const std::vector<framework::Tensor>& tensors);
+#endif
+
  private:
   Optimizer optimizer_;
   framework::proto::ProgramDesc program_desc_;
@@ -91,6 +83,7 @@ class ExecutorLite {
   std::unique_ptr<RuntimeProgram> program_;
 };
 
+#ifdef LITE_WITH_X86
 /*
  * An executor for training.
  *
@@ -114,21 +107,31 @@ class CXXTrainer {
       : scope_(root_scope),
         preferred_place_(preferred_place),
         valid_places_(valid_places),
-        main_program_executor_(ExecutorLite(scope_)) {}
+        main_program_executor_(Predictor(scope_)) {}
 
   // Build the RuntimeProgram cache for the main program. The cache will run
   // multiple times for the epoches.
   // NOTE Just support to execute the 0-th block currently.
-  ExecutorLite& BuildMainProgramExecutor(
-      const framework::proto::ProgramDesc& desc, int block_id = 0) {
+  Predictor& BuildMainProgramExecutor(const framework::proto::ProgramDesc& desc,
+                                      int block_id = 0) {
    main_program_executor_.Build(desc, preferred_place_, valid_places_);
    return main_program_executor_;
  }
 
+#ifdef LITE_WITH_X86
+  Predictor& BuildMainProgramExecutor(framework::ProgramDesc& desc) {  // NOLINT
+    return BuildMainProgramExecutor(*desc.Proto());
+  }
+
+  void RunStartupProgram(framework::ProgramDesc& desc) {  // NOLINT
+    RunStartupProgram(*desc.Proto());
+  }
+#endif
+
   // Run the startup program. It just executes once, no cache needed.
   void RunStartupProgram(const framework::proto::ProgramDesc& desc,
                          int block_id = 0) {
-    ExecutorLite exe(scope_);
+    Predictor exe(scope_);
     exe.Build(desc, preferred_place_, valid_places_);
     exe.Run();
   }
@@ -140,8 +143,9 @@ class CXXTrainer {
   std::vector<Place> valid_places_;
 
   // The training program.
-  ExecutorLite main_program_executor_;
+  Predictor main_program_executor_;
 };
+#endif
 
 }  // namespace lite
 }  // namespace paddle
diff --git a/paddle/fluid/lite/api/cxx_api_bin.cc b/paddle/fluid/lite/api/cxx_api_bin.cc
index f53f6105d1b..dfd3e8ab832 100644
--- a/paddle/fluid/lite/api/cxx_api_bin.cc
+++ b/paddle/fluid/lite/api/cxx_api_bin.cc
@@ -13,32 +13,48 @@
 // limitations under the License.
#include "paddle/fluid/lite/api/cxx_api.h" - -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -#include "paddle/fluid/lite/core/mir/passes.h" -#endif - +#include // NOLINT +#include "paddle/fluid/lite/api/paddle_use_passes.h" #include "paddle/fluid/lite/core/op_registry.h" namespace paddle { namespace lite { -void Run(const char* model_dir) { - lite::ExecutorLite predictor; - std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}}); +using Time = decltype(std::chrono::high_resolution_clock::now()); +Time time() { return std::chrono::high_resolution_clock::now(); } +double time_diff(Time t1, Time t2) { + typedef std::chrono::microseconds ms; + auto diff = t2 - t1; + ms counter = std::chrono::duration_cast(diff); + return counter.count() / 1000.0; +} - predictor.Build(model_dir, Place{TARGET(kARM), PRECISION(kFloat)}, +void Run(const char* model_dir, int repeat) { +#ifdef LITE_WITH_ARM + DeviceInfo::Init(); +#endif + lite::Predictor predictor; + std::vector valid_places({ + Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kInt8)}, + }); + + predictor.Build(model_dir, Place{TARGET(kARM), PRECISION(kInt8)}, valid_places); auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({100, 100}))); + input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); auto* data = input_tensor->mutable_data(); - for (int i = 0; i < 100 * 100; i++) { - data[i] = i; + for (int i = 0; i < input_tensor->dims().production(); i++) { + data[i] = 1; } - predictor.Run(); + auto time1 = time(); + for (int i = 0; i < repeat; i++) predictor.Run(); + auto time2 = time(); + std::cout << " predict cost: " << time_diff(time1, time2) / repeat << "ms" + << std::endl; auto* out = predictor.GetOutput(0); LOG(INFO) << out << " memory size " << out->data_size(); @@ -52,8 +68,8 @@ void Run(const char* model_dir) { } // namespace paddle int main(int argc, char** argv) { - CHECK_EQ(argc, 2) << "usage: ./cmd "; - paddle::lite::Run(argv[1]); + CHECK_EQ(argc, 3) << "usage: ./cmd "; + paddle::lite::Run(argv[1], std::stoi(argv[2])); return 0; } @@ -65,14 +81,40 @@ USE_LITE_OP(feed); USE_LITE_OP(fetch); USE_LITE_OP(io_copy); +USE_LITE_OP(conv2d); +USE_LITE_OP(batch_norm); +USE_LITE_OP(relu); +USE_LITE_OP(depthwise_conv2d); +USE_LITE_OP(pool2d); +USE_LITE_OP(elementwise_add); +USE_LITE_OP(softmax); +USE_LITE_OP(fake_quantize_moving_average_abs_max); +USE_LITE_OP(fake_dequantize_max_abs); + USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); +USE_LITE_OP(calib); #ifdef LITE_WITH_ARM USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out); +USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out); USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); + +USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out); +USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out); +USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); + +USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8); +USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32); + // USE_LITE_KERNEL(feed, 
 // USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
 // USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
 #endif  // LITE_WITH_ARM
diff --git a/paddle/fluid/lite/api/cxx_api_bin_int8.cc b/paddle/fluid/lite/api/cxx_api_bin_int8.cc
new file mode 100644
index 00000000000..4e30cb88b10
--- /dev/null
+++ b/paddle/fluid/lite/api/cxx_api_bin_int8.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/api/cxx_api.h"
+#include <chrono>  // NOLINT
+#include "paddle/fluid/lite/api/paddle_use_kernels.h"
+#include "paddle/fluid/lite/api/paddle_use_ops.h"
+#include "paddle/fluid/lite/api/paddle_use_passes.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+
+using Time = decltype(std::chrono::high_resolution_clock::now());
+Time time() { return std::chrono::high_resolution_clock::now(); }
+double time_diff(Time t1, Time t2) {
+  typedef std::chrono::microseconds ms;
+  auto diff = t2 - t1;
+  ms counter = std::chrono::duration_cast<ms>(diff);
+  return counter.count() / 1000.0;
+}
+
+void Run(const char* model_dir, int repeat) {
+#ifdef LITE_WITH_ARM
+  DeviceInfo::Init();
+  DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, 1);
+#endif
+  lite::Predictor predictor;
+  std::vector<Place> valid_places({
+      Place{TARGET(kHost), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kInt8)},
+  });
+
+  predictor.Build(model_dir, Place{TARGET(kARM), PRECISION(kInt8)},
+                  valid_places);
+
+  auto* input_tensor = predictor.GetInput(0);
+  input_tensor->Resize(DDim(std::vector<int64_t>({1, 3, 224, 224})));
+  auto* data = input_tensor->mutable_data<float>();
+  for (int i = 0; i < input_tensor->dims().production(); i++) {
+    data[i] = 1;
+  }
+
+  for (int i = 0; i < 10; i++) predictor.Run();
+  auto time1 = time();
+  for (int i = 0; i < repeat; i++) predictor.Run();
+  auto time2 = time();
+  std::cout << " predict cost: " << time_diff(time1, time2) / repeat << "ms"
+            << std::endl;
+
+  auto* out = predictor.GetOutput(0);
+  LOG(INFO) << out << " memory size " << out->data_size();
+  LOG(INFO) << "dims " << out->dims();
+  LOG(INFO) << "out data size: " << out->data_size();
+  /*
+  float sum = 0.;
+  for (int i = 0; i < out->data_size(); i++) {
+    LOG(INFO) << "out " << out->data<float>()[i];
+    sum += out->data<float>()[i];
+  }
+  LOG(INFO) << sum;
+  */
+}
+
+}  // namespace lite
+}  // namespace paddle
+
+int main(int argc, char** argv) {
+  CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
+  paddle::lite::Run(argv[1], std::stoi(argv[2]));
+
+  return 0;
+}
diff --git a/paddle/fluid/lite/api/cxx_api_impl.cc b/paddle/fluid/lite/api/cxx_api_impl.cc
new file mode 100644
index 00000000000..83a9f99965f
--- /dev/null
+++ b/paddle/fluid/lite/api/cxx_api_impl.cc
@@ -0,0 +1,87 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/api/cxx_api.h"
+#include "paddle/fluid/lite/api/paddle_api.h"
+
+namespace paddle {
+namespace lite {
+
+class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
+ public:
+  CxxPaddleApiImpl();
+
+  /// Create a new predictor from a config.
+  void Init(const lite_api::CxxConfig &config);
+
+  std::unique_ptr<lite_api::Tensor> GetInput(int i) override;
+
+  std::unique_ptr<const lite_api::Tensor> GetOutput(int i) const override;
+
+  void Run() override;
+
+  std::unique_ptr<const lite_api::Tensor> GetTensor(
+      const std::string &name) const override;
+
+  void SaveOptimizedModel(const std::string &model_dir) override;
+
+ private:
+  Predictor raw_predictor_;
+};
+
+CxxPaddleApiImpl::CxxPaddleApiImpl() {}
+
+void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
+  auto places = config.valid_places();
+  places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
+  raw_predictor_.Build(config.model_dir(), config.preferred_place(), places);
+}
+
+std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInput(int i) {
+  auto *x = raw_predictor_.GetInput(i);
+  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
+}
+
+std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetOutput(
+    int i) const {
+  const auto *x = raw_predictor_.GetOutput(i);
+  return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
+}
+
+void CxxPaddleApiImpl::Run() { raw_predictor_.Run(); }
+
+std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetTensor(
+    const std::string &name) const {
+  auto *x = raw_predictor_.GetTensor(name);
+  return std::unique_ptr<const lite_api::Tensor>(new lite_api::Tensor(x));
+}
+
+void CxxPaddleApiImpl::SaveOptimizedModel(const std::string &model_dir) {
+  raw_predictor_.SaveModel(model_dir);
+}
+
+}  // namespace lite
+
+namespace lite_api {
+
+template <>
+std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(
+    const CxxConfig &config) {
+  auto x = std::make_shared<lite::CxxPaddleApiImpl>();
+  x->Init(config);
+  return x;
+}
+
+}  // namespace lite_api
+}  // namespace paddle
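Since `GetTensor` above is also exposed through JNI, arbitrary model variables can be inspected from Java when a `CxxConfig`-backed predictor is used. A small hedged sketch (the helper and tensor name are purely illustrative):

```java
import com.baidu.paddle.lite.PaddlePredictor;
import com.baidu.paddle.lite.Tensor;

public class NamedTensorExample {  // hypothetical example class
    // CxxPaddleApiImpl::GetTensor backs this lookup through JNI.
    // "fc_0.w_0" is just an illustrative variable name.
    static float[] peek(PaddlePredictor predictor, String name) {
        Tensor t = predictor.getTensor(name);  // read-only view
        return t == null ? null : t.getFloatData();
    }
}
```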
diff --git a/paddle/fluid/lite/api/cxx_api_test.cc b/paddle/fluid/lite/api/cxx_api_test.cc
index 430bd9b58f8..3d1ad5e8511 100644
--- a/paddle/fluid/lite/api/cxx_api_test.cc
+++ b/paddle/fluid/lite/api/cxx_api_test.cc
@@ -16,12 +16,13 @@
 #include <gtest/gtest.h>
 #include <memory>
 #include <vector>
-#include "paddle/fluid/lite/core/mir/passes.h"
+#include "paddle/fluid/lite/api/lite_api_test_helper.h"
+#include "paddle/fluid/lite/api/paddle_use_kernels.h"
+#include "paddle/fluid/lite/api/paddle_use_ops.h"
+#include "paddle/fluid/lite/api/paddle_use_passes.h"
+#include "paddle/fluid/lite/core/compatible_tensor.h"
 #include "paddle/fluid/lite/core/op_registry.h"
 
-DEFINE_string(model_dir, "", "");
-DEFINE_string(optimized_model, "", "");
-
 // For training.
 DEFINE_string(startup_program_path, "", "");
 DEFINE_string(main_program_path, "", "");
@@ -29,48 +30,19 @@ DEFINE_string(main_program_path, "", "");
 namespace paddle {
 namespace lite {
 
+#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 TEST(CXXApi, test) {
-  lite::ExecutorLite predictor;
-#ifndef LITE_WITH_CUDA
-  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
-                                   Place{TARGET(kX86), PRECISION(kFloat)}});
-#else
-  std::vector<Place> valid_places({
-      Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)},
-      Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)},
-      Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)},
-      Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)},
-      Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)},
-      Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)},
-  });
-#endif
-
-  predictor.Build(FLAGS_model_dir,
-                  Place{TARGET(kX86), PRECISION(kFloat)},  // origin cuda
-                  valid_places);
-
-  auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<int64_t>({100, 100})));
-  auto* data = input_tensor->mutable_data<float>();
-  for (int i = 0; i < 100 * 100; i++) {
-    data[i] = i;
-  }
-
-  // LOG(INFO) << "input " << *input_tensor;
-
-  predictor.Run();
-
-  auto* out = predictor.GetOutput(0);
+  const lite::Tensor* out = RunHvyModel();
   LOG(INFO) << out << " memory size " << out->data_size();
-  LOG(INFO) << "out " << out->data<float>()[0];
-  LOG(INFO) << "out " << out->data<float>()[1];
+  for (int i = 0; i < 10; i++) {
+    LOG(INFO) << "out " << out->data<float>()[i];
+  }
   LOG(INFO) << "dims " << out->dims();
   // LOG(INFO) << "out " << *out;
 }
 
-#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 TEST(CXXApi, save_model) {
-  lite::ExecutorLite predictor;
+  lite::Predictor predictor;
   std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                    Place{TARGET(kX86), PRECISION(kFloat)}});
   predictor.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
@@ -79,9 +51,7 @@ TEST(CXXApi, save_model) {
   LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model;
   predictor.SaveModel(FLAGS_optimized_model);
 }
-#endif  // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 
-#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 /*TEST(CXXTrainer, train) {
   Place prefer_place({TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)});
   std::vector<Place> valid_places({prefer_place});
@@ -115,46 +85,18 @@ TEST(CXXApi, save_model) {
 }*/
 #endif  // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 
-}  // namespace lite
-}  // namespace paddle
+#ifdef LITE_WITH_ARM
+TEST(CXXApi, save_model) {
+  lite::Predictor predictor;
+  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
+                                   Place{TARGET(kARM), PRECISION(kFloat)}});
+  predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
+                  valid_places);
 
-USE_LITE_OP(mul);
-USE_LITE_OP(fc);
-USE_LITE_OP(relu);
-USE_LITE_OP(scale);
-USE_LITE_OP(feed);
-USE_LITE_OP(fetch);
-USE_LITE_OP(io_copy);
-USE_LITE_OP(elementwise_add)
-USE_LITE_OP(elementwise_sub)
-USE_LITE_OP(square)
-USE_LITE_OP(softmax)
-USE_LITE_OP(dropout)
-USE_LITE_OP(concat)
-USE_LITE_OP(conv2d)
-USE_LITE_OP(depthwise_conv2d)
-USE_LITE_OP(pool2d)
-USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
-USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
-
-#ifdef LITE_WITH_X86
-USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def);
kNCHW, def); -USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def); -USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def); + LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model; + predictor.SaveModel(FLAGS_optimized_model); +} #endif -#ifdef LITE_WITH_CUDA -USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); -USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); -#endif +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/inceptionv4_test.cc b/paddle/fluid/lite/api/inceptionv4_test.cc new file mode 100644 index 00000000000..45fb79851f8 --- /dev/null +++ b/paddle/fluid/lite/api/inceptionv4_test.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/api/test_helper.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +#ifdef LITE_WITH_ARM +TEST(InceptionV4, test) { + DeviceInfo::Init(); + DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads); + lite::Predictor predictor; + std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}}); + + predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)}, + valid_places); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + // std::vector results({0.00078033, 0.00083865, 0.00060029, 0.00057083, + // 0.00070094, 0.00080584, 0.00044525, 0.00074907, + // 0.00059774, 0.00063654}); + // + std::vector> results; + // i = 1 + results.emplace_back(std::vector( + {0.0011684548, 0.0010390386, 0.0011301535, 0.0010133048, + 0.0010259597, 0.0010982729, 0.00093195855, 0.0009141837, + 0.00096620916, 0.00089982944, 0.0010064574, 0.0010474789, + 0.0009782845, 0.0009230255, 0.0010548076, 0.0010974824, + 
+       0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767}));
+  auto* out = predictor.GetOutput(0);
+  ASSERT_EQ(out->dims().size(), 2);
+  ASSERT_EQ(out->dims()[0], 1);
+  ASSERT_EQ(out->dims()[1], 1000);
+
+  int step = 50;
+  for (int i = 0; i < results.size(); ++i) {
+    for (int j = 0; j < results[i].size(); ++j) {
+      EXPECT_NEAR(out->data<float>()[j * step + (out->dims()[1] * i)],
+                  results[i][j], 1e-6);
+    }
+  }
+}
+#endif
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/api/light_api.cc b/paddle/fluid/lite/api/light_api.cc
index 9d3da3a5919..6a7e20a053c 100644
--- a/paddle/fluid/lite/api/light_api.cc
+++ b/paddle/fluid/lite/api/light_api.cc
@@ -13,3 +13,67 @@
 // limitations under the License.
 
 #include "paddle/fluid/lite/api/light_api.h"
+
+namespace paddle {
+namespace lite {
+
+void LightPredictor::Build(const std::string& model_dir) {
+  framework::proto::ProgramDesc desc;
+  LoadModel(model_dir, scope_.get(), &desc);
+  BuildRuntimeProgram(desc);
+}
+
+Tensor* LightPredictor::GetInput(size_t offset) {
+  auto* _feed_list = program_->exec_scope()->FindVar("feed");
+  CHECK(_feed_list) << "no feed variable in exec_scope";
+  auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
+  if (offset >= feed_list->size()) {
+    feed_list->resize(offset + 1);
+  }
+  return &feed_list->at(offset);
+}
+
+const Tensor* LightPredictor::GetOutput(size_t offset) {
+  auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
+  CHECK(_fetch_list) << "no fetch variable in exec_scope";
+  auto& fetch_list = *_fetch_list->GetMutable<std::vector<Tensor>>();
+  CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
+  return &fetch_list.at(offset);
+}
+
+void LightPredictor::BuildRuntimeProgram(
+    const framework::proto::ProgramDesc& prog) {
+  std::vector<Instruction> insts;
+  // 1. Create op first
+  Program program(prog, scope_, {});
+
+  // 2. Create Instructs
+
+  // Create the kernels of the target places, and filter out the specific
+  // kernel with the target alias.
+  for (auto& op : program.ops()) {
+    auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
+    std::string op_type, alias;
+    Place place;
+    KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
+    auto kernels = op->CreateKernels({place});
+    // filter out a kernel
+    auto it = std::find_if(
+        kernels.begin(), kernels.end(),
+        [&](std::unique_ptr<KernelBase>& it) { return it->alias() == alias; });
+    CHECK(it != kernels.end());
+    (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
+    insts.emplace_back(op, std::move(*it));
+  }
+  program_.reset(new RuntimeProgram(std::move(insts)));
+  CHECK(program.exec_scope());
+  program_->set_exec_scope(program.exec_scope());
+}
+
+LightPredictor::LightPredictor(const std::string& model_dir) {
+  scope_ = std::make_shared<Scope>();
+  Build(model_dir);
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/api/light_api.h b/paddle/fluid/lite/api/light_api.h
index a43755c8738..bf1d7e95a3d 100644
--- a/paddle/fluid/lite/api/light_api.h
+++ b/paddle/fluid/lite/api/light_api.h
@@ -22,6 +22,7 @@
 #include <map>
 #include <memory>
 #include <string>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
 #include "paddle/fluid/lite/core/context.h"
 #include "paddle/fluid/lite/core/program.h"
 #include "paddle/fluid/lite/core/types.h"
@@ -31,66 +32,30 @@ namespace paddle {
 namespace lite {
 
+/*
+ * The light weight predictor, mainly for mobile. It loads an optimized model,
+ * and will not depend on MIR or perform further optimization.
+ */
 class LightPredictor {
  public:
-  LightPredictor() { scope_ = std::make_shared<Scope>(); }
-
-  void Build(const std::string& model_dir) {
-    framework::proto::ProgramDesc desc;
-    LoadModel(model_dir, scope_.get(), &desc);
-    BuildRuntimeProgram(desc);
-  }
+  explicit LightPredictor(const std::string& model_dir);
 
   void Run() { program_->Run(); }
 
-  // Get offset-th col of feed.
-  Tensor* GetInput(size_t offset) {
-    auto* _feed_list = program_->exec_scope()->FindVar("feed");
-    CHECK(_feed_list) << "no feed variable in exec_scope";
-    auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
-    if (offset >= feed_list->size()) {
-      feed_list->resize(offset + 1);
-    }
-    return &feed_list->at(offset);
-  }
+  // Get offset-th col of feed inputs.
+  Tensor* GetInput(size_t offset);
 
-  const Tensor* GetOutput(size_t offset) {
-    auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
-    CHECK(_fetch_list) << "no fatch variable in exec_scope";
-    auto& fetch_list = *_fetch_list->GetMutable<std::vector<Tensor>>();
-    CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
-    return &fetch_list.at(offset);
+  // Get offset-th col of fetch outputs.
+  const Tensor* GetOutput(size_t offset);
+
+  const lite::Tensor* GetTensor(const std::string& name) const {
+    auto* var = program_->exec_scope()->FindVar(name);
+    return &var->Get<lite::Tensor>();
   }
 
  private:
-  void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog) {
-    std::vector<Instruction> insts;
-    // 1. Create op first
-    Program program(prog, scope_, {});
-
-    // 2. Create Instructs
-
-    // Create the kernels of the target places, and filter out the specific
-    // kernel with the target alias.
-    for (auto& op : program.ops()) {
-      auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
-      std::string op_type, alias;
-      Place place;
-      KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
-      auto kernels = op->CreateKernels({place});
-      // filter out a kernel
-      auto it = std::find_if(kernels.begin(), kernels.end(),
-                             [&](std::unique_ptr<KernelBase>& it) {
-                               return it->alias() == alias;
-                             });
-      CHECK(it != kernels.end());
-      (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
-      insts.emplace_back(op, std::move(*it));
-    }
-    program_.reset(new RuntimeProgram(std::move(insts)));
-    CHECK(program.exec_scope());
-    program_->set_exec_scope(program.exec_scope());
-  }
+  void Build(const std::string& model_dir);
+  void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog);
 
  private:
   std::shared_ptr<Scope> scope_;
diff --git a/paddle/fluid/lite/api/light_api_impl.cc b/paddle/fluid/lite/api/light_api_impl.cc
new file mode 100644
index 00000000000..3f69f355736
--- /dev/null
+++ b/paddle/fluid/lite/api/light_api_impl.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
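A minimal usage sketch of the LightPredictor declared above, assuming an optimized model was already saved to the placeholder directory "lite_naive_model_opt" (the 100x100 float input mirrors light_api_test.cc; LOG/CHECK are the glog-style macros used throughout this tree):

#include <vector>
#include "paddle/fluid/lite/api/light_api.h"

// Sketch only: the model directory is a placeholder for output produced
// earlier by SaveOptimizedModel()/SaveModel().
void LightPredictorSketch() {
  paddle::lite::LightPredictor predictor("lite_naive_model_opt");

  auto* input = predictor.GetInput(0);  // 0th feed column
  input->Resize(paddle::lite::DDim(std::vector<int64_t>({100, 100})));
  float* data = input->mutable_data<float>();
  for (int i = 0; i < 100 * 100; i++) data[i] = i;

  predictor.Run();

  const auto* out = predictor.GetOutput(0);  // 0th fetch column
  LOG(INFO) << "out[0] = " << out->data<float>()[0];
}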
+
+#include "paddle/fluid/lite/api/light_api.h"
+#include "paddle/fluid/lite/api/paddle_api.h"
+
+namespace paddle {
+namespace lite_api {
+
+class LightPredictorImpl : public PaddlePredictor {
+ public:
+  LightPredictorImpl() = default;
+
+  std::unique_ptr<Tensor> GetInput(int i) override;
+
+  std::unique_ptr<const Tensor> GetOutput(int i) const override;
+
+  void Run() override;
+
+  std::unique_ptr<const Tensor> GetTensor(
+      const std::string& name) const override;
+
+  void Init(const MobileConfig& config);
+
+ private:
+  std::unique_ptr<lite::LightPredictor> raw_predictor_;
+};
+
+void LightPredictorImpl::Init(const MobileConfig& config) {
+  raw_predictor_.reset(new lite::LightPredictor(config.model_dir()));
+}
+
+std::unique_ptr<Tensor> LightPredictorImpl::GetInput(int i) {
+  return std::unique_ptr<Tensor>(new Tensor(raw_predictor_->GetInput(i)));
+}
+
+std::unique_ptr<const Tensor> LightPredictorImpl::GetOutput(int i) const {
+  return std::unique_ptr<const Tensor>(
+      new Tensor(raw_predictor_->GetOutput(i)));
+}
+
+void LightPredictorImpl::Run() { raw_predictor_->Run(); }
+
+std::unique_ptr<const Tensor> LightPredictorImpl::GetTensor(
+    const std::string& name) const {
+  return std::unique_ptr<const Tensor>(
+      new Tensor(raw_predictor_->GetTensor(name)));
+}
+
+template <>
+std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(
+    const MobileConfig& config) {
+  auto x = std::make_shared<LightPredictorImpl>();
+  x->Init(config);
+  return x;
+}
+
+}  // namespace lite_api
+}  // namespace paddle
diff --git a/paddle/fluid/lite/api/light_api_test.cc b/paddle/fluid/lite/api/light_api_test.cc
index b1e6741e09e..ca34175c6e3 100644
--- a/paddle/fluid/lite/api/light_api_test.cc
+++ b/paddle/fluid/lite/api/light_api_test.cc
@@ -15,6 +15,9 @@
 #include "paddle/fluid/lite/api/light_api.h"
 #include <gtest/gtest.h>
 #include <vector>
+#include "paddle/fluid/lite/api/paddle_use_kernels.h"
+#include "paddle/fluid/lite/api/paddle_use_ops.h"
+#include "paddle/fluid/lite/api/paddle_use_passes.h"
 
 DEFINE_string(optimized_model, "", "");
 
@@ -22,8 +25,10 @@ namespace paddle {
 namespace lite {
 
 TEST(LightAPI, load) {
-  LightPredictor predictor;
-  predictor.Build(FLAGS_optimized_model);
+  if (FLAGS_optimized_model.empty()) {
+    FLAGS_optimized_model = "lite_naive_model";
+  }
+  LightPredictor predictor(FLAGS_optimized_model);
 
   auto* input_tensor = predictor.GetInput(0);
   input_tensor->Resize(DDim(std::vector<int64_t>({100, 100})));
@@ -33,29 +38,14 @@ TEST(LightAPI, load) {
   }
 
   predictor.Run();
+
+  const auto* output = predictor.GetOutput(0);
+  const float* raw_output = output->data<float>();
+
+  for (int i = 0; i < 10; i++) {
+    LOG(INFO) << "out " << raw_output[i];
+  }
 }
 
 }  // namespace lite
 }  // namespace paddle
-
-USE_LITE_OP(mul);
-USE_LITE_OP(fc);
-USE_LITE_OP(scale);
-USE_LITE_OP(feed);
-USE_LITE_OP(fetch);
-USE_LITE_OP(io_copy);
-
-USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
-USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
-
-#ifdef LITE_WITH_X86
-USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def);
-USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def);
-#endif
diff --git a/paddle/fluid/lite/api/lite_api_test_helper.cc b/paddle/fluid/lite/api/lite_api_test_helper.cc
new file mode 100644
index 00000000000..3c0835bc49b
--- /dev/null
+++ b/paddle/fluid/lite/api/lite_api_test_helper.cc
@@ -0,0 +1,60 @@
+// Copyright (c) 2019 PaddlePaddle Authors.
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/api/lite_api_test_helper.h" +#include + +DEFINE_string(model_dir, "", ""); +DEFINE_string(optimized_model, "", ""); + +namespace paddle { +namespace lite { + +const lite::Tensor* RunHvyModel() { + lite::Predictor predictor; +#ifndef LITE_WITH_CUDA + std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); +#else + std::vector valid_places({ + Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)}, + Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)}, + Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)}, + Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)}, + }); +#endif + + predictor.Build(FLAGS_model_dir, + Place{TARGET(kX86), PRECISION(kFloat)}, // origin cuda + valid_places); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({100, 100}))); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < 100 * 100; i++) { + data[i] = i; + } + + // LOG(INFO) << "input " << *input_tensor; + + predictor.Run(); + + const auto* out = predictor.GetOutput(0); + return out; +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/lite_api_test_helper.h b/paddle/fluid/lite/api/lite_api_test_helper.h new file mode 100644 index 00000000000..840de932f01 --- /dev/null +++ b/paddle/fluid/lite/api/lite_api_test_helper.h @@ -0,0 +1,31 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" + +DECLARE_string(model_dir); +DECLARE_string(optimized_model); + +namespace paddle { +namespace lite { + +const lite::Tensor* RunHvyModel(); + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/mobilenetv1_test.cc b/paddle/fluid/lite/api/mobilenetv1_test.cc new file mode 100644 index 00000000000..38863ff6a2c --- /dev/null +++ b/paddle/fluid/lite/api/mobilenetv1_test.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/api/test_helper.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +void TestModel(const std::vector& valid_places, + const Place& preferred_place) { + DeviceInfo::Init(); + DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads); + lite::Predictor predictor; + + predictor.Build(FLAGS_model_dir, preferred_place, valid_places); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + std::vector> results; + // i = 1 + results.emplace_back(std::vector( + {0.00019130898, 9.467885e-05, 0.00015971427, 0.0003650665, + 0.00026431272, 0.00060884043, 0.0002107942, 0.0015819625, + 0.0010323516, 0.00010079765, 0.00011006987, 0.0017364529, + 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, + 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); + auto* out = predictor.GetOutput(0); + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + + int step = 50; + for (int i = 0; i < results.size(); ++i) { + for (int j = 0; j < results[i].size(); ++j) { + EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], + results[i][j], 1e-6); + } + } +} + +TEST(MobileNetV1, test_arm) { + std::vector valid_places({ + Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + // Place{TARGET(kOpenCL), PRECISION(kFloat)}, + }); + + TestModel(valid_places, Place({TARGET(kARM), PRECISION(kFloat)})); +} + +TEST(MobileNetV1, test_opencl) { + std::vector valid_places({ + Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kOpenCL), PRECISION(kFloat)}, + }); + + TestModel(valid_places, Place({TARGET(kOpenCL), PRECISION(kFloat)})); +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/mobilenetv2_test.cc b/paddle/fluid/lite/api/mobilenetv2_test.cc new file mode 100644 index 00000000000..deb0a244b72 --- /dev/null +++ b/paddle/fluid/lite/api/mobilenetv2_test.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle 
Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/api/test_helper.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +#ifdef LITE_WITH_ARM +TEST(MobileNetV2, test) { + DeviceInfo::Init(); + DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads); + lite::Predictor predictor; + std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}}); + + predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)}, + valid_places); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + std::vector> results; + // i = 1 + results.emplace_back(std::vector( + {0.00017082224, 5.699624e-05, 0.000260885, 0.00016412718, + 0.00034818667, 0.00015230637, 0.00032959113, 0.0014772735, + 0.0009059976, 9.5378724e-05, 5.386537e-05, 0.0006427285, + 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, + 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); + auto* out = predictor.GetOutput(0); + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + + int step = 50; + for (int i = 0; i < results.size(); ++i) { + for (int j = 0; j < results[i].size(); ++j) { + EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], + results[i][j], 1e-6); + } + } +} +#endif + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/model_optimize_tool.cc b/paddle/fluid/lite/api/model_optimize_tool.cc new file mode 100644 index 00000000000..92240e4586f --- /dev/null +++ b/paddle/fluid/lite/api/model_optimize_tool.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/api/paddle_api.h"
+#include "paddle/fluid/lite/api/paddle_use_kernels.h"
+#include "paddle/fluid/lite/api/paddle_use_ops.h"
+#include "paddle/fluid/lite/api/paddle_use_passes.h"
+#include "paddle/fluid/lite/utils/string.h"
+
+DEFINE_string(model_dir, "", "path of the model");
+DEFINE_string(optimize_out, "", "path of the output optimized model");
+DEFINE_string(valid_targets, "arm",
+              "The targets this model is optimized for; should be one of (arm, "
+              "opencl, x86), separated by spaces");
+DEFINE_bool(int8_mode, false, "Support Int8 quantization mode");
+
+namespace paddle {
+namespace lite_api {
+
+void Main() {
+  lite_api::CxxConfig config;
+  config.set_model_dir(FLAGS_model_dir);
+
+  std::vector<Place> valid_places;
+  auto target_reprs = lite::Split(FLAGS_valid_targets, " ");
+  for (auto& target_repr : target_reprs) {
+    if (target_repr == "arm") {
+      valid_places.emplace_back(TARGET(kARM));
+    } else if (target_repr == "opencl") {
+      valid_places.emplace_back(TARGET(kOpenCL));
+    } else if (target_repr == "x86") {
+      valid_places.emplace_back(TARGET(kX86));
+    } else {
+      LOG(FATAL) << lite::string_format(
+          "Wrong target '%s' found, please check the command flag "
+          "'valid_targets'",
+          target_repr.c_str());
+    }
+  }
+
+  CHECK(!valid_places.empty())
+      << "At least one target should be set; please set the "
+         "command flag 'valid_targets'";
+  if (FLAGS_int8_mode) {
+    LOG(WARNING) << "Int8 mode is only supported by the ARM target";
+    valid_places.push_back(Place{TARGET(kARM), PRECISION(kInt8)});
+    config.set_preferred_place(Place{TARGET(kARM), PRECISION(kInt8)});
+  }
+  config.set_valid_places(valid_places);
+
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+  predictor->SaveOptimizedModel(FLAGS_optimize_out);
+}
+
+}  // namespace lite_api
+}  // namespace paddle
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, false);
+  paddle::lite_api::Main();
+  return 0;
+}
diff --git a/paddle/fluid/lite/api/model_test.cc b/paddle/fluid/lite/api/model_test.cc
new file mode 100644
index 00000000000..78fe52394c8
--- /dev/null
+++ b/paddle/fluid/lite/api/model_test.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
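The tool above is a thin wrapper over the C++ API; a sketch of the equivalent programmatic flow is below, with placeholder directory names and an ARM-only place list. As in the tool itself, some .cc file in the final binary must include the paddle_use_kernels/ops/passes headers so the needed symbols are linked in.

#include <vector>
#include "paddle/fluid/lite/api/paddle_api.h"

// Sketch of what model_optimize_tool's Main() boils down to; both
// directory names here are placeholders.
void OptimizeModelSketch() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("fluid_model_dir");
  config.set_valid_places({paddle::lite_api::Place{TARGET(kARM)}});

  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  predictor->SaveOptimizedModel("optimized_model_dir");
}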
+ +#include +#include +#include +#include "paddle/fluid/lite/api/paddle_api.h" +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/api/test_helper.h" +#include "paddle/fluid/lite/core/cpu_info.h" +#include "paddle/fluid/lite/utils/string.h" + +namespace paddle { +namespace lite_api { + +void OutputOptModel(const std::string& load_model_dir, + const std::string& save_optimized_model_dir, + const std::vector& input_shape) { + lite_api::CxxConfig config; + config.set_model_dir(load_model_dir); + config.set_preferred_place(Place{TARGET(kX86), PRECISION(kFloat)}); + config.set_valid_places({ + Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + auto predictor = lite_api::CreatePaddlePredictor(config); + + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize(input_shape); + auto* data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shape.size(); ++i) { + input_num *= input_shape[i]; + } + for (int i = 0; i < input_num; ++i) { + data[i] = i; + } + predictor->Run(); + // delete old optimized model + int ret = system( + paddle::lite::string_format("rm -rf %s", save_optimized_model_dir.c_str()) + .c_str()); + if (ret == 0) { + LOG(INFO) << "delete old optimized model " << save_optimized_model_dir; + } + predictor->SaveOptimizedModel(save_optimized_model_dir); + LOG(INFO) << "Load model from " << load_model_dir; + LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; +} + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +void Run(const std::vector& input_shape, const std::string& model_dir, + const int repeat, const int thread_num, const int warmup_times = 10) { + lite::DeviceInfo::Init(); + lite::DeviceInfo::Global().SetRunMode(lite::LITE_POWER_HIGH, thread_num); + lite_api::MobileConfig config; + config.set_model_dir(model_dir); + + auto predictor = lite_api::CreatePaddlePredictor(config); + + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize(input_shape); + float* input_data = input_tensor->mutable_data(); + int input_num = 1; + for (int i = 0; i < input_shape.size(); ++i) { + input_num *= input_shape[i]; + } + for (int i = 0; i < input_num; ++i) { + input_data[i] = i; + } + + for (int i = 0; i < warmup_times; ++i) { + predictor->Run(); + } + + auto start = lite::GetCurrentUS(); + for (int i = 0; i < repeat; ++i) { + predictor->Run(); + } + auto end = lite::GetCurrentUS(); + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << model_dir << ", threads num " << thread_num + << ", warmup: " << warmup_times << ", repeats: " << repeat + << ", spend " << (end - start) / repeat / 1000.0 + << " ms in average."; + + auto output = predictor->GetOutput(0); + const float* out = output->data(); + LOG(INFO) << "out " << out[0]; + LOG(INFO) << "out " << out[1]; + auto output_shape = output->shape(); + int output_num = 1; + for (int i = 0; i < output_shape.size(); ++i) { + output_num *= output_shape[i]; + } + LOG(INFO) << "output_num: " << output_num; +} +#endif + +} // namespace lite_api +} // namespace paddle + +int main(int argc, char** argv) { + if (argc < 4) { + LOG(INFO) << "usage: " << argv[0] << " "; + exit(0); + } + std::string load_model_dir = argv[1]; + std::string save_optimized_model_dir = load_model_dir + "opt2"; + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + int repeat = std::stoi(argv[2]); + int thread_num = 
std::stoi(argv[3]);
+#endif
+
+  std::vector<int64_t> input_shape{1, 3, 224, 224};
+
+  // Output optimized model
+  paddle::lite_api::OutputOptModel(load_model_dir, save_optimized_model_dir,
+                                   input_shape);
+
+#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
+  // Run inference using optimized model
+  paddle::lite_api::Run(input_shape, save_optimized_model_dir, repeat,
+                        thread_num);
+#endif
+
+  return 0;
+}
diff --git a/paddle/fluid/lite/api/paddle_api.cc b/paddle/fluid/lite/api/paddle_api.cc
new file mode 100644
index 00000000000..af78d35b691
--- /dev/null
+++ b/paddle/fluid/lite/api/paddle_api.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/api/paddle_api.h"
+#include "paddle/fluid/lite/api/cxx_api.h"
+#include "paddle/fluid/lite/api/light_api.h"
+
+namespace paddle {
+namespace lite_api {
+
+Tensor::Tensor(void *raw) : raw_tensor_(raw) {}
+
+// TODO(Superjomn) refine this by using another `const void* const_raw`;
+Tensor::Tensor(const void *raw) { raw_tensor_ = const_cast<void *>(raw); }
+
+lite::Tensor *tensor(void *x) { return static_cast<lite::Tensor *>(x); }
+const lite::Tensor *ctensor(void *x) {
+  return static_cast<const lite::Tensor *>(x);
+}
+
+void Tensor::Resize(const shape_t &shape) {
+  tensor(raw_tensor_)->Resize(shape);
+}
+
+template <>
+const float *Tensor::data<float>() const {
+  return ctensor(raw_tensor_)->data<float>();
+}
+template <>
+const int8_t *Tensor::data<int8_t>() const {
+  return ctensor(raw_tensor_)->data<int8_t>();
+}
+
+template <>
+float *Tensor::mutable_data<float>() const {
+  return tensor(raw_tensor_)->mutable_data<float>();
+}
+template <>
+int8_t *Tensor::mutable_data<int8_t>() const {
+  return tensor(raw_tensor_)->mutable_data<int8_t>();
+}
+
+shape_t Tensor::shape() const {
+  return ctensor(raw_tensor_)->dims().Vectorize();
+}
+
+void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir) {
+  LOG(ERROR)
+      << "The SaveOptimizedModel API is only supported by CxxConfig predictor.";
+}
+
+template <typename ConfigT>
+std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT &) {
+  return std::shared_ptr<PaddlePredictor>();
+}
+
+}  // namespace lite_api
+}  // namespace paddle
diff --git a/paddle/fluid/lite/api/paddle_api.h b/paddle/fluid/lite/api/paddle_api.h
new file mode 100644
index 00000000000..28ac6bd799a
--- /dev/null
+++ b/paddle/fluid/lite/api/paddle_api.h
@@ -0,0 +1,110 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * This file defines PaddlePredictor, the API for lite. It supports multiple
+ * hardware including ARM, X86, OpenCL, CUDA and so on.
+ */
+
+#ifndef PADDLE_LITE_API_H_  // NOLINT
+#define PADDLE_LITE_API_H_
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle_place.h"  // NOLINT
+
+namespace paddle {
+namespace lite_api {
+
+using shape_t = std::vector<int64_t>;
+
+struct Tensor {
+  explicit Tensor(void* raw);
+  explicit Tensor(const void* raw);
+
+  void Resize(const shape_t& shape);
+
+  /// Readonly data.
+  template <typename T>
+  const T* data() const;
+
+  template <typename T>
+  T* mutable_data() const;
+
+  /// Shape of the tensor.
+  shape_t shape() const;
+
+ private:
+  void* raw_tensor_;
+};
+
+/// The PaddlePredictor defines the basic interfaces for different kinds of
+/// predictors.
+class PaddlePredictor {
+ public:
+  PaddlePredictor() = default;
+
+  /// Get i-th input.
+  virtual std::unique_ptr<Tensor> GetInput(int i) = 0;
+
+  /// Get i-th output.
+  virtual std::unique_ptr<const Tensor> GetOutput(int i) const = 0;
+
+  virtual void Run() = 0;
+
+  /// Get a readonly tensor; returns null if no tensor called `name` exists.
+  virtual std::unique_ptr<const Tensor> GetTensor(
+      const std::string& name) const = 0;
+
+  /// Persist the optimized model to disk. This API is only supported by
+  /// CxxConfig, and the persisted model can be reused for MobileConfig.
+  virtual void SaveOptimizedModel(const std::string& model_dir);
+
+  virtual ~PaddlePredictor() = default;
+};
+
+/// Base class for all the configs.
+class ConfigBase {
+  std::string model_dir_;
+
+ public:
+  void set_model_dir(const std::string& x) { model_dir_ = x; }
+
+  const std::string& model_dir() const { return model_dir_; }
+};
+
+/// CxxConfig is the config for the Full feature predictor.
+class CxxConfig : public ConfigBase {
+  Place preferred_place_;
+  std::vector<Place> valid_places_;
+
+ public:
+  void set_preferred_place(const Place& x) { preferred_place_ = x; }
+  void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
+
+  const Place& preferred_place() const { return preferred_place_; }
+  const std::vector<Place>& valid_places() const { return valid_places_; }
+};
+
+/// MobileConfig is the config for the light weight predictor; it will skip
+/// IR optimization and other unnecessary stages.
+class MobileConfig : public ConfigBase {};
+
+template <typename ConfigT>
+std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&);
+
+}  // namespace lite_api
+}  // namespace paddle
+
+#endif  // NOLINT
diff --git a/paddle/fluid/lite/api/paddle_api_test.cc b/paddle/fluid/lite/api/paddle_api_test.cc
new file mode 100644
index 00000000000..668d6953003
--- /dev/null
+++ b/paddle/fluid/lite/api/paddle_api_test.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
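One detail worth calling out: paddle_api.cc above only defines the data/mutable_data specializations for float and int8_t, so client code going through this header is restricted to those element types (anything else fails at link time). A minimal sketch of filling an input through this interface:

#include "paddle/fluid/lite/api/paddle_api.h"

// Sketch: Tensor::data<T>()/mutable_data<T>() exist only for T = float and
// T = int8_t (see paddle_api.cc), so e.g. mutable_data<double>() would not
// link.
void FillOnes(paddle::lite_api::PaddlePredictor* predictor) {
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});  // shape_t == std::vector<int64_t>
  float* data = input->mutable_data<float>();
  for (int64_t i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.0f;
}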
+ +#include "paddle/fluid/lite/api/paddle_api.h" +#include +#include +#include +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" + +DEFINE_string(model_dir, "", ""); + +namespace paddle { +namespace lite_api { + +TEST(CxxApi, run) { + lite_api::CxxConfig config; + config.set_model_dir(FLAGS_model_dir); + config.set_preferred_place(Place{TARGET(kX86), PRECISION(kFloat)}); + config.set_valid_places({ + Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + }); + + auto predictor = lite_api::CreatePaddlePredictor(config); + + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize(std::vector({100, 100})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < 100 * 100; i++) { + data[i] = i; + } + + predictor->Run(); + + auto output = predictor->GetOutput(0); + auto* out = output->data(); + LOG(INFO) << out[0]; + LOG(INFO) << out[1]; + + EXPECT_NEAR(out[0], 50.2132, 1e-3); + EXPECT_NEAR(out[1], -28.8729, 1e-3); + + predictor->SaveOptimizedModel(FLAGS_model_dir + ".opt2"); +} + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +TEST(LightApi, run) { + lite_api::MobileConfig config; + config.set_model_dir(FLAGS_model_dir + ".opt2"); + + auto predictor = lite_api::CreatePaddlePredictor(config); + + auto input_tensor = predictor->GetInput(0); + input_tensor->Resize(std::vector({100, 100})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < 100 * 100; i++) { + data[i] = i; + } + + predictor->Run(); + + auto output = predictor->GetOutput(0); + auto* out = output->data(); + LOG(INFO) << out[0]; + LOG(INFO) << out[1]; + + EXPECT_NEAR(out[0], 50.2132, 1e-3); + EXPECT_NEAR(out[1], -28.8729, 1e-3); +} +#endif + +} // namespace lite_api +} // namespace paddle diff --git a/paddle/fluid/lite/api/paddle_lite_factory_helper.h b/paddle/fluid/lite/api/paddle_lite_factory_helper.h new file mode 100644 index 00000000000..544cd0e3130 --- /dev/null +++ b/paddle/fluid/lite/api/paddle_lite_factory_helper.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines some MACROS that explicitly determine the op, kernel, mir + * passes used in the inference lib. 
+ */
+#pragma once
+
+#define USE_LITE_OP(op_type__)                                   \
+  extern int touch_op_##op_type__();                             \
+  int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \
+      touch_op_##op_type__();
+
+#define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \
+  extern int touch_##op_type__##target__##precision__##layout__##alias__();  \
+  int op_type__##target__##precision__##layout__##alias__                    \
+      __attribute__((unused)) =                                              \
+          touch_##op_type__##target__##precision__##layout__##alias__();
+
+#define USE_MIR_PASS(name__)                                   \
+  extern bool mir_pass_registry##name__##_fake();              \
+  static bool mir_pass_usage##name__ __attribute__((unused)) = \
+      mir_pass_registry##name__##_fake();
+
+#define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__
diff --git a/paddle/fluid/lite/api/paddle_place.cc b/paddle/fluid/lite/api/paddle_place.cc
new file mode 100644
index 00000000000..62ab567e8e6
--- /dev/null
+++ b/paddle/fluid/lite/api/paddle_place.cc
@@ -0,0 +1,93 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/api/paddle_place.h"
+#include <sstream>
+#include "paddle/fluid/lite/utils/hash.h"
+
+namespace paddle {
+namespace lite_api {
+
+size_t Place::hash() const {
+  std::hash<int> h;
+  size_t hash = h(static_cast<int>(target));
+  hash = lite::hash_combine(hash, static_cast<int>(precision));
+  hash = lite::hash_combine(hash, static_cast<int>(layout));
+  hash = lite::hash_combine(hash, static_cast<int>(device));
+  return hash;
+}
+
+bool operator<(const Place& a, const Place& b) {
+  if (a.target != b.target) return a.target < b.target;
+  if (a.precision != b.precision) return a.precision < b.precision;
+  if (a.layout != b.layout) return a.layout < b.layout;
+  if (a.device != b.device) return a.device < b.device;
+  return false;
+}
+
+std::string Place::DebugString() const {
+  std::stringstream os;
+  os << TargetToStr(target) << "/" << PrecisionToStr(precision) << "/"
+     << DataLayoutToStr(layout);
+  return os.str();
+}
+
+const std::string& TargetToStr(TargetType target) {
+  static const std::string target2string[] = {"unk", "host", "x86", "cuda",
+                                              "arm", "opencl", "any"};
+  auto x = static_cast<int>(target);
+  CHECK_LT(x, static_cast<int>(TARGET(NUM)));
+  return target2string[x];
+}
+
+const std::string& PrecisionToStr(PrecisionType precision) {
+  static const std::string precision2string[] = {"unk", "float", "int8_t",
+                                                 "int32_t", "int64_t", "any"};
+  auto x = static_cast<int>(precision);
+  CHECK_LT(x, static_cast<int>(PRECISION(NUM)));
+  return precision2string[x];
+}
+
+const std::string& DataLayoutToStr(DataLayoutType layout) {
+  static const std::string datalayout2string[] = {"unk", "NCHW", "any"};
+  auto x = static_cast<int>(layout);
+  CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
+  return datalayout2string[x];
+}
+
+const std::string& TargetRepr(TargetType target) {
+  static const std::string target2string[] = {
+      "kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny"};
+  auto x = static_cast<int>(target);
+  CHECK_LT(x, static_cast<int>(TARGET(NUM)));
+  return target2string[x];
+}
+
+const std::string& PrecisionRepr(PrecisionType precision) {
+  static const std::string precision2string[] = {"kUnk", "kFloat", "kInt8",
+                                                 "kInt32", "kInt64", "kAny"};
+  auto x = static_cast<int>(precision);
+  CHECK_LT(x, static_cast<int>(PRECISION(NUM)));
+  return precision2string[x];
+}
+
+const std::string& DataLayoutRepr(DataLayoutType layout) {
+  static const std::string datalayout2string[] = {"kUnk", "kNCHW", "kAny"};
+  auto x = static_cast<int>(layout);
+  CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
+  return datalayout2string[x];
+}
+
+}  // namespace lite_api
+}  // namespace paddle
diff --git a/paddle/fluid/lite/api/paddle_place.h b/paddle/fluid/lite/api/paddle_place.h
new file mode 100644
index 00000000000..2b7299cffa9
--- /dev/null
+++ b/paddle/fluid/lite/api/paddle_place.h
@@ -0,0 +1,118 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+
+namespace paddle {
+namespace lite_api {
+
+enum class TargetType : int {
+  kUnk = 0,
+  kHost,
+  kX86,
+  kCUDA,
+  kARM,
+  kOpenCL,
+  kAny,  // any target
+  NUM,   // number of fields.
+};
+enum class PrecisionType : int {
+  kUnk = 0,
+  kFloat,
+  kInt8,
+  kInt32,
+  kInt64,
+  kAny,  // any precision
+  NUM,   // number of fields.
+};
+enum class DataLayoutType : int {
+  kUnk = 0,
+  kNCHW,
+  kAny,  // any data layout
+  NUM,   // number of fields.
+};
+
+static size_t PrecisionTypeLength(PrecisionType type) {
+  switch (type) {
+    case PrecisionType::kFloat:
+      return 4;
+    case PrecisionType::kInt8:
+      return 1;
+    case PrecisionType::kInt32:
+      return 4;
+    case PrecisionType::kInt64:
+      return 8;
+    default:
+      return 4;
+  }
+}
+
+#define TARGET(item__) paddle::lite_api::TargetType::item__
+#define PRECISION(item__) paddle::lite_api::PrecisionType::item__
+#define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__
+
+const std::string& TargetToStr(TargetType target);
+
+const std::string& PrecisionToStr(PrecisionType precision);
+
+const std::string& DataLayoutToStr(DataLayoutType layout);
+
+const std::string& TargetRepr(TargetType target);
+
+const std::string& PrecisionRepr(PrecisionType precision);
+
+const std::string& DataLayoutRepr(DataLayoutType layout);
+
+/*
+ * Place specifies the execution context of a Kernel or the input/output for a
+ * kernel. It is used to make the analysis of the MIR more clear and accurate.
+ */
+struct Place {
+  TargetType target{TARGET(kUnk)};
+  PrecisionType precision{PRECISION(kUnk)};
+  DataLayoutType layout{DATALAYOUT(kUnk)};
+  int16_t device{0};  // device ID
+
+  Place() = default;
+  Place(TargetType target, PrecisionType precision = PRECISION(kFloat),
+        DataLayoutType layout = DATALAYOUT(kNCHW), int16_t device = 0)
+      : target(target), precision(precision), layout(layout), device(device) {}
+
+  bool is_valid() const {
+    return target != TARGET(kUnk) && precision != PRECISION(kUnk) &&
+           layout != DATALAYOUT(kUnk);
+  }
+
+  size_t hash() const;
+
+  bool operator==(const Place& other) const {
+    return target == other.target && precision == other.precision &&
+           layout == other.layout && device == other.device;
+  }
+
+  bool operator!=(const Place& other) const { return !(*this == other); }
+
+  friend bool operator<(const Place& a, const Place& b);
+
+  friend std::ostream& operator<<(std::ostream& os, const Place& other) {
+    os << other.DebugString();
+    return os;
+  }
+
+  std::string DebugString() const;
+};
+
+}  // namespace lite_api
+}  // namespace paddle
diff --git a/paddle/fluid/lite/api/paddle_use_kernels.h b/paddle/fluid/lite/api/paddle_use_kernels.h
new file mode 100644
index 00000000000..8ec406efddc
--- /dev/null
+++ b/paddle/fluid/lite/api/paddle_use_kernels.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/*
+ * ATTENTION: this header file can only be included in .cc files.
+ */ + +#pragma once +#include "paddle_lite_factory_helper.h" // NOLINT + +USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); +USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); + +#ifdef LITE_WITH_ARM +USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); + +USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8); +USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32); +USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out); +USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out); +USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out); +USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out); +#endif + +#ifdef LITE_WITH_X86 +USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(softsign, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(reshape, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(reshape2, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(gru, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(reduce_sum, kX86, kFloat, kNCHW, def); +// USE_LITE_KERNEL(lookup_table, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(lookup_table, kX86, kInt64, kNCHW, def); +USE_LITE_KERNEL(sequence_reshape, kX86, kFloat, kNCHW, def); +#endif + +#ifdef LITE_WITH_CUDA +USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def); +USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); +USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host); +#endif + +#ifdef LITE_WITH_OPENCL +USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, def); +USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def); +#endif diff --git a/paddle/fluid/lite/api/paddle_use_ops.h b/paddle/fluid/lite/api/paddle_use_ops.h new file mode 100644 index 00000000000..bcb2dc5439f --- /dev/null +++ b/paddle/fluid/lite/api/paddle_use_ops.h @@ -0,0 +1,51 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// ATTENTION This can only be included in a .cc file.
+
+#include "paddle_lite_factory_helper.h"  // NOLINT
+
+USE_LITE_OP(mul);
+USE_LITE_OP(fc);
+USE_LITE_OP(relu);
+USE_LITE_OP(scale);
+USE_LITE_OP(feed);
+USE_LITE_OP(fetch);
+USE_LITE_OP(io_copy);
+USE_LITE_OP(elementwise_add)
+USE_LITE_OP(elementwise_sub)
+USE_LITE_OP(square)
+USE_LITE_OP(softmax)
+USE_LITE_OP(dropout)
+USE_LITE_OP(concat)
+USE_LITE_OP(conv2d)
+USE_LITE_OP(depthwise_conv2d)
+USE_LITE_OP(pool2d)
+USE_LITE_OP(batch_norm)
+USE_LITE_OP(fusion_elementwise_sub_activation)
+USE_LITE_OP(transpose)
+USE_LITE_OP(transpose2)
+USE_LITE_OP(reshape)
+USE_LITE_OP(reshape2)
+USE_LITE_OP(softsign)
+USE_LITE_OP(gru)
+USE_LITE_OP(reduce_sum)
+USE_LITE_OP(lookup_table)
+USE_LITE_OP(sequence_reshape)
+
+USE_LITE_OP(fake_quantize_moving_average_abs_max);
+USE_LITE_OP(fake_dequantize_max_abs);
+USE_LITE_OP(calib);
diff --git a/paddle/fluid/lite/core/mir/passes.h b/paddle/fluid/lite/api/paddle_use_passes.h
similarity index 73%
rename from paddle/fluid/lite/core/mir/passes.h
rename to paddle/fluid/lite/api/paddle_use_passes.h
index 6e329a19227..4998d87738a 100644
--- a/paddle/fluid/lite/core/mir/passes.h
+++ b/paddle/fluid/lite/api/paddle_use_passes.h
@@ -13,23 +13,23 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/fluid/lite/core/mir/pass_registry.h"
-
-namespace paddle {
-namespace lite {
-namespace mir {}  // namespace mir
-}  // namespace lite
-}  // namespace paddle
+#include "paddle_lite_factory_helper.h"  // NOLINT
 
 USE_MIR_PASS(demo);
-USE_MIR_PASS(lite_fc_fuse_pass);
-USE_MIR_PASS(lite_conv_elementwise_add_act_fuse_pass);
 USE_MIR_PASS(static_kernel_pick_pass);
 USE_MIR_PASS(variable_place_inference_pass);
-USE_MIR_PASS(type_target_transform_pass);
+USE_MIR_PASS(type_target_cast_pass);
 USE_MIR_PASS(generate_program_pass);
 USE_MIR_PASS(io_copy_kernel_pick_pass);
 USE_MIR_PASS(argument_type_display_pass);
 USE_MIR_PASS(runtime_context_assign_pass);
-USE_MIR_PASS(lite_conv_bn_fuse_pass);
 USE_MIR_PASS(graph_visualze);
+
+USE_MIR_PASS(lite_conv_bn_fuse_pass);
+USE_MIR_PASS(lite_fc_fuse_pass);
+USE_MIR_PASS(identity_scale_eliminate_pass);
+USE_MIR_PASS(lite_conv_elementwise_add_activation_fuse_pass);
+USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass);
+USE_MIR_PASS(lite_quant_dequant_fuse_pass);
+USE_MIR_PASS(type_precision_cast_pass);
+// USE_MIR_PASS(trans_weight_pass);
diff --git a/paddle/fluid/lite/api/resnet50_test.cc b/paddle/fluid/lite/api/resnet50_test.cc
new file mode 100644
index 00000000000..a20e5ca3d5b
--- /dev/null
+++ b/paddle/fluid/lite/api/resnet50_test.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/api/test_helper.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +#ifdef LITE_WITH_ARM +TEST(ResNet50, test) { + DeviceInfo::Init(); + DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, FLAGS_threads); + lite::Predictor predictor; + std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}}); + + predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)}, + valid_places); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + std::vector> results; + // i = 1 + results.emplace_back(std::vector( + {0.00024139918, 0.00020566184, 0.00022418296, 0.00041731037, + 0.0005366107, 0.00016948722, 0.00028638865, 0.0009257241, + 0.00072681636, 8.531815e-05, 0.0002129998, 0.0021168243, + 0.006387163, 0.0037145028, 0.0012812682, 0.00045948103, + 0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295})); + auto* out = predictor.GetOutput(0); + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + + int step = 50; + for (int i = 0; i < results.size(); ++i) { + for (int j = 0; j < results[i].size(); ++j) { + EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], + results[i][j], 1e-6); + } + } +} +#endif + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/test_googlenet_lite.cc b/paddle/fluid/lite/api/test_googlenet_lite.cc new file mode 100644 index 00000000000..b878736c1e1 --- /dev/null +++ b/paddle/fluid/lite/api/test_googlenet_lite.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/api/lite_api_test_helper.h" +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" + +// for googlenet +DEFINE_string(model_dir, "", ""); + +namespace paddle { +namespace lite { +#ifdef LITE_WITH_X86 +TEST(CXXApi, test_lite_googlenet) { + lite::Predictor predictor; + std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); + + // LOG(INFO)<<"FLAGS_eval_googlenet_dir:"<Resize(DDim(std::vector({1, 3, 224, 224}))); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < input_tensor->dims().production(); i++) { + data[i] = 1; + } + predictor.Run(); + + auto* out = predictor.GetOutput(0); + std::vector results( + {0.00034298553, 0.0008200012, 0.0005046297, 0.000839279, + 0.00052616704, 0.0003447803, 0.0010877076, 0.00081762316, + 0.0003941339, 0.0011430943, 0.0008892841, 0.00080191303, + 0.0004442384, 0.000658702, 0.0026721435, 0.0013686896, + 0.0005618166, 0.0006556497, 0.0006984528, 0.0014619455}); + for (size_t i = 0; i < results.size(); ++i) { + EXPECT_NEAR(out->data()[i * 51], results[i], 1e-5); + } + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); +} +#endif +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/test_helper.h b/paddle/fluid/lite/api/test_helper.h new file mode 100644 index 00000000000..1a5ab31abd3 --- /dev/null +++ b/paddle/fluid/lite/api/test_helper.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <gflags/gflags.h>
+#include <sys/time.h>
+#include <time.h>
+
+// for eval
+DEFINE_string(model_dir, "", "model dir");
+DEFINE_int32(warmup, 0, "warmup times");
+DEFINE_int32(repeats, 1, "repeats times");
+DEFINE_int32(threads, 1, "threads num");
+
+namespace paddle {
+namespace lite {
+
+// Returns the current wall-clock time in microseconds.
+inline double GetCurrentUS() {
+  struct timeval time;
+  gettimeofday(&time, NULL);
+  return 1e+6 * time.tv_sec + time.tv_usec;
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/api/test_inceptionv4_lite_x86.cc b/paddle/fluid/lite/api/test_inceptionv4_lite_x86.cc
new file mode 100644
index 00000000000..e3b5ead9bdc
--- /dev/null
+++ b/paddle/fluid/lite/api/test_inceptionv4_lite_x86.cc
@@ -0,0 +1,105 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
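The helper above gives every lite benchmark the same command-line surface: --model_dir, --warmup, --repeats and --threads, plus a microsecond clock for the speed reports. A typical run of one of the x86 test binaries therefore looks like the following (binary name and model path are illustrative only):

    ./test_inceptionv4_lite_x86 --model_dir=/path/to/inception_v4 --warmup=10 --repeats=100

The warmup iterations are excluded from the timed loop, so the reported average covers only steady-state runs.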
+ +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/api/lite_api_test_helper.h" +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/api/test_helper.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +TEST(InceptionV4, test_inceptionv4_lite_x86) { + lite::Predictor predictor; + std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); + + // LOG(INFO)<<"FLAGS_eval_googlenet_dir:"< passes( + {"static_kernel_pick_pass", "variable_place_inference_pass", + "type_target_cast_pass", "variable_place_inference_pass", + "io_copy_kernel_pick_pass", "variable_place_inference_pass", + "runtime_context_assign_pass"}); + predictor.Build(model_dir, Place{TARGET(kX86), PRECISION(kFloat)}, + valid_places, passes); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < input_tensor->dims().production(); i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + std::vector> results; + // i = 1 + results.emplace_back(std::vector( + {0.0011684548, 0.0010390386, 0.0011301535, 0.0010133048, + 0.0010259597, 0.0010982729, 0.00093195855, 0.0009141837, + 0.00096620916, 0.00089982944, 0.0010064574, 0.0010474789, + 0.0009782845, 0.0009230255, 0.0010548076, 0.0010974824, + 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); + + auto* out = predictor.GetOutput(0); + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + + int step = 50; + for (int i = 0; i < results.size(); ++i) { + for (int j = 0; j < results[i].size(); ++j) { + EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], + results[i][j], 1e-6); + } + } +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/test_mobilenetv1_lite_x86.cc b/paddle/fluid/lite/api/test_mobilenetv1_lite_x86.cc new file mode 100644 index 00000000000..3a40ea35b57 --- /dev/null +++ b/paddle/fluid/lite/api/test_mobilenetv1_lite_x86.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/api/lite_api_test_helper.h" +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/api/test_helper.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { + lite::Predictor predictor; + std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); + + std::string model_dir = FLAGS_model_dir; + std::vector passes( + {"static_kernel_pick_pass", "variable_place_inference_pass", + "type_target_cast_pass", "variable_place_inference_pass", + "io_copy_kernel_pick_pass", "variable_place_inference_pass", + "runtime_context_assign_pass"}); + predictor.Build(model_dir, Place{TARGET(kX86), PRECISION(kFloat)}, + valid_places, passes); + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < input_tensor->dims().production(); i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + std::vector> results; + // i = 1 + results.emplace_back(std::vector( + {0.00019130898, 9.467885e-05, 0.00015971427, 0.0003650665, + 0.00026431272, 0.00060884043, 0.0002107942, 0.0015819625, + 0.0010323516, 0.00010079765, 0.00011006987, 0.0017364529, + 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, + 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); + auto* out = predictor.GetOutput(0); + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + + int step = 50; + for (int i = 0; i < results.size(); ++i) { + for (int j = 0; j < results[i].size(); ++j) { + EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], + results[i][j], 1e-6); + } + } +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/test_mobilenetv2_lite_x86.cc b/paddle/fluid/lite/api/test_mobilenetv2_lite_x86.cc new file mode 100644 index 00000000000..11a2cfc35f4 --- /dev/null +++ b/paddle/fluid/lite/api/test_mobilenetv2_lite_x86.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/api/lite_api_test_helper.h" +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/api/test_helper.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" +// for googlenet + +namespace paddle { +namespace lite { + +TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { + lite::Predictor predictor; + std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); + + // LOG(INFO)<<"FLAGS_eval_googlenet_dir:"< passes( + {"static_kernel_pick_pass", "variable_place_inference_pass", + "type_target_cast_pass", "variable_place_inference_pass", + "io_copy_kernel_pick_pass", "variable_place_inference_pass", + "runtime_context_assign_pass"}); + predictor.Build(model_dir, Place{TARGET(kX86), PRECISION(kFloat)}, + valid_places, passes); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < input_tensor->dims().production(); i++) { + data[i] = 1; + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + std::vector> results; + // i = 1 + results.emplace_back(std::vector( + {0.00017082224, 5.699624e-05, 0.000260885, 0.00016412718, + 0.00034818667, 0.00015230637, 0.00032959113, 0.0014772735, + 0.0009059976, 9.5378724e-05, 5.386537e-05, 0.0006427285, + 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, + 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); + auto* out = predictor.GetOutput(0); + ASSERT_EQ(out->dims().size(), 2); + ASSERT_EQ(out->dims()[0], 1); + ASSERT_EQ(out->dims()[1], 1000); + + int step = 50; + for (int i = 0; i < results.size(); ++i) { + for (int j = 0; j < results[i].size(); ++j) { + EXPECT_NEAR(out->data()[j * step + (out->dims()[1] * i)], + results[i][j], 1e-6); + } + } +} + +} // namespace lite 
+}  // namespace paddle
diff --git a/paddle/fluid/lite/api/test_step_rnn_lite_x86.cc b/paddle/fluid/lite/api/test_step_rnn_lite_x86.cc
new file mode 100644
index 00000000000..b13e0c840d8
--- /dev/null
+++ b/paddle/fluid/lite/api/test_step_rnn_lite_x86.cc
@@ -0,0 +1,119 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "paddle/fluid/lite/api/cxx_api.h"
+#include "paddle/fluid/lite/api/lite_api_test_helper.h"
+#include "paddle/fluid/lite/api/paddle_use_kernels.h"
+#include "paddle/fluid/lite/api/paddle_use_ops.h"
+#include "paddle/fluid/lite/api/paddle_use_passes.h"
+#include "paddle/fluid/lite/api/test_helper.h"
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+
+TEST(Step_rnn, test_step_rnn_lite_x86) {
+  lite::Predictor predictor;
+  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
+                                   Place{TARGET(kX86), PRECISION(kInt64)},
+                                   Place{TARGET(kX86), PRECISION(kFloat)}});
+
+  std::string model_dir = FLAGS_model_dir;
+  std::vector<std::string> passes(
+      {/*"lite_fc_fuse_pass",*/ "static_kernel_pick_pass",
+       "variable_place_inference_pass", "type_target_cast_pass",
+       "variable_place_inference_pass", "io_copy_kernel_pick_pass",
+       "variable_place_inference_pass", "runtime_context_assign_pass"});
+  predictor.Build(model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
+                  valid_places, passes);
+
+  std::vector<std::string> target_names = {
+      "item_type_id",   "mthid_id",         "source_id_id",
+      "layout_id",      "mark_id",          "category_id",
+      "subcategory_id", "score_segment_id", "item_attention_id",
+      "queue_num_id",   "micro_video_id",   "vertical_type_id"};
+  for (size_t i = 0; i < target_names.size(); i++) {
+    auto* input_tensor = predictor.GetInput(i);
+    int size = 0;
+    if (i == 6 || i == 8) {
+      input_tensor->Resize(lite::DDim(std::vector<int64_t>({5, 1})));
+      input_tensor->raw_tensor().set_lod({{0, 5}});
+      size = 5;
+    } else {
+      input_tensor->Resize(lite::DDim(std::vector<int64_t>({1, 1})));
+      input_tensor->raw_tensor().set_lod({{0, 1}});
+      size = 1;
+    }
+    auto* data = input_tensor->mutable_data<int64_t>();
+    for (int j = 0; j < size; j++) data[j] = 1;
+  }
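+  // The LoD ("level of details") set above encodes variable-length sequences
+  // as offsets into the tensor's first dimension: {{0, 5}} marks one sequence
+  // spanning rows [0, 5). For example (illustrative only), two sequences of
+  // lengths 3 and 2 packed into the same 5 rows would be described by:
+  //   input_tensor->raw_tensor().set_lod({{0, 3, 5}});
+  // Inputs 6 and 8 (subcategory_id, item_attention_id) are the only
+  // sequence-valued features here; the rest are single ids.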
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor.Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor.Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms in average.";
+
+  std::vector<std::vector<float>> results;
+  // i = 1
+  results.emplace_back(std::vector<float>({0.471981, 0.528019}));
+  auto* out = predictor.GetOutput(0);
+  ASSERT_EQ(out->dims().size(), 2);
+  ASSERT_EQ(out->dims()[0], 1);
+  ASSERT_EQ(out->dims()[1], 2);
+
+  for (size_t i = 0; i < results.size(); ++i) {
+    for (size_t j = 0; j < results[i].size(); ++j) {
+      LOG(INFO) << "output[" << i << "]"
+                << "[" << j
+                << "]: " << out->data<float>()[j + (out->dims()[1] * i)];
+      // EXPECT_NEAR(out->data<float>()[j + (out->dims()[1] * i)],
+      //             results[i][j], 1e-6);
+    }
+  }
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/arm/CMakeLists.txt b/paddle/fluid/lite/arm/CMakeLists.txt
index 8abd04b5233..1980267380d 100644
--- a/paddle/fluid/lite/arm/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(math)
+
diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt
index 8af2c33943f..1dc9ab46073 100644
--- a/paddle/fluid/lite/arm/math/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -6,4 +6,77 @@ if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))
   return()
 endif()
 
-cc_library(math_arm SRCS funcs.cc packed_sgemm.cc softmax.cc scale.cc elementwise.cc DEPS ${lite_kernel_deps} eigen3)
+set(HAS_ARM_MATH_LIB_DIR OFF)
+# will search for a prebuilt library named "libmath_arm.${os}.${abi}.${lang}.a"
+if(ARM_MATH_LIB_DIR AND EXISTS "${ARM_MATH_LIB_DIR}")
+  set(arm_math_name "")
+  if(ARM_TARGET_OS STREQUAL "android")
+    if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
+      set(arm_math_name "math_arm.android.armv8")
+    elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
+      set(arm_math_name "math_arm.android.armv7")
+    endif()
+  endif()
+
+  if(ARM_TARGET_OS STREQUAL "armlinux")
+    if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
+      set(arm_math_name "math_arm.armlinux.armv8")
+    elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
+      set(arm_math_name "math_arm.armlinux.armv7")
+    endif()
+  endif()
+
+  if(ARM_TARGET_LANG STREQUAL "clang")
+    set(arm_math_name "${arm_math_name}.clang")
+  else()
+    set(arm_math_name "${arm_math_name}.gcc")
+  endif()
+
+  find_library(math_arm_file ${arm_math_name} ${ARM_MATH_LIB_DIR} NO_DEFAULT_PATH)
+  if(math_arm_file)
+    add_library(math_arm STATIC IMPORTED GLOBAL)
+    set_property(TARGET math_arm PROPERTY IMPORTED_LOCATION ${math_arm_file})
+    message(STATUS "ARM math library imported: ${math_arm_file}")
+    set(HAS_ARM_MATH_LIB_DIR ON)
+  else()
+    message(WARNING "Cannot find ARM math library ${arm_math_name} in ${ARM_MATH_LIB_DIR}")
+  endif()
+endif()
+
+
+if (NOT HAS_ARM_MATH_LIB_DIR)
+  # TODO(xxx): separate these sources and remove the deps on proto and eigen3
+  cc_library(math_arm SRCS
+      funcs.cc
+      packed_sgemm.cc
+      softmax.cc
+      scale.cc
+      pooling.cc
+      elementwise.cc
+      concat.cc
+      sgemv.cc
+      type_trans.cc
+      conv_impl.cc
+      conv_direct_3x3s1.cc
+      conv_direct_3x3s2.cc
+      conv_direct.cc
+      conv_depthwise_3x3_int7.cc
+      conv_depthwise_3x3_int8.cc
+      conv_depthwise_5x5s1_int8.cc
+      conv_depthwise_3x3p0.cc
+      conv_depthwise_3x3p1.cc
+      conv_depthwise_5x5s1.cc
+      conv_depthwise_5x5s2.cc
+      conv_depthwise.cc
+      conv_gemmlike.cc
+      conv_winograd_3x3.cc
+      conv_winograd.cc
+      split.cc
+      activation.cc
+      dropout.cc
+      gemm_prepacked_int8.cc
+      gemv_arm_int8.cc
+      conv3x3s1_direct_int8.cc
+      conv3x3s2_direct_int8.cc
+      DEPS ${lite_kernel_deps}
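+      # Illustrative usage (paths and values hypothetical): configuring with
+      #   cmake .. -DLITE_WITH_ARM=ON -DARM_MATH_LIB_DIR=/path/to/prebuilt \
+      #            -DARM_TARGET_OS=android -DARM_TARGET_ARCH_ABI=armv8
+      # makes the search block above look for
+      #   /path/to/prebuilt/libmath_arm.android.armv8.gcc.a
+      # (or the .clang variant) and skip compiling this source list entirely.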
framework_proto_lite) +endif() diff --git a/paddle/fluid/lite/arm/math/elementwise.cc b/paddle/fluid/lite/arm/math/elementwise.cc deleted file mode 100644 index 68140a5d7db..00000000000 --- a/paddle/fluid/lite/arm/math/elementwise.cc +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/lite/arm/math/elementwise.h" -#include "paddle/fluid/lite/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void elementwise_add(const float* dinx, const float* diny, float* dout, - int num) { - int cnt = num >> 4; - int remain = num % 16; -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* dinx_ptr = dinx + (i << 4); - const float* diny_ptr = diny + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t dinx0 = vld1q_f32(dinx_ptr); - float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4); - float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8); - float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12); - - float32x4_t diny0 = vld1q_f32(diny_ptr); - float32x4_t diny1 = vld1q_f32(diny_ptr + 4); - float32x4_t diny2 = vld1q_f32(diny_ptr + 8); - float32x4_t diny3 = vld1q_f32(diny_ptr + 12); - - float32x4_t vsum0 = vaddq_f32(dinx0, diny0); - float32x4_t vsum1 = vaddq_f32(dinx1, diny1); - float32x4_t vsum2 = vaddq_f32(dinx2, diny2); - float32x4_t vsum3 = vaddq_f32(dinx3, diny3); - - vst1q_f32(dout_ptr, vsum0); - vst1q_f32(dout_ptr + 4, vsum1); - vst1q_f32(dout_ptr + 8, vsum2); - vst1q_f32(dout_ptr + 12, vsum3); - } - if (remain > 0) { - const float* dinx_ptr = dinx + (cnt << 4); - const float* diny_ptr = diny + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *dinx_ptr + *diny_ptr; - dout_ptr++; - dinx_ptr++; - diny_ptr++; - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/funcs.cc b/paddle/fluid/lite/arm/math/funcs.cc deleted file mode 100644 index 4013ac31bfd..00000000000 --- a/paddle/fluid/lite/arm/math/funcs.cc +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/lite/arm/math/funcs.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void fill_bias_fc(float *out, const float *bias, const int num, - const int channel) { - int cnt = channel >> 4; - int remain = channel & 15; - - for (int j = 0; j < num; ++j) { - const float *ptr_bias = bias; - float *ptr_out = out + j * channel; - - float32x4_t vout1; - float32x4_t vout2; - float32x4_t vout3; - float32x4_t vout4; - - for (int i = 0; i < cnt; ++i) { - float32x4_t vin1 = vld1q_f32(ptr_out); - float32x4_t vb1 = vld1q_f32(ptr_bias); - - float32x4_t vin2 = vld1q_f32(ptr_out + 4); - float32x4_t vb2 = vld1q_f32(ptr_bias + 4); - - float32x4_t vin3 = vld1q_f32(ptr_out + 8); - float32x4_t vb3 = vld1q_f32(ptr_bias + 8); - - float32x4_t vin4 = vld1q_f32(ptr_out + 12); - float32x4_t vb4 = vld1q_f32(ptr_bias + 12); - - vout1 = vaddq_f32(vin1, vb1); - vout2 = vaddq_f32(vin2, vb2); - vout3 = vaddq_f32(vin3, vb3); - vout4 = vaddq_f32(vin4, vb4); - - vst1q_f32(ptr_out, vout1); - vst1q_f32(ptr_out + 4, vout2); - vst1q_f32(ptr_out + 8, vout3); - vst1q_f32(ptr_out + 12, vout4); - - ptr_out += 16; - ptr_bias += 16; - } -#if 0 - if (cnt > 0) { - asm( - "1: \n" - "vld1.32 {d0-d1}, [%[ptr_out]] @ load data\n" - "vld1.32 {d2-d3}, [%[ptr_bias]]! @ load data\n" - "vadd.f32 q2, q0, q1 @ add bias\n" - "vst1.32 {d4-d5}, [%[ptr_out]]! @ store result\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 1b @ jump to main loop\n" - :[ptr_out] "+r"(ptr_out), [ptr_bias] "+r"(ptr_bias), \ - [cnt] "+r"(cnt) - : - :"q0", "q1", "q2" - ); - } -#endif - for (int i = 0; i < remain; ++i) { - *(ptr_out++) += *(ptr_bias++); - } - } -} - -template <> -void fill_bias_fc(int *out, const int *bias, const int num, - const int channel) { - int cnt = channel >> 4; - int remain = channel & 15; - - for (int j = 0; j < num; ++j) { - const int *ptr_bias = bias; - int *ptr_out = out + j * channel; - - int32x4_t vout1; - int32x4_t vout2; - int32x4_t vout3; - int32x4_t vout4; - - for (int i = 0; i < cnt; ++i) { - int32x4_t vin1 = vld1q_s32(ptr_out); - int32x4_t vb1 = vld1q_s32(ptr_bias); - - int32x4_t vin2 = vld1q_s32(ptr_out + 4); - int32x4_t vb2 = vld1q_s32(ptr_bias + 4); - - int32x4_t vin3 = vld1q_s32(ptr_out + 8); - int32x4_t vb3 = vld1q_s32(ptr_bias + 8); - - int32x4_t vin4 = vld1q_s32(ptr_out + 12); - int32x4_t vb4 = vld1q_s32(ptr_bias + 12); - - vout1 = vaddq_s32(vin1, vb1); - vout2 = vaddq_s32(vin2, vb2); - vout3 = vaddq_s32(vin3, vb3); - vout4 = vaddq_s32(vin4, vb4); - - vst1q_s32(ptr_out, vout1); - vst1q_s32(ptr_out + 4, vout2); - vst1q_s32(ptr_out + 8, vout3); - vst1q_s32(ptr_out + 12, vout4); - - ptr_out += 16; - ptr_bias += 16; - } - -#if 0 - if (cnt > 0) { - asm( - "1: \n" - "vld1.32 {d0-d1}, [%[ptr_out]] @ load data\n" - "vld1.32 {d2-d3}, [%[ptr_bias]]! @ load data\n" - "vadd.s32 q2, q0, q1 @ add bias\n" - "vst1.32 {d4-d5}, [%[ptr_out]]! @ store result\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 1b @ jump to main loop\n" - :[ptr_out] "+r"(ptr_out), [ptr_bias] "+r"(ptr_bias), \ - [cnt] "+r"(cnt) - : - :"q0", "q1", "q2" - ); - } -#endif - for (int i = 0; i < remain; ++i) { - *(ptr_out++) += *(ptr_bias++); - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/funcs.h b/paddle/fluid/lite/arm/math/funcs.h deleted file mode 100644 index e95506c1a96..00000000000 --- a/paddle/fluid/lite/arm/math/funcs.h +++ /dev/null @@ -1,336 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/lite/arm/math/elementwise.h" -#include "paddle/fluid/lite/arm/math/packed_sgemm.h" -#include "paddle/fluid/lite/arm/math/scale.h" -#include "paddle/fluid/lite/arm/math/softmax.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#define c_inv_mant_mask ~0x7f800000u -#define c_cephes_SQRTHF 0.707106781186547524 -#define c_cephes_log_p0 7.0376836292E-2 -#define c_cephes_log_p1 -1.1514610310E-1 -#define c_cephes_log_p2 1.1676998740E-1 -#define c_cephes_log_p3 -1.2420140846E-1 -#define c_cephes_log_p4 +1.4249322787E-1 -#define c_cephes_log_p5 -1.6668057665E-1 -#define c_cephes_log_p6 +2.0000714765E-1 -#define c_cephes_log_p7 -2.4999993993E-1 -#define c_cephes_log_p8 +3.3333331174E-1 -#define c_cephes_log_q1 -2.12194440e-4 -#define c_cephes_log_q2 0.693359375 - -// natural logarithm computed for 4 simultaneous float -// return NaN for x <= 0 -inline float32x4_t log_ps(float32x4_t x) { - float32x4_t one = vdupq_n_f32(1); - - x = vmaxq_f32(x, vdupq_n_f32(0)); // force flush to zero on denormal values - uint32x4_t invalid_mask = vcleq_f32(x, vdupq_n_f32(0)); - - int32x4_t ux = vreinterpretq_s32_f32(x); - - int32x4_t emm0 = vshrq_n_s32(ux, 23); - - // keep only the fractional part - ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask)); - ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f))); - x = vreinterpretq_f32_s32(ux); - - emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f)); - float32x4_t e = vcvtq_f32_s32(emm0); - - e = vaddq_f32(e, one); - - // part2: - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { - // x = x - 1.0; - // } - // - uint32x4_t mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF)); - float32x4_t tmp = - vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask)); - x = vsubq_f32(x, one); - e = vsubq_f32( - e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask))); - x = vaddq_f32(x, tmp); - - float32x4_t z = vmulq_f32(x, x); - - float32x4_t y = vdupq_n_f32(c_cephes_log_p0); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7)); - y = vmulq_f32(y, x); - y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8)); - y = vmulq_f32(y, x); - - y = vmulq_f32(y, z); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1)); - y = vaddq_f32(y, tmp); - - tmp = vmulq_f32(z, vdupq_n_f32(0.5f)); - y = vsubq_f32(y, tmp); - - tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2)); - x = vaddq_f32(x, y); - x = vaddq_f32(x, tmp); - x = vreinterpretq_f32_u32(vorrq_u32( - 
vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN - return x; -} - -#define c_exp_hi 88.3762626647949f -#define c_exp_lo -88.3762626647949f - -#define c_cephes_LOG2EF 1.44269504088896341 -#define c_cephes_exp_C1 0.693359375 -#define c_cephes_exp_C2 -2.12194440e-4 - -#define c_cephes_exp_p0 1.9875691500E-4 -#define c_cephes_exp_p1 1.3981999507E-3 -#define c_cephes_exp_p2 8.3334519073E-3 -#define c_cephes_exp_p3 4.1665795894E-2 -#define c_cephes_exp_p4 1.6666665459E-1 -#define c_cephes_exp_p5 5.0000001201E-1 - -// exp() computed for 4 float at once -inline float32x4_t exp_ps(float32x4_t x) { - float32x4_t tmp, fx; - - float32x4_t one = vdupq_n_f32(1); - x = vminq_f32(x, vdupq_n_f32(c_exp_hi)); - x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo)); - - // express exp(x) as exp(g + n*log(2)) - fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF)); - - // perform a floorf - tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); - - // if greater, substract 1 - uint32x4_t mask = vcgtq_f32(tmp, fx); - mask = vandq_u32(mask, vreinterpretq_u32_f32(one)); - - fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); - - tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1)); - float32x4_t z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2)); - x = vsubq_f32(x, tmp); - x = vsubq_f32(x, z); - - static const float cephes_exp_p[6] = {c_cephes_exp_p0, c_cephes_exp_p1, - c_cephes_exp_p2, c_cephes_exp_p3, - c_cephes_exp_p4, c_cephes_exp_p5}; - float32x4_t y = vld1q_dup_f32(cephes_exp_p + 0); - float32x4_t c1 = vld1q_dup_f32(cephes_exp_p + 1); - float32x4_t c2 = vld1q_dup_f32(cephes_exp_p + 2); - float32x4_t c3 = vld1q_dup_f32(cephes_exp_p + 3); - float32x4_t c4 = vld1q_dup_f32(cephes_exp_p + 4); - float32x4_t c5 = vld1q_dup_f32(cephes_exp_p + 5); - - y = vmulq_f32(y, x); - z = vmulq_f32(x, x); - - y = vaddq_f32(y, c1); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c2); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c3); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c4); - y = vmulq_f32(y, x); - y = vaddq_f32(y, c5); - - y = vmulq_f32(y, z); - y = vaddq_f32(y, x); - y = vaddq_f32(y, one); - - // build 2^n - int32x4_t mm; - mm = vcvtq_s32_f32(fx); - mm = vaddq_s32(mm, vdupq_n_s32(0x7f)); - mm = vshlq_n_s32(mm, 23); - float32x4_t pow2n = vreinterpretq_f32_s32(mm); - - y = vmulq_f32(y, pow2n); - return y; -} - -#define c_minus_cephes_DP1 -0.78515625 -#define c_minus_cephes_DP2 -2.4187564849853515625e-4 -#define c_minus_cephes_DP3 -3.77489497744594108e-8 -#define c_sincof_p0 -1.9515295891E-4 -#define c_sincof_p1 8.3321608736E-3 -#define c_sincof_p2 -1.6666654611E-1 -#define c_coscof_p0 2.443315711809948E-005 -#define c_coscof_p1 -1.388731625493765E-003 -#define c_coscof_p2 4.166664568298827E-002 -#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI - -// evaluation of 4 sines & cosines at once. -// -// The code is the exact rewriting of the cephes sinf function. -// Precision is excellent as long as x < 8192 (I did not bother to -// take into account the special handling they have for greater values -// -- it does not return garbage for arguments over 8192, though, but -// the extra precision is missing). -// -// Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the -// surprising but correct result. -// -// Note also that when you compute sin(x), cos(x) is available at -// almost no extra price so both sin_ps and cos_ps make use of -// sincos_ps.. 
-// -inline void sincos_ps(float32x4_t x, float32x4_t* ysin, float32x4_t* ycos) { - // any x - float32x4_t xmm1, xmm2, xmm3, y; - - uint32x4_t emm2; - - uint32x4_t sign_mask_sin, sign_mask_cos; - sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0)); - x = vabsq_f32(x); - - // scale by 4/Pi - y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI)); - - // store the integer part of y in mm0 - emm2 = vcvtq_u32_f32(y); - // j=(j+1) & (~1) (see the cephes sources) - emm2 = vaddq_u32(emm2, vdupq_n_u32(1)); - emm2 = vandq_u32(emm2, vdupq_n_u32(~1)); - y = vcvtq_f32_u32(emm2); - - // get the polynom selection mask - // there is one polynom for 0 <= x <= Pi/4 - // and another one for Pi/4 -void fill_bias_fc(T* tensor, const T* bias, const int num, const int channel); - -template -void fc_compute_eigen(const T* x, int x_h, int x_w, // - const T* w, int w_h, int w_w, // - const T* b, // - T* out) { - using matrix_t = - Eigen::Matrix; - - Eigen::Map X(x, x_h, x_w); - Eigen::Map W(w, w_h, w_w); - Eigen::Map Out(out, x_h, w_w); - - Out = X * W; - - if (b) { - Eigen::Map> B(b, w_w); - Out = Out.array().rowwise() + B.array(); - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/packed_sgemm.cc b/paddle/fluid/lite/arm/math/packed_sgemm.cc deleted file mode 100644 index 1028d371d3c..00000000000 --- a/paddle/fluid/lite/arm/math/packed_sgemm.cc +++ /dev/null @@ -1,3049 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/lite/arm/math/packed_sgemm.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#ifdef __aarch64__ -void prepackA_8x12(float *out, const float *in, const int ldin, const int m0, - const int mmax, const int k0, const int kmax); -void prepackA_trans_8x12(float *out, const float *in, const int ldin, - const int m0, const int mmax, const int k0, - const int kmax); -void sgemm_conv_8x12(const float *A_packed, const float *B, const float *bias, - float *C, int M, int N, int K, bool is_bias, bool is_relu, - bool transB, ARMContext *ctx); -#else -// for kA72 -void prepackA_6x8(float *out, const float *in, const int ldin, const int m0, - const int mmax, const int k0, const int kmax); -void prepackA_trans_6x8(float *out, const float *in, const int ldin, - const int m0, const int mmax, const int k0, - const int kmax); -// for kA73 -void prepackA_4x8(float *out, const float *in, const int ldin, const int m0, - const int mmax, const int k0, const int kmax); -void prepackA_trans_4x8(float *out, const float *in, const int ldin, - const int m0, const int mmax, const int k0, - const int kmax); -// for kA72, 6x8 -void sgemm_conv_6x8(const float *A_packed, const float *B, const float *bias, - float *C, int M, int N, int K, bool is_bias, bool is_relu, - bool transB, ARMContext *ctx); -// for kA73, 4x8 -void sgemm_conv_4x8(const float *A_packed, const float *B, const float *bias, - float *C, int M, int N, int K, bool is_bias, bool is_relu, - bool transB, ARMContext *ctx); -#endif // __aarch64__ - -/** - * \brief input data is not transpose - * for arm-v7a, transform data to block x k x 6 layout - * for arm-v8a, transform data to block x k x 8 layout - */ -void prepackA(float *out, const float *in, const int ldin, const int m0, - const int mmax, const int k0, const int kmax, bool is_trans, - ARMContext *ctx) { -#ifdef __aarch64__ - if (is_trans) { - prepackA_trans_8x12(out, in, ldin, m0, mmax, k0, kmax); - } else { - prepackA_8x12(out, in, ldin, m0, mmax, k0, kmax); - } -#else - if (ctx->arch() == kA73) { - if (is_trans) { - prepackA_trans_4x8(out, in, ldin, m0, mmax, k0, kmax); - } else { - prepackA_4x8(out, in, ldin, m0, mmax, k0, kmax); - } - } else { - if (is_trans) { - prepackA_trans_6x8(out, in, ldin, m0, mmax, k0, kmax); - } else { - prepackA_6x8(out, in, ldin, m0, mmax, k0, kmax); - } - } -#endif -} - -void prepackA(TensorLite *tout, const TensorLite &tin, int m, int k, int group, - bool is_trans, ARMContext *ctx) { - int hblock = get_hblock(ctx->arch()); - int m_roundup = hblock * ((m + hblock - 1) / hblock); - int group_size_round_up = ((m_roundup * k + 15) / 16) * 16; - if (tout->numel() < group_size_round_up * group) { - tout->Resize({group_size_round_up * group}); - } - int lda = k; - if (is_trans) { - lda = m; - } - for (int g = 0; g < group; ++g) { - const float *weights_group = tin.data() + g * m * k; - float *weights_trans_ptr = - tout->mutable_data() + g * group_size_round_up; - prepackA(weights_trans_ptr, weights_group, lda, 0, m, 0, k, is_trans, ctx); - } -} - -/// a: m*k b: k*n c: m*n -void sgemm_prepack(const float *A_packed, const float *B, const float *bias, - float *C, int M, int N, int K, bool is_bias, bool is_relu, - bool is_transB, ARMContext *ctx) { -#ifdef __aarch64__ - sgemm_conv_8x12(A_packed, B, bias, C, M, N, K, is_bias, is_relu, is_transB, - ctx); -#else // armv7 - if (ctx->arch() == kA73) { - sgemm_conv_4x8(A_packed, B, bias, C, M, N, K, is_bias, is_relu, is_transB, - ctx); - } else { - sgemm_conv_6x8(A_packed, 
B, bias, C, M, N, K, is_bias, is_relu, is_transB, - ctx); - } -#endif // arm64 -} - -#ifdef __aarch64__ -void prepackA_8x12(float *out, const float *in, const int ldin, const int m0, - const int mmax, const int k0, const int kmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(uint32_t) * x_len); - - uint32_t *dout = reinterpret_cast(out); - const uint32_t *inptr = reinterpret_cast(in); - - int stride = x_len * 8; -#pragma omp parallel for - for (int y = m0; y < mmax; y += 8) { - uint32_t *outptr = dout + stride * (y - m0) / 8; - - const uint32_t *inptr0 = inptr + y * ldin + k0; - const uint32_t *inptr1 = inptr0 + ldin; - const uint32_t *inptr2 = inptr1 + ldin; - const uint32_t *inptr3 = inptr2 + ldin; - const uint32_t *inptr4 = inptr3 + ldin; - const uint32_t *inptr5 = inptr4 + ldin; - const uint32_t *inptr6 = inptr5 + ldin; - const uint32_t *inptr7 = inptr6 + ldin; - - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - "prfm pldl1keep, [%[ptr4]] \n" - "prfm pldl1keep, [%[ptr4], #64] \n" - "prfm pldl1keep, [%[ptr5]] \n" - "prfm pldl1keep, [%[ptr5], #64] \n" - "prfm pldl1keep, [%[ptr6]] \n" - "prfm pldl1keep, [%[ptr6], #64] \n" - "prfm pldl1keep, [%[ptr7]] \n" - "prfm pldl1keep, [%[ptr7], #64] \n" - : - : [ptr0] "r"(inptr0), [ptr1] "r"(inptr1), [ptr2] "r"(inptr2), - [ptr3] "r"(inptr3), [ptr4] "r"(inptr4), [ptr5] "r"(inptr5), - [ptr6] "r"(inptr6), [ptr7] "r"(inptr7) - : "memory"); - - int x = x_len; - //! cope with row index exceed real size, set to zero buffer - if ((y + 7) >= mmax) { - switch ((y + 7) - mmax) { - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - default: - break; - } - } - for (; x > 7; x -= 8) { - asm volatile( - // Load up 8 elements (2 vectors) from each of 8 sources. 
- "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3 - "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3 - "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3 - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - "prfm pldl1keep, [%[inptr0], #128] \n" - "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3 - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDP q8, q9, [%[inptr4]], #32\n" - "LDP q10, q11, [%[inptr5]], #32\n" - "LDP q12, q13, [%[inptr6]], #32\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr1], #128]\n" - "LDP q14, q15, [%[inptr7]], #32\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - "prfm pldl1keep, [%[inptr2], #128]\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - "prfm pldl1keep, [%[inptr3], #128]\n" - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Write back the first - // element of each source - - "ZIP2 v18.4s, v8.4s, v12.4s\n" - "ZIP2 v19.4s, v10.4s, v14.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Write back the second - // element of each source - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr4], #128]\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr5], #128]\n" - "ZIP1 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Third element - - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "ZIP1 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr6], #128]\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - - "ZIP2 v18.4s, v9.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr7], #128]\n" - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), - [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", - "v20", "v21", "v22", "v23", "cc", "memory"); - } - - for (; x > 0; x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - } - } -} - -void prepackA_trans_8x12(float *out, const float *in, const int ldin, - const int m0, const int mmax, const int k0, - const int kmax) { - uint32_t *outptr = reinterpret_cast(out); - const uint32_t *inptr = - reinterpret_cast(in) + k0 * ldin + m0; - - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = mmax - m0; - int y_len = kmax - k0; - int right_remain = x_len - 8 * (x_len / 8); - int right_pad = 8 - right_remain; - if (right_remain == 0) { - right_pad = 0; - } - - uint32_t *outptr_row = outptr; - int stride_out = 8 * y_len; - - uint32x4_t vzero = vdupq_n_u32(0); - 
uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const uint32_t *ptr0 = inptr + y * ldin; - const uint32_t *ptr1 = ptr0 + ldin; - const uint32_t *ptr2 = ptr1 + ldin; - const uint32_t *ptr3 = ptr2 + ldin; - - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - : - : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) - : "memory"); - - uint32_t *outptr_row_col = outptr_row + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - uint32x4_t vr00 = vld1q_u32(ptr0); - uint32x4_t vr01 = vld1q_u32(ptr0 + 4); - - uint32x4_t vr10 = vld1q_u32(ptr1); - uint32x4_t vr11 = vld1q_u32(ptr1 + 4); - - vst1q_u32(outptr_row_col, vr00); - vst1q_u32(outptr_row_col + 4, vr01); - - uint32x4_t vr20 = vld1q_u32(ptr2); - uint32x4_t vr21 = vld1q_u32(ptr2 + 4); - - vst1q_u32(outptr_row_col + 8, vr10); - vst1q_u32(outptr_row_col + 12, vr11); - - uint32x4_t vr30 = vld1q_u32(ptr3); - uint32x4_t vr31 = vld1q_u32(ptr3 + 4); - - vst1q_u32(outptr_row_col + 16, vr20); - vst1q_u32(outptr_row_col + 20, vr21); - - vst1q_u32(outptr_row_col + 24, vr30); - vst1q_u32(outptr_row_col + 28, vr31); - - ptr0 += 8; - ptr1 += 8; - ptr2 += 8; - ptr3 += 8; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32x4_t vr00 = vld1q_u32(ptr0); - uint32x4_t vr01 = vld1q_u32(ptr0 + 4); - - uint32x4_t vr10 = vld1q_u32(ptr1); - uint32x4_t vr11 = vld1q_u32(ptr1 + 4); - - uint32x4_t vr00_1 = vbslq_u32(vmask1, vr00, vzero); - uint32x4_t vr01_1 = vbslq_u32(vmask2, vr01, vzero); - - uint32x4_t vr20 = vld1q_u32(ptr2); - uint32x4_t vr21 = vld1q_u32(ptr2 + 4); - - vst1q_u32(outptr_row_col, vr00_1); - vst1q_u32(outptr_row_col + 4, vr01_1); - - uint32x4_t vr10_1 = vbslq_u32(vmask1, vr10, vzero); - uint32x4_t vr11_1 = vbslq_u32(vmask2, vr11, vzero); - - uint32x4_t vr30 = vld1q_u32(ptr3); - uint32x4_t vr31 = vld1q_u32(ptr3 + 4); - - vst1q_u32(outptr_row_col + 8, vr10_1); - vst1q_u32(outptr_row_col + 12, vr11_1); - - uint32x4_t vr20_1 = vbslq_u32(vmask1, vr20, vzero); - uint32x4_t vr21_1 = vbslq_u32(vmask2, vr21, vzero); - - uint32x4_t vr30_1 = vbslq_u32(vmask1, vr30, vzero); - uint32x4_t vr31_1 = vbslq_u32(vmask2, vr31, vzero); - - vst1q_u32(outptr_row_col + 16, vr20_1); - vst1q_u32(outptr_row_col + 20, vr21_1); - vst1q_u32(outptr_row_col + 24, vr30_1); - vst1q_u32(outptr_row_col + 28, vr31_1); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const uint32_t *ptr0 = inptr + y * ldin; - uint32_t *outptr_row_col = outptr_row + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - uint32x4_t vr0 = vld1q_u32(ptr0); - uint32x4_t vr1 = vld1q_u32(ptr0 + 4); - vst1q_u32(outptr_row_col, vr0); - vst1q_u32(outptr_row_col + 4, vr1); - - ptr0 += 8; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32x4_t vr0 = vld1q_u32(ptr0); - uint32x4_t vr1 = vld1q_u32(ptr0 + 4); - - uint32x4_t vr0_1 = vbslq_u32(vmask1, vr0, vzero); - uint32x4_t vr1_1 = vbslq_u32(vmask2, vr1, vzero); - - vst1q_u32(outptr_row_col, vr0_1); - vst1q_u32(outptr_row_col + 4, vr1_1); - } - } -} - -#else // __aarch64__ -void prepackA_6x8(float* out, const float* in, const int 
ldin, const int m0, - const int mmax, const int k0, const int kmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(uint32_t) * x_len); - - uint32_t* dout = reinterpret_cast(out); - const uint32_t* inptr = reinterpret_cast(in); - - uint32_t* outptr = dout; - - //! data A is not transposed, transpose A to k * 6 - for (int y = m0; y < mmax; y += 6) { - const uint32_t* inptr0 = inptr + y * ldin + k0; - const uint32_t* inptr1 = inptr0 + ldin; - const uint32_t* inptr2 = inptr1 + ldin; - const uint32_t* inptr3 = inptr2 + ldin; - const uint32_t* inptr4 = inptr3 + ldin; - const uint32_t* inptr5 = inptr4 + ldin; - - int x = x_len; - //! cope with row index exceed real size, set to zero buffer - if ((y + 5) >= mmax) { - switch ((y + 5) - mmax) { - case 4: - inptr1 = zerobuff; - case 3: - inptr2 = zerobuff; - case 2: - inptr3 = zerobuff; - case 1: - inptr4 = zerobuff; - case 0: - inptr5 = zerobuff; - default: - break; - } - } - - for (; x > 7; x -= 8) { - //! zip load 8 elements (2 neon Q registers) from each of 6 rows - asm volatile( - "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " - "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" - "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " - "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" - "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " - "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" - "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " - "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" - "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, " - "q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" - "vld4.32 {d20-d23}, [%[inptr5]]! @ zip load r5, " - "q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" - - "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " - "q2=r04,r14,r05,r15\n" - "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " - "q6=r24,r34,r25,r35\n" - "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; " - "q10=r44,r54,r45,r55\n" - - "vswp d1, d8 @ swap d1, d8, q0=r00,r10,r20,r30; " - "q4=r01,r11,r21,r31\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write q0:r00,r10,r20,r30\n" - "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" - "vst1.32 {d8-d9}, [%[outptr]]! @ write q4:r01,r11,r21,r31\n" - "vst1.32 {d17}, [%[outptr]]! @ write d16(q8,high),r41,r51\n" - - "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " - "q3=r06,r16,r07,r17\n" - "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " - "q7=r26,r36,r27,r37\n" - "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; " - "q11=r46,r56,r47,r57\n" - - "vswp d3, d10 @ swap d3, d10, " - "q1=r02,r12,r22,r32; q5=r03,r13,r23,r33\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write q1:r02,r12,r22,r32\n" - "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" - "vst1.32 {d10-d11},[%[outptr]]! @ write q5:r03,r13,r23,r33\n" - "vst1.32 {d19}, [%[outptr]]! @ write d19(q9,high),r43,r53\n" - - "vswp d5, d12 @ swap d5, d12,q2=r04,r14,r24,r34; " - "q6=r05,r15,r25,r35\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write q2:r04,r14,r24,r34\n" - "vst1.32 {d20}, [%[outptr]]! @ write d20(q10,low),r44,r54\n" - "vst1.32 {d12-d13},[%[outptr]]! @ write q6:r05,r15,r25,r35\n" - "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" - - "vswp d7, d14 @ swap d7, d14, " - "q3=r06,r16,r26,r36; q7=r07,r17,r27,r37\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write q3:r06,r16,r26,r36\n" - "vst1.32 {d22}, [%[outptr]]! @ write d22(q11,low),r46,r56\n" - "vst1.32 {d14-d15},[%[outptr]]! @ write q7:r07,r17,r27,r37\n" - "vst1.32 {d23}, [%[outptr]]! 
@ write d23(q11,high),r47,r57\n" - : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), - [outptr] "+r"(outptr) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "cc", "memory"); - } - - for (; x > 0; x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - } - } -} - -void prepackA_trans_6x8(float* out, const float* in, const int ldin, - const int m0, const int mmax, const int k0, - const int kmax) { - uint32_t* outptr = reinterpret_cast(out); - const uint32_t* inptr = - reinterpret_cast(in) + k0 * ldin + m0; - - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = mmax - m0; - int y_len = kmax - k0; - int right_remain = x_len - 6 * (x_len / 6); - int right_pad = 6 - right_remain; - if (right_remain == 0) { - right_pad = 0; - } - - uint32_t* outptr_row = outptr; - int stride_out = 6 * y_len; - - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const uint32_t* ptr0 = inptr + y * ldin; - const uint32_t* ptr1 = ptr0 + ldin; - const uint32_t* ptr2 = ptr1 + ldin; - const uint32_t* ptr3 = ptr2 + ldin; - - uint32_t* outptr_row_col = outptr_row + y * 6; - int i = 0; - for (; i < x_len - 5; i += 6) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "vld1.32 {d4-d6}, [%[ptr1]]! @ load r1, 6 elements\n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" - - "vld1.32 {d0-d2}, [%[ptr2]]! @ load r2, 6 elements\n" - "vld1.32 {d4-d6}, [%[ptr3]]! @ load r3, 6 elements\n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : - : "q0", "q1", "q2", "q3", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "vld1.32 {d4-d6}, [%[ptr1]]! @ load r1, 6 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d6, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d6}, [%[outptr]]! @ write to output ptr\n" - - "vld1.32 {d0-d2}, [%[ptr2]]! @ load r2, 8 elements\n" - "vld1.32 {d4-d6}, [%[ptr3]]! @ load r3, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d6, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d6}, [%[outptr]]! 
@ write to output ptr\n" - : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) - : "q0", "q1", "q2", "q3", "cc", "memory"); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const uint32_t* ptr0 = inptr + y * ldin; - uint32_t* outptr_row_col = outptr_row + y * 6; - int i = 0; - for (; i < x_len - 5; i += 6) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : - : "q0", "q1", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d2}, [%[ptr0]]! @ load r0, 6 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif d2, %e[vzero], %e[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d2}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) - : "q0", "q1", "cc", "memory"); - } - } -} - -void prepackA_4x8(float* out, const float* in, const int ldin, const int m0, - const int mmax, const int k0, const int kmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(uint32_t) * x_len); - - uint32_t* dout = reinterpret_cast(out); - const uint32_t* inptr = reinterpret_cast(in); - - uint32_t* outptr = dout; - //! data A is not transposed, transpose A to k * 4 - for (int y = m0; y < mmax; y += 4) { - const uint32_t* inptr0 = inptr + y * ldin + k0; - const uint32_t* inptr1 = inptr0 + ldin; - const uint32_t* inptr2 = inptr1 + ldin; - const uint32_t* inptr3 = inptr2 + ldin; - - int x = x_len; - //! cope with row index exceed real size, set to zero buffer - if ((y + 3) >= mmax) { - switch ((y + 3) - mmax) { - case 2: - inptr1 = zerobuff; - case 1: - inptr2 = zerobuff; - case 0: - inptr3 = zerobuff; - default: - break; - } - } - - for (; x > 7; x -= 8) { - //! zip load 8 elements (2 neon Q registers) from each of 4 rows - asm volatile( - "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " - "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" - "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " - "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" - "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " - "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" - "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " - "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" - - "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " - "q2=r04,r14,r05,r15\n" - "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " - "q6=r24,r34,r25,r35\n" - - "vswp d1, d8 @ swap d1, d8, q0=r00,r10,r20,r30; " - "q4=r01,r11,r21,r31\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write q0:r00,r10,r20,r30\n" - "vst1.32 {d8-d9}, [%[outptr]]! @ write q4:r01,r11,r21,r31\n" - - "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " - "q3=r06,r16,r07,r17\n" - "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " - "q7=r26,r36,r27,r37\n" - - "vswp d3, d10 @ swap d3, d10, " - "q1=r02,r12,r22,r32; q5=r03,r13,r23,r33\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write q1:r02,r12,r22,r32\n" - "vst1.32 {d10-d11},[%[outptr]]! @ write q5:r03,r13,r23,r33\n" - - "vswp d5, d12 @ swap d5, d12,q2=r04,r14,r24,r34; " - "q6=r05,r15,r25,r35\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write q2:r04,r14,r24,r34\n" - "vst1.32 {d12-d13},[%[outptr]]! 
@ write q6:r05,r15,r25,r35\n" - - "vswp d7, d14 @ swap d7, d14, " - "q3=r06,r16,r26,r36; q7=r07,r17,r27,r37\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write q3:r06,r16,r26,r36\n" - "vst1.32 {d14-d15},[%[outptr]]! @ write q7:r07,r17,r27,r37\n" - : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), [outptr] "+r"(outptr) - : - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "cc", "memory"); - } - - for (; x > 0; x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - } - } -} - -void prepackA_trans_4x8(float* out, const float* in, const int ldin, - const int m0, const int mmax, const int k0, - const int kmax) { - uint32_t* outptr = reinterpret_cast(out); - const uint32_t* inptr = - reinterpret_cast(in) + k0 * ldin + m0; - - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = mmax - m0; - int y_len = kmax - k0; - int right_remain = x_len - 4 * (x_len / 4); - int right_pad = 4 - right_remain; - if (right_remain == 0) { - right_pad = 0; - } - - uint32_t* outptr_row = outptr; - int stride_out = 4 * y_len; - - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); -// uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask_buffer + 4), -// vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const uint32_t* ptr0 = inptr + y * ldin; - const uint32_t* ptr1 = ptr0 + ldin; - const uint32_t* ptr2 = ptr1 + ldin; - const uint32_t* ptr3 = ptr2 + ldin; - - uint32_t* outptr_row_col = outptr_row + y * 4; - int i = 0; - for (; i < x_len - 3; i += 4) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "vld1.32 {d2-d3}, [%[ptr1]]! @ load r1, 4 elements\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write to output ptr\n" - - "vld1.32 {d4-d5}, [%[ptr2]]! @ load r2, 4 elements\n" - "vld1.32 {d6-d7}, [%[ptr3]]! @ load r3, 4 elements\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d6-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : - : "q0", "q1", "q2", "q3", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "vld1.32 {d2-d3}, [%[ptr1]]! @ load r1, 4 elements\n" - "vld1.32 {d4-d5}, [%[ptr2]]! @ load r2, 4 elements\n" - "vld1.32 {d6-d7}, [%[ptr3]]! @ load r3, 4 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q3, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d2-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d5}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d6-d7}, [%[outptr]]! 
@ write to output ptr\n" - : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : [vmask1] "w"(vmask1), [vzero] "w"(vzero) - : "q0", "q1", "q2", "q3", "cc", "memory"); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const uint32_t* ptr0 = inptr + y * ldin; - uint32_t* outptr_row_col = outptr_row + y * 4; - int i = 0; - for (; i < x_len - 3; i += 4) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : - : "q0", "q1", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_pad > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d1}, [%[ptr0]]! @ load r0, 4 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vst1.32 {d0-d1}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [vmask1] "w"(vmask1), [vzero] "w"(vzero) - : "q0", "q1", "cc", "memory"); - } - } -} - -#endif // __aarch64__ - -/** -* \brief input data is transpose -* for arm-v7a, transform data to block x k x 8 layout -* for arm-v8a, transform data to block x k x 12 layout -*/ -#ifdef __aarch64__ -void loadb(float *out, const float *in, const int ldin, const int k0, - const int kmax, const int n0, const int nmax) { - uint32_t *outptr = reinterpret_cast(out); - const uint32_t *inptr = - reinterpret_cast(in) + k0 * ldin + n0; - uint32_t mask_buffer[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - int x_len = nmax - n0; - int y_len = kmax - k0; - int right_remain = x_len - 12 * (x_len / 12); - int right_pad = 12 - right_remain; - const size_t copy_len_remain = sizeof(float) * right_remain; - const size_t copy_len_pad = sizeof(float) * right_pad; - const size_t size_ldin = sizeof(float) * ldin; - - uint32_t *outptr_row = outptr; - int stride_out = 12 * y_len; - - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - uint32x4_t vmask3 = - vcltq_u32(vld1q_u32(mask_buffer + 8), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const uint32_t *ptr0 = inptr + y * ldin; - const uint32_t *ptr1 = ptr0 + ldin; - const uint32_t *ptr2 = ptr1 + ldin; - const uint32_t *ptr3 = ptr2 + ldin; - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - : - : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) - : "memory"); - - uint32_t *outptr_row_col = outptr_row + y * 12; - - int i = 0; - for (; i < x_len - 11; i += 12) { - uint32x4_t vr00 = vld1q_u32(ptr0); - uint32x4_t vr01 = vld1q_u32(ptr0 + 4); - uint32x4_t vr02 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr10 = vld1q_u32(ptr1); - uint32x4_t vr11 = vld1q_u32(ptr1 + 4); - uint32x4_t vr12 = vld1q_u32(ptr1 + 8); - - vst1q_u32(outptr_row_col, vr00); - vst1q_u32(outptr_row_col + 4, vr01); - vst1q_u32(outptr_row_col + 8, vr02); - - uint32x4_t vr20 = vld1q_u32(ptr2); - uint32x4_t vr21 = vld1q_u32(ptr2 + 4); - uint32x4_t vr22 = vld1q_u32(ptr2 + 8); - - vst1q_u32(outptr_row_col + 12, vr10); - 
vst1q_u32(outptr_row_col + 16, vr11); - vst1q_u32(outptr_row_col + 20, vr12); - - uint32x4_t vr30 = vld1q_u32(ptr3); - uint32x4_t vr31 = vld1q_u32(ptr3 + 4); - uint32x4_t vr32 = vld1q_u32(ptr3 + 8); - - vst1q_u32(outptr_row_col + 24, vr20); - vst1q_u32(outptr_row_col + 28, vr21); - vst1q_u32(outptr_row_col + 32, vr22); - - vst1q_u32(outptr_row_col + 36, vr30); - vst1q_u32(outptr_row_col + 40, vr31); - vst1q_u32(outptr_row_col + 44, vr32); - - ptr0 += 12; - ptr1 += 12; - ptr2 += 12; - ptr3 += 12; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32x4_t vr00 = vld1q_u32(ptr0); - uint32x4_t vr01 = vld1q_u32(ptr0 + 4); - uint32x4_t vr02 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr10 = vld1q_u32(ptr1); - uint32x4_t vr11 = vld1q_u32(ptr1 + 4); - uint32x4_t vr12 = vld1q_u32(ptr1 + 8); - - uint32x4_t vr00_1 = vbslq_u32(vmask1, vr00, vzero); - uint32x4_t vr01_1 = vbslq_u32(vmask2, vr01, vzero); - uint32x4_t vr02_1 = vbslq_u32(vmask3, vr02, vzero); - - uint32x4_t vr20 = vld1q_u32(ptr2); - uint32x4_t vr21 = vld1q_u32(ptr2 + 4); - uint32x4_t vr22 = vld1q_u32(ptr2 + 8); - - vst1q_u32(outptr_row_col, vr00_1); - vst1q_u32(outptr_row_col + 4, vr01_1); - vst1q_u32(outptr_row_col + 8, vr02_1); - - uint32x4_t vr10_1 = vbslq_u32(vmask1, vr10, vzero); - uint32x4_t vr11_1 = vbslq_u32(vmask2, vr11, vzero); - uint32x4_t vr12_1 = vbslq_u32(vmask3, vr12, vzero); - - uint32x4_t vr30 = vld1q_u32(ptr3); - uint32x4_t vr31 = vld1q_u32(ptr3 + 4); - uint32x4_t vr32 = vld1q_u32(ptr3 + 8); - - vst1q_u32(outptr_row_col + 12, vr10_1); - vst1q_u32(outptr_row_col + 16, vr11_1); - vst1q_u32(outptr_row_col + 20, vr12_1); - - uint32x4_t vr20_1 = vbslq_u32(vmask1, vr20, vzero); - uint32x4_t vr21_1 = vbslq_u32(vmask2, vr21, vzero); - uint32x4_t vr22_1 = vbslq_u32(vmask3, vr22, vzero); - - uint32x4_t vr30_1 = vbslq_u32(vmask1, vr30, vzero); - uint32x4_t vr31_1 = vbslq_u32(vmask2, vr31, vzero); - uint32x4_t vr32_1 = vbslq_u32(vmask3, vr32, vzero); - - vst1q_u32(outptr_row_col + 24, vr20_1); - vst1q_u32(outptr_row_col + 28, vr21_1); - vst1q_u32(outptr_row_col + 32, vr22_1); - - vst1q_u32(outptr_row_col + 36, vr30_1); - vst1q_u32(outptr_row_col + 40, vr31_1); - vst1q_u32(outptr_row_col + 44, vr32_1); - } - } - -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const uint32_t *ptr0 = inptr + y * ldin; - uint32_t *outptr_row_col = outptr_row + y * 12; - - int i = 0; - for (; i < x_len - 11; i += 12) { - uint32x4_t vr0 = vld1q_u32(ptr0); - uint32x4_t vr1 = vld1q_u32(ptr0 + 4); - uint32x4_t vr2 = vld1q_u32(ptr0 + 8); - vst1q_u32(outptr_row_col, vr0); - vst1q_u32(outptr_row_col + 4, vr1); - vst1q_u32(outptr_row_col + 8, vr2); - - ptr0 += 12; - - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32x4_t vr0 = vld1q_u32(ptr0); - uint32x4_t vr1 = vld1q_u32(ptr0 + 4); - uint32x4_t vr2 = vld1q_u32(ptr0 + 8); - - uint32x4_t vr0_1 = vbslq_u32(vmask1, vr0, vzero); - uint32x4_t vr1_1 = vbslq_u32(vmask2, vr1, vzero); - uint32x4_t vr2_1 = vbslq_u32(vmask3, vr2, vzero); - - vst1q_u32(outptr_row_col, vr0_1); - vst1q_u32(outptr_row_col + 4, vr1_1); - vst1q_u32(outptr_row_col + 8, vr2_1); - } - } -} - -void loadb_trans(float *out, const float *in, const int ldin, const int k0, - const int kmax, const int n0, const int nmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(uint32_t) * x_len); - uint32_t *outptr = reinterpret_cast(out); - const uint32_t *inptr = reinterpret_cast(in); - - //! 
data B is not transposed, transpose B to k * 12 - for (int y = n0; y < nmax; y += 12) { - const uint32_t *inptr0 = inptr + y * ldin + k0; - const uint32_t *inptr1 = inptr0 + ldin; - const uint32_t *inptr2 = inptr1 + ldin; - const uint32_t *inptr3 = inptr2 + ldin; - const uint32_t *inptr4 = inptr3 + ldin; - const uint32_t *inptr5 = inptr4 + ldin; - const uint32_t *inptr6 = inptr5 + ldin; - const uint32_t *inptr7 = inptr6 + ldin; - const uint32_t *inptr8 = inptr7 + ldin; - const uint32_t *inptr9 = inptr8 + ldin; - const uint32_t *inptr10 = inptr9 + ldin; - const uint32_t *inptr11 = inptr10 + ldin; - - asm volatile( - "prfm pldl1keep, [%[ptr0]] \n" - "prfm pldl1keep, [%[ptr0], #64] \n" - "prfm pldl1keep, [%[ptr1]] \n" - "prfm pldl1keep, [%[ptr1], #64] \n" - "prfm pldl1keep, [%[ptr2]] \n" - "prfm pldl1keep, [%[ptr2], #64] \n" - "prfm pldl1keep, [%[ptr3]] \n" - "prfm pldl1keep, [%[ptr3], #64] \n" - "prfm pldl1keep, [%[ptr4]] \n" - "prfm pldl1keep, [%[ptr4], #64] \n" - "prfm pldl1keep, [%[ptr5]] \n" - "prfm pldl1keep, [%[ptr5], #64] \n" - "prfm pldl1keep, [%[ptr6]] \n" - "prfm pldl1keep, [%[ptr6], #64] \n" - "prfm pldl1keep, [%[ptr7]] \n" - "prfm pldl1keep, [%[ptr7], #64] \n" - "prfm pldl1keep, [%[ptr8]] \n" - "prfm pldl1keep, [%[ptr8], #64] \n" - "prfm pldl1keep, [%[ptr9]] \n" - "prfm pldl1keep, [%[ptr9], #64] \n" - "prfm pldl1keep, [%[ptr10]] \n" - "prfm pldl1keep, [%[ptr10], #64] \n" - "prfm pldl1keep, [%[ptr11]] \n" - "prfm pldl1keep, [%[ptr11], #64] \n" - : - : [ptr0] "r"(inptr0), [ptr1] "r"(inptr1), [ptr2] "r"(inptr2), - [ptr3] "r"(inptr3), [ptr4] "r"(inptr4), [ptr5] "r"(inptr5), - [ptr6] "r"(inptr6), [ptr7] "r"(inptr7), [ptr8] "r"(inptr8), - [ptr9] "r"(inptr9), [ptr10] "r"(inptr10), [ptr11] "r"(inptr11) - : "memory"); - - int x = x_len; - - //! cope with row index exceed real size, set to zero buffer - if ((y + 11) >= nmax) { - switch ((y + 11) - nmax) { - case 10: - inptr1 = zerobuff; - case 9: - inptr2 = zerobuff; - case 8: - inptr3 = zerobuff; - case 7: - inptr4 = zerobuff; - case 6: - inptr5 = zerobuff; - case 5: - inptr6 = zerobuff; - case 4: - inptr7 = zerobuff; - case 3: - inptr8 = zerobuff; - case 2: - inptr9 = zerobuff; - case 1: - inptr10 = zerobuff; - case 0: - inptr11 = zerobuff; - default: - break; - } - } - for (; x > 7; x -= 8) { - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. 
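            // Note: per the LDP pairs below, each iteration actually pulls 8
            // elements (one LDP, two q registers) from each of the 12 source
            // rows and writes them out transposed. The ZIP1/ZIP2 pairs
            // implement a 4x4 transpose: for rows A,B,C,D, ZIP1(A,C) gives
            // A0,C0,A1,C1 and ZIP1(B,D) gives B0,D0,B1,D1, so ZIP1 of those
            // two results yields A0,B0,C0,D0, one fully transposed output
            // column; the ZIP2 variants produce the remaining columns.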
- "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3 - "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3 - "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3 - "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1 - "prfm pldl1keep, [%[inptr0], #128] \n" - "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3 - "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1 - "LDP q8, q9, [%[inptr4]], #32\n" - "LDP q10, q11, [%[inptr5]], #32\n" - "LDP q12, q13, [%[inptr6]], #32\n" - "ZIP1 v18.4s, v8.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr1], #128]\n" - "LDP q14, q15, [%[inptr7]], #32\n" - "ZIP1 v19.4s, v10.4s, v14.4s\n" - - "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - "prfm pldl1keep, [%[inptr2], #128]\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "LDP q24, q25, [%[inptr8]], #32\n" // q24=A0A1A2A3 - "LDP q26, q27, [%[inptr9]], #32\n" // q26=B0B1B2B3 - "LDP q28, q29, [%[inptr10]], #32\n" // q28=C0C1C2C3 - "LDP q30, q31, [%[inptr11]], #32\n" // q30=D0D1D2D3 - "prfm pldl1keep, [%[inptr3], #128]\n" - "prfm pldl1keep, [%[inptr4], #128]\n" - "ZIP1 v16.4s, v24.4s, v28.4s\n" // q16=A0C0A1C1 - "ZIP1 v17.4s, v26.4s, v30.4s\n" // q17=B0D0B1D1 - "STP q20, q21, [%[outptr]], #32\n" // Write back the first - // element of each source - "ZIP1 v18.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - "ZIP2 v19.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0 - - "ZIP2 v16.4s, v0.4s, v4.4s\n" - "prfm pldl1keep, [%[inptr5], #128]\n" - "ZIP2 v17.4s, v2.4s, v6.4s\n" - "STR q18, [%[outptr]], #16\n" // Write back the second element - // of each source - - "STP q22, q23, [%[outptr]], #32\n" // Write back the second - // element of each source - "ZIP2 v18.4s, v8.4s, v12.4s\n" - "prfm pldl1keep, [%[inptr6], #128]\n" - "STR q19, [%[outptr]], #16\n" // Write back the second element - // of each source - "ZIP2 v19.4s, v10.4s, v14.4s\n" - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr7], #128]\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v24.4s, v28.4s\n" // q16=A0C0A1C1 - "ZIP2 v17.4s, v26.4s, v30.4s\n" // q17=B0D0B1D1 - "prfm pldl1keep, [%[inptr8], #128]\n" - "STP q20, q21, [%[outptr]], #32\n" // Third element - "ZIP1 v18.4s, v16.4s, v17.4s\n" - "ZIP2 v19.4s, v16.4s, v17.4s\n" - - "ZIP1 v16.4s, v1.4s, v5.4s\n" - "prfm pldl1keep, [%[inptr9], #128]\n" - "ZIP1 v17.4s, v3.4s, v7.4s\n" - "STR q18, [%[outptr]], #16\n" // Write back the second element - // of each source - - "STP q22, q23, [%[outptr]], #32\n" // Fourth element - "ZIP1 v18.4s, v9.4s, v13.4s\n" - "prfm pldl1keep, [%[inptr10], #128]\n" - "STR q19, [%[outptr]], #16\n" // Write back the second element - // of each source - "ZIP1 v19.4s, v11.4s, v15.4s\n" - - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "prfm pldl1keep, [%[inptr11], #128]\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP1 v16.4s, v25.4s, v29.4s\n" - "ZIP1 v17.4s, v27.4s, v31.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Fifth element - "ZIP1 v18.4s, v16.4s, v17.4s\n" - "ZIP2 v19.4s, v16.4s, v17.4s\n" - - "ZIP2 v16.4s, v1.4s, v5.4s\n" - "ZIP2 v17.4s, v3.4s, v7.4s\n" - "STR q18, [%[outptr]], #16\n" - - "STP q22, q23, [%[outptr]], #32\n" // Sixth element - "ZIP2 v18.4s, v9.4s, v13.4s\n" - "STR q19, [%[outptr]], #16\n" // Sixth element - - "ZIP2 v19.4s, v11.4s, v15.4s\n" - "ZIP1 v20.4s, v16.4s, v17.4s\n" - "ZIP1 v21.4s, v18.4s, v19.4s\n" - - "ZIP2 v22.4s, v16.4s, v17.4s\n" - "ZIP2 v23.4s, v18.4s, v19.4s\n" - - "ZIP2 v16.4s, v25.4s, v29.4s\n" - "ZIP2 
v17.4s, v27.4s, v31.4s\n" - "STP q20, q21, [%[outptr]], #32\n" // Seventh element - - "ZIP1 v18.4s, v16.4s, v17.4s\n" - "ZIP2 v19.4s, v16.4s, v17.4s\n" - "STR q18, [%[outptr]], #16\n" - "STP q22, q23, [%[outptr]], #32\n" // Eighth element - "STR q19, [%[outptr]], #16\n" - : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2), - [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5), - [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [inptr8] "+r"(inptr8), - [inptr9] "+r"(inptr9), [inptr10] "+r"(inptr10), - [inptr11] "+r"(inptr11), [outptr] "+r"(outptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", - "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", - "v29", "v30", "v31", "cc", "memory"); - } - - for (; x > 0; x--) { - *outptr++ = *inptr0++; - *outptr++ = *inptr1++; - *outptr++ = *inptr2++; - *outptr++ = *inptr3++; - *outptr++ = *inptr4++; - *outptr++ = *inptr5++; - *outptr++ = *inptr6++; - *outptr++ = *inptr7++; - *outptr++ = *inptr8++; - *outptr++ = *inptr9++; - *outptr++ = *inptr10++; - *outptr++ = *inptr11++; - } - } -} - -#else // __aarch64__ -void loadb(float* out, const float* in, const int ldin, const int k0, - const int kmax, const int n0, const int nmax) { - uint32_t* outptr = reinterpret_cast(out); - const uint32_t* inptr = - reinterpret_cast(in) + k0 * ldin + n0; - uint32_t mask_buffer[8] = {0, 1, 2, 3, 4, 5, 6, 7}; - int x_len = nmax - n0; - int y_len = kmax - k0; - int right_remain = x_len - 8 * (x_len / 8); - int right_pad = 8 - right_remain; - const size_t copy_len_remain = sizeof(float) * right_remain; - const size_t copy_len_pad = sizeof(float) * right_pad; - const size_t size_ldin = sizeof(float) * ldin; - - uint32_t* outptr_row = outptr; - int stride_out = 8 * y_len; - - uint32x4_t vzero = vdupq_n_u32(0); - uint32x4_t vmask1 = - vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); - uint32x4_t vmask2 = - vcltq_u32(vld1q_u32(mask_buffer + 4), vdupq_n_u32(right_remain)); - -#pragma omp parallel for - for (int y = 0; y < y_len - 3; y += 4) { - const uint32_t* ptr0 = inptr + y * ldin; - const uint32_t* ptr1 = ptr0 + ldin; - const uint32_t* ptr2 = ptr1 + ldin; - const uint32_t* ptr3 = ptr2 + ldin; - uint32_t* outptr_row_col = outptr_row + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - - "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr3]]! @ load r3, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : - : "q0", "q1", "q2", "q3", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr1]]! @ load r1, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - //"vst1.32 {d0-d3}, [%[outptr]]! 
@ write to output ptr\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - - "vld1.32 {d0-d3}, [%[ptr2]]! @ load r2, 8 elements\n" - "vld1.32 {d4-d7}, [%[ptr3]]! @ load r3, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - //"vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vbif q2, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q3, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - "vst1.32 {d4-d7}, [%[outptr]]! @ write to output ptr\n" - : [outptr] "+r"(ptr_out), [ptr0] "+r"(ptr0), [ptr1] "+r"(ptr1), - [ptr2] "+r"(ptr2), [ptr3] "+r"(ptr3) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) - : "q0", "q1", "q2", "q3", "cc", "memory"); - } - } -#pragma omp parallel for - for (int y = 4 * (y_len / 4); y < y_len; ++y) { - const uint32_t* ptr0 = inptr + y * ldin; - uint32_t* outptr_row_col = outptr_row + y * 8; - int i = 0; - for (; i < x_len - 7; i += 8) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : - : "q0", "q1", "cc", "memory"); - outptr_row_col += stride_out; - } - if (right_remain > 0) { - uint32_t* ptr_out = outptr_row_col; - asm volatile( - "vld1.32 {d0-d3}, [%[ptr0]]! @ load r0, 8 elements\n" - "vbif q0, %q[vzero], %q[vmask1] @ bit select, pad zero\n" - "vbif q1, %q[vzero], %q[vmask2] @ bit select, pad zero\n" - "vst1.32 {d0-d3}, [%[outptr]]! @ write to output ptr\n" - : [ptr0] "+r"(ptr0), [outptr] "+r"(ptr_out) - : [vmask1] "w"(vmask1), [vmask2] "w"(vmask2), [vzero] "w"(vzero) - : "q0", "q1", "cc", "memory"); - } - } -} - -void loadb_trans(float* out, const float* in, const int ldin, const int k0, - const int kmax, const int n0, const int nmax) { - int x_len = kmax - k0; - uint32_t zerobuff[x_len]; // NOLINT - memset(zerobuff, 0, sizeof(uint32_t) * x_len); - - uint32_t* outptr = reinterpret_cast(out); - const uint32_t* inptr = reinterpret_cast(in); - //! data B is not transposed, transpose B to k * 8 - for (int y = n0; y < nmax; y += 8) { - const uint32_t* inptr0 = inptr + y * ldin + k0; - const uint32_t* inptr1 = inptr0 + ldin; - const uint32_t* inptr2 = inptr1 + ldin; - const uint32_t* inptr3 = inptr2 + ldin; - const uint32_t* inptr4 = inptr3 + ldin; - const uint32_t* inptr5 = inptr4 + ldin; - const uint32_t* inptr6 = inptr5 + ldin; - const uint32_t* inptr7 = inptr6 + ldin; - - int x = x_len; - - //! cope with row index exceed real size, set to zero buffer - if ((y + 7) >= nmax) { - switch ((y + 7) - nmax) { - case 6: - inptr1 = zerobuff; - case 5: - inptr2 = zerobuff; - case 4: - inptr3 = zerobuff; - case 3: - inptr4 = zerobuff; - case 2: - inptr5 = zerobuff; - case 1: - inptr6 = zerobuff; - case 0: - inptr7 = zerobuff; - default: - break; - } - } - - for (; x > 7; x -= 8) { - //! zip load 8 elements (2 neon Q registers) from each of 8 rows - asm volatile( - "vld4.32 {d0-d3}, [%[inptr0]]! @ zip load r0, " - "q0,q1=r00,r04,r01,r05,r02,r06,r03,r07\n" - "vld4.32 {d4-d7}, [%[inptr1]]! @ zip load r1, " - "q2,q3=r10,r14,r11,r15,r12,r16,r13,r17\n" - "vtrn.32 q0, q2 @ trans data: q0=r00,r10,r01,r11; " - "q2=r04,r14,r05,r15\n" - "vst1.32 {d0}, [%[outptr]]! 
@ write d0(q0,low),r00,r10\n" - - "vld4.32 {d8-d11}, [%[inptr2]]! @ zip load r2, " - "q4,q5=r20,r24,r21,r25,r22,r26,r23,r27\n" - "vld4.32 {d12-d15}, [%[inptr3]]! @ zip load r3, " - "q6,q7=r30,r34,r31,r35,r32,r36,r33,r37\n" - "vtrn.32 q4, q6 @ trans data: q4=r20,r30,r21,r31; " - "q6=r24,r34,r25,r35\n" - "vst1.32 {d8}, [%[outptr]]! @ write d8(q4,low),r20,r30\n" - - "vld4.32 {d16-d19}, [%[inptr4]]! @ zip load r4, " - "q8,q9=r40,r44,r41,r45,r42,r46,r43,r47\n" - "vld4.32 {d20-d23}, [%[inptr5]]! @ zip load r5, " - "q10,q11=r50,r54,r51,r55,r52,r56,r53,r57\n" - "vtrn.32 q8, q10 @ trans data: q8=r40,r50,r41,r51; " - "q10=r44,r54,r45,r55\n" - "vst1.32 {d16}, [%[outptr]]! @ write d16(q8,low),r40,r50\n" - - "vld4.32 {d24-d27}, [%[inptr6]]! @ zip load r6, " - "q12,q13=r60,r64,r61,r65,r62,r66,r63,r67\n" - "vld4.32 {d28-d31}, [%[inptr7]]! @ zip load r7, " - "q14,q15=r70,r74,r71,r75,r72,r76,r73,r77\n" - "vtrn.32 q12, q14 @ trans data:q12=r60,r70,r61,r71; " - "q14=r64,r74,r65,r75\n" - "vst1.32 {d24}, [%[outptr]]! @ write d24(q8,low),r60,r70\n" - - //"pld [%[inptr0], #128] @ preload r0 data to cache, fill - // pipeline\n" - "vst1.32 {d1}, [%[outptr]]! @ write d1(q0,high),r01,r11\n" - "vst1.32 {d9}, [%[outptr]]! @ write d9(q4,high),r21,r31\n" - "vst1.32 {d17}, [%[outptr]]! @ write d17(q8,high),r41,r51\n" - "vst1.32 {d25}, [%[outptr]]! @ write d25(q12,high),r61,r71\n" - - "vtrn.32 q1, q3 @ trans data: q1=r02,r12,r03,r13; " - "q3=r06,r16,r07,r17\n" - "vst1.32 {d2}, [%[outptr]]! @ write d2(q1,low),r02,r12\n" - "vtrn.32 q5, q7 @ trans data: q5=r22,r32,r23,r33; " - "q7=r26,r36,r27,r37\n" - "vst1.32 {d10}, [%[outptr]]! @ write d10(q5,low),r22,r32\n" - "vtrn.32 q9, q11 @ trans data: q9=r42,r52,r43,r53; " - "q11=r46,r56,r47,r57\n" - "vst1.32 {d18}, [%[outptr]]! @ write d18(q9,low),r42,r52\n" - "vtrn.32 q13, q15 @ trans data:q13=r62,r72,r63,r73; " - "q15=r66,r76,r67,r77\n" - "vst1.32 {d26}, [%[outptr]]! @ write d18(q9,low),r62,r72\n" - - //"pld [%[inptr1], #128] @ preload r1 data to cache, fill - // pipeline\n" - "vst1.32 {d3}, [%[outptr]]! @ write d3(q1,high),r03,r13\n" - "vst1.32 {d11}, [%[outptr]]! @ write d11(q5,high),r23,r33\n" - "vst1.32 {d19}, [%[outptr]]! @ write d19(q9,high),r43,r53\n" - "vst1.32 {d27}, [%[outptr]]! @ write d27(q13,high),r63,r73\n" - - //"pld [%[inptr2], #128] @ preload r2 data to cache, fill - // pipeline\n" - "vst1.32 {d4}, [%[outptr]]! @ write d4(q2,low),r04,r14\n" - "vst1.32 {d12}, [%[outptr]]! @ write d12(q6,low),r24,r34\n" - "vst1.32 {d20}, [%[outptr]]! @ write d20(q10,low),r44,r54\n" - "vst1.32 {d28}, [%[outptr]]! @ write d28(q14,low),r64,r74\n" - - //"pld [%[inptr3], #128] @ preload r3 data to cache, fill - // pipeline\n" - "vst1.32 {d5}, [%[outptr]]! @ write d5(q2,high),r05,r15\n" - "vst1.32 {d13}, [%[outptr]]! @ write d13(q6,high),r25,r35\n" - "vst1.32 {d21}, [%[outptr]]! @ write d21(q10,high),r45,r55\n" - "vst1.32 {d29}, [%[outptr]]! @ write d29(q14,high),r65,r75\n" - - //"pld [%[inptr4], #128] @ preload r4 data to cache, fill - // pipeline\n" - "vst1.32 {d6}, [%[outptr]]! @ write d6(q3,low),r06,r16\n" - "vst1.32 {d14}, [%[outptr]]! @ write d14(q7,low),r26,r36\n" - "vst1.32 {d22}, [%[outptr]]! @ write d22(q11,low),r46,r56\n" - "vst1.32 {d30}, [%[outptr]]! @ write d30(q15,low),r66,r76\n" - - //"pld [%[inptr5], #128] @ preload r5 data to cache, fill - // pipeline\n" - "vst1.32 {d7}, [%[outptr]]! @ write d7(q3,high),r07,r17\n" - "vst1.32 {d15}, [%[outptr]]! @ write d15(q7,high),r27,r37\n" - "vst1.32 {d23}, [%[outptr]]! @ write d23(q11,high),r47,r57\n" - "vst1.32 {d31}, [%[outptr]]! 
@ write d31(q15,high),r67,r77\n"
        : [inptr0] "+r"(inptr0), [inptr1] "+r"(inptr1), [inptr2] "+r"(inptr2),
          [inptr3] "+r"(inptr3), [inptr4] "+r"(inptr4), [inptr5] "+r"(inptr5),
          [inptr6] "+r"(inptr6), [inptr7] "+r"(inptr7), [outptr] "+r"(outptr)
        :
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
          "q11", "q12", "q13", "q14", "q15", "cc", "memory");
    }

    for (; x > 0; x--) {
      *outptr++ = *inptr0++;
      *outptr++ = *inptr1++;
      *outptr++ = *inptr2++;
      *outptr++ = *inptr3++;
      *outptr++ = *inptr4++;
      *outptr++ = *inptr5++;
      *outptr++ = *inptr6++;
      *outptr++ = *inptr7++;
    }
  }
}

#endif  // __aarch64__

#ifdef __aarch64__
void sgemm_conv_8x12(const float *A_packed, const float *B, const float *bias,
                     float *C, int M, int N, int K, bool is_bias, bool is_relu,
                     bool transB, ARMContext *ctx) {
  size_t l2_cache =
      ctx->l2_cache_size() > 0 ? ctx->l2_cache_size() : 512 * 1024;
  float *workspace = ctx->workspace_data<float>();
  int threads = ctx->threads();
  //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2
  int x_block = (l2_cache - (MBLOCK * K)) / (sizeof(float) * (K + MBLOCK));
  x_block /= NBLOCK;
  x_block *= NBLOCK;
  int x_num = (N + (x_block - 1)) / x_block;
  x_block = (N + x_num - 1) / x_num;
  x_block = (x_block + NBLOCK - 1) / NBLOCK;
  x_block *= NBLOCK;
  x_block = x_block < NBLOCK ? NBLOCK : x_block;

  //! split the K loop into k_pre unrolled KBLOCK-step passes plus
  //! tail_pre leftover steps
  int tail_pre = (K & (KBLOCK - 1));
  int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1;

  bool flag_p_remain = false;
  int remain = 0;

  //! the A panel is pre-computed outside this gemm by the prepackA routines
  for (unsigned int x0 = 0; x0 < N; x0 += x_block) {
    unsigned int xmax = x0 + x_block;
    if (xmax > N) {
      xmax = N;
    }
    int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK;
    remain = xmax - x0 - (bblocks - 1) * NBLOCK;
    if (remain > 0) {
      flag_p_remain = true;
    }
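    // A worked example for the blocking above, with hypothetical sizes
    // (assuming MBLOCK = 8 and NBLOCK = 12 for this 8x12 kernel): with
    // l2_cache = 512 KiB and K = 256,
    //   x_block = (524288 - 8 * 256) / (4 * (256 + 8)) = 494,
    // which the rounding steps bring to a multiple of NBLOCK (492), so the
    // packed A and B panels touched by one pass stay resident in L2.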
    //! load the B panel
    float *b_pannel = workspace;
    if (transB) {
      loadb_trans(b_pannel, B, K, 0, K, x0, xmax);
    } else {
      loadb(b_pannel, B, N, 0, K, x0, xmax);
    }
#pragma omp parallel for num_threads(threads)
    for (unsigned int y = 0; y < M; y += MBLOCK) {
      unsigned int ymax = y + MBLOCK;
      if (ymax > M) {
        ymax = M;
      }

      float bias_local[8] = {0};
      if (is_bias) {
        bias_local[0] = bias[y];
        bias_local[1] = bias[y + 1];
        bias_local[2] = bias[y + 2];
        bias_local[3] = bias[y + 3];
        bias_local[4] = bias[y + 4];
        bias_local[5] = bias[y + 5];
        bias_local[6] = bias[y + 6];
        bias_local[7] = bias[y + 7];
      }

      float cout0[NBLOCK];
      float cout1[NBLOCK];
      float cout2[NBLOCK];
      float cout3[NBLOCK];
      float cout4[NBLOCK];
      float cout5[NBLOCK];
      float cout6[NBLOCK];
      float cout7[NBLOCK];

      float *c_ptr0 = C + y * N + x0;
      float *c_ptr1 = c_ptr0 + N;
      float *c_ptr2 = c_ptr1 + N;
      float *c_ptr3 = c_ptr2 + N;
      float *c_ptr4 = c_ptr3 + N;
      float *c_ptr5 = c_ptr4 + N;
      float *c_ptr6 = c_ptr5 + N;
      float *c_ptr7 = c_ptr6 + N;

      float *pout0 = c_ptr0;
      float *pout1 = c_ptr1;
      float *pout2 = c_ptr2;
      float *pout3 = c_ptr3;
      float *pout4 = c_ptr4;
      float *pout5 = c_ptr5;
      float *pout6 = c_ptr6;
      float *pout7 = c_ptr7;

      const float *a_ptr_l = A_packed + y * K;
      const float *b_ptr = b_pannel;
      for (int xb = 0; xb < bblocks; xb++) {
        //! rows past ymax are redirected to scratch; the switch falls
        //! through intentionally
        if ((y + 7) >= ymax) {
          switch ((y + 7) - ymax) {
            case 6:
              c_ptr1 = cout1;
            case 5:
              c_ptr2 = cout2;
            case 4:
              c_ptr3 = cout3;
            case 3:
              c_ptr4 = cout4;
            case 2:
              c_ptr5 = cout5;
            case 1:
              c_ptr6 = cout6;
            case 0:
              c_ptr7 = cout7;
            default:
              break;
          }
        }
        if (flag_p_remain && (xb == bblocks - 1)) {
          pout0 = c_ptr0;
          pout1 = c_ptr1;
          pout2 = c_ptr2;
          pout3 = c_ptr3;
          pout4 = c_ptr4;
          pout5 = c_ptr5;
          pout6 = c_ptr6;
          pout7 = c_ptr7;

          c_ptr0 = cout0;
          c_ptr1 = cout1;
          c_ptr2 = cout2;
          c_ptr3 = cout3;
          c_ptr4 = cout4;
          c_ptr5 = cout5;
          c_ptr6 = cout6;
          c_ptr7 = cout7;
        }
        const float *a_ptr = a_ptr_l;
        int tail = tail_pre;
        int k = k_pre;

        asm volatile(
            // Initialize result registers, load initial operands, prime
            // prefetches.
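            // Register map for this kernel, per the operand comments below:
            // v8-v31 hold the 8x12 fp32 accumulators (three 4-lane q
            // registers per output row, seeded from the bias in q2/q3),
            // v0-v3 stage packed-A values and v4-v7 stage packed-B values;
            // the main loop covers four K steps per iteration (unroll 0-3).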
- "ldp q2, q3, [%[bias_ptr]]\n" /* load bias to q2, q3*/ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a01 to q0, q1*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ - "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ - "dup v9.4s, v2.s[0]\n" /* out1 = 0*/ - "dup v10.4s, v2.s[0]\n" /* out2 = 0*/ - "dup v11.4s, v2.s[1]\n" /* out3 = 0*/ - "dup v12.4s, v2.s[1]\n" /* out4 = 0*/ - "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ - "dup v13.4s, v2.s[1]\n" /* out5 = 0*/ - "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ - "dup v14.4s, v2.s[2]\n" /* out6 = 0*/ - "prfm pldl1keep, [%[b_ptr], #128]\n" /* preload b*/ - "dup v15.4s, v2.s[2]\n" /* out7 = 0*/ - "prfm pldl1keep, [%[a_ptr], #128]\n" /* preload a*/ - "dup v16.4s, v2.s[2]\n" /* out8 = 0*/ - "prfm pldl1keep, [%[b_ptr], #192]\n" /* preload b*/ - "dup v17.4s, v2.s[3]\n" /* out9 = 0*/ - "prfm pldl1keep, [%[b_ptr], #256]\n" /* preload b*/ - "dup v18.4s, v2.s[3]\n" /* out10 = 0*/ - "prfm pldl1keep, [%[a_ptr], #192]\n" /* preload a*/ - "dup v19.4s, v2.s[3]\n" /* out11 = 0*/ - "prfm pldl1keep, [%[b_ptr], #320]\n" /* preload b*/ - "dup v20.4s, v3.s[0]\n" /* out12 = 0*/ - "prfm pldl1keep, [%[a_ptr], #256]\n" /* preload a*/ - "dup v21.4s, v3.s[0]\n" /* out13 = 0*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" /* preload b*/ - "dup v22.4s, v3.s[0]\n" /* out14 = 0*/ - "dup v23.4s, v3.s[1]\n" /* out15 = 0*/ - "dup v24.4s, v3.s[1]\n" /* out16 = 0*/ - "dup v25.4s, v3.s[1]\n" /* out17 = 0*/ - "dup v26.4s, v3.s[2]\n" /* out18 = 0*/ - "dup v27.4s, v3.s[2]\n" /* out19 = 0*/ - "dup v28.4s, v3.s[2]\n" /* out20 = 0*/ - "dup v29.4s, v3.s[3]\n" /* out21 = 0*/ - "dup v30.4s, v3.s[3]\n" /* out22 = 0*/ - "dup v31.4s, v3.s[3]\n" /* out23 = 0*/ - "cbz %w[k], 2f\n" /* check loop count > 0 */ - /* main loop */ - /* unrool 0*/ - "1:\n" /* main loop */ - "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 = - q4 */ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 = - q4 */ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7 */ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 = - q4 */ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 = - q4 */ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4 */ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 = - q4 */ - "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 = - q4 */ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 = - q4 */ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 = - q4 */ - - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 = - q5 */ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 = - q5 */ - "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 = - q5*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 = - q5*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 = - q5*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 = - q5*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 = - q5*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 = - q5*/ - - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5 */ - - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 = - q6*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 = - q6*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 = - q6*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 = - q6*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 = - q6*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * 
a00[1], b2 = - q6*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 = - q6*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 = - q6*/ - - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1 */ - - /* unrool 1 */ - "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 = - q7 */ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 = - q7 */ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 = - q7 */ - "prfm pldl1keep, [%[a_ptr], #256]\n" - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 = - q7 */ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 = - q7 */ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q7 - */ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 = - q7 */ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 = - q7 */ - - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7 */ - - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 = - q4 */ - "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 = - q4 */ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 = - q4*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 = - q4*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 = - q4*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 = - q4*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 = - q4*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 = - q4*/ - - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 = - q5*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 = - q5*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 = - q5*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 = - q5*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 = - q5*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 = - q5*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 = - q5*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 = - q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5 */ - /* unrool 2*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 = - q6 */ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 = - q6 */ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 = - q6*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 = - q6*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 = - q6*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 = - q6*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 = - q6*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 = - q6*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 = - q7*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 = - q7*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 = - q7*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 = - q7*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 = - q7*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 = - q7*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 = - q7*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 = - q7*/ - - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ - - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 = - q4*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 = - q4*/ - "fmla v16.4s, v4.4s, 
v0.s[2]\n" /* out18 = b2 * a00[2], b2 = - q4*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 = - q4*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 = - q4*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 = - q4*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 = - q4*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 = - q4*/ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ - /* unrool 3*/ - "fmla v8.4s , v5.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 = - q5*/ - "fmla v11.4s , v5.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 = - q5*/ - "fmla v14.4s, v5.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 = - q5*/ - "fmla v17.4s, v5.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 = - q5*/ - "fmla v20.4s, v5.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 = - q5*/ - "fmla v23.4s, v5.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v5.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 = - q5*/ - "fmla v29.4s, v5.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 = - q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ - "fmla v9.4s, v6.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 = - q6*/ - "fmla v12.4s, v6.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 = - q6*/ - "prfm pldl1keep, [%[a_ptr], #256]\n" - "fmla v15.4s, v6.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 = - q6*/ - "fmla v18.4s, v6.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 = - q6*/ - "fmla v21.4s, v6.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 = - q6*/ - "fmla v24.4s, v6.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 = - q6*/ - "fmla v27.4s, v6.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 = - q6*/ - "prfm pldl1keep, [%[b_ptr], #384]\n" - "fmla v30.4s, v6.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 = - q6*/ - "fmla v10.4s, v7.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 = - q7*/ - "fmla v13.4s, v7.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 = - q7*/ - "fmla v16.4s, v7.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 = - q7*/ - "fmla v19.4s, v7.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 = - q7*/ - "fmla v22.4s, v7.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 = - q7*/ - "fmla v25.4s, v7.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 = - q7*/ - "subs %w[k], %w[k], #1\n" /* loop count - 1*/ - "fmla v28.4s, v7.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 = - q7*/ - "fmla v31.4s, v7.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 = - q7*/ - "bne 1b\n" - /* Target to use when K is 1 or 2 (i.e. 
zero iterations of main - loop)*/ - "2:\n" /* process tail*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "beq 3f\n" /*jump to tail = 1*/ - /* final unrool 0*/ - /* unrool 0, tail > 1*/ - "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 = - q4*/ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 = - q4*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b0 to q6, q7*/ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 = - q4*/ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 = - q4*/ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q2, q3*/ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 = - q4*/ - "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 = - q4*/ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 = - q4*/ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 = - q4*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 = - q5*/ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 = - q5*/ - "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 = - q5*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 = - q5*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 = - q5*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 = - q5*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 = - q5*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 = - q5*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b1, b2 to q4, q5*/ - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 = - q6*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 = - q6*/ - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 = - q6*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 = - q6*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 = - q6*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 = - q6*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 = - q6*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 = - q6*/ - "beq 4f\n" /*jump to tail = 2*/ - /* unrool 1, tail > 2*/ - "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a01 to q0, q1*/ - "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 = - q7*/ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 = - q7*/ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 = - q7*/ - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 = - q7*/ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 = - q7*/ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q7*/ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 = - q7*/ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 = - q7*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b0, b1 to q6, q7*/ - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 = - q4*/ - "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b0 * a10[1], b1 = - q4*/ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 = - q4*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 = - q4*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 = - q4*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 = - q4*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 = - q4*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 = - q4*/ - "subs %w[tail], %w[tail], #1\n" /* tail--*/ - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 = - q5*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* out17 = b2 * 
a10[0], b2 = - q5*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 = - q5*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 = - q5*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 = - q5*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 = - q5*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 = - q5*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 = - q5*/ - "beq 5f\n" /*jump to tail = 3*/ - /* unrool 2, tail = 4*/ - "ldp q4, q5, [%[b_ptr]], #32\n" /* load b2, b0 to q4, q5*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 = - q6*/ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 = - q6*/ - "ldp q2, q3, [%[a_ptr]], #32\n" /* load a10, a11 to q3, q4*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 = - q6*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 = - q6*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a01[0], b0 = - q6*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a01[1], b0 = - q6*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a01[2], b0 = - q6*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a01[3], b0 = - q6*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b1 * a00[0], b1 = - q7*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a00[1], b1 = - q7*/ - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a00[2], b1 = - q7*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a00[3], b1 = - q7*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a01[0], b1 = - q7*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a01[1], b1 = - q7*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a01[2], b1 = - q7*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a01[3], b1 = - q7*/ - "ldp q6, q7, [%[b_ptr]], #32\n" /* load b1, b2 to q6, q7*/ - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a00[0], b2 = - q4*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a00[1], b2 = - q4*/ - "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a00[2], b2 = - q4*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a00[3], b2 = - q4*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a00[0], b2 = - q4*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a00[1], b2 = - q4*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a00[2], b2 = - q4*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a00[3], b2 = - q4*/ - /* unrool 3, tail = 4*/ - "fmla v8.4s , v5.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 = - q5*/ - "fmla v11.4s , v5.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 = - q5*/ - "fmla v14.4s, v5.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 = - q5*/ - "fmla v17.4s, v5.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 = - q5*/ - "fmla v20.4s, v5.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 = - q5*/ - "fmla v23.4s, v5.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v5.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 = - q5*/ - "fmla v29.4s, v5.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 = - q5*/ - "fmla v9.4s, v6.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 = - q6*/ - "fmla v12.4s, v6.4s, v2.s[1]\n" /* out9 = b1 * a10[1], b1 = - q6*/ - "fmla v15.4s, v6.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 = - q6*/ - "fmla v18.4s, v6.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 = - q6*/ - "fmla v21.4s, v6.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 = - q6*/ - "fmla v24.4s, v6.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 = - q6*/ - "fmla v27.4s, v6.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 = - q6*/ - "fmla v30.4s, v6.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 = - q6*/ - "fmla v10.4s, v7.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 = - 
q7*/ - "fmla v13.4s, v7.4s, v2.s[1]\n" /* out17 = b2 * a10[0], b2 = - q7*/ - "fmla v16.4s, v7.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 = - q7*/ - "fmla v19.4s, v7.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 = - q7*/ - "fmla v22.4s, v7.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 = - q7*/ - "fmla v25.4s, v7.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 = - q7*/ - "fmla v28.4s, v7.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 = - q7*/ - "fmla v31.4s, v7.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 = - q7*/ - "b 11f\n" - /* tails==1 final tail*/ - "3: \n" /* tail=1*/ - "ldr q6, [%[b_ptr]], #16\n" /* load b2 to q6*/ - "fmla v8.4s , v4.4s, v0.s[0]\n" /* out0 = b0 * a10[0], b0 = - q5*/ - "fmla v11.4s , v4.4s, v0.s[1]\n" /* out1 = b0 * a10[1], b0 = - q5*/ - "fmla v14.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a10[2], b0 = - q5*/ - "fmla v17.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a10[3], b0 = - q5*/ - "fmla v20.4s, v4.4s, v1.s[0]\n" /* out4 = b0 * a11[0], b0 = - q5*/ - "fmla v23.4s, v4.4s, v1.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v4.4s, v1.s[2]\n" /* out6 = b0 * a11[2], b0 = - q5*/ - "fmla v29.4s, v4.4s, v1.s[3]\n" /* out7 = b0 * a11[3], b0 = - q5*/ - "fmla v9.4s, v5.4s, v0.s[0]\n" /* out8 = b0 * a10[0], b1 = - q6*/ - "fmla v12.4s, v5.4s, v0.s[1]\n" /* out9 = b1 * a10[1], b1 = - q6*/ - "fmla v15.4s, v5.4s, v0.s[2]\n" /* out10 = b1 * a10[2], b1 = - q6*/ - "fmla v18.4s, v5.4s, v0.s[3]\n" /* out11 = b1 * a10[3], b1 = - q6*/ - "fmla v21.4s, v5.4s, v1.s[0]\n" /* out12 = b1 * a10[0], b1 = - q6*/ - "fmla v24.4s, v5.4s, v1.s[1]\n" /* out13 = b1 * a10[1], b1 = - q6*/ - "fmla v27.4s, v5.4s, v1.s[2]\n" /* out14 = b1 * a10[2], b1 = - q6*/ - "fmla v30.4s, v5.4s, v1.s[3]\n" /* out15 = b1 * a10[3], b1 = - q6*/ - "fmla v10.4s, v6.4s, v0.s[0]\n" /* out16 = b2 * a10[0], b2 = - q7*/ - "fmla v13.4s, v6.4s, v0.s[1]\n" /* out17 = b2 * a10[0], b2 = - q7*/ - "fmla v16.4s, v6.4s, v0.s[2]\n" /* out18 = b2 * a10[0], b2 = - q7*/ - "fmla v19.4s, v6.4s, v0.s[3]\n" /* out19 = b2 * a10[0], b2 = - q7*/ - "fmla v22.4s, v6.4s, v1.s[0]\n" /* out20 = b2 * a10[0], b2 = - q7*/ - "fmla v25.4s, v6.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 = - q7*/ - "fmla v28.4s, v6.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 = - q7*/ - "fmla v31.4s, v6.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 = - q7*/ - "b 11f\n" - /* tails==2 final tail*/ - "4:\n" /* tail = 2*/ - "fmla v8.4s , v7.4s, v2.s[0]\n" /* out0 = b0 * a10[0], b0 = - q5*/ - "fmla v11.4s , v7.4s, v2.s[1]\n" /* out1 = b0 * a10[1], b0 = - q5*/ - "fmla v14.4s, v7.4s, v2.s[2]\n" /* out2 = b0 * a10[2], b0 = - q5*/ - "fmla v17.4s, v7.4s, v2.s[3]\n" /* out3 = b0 * a10[3], b0 = - q5*/ - "fmla v20.4s, v7.4s, v3.s[0]\n" /* out4 = b0 * a11[0], b0 = - q5*/ - "fmla v23.4s, v7.4s, v3.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v7.4s, v3.s[2]\n" /* out6 = b0 * a11[2], b0 = - q5*/ - "fmla v29.4s, v7.4s, v3.s[3]\n" /* out7 = b0 * a11[3], b0 = - q5*/ - "fmla v9.4s, v4.4s, v2.s[0]\n" /* out8 = b0 * a10[0], b1 = - q6*/ - "fmla v12.4s, v4.4s, v2.s[1]\n" /* out9 = b1 * a10[1], b1 = - q6*/ - "fmla v15.4s, v4.4s, v2.s[2]\n" /* out10 = b1 * a10[2], b1 = - q6*/ - "fmla v18.4s, v4.4s, v2.s[3]\n" /* out11 = b1 * a10[3], b1 = - q6*/ - "fmla v21.4s, v4.4s, v3.s[0]\n" /* out12 = b1 * a10[0], b1 = - q6*/ - "fmla v24.4s, v4.4s, v3.s[1]\n" /* out13 = b1 * a10[1], b1 = - q6*/ - "fmla v27.4s, v4.4s, v3.s[2]\n" /* out14 = b1 * a10[2], b1 = - q6*/ - "fmla v30.4s, v4.4s, v3.s[3]\n" /* out15 = b1 * a10[3], b1 = - q6*/ - "fmla v10.4s, v5.4s, v2.s[0]\n" /* out16 = b2 * a10[0], b2 = - q7*/ - "fmla v13.4s, v5.4s, v2.s[1]\n" /* 
out17 = b2 * a10[0], b2 = - q7*/ - "fmla v16.4s, v5.4s, v2.s[2]\n" /* out18 = b2 * a10[0], b2 = - q7*/ - "fmla v19.4s, v5.4s, v2.s[3]\n" /* out19 = b2 * a10[0], b2 = - q7*/ - "fmla v22.4s, v5.4s, v3.s[0]\n" /* out20 = b2 * a10[0], b2 = - q7*/ - "fmla v25.4s, v5.4s, v3.s[1]\n" /* out21 = b2 * a10[0], b2 = - q7*/ - "fmla v28.4s, v5.4s, v3.s[2]\n" /* out22 = b2 * a10[0], b2 = - q7*/ - "fmla v31.4s, v5.4s, v3.s[3]\n" /* out23 = b2 * a10[0], b2 = - q7*/ - "b 11f\n" - /* tails==3 final tail*/ - "5:\n" /* tail = 3*/ - "ldr q4, [%[b_ptr]], #16\n" /* load b2, b0 to q4*/ - "fmla v8.4s , v6.4s, v0.s[0]\n" /* out0 = b0 * a10[0], b0 = - q5*/ - "fmla v11.4s , v6.4s, v0.s[1]\n" /* out1 = b0 * a10[1], b0 = - q5*/ - "fmla v14.4s, v6.4s, v0.s[2]\n" /* out2 = b0 * a10[2], b0 = - q5*/ - "fmla v17.4s, v6.4s, v0.s[3]\n" /* out3 = b0 * a10[3], b0 = - q5*/ - "fmla v20.4s, v6.4s, v1.s[0]\n" /* out4 = b0 * a11[0], b0 = - q5*/ - "fmla v23.4s, v6.4s, v1.s[1]\n" /* out5 = b0 * a11[1], b0 = q5*/ - "fmla v26.4s, v6.4s, v1.s[2]\n" /* out6 = b0 * a11[2], b0 = - q5*/ - "fmla v29.4s, v6.4s, v1.s[3]\n" /* out7 = b0 * a11[3], b0 = - q5*/ - "fmla v9.4s, v7.4s, v0.s[0]\n" /* out8 = b0 * a10[0], b1 = - q6*/ - "fmla v12.4s, v7.4s, v0.s[1]\n" /* out9 = b1 * a10[1], b1 = - q6*/ - "fmla v15.4s, v7.4s, v0.s[2]\n" /* out10 = b1 * a10[2], b1 = - q6*/ - "fmla v18.4s, v7.4s, v0.s[3]\n" /* out11 = b1 * a10[3], b1 = - q6*/ - "fmla v21.4s, v7.4s, v1.s[0]\n" /* out12 = b1 * a10[0], b1 = - q6*/ - "fmla v24.4s, v7.4s, v1.s[1]\n" /* out13 = b1 * a10[1], b1 = - q6*/ - "fmla v27.4s, v7.4s, v1.s[2]\n" /* out14 = b1 * a10[2], b1 = - q6*/ - "fmla v30.4s, v7.4s, v1.s[3]\n" /* out15 = b1 * a10[3], b1 = - q6*/ - "fmla v10.4s, v4.4s, v0.s[0]\n" /* out16 = b2 * a10[0], b2 = - q7*/ - "fmla v13.4s, v4.4s, v0.s[1]\n" /* out17 = b2 * a10[0], b2 = - q7*/ - "fmla v16.4s, v4.4s, v0.s[2]\n" /* out18 = b2 * a10[0], b2 = - q7*/ - "fmla v19.4s, v4.4s, v0.s[3]\n" /* out19 = b2 * a10[0], b2 = - q7*/ - "fmla v22.4s, v4.4s, v1.s[0]\n" /* out20 = b2 * a10[0], b2 = - q7*/ - "fmla v25.4s, v4.4s, v1.s[1]\n" /* out21 = b2 * a10[0], b2 = - q7*/ - "fmla v28.4s, v4.4s, v1.s[2]\n" /* out22 = b2 * a10[0], b2 = - q7*/ - "fmla v31.4s, v4.4s, v1.s[3]\n" /* out23 = b2 * a10[0], b2 = - q7*/ - "11: \n" /* check if relu */ - "cbz %w[relu], 12f\n" /* skip relu */ - "movi v2.4s, #0\n" /* for relu*/ - "fmax v8.4s, v8.4s, v2.4s\n" /* relu*/ - "fmax v9.4s, v9.4s, v2.4s\n" /* relu*/ - "fmax v10.4s, v10.4s, v2.4s\n" /* relu*/ - "fmax v11.4s, v11.4s, v2.4s\n" /* relu*/ - "fmax v12.4s, v12.4s, v2.4s\n" /* relu*/ - "fmax v13.4s, v13.4s, v2.4s\n" /* relu*/ - "fmax v14.4s, v14.4s, v2.4s\n" /* relu*/ - "fmax v15.4s, v15.4s, v2.4s\n" /* relu*/ - "fmax v16.4s,v16.4s,v2.4s\n" /* relu*/ - "fmax v17.4s,v17.4s,v2.4s\n" /* relu*/ - "fmax v18.4s, v18.4s, v2.4s\n" /* relu*/ - "fmax v19.4s, v19.4s, v2.4s\n" /* relu*/ - "fmax v20.4s, v20.4s, v2.4s\n" /* relu*/ - "fmax v21.4s, v21.4s, v2.4s\n" /* relu*/ - "fmax v22.4s, v22.4s, v2.4s\n" /* relu*/ - "fmax v23.4s, v23.4s, v2.4s\n" /* relu*/ - "fmax v24.4s,v24.4s,v2.4s\n" /* relu*/ - "fmax v25.4s,v25.4s,v2.4s\n" /* relu*/ - "fmax v26.4s, v26.4s, v2.4s\n" /* relu*/ - "fmax v27.4s, v27.4s, v2.4s\n" /* relu*/ - "fmax v28.4s, v28.4s, v2.4s\n" /* relu*/ - "fmax v29.4s, v29.4s, v2.4s\n" /* relu*/ - "fmax v30.4s, v30.4s, v2.4s\n" /* relu*/ - "fmax v31.4s, v31.4s, v2.4s\n" /* relu*/ - "12: \n" - "st1 {v8.4s, v9.4s, v10.4s},[%[c_ptr0]], #48\n" /* store r0 */ - "st1 {v11.4s, v12.4s, v13.4s},[%[c_ptr1]], #48\n" /* store r1 */ - "st1 {v14.4s, v15.4s, v16.4s},[%[c_ptr2]], 
#48\n" /* store r2 */ - "st1 {v17.4s, v18.4s, v19.4s},[%[c_ptr3]], #48\n" /* store r3 */ - "st1 {v20.4s, v21.4s, v22.4s},[%[c_ptr4]], #48\n" /* store r4 */ - "st1 {v23.4s, v24.4s, v25.4s},[%[c_ptr5]], #48\n" /* store r5 */ - "st1 {v26.4s, v27.4s, v28.4s},[%[c_ptr6]], #48\n" /* store r6 */ - "st1 {v29.4s, v30.4s, v31.4s},[%[c_ptr7]], #48\n" /* store r7 */ - - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [k] "+r"(k), - [tail] "+r"(tail), [c_ptr0] "+r"(c_ptr0), [c_ptr1] "+r"(c_ptr1), - [c_ptr2] "+r"(c_ptr2), [c_ptr3] "+r"(c_ptr3), - [c_ptr4] "+r"(c_ptr4), [c_ptr5] "+r"(c_ptr5), - [c_ptr6] "+r"(c_ptr6), [c_ptr7] "+r"(c_ptr7) - : [bias_ptr] "r"(bias_local), [relu] "r"(is_relu) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", - "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", - "v29", "v30", "v31"); - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - *pout4++ = cout4[i]; - *pout5++ = cout5[i]; - *pout6++ = cout6[i]; - *pout7++ = cout7[i]; - } - } - } - } - } -} -#else // __aarch64__ -/** - * \brief gemm with ablock = 6, bblock = 8, output 6x8 - * @param A - * @param B - * @param C - * @param M - * @param N - * @param K - * @param threads - * @param workspace - */ -void sgemm_conv_6x8(const float* A_packed, const float* B, const float* bias, - float* C, int M, int N, int K, bool is_bias, bool is_relu, - bool transB, ARMContext* ctx) { - size_t l2_cache = - ctx->l2_cache_size() > 0 ? ctx->l2_cache_size() : 512 * 1024; - auto* workspace = ctx->workspace_data(); - int threads = ctx->threads(); - //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 - int x_block = - (l2_cache - (MBLOCK_OTH * K)) / (sizeof(float) * (K + MBLOCK_OTH)); - x_block /= NBLOCK; - x_block *= NBLOCK; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK - 1) / NBLOCK; - x_block *= NBLOCK; - x_block = x_block < NBLOCK ? NBLOCK : x_block; - - int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; - int tail_pre = (K & (KBLOCK - 1)); - if (tail_pre == 0) { - tail_pre = KBLOCK; - } - - bool flag_p_remain = false; - int remain = 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; - remain = xmax - x0 - (bblocks - 1) * NBLOCK; - if (remain > 0) { - flag_p_remain = true; - } - //! 
load bpanel - float* b_pannel = workspace; - if (transB) { - loadb_trans(b_pannel, B, K, 0, K, x0, xmax); - } else { - loadb(b_pannel, B, N, 0, K, x0, xmax); - } -#pragma omp parallel for num_threads(threads) - for (unsigned int y = 0; y < M; y += MBLOCK_OTH) { - unsigned int ymax = y + MBLOCK_OTH; - if (ymax > M) { - ymax = M; - } - float* c_ptr0 = C + y * N + x0; - float* c_ptr1 = c_ptr0 + N; - float* c_ptr2 = c_ptr1 + N; - float* c_ptr3 = c_ptr2 + N; - float* c_ptr4 = c_ptr3 + N; - float* c_ptr5 = c_ptr4 + N; - - float* pout0 = c_ptr0; - float* pout1 = c_ptr1; - float* pout2 = c_ptr2; - float* pout3 = c_ptr3; - float* pout4 = c_ptr4; - float* pout5 = c_ptr5; - - float bias_local[6] = {0}; - if (is_bias) { - bias_local[0] = bias[y]; - bias_local[1] = bias[y + 1]; - bias_local[2] = bias[y + 2]; - bias_local[3] = bias[y + 3]; - bias_local[4] = bias[y + 4]; - bias_local[5] = bias[y + 5]; - } - - float cout0[NBLOCK]; - float cout1[NBLOCK]; - float cout2[NBLOCK]; - float cout3[NBLOCK]; - float cout4[NBLOCK]; - float cout5[NBLOCK]; - - const float* a_ptr_l = A_packed + y * K; - const float* b_ptr = b_pannel; - for (int xb = 0; xb < bblocks; xb++) { - if ((y + 5) >= ymax) { - switch ((y + 5) - ymax) { - case 4: - c_ptr1 = cout1; - case 3: - c_ptr2 = cout2; - case 2: - c_ptr3 = cout3; - case 1: - c_ptr4 = cout4; - case 0: - c_ptr5 = cout5; - default: - break; - } - } - if (flag_p_remain && (xb == bblocks - 1)) { - pout0 = c_ptr0; - pout1 = c_ptr1; - pout2 = c_ptr2; - pout3 = c_ptr3; - pout4 = c_ptr4; - pout5 = c_ptr5; - - c_ptr0 = cout0; - c_ptr1 = cout1; - c_ptr2 = cout2; - c_ptr3 = cout3; - c_ptr4 = cout4; - c_ptr5 = cout5; - } - const float* a_ptr = a_ptr_l; - int tails = tail_pre; - int k = k_pre; - asm volatile( - // sgemm 6x8 - "vld1.32 {d2-d4}, [%[bias_ptr]] @ load bias 6 elements\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" - "pld [%[a_ptr]] @ preload a\n" - "vdup.i32 q12,d4[0] @ out40=0\n" - "pld [%[b_ptr]] @ preload b\n" - "vdup.i32 q13,d4[0] @ out41=0\n" - "pld [%[a_ptr], #64] @ preload a\n" - "vdup.i32 q14,d4[1] @ out50=0\n" - "pld [%[b_ptr], #64] @ preload b\n" - "vdup.i32 q15,d4[1] @ out51=0\n" - "pld [%[a_ptr], #128] @ preload a\n" - "vdup.i32 q4, d2[0] @ out00=0\n" - "pld [%[b_ptr], #128] @ preload b\n" - "vdup.i32 q5, d2[0] @ out01=0\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vdup.i32 q6, d2[1] @ out10=0\n" - "pld [%[a_ptr], #192] @ preload a\n" - "vdup.i32 q7, d2[1] @ out11=0\n" - "pld [%[b_ptr], #192] @ preload a\n" - "vdup.i32 q8, d3[0] @ out20=0\n" - "pld [%[a_ptr], #256] @ preload a\n" - "vdup.i32 q9, d3[0] @ out21=0\n" - "pld [%[b_ptr], #256] @ preload a\n" - "vdup.i32 q10,d3[1] @ out30=0\n" - "pld [%[b_ptr], #320] @ preload b\n" - "vdup.i32 q11,d3[1] @ out31=0\n" - "pld [%[b_ptr], #384] @ preload b\n" - "cmp %[k], #0 @ check weather k is " - "bigger than 0\n" - "beq 0f @ jump to tail\n" - "1: @ main loop for k\n" - /* Unroll 0*/ - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a4, a5, and next " - "a0, " - "a1\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! 
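Worth flagging in the prologue above: the `@ out00=0`-style comments are stale. `vld1.32 {d2-d4}, [%[bias_ptr]]` loads the six per-row bias values and the `vdup` instructions broadcast them, so each accumulator pair (q4/q5 for row 0 through q14/q15 for row 5) starts at its row bias, not at zero. In scalar terms (a sketch mirroring the surrounding names):

```cpp
// What the vdup prologue actually computes for the 6x8 tile.
void init_acc(float out[6][8], const float bias_local[6]) {
  for (int r = 0; r < 6; ++r) {
    for (int c = 0; c < 8; ++c) {
      out[r][c] = bias_local[r];  // bias broadcast, not zero-initialization
    }
  }
}
```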
@ load b1\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 1 */ - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - /*"pld [%[a_ptr], #64] @ preload a\n"*/ - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - /*"pld [%[b_ptr], #192]\n"*/ - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a4, a5, a0, a1\n" - /* Unroll 2 */ - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - /*"pld [%[a_ptr], #240] @ preload\n"*/ - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - /*"pld [%[b_ptr], #208]\n"*/ - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 3 */ - "vmla.f32 q4, q2, d1[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d1[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d2[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d2[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d3[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d3[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d1[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d1[1] @ out7 += b2 * a1\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" - "subs %[k], %[k], #1 @ k--\n" - "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" - "bne 1b @ jump to main " - "loop\n" - "0: @ process tail\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "beq 3f @ jump to tail = " - "1\n" - /* Unroll 0*/ - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a4,5, a0, a1\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! 
@ load b1\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 4f @ jump to tail==2\n" - /* Unroll 1*/ - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a0~a3\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - "beq 5f @ jump to tail==3\n" - /* Unroll 2 */ - "vld1.32 {d0-d1}, [%[a_ptr] :64]! @ load a4,a5, a0,a1\n" - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vld1.32 {d4-d5}, [%[b_ptr] :128]! @ load b1\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d2-d3}, [%[a_ptr] :64]! @ load a2~a5\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! @ load b2\n" - /* Unroll 3*/ - "vmla.f32 q4, q2, d1[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d1[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d2[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d2[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d3[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d3[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d1[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d1[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d2[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d2[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d3[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d3[1] @ out11 += b2 * a5\n" - "b 2f\n" - /* tails==1 final tail*/ - "3: @ tail=1\n" - "vmla.f32 q4, q2, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d2}, [%[a_ptr] :64]! @ load a4,a5\n" - "vmla.f32 q6, q2, d0[1] @ out1 += b1 * a1\n" - "vld1.32 {d6-d7}, [%[b_ptr] :128]! 
@ load b2\n" - "vmla.f32 q8, q2, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d2[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d2[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d0[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d0[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d1[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d1[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d2[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d2[1] @ out11 += b2 * a5\n" - "b 2f @ jump to end\n" - /* tails==2 final tail*/ - "4: @ tail == 2\n" - "vmla.f32 q4, q2, d3[0] @ out0 += b1 * a0\n" - "vmla.f32 q6, q2, d3[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d0[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d0[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d1[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d1[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d3[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d3[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d0[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d0[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d1[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d1[1] @ out11 += b2 * a5\n" - "b 2f @ jump to end\n" - /* tails==3 final tail*/ - "5: @ tail=3\n" - "vmla.f32 q4, q2, d2[0] @ out0 += b1 * a0\n" - "vld1.32 {d0}, [%[a_ptr] :64]! @ load a4,a5\n" - "vmla.f32 q6, q2, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q8, q2, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q10, q2, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q12, q2, d0[0] @ out4 += b1 * a4\n" - "vmla.f32 q14, q2, d0[1] @ out5 += b1 * a5\n" - "vmla.f32 q5, q3, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q7, q3, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q9, q3, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q11, q3, d3[1] @ out9 += b2 * a3\n" - "vmla.f32 q13, q3, d0[0] @ out10 += b2 * a4\n" - "vmla.f32 q15, q3, d0[1] @ out11 += b2 * a5\n" - "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q4, q4, q0 @ for relu\n" - "vmax.f32 q5, q5, q0 @ for relu\n" - "vmax.f32 q6, q6, q0 @ for relu\n" - "vmax.f32 q7, q7, q0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" - "vst1.32 {d8-d11}, [%[c_ptr0]]! @ store r0\n" - "vst1.32 {d12-d15}, [%[c_ptr1]]! @ store r1\n" - "vst1.32 {d16-d19}, [%[c_ptr2]]! @ store r2\n" - "vst1.32 {d20-d23}, [%[c_ptr3]]! @ store r3\n" - "vst1.32 {d24-d27}, [%[c_ptr4]]! @ store r4\n" - "vst1.32 {d28-d31}, [%[c_ptr5]]! 
@ store r5\n" - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), [c_ptr4] "+r"(c_ptr4), - [c_ptr5] "+r"(c_ptr5), [k] "+r"(k), [tails] "+r"(tails) - : [bias_ptr] "r"(bias_local), [relu] "r"(is_relu) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15", "cc", "memory"); - - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - *pout4++ = cout4[i]; - *pout5++ = cout5[i]; - } - } - } - } - } -} - -void sgemm_conv_4x8(const float* A_packed, const float* B, const float* bias, - float* C, int M, int N, int K, bool is_bias, bool is_relu, - bool transB, ARMContext* ctx) { - size_t l2_cache = - ctx->l2_cache_size() > 0 ? ctx->l2_cache_size() : 512 * 1024; - auto* workspace = ctx->workspace_data(); - int threads = ctx->threads(); - //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 - int x_block = - (l2_cache - (MBLOCK_A73 * K)) / (sizeof(float) * (K + MBLOCK_A73)); - x_block /= NBLOCK; - x_block *= NBLOCK; - int x_num = (N + (x_block - 1)) / x_block; - x_block = (N + x_num - 1) / x_num; - x_block = (x_block + NBLOCK - 1) / NBLOCK; - x_block *= NBLOCK; - x_block = x_block < NBLOCK ? NBLOCK : x_block; - - int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; - int tail_pre = (K & (KBLOCK - 1)); - if (tail_pre == 0) { - tail_pre = KBLOCK; - } - - bool flag_p_remain = false; - int remain = 0; - - //! apanel is pre_compute outside gemm - for (unsigned int x0 = 0; x0 < N; x0 += x_block) { - unsigned int xmax = x0 + x_block; - if (xmax > N) { - xmax = N; - } - int bblocks = (xmax - x0 + NBLOCK - 1) / NBLOCK; - remain = xmax - x0 - (bblocks - 1) * NBLOCK; - if (remain > 0) { - flag_p_remain = true; - } - //! load bpanel - float* b_pannel = workspace; - if (transB) { - loadb_trans(b_pannel, B, K, 0, K, x0, xmax); - } else { - loadb(b_pannel, B, N, 0, K, x0, xmax); - } -#pragma omp parallel for num_threads(threads) - for (unsigned int y = 0; y < M; y += MBLOCK_A73) { - unsigned int ymax = y + MBLOCK_A73; - if (ymax > M) { - ymax = M; - } - - float cout0[NBLOCK]; - float cout1[NBLOCK]; - float cout2[NBLOCK]; - float cout3[NBLOCK]; - - float bias_local[4] = {0}; - if (is_bias) { - bias_local[0] = bias[y]; - bias_local[1] = bias[y + 1]; - bias_local[2] = bias[y + 2]; - bias_local[3] = bias[y + 3]; - } - - float* c_ptr0 = C + y * N + x0; - float* c_ptr1 = c_ptr0 + N; - float* c_ptr2 = c_ptr1 + N; - float* c_ptr3 = c_ptr2 + N; - - float* pout0 = c_ptr0; - float* pout1 = c_ptr1; - float* pout2 = c_ptr2; - float* pout3 = c_ptr3; - - const float* a_ptr_l = A_packed + y * K; - const float* b_ptr = b_pannel; - for (int xb = 0; xb < bblocks; xb++) { - if ((y + 3) >= ymax) { - switch ((y + 3) - ymax) { - case 2: - c_ptr1 = cout1; - case 1: - c_ptr2 = cout1; - case 0: - c_ptr3 = cout1; - default: - break; - } - } - if (flag_p_remain && (xb == bblocks - 1)) { - pout0 = c_ptr0; - pout1 = c_ptr1; - pout2 = c_ptr2; - pout3 = c_ptr3; - - c_ptr0 = cout0; - c_ptr1 = cout1; - c_ptr2 = cout2; - c_ptr3 = cout3; - } - const float* a_ptr = a_ptr_l; - int tails = tail_pre; - int k = k_pre; - asm volatile( - "vld1.32 {d4-d5}, [%[bias_ptr]] @ load bias\n" - "vld1.32 {d0-d3}, [%[a_ptr] :128]! 
@ load a0~a3\n" - "vdup.32 q8, d4[0] @ add bias to out00\n" - "pld [%[a_ptr]] @ preload a, 64byte\n" - "vdup.32 q9, d4[0] @ add bias to out01\n" - "pld [%[b_ptr]] @ preload b\n" - "vdup.32 q10, d4[1] @ add bias to out10\n" - "pld [%[a_ptr], #64] @ preload a\n" - "vdup.32 q11, d4[1] @ add bias to out11\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load b1\n" - "vdup.32 q12, d5[0] @ add bias to out20\n" - "pld [%[b_ptr], #64] @ preload b\n" - "vdup.32 q13, d5[0] @ add bias to out21\n" - "pld [%[a_ptr], #128] @ preload a\n" - "vdup.32 q14, d5[1] @ add bias to out30\n" - "pld [%[b_ptr], #128] @ preload b\n" - "vdup.32 q15, d5[1] @ add bias to out31\n" - "pld [%[b_ptr], #192] @ preload b\n" - "cmp %[k], #0 @ check weather k is " - "bigger than 0\n" - "beq 0f @ jump to tail\n" - "1: @ main loop for k\n" - /* Unroll 0*/ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1, b2\n" - "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" - "vld1.32 {d4-d7}, [%[a_ptr] :128]! @ load next 2xa0~a3\n" - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - /* Unroll 1 */ - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" - "pld [%[b_ptr], #64] @ preload b\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out9 += b2 * a3\n" - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1,b2\n" - /* Unroll 2 */ - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" - "vld1.32 {d0-d3}, [%[a_ptr] :128]! @ load next a0~a3\n" - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - /* Unroll 3 */ - "vmla.f32 q8, q6, d6[0] @ out0 += b1 * a0\n" - "pld [%[a_ptr], #64] @ preload a\n" - "vmla.f32 q10, q6, d6[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d7[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d7[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d6[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d6[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d7[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d7[1] @ out7 += b2 * a3\n" - "subs %[k], %[k], #1 @ k--\n" - "bne 1b @ jump to main " - "loop\n" - "0: @ process tail\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "beq 3f @ jump to tail = " - "1\n" - /* Unroll 0*/ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! 
@ load next b1, b2\n" - "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - "beq 4f @ jump to tail==2\n" - /* Unroll 1 */ - "vld1.32 {d8-d11}, [%[b_ptr] :128]! @ load next b1, b2\n" - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" - "vld1.32 {d4-d7}, [%[a_ptr] :128]! @ load next 2xa0~a3\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" - "subs %[tails], %[tails], #1 @ tail--\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out6 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out7 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out8 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out9 += b2 * a3\n" - "beq 5f @ jump to tail==3\n" - /* Unroll 2 */ - "vld1.32 {d12-d15}, [%[b_ptr] :128]! @ load next b1,b2\n" - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - /* Unroll 3 */ - "vmla.f32 q8, q6, d6[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q6, d6[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d7[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d7[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d6[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d6[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d7[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d7[1] @ out7 += b2 * a3\n" - "b 2f\n" - /* tails==1 final tail */ - "3: @ tail=1\n" - "vmla.f32 q8, q4, d0[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d0[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d1[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d1[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d0[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d0[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d1[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d1[1] @ out7 += b2 * a3\n" - /*aptr - 16 */ - "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" - "b 2f @ jump to end\n" - /* tails==2 final tail*/ - "4: @ tail == 2\n" - "vmla.f32 q8, q6, d2[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q6, d2[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q6, d3[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q6, d3[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q7, d2[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q7, d2[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q7, d3[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q7, d3[1] @ out7 += b2 * a3\n" - "b 2f @ jump to end\n" - /* tails==3 final tail*/ - "5: @ tail=3\n" - "vmla.f32 q8, q4, d4[0] @ out0 += b1 * a0\n" - "vmla.f32 q10, q4, d4[1] @ out1 += b1 * a1\n" - "vmla.f32 q12, q4, d5[0] @ out2 += b1 * a2\n" - "vmla.f32 q14, q4, d5[1] @ out3 += b1 * a3\n" - "vmla.f32 q9, q5, d4[0] @ out4 += b2 * a0\n" - "vmla.f32 q11, q5, d4[1] @ out5 += b2 * a1\n" - "vmla.f32 q13, q5, d5[0] @ out6 += b2 * a2\n" - "vmla.f32 q15, q5, d5[1] @ out7 += b2 * a3\n" - /*aptr - 16*/ - "sub %[a_ptr], %[a_ptr], #16 @ tail--\n" - "2: @ check relu\n" - "cmp %[relu], #0 @ check if has relu\n" - "ble 6f @ skip relu if relu <= 0\n" - "vmov.u32 q0, #0 @ for relu\n" - "vmax.f32 q8, q8, q0 @ 
for relu\n" - "vmax.f32 q9, q9, q0 @ for relu\n" - "vmax.f32 q10, q10, q0 @ for relu\n" - "vmax.f32 q11, q11, q0 @ for relu\n" - "vmax.f32 q12, q12, q0 @ for relu\n" - "vmax.f32 q13, q13, q0 @ for relu\n" - "vmax.f32 q14, q14, q0 @ for relu\n" - "vmax.f32 q15, q15, q0 @ for relu\n" - "6: @ store result\n" - "vst1.32 {d16-d19}, [%[c_ptr0]]! @ store r0\n" - "vst1.32 {d20-d23}, [%[c_ptr1]]! @ store r1\n" - "vst1.32 {d24-d27}, [%[c_ptr2]]! @ store r2\n" - "vst1.32 {d28-d31}, [%[c_ptr3]]! @ store r3\n" - : [a_ptr] "+r"(a_ptr), [b_ptr] "+r"(b_ptr), [c_ptr0] "+r"(c_ptr0), - [c_ptr1] "+r"(c_ptr1), [c_ptr2] "+r"(c_ptr2), - [c_ptr3] "+r"(c_ptr3), [k] "+r"(k), [tails] "+r"(tails) - : [bias_ptr] "r"(bias_local), [relu] "r"(is_relu) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11", "q12", "q13", "q14", "q15", "cc", "memory"); - - if (flag_p_remain && (xb == bblocks - 1)) { - for (int i = 0; i < remain; ++i) { - *pout0++ = cout0[i]; - *pout1++ = cout1[i]; - *pout2++ = cout2[i]; - *pout3++ = cout3[i]; - } - } - } - } - } -} -#endif // __aarch64__ - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/packed_sgemm.h b/paddle/fluid/lite/arm/math/packed_sgemm.h deleted file mode 100644 index 160b432c8d8..00000000000 --- a/paddle/fluid/lite/arm/math/packed_sgemm.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include "paddle/fluid/lite/core/context.h" -#include "paddle/fluid/lite/core/cpu_info.h" -#include "paddle/fluid/lite/core/lite_tensor.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -#ifdef __aarch64__ -constexpr int MBLOCK = 8; -constexpr int NBLOCK = 12; -constexpr int KBLOCK = 4; -inline int get_hblock(ARMArch arch) { return MBLOCK; } -#else -constexpr int MBLOCK_A73 = 4; -constexpr int MBLOCK_OTH = 6; -constexpr int NBLOCK = 8; -constexpr int KBLOCK = 4; -inline int get_hblock(ARMArch arch) { - if (arch == kA73) { - return MBLOCK_A73; - } else { - return MBLOCK_OTH; - } -} -#endif // __aarch64__ - -void prepackA(float* out, const float* in, const int ldin, const int m0, - const int mmax, const int k0, const int kmax, bool is_trans, - ARMContext* ctx); - -void prepackA(TensorLite* tout, const TensorLite& tin, int m, int k, int group, - bool is_trans, ARMContext* ctx); - -void sgemm_prepack(const float* A_packed, const float* B, const float* bias, - float* C, int M, int N, int K, bool is_bias, bool is_relu, - bool is_transB, ARMContext* ctx); - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/scale.cc b/paddle/fluid/lite/arm/math/scale.cc deleted file mode 100644 index 40b91e6979f..00000000000 --- a/paddle/fluid/lite/arm/math/scale.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/lite/arm/math/scale.h" -#include "paddle/fluid/lite/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void scale(const float* din, float* dout, int num, float scale, - float bias) { - int cnt = num >> 4; - int remain = num % 16; - float32x4_t vscale = vdupq_n_f32(scale); - float32x4_t vbias = vdupq_n_f32(bias); -#pragma omp parallel for - for (int i = 0; i < cnt; i++) { - const float* din_ptr = din + (i << 4); - float* dout_ptr = dout + (i << 4); - - float32x4_t din0 = vld1q_f32(din_ptr); - float32x4_t din1 = vld1q_f32(din_ptr + 4); - float32x4_t din2 = vld1q_f32(din_ptr + 8); - float32x4_t din3 = vld1q_f32(din_ptr + 12); - - float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale); - float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale); - float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale); - float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale); - - vst1q_f32(dout_ptr, vsum1); - vst1q_f32(dout_ptr + 4, vsum2); - vst1q_f32(dout_ptr + 8, vsum3); - vst1q_f32(dout_ptr + 12, vsum4); - } - if (remain > 0) { - const float* din_ptr = din + (cnt << 4); - float* dout_ptr = dout + (cnt << 4); - for (int i = 0; i < remain; i++) { - *dout_ptr = *din_ptr * scale + bias; - dout_ptr++; - din_ptr++; - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/softmax.cc b/paddle/fluid/lite/arm/math/softmax.cc deleted file mode 100644 index 2a081eaf489..00000000000 --- a/paddle/fluid/lite/arm/math/softmax.cc +++ /dev/null @@ -1,601 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/lite/arm/math/softmax.h" -#include -#include "paddle/fluid/lite/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template <> -void softmax_basic(const float* din, float* dout, const int axis_size, - const int inner_num, const int outer_num) { - int compute_size = inner_num * outer_num; -#pragma omp parallel for - for (int i = 0; i < compute_size; ++i) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner8_axis4(const float* din, float* dout, - const int axis_size, const int inner_num, - const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 3; - int remain = compute_size % 8; - float32x4_t vone = vdupq_n_f32(1.0f); - -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 8; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - // get max axis_size == 4 - const float* din_ptr = din + real_index; - const float* din_ptr1 = din_ptr + inner_num; - const float* din_ptr2 = din_ptr1 + inner_num; - const float* din_ptr3 = din_ptr2 + inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr1); - float32x4_t vdata2 = vld1q_f32(din_ptr2); - float32x4_t vdata3 = vld1q_f32(din_ptr3); - - float32x4_t vdata01 = vld1q_f32(din_ptr + 4); - float32x4_t vdata11 = vld1q_f32(din_ptr1 + 4); - float32x4_t vdata21 = vld1q_f32(din_ptr2 + 4); - float32x4_t vdata31 = vld1q_f32(din_ptr3 + 4); - - float* dout_ptr0 = dout + real_index; - float* dout_ptr1 = dout_ptr0 + inner_num; - float32x4_t vmax1 = vmaxq_f32(vdata0, vdata1); - float32x4_t vmax2 = vmaxq_f32(vdata2, vdata3); - float32x4_t vmax11 = vmaxq_f32(vdata01, vdata11); - float32x4_t vmax21 = vmaxq_f32(vdata21, vdata31); - float* dout_ptr2 = dout_ptr1 + inner_num; - float* dout_ptr3 = dout_ptr2 + inner_num; - float32x4_t vmax = vmaxq_f32(vmax1, vmax2); - float32x4_t vmax_1 = vmaxq_f32(vmax11, vmax21); - - // sub, exp and sum - float32x4_t vsum0 = exp_ps(vsubq_f32(vdata0, vmax)); - float32x4_t vsum1 = exp_ps(vsubq_f32(vdata1, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax)); - float32x4_t vsum3 = exp_ps(vsubq_f32(vdata3, vmax)); - - float32x4_t vsum01 = exp_ps(vsubq_f32(vdata01, vmax_1)); - float32x4_t vsum11 = exp_ps(vsubq_f32(vdata11, vmax_1)); - float32x4_t vsum21 = exp_ps(vsubq_f32(vdata21, vmax_1)); - float32x4_t vsum31 = exp_ps(vsubq_f32(vdata31, vmax_1)); - - float32x4_t vsum_1 = vaddq_f32(vsum0, vsum1); - float32x4_t vsum_2 = vaddq_f32(vsum2, vsum3); - float32x4_t vsum_11 = vaddq_f32(vsum01, vsum11); - float32x4_t vsum_21 = vaddq_f32(vsum21, vsum31); - - float32x4_t vsum = vaddq_f32(vsum_1, vsum_2); - float32x4_t vsum111 = vaddq_f32(vsum_11, vsum_21); - - float32x4_t vinf = div_ps(vone, vsum); - float32x4_t vinf1 = div_ps(vone, vsum111); - - vsum0 = vmulq_f32(vsum0, vinf); - vsum1 = vmulq_f32(vsum1, vinf); - vsum2 = vmulq_f32(vsum2, vinf); - vsum3 = vmulq_f32(vsum3, vinf); - - vsum01 = vmulq_f32(vsum01, vinf1); - vsum11 = vmulq_f32(vsum11, vinf1); - vsum21 = vmulq_f32(vsum21, vinf1); - vsum31 = vmulq_f32(vsum31, vinf1); - - vst1q_f32(dout_ptr0, vsum0); - vst1q_f32(dout_ptr1, vsum1); - vst1q_f32(dout_ptr2, vsum2); - vst1q_f32(dout_ptr3, vsum3); - - vst1q_f32(dout_ptr0 + 4, vsum01); - vst1q_f32(dout_ptr1 + 4, vsum11); - vst1q_f32(dout_ptr2 + 4, vsum21); - 
vst1q_f32(dout_ptr3 + 4, vsum31); - } - - int i = cmp_cnt * 8; - - if (remain > 4) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - // get max axis_size == 4 - const float* din_ptr = din + real_index; - const float* din_ptr1 = din_ptr + inner_num; - const float* din_ptr2 = din_ptr1 + inner_num; - const float* din_ptr3 = din_ptr2 + inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr1); - float32x4_t vdata2 = vld1q_f32(din_ptr2); - float32x4_t vdata3 = vld1q_f32(din_ptr3); - - float* dout_ptr0 = dout + real_index; - float* dout_ptr1 = dout_ptr0 + inner_num; - float32x4_t vmax1 = vmaxq_f32(vdata0, vdata1); - float32x4_t vmax2 = vmaxq_f32(vdata2, vdata3); - float* dout_ptr2 = dout_ptr1 + inner_num; - float* dout_ptr3 = dout_ptr2 + inner_num; - float32x4_t vmax = vmaxq_f32(vmax1, vmax2); - - // sub, exp and sum - float32x4_t vsum0 = exp_ps(vsubq_f32(vdata0, vmax)); - float32x4_t vsum1 = exp_ps(vsubq_f32(vdata1, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax)); - float32x4_t vsum3 = exp_ps(vsubq_f32(vdata3, vmax)); - - float32x4_t vsum_1 = vaddq_f32(vsum0, vsum1); - float32x4_t vsum_2 = vaddq_f32(vsum2, vsum3); - - float32x4_t vsum = vaddq_f32(vsum_1, vsum_2); - - float32x4_t vone = vdupq_n_f32(1.0f); - float32x4_t vinf = div_ps(vone, vsum); - - vsum0 = vmulq_f32(vsum0, vinf); - vsum1 = vmulq_f32(vsum1, vinf); - vsum2 = vmulq_f32(vsum2, vinf); - vsum3 = vmulq_f32(vsum3, vinf); - - vst1q_f32(dout_ptr0, vsum0); - vst1q_f32(dout_ptr1, vsum1); - vst1q_f32(dout_ptr2, vsum2); - vst1q_f32(dout_ptr3, vsum3); - - i += 4; - } - for (; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner4_axis4(const float* din, float* dout, - const int axis_size, const int inner_num, - const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 2; - int remain = compute_size % 4; - float32x4_t vone = vdupq_n_f32(1.0f); - -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 4; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - // get max axis_size == 4 - const float* din_ptr = din + real_index; - const float* din_ptr1 = din_ptr + inner_num; - const float* din_ptr2 = din_ptr1 + inner_num; - const float* din_ptr3 = din_ptr2 + inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr1); - float32x4_t vdata2 = vld1q_f32(din_ptr2); - float32x4_t vdata3 = vld1q_f32(din_ptr3); - - float* dout_ptr0 = dout + real_index; - float* dout_ptr1 = dout_ptr0 + inner_num; - float32x4_t vmax1 = vmaxq_f32(vdata0, vdata1); - float32x4_t vmax2 = vmaxq_f32(vdata2, vdata3); - float* dout_ptr2 = dout_ptr1 + inner_num; - float* dout_ptr3 = dout_ptr2 + inner_num; - float32x4_t vmax = vmaxq_f32(vmax1, vmax2); - - // sub, exp and sum - float32x4_t vsum0 = exp_ps(vsubq_f32(vdata0, vmax)); - float32x4_t vsum1 = exp_ps(vsubq_f32(vdata1, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax)); - float32x4_t vsum3 = exp_ps(vsubq_f32(vdata3, vmax)); - - float32x4_t vsum_1 = vaddq_f32(vsum0, vsum1); - float32x4_t vsum_2 = vaddq_f32(vsum2, vsum3); - - float32x4_t vsum = vaddq_f32(vsum_1, vsum_2); - - float32x4_t vinf = div_ps(vone, vsum); - - vsum0 = vmulq_f32(vsum0, vinf); - vsum1 = vmulq_f32(vsum1, vinf); - vsum2 = vmulq_f32(vsum2, vinf); - vsum3 = vmulq_f32(vsum3, vinf); - - vst1q_f32(dout_ptr0, vsum0); - vst1q_f32(dout_ptr1, vsum1); - vst1q_f32(dout_ptr2, vsum2); - vst1q_f32(dout_ptr3, vsum3); - } - - int i = cmp_cnt * 8; - for (; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner8(const float* din, float* dout, const int axis_size, - const int inner_num, const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 3; -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 8; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - const float* din_ptr = din + real_index; - float32x4_t vmax = vld1q_f32(din_ptr); - float32x4_t vmax2 = vld1q_f32(din_ptr + 4); - // get max - for (int j = 1; j < axis_size; ++j) { - din_ptr += inner_num; - float32x4_t vdata = vld1q_f32(din_ptr); - float32x4_t vdata2 = vld1q_f32(din_ptr + 4); - vmax = vmaxq_f32(vmax, vdata); - vmax2 = vmaxq_f32(vmax2, vdata2); - } - - // sub, exp and sum - din_ptr = din + real_index; - float* dout_ptr = dout + real_index; - float32x4_t vdata = vld1q_f32(din_ptr); - float32x4_t vdata2 = vld1q_f32(din_ptr + 4); - float32x4_t vsum = exp_ps(vsubq_f32(vdata, vmax)); - float32x4_t vsum2 = exp_ps(vsubq_f32(vdata2, vmax2)); - din_ptr += inner_num; - vst1q_f32(dout_ptr, vsum); - vst1q_f32(dout_ptr + 4, vsum2); - dout_ptr += inner_num; - for (int j = 1; j < axis_size; ++j) { - float32x4_t vdata0 = vld1q_f32(din_ptr); - float32x4_t vdata1 = vld1q_f32(din_ptr + 4); - vdata0 = exp_ps(vsubq_f32(vdata0, vmax)); - vdata1 = exp_ps(vsubq_f32(vdata1, vmax2)); - din_ptr += inner_num; - vsum = vaddq_f32(vsum, vdata0); - vsum2 = vaddq_f32(vsum2, vdata1); - vst1q_f32(dout_ptr, vdata0); - vst1q_f32(dout_ptr + 4, vdata1); - dout_ptr += inner_num; - } - - float32x4_t vone = vdupq_n_f32(1.0f); - float32x4_t vinf = div_ps(vone, vsum); - float32x4_t vinf2 = div_ps(vone, vsum2); - dout_ptr = dout + real_index; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - float32x4_t vdata0 = vld1q_f32(dout_ptr); - float32x4_t vdata1 = vld1q_f32(dout_ptr + 4); - vdata0 = vmulq_f32(vdata0, vinf); - vdata1 = vmulq_f32(vdata1, vinf2); - vst1q_f32(dout_ptr, vdata0); - vst1q_f32(dout_ptr + 4, vdata1); - dout_ptr += inner_num; - } - } - - for (int i = cmp_cnt * 8; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner4(const float* din, float* dout, const int axis_size, - const int inner_num, const int outer_num) { - int compute_size = inner_num * outer_num; - int cmp_cnt = compute_size >> 2; -#pragma omp parallel for - for (int c = 0; c < cmp_cnt; ++c) { - int i = c * 4; - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - // float max_data = din[real_index]; - const float* din_ptr = din + real_index; - float32x4_t vmax = vld1q_f32(din_ptr); - // get max - for (int j = 1; j < axis_size; ++j) { - din_ptr += inner_num; - float32x4_t vdata = vld1q_f32(din_ptr); - vmax = vmaxq_f32(vmax, vdata); - } - // sub, exp and sum - din_ptr = din + real_index; - float* dout_ptr = dout + real_index; - float32x4_t vdata = vld1q_f32(din_ptr); - float32x4_t vsum = exp_ps(vsubq_f32(vdata, vmax)); - din_ptr += inner_num; - vst1q_f32(dout_ptr, vsum); - dout_ptr += inner_num; - for (int j = 1; j < axis_size; ++j) { - // real_index += inner_num; - float32x4_t vdata0 = vld1q_f32(din_ptr); - vdata0 = exp_ps(vsubq_f32(vdata0, vmax)); - din_ptr += inner_num; - vsum = vaddq_f32(vsum, vdata0); - vst1q_f32(dout_ptr, vdata0); - dout_ptr += inner_num; - } - - float32x4_t vone = vdupq_n_f32(1.0f); - float32x4_t vinf = div_ps(vone, vsum); - dout_ptr = dout + real_index; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - float32x4_t vdata0 = vld1q_f32(dout_ptr); - vdata0 = vmulq_f32(vdata0, vinf); - vst1q_f32(dout_ptr, vdata0); - dout_ptr += inner_num; - } - } - - for (int i = cmp_cnt * 4; i < compute_size; i++) { - int idx_inner = i % inner_num; - int idx_outer = (i / inner_num) * axis_size; - int real_index = idx_outer * inner_num + idx_inner; - - float max_data = din[real_index]; - // get max - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - max_data = din[real_index] > max_data ? 
din[real_index] : max_data; - } - - real_index = idx_outer * inner_num + idx_inner; - // sub, exp and sum - dout[real_index] = expf(din[real_index] - max_data); - float sum_data = dout[real_index]; - for (int j = 1; j < axis_size; ++j) { - real_index += inner_num; - dout[real_index] = expf(din[real_index] - max_data); - sum_data += dout[real_index]; - } - - float sum_inv = 1.f / sum_data; - real_index = idx_outer * inner_num + idx_inner; - // get softmax result - for (int j = 0; j < axis_size; ++j) { - dout[real_index] *= sum_inv; - real_index += inner_num; - } - } -} - -template <> -void softmax_inner1_large_axis(const float* din, float* dout, - const int outer_size, - const int axis_size) { -#pragma omp parallel for - for (int i = 0; i < outer_size; ++i) { - const float* din_ptr = din + i * axis_size; - float* dout_ptr = dout + i * axis_size; - - const float* din_max_ptr = din_ptr; - int nn = axis_size >> 2; - - // get max - float32x4_t vmax = vld1q_f32(din_max_ptr); - din_max_ptr += 4; - int j = 1; - for (; j < nn; ++j) { - vmax = vmaxq_f32(vmax, vld1q_f32(din_max_ptr)); - din_max_ptr += 4; - } - float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax)); - float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1)); - for (j = 4 * j; j < axis_size; ++j) { - max_data = std::max(max_data, din_max_ptr[0]); - din_max_ptr++; - } - - // sub, exp and sum - const float* din_sum_ptr = din_ptr; - float* dout_sum_ptr = dout_ptr; - vmax = vdupq_n_f32(max_data); - float32x4_t vsub_exp = exp_ps(vsubq_f32(vld1q_f32(din_sum_ptr), vmax)); - float32x4_t vsum = vsub_exp; - vst1q_f32(dout_sum_ptr, vsub_exp); - din_sum_ptr += 4; - dout_sum_ptr += 4; - - j = 1; - for (; j < nn; ++j) { - vsub_exp = exp_ps(vsubq_f32(vld1q_f32(din_sum_ptr), vmax)); - vst1q_f32(dout_sum_ptr, vsub_exp); - vsum = vaddq_f32(vsum, vsub_exp); - din_sum_ptr += 4; - dout_sum_ptr += 4; - } - float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum)); - float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1); - - for (j = 4 * j; j < axis_size; ++j) { - dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data); - sum_data += dout_sum_ptr[0]; - din_sum_ptr++; - dout_sum_ptr++; - } - - float sum_inv = 1.f / sum_data; - float* dout_res_ptr = dout_ptr; - float32x4_t vinv = vdupq_n_f32(sum_inv); - // get softmax result - j = 0; - for (; j < nn; ++j) { - float32x4_t vout = vld1q_f32(dout_res_ptr); - float32x4_t vres = vmulq_f32(vout, vinv); - vst1q_f32(dout_res_ptr, vres); - dout_res_ptr += 4; - } - for (j = nn * 4; j < axis_size; ++j) { - dout_ptr[j] *= sum_inv; - } - } -} - -template <> -void softmax_inner1_small_axis(const float* din, float* dout, - const int outer_size, - const int axis_size) { -#pragma omp parallel for - for (int i = 0; i < outer_size; ++i) { - const float* din_ptr = din + i * axis_size; - float* dout_ptr = dout + i * axis_size; - // get max - float max_data = din_ptr[0]; - for (int j = 1; j < axis_size; ++j) { - max_data = std::max(max_data, din_ptr[j]); - } - - // sub, exp and sum - float sum_data = 0.f; - for (int j = 0; j < axis_size; ++j) { - dout_ptr[j] = expf(din_ptr[j] - max_data); - sum_data += dout_ptr[j]; - } - - float sum_inv = 1.f / sum_data; - for (int j = 0; j < axis_size; ++j) { - dout_ptr[j] *= sum_inv; - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/softmax.h b/paddle/fluid/lite/arm/math/softmax.h deleted file mode 100644 index c0109ffd12f..00000000000 --- 
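Every kernel in this file realizes the same contract in three passes over the softmax axis (max, exp-and-sum, normalize); the NEON variants only change how many inner positions are processed per iteration. One slip worth noting: in `softmax_inner4_axis4` above, the scalar tail resumes at `cmp_cnt * 8` although its vector loop consumes 4 elements per iteration, which looks like an off-by-factor carried over from the inner8 variant and leaves the final `compute_size % 4` positions unwritten. The scalar reference the vector paths are meant to match (a sketch):

```cpp
#include <algorithm>
#include <cmath>

void softmax_ref(const float* din, float* dout, int axis_size, int inner_num,
                 int outer_num) {
  for (int o = 0; o < outer_num; ++o) {
    for (int in = 0; in < inner_num; ++in) {
      const float* x = din + (o * axis_size) * inner_num + in;
      float* y = dout + (o * axis_size) * inner_num + in;
      float max_v = x[0];                                // pass 1: max
      for (int j = 1; j < axis_size; ++j)
        max_v = std::max(max_v, x[j * inner_num]);
      float sum = 0.f;                                   // pass 2: exp + sum
      for (int j = 0; j < axis_size; ++j) {
        y[j * inner_num] = expf(x[j * inner_num] - max_v);
        sum += y[j * inner_num];
      }
      float inv = 1.f / sum;                             // pass 3: normalize
      for (int j = 0; j < axis_size; ++j) y[j * inner_num] *= inv;
    }
  }
}
```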
a/paddle/fluid/lite/arm/math/softmax.h
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-
-template <typename T>
-void softmax_basic(const T* din, T* dout, const int axis_size,
-                   const int inner_num, const int outer_num);
-
-template <typename T>
-void softmax_inner8_axis4(const T* din, T* dout, const int axis_size,
-                          const int inner_num, const int outer_num);
-
-template <typename T>
-void softmax_inner4_axis4(const T* din, T* dout, const int axis_size,
-                          const int inner_num, const int outer_num);
-template <typename T>
-void softmax_inner8(const T* din, T* dout, const int axis_size,
-                    const int inner_num, const int outer_num);
-
-template <typename T>
-void softmax_inner4(const T* din, T* dout, const int axis_size,
-                    const int inner_num, const int outer_num);
-
-template <typename T>
-void softmax_inner1_large_axis(const T* din, T* dout, const int outer_size,
-                               const int axis_size);
-
-template <typename T>
-void softmax_inner1_small_axis(const T* din, T* dout, const int outer_size,
-                               const int axis_size);
-
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
diff --git a/paddle/fluid/lite/core/CMakeLists.txt b/paddle/fluid/lite/core/CMakeLists.txt
index 3edd5db08fd..ac30772cd04 100644
--- a/paddle/fluid/lite/core/CMakeLists.txt
+++ b/paddle/fluid/lite/core/CMakeLists.txt
@@ -1,8 +1,8 @@
 if (WITH_TESTING)
-  cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest)
+  lite_cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest gflags)
 endif()
 lite_cc_library(target_wrapper_lite SRCS target_wrapper.cc
-    DEPS target_wrapper_host
+    DEPS target_wrapper_host place_lite
     X86_DEPS target_wrapper_x86
     CUDA_DEPS target_wrapper_cuda)
 lite_cc_library(memory_lite SRCS memory.cc DEPS target_wrapper_lite)
@@ -19,22 +19,32 @@ endif()
 proto_library(framework_proto_lite SRCS framework.proto)
-cc_library(kernel_lite SRCS kernel.cc DEPS type_system target_wrapper_lite any_lite op_params_lite framework_proto_lite ${tensor_lite})
-cc_library(variable_lite SRCS variable.cc)
-cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite)
-cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite})
-cc_library(cpu_info_lite SRCS cpu_info.cc)
-cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite)
-cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite
+if (LITE_WITH_X86)
+lite_cc_library(variable_lite SRCS variable.cc DEPS framework_proto)
+lite_cc_library(types_lite SRCS types.cc DEPS framework_proto)
+else()
+lite_cc_library(variable_lite SRCS variable.cc)
+lite_cc_library(types_lite SRCS types.cc)
+endif()
+lite_cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite)
+lite_cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite})
+lite_cc_library(cpu_info_lite SRCS cpu_info.cc)
+
+if (LITE_WITH_ARM)
+lite_cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite
cpu_info_lite CL_DEPS cl_helper) +else() +lite_cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite eigen3 CL_DEPS cl_helper) +endif() +lite_cc_library(kernel_lite SRCS kernel.cc DEPS context_lite type_system target_wrapper_lite any_lite op_params_lite framework_proto_lite ${tensor_lite}) +lite_cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite kernel_lite cpp_op_desc_lite ${tensor_lite}) -cc_library(types_lite SRCS types.cc) -cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite) +lite_cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite) lite_cc_library(program_lite SRCS program.cc - DEPS op_lite kernel_lite compatible_pb_lite model_parser_lite + DEPS op_lite kernel_lite compatible_pb_lite model_parser_lite ${ops_lite} HVY_DEPS framework_proto PROFILE_DEPS basic_profiler_lite) -cc_library(optimizer_lite SRCS optimizer.cc DEPS mir_pass_manager model_parser_lite program_lite) +lite_cc_library(optimizer_lite SRCS optimizer.cc DEPS mir_pass_manager model_parser_lite program_lite) add_subdirectory(mir) add_subdirectory(profile) @@ -44,7 +54,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) return() endif() -cc_library(program_fake_utils SRCS program_fake_utils.cc DEPS mir_ssa_graph +lite_cc_library(program_fake_utils SRCS program_fake_utils.cc DEPS mir_ssa_graph scope_lite op_registry_lite proto_desc op_lite ${ops_lite} ${host_kernels} diff --git a/paddle/fluid/lite/core/context.cc b/paddle/fluid/lite/core/context.cc index cd7006f4724..97d98a35737 100644 --- a/paddle/fluid/lite/core/context.cc +++ b/paddle/fluid/lite/core/context.cc @@ -13,317 +13,11 @@ // limitations under the License. #include "paddle/fluid/lite/core/context.h" -#include "paddle/fluid/lite/core/cpu_info.h" -#ifdef LITE_WITH_LINUX -#include -#include +#ifdef LITE_WITH_OPENCL +DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); #endif -#if __APPLE__ -#include "TargetConditionals.h" -#if TARGET_OS_IPHONE -#include -#include -#include -#endif // TARGET_OS_IPHONE -#endif // __APPLE__ namespace paddle { -namespace lite { - -#ifdef LITE_WITH_ARM - -void Context::SetCache(int l1size, int l2size, int l3size) { - DeviceInfo& dev = DeviceInfo::Global(); - int cpu_count = arm_get_cpucount(); - dev.L1_cache_.resize(cpu_count); - dev.L2_cache_.resize(cpu_count); - dev.L3_cache_.resize(cpu_count); - for (int i = 0; i < cpu_count; ++i) { - dev.L1_cache_[i] = l1size; - dev.L2_cache_[i] = l2size; - dev.L3_cache_[i] = l3size; - } - workspace_.Resize({2 * (l1size + l2size)}); -} - -Context::Context() { - active_ids_ = {0}; - mode_ = LITE_POWER_HIGH; - DeviceInfo& dev = DeviceInfo::Global(); - workspace_.Resize( - {static_cast(dev.L2_cache_[active_ids_[0]] / sizeof(float))}); -#ifdef TARGET_IOS - arch_ = APPLE; // use 6x8 -#else - if (dev.big_core_ids_.size() > 0) { - arch_ = dev.archs_[dev.big_core_ids_[0]]; - } -#endif -} - -PowerMode Context::mode() const { return mode_; } - -int Context::threads() const { return active_ids_.size(); } - -Context::Context(const ARMContext& ctx) { - mode_ = ctx.mode_; - active_ids_ = ctx.active_ids_; - workspace_ = ctx.workspace_; - arch_ = ctx.arch_; - count_ = ctx.count_; -} - -ARMContext& Context::operator=(const ARMContext& ctx) { - mode_ = ctx.mode_; - active_ids_ = ctx.active_ids_; - workspace_ = ctx.workspace_; - arch_ = ctx.arch_; - count_ = ctx.count_; - return *this; -} - -void Context::BindDev() { -#ifdef USE_OPENMP - int num_threads = active_ids_.size(); - 
omp_set_num_threads(num_threads); -#ifdef LITE_WITH_LINUX - std::vector ssarets; - for (int j = 0; j < num_threads; ++j) { - ssarets.push_back(0); - } -#pragma omp parallel for - for (int i = 0; i < num_threads; i++) { - ssarets[i] = set_sched_affinity(active_ids_); - } - for (int i = 0; i < num_threads; i++) { - if (ssarets[i] != 0) { - LOGE("set cpu affinity failed, cpuID: %d\n", active_ids_[i]); - return; - } - } -#endif // LITE_WITH_LINUX -#else // USE_OPENMP -#ifdef LITE_WITH_LINUX - std::vector cpuid1; - cpuid1.push_back(active_ids_[0]); - int ssaret = set_sched_affinity(cpuid1); - if (ssaret != 0) { - printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]); - return; - } -#endif // LITE_WITH_LINUX -#endif // USE_OPENMP -} - -void Context::SetRunMode(PowerMode mode, int threads) { - DeviceInfo& dev = DeviceInfo::Global(); - int big_core_size = dev.big_core_ids_.size(); - int small_core_size = dev.little_core_ids_.size(); - if (threads > big_core_size + small_core_size) { - threads = big_core_size + small_core_size; - } -#ifdef USE_OPENMP - count_++; - int shift_num = (count_ / 10) % big_core_size; - switch (mode) { - case LITE_POWER_FULL: - mode_ = mode; - active_ids_.clear(); - for (int i = 0; i < threads; ++i) { - if (i < big_core_size) { - active_ids_.push_back(dev.big_core_ids_[i]); - } else { - active_ids_.push_back(dev.little_core_ids_[i - big_core_size]); - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_HIGH: - active_ids_.clear(); - if (big_core_size > 0) { - mode_ = LITE_POWER_HIGH; - if (threads > big_core_size) { - LOGE("threads: %d, exceed the big cores size: %d\n", threads, - big_core_size); - active_ids_ = dev.big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.big_core_ids_[i]); - } - } - } else { - mode_ = LITE_POWER_LOW; - LOGE("HIGH POWER MODE is not support, switch to little cores\n"); - if (threads > small_core_size) { - active_ids_ = dev.little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.little_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_LOW: - active_ids_.clear(); - if (small_core_size > 0) { - mode_ = LITE_POWER_LOW; - if (threads > small_core_size) { - LOGW("threads: %d, exceed the little cores size: %d\n", threads, - small_core_size); - active_ids_ = dev.little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.little_core_ids_[i]); - } - } - } else { - mode_ = LITE_POWER_HIGH; - LOGW("LOW POWER MODE is not support, switch to big cores\n"); - if (threads > big_core_size) { - active_ids_ = dev.big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.big_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_NO_BIND: - mode_ = LITE_POWER_NO_BIND; - active_ids_.clear(); - if (threads > dev.core_ids_.size()) { - active_ids_.resize(dev.core_ids_.size()); - } else { - active_ids_.resize(threads); - } - break; - case LITE_POWER_RAND_HIGH: - active_ids_.clear(); - if (big_core_size > 0) { - mode_ = LITE_POWER_RAND_HIGH; - if (threads > big_core_size) { - LOGW("threads: %d, exceed the big cores size: %d\n", threads, - big_core_size); - active_ids_ = dev.big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back( - dev.big_core_ids_[(i + shift_num) % big_core_size]); - } - } - } else { - mode_ 
= LITE_POWER_LOW; - LOGW("HIGH POWER MODE is not support, switch to little cores\n"); - if (threads > small_core_size) { - active_ids_ = dev.little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.little_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_RAND_LOW: - active_ids_.clear(); - if (small_core_size > 0) { - mode_ = LITE_POWER_RAND_LOW; - if (threads > small_core_size) { - LOGW("threads: %d, exceed the little cores size: %d\n", threads, - small_core_size); - active_ids_ = dev.little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back( - dev.little_core_ids_[(i + shift_num) % small_core_size]); - } - } - } else { - mode_ = LITE_POWER_HIGH; - LOGW("LOW POWER MODE is not support, switch to big cores\n"); - if (threads > big_core_size) { - active_ids_ = dev.big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(dev.big_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - } - //! fix multi-threads LITE_POWER_HIGH mode - if (mode_ == LITE_POWER_NO_BIND || threads > 1) { - int threads = active_ids_.size(); - omp_set_num_threads(threads); - } else { - if (check_online(active_ids_)) { - BindDev(); - } else { - LOG(ERROR) << "core id " << active_ids_[0] - << " is offline, switch to NO BIND MODE"; - int threads = active_ids_.size(); - omp_set_num_threads(threads); - } - } -#else - if (big_core_size > 0) { - active_ids_ = {dev.big_core_ids_[0]}; - } else { - active_ids_ = {0}; - } -#endif - //! alloc memory for sgemm in this context - int temp_mem_size = - DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float); - workspace_.Resize({temp_mem_size}); - arch_ = DeviceInfo::Global().archs_[active_ids_[0]]; -} - -ARMArch Context::arch() const { return arch_; } - -void Context::SetArch(ARMArch arch) { arch_ = arch; } - -int Context::l1_cache_size() const { - DeviceInfo& dev = DeviceInfo::Global(); - return dev.L1_cache_[active_ids_[0]]; -} - -int Context::l2_cache_size() const { - DeviceInfo& dev = DeviceInfo::Global(); - return dev.L2_cache_[active_ids_[0]]; -} - -int Context::l3_cache_size() const { - DeviceInfo& dev = DeviceInfo::Global(); - return dev.L3_cache_[active_ids_[0]]; -} - -bool Context::ExtendWorkspace(DDimLite dims) { - auto count = dims.product(); - auto old = workspace_.dims(); - if (count == old.product()) { - return false; - } - - workspace_.Resize( - {static_cast(count + l2_cache_size() / sizeof(float))}); - return true; -} -#endif // LITE_WITH_ARM - -} // namespace lite +namespace lite {} // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/core/context.h b/paddle/fluid/lite/core/context.h index 483f5154144..d75c85d54cf 100644 --- a/paddle/fluid/lite/core/context.h +++ b/paddle/fluid/lite/core/context.h @@ -23,6 +23,11 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device_context.h" #endif +#ifdef LITE_WITH_OPENCL +#include "paddle/fluid/lite/opencl/cl_context.h" +#include "paddle/fluid/lite/opencl/cl_engine.h" +#include "paddle/fluid/lite/opencl/cl_helper.h" +#endif #include #include #include @@ -34,6 +39,10 @@ #include "paddle/fluid/lite/core/target_wrapper.h" #include "paddle/fluid/lite/utils/all.h" +#ifdef LITE_WITH_OPENCL +DECLARE_string(cl_path); +#endif + namespace paddle { namespace lite { @@ -44,6 +53,7 @@ using HostContext = Context; using X86Context = Context; using CUDAContext = Context; 
using ARMContext = Context<TargetType::kARM>;
+using OpenClContext = Context<TargetType::kOpenCL>;
 
 template <>
 class Context<TargetType::kHost> {
@@ -51,7 +61,7 @@ class Context {
   // NOTE: InitOnce should only be used by ContextScheduler
   void InitOnce() {}
 
-  void CopyShared(const HostContext* ctx) {}
+  void CopySharedTo(const HostContext* ctx) {}
 
   std::string name() const { return "HostContext"; }
 };
@@ -61,47 +71,41 @@ class Context {
 template <>
 class Context<TargetType::kARM> {
  public:
-  Context();
-  Context(PowerMode mode, int threads);
+  Context() {}
   explicit Context(const ARMContext& ctx);
-  ARMContext& operator=(const ARMContext& ctx);
+  ARMContext& operator=(const ARMContext& ctx) { return *this; }
 
   // NOTE: InitOnce should only be used by ContextScheduler
   void InitOnce() { DeviceInfo::Init(); }
 
-  void CopyShared(const ARMContext* ctx) {}
+  void CopySharedTo(const ARMContext* ctx) {}
 
-  void SetRunMode(PowerMode mode, int threads);
-  void SetCache(int l1size, int l2size, int l3size);
-  void SetArch(ARMArch arch);
-  void BindDev();
+  void SetRunMode(PowerMode mode, int threads) {
+    return DeviceInfo::Global().SetRunMode(mode, threads);
+  }
+  void SetCache(int l1size, int l2size, int l3size) {
+    return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
+  }
+  void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
 
-  PowerMode mode() const;
-  int threads() const;
-  ARMArch arch() const;
+  PowerMode mode() const { return DeviceInfo::Global().mode(); }
+  int threads() const { return DeviceInfo::Global().threads(); }
+  ARMArch arch() const { return DeviceInfo::Global().arch(); }
+  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
+  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
+  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
 
   template <typename T>
   T* workspace_data() {
-    return workspace_.mutable_data<T>();
+    return DeviceInfo::Global().workspace_data<T>();
   }
 
-  int l1_cache_size() const;
-  int l2_cache_size() const;
-  int l3_cache_size() const;
-  bool ExtendWorkspace(DDimLite dims);
+  bool ExtendWorkspace(DDimLite dims) {
+    return DeviceInfo::Global().ExtendWorkspace(dims);
+  }
 
   std::string name() const { return "ARMContext"; }
-
- private:
-  // LITE_POWER_HIGH stands for using big cores,
-  // LITE_POWER_LOW stands for using small core,
-  // LITE_POWER_FULL stands for using all cores
-  ARMArch arch_;
-  PowerMode mode_;
-  std::vector<int> active_ids_;
-  TensorLite workspace_;
-  int64_t count_{0};
 };
 #endif
 
@@ -115,7 +119,7 @@ class Context {
     cublas_fp32_ = std::make_shared<cuda::Blas<float>>();
   }
 
-  void CopyShared(const CUDAContext* ctx) {
+  void CopySharedTo(const CUDAContext* ctx) {
     CHECK(ctx);
     CHECK(cublas_fp32_) << "cublas_fp32 should be set first";
     ctx->cublas_fp32_ = cublas_fp32_;
@@ -181,7 +185,7 @@ class Context {
   // NOTE: InitOnce should only be used by ContextScheduler
   void InitOnce() {}
 
-  void CopyShared(const X86Context* ctx) {}
+  void CopySharedTo(const X86Context* ctx) {}
 
   const device_ctx_t* x86_device_context() { return x86_device_context_.get(); }
 
   void SetX86DeviceContext(std::unique_ptr<device_ctx_t>&& ctx) {
@@ -208,6 +212,43 @@ class Context {
 };
 #endif
 
+#ifdef LITE_WITH_OPENCL
+template <>
+class Context<TargetType::kOpenCL> {
+  mutable std::shared_ptr<CLContext> cl_context_;
+  mutable std::shared_ptr<CLHelper> cl_helper_;
+
+ public:
+  CLContext* cl_context() { return cl_context_.get(); }
+  CLHelper* cl_helper() { return cl_helper_.get(); }
+
+  void InitOnce() {
+    // Init cl engine.
+    CHECK(CLEngine::Global()->IsInitSuccess()) << "OpenCL engine init failed";
+    CLEngine::Global()->set_cl_path(FLAGS_cl_path);
+
+    cl_context_ = std::make_shared<CLContext>();
+    cl_helper_ = std::make_shared<CLHelper>();
+    cl_helper_->set_context(cl_context_.get());
+
+    PrepareKernels();
+  }
+
+  void CopySharedTo(const OpenClContext* ctx) {
+    ctx->cl_context_ = cl_context_;
+    ctx->cl_helper_ = cl_helper_;
+  }
+
+ private:
+  void PrepareKernels() {
+    cl_helper_->AddKernel("elementwise_add", "elementwise_add_kernel.cl");
+    cl_helper_->AddKernel("channel_add", "channel_add_kernel.cl");
+    cl_helper_->AddKernel("pool_max", "pool_kernel.cl");
+    cl_helper_->AddKernel("pool_avg", "pool_kernel.cl");
+  }
+};
+#endif
+
 // Context for running a kernel.
 // Holds the necessary resource and information.
 class KernelContext {
@@ -236,26 +277,32 @@ class ContextScheduler {
     std::unique_ptr<KernelContext> ctx(new KernelContext);
     switch (target) {
       case TARGET(kHost):
-        kernel_contexts_[TargetType::kHost].As<HostContext>().CopyShared(
+        kernel_contexts_[TargetType::kHost].As<HostContext>().CopySharedTo(
            &ctx->As<HostContext>());
         break;
 #ifdef LITE_WITH_X86
       case TARGET(kX86):
-        kernel_contexts_[TargetType::kX86].As<X86Context>().CopyShared(
+        kernel_contexts_[TargetType::kX86].As<X86Context>().CopySharedTo(
            &ctx->As<X86Context>());
         break;
 #endif
 #ifdef LITE_WITH_CUDA
       case TARGET(kCUDA):
-        kernel_contexts_[TargetType::kCUDA].As<CUDAContext>().CopyShared(
+        kernel_contexts_[TargetType::kCUDA].As<CUDAContext>().CopySharedTo(
            &ctx->As<CUDAContext>());
         break;
 #endif
 #ifdef LITE_WITH_ARM
       case TARGET(kARM):
-        kernel_contexts_[TargetType::kARM].As<ARMContext>().CopyShared(
+        kernel_contexts_[TargetType::kARM].As<ARMContext>().CopySharedTo(
            &ctx->As<ARMContext>());
         break;
+#endif
+#ifdef LITE_WITH_OPENCL
+      case TARGET(kOpenCL):
+        kernel_contexts_[TargetType::kOpenCL].As<OpenClContext>().CopySharedTo(
+            &ctx->As<OpenClContext>());
+        break;
 #endif
       default:
         LOG(FATAL) << "unsupported target " << TargetToStr(target);
@@ -279,6 +326,9 @@ class ContextScheduler {
 #endif
 #ifdef LITE_WITH_ARM
     InitContext<TargetType::kARM, ARMContext>();
+#endif
+#ifdef LITE_WITH_OPENCL
+    InitContext<TargetType::kOpenCL, OpenClContext>();
 #endif
   }
 
diff --git a/paddle/fluid/lite/core/context.h b/paddle/fluid/lite/core/cpu_info.cc
index df80f1c8576..3b9fdcd602d 100644
--- a/paddle/fluid/lite/core/cpu_info.cc
+++ b/paddle/fluid/lite/core/cpu_info.cc
@@ -12,312 +12,81 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
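The context.h changes above turn each per-target context into a thin wrapper whose shared state is cloned by CopySharedTo(). A minimal sketch of the intended call pattern, assuming the scheduler's factory method (its declaration falls outside the quoted hunks) is ContextScheduler::Global().NewContext():

```cpp
// Sketch only: how runtime code would obtain and tune a kernel context.
#include "paddle/fluid/lite/core/context.h"

void PrepareArmKernelContext() {
  using namespace paddle::lite;
  // Assumed factory: wraps the switch over TARGET(...) shown above.
  std::unique_ptr<KernelContext> ctx =
      ContextScheduler::Global().NewContext(TARGET(kARM));
  // CopySharedTo() has already cloned the shared members; per-kernel
  // tuning such as the power mode stays local to this context.
  ctx->As<ARMContext>().SetRunMode(LITE_POWER_HIGH, 2);
}
```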
+#ifdef LITE_WITH_LINUX +#include +#include +#endif +#if __APPLE__ +#include "TargetConditionals.h" +#if TARGET_OS_IPHONE +#include +#include +#include +#endif // TARGET_OS_IPHONE +#endif // __APPLE__ + +#ifdef ARM_WITH_OMP +#include +#endif + +#include +#include #include "paddle/fluid/lite/core/cpu_info.h" -#include namespace paddle { namespace lite { #ifdef LITE_WITH_ARM -void DeviceInfo::InitInternal(DeviceInfo* dev) { - set_default_cache(dev); - dev->compute_core_num_ = arm_get_cpucount(); - dev->max_memory_ = arm_get_meminfo(); - -// get max freq -#ifdef LITE_WITH_LINUX - std::vector max_freq(dev->compute_core_num_); - for (int i = 0; i < dev->compute_core_num_; ++i) { - max_freq[i] = get_max_freq_khz(i) / 1000; - } - std::string cpu_name = arm_get_cpu_name(); - if (get_cpu_info_from_name(dev, cpu_name) != true) { - arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_, - max_freq, &dev->cluster_ids_); - dev->big_core_ids_.clear(); - dev->little_core_ids_.clear(); - for (int i = 0; i < dev->cluster_ids_.size(); ++i) { - if (dev->cluster_ids_[i] == 0) { - dev->big_core_ids_.push_back(dev->core_ids_[i]); - } else { - dev->little_core_ids_.push_back(dev->core_ids_[i]); - } - } - arm_get_cpu_arch(&dev->archs_); - } - - LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_; - for (int i = 0; i < dev->compute_core_num_; ++i) { - LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i] - << ", frequence: " << max_freq[i] - << ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]] - << ", CPU ARCH: A" << dev->archs_[i]; - } - LOG(INFO) << "L1 DataCache size is: "; - for (int i = 0; i < dev->compute_core_num_; ++i) { - LOG(INFO) << dev->L1_cache_[i] / 1024 << " KB"; - } - LOG(INFO) << "L2 Cache size is: "; - for (int i = 0; i < dev->compute_core_num_; ++i) { - LOG(INFO) << dev->L2_cache_[i] / 1024 << " KB"; - } - LOG(INFO) << "Total memory: " << dev->max_memory_ << "KB"; - - dev->max_freq_ = max_freq[0]; - for (int j = 1; j < dev->compute_core_num_; ++j) { - if (dev->max_freq_ < max_freq[j]) { - dev->max_freq_ = max_freq[j]; - } - } -#elif defined(TARGET_IOS) - arm_get_cpu_arch(&dev->archs_); +#ifdef TARGET_IOS +const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; +const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; +const int DEFAULT_L3_CACHE_SIZE = 0; +#else +const int DEFAULT_L1_CACHE_SIZE = 32 * 1024; +const int DEFAULT_L2_CACHE_SIZE = 512 * 1024; +const int DEFAULT_L3_CACHE_SIZE = 0; #endif -} -// cache_id : 0 -> L1, 1 -> L2, 2 -> L3 -void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) 
{ - va_list arg_ptr; - va_start(arg_ptr, argc); - std::vector* cache; - switch (cache_id) { - case 0: - cache = &cpu_info->L1_cache_; - break; - case 1: - cache = &cpu_info->L2_cache_; - break; - case 2: - cache = &cpu_info->L3_cache_; - break; - default: +int get_cpu_num() { +#ifdef LITE_WITH_LINUX + // get cpu count from /sys/devices/system/cpu/cpunum/uevent + int max_cpu_num = 20; + int cpu_num = 0; + for (int i = 0; i < max_cpu_num; ++i) { + char path[256]; + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i); + FILE* fp = fopen(path, "rb"); + if (!fp) { break; - } - int core_num = cpu_info->compute_core_num_; - cache->resize(core_num); - if (argc == 1) { - int cache_size = va_arg(arg_ptr, int); - for (int i = 0; i < core_num; ++i) { - (*cache)[i] = cache_size; - } - } else { - int big_core_num = cpu_info->big_core_ids_.size(); - int little_core_num = cpu_info->little_core_ids_.size(); - int big_core_cache_size = va_arg(arg_ptr, int); - int little_core_cache_size = va_arg(arg_ptr, int); - for (int i = 0; i < big_core_num; ++i) { - (*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size; - } - for (int i = 0; i < little_core_num; ++i) { - (*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size; } + cpu_num++; + fclose(fp); } - va_end(arg_ptr); -} - -void set_arch_info(DeviceInfo* cpu_info, int argc, ...) { - va_list arg_ptr; - va_start(arg_ptr, argc); - int core_num = cpu_info->compute_core_num_; - cpu_info->archs_.resize(core_num); - if (argc == 1) { - ARMArch arch = (ARMArch)va_arg(arg_ptr, int); - for (int i = 0; i < core_num; ++i) { - cpu_info->archs_[i] = arch; - } - } else { - ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); - ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); - int big_core_num = cpu_info->big_core_ids_.size(); - int little_core_num = cpu_info->little_core_ids_.size(); - for (int i = 0; i < big_core_num; ++i) { - cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_core_arch; - } - for (int i = 0; i < little_core_num; ++i) { - cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_core_arch; - } + if (cpu_num < 1) { + cpu_num = 1; } - va_end(arg_ptr); -} - -bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) { - /* Snapdragon */ - if (hardware_name.find("SDM845") != std::string::npos) { // 845 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA75, kA55); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024); - set_cache_info(cpu_info, 2, 1, 2048 * 1024); - return true; - - } else if (hardware_name.find("SDM710") != std::string::npos) { // 710 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0}; - set_arch_info(cpu_info, 2, kA75, kA55); - return true; - } else if (hardware_name.find("MSM8998") != std::string::npos) { // 835 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA73, kA53); - set_cache_info(cpu_info, 0, 2, 64 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, - 
/*real cache size is 2M, while that will get bad performace - on conv3x3s1 or gemm, set to 1M or 512K*/ - 1024 * 1024); - return true; - - } else if (hardware_name.find("MSM8996") != std::string::npos) { // 820 - cpu_info->compute_core_num_ = 4; - cpu_info->core_ids_ = {0, 1, 2, 3}; - cpu_info->big_core_ids_ = {2, 3}; - cpu_info->little_core_ids_ = {0, 1}; - cpu_info->cluster_ids_ = {1, 1, 0, 0}; - set_arch_info(cpu_info, 1, kA72); - set_cache_info(cpu_info, 0, 1, 24 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); - return true; - - } else if (hardware_name.find("SDM660") != std::string::npos || - hardware_name.find("SDM636") != std::string::npos) { // 660, 636 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA73); - set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024); - set_cache_info(cpu_info, 1, 1, 1024 * 1024); - return true; - - } else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA72, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); - return true; - - } else if (hardware_name.find("MSM8953") != std::string::npos) { // 625 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->little_core_ids_ = {}; - cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 1, 1024 * 1024); - return true; - - } else if (hardware_name.find("MSM8939") != std::string::npos) { // 615 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {0, 1, 2, 3}; - cpu_info->little_core_ids_ = {4, 5, 6, 7}; - cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1}; - set_arch_info(cpu_info, 1, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024); - return true; - - /* MediaTek */ - - } else if (hardware_name.find("MT6797") != - std::string::npos) { // X20/X23/X25/X27 - cpu_info->compute_core_num_ = 10; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - cpu_info->big_core_ids_ = {8, 9}; - cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; - set_arch_info(cpu_info, 2, kA72, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); - return true; - - } else if (hardware_name.find("MT6799") != std::string::npos) { // X30 - cpu_info->compute_core_num_ = 10; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - cpu_info->big_core_ids_ = {8, 9}; - cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; - set_arch_info(cpu_info, 2, kA73, kA53); - return true; - - } else if (hardware_name.find("MT6795") != std::string::npos || - hardware_name.find("MT6762") != std::string::npos || - hardware_name.find("MT6755T") != std::string::npos || - hardware_name.find("MT6755S") != 
std::string::npos || - hardware_name.find("MT6753") != std::string::npos || - hardware_name.find("MT6752") != std::string::npos || - hardware_name.find("MT6750") != std::string::npos) { - // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->little_core_ids_ = {}; - cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - return true; - - } else if (hardware_name.find("MT6758") != std::string::npos || - hardware_name.find("MT6757") != std::string::npos || - hardware_name.find("MT6763") != std::string::npos || - hardware_name.find("MT6755M") != std::string::npos || - hardware_name.find("MT6755") != - std::string::npos) { // P30, P20/P25, P23, P10 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - return true; - - } else if (hardware_name.find("MT6771") != std::string::npos) { // P60 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA73, kA53); - return true; - - } else if (hardware_name.find("MT6765") != std::string::npos || - hardware_name.find("MT6739") != std::string::npos || - hardware_name.find("MT6738") != std::string::npos || - hardware_name.find("MT6737") != - std::string::npos) { // A22, MT6739, MT6738, MT6767 - cpu_info->compute_core_num_ = 4; - cpu_info->core_ids_ = {0, 1, 2, 3}; - cpu_info->big_core_ids_ = {0, 0, 0, 0}; - cpu_info->little_core_ids_ = {}; - cpu_info->cluster_ids_ = {0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - return true; + return cpu_num; +#elif defined(TARGET_IOS) + int cpu_num = 0; + size_t len = sizeof(cpu_num); + sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0); + if (cpu_num < 1) { + cpu_num = 1; } - return false; + return cpu_num; +#else + return 1; +#endif } -size_t arm_get_meminfo() { +size_t get_mem_size() { #ifdef LITE_WITH_LINUX // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/meminfo", "rb"); if (!fp) { return 1; } - size_t memsize = 0; char line[1024]; while (!feof(fp)) { @@ -327,57 +96,27 @@ size_t arm_get_meminfo() { } sscanf(s, "MemTotal: %d kB", &memsize); } - fclose(fp); - return memsize; #elif defined(TARGET_IOS) // to be implemented printf("not implemented\n"); - return 0; #endif + return 0; } -int arm_get_cpucount() { -#ifdef LITE_WITH_LINUX - // get cpu count from /sys/devices/system/cpu/cpunum/uevent - int max_cpu_count = 20; - int count = 0; - for (int i = 0; i < max_cpu_count; ++i) { - char path[256]; - snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i); - FILE* fp = fopen(path, "rb"); - if (!fp) { - break; - } - count++; - fclose(fp); - } - if (count < 1) { - count = 1; - } - return count; -#elif defined(TARGET_IOS) - int count = 0; - size_t len = sizeof(count); - sysctlbyname("hw.ncpu", &count, &len, NULL, 0); - if (count < 1) { - count = 1; +void get_cpu_arch(std::vector* archs, const int cpu_num) { + archs->resize(cpu_num); + for (int i = 0; i < cpu_num; ++i) { + archs->at(i) = kARMArch_UNKOWN; } - return count; -#else - return 1; -#endif -} - -void arm_get_cpu_arch(std::vector* archs) { #ifdef 
LITE_WITH_LINUX
-  archs->clear();
   //! get CPU ARCH
   FILE* fp = fopen("/proc/cpuinfo", "rb");
   if (!fp) {
     return;
   }
+  int cpu_idx = 0;
   char line[1024];
   while (!feof(fp)) {
     char* s = fgets(line, 1024, fp);
@@ -385,79 +124,80 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
       break;
     }
     if (strstr(line, "part") != NULL) {
+      ARMArch arch_type = kARMArch_UNKOWN;
       int arch_id = 0;
       sscanf(s, "CPU part\t: %x", &arch_id);
       switch (arch_id) {
         case 0xd03:
-          archs->push_back(kA53);
+          arch_type = kA53;
           break;
         case 0xd05:
-          archs->push_back(kA55);
+          arch_type = kA55;
           break;
         case 0xd07:
-          archs->push_back(kA57);
+          arch_type = kA57;
           break;
         case 0xd08:
-          archs->push_back(kA72);
+          arch_type = kA72;
           break;
         case 0xd09:
-          archs->push_back(kA73);
+          arch_type = kA73;
           break;
         case 0xd0a:
-          archs->push_back(kA75);
+          arch_type = kA75;
+          break;
+        case 0xd40:
+          arch_type = kA76;
+          break;
+        case 0x804:
+          // 855
+          arch_type = kA76;
+          break;
+        case 0x805:
+          // 855
+          arch_type = kA55;
+          break;
+        case 0x802:
+          // 845
+          arch_type = kA75;
+          break;
+        case 0x803:
+          // 845
+          arch_type = kA55;
+          break;
+        case 0x801:
+          // 835
+          arch_type = kA73;
           break;
         case 0x800:
           // 835
-          archs->push_back(kA73);
+          arch_type = kA73;
           break;
         case 0x205:
           // 820
-          archs->push_back(kA72);
+          arch_type = kA72;
           break;
         default:
-          LOG(ERROR) << "unknow type";
-          archs->push_back(kARMArch_UNKOWN);
+          LOG(ERROR) << "Unknown CPU arch: " << arch_id;
       }
+      archs->at(cpu_idx) = arch_type;
+      cpu_idx++;
     }
   }
   fclose(fp);
-  int cpu_count = arm_get_cpucount();
-  if (archs->size() < cpu_count) {
-    for (int i = archs->size(); i < cpu_count; ++i) {
-      archs->push_back(archs->at(i - 1));
-    }
+  for (; cpu_idx > 0 && cpu_idx < cpu_num; ++cpu_idx) {
+    archs->at(cpu_idx) = archs->at(cpu_idx - 1);
   }
-#endif
-#ifdef TARGET_IOS
-  int cpu_count = arm_get_cpucount();
-  for (int i = 0; i < cpu_count; ++i) {
-    archs->push_back(APPLE);
+#elif defined(TARGET_IOS)
+  for (int i = 0; i < cpu_num; ++i) {
+    archs->at(i) = APPLE;
   }
 #endif
 }
 
 #ifdef LITE_WITH_LINUX
-void set_default_cache(DeviceInfo* dev) {
-  int cpu_count = arm_get_cpucount();
-  dev->L1_cache_.resize(cpu_count);
-  dev->L2_cache_.resize(cpu_count);
-  dev->L3_cache_.resize(cpu_count);
-#ifdef TARGET_IOS
-  for (int i = 0; i < cpu_count; ++i) {
-    dev->L1_cache_[i] = 64 * 1024;
-    dev->L2_cache_[i] = 2048 * 1024;
-    dev->L3_cache_[i] = 0;
-  }
-#else
-  for (int i = 0; i < cpu_count; ++i) {
-    dev->L1_cache_[i] = 32 * 1024;
-    dev->L2_cache_[i] = 512 * 1024;
-    dev->L3_cache_[i] = 0;
-  }
-#endif
-}
-std::string arm_get_cpu_name() {
+std::string get_cpu_name() {
   FILE* fp = fopen("/proc/cpuinfo", "rb");
   if (!fp) {
     return "";
@@ -477,122 +217,163 @@ std::string arm_get_cpu_name() {
   return "";
 }
 
-int get_max_freq_khz(int cpuid) {
+void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
+  *max_freq = 0;
+  *min_freq = 0;
   // first try, for all possible cpu
   char path[256];
   snprintf(path, sizeof(path),
-           "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
-
+           "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id);
   FILE* fp = fopen(path, "rb");
-
   if (!fp) {
     // second try, for online cpu
     snprintf(path, sizeof(path),
              "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
-             cpuid);
+             cpu_id);
     fp = fopen(path, "rb");
-
     if (!fp) {
       // third try, for online cpu
+      // get max_freq
       snprintf(path, sizeof(path),
-               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
+               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
+               cpu_id);
       fp = fopen(path, "rb");
-
       if (!fp) {
-        return -1;
+        return;
       }
-
-      int max_freq_khz = -1;
-      fscanf(fp, "%d", &max_freq_khz);
-
+      fscanf(fp, "%d", max_freq);
       fclose(fp);
-
-      return max_freq_khz;
+      // get min_freq
+      snprintf(path, sizeof(path),
+               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
+               cpu_id);
+      fp = fopen(path, "rb");
+      if (!fp) {
+        return;
+      }
+      fscanf(fp, "%d", min_freq);
+      fclose(fp);
+      return;
     }
   }
-
-  int max_freq_khz = 0;
+  *min_freq = std::numeric_limits<int>::max();
   while (!feof(fp)) {
-    int freq_khz = 0;
-    int nscan = fscanf(fp, "%d %*d", &freq_khz);
+    int freq = 0;
+    int nscan = fscanf(fp, "%d %*d", &freq);
     if (nscan != 1) {
       break;
    }
-
-    if (freq_khz > max_freq_khz) {
-      max_freq_khz = freq_khz;
+    if (freq > *max_freq) {
+      *max_freq = freq;
+    }
+    if (freq < *min_freq) {
+      *min_freq = freq;
    }
  }
-
   fclose(fp);
-
-  return max_freq_khz;
 }
 
-int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
-                                    const std::vector<int>& cpu_freq,
-                                    std::vector<int>* cluster_ids) {
-  if (cpu_count == 0) {
-    return 0;
+void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
+                            std::vector<int>* cpu_ids,
+                            std::vector<int>* cluster_ids) {
+  int cpu_num = max_freqs.size();
+  if (cpu_num == 0) {
+    return;
   }
-
-  cpuids->resize(cpu_count);
-  cluster_ids->resize(cpu_count);
-
-  for (int i = 0; i < cpu_count; i++) {
-    cpuids->at(i) = i;
+  cpu_ids->resize(cpu_num);
+  cluster_ids->resize(cpu_num);
+  for (int i = 0; i < cpu_num; i++) {
+    cpu_ids->at(i) = i;
   }
-
   // sort cpuid as big core first
   // simple bubble sort
-
-  for (int i = 0; i < cpu_count; i++) {
-    for (int j = i + 1; j < cpu_count; j++) {
-      if (cpu_freq[i] < cpu_freq[j]) {
+  for (int i = 0; i < cpu_num; i++) {
+    for (int j = i + 1; j < cpu_num; j++) {
+      if (max_freqs[i] < max_freqs[j]) {
         // swap
-        int tmp = cpuids->at(i);
-        cpuids->at(i) = cpuids->at(j);
-        cpuids->at(j) = tmp;
+        int tmp = cpu_ids->at(i);
+        cpu_ids->at(i) = cpu_ids->at(j);
+        cpu_ids->at(j) = tmp;
       }
     }
   }
   // SMP
-  int mid_max_freq_khz =
-      (cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2;
+  int mid_max_freq =
+      (max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2;
 
-  for (int i = 0; i < cpu_count; i++) {
-    cpuids->at(i) = i;
-    if (cpu_freq[i] >= mid_max_freq_khz) {
+  for (int i = 0; i < cpu_num; i++) {
+    cpu_ids->at(i) = i;
+    if (max_freqs[i] >= mid_max_freq) {
       cluster_ids->at(i) = 0;
     } else {
       cluster_ids->at(i) = 1;
     }
   }
-  return 0;
 }
 
-int check_online(const std::vector<int>& core_ids) {
-  if (core_ids.size() == 0) {
-    return 0;
+void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size,
+                        int* l3_cache_size) {
+  int max_cache_idx_num = 10;
+  *l1_cache_size = DEFAULT_L1_CACHE_SIZE;
+  *l2_cache_size = DEFAULT_L2_CACHE_SIZE;
+  *l3_cache_size = DEFAULT_L3_CACHE_SIZE;
+  for (int i = 0; i < max_cache_idx_num; i++) {
+    char path[256];
+    snprintf(path, sizeof(path),
+             "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id, i);
+    FILE* fp = fopen(path, "rb");
+    if (fp) {
+      int level = -1;
+      fscanf(fp, "%d", &level);
+      fclose(fp);
+      snprintf(path, sizeof(path),
+               "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id, i);
+      fp = fopen(path, "rb");
+      if (fp) {
+        int size = -1;
+        fscanf(fp, "%d", &size);
+        fclose(fp);
+        if (size >= 0) {
+          if (level == 1) {
+            *l1_cache_size = size * 1024;
+          } else if (level == 2) {
+            *l2_cache_size = size * 1024;
+          } else if (level == 3) {
+            *l3_cache_size = size * 1024;
+          }
+        }
+      }
+    }
+  }
+}
+
+bool check_cpu_online(const std::vector<int>& cpu_ids) {
+  if (cpu_ids.size() == 0) {
+    return false;
   }
   char path[256];
-  int online = 1;
-  for (int i = 0; i < core_ids.size(); ++i) {
+  bool all_online = true;
+  for (int i = 0; i < cpu_ids.size(); ++i) {
     snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",
-             core_ids[i]);
+             cpu_ids[i]);
     FILE* fp = fopen(path, "rb");
-    if (!fp) {
-      return 0;
+    int is_online = 0;
+    if (fp) {
+      fscanf(fp, "%d", &is_online);
+      fclose(fp);
+    } else {
+      LOG(ERROR) << "Failed to query the online status of CPU id:"
+                 << cpu_ids[i];
+    }
+    if (is_online == 0) {
+      all_online = false;
+      LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offline";
     }
-    int cur_online = 0;
-    fscanf(fp, "%d", &cur_online);
-    online &= cur_online;
-    fclose(fp);
   }
-  return online;
+  return all_online;
 }
 
-int set_sched_affinity(const std::vector<int>& cpuids) {
+int set_sched_affinity(const std::vector<int>& cpu_ids) {
   // #define CPU_SETSIZE 1024
   // #define __NCPUBITS (8 * sizeof (unsigned long))
   // typedef struct
@@ -608,20 +389,571 @@ int set_sched_affinity(const std::vector<int>& cpuids) {
 #endif
   cpu_set_t mask;
   CPU_ZERO(&mask);
-  for (int i = 0; i < cpuids.size(); i++) {
-    CPU_SET(cpuids[i], &mask);
+  for (int i = 0; i < cpu_ids.size(); ++i) {
+    CPU_SET(cpu_ids[i], &mask);
   }
-
   int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
   if (syscallret) {
-    LOG(ERROR) << "syscall error " << syscallret;
     return -1;
   }
+  return 0;
+}
+
+bool bind_threads(const std::vector<int> cpu_ids) {
+#ifdef ARM_WITH_OMP
+  int thread_num = cpu_ids.size();
+  omp_set_num_threads(thread_num);
+  std::vector<int> ssarets;
+  for (int i = 0; i < thread_num; ++i) {
+    ssarets.push_back(0);
+  }
+#pragma omp parallel for
+  for (int i = 0; i < thread_num; i++) {
+    ssarets[i] = set_sched_affinity(cpu_ids);
+  }
+  for (int i = 0; i < thread_num; i++) {
+    if (ssarets[i] != 0) {
+      LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i];
+      return false;
+    }
+  }
+#else   // ARM_WITH_OMP
+  std::vector<int> first_cpu_id;
+  first_cpu_id.push_back(cpu_ids[0]);
+  int ssaret = set_sched_affinity(first_cpu_id);
+  if (ssaret != 0) {
+    LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0];
+    return false;
+  }
+#endif  // ARM_WITH_OMP
+  return true;
+}
+
+#endif  // LITE_WITH_LINUX
+
+// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
+void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) {
+  va_list arg_ptr;
+  va_start(arg_ptr, argc);
+  std::vector<int>* cache;
+  switch (cache_id) {
+    case 0:
+      cache = &L1_cache_;
+      break;
+    case 1:
+      cache = &L2_cache_;
+      break;
+    case 2:
+      cache = &L3_cache_;
+      break;
+    default:
+      break;
+  }
+  cache->resize(core_num_);
+  if (argc == 1) {
+    int cache_size = va_arg(arg_ptr, int);
+    for (int i = 0; i < core_num_; ++i) {
+      (*cache)[i] = cache_size;
+    }
+  } else {
+    int big_core_num = big_core_ids_.size();
+    int little_core_num = little_core_ids_.size();
+    int big_core_cache_size = va_arg(arg_ptr, int);
+    int little_core_cache_size = va_arg(arg_ptr, int);
+    for (int i = 0; i < big_core_num; ++i) {
+      (*cache)[big_core_ids_[i]] = big_core_cache_size;
+    }
+    for (int i = 0; i < little_core_num; ++i) {
+      (*cache)[little_core_ids_[i]] = little_core_cache_size;
+    }
+  }
+  va_end(arg_ptr);
+}
+
+void DeviceInfo::SetArchInfo(int argc, ...)
{ + va_list arg_ptr; + va_start(arg_ptr, argc); + archs_.resize(core_num_); + if (argc == 1) { + ARMArch arch = (ARMArch)va_arg(arg_ptr, int); + for (int i = 0; i < core_num_; ++i) { + archs_[i] = arch; + } + } else { + ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); + ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); + int big_core_num = big_core_ids_.size(); + int little_core_num = little_core_ids_.size(); + for (int i = 0; i < big_core_num; ++i) { + archs_[big_core_ids_[i]] = big_core_arch; + } + for (int i = 0; i < little_core_num; ++i) { + archs_[little_core_ids_[i]] = little_core_arch; + } + } + va_end(arg_ptr); +} + +bool DeviceInfo::SetCPUInfoByName() { + /* Snapdragon */ + if (dev_name_.find("SM8150") != std::string::npos) { // 855 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA76, kA55); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); + SetCacheInfo(2, 1, 2048 * 1024); + return true; + } else if (dev_name_.find("SDM845") != std::string::npos) { // 845 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA75, kA55); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); + SetCacheInfo(2, 1, 2048 * 1024); + return true; + } else if (dev_name_.find("SDM710") != std::string::npos) { // 710 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {6, 7}; + little_core_ids_ = {0, 1, 2, 3, 4, 5}; + cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0}; + SetArchInfo(2, kA75, kA55); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); + SetCacheInfo(2, 1, 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8998") != std::string::npos) { // 835 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA73, kA53); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, + /*real cache size is 2M, while that will get bad performace + on conv3x3s1 or gemm, set to 1M or 512K*/ + 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8996") != std::string::npos) { // 820 + core_num_ = 4; + core_ids_ = {0, 1, 2, 3}; + big_core_ids_ = {2, 3}; + little_core_ids_ = {0, 1}; + cluster_ids_ = {1, 1, 0, 0}; + SetArchInfo(1, kA72); + SetCacheInfo(0, 1, 24 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); + return true; + } else if (dev_name_.find("SDM660") != std::string::npos || + dev_name_.find("SDM636") != std::string::npos) { // 660, 636 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(1, kA73); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 1, 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8976") != std::string::npos) { // 652,653 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA72, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); + return true; + } else if (dev_name_.find("MSM8953") != std::string::npos) 
{ // 625 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + little_core_ids_ = {}; + cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; + SetArchInfo(1, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 1, 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8939") != std::string::npos) { // 615 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {0, 1, 2, 3}; + little_core_ids_ = {4, 5, 6, 7}; + cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1}; + SetArchInfo(1, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 2, 512 * 1024, 256 * 1024); + return true; + /* MediaTek */ + } else if (dev_name_.find("MT6797") != + std::string::npos) { // X20/X23/X25/X27 + core_num_ = 10; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + big_core_ids_ = {8, 9}; + little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; + SetArchInfo(2, kA72, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); + return true; + } else if (dev_name_.find("MT6799") != std::string::npos) { // X30 + core_num_ = 10; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + big_core_ids_ = {8, 9}; + little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; + SetArchInfo(2, kA73, kA53); + return true; + } else if (dev_name_.find("MT6795") != std::string::npos || + dev_name_.find("MT6762") != std::string::npos || + dev_name_.find("MT6755T") != std::string::npos || + dev_name_.find("MT6755S") != std::string::npos || + dev_name_.find("MT6753") != std::string::npos || + dev_name_.find("MT6752") != std::string::npos || + dev_name_.find("MT6750") != std::string::npos) { + // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + little_core_ids_ = {}; + cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; + SetArchInfo(1, kA53); + return true; + } else if (dev_name_.find("MT6758") != std::string::npos || + dev_name_.find("MT6757") != std::string::npos || + dev_name_.find("MT6763") != std::string::npos || + dev_name_.find("MT6755M") != std::string::npos || + dev_name_.find("MT6755") != + std::string::npos) { // P30, P20/P25, P23, P10 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(1, kA53); + return true; + } else if (dev_name_.find("MT6771") != std::string::npos) { // P60 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA73, kA53); + return true; + } else if (dev_name_.find("MT6765") != std::string::npos || + dev_name_.find("MT6739") != std::string::npos || + dev_name_.find("MT6738") != std::string::npos || + dev_name_.find("MT6737") != + std::string::npos) { // A22, MT6739, MT6738, MT6767 + core_num_ = 4; + core_ids_ = {0, 1, 2, 3}; + big_core_ids_ = {0, 1, 2, 3}; + little_core_ids_ = {}; + cluster_ids_ = {0, 0, 0, 0}; + SetArchInfo(1, kA53); + return true; + } + return false; +} + +void DeviceInfo::SetCPUInfoByProb() { +#ifdef LITE_WITH_LINUX + // get big.LITTLE cores by sorting CPU frequency + sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_); + big_core_ids_.clear(); + little_core_ids_.clear(); + for (int i = 0; i < cluster_ids_.size(); ++i) { + if (cluster_ids_[i] == 0) { + 
big_core_ids_.push_back(core_ids_[i]);
+    } else {
+      little_core_ids_.push_back(core_ids_[i]);
+    }
+  }
+  // get l1, l2, l3 cache size for each core
+  for (int i = 0; i < core_num_; i++) {
+    get_cpu_cache_size(i, &(L1_cache_[i]), &(L2_cache_[i]), &(L3_cache_[i]));
+  }
+#endif  // LITE_WITH_LINUX
+}
+
+void DeviceInfo::RequestPowerFullMode(const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  active_ids_.clear();
+  for (int i = 0; i < thread_num; ++i) {
+    if (i < big_core_size) {
+      active_ids_.push_back(big_core_ids_[i]);
+    } else if (i < big_core_size + little_core_size) {
+      active_ids_.push_back(little_core_ids_[i - big_core_size]);
+    }
+  }
+  mode_ = LITE_POWER_FULL;
+}
+
+void DeviceInfo::RequestPowerHighMode(const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  active_ids_.clear();
+  if (big_core_size > 0) {
+    mode_ = LITE_POWER_HIGH;
+    if (thread_num > big_core_size) {
+      LOG(ERROR) << "Request thread num: " << thread_num
+                 << ", exceed the big cores size: " << big_core_size
+                 << ", truncate thread num to " << big_core_size;
+      active_ids_ = big_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(big_core_ids_[i]);
+      }
+    }
+  } else {
+    mode_ = LITE_POWER_LOW;
+    LOG(ERROR) << "HIGH POWER MODE is not supported, switch to little cores.";
+    if (thread_num > little_core_size) {
+      active_ids_ = little_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(little_core_ids_[i]);
+      }
+    }
+  }
+}
+
+void DeviceInfo::RequestPowerLowMode(const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  active_ids_.clear();
+  if (little_core_size > 0) {
+    mode_ = LITE_POWER_LOW;
+    if (thread_num > little_core_size) {
+      LOG(WARNING) << "Request thread num: " << thread_num
+                   << ", exceed the little cores size: " << little_core_size
+                   << ", truncate thread num to " << little_core_size;
+      active_ids_ = little_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; i++) {
+        active_ids_.push_back(little_core_ids_[i]);
+      }
+    }
+  } else {
+    mode_ = LITE_POWER_HIGH;
+    LOG(WARNING) << "LOW POWER MODE is not supported, switch to big cores";
+    if (thread_num > big_core_size) {
+      active_ids_ = big_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; i++) {
+        active_ids_.push_back(big_core_ids_[i]);
+      }
+    }
+  }
+}
+
+void DeviceInfo::RequestPowerNoBindMode(const int thread_num) {
+  active_ids_.clear();
+  for (int i = 0; i < thread_num; i++) {
+    active_ids_.push_back(0);
+  }
+  mode_ = LITE_POWER_NO_BIND;
+}
+
+void DeviceInfo::RequestPowerRandHighMode(const int shift_num,
+                                          const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  active_ids_.clear();
+  if (big_core_size > 0) {
+    mode_ = LITE_POWER_RAND_HIGH;
+    if (thread_num > big_core_size) {
+      LOG(WARNING) << "Request thread num: " << thread_num
+                   << ", exceed the big cores size: " << big_core_size
+                   << ", truncate thread num to " << big_core_size;
+      active_ids_ = big_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]);
+      }
+    }
+  } else {
+    mode_ = LITE_POWER_LOW;
+    LOG(WARNING) << "HIGH POWER MODE is not supported, switch to little cores.";
+    if (thread_num > little_core_size) {
+      active_ids_ = little_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(little_core_ids_[i]);
+      }
+    }
+  }
+}
+
+void DeviceInfo::RequestPowerRandLowMode(const int shift_num,
+                                         const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  active_ids_.clear();
+  if (little_core_size > 0) {
+    mode_ = LITE_POWER_RAND_LOW;
+    if (thread_num > little_core_size) {
+      LOG(WARNING) << "Request thread num: " << thread_num
+                   << ", exceed the little cores size: " << little_core_size
+                   << ", truncate thread num to " << little_core_size;
+      active_ids_ = little_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(
+            little_core_ids_[(i + shift_num) % little_core_size]);
+      }
+    }
+  } else {
+    mode_ = LITE_POWER_HIGH;
+    LOG(WARNING) << "LOW POWER MODE is not supported, switch to big cores.";
+    if (thread_num > big_core_size) {
+      active_ids_ = big_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(big_core_ids_[i]);
+      }
+    }
+  }
+}
+int DeviceInfo::Setup() {
+  core_num_ = get_cpu_num();
+  mem_size_ = get_mem_size();
+  get_cpu_arch(&archs_, core_num_);
+  // set default CPU info
+  SetCacheInfo(0, 1, DEFAULT_L1_CACHE_SIZE);
+  SetCacheInfo(1, 1, DEFAULT_L2_CACHE_SIZE);
+  SetCacheInfo(2, 1, DEFAULT_L3_CACHE_SIZE);
+#ifdef LITE_WITH_LINUX
+  // get max&min freq
+  max_freqs_.resize(core_num_);
+  min_freqs_.resize(core_num_);
+  for (int i = 0; i < core_num_; ++i) {
+    int max_freq, min_freq;
+    get_cpu_max_min_freq(i, &max_freq, &min_freq);
+    max_freqs_[i] = max_freq / 1000;
+    min_freqs_[i] = min_freq / 1000;
+  }
+  // get cache size and big.LITTLE core ids
+  dev_name_ = get_cpu_name();
+  if (!SetCPUInfoByName()) {
+    SetCPUInfoByProb();
+  }
+  // output info
+  LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
+  LOG(INFO) << "ARM multiprocessors number: " << core_num_;
+  for (int i = 0; i < core_num_; ++i) {
+    LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
+              << ", max freq: " << max_freqs_[i]
+              << ", min freq: " << min_freqs_[i]
+              << ", cluster ID: " << cluster_ids_[core_ids_[i]]
+              << ", CPU ARCH: A" << archs_[i];
+  }
+  LOG(INFO) << "L1 DataCache size is: ";
+  for (int i = 0; i < core_num_; ++i) {
+    LOG(INFO) << L1_cache_[i] / 1024 << " KB";
+  }
+  LOG(INFO) << "L2 Cache size is: ";
+  for (int i = 0; i < core_num_; ++i) {
+    LOG(INFO) << L2_cache_[i] / 1024 << " KB";
+  }
+  LOG(INFO) << "Total memory: " << mem_size_ << "KB";
+#endif
+  // set default run mode
+  SetRunMode(LITE_POWER_NO_BIND, 1);  // use single thread by default
+  return 0;
+}
+void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
+#ifdef ARM_WITH_OMP
+  thread_num = std::min(thread_num, core_num_);
+#else
+  thread_num = 1;  // force thread_num to 1 if OpenMP is disabled
+#endif
+#ifdef LITE_WITH_LINUX
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  int big_little_core_size = big_core_size + little_core_size;
+  thread_num = std::min(thread_num, big_little_core_size);
+  count_++;
+  int shift_num = (count_ / 10) % big_core_size;
+  switch (mode) {
+    case LITE_POWER_FULL:
+      RequestPowerFullMode(thread_num);
+      break;
+    case LITE_POWER_HIGH:
+      RequestPowerHighMode(thread_num);
+      break;
+    case LITE_POWER_LOW:
+      RequestPowerLowMode(thread_num);
+      break;
+    case LITE_POWER_NO_BIND:
+      RequestPowerNoBindMode(thread_num);
+      break;
+    case LITE_POWER_RAND_HIGH:
+      RequestPowerRandHighMode(shift_num, thread_num);
+      break;
+    case LITE_POWER_RAND_LOW:
+      RequestPowerRandLowMode(shift_num, thread_num);
+      break;
+    default:
+      LOG(FATAL) <<
"Unsupported power mode: " << mode; + break; + } + if (active_ids_.size() == 0) { + active_ids_.push_back(0); + } +#ifdef ARM_WITH_OMP + omp_set_num_threads(active_ids_.size()); +#endif + if (mode_ != LITE_POWER_NO_BIND) { + if (check_cpu_online(active_ids_)) { + bind_threads(active_ids_); + } else { + LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE"; + mode_ = LITE_POWER_NO_BIND; + } + } +#else // LITE_WITH_LINUX + // only LITE_POWER_NO_BIND is supported in other OS + RequestPowerNoBindMode(thread_num); +#ifdef ARM_WITH_OMP + omp_set_num_threads(active_ids_.size()); +#endif #endif // LITE_WITH_LINUX + //! alloc memory for sgemm in this context + workspace_.Resize( + {static_cast(L2_cache_[active_ids_[0]] / sizeof(float))}); + arch_ = archs_[active_ids_[0]]; +} + +void DeviceInfo::SetCache(int l1size, int l2size, int l3size) { + SetCacheInfo(0, 1, l1size); + SetCacheInfo(1, 1, l2size); + SetCacheInfo(2, 1, l3size); + workspace_.Resize({2 * (l1size + l2size)}); +} + +bool DeviceInfo::ExtendWorkspace(DDimLite dims) { + auto count = dims.product(); + auto old = workspace_.dims(); + if (count == old.product()) { + return false; + } + workspace_.Resize({static_cast( + count + L2_cache_[active_ids_[0]] / sizeof(float))}); + return true; +} #endif // LITE_WITH_ARM diff --git a/paddle/fluid/lite/core/cpu_info.h b/paddle/fluid/lite/core/cpu_info.h index 385954e6d8e..0cf44201e79 100644 --- a/paddle/fluid/lite/core/cpu_info.h +++ b/paddle/fluid/lite/core/cpu_info.h @@ -14,24 +14,12 @@ #pragma once +#include #include #include +#include "paddle/fluid/lite/core/lite_tensor.h" #include "paddle/fluid/lite/utils/cp_logging.h" -#ifdef LITE_WITH_LINUX -#include -#include -#endif - -#if __APPLE__ -#include "TargetConditionals.h" -#if TARGET_OS_IPHONE -#include -#include -#include -#endif // TARGET_OS_IPHONE -#endif // __APPLE__ - namespace paddle { namespace lite { @@ -60,64 +48,73 @@ typedef enum { class DeviceInfo { public: - int idx_; - int max_freq_; - int min_freq_; - int generate_arch_; - int compute_core_num_; - int max_memory_; - int sharemem_size_; - - std::string device_name_; - std::string compute_ability_; - - std::vector L1_cache_; - std::vector L2_cache_; - std::vector L3_cache_; - std::vector core_ids_; - std::vector big_core_ids_; - std::vector little_core_ids_; - std::vector cluster_ids_; - std::vector archs_; - static DeviceInfo& Global() { static auto* x = new DeviceInfo; return *x; } - static void Init() { - auto& info = Global(); - InitInternal(&info); + static int Init() { + static int ret = Global().Setup(); + return ret; } - private: - DeviceInfo() = default; - static void InitInternal(DeviceInfo* dev); -}; + int Setup(); -size_t arm_get_meminfo(); + void SetRunMode(PowerMode mode, int thread_num); + void SetCache(int l1size, int l2size, int l3size); + void SetArch(ARMArch arch) { arch_ = arch; } -int arm_get_cpucount(); + PowerMode mode() const { return mode_; } + int threads() const { return active_ids_.size(); } + ARMArch arch() const { return arch_; } + int l1_cache_size() const { return L1_cache_[active_ids_[0]]; } + int l2_cache_size() const { return L2_cache_[active_ids_[0]]; } + int l3_cache_size() const { return L3_cache_[active_ids_[0]]; } -void arm_get_cpu_arch(std::vector* archs); - -bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name); - -#ifdef LITE_WITH_LINUX - -void set_default_cache(DeviceInfo* dev); + template + T* workspace_data() { + return reinterpret_cast(workspace_.mutable_data()); + } + bool ExtendWorkspace(DDimLite dims); 
-std::string arm_get_cpu_name(); + private: + int core_num_; + std::vector max_freqs_; + std::vector min_freqs_; + int mem_size_; + std::string dev_name_; -int get_max_freq_khz(int cpuid); + std::vector L1_cache_; + std::vector L2_cache_; + std::vector L3_cache_; + std::vector core_ids_; + std::vector big_core_ids_; + std::vector little_core_ids_; + std::vector cluster_ids_; + std::vector archs_; -int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector* cpuids, - const std::vector& cpu_freq, - std::vector* cluster_ids); -int check_online(const std::vector& core_ids); -int set_sched_affinity(const std::vector& cpuids); + ARMArch arch_; + // LITE_POWER_HIGH stands for using big cores, + // LITE_POWER_LOW stands for using small core, + // LITE_POWER_FULL stands for using all cores + PowerMode mode_; + std::vector active_ids_; + TensorLite workspace_; + int64_t count_{0}; + + void SetCacheInfo(int cache_id, int argc, ...); + void SetArchInfo(int argc, ...); + bool SetCPUInfoByName(); + void SetCPUInfoByProb(); + void RequestPowerFullMode(const int thread_num); + void RequestPowerHighMode(const int thread_num); + void RequestPowerLowMode(const int thread_num); + void RequestPowerNoBindMode(const int thread_num); + void RequestPowerRandHighMode(const int shift_num, const int thread_num); + void RequestPowerRandLowMode(const int shift_num, const int thread_num); -#endif // LITE_WITH_LINUX + DeviceInfo() = default; +}; #endif // LITE_WITH_ARM diff --git a/paddle/fluid/lite/core/hvy_tensor.h b/paddle/fluid/lite/core/hvy_tensor.h index 16172a80035..a53bae36bdb 100644 --- a/paddle/fluid/lite/core/hvy_tensor.h +++ b/paddle/fluid/lite/core/hvy_tensor.h @@ -86,6 +86,7 @@ class TensorHvy : public TensorBase { template T* mutable_data() { + memory_size_ = framework::product(data_.dims()) * sizeof(T); return data_.mutable_data(data_.dims(), platform::CPUPlace()); } template @@ -107,8 +108,11 @@ class TensorHvy : public TensorBase { data_.Resize(framework::make_ddim(dims.Vectorize())); } + void Resize(const std::vector& x) { Resize(DDimHvy(x)); } + void ShareDataWith(const TensorHvy& other) { data_.ShareDataWith(other.data_); + data_.set_lod(other.data_.lod()); } void ShareDataWith(const framework::Tensor& other) { data_.ShareDataWith(other); @@ -126,8 +130,11 @@ class TensorHvy : public TensorBase { const framework::LoDTensor& raw_tensor() const { return data_; } framework::LoDTensor& raw_tensor() { return data_; } + size_t memory_size() const { return memory_size_; } + private: framework::LoDTensor data_; + size_t memory_size_{}; }; } // namespace lite diff --git a/paddle/fluid/lite/core/kernel.cc b/paddle/fluid/lite/core/kernel.cc index 44b00f53d01..0dae1394290 100644 --- a/paddle/fluid/lite/core/kernel.cc +++ b/paddle/fluid/lite/core/kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
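With the cpu_info.h refactor above, all CPU bookkeeping lives behind the DeviceInfo singleton instead of the old free functions. A short usage sketch built only from the methods visible in that header (the wrapper function itself is illustrative):

```cpp
#include "paddle/fluid/lite/core/cpu_info.h"

// Illustrative client of the new interface: detect the SoC once, pin two
// worker threads to the big cluster, then size buffers from the per-core
// cache info that Setup() filled in.
void ConfigureArmRuntime() {
  using namespace paddle::lite;
  DeviceInfo::Init();  // runs Setup() exactly once
  auto& dev = DeviceInfo::Global();
  dev.SetRunMode(LITE_POWER_HIGH, 2);  // falls back if no big cores exist
  int l2_bytes = dev.l2_cache_size();  // cache of the first active core
  float* scratch = dev.workspace_data<float>();
  (void)l2_bytes;
  (void)scratch;
}
```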
#include "paddle/fluid/lite/core/kernel.h" +#include namespace paddle { namespace lite { @@ -49,6 +50,36 @@ std::string KernelBase::GenParamTypeKey() const { return ss.str(); } +void KernelBase::ParseKernelType(const std::string &kernel_type, + std::string *op_type, std::string *alias, + Place *place) { + std::stringstream ss(kernel_type); + std::getline(ss, *op_type, '/'); + std::getline(ss, *alias, '/'); + std::string target, precision, layout; + std::getline(ss, target, '/'); + std::getline(ss, precision, '/'); + std::getline(ss, layout, '/'); + + place->target = static_cast(std::atoi(target.c_str())); + place->precision = static_cast(std::atoi(precision.c_str())); + place->layout = static_cast(std::atoi(layout.c_str())); +} + +std::string KernelBase::SerializeKernelType(const std::string &op_type, + const std::string &alias, + const Place &place) { + std::stringstream ss; + ss << op_type << "/"; + ss << alias << "/"; + // We serialize the place value not the string representation here for + // easier deserialization. + ss << static_cast(place.target) << "/"; + ss << static_cast(place.precision) << "/"; + ss << static_cast(place.layout); + return ss.str(); +} + bool ParamTypeRegistry::KeyCmp::operator()( const ParamTypeRegistry::key_t &a, const ParamTypeRegistry::key_t &b) const { diff --git a/paddle/fluid/lite/core/kernel.h b/paddle/fluid/lite/core/kernel.h index d7b296eec12..0ef46b65870 100644 --- a/paddle/fluid/lite/core/kernel.h +++ b/paddle/fluid/lite/core/kernel.h @@ -118,33 +118,11 @@ class KernelBase { static std::string SerializeKernelType(const std::string& op_type, const std::string& alias, - const Place& place) { - std::stringstream ss; - ss << op_type << "/"; - ss << alias << "/"; - // We serialize the place value not the string representation here for - // easier deserialization. 
- ss << static_cast(place.target) << "/"; - ss << static_cast(place.precision) << "/"; - ss << static_cast(place.layout); - return ss.str(); - } + const Place& place); static void ParseKernelType(const std::string& kernel_type, std::string* op_type, std::string* alias, - Place* place) { - std::stringstream ss(kernel_type); - std::getline(ss, *op_type, '/'); - std::getline(ss, *alias, '/'); - std::string target, precision, layout; - std::getline(ss, target, '/'); - std::getline(ss, precision, '/'); - std::getline(ss, layout, '/'); - - place->target = static_cast(std::stoi(target)); - place->precision = static_cast(std::stoi(precision)); - place->layout = static_cast(std::stoi(layout)); - } + Place* place); virtual ~KernelBase() = default; void Torch() {} diff --git a/paddle/fluid/lite/core/lite_tensor.h b/paddle/fluid/lite/core/lite_tensor.h index 6cccdc0dd03..abdc876e1e0 100644 --- a/paddle/fluid/lite/core/lite_tensor.h +++ b/paddle/fluid/lite/core/lite_tensor.h @@ -47,6 +47,22 @@ class DDimLite : public DDimBase { std::multiplies()); } const std::vector &data() const { return data_; } + value_type count(int start, int end) const { + if (start < 0) { + start = 0; + } + if (end > size()) { + end = size(); + } + if (end < start) { + end = start; + } + value_type sum = 1; + for (auto i = start; i < end; ++i) { + sum *= data_[i]; + } + return sum; + } private: std::vector data_; @@ -90,6 +106,8 @@ class TensorLite : public TensorBase { void *mutable_data(size_t memory_size); void *mutable_data(TargetType target, size_t memory_size); + const void *raw_data() const { return buffer_->data(); } + size_t memory_size() const { return memory_size_; } bool IsInitialized() const { return buffer_->data(); } diff --git a/paddle/fluid/lite/core/memory.h b/paddle/fluid/lite/core/memory.h index 5948f6c4a85..6b019abc19d 100644 --- a/paddle/fluid/lite/core/memory.h +++ b/paddle/fluid/lite/core/memory.h @@ -65,6 +65,8 @@ class Buffer { TargetCopy(target_, data_, other.data_, nbytes); } + ~Buffer() { Free(); } + private: // memory it actually malloced. 
size_t space_{0}; diff --git a/paddle/fluid/lite/core/mir/CMakeLists.txt b/paddle/fluid/lite/core/mir/CMakeLists.txt index 322981c5827..93b5a3875a1 100644 --- a/paddle/fluid/lite/core/mir/CMakeLists.txt +++ b/paddle/fluid/lite/core/mir/CMakeLists.txt @@ -1,38 +1,41 @@ -cc_library(mir_node SRCS node.cc DEPS framework_proto_lite) -cc_library(mir_ssa_graph SRCS ssa_graph.cc DEPS mir_node) -cc_library(mir_pass SRCS pass.cc DEPS mir_ssa_graph) -cc_library(mir_pass_manager SRCS pass_manager.cc DEPS mir_pass mir_ssa_graph mir_passes) -cc_library(mir_pass_registry SRCS pass_registry.cc DEPS mir_pass_manager) +lite_cc_library(mir_node SRCS node.cc DEPS framework_proto_lite) +lite_cc_library(mir_ssa_graph SRCS ssa_graph.cc DEPS mir_node program_lite) +lite_cc_library(mir_pass SRCS pass.cc DEPS mir_ssa_graph) +lite_cc_library(mir_pass_manager SRCS pass_manager.cc DEPS mir_pass mir_ssa_graph mir_passes) +lite_cc_library(mir_pass_registry SRCS pass_registry.cc DEPS mir_pass_manager) add_subdirectory(fusion) -cc_library(mir_passes - SRCS fc_fuse_pass.cc - conv_elementwise_add_relu_fuse_pass.cc - conv_bn_fuse_pass.cc - static_kernel_pick_pass.cc - variable_place_inference_pass.cc - type_target_transform_pass.cc - io_copy_kernel_pick_pass.cc - graph_visualize_pass.cc - generate_program_pass.cc - argument_type_display_pass.cc - demo_pass.cc - runtime_context_assign_pass.cc - DEPS mir_pass types_lite context_lite ${mir_fusers}) +add_subdirectory(elimination) -# for mobile, unnecessary to compile the following testings. -if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - return() -endif() -cc_test(test_mir_pass_manager SRCS pass_manager_test.cc DEPS mir_pass_manager mir_passes) -cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS - mir_ssa_graph scope_lite op_lite - fc_op_lite - ${host_kernels} - mir_passes - mir_pass_manager - program_fake_utils - ) +lite_cc_library(mir_passes + SRCS + fusion/fc_fuse_pass.cc + fusion/conv_elementwise_add_activation_fuse_pass.cc + fusion/conv_bn_fuse_pass.cc + fusion/elementwise_add_activation_fuse_pass.cc + fusion/quant_dequant_fuse_pass.cc + elimination/identity_scale_eliminate_pass.cc + static_kernel_pick_pass.cc + variable_place_inference_pass.cc + type_target_cast_pass.cc + type_precision_cast_pass.cc + io_copy_kernel_pick_pass.cc + graph_visualize_pass.cc + generate_program_pass.cc + argument_type_display_pass.cc + #trans_weigths_pass.cc + demo_pass.cc + runtime_context_assign_pass.cc + DEPS mir_pass types_lite context_lite ${mir_fusers}) + +# lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS + #mir_ssa_graph scope_lite op_lite + #fc_op_lite + #${host_kernels} + #mir_passes + #mir_pass_manager + #program_fake_utils + #) # lite_cc_test(test_variable_place_infrence_pass SRCS variable_place_inference_pass_test.cc # DEPS # mul_op_lite @@ -51,13 +54,23 @@ cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS # X86_DEPS mul_compute_x86 # ) - -lite_cc_library(pattern_matcher_lite SRCS pattern_matcher.cc DEPS mir_node mir_ssa_graph op_lite) +set(pattern_deps mir_node mir_ssa_graph op_lite) +if (WITH_TESTING) + list(APPEND pattern_deps gtest) +endif() +lite_cc_library(pattern_matcher_lite SRCS pattern_matcher.cc DEPS ${pattern_deps}) lite_cc_test(test_pattern_matcher_lite SRCS pattern_matcher_test.cc DEPS pattern_matcher_lite) lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher_lite) +# for mobile, unnecessary to compile the following testings. 
+if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + return() +endif() +lite_cc_test(test_mir_pass_manager SRCS pass_manager_test.cc DEPS mir_pass_manager mir_passes) + + # TODO(wz) replace framework/proto to lite proto. if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) # it depends on the fluid/framework/proto, that is too heavy for mobile execution. @@ -70,16 +83,21 @@ message(STATUS "----> Ops lite: ${ops_lite}") message(STATUS "----> Host kernels: ${host_kernels}") message(STATUS "----> X86 kernels: ${x86_kernels}") -lite_cc_test(test_lite_fc_fuse SRCS fc_fuse_pass_test.cc +lite_cc_test(test_lite_fc_fuse SRCS fusion/fc_fuse_pass_test.cc DEPS cxx_api_lite mir_passes - ${ops_lite} ${host_kernels} ${x86_kernels} + ${ops_lite} ${host_kernels} ${x86_kernels} ${arm_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_fc_model --optimized_model=${LITE_MODEL_DIR}/lite_fc_model_opt SERIAL) lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_fc_model.tar.gz") add_dependencies(test_lite_fc_fuse extern_lite_download_lite_fc_model_tar_gz) -lite_cc_test(test_lite_conv_elementwise_add_relu_fuse - SRCS conv_elementwise_add_relu_fuse_pass_test.cc + +lite_cc_test(test_lite_conv_elementwise_add_activation_fuse + SRCS fusion/conv_elementwise_add_activation_fuse_pass_test.cc + DEPS cxx_api_lite mir_passes + ${ops_lite} ${host_kernels} ${x86_kernels}) +lite_cc_test(test_lite_elementwise_add_activation_fuse + SRCS fusion/elementwise_add_activation_fuse_pass_test.cc DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${x86_kernels}) diff --git a/paddle/fluid/lite/core/mir/elimination/CMakeLists.txt b/paddle/fluid/lite/core/mir/elimination/CMakeLists.txt new file mode 100644 index 00000000000..9fda8ec29a4 --- /dev/null +++ b/paddle/fluid/lite/core/mir/elimination/CMakeLists.txt @@ -0,0 +1,7 @@ +if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) + lite_cc_test(test_identity_scale_eliminate_pass_lite + SRCS identity_scale_eliminate_pass_test.cc + DEPS mir_passes program_lite proto_desc cpp_op_desc_lite + ${ops_lite} + ) +endif() diff --git a/paddle/fluid/lite/core/mir/elimination/identity_scale_eliminate_pass.cc b/paddle/fluid/lite/core/mir/elimination/identity_scale_eliminate_pass.cc new file mode 100644 index 00000000000..6f8aeb65c05 --- /dev/null +++ b/paddle/fluid/lite/core/mir/elimination/identity_scale_eliminate_pass.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
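For reference, the kernel-type key introduced in kernel.{h,cc} earlier in this patch round-trips through a plain "op/alias/target/precision/layout" string. A minimal usage sketch (the concrete Place shown is illustrative only; enum fields are written as integers, which is what makes parsing trivial):

    std::string op_type, alias;
    Place place;
    std::string key = KernelBase::SerializeKernelType(
        "fc", "def", Place{TARGET(kHost), PRECISION(kFloat)});
    // key looks like "fc/def/<target>/<precision>/<layout>", numeric values.
    KernelBase::ParseKernelType(key, &op_type, &alias, &place);
    // op_type == "fc", alias == "def"; place is rebuilt from the integers.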
+ +#include "paddle/fluid/lite/core/mir/pass.h" +#include "paddle/fluid/lite/core/mir/pass_registry.h" +#include "paddle/fluid/lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace { + +class Eliminator : public FuseBase { + public: + void BuildPattern() override { + auto* pre_op = OpNode("preop"); // the previous op's output need update + // TODO(Superjomn) check has only one output + auto* x = VarNode("x")->assert_is_op_input("scale", "X"); + auto* scale_op = OpNode("scale", "scale") + ->assert_op_attr("scale", 1.) + ->assert_op_attr("bias", 0.); + auto* out = VarNode("out")->assert_is_op_output("scale", "Out"); + + *pre_op >> *x >> *scale_op >> *out; + + // The pre_op will be eliminated, and a new output-updated op will insert. + x->AsIntermediate(); // x is pre_op's output, need to update + } + + private: + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + auto& pre_op = matched.at("preop")->AsStmt(); + auto op_info = *pre_op.op_info(); + + op_info.UpdateAllOutputs(matched.at("x")->AsArg().name, + matched.at("out")->AsArg().name); + pre_op.ResetOp(op_info, graph->valid_places()); + + GraphSafeRemoveNodes(graph, {matched.at("scale")}); + + IR_NODE_LINK_TO(matched.at("preop"), matched.at("out")); + } +}; + +} // namespace + +class IdentityScaleEliminatePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + Eliminator eliminator; + eliminator(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(identity_scale_eliminate_pass, + paddle::lite::mir::IdentityScaleEliminatePass); diff --git a/paddle/fluid/lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc b/paddle/fluid/lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc new file mode 100644 index 00000000000..89db35fe0e8 --- /dev/null +++ b/paddle/fluid/lite/core/mir/elimination/identity_scale_eliminate_pass_test.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
+#include "paddle/fluid/lite/core/mir/pass_registry.h"
+#include "paddle/fluid/lite/core/mir/ssa_graph.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
+                                     const std::shared_ptr<Scope>& scope,
+                                     const std::vector<Place>& valid_places) {
+  // Op list:
+  //   (x)->feed -> (feed) -> scale -> (scale_out) -> fetch->(fetch)
+  // After pass
+  //   (x)->feed->(scale_out)->fetch->(fetch)
+  auto* main_block = program_desc->MutableBlock(0);
+  auto* feed_op = main_block->AppendOp();
+  auto* scale_op = main_block->AppendOp();
+  auto* fetch_op = main_block->AppendOp();
+  main_block->Var("x");
+  main_block->Var("feed");
+  main_block->Var("scale_out");
+  main_block->Var("fetch_out");
+
+  scope->Var("x")->GetMutable<lite::Tensor>();
+  scope->Var("feed")->GetMutable<lite::Tensor>();
+  scope->Var("scale_out")->GetMutable<lite::Tensor>();
+  scope->Var("fetch_out")->GetMutable<lite::Tensor>();
+
+  feed_op->SetType("feed");
+  feed_op->SetInput("X", {"x"});
+  feed_op->SetAttr("col", 1);
+  feed_op->SetOutput("Out", {"feed"});
+
+  scale_op->SetType("scale");
+  scale_op->SetInput("X", {"feed"});
+  scale_op->SetOutput("Out", {"scale_out"});
+  scale_op->SetAttr("scale", 1.f);
+  scale_op->SetAttr("bias", 0.f);
+  scale_op->SetAttr("bias_after_scale", true);
+
+  fetch_op->SetType("fetch");
+  fetch_op->SetInput("X", {"scale_out"});
+  fetch_op->SetOutput("Out", {"fetch"});
+  fetch_op->SetAttr("col", 1);
+
+  program_desc->Flush();
+
+  lite::Program program(*program_desc->Proto(), scope, valid_places);
+  auto graph = std::unique_ptr<SSAGraph>(new SSAGraph());
+  graph->Build(program, valid_places);
+
+  LOG(INFO) << Visualize(graph.get());
+
+  return graph;
+}
+
+TEST(identity_test, test) {
+  framework::ProgramDesc program_desc;
+  std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
+  auto scope = std::make_shared<Scope>();
+  auto graph = BuildGraph(&program_desc, scope, places);
+  const int num_nodes = graph->nodes().size();
+  auto pass = PassManager::Global().LookUp("identity_scale_eliminate_pass");
+  ASSERT_TRUE(pass);
+  pass->Apply(graph);
+  ASSERT_EQ(graph->nodes().size(), num_nodes - 2UL);
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_OP(feed)
+USE_LITE_OP(fetch)
+USE_LITE_OP(scale)
+USE_MIR_PASS(identity_scale_eliminate_pass)
diff --git a/paddle/fluid/lite/core/mir/fusion/CMakeLists.txt b/paddle/fluid/lite/core/mir/fusion/CMakeLists.txt
index 1aecfdaed02..818c503e904 100644
--- a/paddle/fluid/lite/core/mir/fusion/CMakeLists.txt
+++ b/paddle/fluid/lite/core/mir/fusion/CMakeLists.txt
@@ -1,18 +1,30 @@
-cc_library(fuse_fc
+lite_cc_library(fuse_fc
   SRCS fc_fuser.cc
   DEPS pattern_matcher_high_api)
-cc_library(fuse_conv_elementwise_add_relu
-  SRCS conv_elementwise_add_relu_fuser.cc
+lite_cc_library(fuse_conv_elementwise_add_activation
+  SRCS conv_elementwise_add_activation_fuser.cc
   DEPS pattern_matcher_high_api)
-cc_library(fuse_conv_bn
+lite_cc_library(fuse_conv_bn
   SRCS conv_bn_fuser.cc
   DEPS pattern_matcher_high_api)
+lite_cc_library(fuse_elementwise_add_activation
+  SRCS elementwise_add_activation_fuser.cc
+  DEPS pattern_matcher_high_api)
+lite_cc_library(fuse_quant_dequant
+  SRCS quant_dequant_op_fuser.cc
+  DEPS pattern_matcher_high_api)
 
 set(mir_fusers
   fuse_fc
-  fuse_conv_elementwise_add_relu
+  fuse_conv_elementwise_add_activation
   fuse_conv_bn
+  fuse_quant_dequant
+  fuse_elementwise_add_activation
   CACHE INTERNAL "fusers")
 
+if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+  return()
+endif()
+
 lite_cc_test(test_lite_conv_bn_fuse SRCS conv_bn_fuse_pass_test.cc
   DEPS elementwise_ops_lite batch_norm_op_lite conv_op_lite proto_desc compatible_pb_lite program_lite
        mir_pass mir_pass_manager pattern_matcher_high_api)
diff --git a/paddle/fluid/lite/core/mir/conv_bn_fuse_pass.cc b/paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass.cc
similarity index 94%
rename from paddle/fluid/lite/core/mir/conv_bn_fuse_pass.cc
rename to paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass.cc
index 562ec7f4507..1e7d7bc5774 100644
--- a/paddle/fluid/lite/core/mir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/lite/core/mir/conv_bn_fuse_pass.h"
+#include "paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass.h"
 #include <memory>
 #include <vector>
 #include "paddle/fluid/lite/core/mir/fusion/conv_bn_fuser.h"
diff --git a/paddle/fluid/lite/core/mir/conv_bn_fuse_pass.h b/paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass.h
similarity index 100%
rename from paddle/fluid/lite/core/mir/conv_bn_fuse_pass.h
rename to paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass.h
diff --git a/paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass_test.cc b/paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass_test.cc
index 7ce20c4d6e2..3a8573b4f8c 100644
--- a/paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass_test.cc
+++ b/paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass_test.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/lite/core/mir/conv_bn_fuse_pass.h"
+#include "paddle/fluid/lite/core/mir/fusion/conv_bn_fuse_pass.h"
 #include <gtest/gtest.h>
 #include <memory>
 #include <vector>
@@ -88,6 +88,7 @@ std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
   conv_op->SetAttr("paddings", paddings);
   conv_op->SetAttr("dilations", dilations);
   conv_op->SetAttr("groups", groups);
+  conv_op->SetAttr("fuse_relu", false);
 
   bn_op->SetType("batch_norm");
   bn_op->SetInput("X", {"conv_out"});
@@ -103,6 +104,10 @@ std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
   bn_op->SetOutput("SavedVariance", {"bn_saved_var"});
   float eps = 1e-5;
   bn_op->SetAttr("epsilon", eps);
+  bn_op->SetAttr("is_test", static_cast<int>(1));
+  bn_op->SetAttr("use_global_stats", false);
+  bn_op->SetAttr("momentum", 0.9f);
+  bn_op->SetAttr("data_layout", std::string("NCHW"));
 
   program_desc->Flush();
diff --git a/paddle/fluid/lite/core/mir/fusion/conv_bn_fuser.cc b/paddle/fluid/lite/core/mir/fusion/conv_bn_fuser.cc
index e753f8a858d..0a73d1e39d9 100644
--- a/paddle/fluid/lite/core/mir/fusion/conv_bn_fuser.cc
+++ b/paddle/fluid/lite/core/mir/fusion/conv_bn_fuser.cc
@@ -70,7 +70,7 @@ void ConvBNFuser::BuildPattern() {
 void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
   auto op_desc = GenOpDesc(matched);
   auto eltwise_op = LiteOpRegistry::Global().Create("elementwise_add");
-  auto conv = matched.at("conv2d")->stmt()->op;
+  auto conv = matched.at("conv2d")->stmt()->op();
   auto* scope = conv->scope();
   auto& valid_places = conv->valid_places();
@@ -84,9 +84,9 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
                         ->GetMutable<lite::Tensor>();
   size_t bias_size = bn_scale_t->data_size();
   auto bn_scale_d = bn_scale_t->mutable_data<float>();
-  PADDLE_ENFORCE(bias_size == conv_weight_dims[0],
-                 "The BN bias's size should be equal to the size of the first "
-                 "dim size of the conv weights");
+  CHECK_EQ(bias_size, static_cast<size_t>(conv_weight_dims[0]))
+      << "The BN bias's size should be equal to the size of the first "
+      << "dim size of the conv weights";
 
   auto bn_mean_t = scope->FindVar(matched.at("bn_mean")->arg()->name)
                        ->GetMutable<lite::Tensor>();
diff --git a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.cc b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.cc
new file mode 100644
index 00000000000..f4eb5a00ad2
--- /dev/null
+++ b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.h"
+#include <memory>
+#include <vector>
+#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h"
+#include "paddle/fluid/lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void ConvElementwiseAddActivationFusePass::Apply(
+    const std::unique_ptr<SSAGraph>& graph) {
+  fusion::ConvElementwiseAddActivationFuser fuser("conv2d", "relu");
+  fuser(graph.get());
+
+  fusion::ConvElementwiseAddActivationFuser depthwise_fuser("depthwise_conv2d",
+                                                            "relu");
+  depthwise_fuser(graph.get());
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(lite_conv_elementwise_add_activation_fuse_pass,
+                  paddle::lite::mir::ConvElementwiseAddActivationFusePass);
diff --git a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.h b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.h
new file mode 100644
index 00000000000..a5a619f4d0d
--- /dev/null
+++ b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
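For context, ConvBNFuser above folds batch_norm into the preceding conv plus a new elementwise_add. A minimal sketch of the standard per-channel folding arithmetic it relies on (helper names here are ours, not the fuser's):

    #include <cmath>

    // y = gamma * (conv(x) - mean) / sqrt(var + eps) + beta
    //   = alpha * conv(x) + bias
    inline float BnFoldScale(float gamma, float var, float eps) {
      return gamma / std::sqrt(var + eps);  // scales a conv kernel channel
    }
    inline float BnFoldBias(float beta, float gamma, float mean, float var,
                            float eps) {
      return beta - gamma * mean / std::sqrt(var + eps);  // add op's bias
    }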
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "paddle/fluid/lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class ConvElementwiseAddActivationFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc
new file mode 100644
index 00000000000..ca50ba7692d
--- /dev/null
+++ b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass_test.cc
@@ -0,0 +1,153 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.h"
+#include <gtest/gtest.h>
+#include <memory>
+#include <vector>
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/lite/api/cxx_api.h"
+#include "paddle/fluid/lite/api/paddle_use_passes.h"
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/program.h"
+
+DEFINE_string(model_dir, "", "");
+DEFINE_string(optimized_model, "", "");
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
+                                     const std::shared_ptr<Scope>& scope,
+                                     const std::vector<Place>& valid_places) {
+  auto* main_block = program_desc->MutableBlock(0);
+
+  auto* conv2d_1 = main_block->AppendOp();
+  auto* conv2d_2 = main_block->AppendOp();
+  auto* add_1 = main_block->AppendOp();
+  auto* relu_1 = main_block->AppendOp();
+  auto* add_2 = main_block->AppendOp();
+  auto* relu_2 = main_block->AppendOp();
+
+  main_block->Var("input_1");
+  main_block->Var("input_2");
+  main_block->Var("filter_1");
+  main_block->Var("filter_2");
+  main_block->Var("conv2d_1_out");
+  main_block->Var("conv2d_2_out");
+  main_block->Var("bias_1");
+  main_block->Var("add_1_out");
+  main_block->Var("add_2_out");
+  main_block->Var("relu_1_out");
+  main_block->Var("out");
+
+  scope->Var("input_1")->GetMutable<lite::Tensor>();
+  scope->Var("input_2")->GetMutable<lite::Tensor>();
+  scope->Var("filter_1")->GetMutable<lite::Tensor>();
+  scope->Var("filter_2")->GetMutable<lite::Tensor>();
+  scope->Var("conv2d_1_out")->GetMutable<lite::Tensor>();
+  scope->Var("conv2d_2_out")->GetMutable<lite::Tensor>();
+  scope->Var("bias_1")->GetMutable<lite::Tensor>();
+  scope->Var("add_1_out")->GetMutable<lite::Tensor>();
+  scope->Var("add_2_out")->GetMutable<lite::Tensor>();
+  scope->Var("relu_1_out")->GetMutable<lite::Tensor>();
+  scope->Var("out")->GetMutable<lite::Tensor>();
+
+  conv2d_1->SetType("conv2d");
+  conv2d_1->SetInput("Input", {"input_1"});
+  conv2d_1->SetInput("Filter", {"filter_1"});
+  conv2d_1->SetOutput("Output", {"conv2d_1_out"});
+  conv2d_1->SetAttr("strides", std::vector<int>({1, 1}));
+  conv2d_1->SetAttr("paddings", std::vector<int>({0, 0}));
+  conv2d_1->SetAttr("groups", 1);
+  conv2d_1->SetAttr("dilations",
+                    std::vector<int>({1, 1}));
+  conv2d_1->SetAttr("fuse_relu", false);
+
+  add_1->SetType("elementwise_add");
+  add_1->SetInput("X", {"conv2d_1_out"});
+  add_1->SetInput("Y", {"bias_1"});
+  add_1->SetOutput("Out", {"add_1_out"});
+  add_1->SetAttr("axis", 1);
+
+  relu_1->SetType("relu");
+  relu_1->SetInput("X", {"add_1_out"});
+  relu_1->SetOutput("Out", {"relu_1_out"});
+
+  conv2d_2->SetType("conv2d");
+  conv2d_2->SetInput("Input", {"input_2"});
+  conv2d_2->SetInput("Filter", {"filter_2"});
+  conv2d_2->SetOutput("Output", {"conv2d_2_out"});
+  conv2d_2->SetAttr("strides", std::vector<int>({1, 1}));
+  conv2d_2->SetAttr("paddings", std::vector<int>({0, 0}));
+  conv2d_2->SetAttr("groups", 1);
+  conv2d_2->SetAttr("dilations", std::vector<int>({1, 1}));
+  conv2d_2->SetAttr("fuse_relu", false);
+
+  add_2->SetType("elementwise_add");
+  add_2->SetInput("X", {"conv2d_2_out"});
+  add_2->SetInput("Y", {"relu_1_out"});
+  add_2->SetOutput("Out", {"add_2_out"});
+  add_2->SetAttr("axis", 1);
+
+  relu_2->SetType("relu");
+  relu_2->SetInput("X", {"add_2_out"});
+  relu_2->SetOutput("Out", {"out"});
+
+  program_desc->Flush();
+
+  lite::Program program(*program_desc->Proto(), scope, valid_places);
+  auto graph = std::unique_ptr<SSAGraph>(new SSAGraph());
+  graph->Build(program, valid_places);
+
+  return graph;
+}
+
+TEST(conv_elementwise_add_relu_fuse_pass, graph_test) {
+  framework::ProgramDesc program_desc;
+  std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
+  auto scope = std::make_shared<Scope>();
+  auto graph = BuildGraph(&program_desc, scope, places);
+
+  Visualize(graph.get());
+  ASSERT_EQ(graph->nodes().size(), 11UL /*vars*/ + 6UL /*ops*/);
+  Visualize(graph.get());
+}
+
+TEST(conv_elementwise_add_relu_fuse_pass, fuse_test_op) {
+  framework::ProgramDesc program_desc;
+  std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
+  auto scope = std::make_shared<Scope>();
+  auto graph = BuildGraph(&program_desc, scope, places);
+  Visualize(graph.get());
+  const int num_nodes = graph->nodes().size();
+  auto* fuser = new ConvElementwiseAddActivationFusePass;
+  fuser->Apply(graph);
+  Visualize(graph.get());
+  ASSERT_EQ(graph->nodes().size(),
+            num_nodes - 5UL * 2 /*nodes removed */ + 1UL * 2 /* fused nodes*/);
+}
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_OP(elementwise_add);
+USE_LITE_OP(conv2d);
+USE_LITE_OP(depthwise_conv2d);
+USE_LITE_OP(relu);
diff --git a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.cc b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.cc
similarity index 84%
rename from paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.cc
rename to paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.cc
index 497c8f4f0d3..3786ab5c835 100644
--- a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.cc
+++ b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h"
+#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h"
 #include <memory>
 #include <vector>
@@ -21,7 +21,7 @@ namespace lite {
 namespace mir {
 namespace fusion {
 
-void ConvElementwiseAddReLUFuser::BuildPattern() {
+void ConvElementwiseAddActivationFuser::BuildPattern() {
   // create input nodes.
   auto* input =
       VarNode("input")->assert_is_op_input(conv_type_, "Input")->AsInput();
@@ -36,7 +36,8 @@ void ConvElementwiseAddReLUFuser::BuildPattern() {
   auto* add = OpNode("add", "elementwise_add")
                   ->assert_is_op("elementwise_add")
                   ->AsIntermediate();
-  auto* relu = OpNode("relu", "relu")->assert_is_op("relu")->AsIntermediate();
+  auto* act =
+      OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate();
 
   // create intermediate nodes
   auto* conv2d_out = VarNode("conv2d_out")
@@ -45,25 +46,26 @@
                          ->AsIntermediate();
   auto* add_out = VarNode("add_out")
                       ->assert_is_op_output("elementwise_add", "Out")
-                      ->assert_is_op_input("relu", "X")
+                      ->assert_is_op_input(act_type_, "X")
                       ->AsIntermediate();
 
   // create output node
-  auto* out = VarNode("output")->assert_is_op_output("relu", "Out")->AsOutput();
+  auto* out =
+      VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
 
   // create topology.
   std::vector<PMNode*> conv2d_inputs{filter, input};
   std::vector<PMNode*> add_inputs{conv2d_out, bias};
   conv2d_inputs >> *conv2d >> *conv2d_out;
   add_inputs >> *add >> *add_out;
-  *add_out >> *relu >> *out;
+  *add_out >> *act >> *out;
 }
 
-void ConvElementwiseAddReLUFuser::InsertNewNode(SSAGraph* graph,
-                                                const key2nodes_t& matched) {
+void ConvElementwiseAddActivationFuser::InsertNewNode(
+    SSAGraph* graph, const key2nodes_t& matched) {
   auto op_desc = GenOpDesc(matched);
   auto conv_op = LiteOpRegistry::Global().Create(conv_type_);
-  auto conv_old = matched.at("conv2d")->stmt()->op;
+  auto conv_old = matched.at("conv2d")->stmt()->op();
   auto* scope = conv_old->scope();
   auto& valid_places = conv_old->valid_places();
   conv_op->Attach(op_desc, scope);
@@ -76,10 +78,11 @@
   IR_NODE_LINK_TO(new_op_node, matched.at("output"));
 }
 
-cpp::OpDesc ConvElementwiseAddReLUFuser::GenOpDesc(const key2nodes_t& matched) {
+cpp::OpDesc ConvElementwiseAddActivationFuser::GenOpDesc(
+    const key2nodes_t& matched) {
   auto* desc = matched.at("conv2d")->stmt()->op_info();
 
-  cpp::OpDesc op_desc;
+  cpp::OpDesc op_desc = *desc;
   op_desc.SetType(conv_type_);
   op_desc.SetInput("Input", {matched.at("input")->arg()->name});
   op_desc.SetInput("Filter", {matched.at("filter")->arg()->name});
@@ -87,18 +90,17 @@
   op_desc.SetOutput("Output", {matched.at("output")->arg()->name});
   // Other inputs. See operators/conv_op.h
   std::vector<std::string> input_arg_names = desc->InputArgumentNames();
-  for (auto name : input_arg_names) LOG(INFO) << name;
 
   if (std::find(input_arg_names.begin(), input_arg_names.end(),
                 "ResidualData") != input_arg_names.end()) {
     op_desc.SetInput("ResidualData", desc->Input("ResidualData"));
   }
-  // Only consider strides, padding, groups, dilations, fuse_relu for now
   op_desc.SetAttr("strides", desc->GetAttr<std::vector<int>>("strides"));
   op_desc.SetAttr("paddings", desc->GetAttr<std::vector<int>>("paddings"));
   op_desc.SetAttr("groups", desc->GetAttr<int>("groups"));
   op_desc.SetAttr("dilations", desc->GetAttr<std::vector<int>>("dilations"));
+  // TODO(sangoly): support other activation types
   op_desc.SetAttr("fuse_relu", true);
   return op_desc;
 }
diff --git a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h
new file mode 100644
index 00000000000..14a33613fdf
--- /dev/null
+++ b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuser.h
@@ -0,0 +1,47 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "paddle/fluid/lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+class ConvElementwiseAddActivationFuser : public FuseBase {
+ public:
+  explicit ConvElementwiseAddActivationFuser(const std::string& conv_type,
+                                             const std::string& act_type) {
+    CHECK(act_type == "relu") << "Only the relu activation is supported now";
+    conv_type_ = conv_type;
+    act_type_ = act_type;
+  }
+
+  void BuildPattern() override;
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
+
+ private:
+  cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
+  std::string conv_type_;
+  std::string act_type_;
+};
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.cc b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuse_pass.cc
similarity index 94%
rename from paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.cc
rename to paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuse_pass.cc
index 3110c7aa6d4..7c20df2c24c 100644
--- a/paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.cc
+++ b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuse_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.h"
+#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuse_pass.h"
 #include <memory>
 #include <vector>
 #include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h"
diff --git a/paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.h b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuse_pass.h
similarity index 100%
rename from paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.h
rename to paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuse_pass.h
diff --git a/paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass_test.cc b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuse_pass_test.cc
similarity index 98%
rename from paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass_test.cc
rename to paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuse_pass_test.cc
index 30991313ad3..99d59cec474 100644
--- a/paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass_test.cc
+++ b/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuse_pass_test.cc
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
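As the CHECK in the constructor above indicates, only relu is accepted for now. A sketch of how the pass instantiates the fuser once per conv type, mirroring the Apply body earlier in this patch:

    fusion::ConvElementwiseAddActivationFuser fuser("conv2d", "relu");
    fuser(graph.get());
    fusion::ConvElementwiseAddActivationFuser dw_fuser("depthwise_conv2d",
                                                       "relu");
    dw_fuser(graph.get());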
-#include "paddle/fluid/lite/core/mir/conv_elementwise_add_relu_fuse_pass.h" #include #include #include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/lite/api/cxx_api.h" #include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_activation_fuse_pass.h" #include "paddle/fluid/lite/core/mir/graph_visualize_pass.h" #include "paddle/fluid/lite/core/mir/passes.h" #include "paddle/fluid/lite/core/op_registry.h" diff --git a/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc new file mode 100644 index 00000000000..20d1eaa82a8 --- /dev/null +++ b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h" +#include +#include +#include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h" +#include "paddle/fluid/lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void ElementwiseAddActivationFusePass::Apply( + const std::unique_ptr& graph) { + fusion::ElementwiseAddActivationFuser fuser("relu"); + fuser(graph.get()); +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, + paddle::lite::mir::ElementwiseAddActivationFusePass); diff --git a/paddle/fluid/lite/arm/math/elementwise.h b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h similarity index 73% rename from paddle/fluid/lite/arm/math/elementwise.h rename to paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h index cf4c8e46b07..213c3f68f60 100644 --- a/paddle/fluid/lite/arm/math/elementwise.h +++ b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h @@ -14,15 +14,19 @@ #pragma once +#include +#include +#include "paddle/fluid/lite/core/mir/pass.h" + namespace paddle { namespace lite { -namespace arm { -namespace math { +namespace mir { -template -void elementwise_add(const T* dinx, const T* diny, T* dout, int num); +class ElementwiseAddActivationFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; -} // namespace math -} // namespace arm +} // namespace mir } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc new file mode 100644 index 00000000000..11474002519 --- /dev/null +++ b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass_test.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.h"
+#include <gtest/gtest.h>
+#include <memory>
+#include <vector>
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/lite/api/cxx_api.h"
+#include "paddle/fluid/lite/api/paddle_use_passes.h"
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/program.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc,
+                                     const std::shared_ptr<Scope>& scope,
+                                     const std::vector<Place>& valid_places) {
+  auto* main_block = program_desc->MutableBlock(0);
+
+  auto* add_1 = main_block->AppendOp();
+  auto* add_2 = main_block->AppendOp();
+  auto* relu_1 = main_block->AppendOp();
+  auto* relu_2 = main_block->AppendOp();
+
+  main_block->Var("x_1");
+  main_block->Var("y_1");
+  main_block->Var("add_out_1");
+  main_block->Var("relu_out_1");
+  main_block->Var("y_2");
+  main_block->Var("add_out_2");
+  main_block->Var("out");
+
+  scope->Var("x_1")->GetMutable<lite::Tensor>();
+  scope->Var("y_1")->GetMutable<lite::Tensor>();
+  scope->Var("add_out_1")->GetMutable<lite::Tensor>();
+  scope->Var("relu_out_1")->GetMutable<lite::Tensor>();
+  scope->Var("y_2")->GetMutable<lite::Tensor>();
+  scope->Var("add_out_2")->GetMutable<lite::Tensor>();
+  scope->Var("out")->GetMutable<lite::Tensor>();
+
+  add_1->SetType("elementwise_add");
+  add_1->SetInput("X", {"x_1"});
+  add_1->SetInput("Y", {"y_1"});
+  add_1->SetOutput("Out", {"add_out_1"});
+  add_1->SetAttr("axis", 1);
+
+  relu_1->SetType("relu");
+  relu_1->SetInput("X", {"add_out_1"});
+  relu_1->SetOutput("Out", {"relu_out_1"});
+
+  add_2->SetType("elementwise_add");
+  add_2->SetInput("X", {"relu_out_1"});
+  add_2->SetInput("Y", {"y_2"});
+  add_2->SetOutput("Out", {"add_out_2"});
+  add_2->SetAttr("axis", 1);
+
+  relu_2->SetType("relu");
+  relu_2->SetInput("X", {"add_out_2"});
+  relu_2->SetOutput("Out", {"out"});
+
+  program_desc->Flush();
+
+  lite::Program program(*program_desc->Proto(), scope, valid_places);
+  auto graph = std::unique_ptr<SSAGraph>(new SSAGraph());
+  graph->Build(program, valid_places);
+
+  return graph;
+}
+
+TEST(elementwise_add_activation_fuse_pass, graph_test) {
+  framework::ProgramDesc program_desc;
+  std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
+  auto scope = std::make_shared<Scope>();
+  auto graph = BuildGraph(&program_desc, scope, places);
+  ASSERT_EQ(graph->nodes().size(),
+            7UL /*vars*/ + 4UL /*ops*/ + 1UL /* SSAGraph tmp node*/);
+}
+
+TEST(elementwise_add_activation_fuse_pass, fuse_test_op) {
+  framework::ProgramDesc program_desc;
+  std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
+  auto scope = std::make_shared<Scope>();
+  auto graph = BuildGraph(&program_desc, scope, places);
+  Visualize(graph.get());
+  const int num_nodes = graph->nodes().size();
+  auto* fuser = new ElementwiseAddActivationFusePass;
+  fuser->Apply(graph);
+  Visualize(graph.get());
+  ASSERT_EQ(graph->nodes().size(),
+            num_nodes - 3UL * 2 /*nodes removed */ + 1UL * 2 /* fused nodes*/);
+}
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_OP(elementwise_add);
+USE_LITE_OP(fusion_elementwise_add_activation);
+USE_LITE_OP(relu);
diff --git a/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.cc b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.cc
new file mode 100644
index 00000000000..cafbc42d85b
--- /dev/null
+++ b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.cc
@@ -0,0 +1,87 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h"
+#include <memory>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+void ElementwiseAddActivationFuser::BuildPattern() {
+  // create input nodes.
+  auto* x = VarNode("x")->assert_is_op_input("elementwise_add", "X")->AsInput();
+  auto* y = VarNode("y")->assert_is_op_input("elementwise_add", "Y")->AsInput();
+
+  // create op nodes
+  auto* add = OpNode("add", "elementwise_add")
+                  ->assert_is_op("elementwise_add")
+                  ->AsIntermediate();
+  auto* act =
+      OpNode("act", act_type_)->assert_is_op(act_type_)->AsIntermediate();
+
+  // create intermediate nodes
+  auto* add_out = VarNode("add_out")
+                      ->assert_is_op_output("elementwise_add", "Out")
+                      ->assert_is_op_input(act_type_, "X")
+                      ->AsIntermediate();
+
+  // create output node
+  auto* out =
+      VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
+
+  // create topology.
+  std::vector<PMNode*> add_inputs{x, y};
+  add_inputs >> *add >> *add_out;
+  *add_out >> *act >> *out;
+}
+
+void ElementwiseAddActivationFuser::InsertNewNode(SSAGraph* graph,
+                                                  const key2nodes_t& matched) {
+  auto op_desc = GenOpDesc(matched);
+  auto op =
+      LiteOpRegistry::Global().Create("fusion_elementwise_add_activation");
+  auto old_op = matched.at("add")->stmt()->op();
+  auto* scope = old_op->scope();
+  auto& valid_places = old_op->valid_places();
+  op->Attach(op_desc, scope);
+
+  auto* new_op_node = graph->GraphCreateInstructNode(op, valid_places);
+
+  IR_NODE_LINK_TO(matched.at("x"), new_op_node);
+  IR_NODE_LINK_TO(matched.at("y"), new_op_node);
+  IR_NODE_LINK_TO(new_op_node, matched.at("output"));
+}
+
+cpp::OpDesc ElementwiseAddActivationFuser::GenOpDesc(
+    const key2nodes_t& matched) {
+  auto* desc = matched.at("add")->stmt()->op_info();
+
+  cpp::OpDesc op_desc;
+  op_desc.SetType("fusion_elementwise_add_activation");
+  op_desc.SetInput("X", {matched.at("x")->arg()->name});
+  op_desc.SetInput("Y", {matched.at("y")->arg()->name});
+  op_desc.SetOutput("Out", {matched.at("output")->arg()->name});
+
+  op_desc.SetAttr("axis", desc->GetAttr<int>("axis"));
+  op_desc.SetAttr("act_type", act_type_);
+  return op_desc;
+}
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h
similarity index 85%
rename from paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h
rename to paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h
index 3e21368234f..bcd7b4cbcda 100644
--- a/paddle/fluid/lite/core/mir/fusion/conv_elementwise_add_relu_fuser.h
+++ b/paddle/fluid/lite/core/mir/fusion/elementwise_add_activation_fuser.h
@@ -23,16 +23,16 @@ namespace lite {
 namespace mir {
 namespace fusion {
 
-class ConvElementwiseAddReLUFuser : public FuseBase {
+class ElementwiseAddActivationFuser : public FuseBase {
  public:
-  explicit ConvElementwiseAddReLUFuser(const std::string& conv_type)
-      : conv_type_(conv_type) {}
+  explicit ElementwiseAddActivationFuser(const std::string& act_type)
+      : act_type_(act_type) {}
 
   void BuildPattern() override;
   void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
 
  private:
   cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
-  std::string conv_type_;
+  std::string act_type_;
 };
 
 }  // namespace fusion
diff --git a/paddle/fluid/lite/core/mir/fc_fuse_pass.cc b/paddle/fluid/lite/core/mir/fusion/fc_fuse_pass.cc
similarity index 94%
rename from paddle/fluid/lite/core/mir/fc_fuse_pass.cc
rename to paddle/fluid/lite/core/mir/fusion/fc_fuse_pass.cc
index 008f05ce5cb..f50db9c17b3 100644
--- a/paddle/fluid/lite/core/mir/fc_fuse_pass.cc
+++ b/paddle/fluid/lite/core/mir/fusion/fc_fuse_pass.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
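A hedged usage sketch for this fuser, driven through the registered pass name exactly as the test above does:

    // Assumes `graph` is a std::unique_ptr<SSAGraph> built from a program.
    auto pass =
        PassManager::Global().LookUp("lite_elementwise_add_activation_fuse_pass");
    CHECK(pass);
    pass->Apply(graph);
    // elementwise_add + relu -> fusion_elementwise_add_activation
    // (act_type attr set to "relu", axis copied from the original add).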
-#include "paddle/fluid/lite/core/mir/fc_fuse_pass.h" +#include "paddle/fluid/lite/core/mir/fusion/fc_fuse_pass.h" #include #include #include "paddle/fluid/lite/core/mir/fusion/fc_fuser.h" diff --git a/paddle/fluid/lite/core/mir/fc_fuse_pass.h b/paddle/fluid/lite/core/mir/fusion/fc_fuse_pass.h similarity index 100% rename from paddle/fluid/lite/core/mir/fc_fuse_pass.h rename to paddle/fluid/lite/core/mir/fusion/fc_fuse_pass.h diff --git a/paddle/fluid/lite/core/mir/fc_fuse_pass_test.cc b/paddle/fluid/lite/core/mir/fusion/fc_fuse_pass_test.cc similarity index 95% rename from paddle/fluid/lite/core/mir/fc_fuse_pass_test.cc rename to paddle/fluid/lite/core/mir/fusion/fc_fuse_pass_test.cc index 35efedb5797..ca880fa30e4 100644 --- a/paddle/fluid/lite/core/mir/fc_fuse_pass_test.cc +++ b/paddle/fluid/lite/core/mir/fusion/fc_fuse_pass_test.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/lite/core/mir/fc_fuse_pass.h" +#include "paddle/fluid/lite/core/mir/fusion/fc_fuse_pass.h" #include #include #include #include "paddle/fluid/lite/api/cxx_api.h" -#include "paddle/fluid/lite/core/mir/passes.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" #include "paddle/fluid/lite/core/op_registry.h" DEFINE_string(model_dir, "", ""); @@ -28,7 +28,7 @@ namespace lite { namespace mir { TEST(fc_fuse_pass, fuse_test) { - lite::ExecutorLite predictor; + lite::Predictor predictor; #ifndef LITE_WITH_CUDA std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kX86), PRECISION(kFloat)}}); @@ -69,7 +69,7 @@ TEST(fc_fuse_pass, fuse_test) { #ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK TEST(fc_fuse_pass, save_model_test) { - lite::ExecutorLite predictor; + lite::Predictor predictor; std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kX86), PRECISION(kFloat)}}); predictor.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)}, diff --git a/paddle/fluid/lite/core/mir/fusion/fc_fuser.cc b/paddle/fluid/lite/core/mir/fusion/fc_fuser.cc index a8b6336595c..e39741976f8 100644 --- a/paddle/fluid/lite/core/mir/fusion/fc_fuser.cc +++ b/paddle/fluid/lite/core/mir/fusion/fc_fuser.cc @@ -46,7 +46,7 @@ void FcFuser::BuildPattern() { void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { auto op_desc = GenOpDesc(matched); auto fc_op = LiteOpRegistry::Global().Create("fc"); - auto mul = matched.at("mul")->stmt()->op; + auto mul = matched.at("mul")->stmt()->op(); auto* scope = mul->scope(); auto& valid_places = mul->valid_places(); fc_op->Attach(op_desc, scope); @@ -60,7 +60,7 @@ void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { - cpp::OpDesc op_desc; + cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info(); op_desc.SetType("fc"); op_desc.SetInput("Input", {matched.at("x")->arg()->name}); op_desc.SetInput("W", {matched.at("W")->arg()->name}); diff --git a/paddle/fluid/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/paddle/fluid/lite/core/mir/fusion/quant_dequant_fuse_pass.cc new file mode 100644 index 00000000000..4890e707593 --- /dev/null +++ b/paddle/fluid/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/core/mir/fusion/quant_dequant_fuse_pass.h"
+#include <memory>
+#include <unordered_set>
+#include "paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h"
+#include "paddle/fluid/lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  std::unordered_set<std::string> quant_types = {
+      "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"};
+  std::unordered_set<std::string> quantized_op_types = {"conv2d", "mul",
+                                                        "depthwise_conv2d"};
+  for (auto& quant_type : quant_types) {
+    for (auto& op_type : quantized_op_types) {
+      for (int i = 6; i >= 1; i--) {
+        fusion::QuantDequantOpFuser fuser(op_type, quant_type, i);
+        fuser(graph.get());
+      }
+    }
+  }
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(lite_quant_dequant_fuse_pass,
+                  paddle::lite::mir::QuantDequantFusePass);
diff --git a/paddle/fluid/lite/core/mir/fusion/quant_dequant_fuse_pass.h b/paddle/fluid/lite/core/mir/fusion/quant_dequant_fuse_pass.h
new file mode 100644
index 00000000000..5cd38de51de
--- /dev/null
+++ b/paddle/fluid/lite/core/mir/fusion/quant_dequant_fuse_pass.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include "paddle/fluid/lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class QuantDequantFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.cc
new file mode 100644
index 00000000000..0c4eb033491
--- /dev/null
+++ b/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
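Before the fuser implementation, a worked example of the dequantization arithmetic it performs (assuming bit_length == 8; the max_range value is illustrative):

    int bit_length = 8;
    int range = (1 << (bit_length - 1)) - 1;  // 127
    float max_range = 16129.f;                // e.g. 127 * 127, read from
                                              // fake_dequantize_max_abs
    float weight_scale = (range * range) / max_range;  // 1.0f here
    // Each stored weight w is then rescaled in place: w *= weight_scale / range,
    // while input_scale is read from the fake_quant op's InScale tensor.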
+ +#include "paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h" +#include +#include +#include "paddle/fluid/lite/utils/string.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +void QuantDequantOpFuser::BuildPattern() { + const int kNumFields = 5; + const int kQuantizedWeightOffset = 0; + const int kQuantizedOpOffset = 1; + const int kQuantizedOpOutOffset = 2; + const int kDequantOpOffset = 3; + const int kDequantOpOutOffset = 4; + + std::string weight_name = ""; + if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") { + weight_name = "Filter"; + } else { + weight_name = "Y"; + } + auto* quant_op_input = VarNode("quant_op_input") + ->assert_is_op_input(quant_type_, "X") + ->AsInput(); + auto* quant_op_in_scale = VarNode("quant_op_in_scale") + ->assert_is_op_input(quant_type_, "InScale") + ->AsIntermediate(); + auto* quant_op = OpNode("quant_op", quant_type_) + ->assert_is_op(quant_type_) + ->AsIntermediate(); + + auto* quant_op_out_scale = + VarNode("quant_op_out_scale") + ->assert_is_op_output(quant_type_, "OutScale") + ->assert_is_op_input("fake_dequantize_max_abs", "Scale") + ->AsIntermediate(); + + auto* quant_op_out = VarNode("quant_op_out") + ->assert_is_op_output(quant_type_, "Out") + ->assert_is_op_input(op_type_) + ->AsIntermediate(); + std::vector nodes; + for (int i = 0; i < times_; i++) { + nodes.push_back(VarNode(string_format("quantized_op_weight%d", i)) + ->assert_is_op_input(op_type_, weight_name) + ->AsInput()); + + nodes.push_back(OpNode(string_format("quantized_op%d", i), op_type_) + ->assert_is_op(op_type_) + ->AsIntermediate()); + + nodes.push_back(VarNode(string_format("quantized_op_out%d", i)) + ->assert_is_op_output(op_type_) + ->assert_is_op_input("fake_dequantize_max_abs", "X") + ->AsIntermediate()); + + nodes.push_back( + OpNode(string_format("dequant_op%d", i), "fake_dequantize_max_abs") + ->assert_is_op("fake_dequantize_max_abs") + ->AsIntermediate()); + nodes.push_back(VarNode(string_format("dequant_op_out%d", i)) + ->assert_is_op_output("fake_dequantize_max_abs", "Out") + ->AsOutput()); + } + + quant_op->LinksFrom({quant_op_input, quant_op_in_scale}); + quant_op_out->LinksFrom({quant_op}); + quant_op_out_scale->LinksFrom({quant_op}); + for (int i = 0; i < times_; i++) { + nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom( + {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]}); + nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom( + {nodes[i * kNumFields + kQuantizedOpOffset]}); + nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( + {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale}); + nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom( + {nodes[i * kNumFields + kDequantOpOffset]}); + } +} + +void QuantDequantOpFuser::InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched) { + const int kNumFields = 5; + const int kQuantizedWeightOffset = 0; + const int kQuantizedOpOffset = 1; + const int kDequantOpOffset = 3; + const int kDequantOpOutOffset = 4; + + auto* quant_op_input = matched.at("quant_op_input"); + auto* quant_op_in_scale = matched.at("quant_op_in_scale"); + auto* quant_op = matched.at("quant_op"); + + std::vector nodes; + for (int i = 0; i < times_; i++) { + nodes.push_back(matched.at(string_format("quantized_op_weight%d", i))); + nodes.push_back(matched.at(string_format("quantized_op%d", i))); + nodes.push_back(matched.at(string_format("quantized_op_out%d", i))); + nodes.push_back(matched.at(string_format("dequant_op%d", i))); + 
+    nodes.push_back(matched.at(string_format("dequant_op_out%d", i)));
+  }
+  int bit_length = quant_op->stmt()->op_info()->GetAttr<int>("bit_length");
+  auto* scope = quant_op->stmt()->op()->scope();
+  auto& valid_places = quant_op->stmt()->op()->valid_places();
+  int range = ((1 << (bit_length - 1)) - 1);
+  auto input_scale_t = scope->FindVar(quant_op_in_scale->arg()->name)
+                           ->GetMutable<lite::Tensor>();
+  float input_scale = input_scale_t->data<float>()[0];
+
+  for (int i = 0; i < times_; i++) {
+    float max_range = nodes[i * kNumFields + kDequantOpOffset]
+                          ->stmt()
+                          ->op_info()
+                          ->GetAttr<float>("max_range");
+    float weight_scale = (range * range) / max_range;
+
+    cpp::OpDesc op_desc =
+        *nodes[i * kNumFields + kQuantizedOpOffset]->stmt()->op_info();
+    if (op_type_ == "conv2d" || op_type_ == "depthwise_conv2d") {
+      op_desc.SetInput("Input", {matched.at("quant_op_input")->arg()->name});
+      op_desc.SetOutput(
+          "Output", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
+    } else if (op_type_ == "mul") {
+      op_desc.SetInput("X", {matched.at("quant_op_input")->arg()->name});
+      op_desc.SetOutput(
+          "Out", {nodes[i * kNumFields + kDequantOpOutOffset]->arg()->name});
+    }
+    op_desc.SetAttr("enable_int8", true);
+    op_desc.SetAttr("input_scale", input_scale);
+    auto quantized_weight_var_name =
+        nodes[i * kNumFields + kQuantizedWeightOffset]->arg()->name;
+    auto quantized_weight_t =
+        scope->FindVar(quantized_weight_var_name)->GetMutable<lite::Tensor>();
+    float* quantized_weight_data = quantized_weight_t->mutable_data<float>();
+    size_t weight_num = quantized_weight_t->data_size();
+    for (size_t i = 0; i < weight_num; i++) {
+      quantized_weight_data[i] *= (weight_scale / range);
+    }
+    auto quantized_op = LiteOpRegistry::Global().Create(op_type_);
+
+    quantized_op->Attach(op_desc, scope);
+    auto* new_op_node =
+        graph->GraphCreateInstructNode(quantized_op, valid_places);
+    IR_NODE_LINK_TO(quant_op_input, new_op_node);
+    IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset],
+                    new_op_node);
+    IR_NODE_LINK_TO(new_op_node, nodes[i * kNumFields + kDequantOpOutOffset]);
+  }
+}
+
+cpp::OpDesc QuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
+  cpp::OpDesc op_desc;
+  return op_desc;
+}
+
+}  // namespace fusion
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h b/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h
new file mode 100644
index 00000000000..29ff767e772
--- /dev/null
+++ b/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "paddle/fluid/lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+/* The model trained by fluid quantization is a simulation of real int8.
diff --git a/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h b/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h new file mode 100644 index 00000000000..29ff767e772 --- /dev/null +++ b/paddle/fluid/lite/core/mir/fusion/quant_dequant_op_fuser.h @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +/* The model trained by fluid quantization is a simulation of real int8. + * The quantized ops (conv2d, mul, depthwise_conv2d, etc.) have a fake_quant op + * in front and a fake_dequant op behind. + * + * In int8 mode, the pattern "fake_quant + quantized_op + fake_dequant" + * can be detected by this fuser. The fuser extracts the input_scale and + * weight_scale info from the fake_quant and fake_dequant ops and fuses + * them into the quantized_op. + * Finally, the fuser deletes the fake_quant and fake_dequant ops from the + * graph. + */ +class QuantDequantOpFuser : public FuseBase { + public: + explicit QuantDequantOpFuser(const std::string& op_type, + const std::string& quant_type, int times) + : op_type_(op_type), quant_type_(quant_type), times_(times) {} + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + + private: + std::string op_type_{"conv2d"}; + std::string quant_type_; + int times_; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle
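A pass that uses this fuser would instantiate it once per quantized op type and repetition count. The sketch below is hypothetical and for illustration only: the quant op name "fake_quantize_range_abs_max", the range of `times`, and the assumption that FuseBase exposes `operator()(SSAGraph*)` are not taken from this patch.

    // Hypothetical driver for the fuser above (illustrative sketch).
    void ApplyQuantDequantFusion(paddle::lite::mir::SSAGraph* graph) {
      for (const auto* op_type : {"conv2d", "depthwise_conv2d", "mul"}) {
        // `times` = how many quantized ops consume the same fake_quant output.
        for (int times = 4; times >= 1; times--) {
          paddle::lite::mir::fusion::QuantDequantOpFuser fuser(
              op_type, "fake_quantize_range_abs_max", times);
          fuser(graph);  // assumes FuseBase::operator()(SSAGraph*)
        }
      }
    }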
diff --git a/paddle/fluid/lite/core/mir/generate_program_pass.cc b/paddle/fluid/lite/core/mir/generate_program_pass.cc index 3751b6f1f84..97586d74842 100644 --- a/paddle/fluid/lite/core/mir/generate_program_pass.cc +++ b/paddle/fluid/lite/core/mir/generate_program_pass.cc @@ -24,12 +24,12 @@ namespace lite { namespace mir { void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) { - LOG(INFO) << "final program \n" << Visualize(graph.get()); + VLOG(4) << "final program \n" << Visualize(graph.get()); for (auto& item : graph->StmtTopologicalOrder()) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op, std::move(stmt.valid_kernels.front())); + VLOG(4) << stmt; + insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } } diff --git a/paddle/fluid/lite/core/mir/graph_visualize_pass.cc b/paddle/fluid/lite/core/mir/graph_visualize_pass.cc index 6a13bafd67c..141c1af8eee 100644 @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/lite/core/mir/pass_registry.h" +#include "paddle/fluid/lite/utils/string.h" namespace paddle { namespace lite { @@ -39,7 +40,7 @@ std::string Visualize(mir::SSAGraph* graph) { if (node.IsArg()) { key = node.AsArg().name; } else { - key = node.AsStmt().op_type + std::to_string(id++); + key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++); } if (node.IsStmt()) { diff --git a/paddle/fluid/lite/core/mir/io_copy_kernel_pick_pass.cc b/paddle/fluid/lite/core/mir/io_copy_kernel_pick_pass.cc index ebf9e5a57bf..9f38ce01ba1 100644 @@ -25,11 +25,11 @@ class IoCopyKernelPickPass : public StmtPass { for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; auto& inst = node.AsStmt(); - if (inst.op_type != "io_copy") continue; + if (inst.op_type() != "io_copy") continue; LOG(INFO) << "....> picking a IO COPY kernel"; - auto& kernels = node.AsStmt().valid_kernels; + auto& kernels = node.AsStmt().kernels(); CHECK(!kernels.empty()) << "No valid kernels found for IoCopy Op"; const auto* inty = node.inlinks.front()->AsArg().type; const auto* outy = node.outlinks.front()->AsArg().type; diff --git a/paddle/fluid/lite/core/mir/node.cc b/paddle/fluid/lite/core/mir/node.cc index 711ff508f23..814df2b61a2 100644 --- a/paddle/fluid/lite/core/mir/node.cc +++ b/paddle/fluid/lite/core/mir/node.cc @@ -13,3 +13,62 @@ // limitations under the License. #include "paddle/fluid/lite/core/mir/node.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +const OpInfo *mir::Node::Stmt::op_info() const { + CHECK(op_); + return op_->op_info(); +} + +Place mir::Node::Stmt::place() const { + CHECK(!valid_kernels_.empty()); + return valid_kernels_.front()->place(); +} + +KernelBase &mir::Node::Stmt::picked_kernel() { + CHECK(!valid_kernels_.empty()) << "no kernel for " << op_type(); + return *valid_kernels_.front(); +} + +OpInfo *mir::Node::Stmt::mutable_op_info() { + CHECK(op_); + return op_->mutable_op_info(); +} + +void mir::Node::Stmt::ResetOp(const cpp::OpDesc &op_desc, + const std::vector<Place> &valid_places, + lite::Scope *scope) { + CHECK((op_ && op_->scope()) || scope) << "Either scope should be set"; + lite::Scope *the_scope = scope ? scope : op_->scope(); + op_->Attach(op_desc, the_scope); + // Recreate the kernels with the latest OpInfo. + valid_kernels_.clear(); + + if (!op_ || op_->op_info()->Type() != op_desc.Type()) { + op_ = LiteOpRegistry::Global().Create(op_desc.Type()); + CHECK(op_) << "No op found for " << op_desc.Type(); + } + valid_kernels_ = op_->CreateKernels(valid_places); +} + +std::ostream &mir::operator<<(std::ostream &os, const mir::Node::Stmt &other) { + os << "Statement " << other.op_type() << " " << other.place(); + return os; +} + +mir::Node::Arg &mir::Node::AsArg(const std::string &name, int id) { + auto &x = AsArg(); + x.name = name; + x.id = id; + return x; +} +mir::Node::Arg &mir::Node::AsArg(const std::string &name) { + auto &x = AsArg(); + x.name = name; + return x; +} +} // namespace lite +} // namespace paddle
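Since Stmt's members become private in node.h below, call sites go through the new accessors; the call-site updates throughout this patch all follow one pattern. A small illustrative sketch (not part of the patch):

    // Accessor-based call pattern after this change (old form in comments).
    void InspectStmt(paddle::lite::mir::Node* node) {
      auto& stmt = node->AsStmt();
      VLOG(4) << "op type: " << stmt.op_type();     // was: stmt.op_type
      auto op = stmt.op();                          // was: stmt.op
      auto& kernels = stmt.kernels();               // was: stmt.valid_kernels
      if (!kernels.empty()) VLOG(4) << "first kernel: " << kernels.front()->name();
    }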
diff --git a/paddle/fluid/lite/core/mir/node.h b/paddle/fluid/lite/core/mir/node.h index a5fd90dac48..08b7a963e79 100644 --- a/paddle/fluid/lite/core/mir/node.h +++ b/paddle/fluid/lite/core/mir/node.h @@ -41,32 +41,40 @@ class Node { kUnk, }; - struct Stmt { - std::string op_type; + class Stmt { // The kernel instances this Statement contains. - std::vector<std::unique_ptr<KernelBase>> valid_kernels; + std::vector<std::unique_ptr<KernelBase>> valid_kernels_; // TODO(Superjomn) make this a shared_ptr for resource safety. - std::shared_ptr<OpLite> op; // we hold op to run InferShape + std::shared_ptr<OpLite> op_; // we hold op to run InferShape - const OpInfo* op_info() { - CHECK(op); - return op->op_info(); - } + public: + // Refresh the operator and kernels with the latest OpInfo. + void ResetOp(const cpp::OpDesc& op_desc, + const std::vector<Place>& valid_places, + lite::Scope* scope = nullptr); - Place place() const { - CHECK(!valid_kernels.empty()); - return valid_kernels.front()->place(); - } + std::string op_type() const { return op_info()->Type(); } + const OpInfo* op_info() const; + OpInfo* mutable_op_info(); - KernelBase& picked_kernel() { - CHECK(!valid_kernels.empty()) << "no kernel for " << op_type; - return *valid_kernels.front(); + void SetKernels(std::vector<std::unique_ptr<KernelBase>>&& kernels) { + valid_kernels_ = std::move(kernels); } - - friend std::ostream& operator<<(std::ostream& os, const Stmt& other) { - os << "Statement " << other.op_type << " " << other.place(); - return os; + std::vector<std::unique_ptr<KernelBase>>& kernels() { + return valid_kernels_; } + + void SetOp(const std::shared_ptr<OpLite>& op) { op_ = op; } + const std::shared_ptr<OpLite> op() const { return op_; } + + Place place() const; + + KernelBase& picked_kernel(); + + friend std::ostream& operator<<(std::ostream& os, const Stmt& other); + + // Description. + std::string desc; }; struct Arg { @@ -78,26 +86,16 @@ class Node { bool is_weight{false}; }; - Arg& AsArg(const std::string& name, int id) { - auto& x = AsArg(); - x.name = name; - x.id = id; - return x; - } + Arg& AsArg(const std::string& name, int id); - Arg& AsArg(const std::string& name) { - auto& x = AsArg(); - x.name = name; - return x; - } + Arg& AsArg(const std::string& name); Stmt& AsStmt(const std::string& op_type, std::vector<std::unique_ptr<KernelBase>>&& kernels, const std::shared_ptr<OpLite>& op) { auto& x = AsStmt(); - x.op_type = op_type; - x.op = op; - x.valid_kernels = std::move(kernels); + x.SetOp(op); + x.SetKernels(std::move(kernels)); return x; } @@ -142,7 +140,7 @@ class Node { } if (other.IsStmt()) { auto& arg = other.AsStmt(); - os << "Statement " << arg.op_type; + os << "Statement " << arg.op_type(); } return os; } diff --git a/paddle/fluid/lite/core/mir/pass_registry.h b/paddle/fluid/lite/core/mir/pass_registry.h index 0586845f3ce..d95f97ed029 100644 --- a/paddle/fluid/lite/core/mir/pass_registry.h +++ b/paddle/fluid/lite/core/mir/pass_registry.h @@ -15,6 +15,7 @@ #pragma once #include +#include "paddle/fluid/lite/api/paddle_lite_factory_helper.h" #include "paddle/fluid/lite/core/mir/pass_manager.h" namespace paddle { @@ -24,7 +25,7 @@ namespace mir { class PassRegistry { public: PassRegistry(const std::string& name, mir::Pass* pass) { - VLOG(2) << "Registry add MIR pass " << name; + // VLOG(2) << "Registry add MIR pass " << name; PassManager::Global().AddNewPass(name, pass); } @@ -41,8 +42,3 @@ class PassRegistry { bool mir_pass_registry##name__##_fake() { \ return mir_pass_registry##name__.Touch(); \ } - -#define USE_MIR_PASS(name__) \ - extern bool mir_pass_registry##name__##_fake(); \ - static bool mir_pass_usage##name__ __attribute__((unused)) = \ - mir_pass_registry##name__##_fake();
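The USE_MIR_PASS force-link macro is deleted here; judging by the include added above, it presumably now lives in paddle_lite_factory_helper.h (an inference from this patch, not shown in it). Call sites keep the same shape:

    // Usage stays the same at call sites (sketch); the macro definition now
    // comes via paddle_lite_factory_helper.h.
    USE_MIR_PASS(static_kernel_pick_pass);
    USE_MIR_PASS(type_target_cast_pass);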
diff --git a/paddle/fluid/lite/core/mir/pattern_matcher.cc b/paddle/fluid/lite/core/mir/pattern_matcher.cc index 7524312db8b..6250ee8ee00 100644 --- a/paddle/fluid/lite/core/mir/pattern_matcher.cc +++ b/paddle/fluid/lite/core/mir/pattern_matcher.cc @@ -115,7 +115,6 @@ void PatternMatcher::operator()(SSAGraph *graph, bool PatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) { VLOG(3) << "mark pmnodes in graph"; if (graph->nodes().empty()) return false; - for (auto &node : graph->mutable_nodes()) { for (const auto &pmnode : pattern_.nodes()) { if (pmnode->Tell(&node)) { @@ -326,7 +325,7 @@ std::string PMPattern::DotString() const { // Create Nodes std::unordered_map<PMNode *, std::string> node2dot; for (const auto &node : nodes()) { - std::string node_id = "Node" + std::to_string(id++); + std::string node_id = string_format("Node%d", id++); dot.AddNode(node_id, {}, node->name()); node2dot[node.get()] = node_id; } @@ -398,7 +397,7 @@ PMNode *PMNode::assert_is_op_output(const std::string &op_type) { asserts_.emplace_back([=](const Node *x) { for (auto *op : x->inlinks) { if (op && op->IsStmt()) { - auto *op_info = x->stmt()->op_info(); + auto *op_info = op->stmt()->op_info(); if (op_info->Type() == op_type) return true; } } @@ -409,8 +408,8 @@ PMNode *PMNode::assert_is_op_output(const std::string &op_type) { bool IsNthOutput(const Node *var, const Node *op, const std::string &argument, size_t nth) { - PADDLE_ENFORCE(var->IsArg()); - PADDLE_ENFORCE(op->IsStmt()); + CHECK(var->IsArg()); + CHECK(op->IsStmt()); auto op_info = op->stmt()->op_info(); if (op_info->Output(argument).size() <= nth) return false; return var->arg()->name == op_info->Output(argument)[nth]; @@ -418,8 +417,8 @@ bool IsNthOutput(const Node *var, const Node *op, const std::string &argument, bool IsNthInput(const Node *var, const Node *op, const std::string &argument, size_t nth) { - PADDLE_ENFORCE(var->IsArg()); - PADDLE_ENFORCE(op->IsStmt()); + CHECK(var->IsArg()); + CHECK(op->IsStmt()); auto op_info = op->stmt()->op_info(); if (op_info->Input(argument).size() <= nth) return false; return var->arg()->name == op_info->Input(argument)[nth]; diff --git a/paddle/fluid/lite/core/mir/pattern_matcher.h b/paddle/fluid/lite/core/mir/pattern_matcher.h index ff9fbce35dd..adbe7efd151 100644 --- a/paddle/fluid/lite/core/mir/pattern_matcher.h +++ b/paddle/fluid/lite/core/mir/pattern_matcher.h @@ -30,6 +30,7 @@ #include "paddle/fluid/lite/core/mir/node.h" #include "paddle/fluid/lite/core/mir/ssa_graph.h" #include "paddle/fluid/lite/model_parser/pb/op_desc.h" +#include "paddle/fluid/lite/utils/string.h" namespace paddle { namespace lite { @@ -139,14 +140,13 @@ struct PMNode { template <typename T> PMNode* assert_op_attr(const std::string& attr_name, const T& attr) { - asserts_.emplace_back([=](Node* x) { + asserts_.push_back([=](const Node* x) { if (x && x->IsStmt()) { auto* op_info = x->stmt()->op_info(); return op_info->HasAttr(attr_name) && op_info->GetAttr<T>(attr_name) == attr; - } else { - return false; } + return false; }); return this; } @@ -229,7 +229,7 @@ class PMPattern { FRIEND_TEST(PMPattern, NewNode); #endif - static std::string NewID() { return "pmnode-" + std::to_string(id_++); } + static std::string NewID() { return string_format("pmnode-%d", id_++); } std::vector<std::unique_ptr<PMNode>> nodes_; std::vector<std::pair<PMNode *, PMNode *>> edges_; diff --git a/paddle/fluid/lite/core/mir/pattern_matcher_high_api.cc b/paddle/fluid/lite/core/mir/pattern_matcher_high_api.cc index 5dc929cda5e..322ddb29064 100644 --- a/paddle/fluid/lite/core/mir/pattern_matcher_high_api.cc +++ b/paddle/fluid/lite/core/mir/pattern_matcher_high_api.cc @@ -20,7 +20,7 @@ namespace lite { namespace mir { void FuseBase::PerformPatternMatcher(SSAGraph *graph) { - LOG(INFO) << "\n" << matcher_.pattern().DotString(); + VLOG(4) << "\n" << matcher_.pattern().DotString(); // Get subgraphs and record the mir::Node pointers for each PMNode. auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) { // get all the registered nodes. @@ -41,17 +41,15 @@ void FuseBase::DeleteInterNodes(SSAGraph *graph) { } } - LOG(INFO) << "keys.size " << keys.size(); - + VLOG(4) << "keys: " << key2nodes_.size(); std::unordered_set<const Node *> nodes2rm; for (auto &matched : key2nodes_) { - LOG(INFO) << "get matched " << matched.size(); for (const auto &key : keys) { nodes2rm.insert(matched.at(key)); } } - LOG(INFO) << "clean nodes " << nodes2rm.size(); + VLOG(3) << "clean nodes " << nodes2rm.size(); GraphSafeRemoveNodes(graph, nodes2rm); }
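With the templated assert_op_attr above, attribute constraints can be written directly into a pattern. A hedged sketch (the surrounding pattern is hypothetical; enable_int8 is the attribute this patch introduces elsewhere):

    // Inside some BuildPattern(): match only int8-enabled conv2d ops.
    auto* int8_conv = OpNode("conv", "conv2d")
                          ->assert_is_op("conv2d")
                          ->assert_op_attr<bool>("enable_int8", true);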
diff --git a/paddle/fluid/lite/core/mir/pattern_matcher_high_api.h b/paddle/fluid/lite/core/mir/pattern_matcher_high_api.h index b3a23c654bd..7c3f890383d 100644 --- a/paddle/fluid/lite/core/mir/pattern_matcher_high_api.h +++ b/paddle/fluid/lite/core/mir/pattern_matcher_high_api.h @@ -49,7 +49,13 @@ class FuseBase { virtual void BuildPattern() = 0; // Generate an operator desc with a matched subgraph. - virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) = 0; + virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) { + return cpp::OpDesc(); + } + + PMNode* OpNode(const std::string& key) { + return GetOrCreateNode(key)->assert_is_op(); + } PMNode* OpNode(const std::string& key, const std::string& op_type); diff --git a/paddle/fluid/lite/core/mir/pattern_matcher_high_api_test.cc b/paddle/fluid/lite/core/mir/pattern_matcher_high_api_test.cc index 7a46bb9a93d..d0844b0b7ef 100644 --- a/paddle/fluid/lite/core/mir/pattern_matcher_high_api_test.cc +++ b/paddle/fluid/lite/core/mir/pattern_matcher_high_api_test.cc @@ -52,7 +52,7 @@ class FcFuser : public FuseBase { void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { auto op_desc = GenOpDesc(matched); auto fc_op = LiteOpRegistry::Global().Create("fc"); - auto mul = matched.at("mul")->stmt()->op; + auto mul = matched.at("mul")->stmt()->op(); auto* scope = mul->scope(); auto& valid_places = mul->valid_places(); fc_op->Attach(op_desc, scope); @@ -90,7 +90,7 @@ std::unique_ptr<SSAGraph> BuildGraph(framework::ProgramDesc* program_desc, main_block->Var("w"); main_block->Var("out"); - scope->Var("w")->GetMutable<lite::Tensor>(); + scope->Var("x")->GetMutable<lite::Tensor>(); scope->Var("b")->GetMutable<lite::Tensor>(); scope->Var("mul_out")->GetMutable<lite::Tensor>(); scope->Var("w")->GetMutable<lite::Tensor>(); diff --git a/paddle/fluid/lite/core/mir/pattern_matcher_test.cc b/paddle/fluid/lite/core/mir/pattern_matcher_test.cc index 3b082060fe2..8f2ca38f1cc 100644 --- a/paddle/fluid/lite/core/mir/pattern_matcher_test.cc +++ b/paddle/fluid/lite/core/mir/pattern_matcher_test.cc @@ -23,19 +23,19 @@ namespace mir { void BuildGraph(SSAGraph* g) { g->mutable_nodes().emplace_back(); Node& o1 = g->mutable_nodes().back(); - o1.AsStmt().op_type = "op1"; + o1.AsStmt().desc = "op1"; g->mutable_nodes().emplace_back(); Node& o2 = g->mutable_nodes().back(); - o2.AsStmt().op_type = "op2"; + o2.AsStmt().desc = "op2"; g->mutable_nodes().emplace_back(); Node& o3 = g->mutable_nodes().back(); - o3.AsStmt().op_type = "op3"; + o3.AsStmt().desc = "op3"; g->mutable_nodes().emplace_back(); Node& o4 = g->mutable_nodes().back(); - o4.AsStmt().op_type = "op4"; + o4.AsStmt().desc = "op4"; g->mutable_nodes().emplace_back(); Node& o5 = g->mutable_nodes().back(); - o5.AsStmt().op_type = "op5"; + o5.AsStmt().desc = "op5"; g->mutable_nodes().emplace_back(); Node& v1 = g->mutable_nodes().back(); v1.AsArg("var1"); @@ -108,11 +108,11 @@ TEST(PatternMatcher, MarkPMNodesInGraph) { // v2 -> o3(a node named o3) auto* o2 = x.pattern_.NewNode([](const Node* node) { // The teller can be any condition, such as op type, or variable's shape. - return node && node->IsStmt() && node->stmt()->op_type == "op2"; + return node && node->IsStmt() && node->stmt()->desc == "op2"; }); auto* o3 = x.pattern_.NewNode([](const Node* node) { // The teller can be any condition, such as op type, or variable's shape. - return node && node->IsStmt() && node->stmt()->op_type == "op3"; + return node && node->IsStmt() && node->stmt()->desc == "op3"; }); auto* v2 = x.pattern_.NewNode([](const Node* node) { // The teller can be any condition, such as op type, or variable's shape.
@@ -153,8 +153,8 @@ TEST(PatternMatcher, MultiSubgraph) { // op -> var auto* any_op = x.mutable_pattern()->NewNode( [](const Node* node) { - return node->IsStmt() && (node->stmt()->op_type == "op2" || - node->stmt()->op_type == "op3"); + return node->IsStmt() && + (node->stmt()->desc == "op2" || node->stmt()->desc == "op3"); }, "OP0"); auto* any_var = @@ -170,9 +170,9 @@ TEST(PatternMatcher, MultiSubgraph) { int count = 0; PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s, SSAGraph* g) { - LOG(INFO) << "Detect " << s.at(any_op)->stmt()->op_type << " -> " + LOG(INFO) << "Detect " << s.at(any_op)->stmt()->desc << " -> " << s.at(any_var)->arg()->name << " -> " - << s.at(any_op1)->stmt()->op_type; + << s.at(any_op1)->stmt()->desc; count++; }; @@ -197,12 +197,12 @@ TEST(PatternMatcher, IntermediateCheck) { PatternMatcher matcher; auto* op2 = matcher.mutable_pattern()->NewNode( [](const Node* x) { - return x && x->IsStmt() && x->stmt()->op_type == "op2"; + return x && x->IsStmt() && x->stmt()->desc == "op2"; }, "op2"); auto* op3 = matcher.mutable_pattern()->NewNode( [](const Node* x) { - return x && x->IsStmt() && x->stmt()->op_type == "op3"; + return x && x->IsStmt() && x->stmt()->desc == "op3"; }, "op3"); auto* v2 = matcher.mutable_pattern() diff --git a/paddle/fluid/lite/core/mir/pattern_matcher_tester.cc b/paddle/fluid/lite/core/mir/pattern_matcher_tester.cc new file mode 100644 index 00000000000..3b082060fe2 --- /dev/null +++ b/paddle/fluid/lite/core/mir/pattern_matcher_tester.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/core/mir/pattern_matcher.h" + +#include + +namespace paddle { +namespace lite { +namespace mir { + +void BuildGraph(SSAGraph* g) { + g->mutable_nodes().emplace_back(); + Node& o1 = g->mutable_nodes().back(); + o1.AsStmt().op_type = "op1"; + g->mutable_nodes().emplace_back(); + Node& o2 = g->mutable_nodes().back(); + o2.AsStmt().op_type = "op2"; + g->mutable_nodes().emplace_back(); + Node& o3 = g->mutable_nodes().back(); + o3.AsStmt().op_type = "op3"; + g->mutable_nodes().emplace_back(); + Node& o4 = g->mutable_nodes().back(); + o4.AsStmt().op_type = "op4"; + g->mutable_nodes().emplace_back(); + Node& o5 = g->mutable_nodes().back(); + o5.AsStmt().op_type = "op5"; + g->mutable_nodes().emplace_back(); + Node& v1 = g->mutable_nodes().back(); + v1.AsArg("var1"); + g->mutable_nodes().emplace_back(); + Node& v2 = g->mutable_nodes().back(); + v2.AsArg("var2"); + g->mutable_nodes().emplace_back(); + Node& v3 = g->mutable_nodes().back(); + v3.AsArg("var3"); + g->mutable_nodes().emplace_back(); + Node& v4 = g->mutable_nodes().back(); + v4.AsArg("var4"); + + // o1->v1->o2 + o1.outlinks.push_back(&v1); + o2.inlinks.push_back(&v1); + v1.inlinks.push_back(&o1); + v1.outlinks.push_back(&o2); + // o2->v2->o3 + // o2->v2->o4 + o2.outlinks.push_back(&v2); + o3.inlinks.push_back(&v2); + o4.inlinks.push_back(&v2); + v2.inlinks.push_back(&o2); + v2.outlinks.push_back(&o3); + v2.outlinks.push_back(&o4); + // o2->v3->o5 + o2.outlinks.push_back(&v3); + o5.inlinks.push_back(&v3); + v3.inlinks.push_back(&o2); + v3.outlinks.push_back(&o5); + // o3-v4->o5 + o3.outlinks.push_back(&v4); + o5.inlinks.push_back(&v4); + v4.inlinks.push_back(&o3); + v4.outlinks.push_back(&o5); +} + +TEST(PMPattern, NewNode) { + PMPattern x; + auto* n = x.NewNode([](const Node* x) { return true; }); + ASSERT_TRUE(n); + ASSERT_EQ(x.nodes_.size(), 1UL); +} + +TEST(PMPattern, AddEdge) { + PMPattern x; + auto* a = x.NewNode([](const Node* x) { return true; }); + auto* b = x.NewNode([](const Node* x) { return true; }); + ASSERT_TRUE(a); + ASSERT_TRUE(b); + x.AddEdge(a, b); + ASSERT_EQ(x.nodes_.size(), 2UL); + ASSERT_EQ(x.edges_.size(), 1UL); + ASSERT_EQ(x.edges_.front().first, a); + ASSERT_EQ(x.edges_.front().second, b); + + ASSERT_EQ(x.nodes().size(), 2UL); + ASSERT_EQ(x.edges().size(), 1UL); + ASSERT_EQ(x.edges().front().first, a); + ASSERT_EQ(x.edges().front().second, b); +} + +TEST(PatternMatcher, MarkPMNodesInGraph) { + PatternMatcher x; + // mark o2, o3, v2 + + // The pattern is a graph: + // o2(a node named o2) -> v2(a node named v2) + // v2 -> o3(a node named o3) + auto* o2 = x.pattern_.NewNode([](const Node* node) { + // The teller can be any condition, such as op type, or variable's shape. + return node && node->IsStmt() && node->stmt()->op_type == "op2"; + }); + auto* o3 = x.pattern_.NewNode([](const Node* node) { + // The teller can be any condition, such as op type, or variable's shape. + return node && node->IsStmt() && node->stmt()->op_type == "op3"; + }); + auto* v2 = x.pattern_.NewNode([](const Node* node) { + // The teller can be any condition, such as op type, or variable's shape. 
+ return node && node->IsArg() && node->arg()->name == "var2"; + }); + + ASSERT_FALSE(o2->Tell(nullptr)); + ASSERT_FALSE(o3->Tell(nullptr)); + ASSERT_FALSE(v2->Tell(nullptr)); + + x.pattern_.AddEdge(o2, v2); + x.pattern_.AddEdge(v2, o3); + + ASSERT_EQ(x.pattern_.edges().size(), 2UL); + ASSERT_EQ(x.pattern_.edges()[0].first, o2); + ASSERT_EQ(x.pattern_.edges()[0].second, v2); + ASSERT_EQ(x.pattern_.edges()[1].first, v2); + ASSERT_EQ(x.pattern_.edges()[1].second, o3); + + SSAGraph graph; + BuildGraph(&graph); + + x.MarkPMNodesInGraph(&graph); + + ASSERT_EQ(x.pmnodes2nodes_.size(), 3UL); + + auto subgraphs = x.DetectPatterns(); + ASSERT_EQ(subgraphs.size(), 1UL); +} + +TEST(PatternMatcher, MultiSubgraph) { + SSAGraph graph; + BuildGraph(&graph); + + PatternMatcher x; + + // The pattern is a graph: + // op -> var + auto* any_op = x.mutable_pattern()->NewNode( + [](const Node* node) { + return node->IsStmt() && (node->stmt()->op_type == "op2" || + node->stmt()->op_type == "op3"); + }, + "OP0"); + auto* any_var = + x.mutable_pattern() + ->NewNode([](const Node* node) { return node->IsArg(); }, "VAR") + ->AsIntermediate(); + auto* any_op1 = x.mutable_pattern()->NewNode( + [](const Node* node) { return node->IsStmt(); }, "OP1"); + + x.mutable_pattern()->AddEdge(any_op, any_var); + x.mutable_pattern()->AddEdge(any_var, any_op1); + + int count = 0; + PatternMatcher::handle_t handle = [&](const PatternMatcher::subgraph_t& s, + SSAGraph* g) { + LOG(INFO) << "Detect " << s.at(any_op)->stmt()->op_type << " -> " + << s.at(any_var)->arg()->name << " -> " + << s.at(any_op1)->stmt()->op_type; + count++; + }; + + x(&graph, handle); + + // 1. Detect op3 -> var4 -> op5 + // 2. Detect op2 -> var2 -> op3 + // 3. Detect op2 -> var2 -> op4 + // 4. Detect op2 -> var3 -> op5 + // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2 + ASSERT_GE(count, 1); + ASSERT_LE(count, 2); +} + +TEST(PatternMatcher, IntermediateCheck) { + SSAGraph graph; + BuildGraph(&graph); + + // o2->v2->o3 + // o2->v2->o4 + // check o2+o3 fuse, should fail because v2 also link to o4. 
+ PatternMatcher matcher; + auto* op2 = matcher.mutable_pattern()->NewNode( + [](const Node* x) { + return x && x->IsStmt() && x->stmt()->op_type == "op2"; + }, + "op2"); + auto* op3 = matcher.mutable_pattern()->NewNode( + [](const Node* x) { + return x && x->IsStmt() && x->stmt()->op_type == "op3"; + }, + "op3"); + auto* v2 = matcher.mutable_pattern() + ->NewNode( + [](const Node* x) { + return x && x->IsArg() && x->arg()->name == "var2"; + }, + "var2") + ->AsIntermediate(); + v2->LinksFrom({op2}).LinksTo({op3}); + + int count = 0; + matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { + ++count; + }); + EXPECT_EQ(count, 0); + + count = 0; + v2->AsInput(); + matcher(&graph, [&](const PatternMatcher::subgraph_t& g, SSAGraph* graph) { + ++count; + }); + ASSERT_EQ(count, 1); +} + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/core/mir/ssa_graph.cc b/paddle/fluid/lite/core/mir/ssa_graph.cc index ba99a681f79..7df9e2da42f 100644 --- a/paddle/fluid/lite/core/mir/ssa_graph.cc +++ b/paddle/fluid/lite/core/mir/ssa_graph.cc @@ -24,10 +24,10 @@ namespace lite { namespace mir { bool SSAGraph::CheckBidirectionalConnection() { - LOG(INFO) << "node count " << node_storage_.size(); + VLOG(4) << "node count " << node_storage_.size(); for (auto &node : node_storage_) { - if (node.IsStmt()) LOG(INFO) << node.AsStmt().op_info()->Type(); - if (node.IsArg()) LOG(INFO) << node.AsArg().name << " " << node.AsArg().id; + if (node.IsStmt()) VLOG(4) << node.AsStmt().op_info()->Type(); + if (node.IsArg()) VLOG(4) << node.AsArg().name << " " << node.AsArg().id; for (auto *in : node.inlinks) { CHECK(in->outlinks.end() != std::find(in->outlinks.begin(), in->outlinks.end(), &node)); @@ -123,11 +123,9 @@ void SSAGraph::Build(const Program &program, std::unordered_map arg_update_node_map_; for (auto &op : program.ops()) { - LOG(INFO) << op->op_info()->Type(); + VLOG(3) << op->op_info()->Type(); auto *op_node = GraphCreateInstructNode(op, valid_places); - LOG(INFO) << "input:"; for (const std::string &name : op->op_info()->input_names()) { - LOG(INFO) << name; mir::Node *arg_node = nullptr; if (arg_update_node_map_.count(name)) { arg_node = arg_update_node_map_.at(name); @@ -141,9 +139,7 @@ void SSAGraph::Build(const Program &program, CHECK(arg_node->IsRoleSet()); DirectedLink(arg_node, op_node); } - LOG(INFO) << "output:"; for (const std::string &name : op->op_info()->output_names()) { - LOG(INFO) << name; node_storage_.emplace_back(); auto *arg_node = &node_storage_.back(); arg_node->AsArg(name, node_storage_.size() - 1); diff --git a/paddle/fluid/lite/core/mir/ssa_graph.h b/paddle/fluid/lite/core/mir/ssa_graph.h index 7c0e6cef498..0a6f4022dd9 100644 --- a/paddle/fluid/lite/core/mir/ssa_graph.h +++ b/paddle/fluid/lite/core/mir/ssa_graph.h @@ -65,6 +65,10 @@ class SSAGraph : GraphBase { Node *GraphCreateInstructNode(const std::shared_ptr &op, const std::vector &valid_places); + // Device related attributes + const std::vector &valid_places() const { return valid_places_; } + void SetValidPlaces(const std::vector &x) { valid_places_ = x; } + private: mir::Node *Argument(const std::string &name); // Check the bidirectional connection. @@ -89,6 +93,7 @@ class SSAGraph : GraphBase { private: std::list node_storage_; std::map arguments_; + std::vector valid_places_; }; // Remove the link between a -> b. 
diff --git a/paddle/fluid/lite/core/mir/ssa_graph_test.cc b/paddle/fluid/lite/core/mir/ssa_graph_test.cc index 520fcf6e750..98a93b46394 100644 --- a/paddle/fluid/lite/core/mir/ssa_graph_test.cc +++ b/paddle/fluid/lite/core/mir/ssa_graph_test.cc @@ -16,8 +16,8 @@ #include #include #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" #include "paddle/fluid/lite/core/mir/graph_visualize_pass.h" -#include "paddle/fluid/lite/core/mir/passes.h" #include "paddle/fluid/lite/core/op_registry.h" #include "paddle/fluid/lite/core/program_fake_utils.h" @@ -52,4 +52,6 @@ TEST(SSAGraph, test) { } // namespace paddle USE_LITE_OP(fc); -USE_LITE_KERNEL(fc, kHost, kFloat, kNCHW, def); +#ifdef LITE_WITH_X86 +// USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); +#endif diff --git a/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc b/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc index 9d48c123a0c..d703db1f3d0 100644 --- a/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc +++ b/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc @@ -29,29 +29,73 @@ bool KernelScoreCmp(const std::pair<size_t, std::unique_ptr<KernelBase>>& a, } void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) { - CHECK(kernel_pick_factors_.AnyFactorConsidered()) + CHECK(kernel_pick_factors_.any_factor_considered()) << "kernel_pick_factors should be specified first"; CHECK(graph) << "graph not valid"; // sort kernels by the factors. + for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; auto& instruct = node.AsStmt(); + + // Get candidate kernels std::vector<std::pair<size_t, std::unique_ptr<KernelBase>>> scored; - CHECK(!instruct.valid_kernels.empty()) << "No kernels found for " - << instruct.op_type; - for (auto&& kernel : instruct.valid_kernels) { + CHECK(!instruct.kernels().empty()) << "No kernels found for " + << instruct.op_type(); + for (auto&& kernel : instruct.kernels()) { size_t score = KernelGrade(*kernel); scored.emplace_back(score, std::move(kernel)); } std::sort(scored.begin(), scored.end(), KernelScoreCmp); + instruct.kernels().clear(); + + if (!instruct.op_info()->HasAttr("enable_int8")) { + // Move kernel back + // Just keep a single best kernel. + // TODO(Superjomn) reconsider this. + instruct.kernels().emplace_back(std::move(scored.front().second)); + VLOG(2) << "pick " << instruct.kernels().front()->name(); - // Move kernel back - // Just keep a single best kernel. - // TODO(Superjomn) reconsider this. - instruct.valid_kernels.clear(); - instruct.valid_kernels.emplace_back(std::move(scored.front().second)); - VLOG(2) << "pick " << instruct.valid_kernels.front()->name(); + } else { + bool out_type_int8 = true; + // The op's output type is int8 only if every op consuming its output + // has the enable_int8 attr; otherwise it is fp32. + for (auto* out_n : node.outlinks) { + CHECK(out_n->IsArg()); + for (auto* tmp_op : out_n->outlinks) { + CHECK(tmp_op->IsStmt()); + if (!tmp_op->AsStmt().op_info()->HasAttr("enable_int8")) { + out_type_int8 = false; + break; + } + } + if (!out_type_int8) break; + } + + // According to the out type, we pick the kernel. + auto output_arguments = instruct.op_info()->OutputArgumentNames(); + for (auto& candidate : scored) { + bool all_output_type_match = true; + auto expect_output_type = + out_type_int8 ?
PRECISION(kInt8) : PRECISION(kFloat); + + for (auto& arg_name : output_arguments) { + const Type* out_arg_ty = + candidate.second->GetOutputDeclType(arg_name); + if (out_arg_ty->precision() != expect_output_type) { + all_output_type_match = false; + } + } + + if (all_output_type_match) { + instruct.kernels().emplace_back(std::move(candidate.second)); + VLOG(2) << "pick " << instruct.kernels().front()->name(); + break; + } + } + CHECK(!instruct.kernels().empty()) << "No kernels found for " + << instruct.op_type(); + } } }
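The int8 branch above keeps the highest-scoring kernel whose declared output precisions all match what the consumers can accept. The decision rule, restated compactly (illustrative sketch only, not part of the patch):

    // expect == kInt8 only when every consumer of every output of this op
    // carries the enable_int8 attribute; otherwise kFloat.
    bool OutputsMatch(const KernelBase& k,
                      const std::vector<std::string>& out_args,
                      PrecisionType expect) {
      for (const auto& arg : out_args) {
        if (k.GetOutputDeclType(arg)->precision() != expect) return false;
      }
      return true;
    }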
diff --git a/paddle/fluid/lite/core/mir/trans_weigths_pass.cc b/paddle/fluid/lite/core/mir/trans_weigths_pass.cc new file mode 100644 index 00000000000..d7a040e133f --- /dev/null +++ b/paddle/fluid/lite/core/mir/trans_weigths_pass.cc @@ -0,0 +1,171 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/core/mir/trans_weigths_pass.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h" +#include "paddle/fluid/lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void TransWeightPass::Apply(const std::unique_ptr<SSAGraph>& graph) { + // Start from inputs of the graph, those should have place set. + std::list<Node*> nodes; + for (auto& node : graph->mutable_nodes()) { + nodes.push_back(&node); + } + + for (auto& node : nodes) { + if (!node->IsStmt()) continue; + auto& instruct = node->AsStmt(); + if (!instruct.op_info()->HasAttr("enable_int8")) { + continue; + } + std::vector<std::string> output_arg_names = + instruct.op_info()->output_argnames(); + + CHECK(output_arg_names.size() == 1) + << "Currently, the op that supports int8 supports only one output"; + // After static kernel select pass, there is only one kernel here. + const Type* out_arg_ty = + instruct.kernels()[0]->GetOutputDeclType(output_arg_names[0]); + auto out_precision = out_arg_ty->precision(); + bool out_type_int8 = out_precision == PRECISION(kInt8) ? true : false; + float in_scale, out_scale; + + in_scale = instruct.op_info()->GetAttr<float>("input_scale"); + + // Get next input op's input_scale + if (out_type_int8) { + LOG(INFO) << "output_type_int8"; + auto out_node = node->outlinks.front(); + CHECK(out_node->IsArg()); + auto one_adj_op_node = out_node->outlinks.front(); + CHECK(one_adj_op_node->IsStmt()); + auto& one_adj_instruct = one_adj_op_node->AsStmt(); + CHECK(one_adj_instruct.op_info()->HasAttr("enable_int8")); + CHECK(one_adj_instruct.op_info()->HasAttr("input_scale")); + out_scale = one_adj_instruct.op_info()->GetAttr<float>("input_scale"); + instruct.mutable_op_info()->SetAttr("output_scale", out_scale); + } else { + LOG(INFO) << "output_type_fp32"; + } + + std::string op_type = instruct.op_info()->Type(); + std::vector<float> weight_scale; + auto* scope = instruct.op()->scope(); + + if (op_type == "depthwise_conv2d" || op_type == "conv2d") { + std::string weight_var_name = instruct.op_info()->Input("Filter").front(); + auto conv_weight_t = + scope->FindVar(weight_var_name)->GetMutable<lite::Tensor>(); + // till now, all the weights should be float32 type + float* conv_weight_d = conv_weight_t->mutable_data<float>(); + int64_t axis_size = conv_weight_t->dims()[0]; + int64_t inner_size = conv_weight_t->data_size() / axis_size; + weight_scale = + GetWeightScale(conv_weight_d, axis_size, inner_size, 127.0); + + Tensor temp_tensor; + temp_tensor.Resize(conv_weight_t->dims()); + int8_t* temp_data = temp_tensor.mutable_data<int8_t>(); + FP32ToInt8(conv_weight_d, temp_data, weight_scale.data(), axis_size, 1, + inner_size); + conv_weight_t->CopyDataFrom(temp_tensor); + } else if (op_type == "fc" || op_type == "mul") { + std::string weight_arg_name = "W"; + if (op_type == "mul") weight_arg_name = "Y"; + std::string weight_var_name = + instruct.op_info()->Input(weight_arg_name).front(); + + auto fc_weight_t = + scope->FindVar(weight_var_name)->GetMutable<lite::Tensor>(); + // till now, all the weights should be float32 type + float* fc_weight_d = fc_weight_t->mutable_data<float>(); + + CHECK_EQ(fc_weight_t->dims().size(), 2UL); + + int64_t h = fc_weight_t->dims()[0]; + int64_t w = fc_weight_t->data_size() / h; + Tensor trans_w_t, int8_temp_t; + trans_w_t.CopyDataFrom(*fc_weight_t); + float* trans_w_data = trans_w_t.mutable_data<float>(); + int8_temp_t.Resize(fc_weight_t->dims()); + int8_t* int8_temp_data = int8_temp_t.mutable_data<int8_t>(); + // transpose the weight to compute the per-channel weight scale.
+ for (int i = 0; i < h; i++) { + for (int j = 0; j < w; j++) { + trans_w_data[i * w + j] = fc_weight_d[j * h + i]; + } + } + weight_scale = GetWeightScale(trans_w_data, w, h, 127.0); + + int8_t* fc_weight_int8_d = fc_weight_t->mutable_data<int8_t>(); + FP32ToInt8(trans_w_data, int8_temp_data, weight_scale.data(), w, 1, h); + // Retrans back + for (int i = 0; i < w; i++) { + for (int j = 0; j < h; j++) { + fc_weight_int8_d[i * h + j] = int8_temp_data[j * w + i]; + } + } + } + + // Convert fp32 bias to int32 bias + std::vector<std::string> input_arg_names = + instruct.op_info()->InputArgumentNames(); + if (std::find(input_arg_names.begin(), input_arg_names.end(), "Bias") != + input_arg_names.end() && + instruct.op_info()->Input("Bias").size() > 0) { + std::string bias_var_name = instruct.op_info()->Input("Bias").front(); + auto bias_weight_t = + scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>(); + float* bias_weight_d = bias_weight_t->mutable_data<float>(); + + Tensor temp_bias; + temp_bias.Resize(bias_weight_t->dims()); + int* temp_bias_data = temp_bias.mutable_data<int>(); + TransFP32BiasToInt32(bias_weight_d, temp_bias_data, temp_bias.data_size(), + in_scale, weight_scale); + bias_weight_t->CopyDataFrom(temp_bias); + } + + instruct.mutable_op_info()->SetAttr("weight_scale", weight_scale); + + auto original_selected_kernel = std::move(instruct.kernels().front()); + auto updated_op_info = *instruct.mutable_op_info(); + instruct.ResetOp(updated_op_info, graph->valid_places()); + instruct.kernels().clear(); + instruct.kernels().emplace_back(std::move(original_selected_kernel)); + for (auto& kernel : instruct.kernels()) { + LOG(INFO) << "kernel info: " << kernel->name(); + instruct.op()->AttachKernel(kernel.get()); + } + } +} + +void TransWeightPass::SetValidPlaces(const std::vector<Place>& valid_places) { + CHECK(!valid_places.empty()); + valid_places_ = valid_places; +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(trans_weight_pass, paddle::lite::mir::TransWeightPass);
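The conversion above is plain per-channel symmetric quantization, implemented by the GetWeightScale/FP32ToInt8 helpers declared in the header below. A self-contained numeric sketch (not part of the patch):

    // Per output channel c: scale_c = max|w_c| / 127, q = round(w / scale_c).
    #include <cmath>
    #include <cstdio>
    int main() {
      float w[4] = {0.5f, -1.0f, 0.25f, 0.75f};  // one channel's weights
      float max_abs = 0.f;
      for (float v : w) max_abs = std::fmax(max_abs, std::fabs(v));
      float scale = max_abs / 127.f;
      for (float v : w)
        std::printf("%d ", static_cast<int>(std::roundf(v / scale)));
      std::printf("\n");  // prints: 64 -127 32 95
      return 0;
    }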
diff --git a/paddle/fluid/lite/core/mir/trans_weigths_pass.h b/paddle/fluid/lite/core/mir/trans_weigths_pass.h new file mode 100644 index 00000000000..b31cdfb5906 --- /dev/null +++ b/paddle/fluid/lite/core/mir/trans_weigths_pass.h @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/saturate.h" +#include "paddle/fluid/lite/core/mir/pass.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +/* + * TransWeightPass converts the fp32 weights (and biases) of int8-enabled ops + * into the quantized form and records the resulting scales in their op descs. + */ +class TransWeightPass : public ProgramPass { + public: + void Apply(const std::unique_ptr<SSAGraph>& graph) override; + std::vector<float> GetWeightScale(float* in_data, int64_t axis_size, + int64_t inner_size, float scale_factor) { + std::vector<float> scale_out(axis_size); + auto calc_abs_max = [&](float* in, size_t data_size) -> float { + float max_data = 0.0; + for (size_t i = 0; i < data_size; i++) { + if (max_data < std::abs(in[i])) max_data = std::abs(in[i]); + } + return max_data; + }; + for (int c = 0; c < axis_size; c++) { + float* part_in = in_data + c * inner_size; + scale_out[c] = calc_abs_max(part_in, inner_size) / scale_factor; + } + return scale_out; + } + void FP32ToInt8(const float* din, int8_t* dout, const float* scale, + int axis_size, int64_t outer_size, int64_t inner_size) { + int loop_size = axis_size * outer_size; + for (int i = 0; i < loop_size; ++i) { + float inv_scale = 1.f / scale[i % axis_size]; + for (int j = 0; j < inner_size; ++j) { + dout[j] = static_cast<int8_t>(std::roundf(din[j] * inv_scale)); + } + dout += inner_size; + din += inner_size; + } + } + + void TransFP32BiasToInt32(const float* din, int* dout, size_t data_size, + float in_scale, std::vector<float> weight_scale) { + CHECK(data_size == weight_scale.size()) + << "Bias data size should be equal to the weight scale data size."; + for (size_t i = 0; i < data_size; i++) { + dout[i] = + static_cast<int>(std::roundf(din[i] / in_scale / weight_scale[i])); + } + } + + void SetValidPlaces(const std::vector<Place>& valid_places); + + const std::vector<Place>& valid_places() const { return valid_places_; } + + private: + std::vector<Place> valid_places_; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle
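TransFP32BiasToInt32 above folds both scales into the bias: the int8 kernel accumulates q_x * q_w products, which carry a factor of 1/(in_scale * weight_scale[c]), so the bias is pre-divided by the same factor. A one-line check of the rule b_int32 = round(b_fp32 / (in_scale * w_scale)), with assumed example values:

    #include <cmath>
    #include <cstdio>
    int main() {
      float b_fp32 = 0.02f, in_scale = 0.05f, w_scale = 0.004f;
      std::printf("%d\n",
                  static_cast<int>(std::roundf(b_fp32 / in_scale / w_scale)));
      // prints 100, i.e. 0.02 / (0.05 * 0.004)
      return 0;
    }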
diff --git a/paddle/fluid/lite/core/mir/type_precision_cast_pass.cc b/paddle/fluid/lite/core/mir/type_precision_cast_pass.cc new file mode 100644 index 00000000000..c424b2f2386 --- /dev/null +++ b/paddle/fluid/lite/core/mir/type_precision_cast_pass.cc @@ -0,0 +1,166 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/core/mir/type_precision_cast_pass.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/lite/core/mir/graph_visualize_pass.h" +#include "paddle/fluid/lite/core/mir/pass_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) { + // Start from inputs of the graph, those should have place set. + std::list<Node*> nodes; + for (auto& node : graph->mutable_nodes()) { + nodes.push_back(&node); + } + + for (auto& node : nodes) { + if (!node->IsStmt()) continue; + auto inlinks = node->inlinks; + for (auto* in : inlinks) { + ComplementInputs(graph.get(), node, in); + } + } + VLOG(3) << "\n" << Visualize(graph.get()); +} + +void PrecisionCastPass::ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in) { + // If this input is out of date. + if (inst_node->inlinks.end() == + std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) + return; + + CHECK(inst_node->IsStmt()); + auto& inst = inst_node->AsStmt(); + CHECK(in->IsRoleSet()); + CHECK(in->IsArg()); + auto in_arg_name = in->AsArg().name; + std::string tmp; + CHECK(inst.op_info()->GetInputArgname(in_arg_name, &tmp)); + auto decl_arg_type = inst.picked_kernel().GetInputDeclType(tmp); + CHECK(in->AsArg().type); + LOG(INFO) << inst.picked_kernel().name(); + // if (!in->AsArg().is_weight && !PrecisionCompatibleTo(*in->AsArg().type, + // *decl_arg_type)) { + if (!PrecisionCompatibleTo(*in->AsArg().type, *decl_arg_type)) { + LOG(INFO) << "found precision unmatched tensor: " << in->AsArg().name + << " for kernel " << inst.op()->DebugString() << " " + << *in->AsArg().type << " -> " << *decl_arg_type; + // Add a Cast instruction to make the input compatible with the declared + // type. + AddCastInst(*in->AsArg().type, *decl_arg_type, in, graph, inst_node, + graph->valid_places()); + } +} + +void PrecisionCastPass::AddCastInst(const Type& from, const Type& to, Node* in, + SSAGraph* graph, Node* inst_node, + const std::vector<Place>& valid_places) { + CHECK(!valid_places.empty()) << "valid_place should be set"; + + // var -> new_transform_op -> new_var -> inst + // So there will be a new Argument node and a new Cast Statement Node. + CHECK(in->IsArg()); + auto node_id = [&] { return graph->nodes().size(); }; + auto cast_op_output_name = + in->AsArg().name + "/trans/" + std::to_string(node_id()); + auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); + auto* cast_inst = graph->NewInstructNode(); + + // create Op and kernels. + auto cast_op = LiteOpRegistry::Global().Create("calib"); + CHECK(cast_op) << "create op [" << cast_op << "] failed"; + + // Create the new var manually. + inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); + + // Create Calib Instruction. + cpp::OpDesc op_desc; + op_desc.SetType("calib"); + op_desc.SetInput("Input", {in->AsArg().name}); + op_desc.SetOutput("Out", {cast_op_output_name}); + CHECK(inst_node->AsStmt().op_info()->HasAttr("input_scale")); + op_desc.SetAttr("scale", + inst_node->AsStmt().op_info()->GetAttr<float>("input_scale")); + + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + auto kernels = cast_op->CreateKernels(valid_places); + std::vector<std::unique_ptr<KernelBase>> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (in_arg_ty->precision() == from.precision() && + out_arg_ty->precision() == to.precision()) { + is_found = true; + selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt("calib", std::move(selected_kernels), cast_op); + break; + } + } + + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" + << in->AsArg().name << "->" << to << ":" + << inst_node->AsStmt().op_info()->Type(); + + // Remove the old link + RemoveDirectedLink(in, inst_node); + + // Update the original instruction OpDesc.
+ // Update its input to the cast_op_output_name + + // Add new link, var -> new_inst, new_inst->newarg, newarg->inst + DirectedLink(in, cast_inst); + DirectedLink(cast_inst, cast_op_output_arg); + DirectedLink(cast_op_output_arg, inst_node); + + // reset opdesc and update kernel information + UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), in->AsArg().name, + cast_op_output_name); + + // recreate the op + auto original_selected_kernel = + std::move(inst_node->AsStmt().kernels().front()); + auto updated_op_info = *inst_node->AsStmt().mutable_op_info(); + + inst_node->AsStmt().ResetOp(updated_op_info, graph->valid_places()); + inst_node->AsStmt().kernels().clear(); + inst_node->AsStmt().kernels().emplace_back( + std::move(original_selected_kernel)); + for (auto& kernel : inst_node->AsStmt().kernels()) { + LOG(INFO) << "kernel info: " << kernel->name(); + inst_node->AsStmt().op()->AttachKernel(kernel.get()); + } + graph->CheckValid(); +} + +void PrecisionCastPass::SetValidPlaces(const std::vector<Place>& valid_places) { + CHECK(!valid_places.empty()); + valid_places_ = valid_places; +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(type_precision_cast_pass, + paddle::lite::mir::PrecisionCastPass);
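In graph terms, AddCastInst rewires exactly one input edge; everything else stays in place:

    before:  in ────────────────────────────► inst
    after:   in ──► calib ──► in/trans/N ──► inst

where calib is the newly created cast op and in/trans/N is the new argument node named "<name>/trans/<node_id>". The io_copy insertion in type_target_cast_pass.cc further below follows the same shape, for targets instead of precisions.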
diff --git a/paddle/fluid/lite/core/mir/type_precision_cast_pass.h b/paddle/fluid/lite/core/mir/type_precision_cast_pass.h new file mode 100644 index 00000000000..4925d92e59b --- /dev/null +++ b/paddle/fluid/lite/core/mir/type_precision_cast_pass.h @@ -0,0 +1,61 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/lite/core/mir/pass.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace mir { + +static void UpdateInputTo(cpp::OpDesc* desc, const std::string& from, + const std::string& to) { + for (auto& item : *desc->mutable_inputs()) { + for (auto& input : item.second) { + if (input == from) { + input = to; + } + } + } +} + +/* + * This pass inserts the necessary cast (calib) instructions to make data + * transfer between kernels of different precisions possible. + */ +class PrecisionCastPass : public ProgramPass { + public: + void Apply(const std::unique_ptr<SSAGraph>& graph) override; + + void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); + + void AddCastInst(const Type& from, const Type& to, Node* in, SSAGraph* graph, + Node* inst_node, const std::vector<Place>& valid_places); + + void SetValidPlaces(const std::vector<Place>& valid_places); + + const std::vector<Place>& valid_places() const { return valid_places_; } + + private: + std::vector<Place> valid_places_; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/core/mir/type_target_transform_pass.cc b/paddle/fluid/lite/core/mir/type_target_cast_pass.cc similarity index 85% rename from paddle/fluid/lite/core/mir/type_target_transform_pass.cc rename to paddle/fluid/lite/core/mir/type_target_cast_pass.cc index 12dd2dcff06..1d8095cb63f 100644 --- a/paddle/fluid/lite/core/mir/type_target_transform_pass.cc +++ b/paddle/fluid/lite/core/mir/type_target_cast_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/lite/core/mir/type_target_transform_pass.h" +#include "paddle/fluid/lite/core/mir/type_target_cast_pass.h" #include #include #include @@ -20,6 +20,7 @@ #include #include "paddle/fluid/lite/core/mir/graph_visualize_pass.h" #include "paddle/fluid/lite/core/mir/pass_registry.h" +#include "paddle/fluid/lite/utils/string.h" namespace paddle { namespace lite { @@ -62,7 +63,7 @@ void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph, Node* inst_node, CHECK(in->AsArg().type); if (!TargetCompatibleTo(*in->AsArg().type, *decl_arg_type)) { LOG(INFO) << "found Target unmatched tensor: " << in->AsArg().name - << " for kernel " << inst.op->DebugString() << " " + << " for kernel " << inst.op()->DebugString() << " " << *in->AsArg().type << " -> " << *decl_arg_type; // Add an IoCopy instruction to make the input compatible with other dist. AddIoCopyInst(*in->AsArg().type, *decl_arg_type, in, graph, inst_node, @@ -80,7 +81,7 @@ void TypeTargetTransformPass::AddIoCopyInst( CHECK(in->IsArg()); auto node_id = [&] { return graph->nodes().size(); }; auto io_copy_output_name = - in->AsArg().name + "/trans/" + std::to_string(node_id()); + string_format("%s/trans/%d", in->AsArg().name.c_str(), node_id()); auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name); auto* io_copy_inst = graph->NewInstructNode(); @@ -89,7 +90,7 @@ void TypeTargetTransformPass::AddIoCopyInst( CHECK(io_copy_op) << "create op [" << io_copy_op << "] failed"; // CHECK(io_copy_op); // Create the new var manually. - inst_node->AsStmt().op->scope()->Var(io_copy_output_name); + inst_node->AsStmt().op()->scope()->Var(io_copy_output_name); // Create IoCopy Instruction.
cpp::OpDesc op_desc; @@ -97,7 +98,7 @@ op_desc.SetInput("Input", {in->AsArg().name}); op_desc.SetOutput("Out", {io_copy_output_name}); - io_copy_op->Attach(op_desc, inst_node->AsStmt().op->scope()); + io_copy_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); auto kernels = io_copy_op->CreateKernels(valid_places); io_copy_inst->AsStmt("io_copy", std::move(kernels), io_copy_op); @@ -113,19 +114,19 @@ DirectedLink(io_copy_output_arg, inst_node); // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op->mutable_op_info(), in->AsArg().name, + UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), in->AsArg().name, io_copy_output_name); - inst_node->AsStmt().op->Attach(*inst_node->AsStmt().op->op_info(), - inst_node->AsStmt().op->scope()); + inst_node->AsStmt().ResetOp(*inst_node->AsStmt().op_info(), + graph->valid_places()); std::string tmp; if (inst_node->AsStmt().op_info()->GetInputArgname("a", &tmp)) { CHECK(false) << "get old a " << tmp; } - for (auto& kernel : inst_node->AsStmt().valid_kernels) { - inst_node->AsStmt().op->AttachKernel(kernel.get()); + for (auto& kernel : inst_node->AsStmt().kernels()) { + inst_node->AsStmt().op()->AttachKernel(kernel.get()); } graph->CheckValid(); @@ -141,5 +142,5 @@ } // namespace lite } // namespace paddle -REGISTER_MIR_PASS(type_target_transform_pass, +REGISTER_MIR_PASS(type_target_cast_pass, paddle::lite::mir::TypeTargetTransformPass); diff --git a/paddle/fluid/lite/core/mir/type_target_transform_pass.h b/paddle/fluid/lite/core/mir/type_target_cast_pass.h similarity index 100% rename from paddle/fluid/lite/core/mir/type_target_transform_pass.h rename to paddle/fluid/lite/core/mir/type_target_cast_pass.h diff --git a/paddle/fluid/lite/core/mir/variable_place_inference_pass.h b/paddle/fluid/lite/core/mir/variable_place_inference_pass.h index 2128c6d2014..0a5b3c341ab 100644 --- a/paddle/fluid/lite/core/mir/variable_place_inference_pass.h +++ b/paddle/fluid/lite/core/mir/variable_place_inference_pass.h @@ -39,7 +39,7 @@ class VariablePlaceInferencePass : public DebugPass { for (const auto& v : graph->inputs()) { // the feed op might in the inputs if (v->IsStmt()) { - LOG(INFO) << "found kernel in inputs " << v->AsStmt().op_type; + LOG(INFO) << "found kernel in inputs " << v->AsStmt().op_type(); continue; } } @@ -59,10 +59,10 @@ for (auto& x : graph->StmtTopologicalOrder()) { auto& inst = x->AsStmt(); // The IoCopyOp is a tool operator, it won't support the type inference. - if (inst.op_type == "io_copy") continue; + if (inst.op_type() == "io_copy") continue; // LOG(INFO) << "- inferencing type " << // deal with inputs - VLOG(4) << "inferencing op " << inst.op_type; + VLOG(4) << "Inferring op " << inst.op_info()->Repr(); // TODO(zhaolong): Add a check that the node's name is in the op's arguments.
auto get_argname = [&]( @@ -90,12 +90,14 @@ class VariablePlaceInferencePass : public DebugPass { } } + VLOG(3) << "inst " << inst.op_info()->Repr(); for (auto* x_out : x->outlinks) { std::string node_name = x_out->AsArg().name; std::string arg_name = get_argname(node_name, inst.op_info()->outputs()); CHECK(arg_name.size() > 0) << "cannot find op arguments for node " - << node_name; + << node_name << " in Inst " + << inst.op_type(); VLOG(3) << "-- output arg_name " << arg_name; auto type = inst.picked_kernel().GetOutputDeclType(arg_name); if (!x_out->AsArg().type) { diff --git a/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc b/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc index d6b8561c378..aeffdd2eec4 100644 --- a/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc +++ b/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc @@ -13,7 +13,7 @@ // limitations under the License. #include -#include "paddle/fluid/lite/core/mir/passes.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" #include "paddle/fluid/lite/core/optimizer.h" #include "paddle/fluid/lite/core/program_fake_utils.h" #include "paddle/fluid/lite/kernels/cuda/use_kernels.h" @@ -60,7 +60,7 @@ TEST(variable_place_inference_pass, test) { "argument_type_display_pass", // "variable_place_inference_pass", // "argument_type_display_pass", // - "type_target_transform_pass", // + "type_target_cast_pass", // }); Place prefered_place{ diff --git a/paddle/fluid/lite/core/naive_test_model.py b/paddle/fluid/lite/core/naive_test_model.py index 832661e5ee8..f89a5e115fa 100644 --- a/paddle/fluid/lite/core/naive_test_model.py +++ b/paddle/fluid/lite/core/naive_test_model.py @@ -18,10 +18,10 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid.backward import append_backward -a = fluid.layers.data(name="a", shape=[100], dtype='float32') -label = fluid.layers.data(name="label", shape=[100], dtype='float32') +a = fluid.layers.data(name="a", shape=[2], dtype='float32') +label = fluid.layers.data(name="label", shape=[10], dtype='float32') -a1 = fluid.layers.fc(input=a, size=500, act=None, bias_attr=False) +a1 = fluid.layers.fc(input=a, size=3, act=None, bias_attr=False) cost = fluid.layers.square_error_cost(a1, label) avg_cost = fluid.layers.mean(cost) @@ -36,7 +36,7 @@ exe.run(fluid.default_startup_program()) with open('startup_program.pb', 'wb') as f: f.write(fluid.default_startup_program().desc.serialize_to_string()) -data_1 = np.array(numpy.random.random([100, 100]), dtype='float32') +#data_1 = np.array(numpy.random.random([100, 100]), dtype='float32') #fluid.default_main_program().desc.
@@ -50,7 +50,7 @@ with open('main_program.pb', 'wb') as f: #outs = exe.run(program=prog, feed={'a':data_1, }, fetch_list=[cost]) -sys.exit(0) +#sys.exit(0) fluid.io.save_inference_model("./model2", [a.name], [a1], exe) -print(numpy.array(outs)) +#print(numpy.array(outs)) diff --git a/paddle/fluid/lite/core/op_lite.cc b/paddle/fluid/lite/core/op_lite.cc index 484d22abf52..bbcf3d30988 100644 --- a/paddle/fluid/lite/core/op_lite.cc +++ b/paddle/fluid/lite/core/op_lite.cc @@ -30,6 +30,8 @@ std::vector> OpLite::CreateKernels( auto pick_kernel = [&](const Place &place) { auto ks = KernelRegistry::Global().Create(op_type_, place.target, place.precision, place.layout); + VLOG(5) << "pick kernel for " << op_info()->Type() << " " << place + << " get " << ks.size() << " kernels"; for (auto &&it : ks) { AttachKernel(it.get()); kernels.emplace_back(std::move(it)); @@ -61,7 +63,6 @@ std::vector> OpLite::CreateKernels( targets.insert(place.target); } - // CHECK(!kernels.empty()) << "No kernel found for Op " << op_type_; VLOG(2) << "op " << op_type_ << " get " << kernels.size() << " kernels"; return kernels; } @@ -83,7 +84,7 @@ bool OpLite::Attach(const cpp::OpDesc &opdesc, lite::Scope *scope) { scope_ = scope; op_info_.reset( new OpInfo(opdesc)); // Force clean the out-of-date infomation. - return AttachImpl(opdesc, scope); + return AttachImpl(*op_info(), scope); } const Tensor *OpLite::GetTensor(lite::Scope *scope, diff --git a/paddle/fluid/lite/core/op_lite.h b/paddle/fluid/lite/core/op_lite.h index 922aa2304e4..cd7d9ef8449 100644 --- a/paddle/fluid/lite/core/op_lite.h +++ b/paddle/fluid/lite/core/op_lite.h @@ -54,12 +54,10 @@ class OpLite : public Registry { OpLite() = default; explicit OpLite(const std::string &type) : op_type_(type) {} explicit OpLite(const std::vector &valid_places) - : valid_places_(valid_places) { - LOG(INFO) << "valid places " << valid_places.size(); - } + : valid_places_(valid_places) {} void SetValidPlaces(const std::vector &places) { - LOG(INFO) << "valid places " << valid_places_.size(); + VLOG(3) << "valid places " << valid_places_.size(); valid_places_ = places; } const std::vector &valid_places() const { return valid_places_; } @@ -199,6 +197,22 @@ class OpInfo : public cpp::OpDesc { } return false; } + + void UpdateAllInputs(const std::string &from, const std::string &to) { + for (auto &item : inputs_) { + for (auto &var : item.second) { + if (var == from) var = to; + } + } + } + + void UpdateAllOutputs(const std::string &from, const std::string &to) { + for (auto &item : outputs_) { + for (auto &var : item.second) { + if (var == from) var = to; + } + } + } }; } // namespace lite diff --git a/paddle/fluid/lite/core/op_registry.cc b/paddle/fluid/lite/core/op_registry.cc index 8c3e44733df..5bdb4d49f4a 100644 --- a/paddle/fluid/lite/core/op_registry.cc +++ b/paddle/fluid/lite/core/op_registry.cc @@ -42,6 +42,8 @@ std::list> KernelRegistry::Create( CREATE_KERNEL1(target__, kFloat); \ case PRECISION(kInt8): \ CREATE_KERNEL1(target__, kInt8); \ + case PRECISION(kInt64): \ + CREATE_KERNEL1(target__, kInt64); \ case PRECISION(kAny): \ CREATE_KERNEL1(target__, kAny); \ default: \ @@ -62,6 +64,9 @@ std::list> KernelRegistry::Create( case TARGET(kARM): { CREATE_KERNEL(kARM); } break; + case TARGET(kOpenCL): { + CREATE_KERNEL(kOpenCL); + } break; default: CHECK(false) << "not supported kernel target " << TargetToStr(target); } @@ -92,12 +97,18 @@ KernelRegistry::KernelRegistry() INIT_FOR(kHost, kAny, kAny); INIT_FOR(kX86, kFloat, kNCHW); + INIT_FOR(kX86, kInt64, kNCHW); 
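+  // The (kX86, kInt64) slot is new in this patch; presumably it is needed so
+  // that int64-typed kernels (e.g. the lookup-table/embedding lookups a
+  // step-RNN model performs) can register on x86.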
INIT_FOR(kX86, kAny, kNCHW); INIT_FOR(kX86, kAny, kAny); INIT_FOR(kARM, kFloat, kNCHW); + INIT_FOR(kARM, kInt8, kNCHW); INIT_FOR(kARM, kAny, kNCHW); INIT_FOR(kARM, kAny, kAny); + + INIT_FOR(kOpenCL, kFloat, kNCHW); + INIT_FOR(kOpenCL, kAny, kNCHW); + INIT_FOR(kOpenCL, kAny, kAny); #undef INIT_FOR } diff --git a/paddle/fluid/lite/core/op_registry.h b/paddle/fluid/lite/core/op_registry.h index 49332262deb..54edf6180b2 100644 --- a/paddle/fluid/lite/core/op_registry.h +++ b/paddle/fluid/lite/core/op_registry.h @@ -20,6 +20,7 @@ #include #include #include +#include "paddle/fluid/lite/api/paddle_lite_factory_helper.h" #include "paddle/fluid/lite/core/kernel.h" #include "paddle/fluid/lite/core/op_lite.h" #include "paddle/fluid/lite/core/target_wrapper.h" @@ -32,7 +33,6 @@ namespace lite { using KernelFunc = std::function; using KernelFuncCreator = std::function()>; - class LiteOpRegistry final : public Factory> { public: static LiteOpRegistry &Global() { @@ -69,6 +69,8 @@ class KernelRegistry final { DATALAYOUT(kNCHW)> *, // KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // KernelRegistryForTarget *, // KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget * // >; @@ -91,9 +99,9 @@ class KernelRegistry final { void Register(const std::string &name, typename KernelRegistryForTarget::creator_t &&creator) { - // VLOG(3) << "register for " << TargetToStr(Target) << ":" - //<< PrecisionToStr(Precision) << "//" - //<< GetKernelOffset(); + /*VLOG(3) << "register for " << TargetToStr(Target) << ":" + << PrecisionToStr(Precision) << "//" + << GetKernelOffset();*/ using kernel_registor_t = KernelRegistryForTarget; auto &varient = registries_[GetKernelOffset()]; @@ -153,6 +161,9 @@ class KernelRegistor : public lite::Registor { public: KernelRegistor(const std::string &op_type, const std::string &alias) : Registor([=] { + /*VLOG(3) << "Register kernel " << op_type << " for " + << TargetToStr(target) << " " << PrecisionToStr(precision) + << " " << DataLayoutToStr(layout) << " alias " << alias;*/ KernelRegistry::Global().Register( op_type, [=]() -> std::unique_ptr { std::unique_ptr x(new KernelType); @@ -168,7 +179,6 @@ class KernelRegistor : public lite::Registor { // Operator registry #define LITE_OP_REGISTER_INSTANCE(op_type__) op_type__##__registry__instance__ -#define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ #define REGISTER_LITE_OP(op_type__, OpClass) \ static paddle::lite::OpLiteRegistor LITE_OP_REGISTER_INSTANCE( \ op_type__)(#op_type__); \ @@ -176,11 +186,6 @@ class KernelRegistor : public lite::Registor { return LITE_OP_REGISTER_INSTANCE(op_type__).Touch(); \ } -#define USE_LITE_OP(op_type__) \ - extern int touch_op_##op_type__(); \ - int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \ - touch_op_##op_type__(); - // Kernel registry #define LITE_KERNEL_REGISTER(op_type__, target__, precision__) \ op_type__##__##target__##__##precision__##__registor__ @@ -210,12 +215,6 @@ class KernelRegistor : public lite::Registor { TARGET(target__), PRECISION(precision__), DATALAYOUT(layout__)>( \ #op_type__ "/" #alias__) -#define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ - extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ - int op_type__##target__##precision__##layout__##alias__ \ - __attribute__((unused)) = \ - touch_##op_type__##target__##precision__##layout__##alias__(); - #define LITE_KERNEL_INSTANCE(op_type__, target__, 
precision__, layout__,                       \
                             alias__)                                     \
  op_type__##target__##precision__##layout__##alias__
diff --git a/paddle/fluid/lite/core/optimizer.cc b/paddle/fluid/lite/core/optimizer.cc
index 1502d15e2bf..0c08688aa7d 100644
--- a/paddle/fluid/lite/core/optimizer.cc
+++ b/paddle/fluid/lite/core/optimizer.cc
@@ -15,7 +15,7 @@
 #include "paddle/fluid/lite/core/optimizer.h"
 #include
 #include "paddle/fluid/lite/core/mir/static_kernel_pick_pass.h"
-#include "paddle/fluid/lite/core/mir/type_target_transform_pass.h"
+#include "paddle/fluid/lite/core/mir/type_target_cast_pass.h"
 #include "paddle/fluid/lite/model_parser/model_parser.h"
 #include "paddle/fluid/lite/utils/all.h"
diff --git a/paddle/fluid/lite/core/optimizer.h b/paddle/fluid/lite/core/optimizer.h
index 651cd981c76..7c6e4a2dff1 100644
--- a/paddle/fluid/lite/core/optimizer.h
+++ b/paddle/fluid/lite/core/optimizer.h
@@ -20,7 +20,7 @@
 #include "paddle/fluid/lite/core/mir/pass_manager.h"
 #include "paddle/fluid/lite/core/mir/ssa_graph.h"
 #include "paddle/fluid/lite/core/mir/static_kernel_pick_pass.h"
-#include "paddle/fluid/lite/core/mir/type_target_transform_pass.h"
+#include "paddle/fluid/lite/core/mir/type_target_cast_pass.h"
 #include "paddle/fluid/lite/core/program.h"
 #include "paddle/fluid/lite/core/types.h"
 #include "paddle/fluid/lite/model_parser/model_parser.h"
@@ -43,30 +43,46 @@ class Optimizer {
     CHECK(!graph_) << "duplicate optimize found";
     graph_.reset(new mir::SSAGraph);
     graph_->Build(program, valid_places);
+    graph_->SetValidPlaces(valid_places);
+
     SpecifyKernelPickTactic(kernel_pick_factor);
     InitTargetTypeTransformPass();
 
-#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
     if (passes.empty()) {
-      RunPasses(std::vector{{
-          "lite_conv_bn_fuse_pass",                  //
-          "lite_conv_elementwise_add_act_fuse_pass", //
-          "lite_fc_fuse_pass",                       //
-          "static_kernel_pick_pass",                 //
-          "variable_place_inference_pass",           //
-          "argument_type_display_pass",              //
-          "type_target_transform_pass",              //
-          "argument_type_display_pass",              //
-          "variable_place_inference_pass",           //
-          "argument_type_display_pass",              //
-          "io_copy_kernel_pick_pass",                //
-          "variable_place_inference_pass",           //
-          "runtime_context_assign_pass",             //
-      }});
+      RunPasses(std::vector{
+          {"lite_quant_dequant_fuse_pass",  //
+           "lite_conv_bn_fuse_pass",        //
+// This pass is disabled to force some OpenCL kernels to be selected for the
+// final run; otherwise they would be fused into ARM fusion kernels and the
+// OpenCL devices would be discarded.
+// TODO(Superjomn) Refine the fusion-related design so that fusion kernels are
+// selected for devices automatically.
+#ifndef LITE_WITH_OPENCL + "lite_conv_elementwise_add_activation_fuse_pass", // +#endif + "lite_fc_fuse_pass", // + "identity_scale_eliminate_pass", // +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +#ifndef LITE_WITH_OPENCL + "lite_elementwise_add_activation_fuse_pass", // +#endif +#endif + "static_kernel_pick_pass", // + "variable_place_inference_pass", // + "argument_type_display_pass", // + "type_target_cast_pass", // + "variable_place_inference_pass", // + "argument_type_display_pass", // + "io_copy_kernel_pick_pass", // + "variable_place_inference_pass", // + "type_precision_cast_pass", // + "argument_type_display_pass", // + "trans_weight_pass", // + "runtime_context_assign_pass", // + "graph_visualze"}}); } else { RunPasses(passes); } -#endif exec_scope_ = program.exec_scope(); } @@ -93,7 +109,7 @@ class Optimizer { void InitTargetTypeTransformPass() { auto* pass = mir::PassManager::Global().LookUp( - "type_target_transform_pass"); + "type_target_cast_pass"); CHECK(pass); CHECK(!valid_places_.empty()); LOG(INFO) << "valid_places.size " << valid_places_.size(); @@ -121,7 +137,7 @@ class Optimizer { for (auto& x : passes) { LOG(INFO) << "== Running pass " << x; auto* pass = mir::PassManager::Global().LookUp(x); - CHECK(pass); + CHECK(pass) << "Can not find pass: " << x; pass->Apply(graph_); } } diff --git a/paddle/fluid/lite/core/optimizer_test.cc b/paddle/fluid/lite/core/optimizer_test.cc index ae543dc1b19..016dfdf15e0 100644 --- a/paddle/fluid/lite/core/optimizer_test.cc +++ b/paddle/fluid/lite/core/optimizer_test.cc @@ -16,9 +16,9 @@ #include #include #include +#include "paddle/fluid/lite/api/paddle_use_passes.h" #include "paddle/fluid/lite/core/mir/generate_program_pass.h" #include "paddle/fluid/lite/core/mir/pass_manager.h" -#include "paddle/fluid/lite/core/mir/passes.h" #include "paddle/fluid/lite/core/mir/static_kernel_pick_pass.h" #include "paddle/fluid/lite/core/program_fake_utils.h" diff --git a/paddle/fluid/lite/core/profile/CMakeLists.txt b/paddle/fluid/lite/core/profile/CMakeLists.txt index 43731e8a414..92ac495b6b6 100644 --- a/paddle/fluid/lite/core/profile/CMakeLists.txt +++ b/paddle/fluid/lite/core/profile/CMakeLists.txt @@ -4,3 +4,4 @@ endif() lite_cc_library(basic_profiler_lite SRCS basic_profiler.cc) lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler_lite) + diff --git a/paddle/fluid/lite/core/profile/basic_profiler.cc b/paddle/fluid/lite/core/profile/basic_profiler.cc index 86d5cd39ea9..75b1a48d3ad 100644 --- a/paddle/fluid/lite/core/profile/basic_profiler.cc +++ b/paddle/fluid/lite/core/profile/basic_profiler.cc @@ -19,7 +19,7 @@ namespace lite { namespace profile { const int BasicTimer::data_w = 10; -const int BasicTimer::name_w = 10; +const int BasicTimer::name_w = 15; } // namespace profile } // namespace lite diff --git a/paddle/fluid/lite/core/profile/basic_profiler.h b/paddle/fluid/lite/core/profile/basic_profiler.h index 16a9905f1ae..73e5b47ef93 100644 --- a/paddle/fluid/lite/core/profile/basic_profiler.h +++ b/paddle/fluid/lite/core/profile/basic_profiler.h @@ -59,6 +59,7 @@ class BasicTimer : TimerBase { uint64_t count_{}; uint32_t max_{std::numeric_limits::min()}; uint32_t min_{std::numeric_limits::max()}; + uint32_t max_count_{}; int id_{-1}; std::string key_; std::chrono::time_point timer_{}; @@ -75,18 +76,23 @@ class BasicTimer : TimerBase { void SetKey(const std::string &key) { key_ = key; } void Start() { timer_ = std::chrono::high_resolution_clock::now(); } void Stop() { - auto duration = std::chrono::duration_cast( + auto 
duration = std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - timer_); Log(duration.count()); } int count() const { return count_; } - void Log(uint32_t timespan) { - total_ += timespan; - max_ = std::max(max_, timespan); - min_ = std::min(min_, timespan); + uint32_t max_record = max_; + if (count_ >= 10000) { + total_ += timespan; + max_ = std::max(max_, timespan); + min_ = std::min(min_, timespan); + } count_++; + if (max_record != max_) { + max_count_ = count_; + } } static std::string basic_repr_header() { @@ -95,17 +101,19 @@ class BasicTimer : TimerBase { << std::setw(data_w) << "average" // << std::setw(data_w) << "min" // << std::setw(data_w) << "max" // - << std::setw(data_w) << "count"; + << std::setw(data_w) << "count" // + << std::setw(data_w) << "total_"; return ss.str(); } std::string basic_repr() const { std::stringstream ss; - ss << std::setw(name_w) << key() // - << std::setw(data_w) << ave() // - << std::setw(data_w) << min() // - << std::setw(data_w) << max() // - << std::setw(data_w) << count_; + ss << std::setw(name_w) << key() // + << std::setw(data_w) << ave() // + << std::setw(data_w) << min() // + << std::setw(data_w) << max() // + << std::setw(data_w) << count_ // + << std::setw(data_w) << total_; return ss.str(); } @@ -116,7 +124,12 @@ class BasicTimer : TimerBase { return id_; } - double ave() const { return total_ * 1. / count_; } + double ave() const { + if (count_ > 10000) + return total_ * 1. / (count_ - 10000); + else + return 0; + } double max() const { return max_; } double min() const { return min_; } diff --git a/paddle/fluid/lite/core/program.cc b/paddle/fluid/lite/core/program.cc index 9f12f4b87d8..5a13a4ecc9e 100644 --- a/paddle/fluid/lite/core/program.cc +++ b/paddle/fluid/lite/core/program.cc @@ -64,6 +64,7 @@ void RuntimeProgram::SaveParams(const std::string &dir, void Program::Build(const framework::proto::ProgramDesc &program) { CHECK(ops_.empty()) << "Executor duplicate Build found"; + // Create operators. 
for (const auto &proto_op_desc : program.blocks(0).ops()) { lite::OpDesc op_desc_dummy(proto_op_desc); @@ -98,6 +99,7 @@ void Program::PrepareWorkspace(const framework::proto::ProgramDesc &program) { } else { if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue; weights_.push_back(var_desc.Name()); + if (var_desc.Persistable()) scope_->Var(var_desc.Name()); } } } diff --git a/paddle/fluid/lite/core/program.h b/paddle/fluid/lite/core/program.h index 4f2f65d3fa7..6c997e5796c 100644 --- a/paddle/fluid/lite/core/program.h +++ b/paddle/fluid/lite/core/program.h @@ -140,7 +140,8 @@ class RuntimeProgram { void Run() { for (auto& inst : instructions_) { - LOG(INFO) << ">> Running kernel: " << inst; + VLOG(4) << ">> Running kernel: " << inst.op()->op_info()->Repr() + << " on Target " << TargetToStr(inst.kernel()->target()); inst.Run(); } } @@ -154,6 +155,8 @@ class RuntimeProgram { size_t num_instructions() const { return instructions_.size(); } + const std::vector& instructions() const { return instructions_; } + protected: std::string SerializeProgram(const framework::proto::ProgramDesc& desc); void SaveParams(const std::string& dir, diff --git a/paddle/fluid/lite/core/scope.h b/paddle/fluid/lite/core/scope.h index 57287c17e3e..67bf52774e5 100644 --- a/paddle/fluid/lite/core/scope.h +++ b/paddle/fluid/lite/core/scope.h @@ -27,6 +27,12 @@ namespace lite { class Scope final { public: Scope() {} + // delete below two functions to allow pybind to recognise it cannot make a + // copy + // link: + // https://stackoverflow.com/questions/53807248/pybind11-returning-a-pointer-to-a-container-of-unique-ptr + Scope(const Scope&) = delete; + Scope& operator=(const Scope&) = delete; ~Scope(); Scope& NewScope() const; diff --git a/paddle/fluid/lite/core/target_wrapper.cc b/paddle/fluid/lite/core/target_wrapper.cc index 5ce5f3942d8..ae833d22115 100644 --- a/paddle/fluid/lite/core/target_wrapper.cc +++ b/paddle/fluid/lite/core/target_wrapper.cc @@ -17,31 +17,5 @@ #include "paddle/fluid/lite/utils/all.h" namespace paddle { -namespace lite { - -size_t Place::hash() const { - std::hash h; - size_t hash = h(static_cast(target)); - hash = hash_combine(hash, static_cast(precision)); - hash = hash_combine(hash, static_cast(layout)); - hash = hash_combine(hash, static_cast(device)); - return hash; -} - -bool operator<(const Place &a, const Place &b) { - if (a.target != b.target) return a.target < b.target; - if (a.precision != b.precision) return a.precision < b.precision; - if (a.layout != b.layout) return a.layout < b.layout; - if (a.device != b.device) return a.device < b.device; - return true; -} - -std::string Place::DebugString() const { - std::stringstream os; - os << TargetToStr(target) << "/" << PrecisionToStr(precision) << "/" - << DataLayoutToStr(layout); - return os.str(); -} - -} // namespace lite +namespace lite {} // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/core/target_wrapper.h b/paddle/fluid/lite/core/target_wrapper.h index 1029bf5300e..09cacacae18 100644 --- a/paddle/fluid/lite/core/target_wrapper.h +++ b/paddle/fluid/lite/core/target_wrapper.h @@ -16,7 +16,9 @@ #include #include #include +#include "paddle/fluid/lite/api/paddle_place.h" #include "paddle/fluid/lite/utils/cp_logging.h" + #ifdef LITE_WITH_CUDA #include #include @@ -25,119 +27,17 @@ namespace paddle { namespace lite { -enum class TargetType : int { - kUnk = 0, - kHost, - kX86, - kCUDA, - kARM, - kAny, // any target - NUM, // number of fields. 
-}; -enum class PrecisionType : int { - kUnk = 0, - kFloat, - kInt8, - kAny, // any precision - NUM, // number of fields. -}; -enum class DataLayoutType : int { - kUnk = 0, - kNCHW, - kAny, // any data layout - NUM, // number of fields. -}; - -// Some helper macro to get a specific TargetType. -#define TARGET(item__) paddle::lite::TargetType::item__ -// Some helper macro to get a specific PrecisionType. -#define PRECISION(item__) paddle::lite::PrecisionType::item__ -#define DATALAYOUT(item__) paddle::lite::DataLayoutType::item__ - -static const std::string& TargetToStr(TargetType target) { - static const std::string target2string[] = {"unk", "host", "x86", "cuda", - "any"}; - auto x = static_cast(target); - CHECK_LT(x, static_cast(TARGET(NUM))); - return target2string[x]; -} - -static const std::string& PrecisionToStr(PrecisionType precision) { - static const std::string precision2string[] = {"unk", "float", "int8_t", - "any"}; - auto x = static_cast(precision); - CHECK_LT(x, static_cast(PRECISION(NUM))); - return precision2string[x]; -} - -static const std::string& DataLayoutToStr(DataLayoutType layout) { - static const std::string datalayout2string[] = {"unk", "NCHW", "any"}; - auto x = static_cast(layout); - CHECK_LT(x, static_cast(DATALAYOUT(NUM))); - return datalayout2string[x]; -} - -static const std::string& TargetRepr(TargetType target) { - static const std::string target2string[] = {"kUnk", "kHost", "kX86", "kCUDA", - "kAny"}; - auto x = static_cast(target); - CHECK_LT(x, static_cast(TARGET(NUM))); - return target2string[x]; -} - -static const std::string& PrecisionRepr(PrecisionType precision) { - static const std::string precision2string[] = {"kUnk", "kFloat", "kInt8", - "kAny"}; - auto x = static_cast(precision); - CHECK_LT(x, static_cast(PRECISION(NUM))); - return precision2string[x]; -} - -static const std::string& DataLayoutRepr(DataLayoutType layout) { - static const std::string datalayout2string[] = {"kUnk", "kNCHW", "kAny"}; - auto x = static_cast(layout); - CHECK_LT(x, static_cast(DATALAYOUT(NUM))); - return datalayout2string[x]; -} - -/* - * Place specifies the execution context of a Kernel or input/output for a - * kernel. It is used to make the analysis of the MIR more clear and accurate. 
- */ -struct Place { - TargetType target{TARGET(kUnk)}; - PrecisionType precision{PRECISION(kUnk)}; - DataLayoutType layout{DATALAYOUT(kUnk)}; - int16_t device{0}; // device ID - - Place() = default; - Place(TargetType target, PrecisionType precision, - DataLayoutType layout = DATALAYOUT(kNCHW), int16_t device = 0) - : target(target), precision(precision), layout(layout), device(device) {} - - bool is_valid() const { - return target != TARGET(kUnk) && precision != PRECISION(kUnk) && - layout != DATALAYOUT(kUnk); - } - - size_t hash() const; - - bool operator==(const Place& other) const { - return target == other.target && precision == other.precision && - layout == other.layout && device == other.device; - } - - bool operator!=(const Place& other) const { return !(*this == other); } - - friend bool operator<(const Place& a, const Place& b); - - friend std::ostream& operator<<(std::ostream& os, const Place& other) { - os << other.DebugString(); - return os; - } - - std::string DebugString() const; -}; +using lite_api::TargetType; +using lite_api::PrecisionType; +using lite_api::DataLayoutType; +using lite_api::PrecisionTypeLength; +using lite_api::TargetToStr; +using lite_api::Place; +using lite_api::PrecisionToStr; +using lite_api::DataLayoutToStr; +using lite_api::TargetRepr; +using lite_api::PrecisionRepr; +using lite_api::DataLayoutRepr; // Memory copy directions. enum class IoDirection { diff --git a/paddle/fluid/lite/core/tensor.h b/paddle/fluid/lite/core/tensor.h index d6980ff8898..2c001c84e4c 100644 --- a/paddle/fluid/lite/core/tensor.h +++ b/paddle/fluid/lite/core/tensor.h @@ -21,6 +21,7 @@ * looks the same. */ +#include #include #include "paddle/fluid/lite/core/target_wrapper.h" @@ -90,6 +91,18 @@ class DDimBase { return os; } + friend bool operator==(const DDimBase &a, const DDimBase &b) { + if (a.size() != b.size()) return false; + for (size_t i = 0; i < a.size(); i++) { + if (a[i] != b[i]) return false; + } + return true; + } + + friend bool operator!=(const DDimBase &a, const DDimBase &b) { + return !(a == b); + } + private: DDimT *self() { return static_cast(this); } const DDimT *const_self() const { return static_cast(this); } @@ -153,6 +166,7 @@ class TensorBase { const void *raw_data() const { return const_self()->data(); } size_t data_size() const { return const_self()->dims().production(); } + size_t memory_size() const { return const_self()->memory_size(); } void ShareDataWith(const TensorBase &other) { self()->ShareDataWith(other); } void CopyDataFrom(const TensorBase &other) { self()->CopyDataFrom(other); } @@ -174,5 +188,12 @@ class TensorBase { } }; +template +bool TensorCompareWith(const TensorT &a, const TensorT &b) { + if (a.dims() != b.dims()) return false; + if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false; + return true; +} + } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/core/type_system.h b/paddle/fluid/lite/core/type_system.h index 46d17e3c33e..d8b88f8e261 100644 --- a/paddle/fluid/lite/core/type_system.h +++ b/paddle/fluid/lite/core/type_system.h @@ -34,11 +34,12 @@ namespace lite { // Type is the definition of all the types that supported by the Variable that // represents as the input and output of an operator or kernel. -// The DNN system is simple, and the architecture can not process that many data -// types as a compiler, or that will turn out to a chaos. 
+// The DNN system is simple, just a list of operators, and the architecture
+// cannot process as many data types as a compiler can, or it would turn into
+// chaos.
 //
 // We should make sure that the supported data types be registered here, and
-// keep the quantity small and avoid using some special data types as op's
+// keep the set small and avoid using some special data types as op's
 // inputs or outputs, such as some runtime cache, those types can't be processed
 // by the MIR.
 //
@@ -51,7 +52,7 @@ namespace lite {
 // transforming operators, for example, a DataLayoutTransformOp can convert a
 // `TensorFp32NCHWTy` to a `TensorFp32NHWCTy`; a IoCopyOp can convert a
 // `TensorFp32NCHWTy(kHost)` to `TensorFp32NCHWTy(kCUDA)`. There are many other
-// convertions between different Types, but there are some unsupportted type
+// convertions between different Types, but there are some unsupported type
 // convertions, for example, there is noway to convert a `UnsupportedTy` to a
 // `TensorAnyTy`.
 //
@@ -165,8 +166,8 @@ class Type : public DataType {
 // -------------------------------- compatible check ---------------------------
 static bool TargetCompatibleTo(const Type& a, const Type& b) {
-  auto is_host = [](TargetType x) {
-    return x == TARGET(kHost) || x == TARGET(kX86);
+  auto is_host = [](TargetType x) -> bool {
+    return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM);
   };
   if (a.IsVoid() || b.IsVoid()) return true;
   if (a.IsTensor() || b.IsTensor()) {
diff --git a/paddle/fluid/lite/core/types.h b/paddle/fluid/lite/core/types.h
index 810901d15ba..50c3d69d854 100644
--- a/paddle/fluid/lite/core/types.h
+++ b/paddle/fluid/lite/core/types.h
@@ -36,10 +36,10 @@ class KernelPickFactor {
   };
 
   // Has any factors considered.
-  bool AnyFactorConsidered() const { return data_; }
+  bool any_factor_considered() const { return data_; }
 
   KernelPickFactor& ConsiderTarget();
-  // Perfer a specific target, e.g. prefer CUDA kernels.
+  // Prefer a specific target, e.g. prefer CUDA kernels.
KernelPickFactor& ConsiderPrecision(); KernelPickFactor& ConsiderDataLayout(); KernelPickFactor& ConsiderDevice(); diff --git a/paddle/fluid/lite/core/variable.h b/paddle/fluid/lite/core/variable.h index d52a813a09c..e4ab30a366b 100644 --- a/paddle/fluid/lite/core/variable.h +++ b/paddle/fluid/lite/core/variable.h @@ -15,12 +15,15 @@ #pragma once #include #include +#include #include "paddle/fluid/lite/core/compatible_tensor.h" #include "paddle/fluid/lite/utils/all.h" namespace paddle { namespace lite { +using FeedFetchList = std::vector; + class Variable { public: template @@ -40,7 +43,9 @@ class Variable { } private: - variant blob_; + // variant blob_; + variant> + blob_; }; } // namespace lite diff --git a/paddle/fluid/lite/cuda/CMakeLists.txt b/paddle/fluid/lite/cuda/CMakeLists.txt index 505759c7d4a..9889b8b1aa0 100644 --- a/paddle/fluid/lite/cuda/CMakeLists.txt +++ b/paddle/fluid/lite/cuda/CMakeLists.txt @@ -4,3 +4,4 @@ endif() nv_library(target_wrapper_cuda SRCS target_wrapper.cc) nv_library(cuda_blas_lite SRCS blas.cc) + diff --git a/paddle/fluid/lite/demo/cxx/Makefile.def b/paddle/fluid/lite/demo/cxx/Makefile.def new file mode 100644 index 00000000000..a5a0b4e2214 --- /dev/null +++ b/paddle/fluid/lite/demo/cxx/Makefile.def @@ -0,0 +1,37 @@ +CXX_DEFINES = -DARM_WITH_OMP -DHPPL_STUB_FUNC -DLITE_WITH_ARM -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK \ + -DLITE_WITH_LINUX -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_WITH_TESTING +LDFLAGS = -latomic -pthread -ldl + +SYSROOT_COMPLILE = --sysroot=/opt/android-ndk-r17c/sysroot + +THIRD_PARTY_LIBS = ../../../third_party/glog/lib/libglog.a \ + ../../../third_party/gflags/lib/libgflags.a + +SYSTEM_INCLUDES = -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/include \ + -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++abi/include \ + -I/opt/android-ndk-r17c/sources/android/support/include \ + -I/opt/android-ndk-r17c/sysroot/usr/include \ + +THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include \ + -I../../../third_party/glog/include + +ifeq ($(ARM_ABI), arm8) + CC = /opt/android-ndk-r17c/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++ + CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=22 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE + CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--gc-sections + SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-24/arch-arm64 + SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a \ + /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a + INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android $(THIRD_PARTY_INCLUDES) +else + CC = /opt/android-ndk-r17c/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-g++ + CXX_FLAGS = -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp -funwind-tables -no-canonical-prefixes \ + -D__ANDROID_API__=22 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE + CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--fix-cortex-a8 -Wl,--gc-sections -Wl,-z,nocopyreloc + SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-22/arch-arm + SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++_static.a \ + /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++abi.a \ + /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libandroid_support.a \ + 
/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libunwind.a
+    INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi $(THIRD_PARTY_INCLUDES)
+endif
diff --git a/paddle/fluid/lite/demo/cxx/README.md b/paddle/fluid/lite/demo/cxx/README.md
new file mode 100644
index 00000000000..8aa38ffdda2
--- /dev/null
+++ b/paddle/fluid/lite/demo/cxx/README.md
@@ -0,0 +1,42 @@
+# C++ Android Demo
+1. Build the docker image with `paddle/fluid/lite/tools/Dockerfile.mobile`.
+2. Start and enter the docker container, then run `wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv8.tar.gz` to download the demo environment. (For the armv7 demo, download with `wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv7.tar.gz`.)
+3. Unpack the downloaded file: `tar zxvf inference_lite_lib.android.armv8.tar.gz`
+4. Run the following commands to prepare the emulator environment
+```shell
+# armv8
+adb kill-server
+adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
+echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a"
+echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 &
+sleep 1m
+```
+```shell
+# armv7
+adb kill-server
+adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
+echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
+echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 &
+sleep 1m
+```
+5. Prepare the model, then build and run the full-API demo
+```shell
+cd inference_lite_lib.android.armv8/demo/cxx/mobile_full
+wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+tar zxvf mobilenet_v1.tar.gz
+make
+adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/
+adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/
+adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api
+adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt"
+```
+On success, the predicted probabilities of the top 10 classes are printed to the console.
+
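The demo binary itself only logs every 100th element of the 1000-way output; ranking the top-10 classes mentioned above is a small post-processing step. A sketch of that step in C++ (illustrative only — `TopK` and the sample scores are hypothetical, not code from this patch):

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Returns the indices of the k largest scores, highest first.
std::vector<int> TopK(const std::vector<float>& scores, int k) {
  std::vector<int> idx(scores.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                    [&](int a, int b) { return scores[a] > scores[b]; });
  idx.resize(k);
  return idx;
}

int main() {
  std::vector<float> probs = {0.01f, 0.70f, 0.05f, 0.20f, 0.04f};
  for (int i : TopK(probs, 3)) std::printf("class %d: %f\n", i, probs[i]);
}
```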
+6. Build and run the light-API demo
+```shell
+cd ../mobile_light
+make
+adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/
+adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api
+adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt"
+```
diff --git a/paddle/fluid/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 b/paddle/fluid/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7
new file mode 100644
index 00000000000..6c9b7413f49
--- /dev/null
+++ b/paddle/fluid/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7
@@ -0,0 +1,22 @@
+ARM_ABI = arm7
+export ARM_ABI
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
+
+CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS)
+
+mobilenetv1_full_api: mobilenetv1_full_api.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS)
+
+mobilenetv1_full_api.o: mobilenetv1_full_api.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_full_api.o -c mobilenetv1_full_api.cc
+
+
+.PHONY: clean
+clean:
+	rm mobilenetv1_full_api.o
+	rm mobilenetv1_full_api
diff --git a/paddle/fluid/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 b/paddle/fluid/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8
new file mode 100644
index 00000000000..7735f74d109
--- /dev/null
+++ b/paddle/fluid/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8
@@ -0,0 +1,22 @@
+ARM_ABI = arm8
+export ARM_ABI
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
+
+CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS)
+
+mobilenetv1_full_api: mobilenetv1_full_api.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS)
+
+mobilenetv1_full_api.o: mobilenetv1_full_api.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_full_api.o -c mobilenetv1_full_api.cc
+
+
+.PHONY: clean
+clean:
+	rm mobilenetv1_full_api.o
+	rm mobilenetv1_full_api
diff --git a/paddle/fluid/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 b/paddle/fluid/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7
new file mode 100644
index 00000000000..66a6d8f31dc
--- /dev/null
+++ b/paddle/fluid/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7
@@ -0,0 +1,22 @@
+ARM_ABI = arm7
+export ARM_ABI
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
+
+CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
+
+mobilenetv1_light_api: mobilenetv1_light_api.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS)
+
+mobilenetv1_light_api.o: mobilenetv1_light_api.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_light_api.o -c mobilenetv1_light_api.cc
+
+
+.PHONY: clean
+clean:
+	rm mobilenetv1_light_api.o
+	rm mobilenetv1_light_api
diff --git a/paddle/fluid/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 b/paddle/fluid/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8
new file mode 100644
index 00000000000..91b281c49c8
--- /dev/null
+++ b/paddle/fluid/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8
@@ -0,0 +1,22 @@
+ARM_ABI = arm8
+export ARM_ABI
+
+include ../Makefile.def
+
+LITE_ROOT=../../../
+
+CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include
+
+CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS)
+
+mobilenetv1_light_api: mobilenetv1_light_api.o
+	$(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS)
+
+mobilenetv1_light_api.o: mobilenetv1_light_api.cc
+	$(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobilenetv1_light_api.o -c mobilenetv1_light_api.cc
+
+
+.PHONY: clean
+clean:
+	rm mobilenetv1_light_api.o
+	rm mobilenetv1_light_api
diff --git a/paddle/fluid/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/paddle/fluid/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
new file mode 100644
index 00000000000..e1c3c1a15e2
--- /dev/null
+++ b/paddle/fluid/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include
+#include
+#include "paddle_api.h"          // NOLINT
+#include "paddle_use_kernels.h"  // NOLINT
+#include "paddle_use_ops.h"      // NOLINT
+#include "paddle_use_passes.h"   // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+DEFINE_string(model_dir, "", "Model dir path.");
+DEFINE_string(optimized_model_dir, "", "Optimized model dir.");
+
+int64_t ShapeProduction(const shape_t& shape) {
+  int64_t res = 1;
+  for (auto i : shape) res *= i;
+  return res;
+}
+
+void RunModel() {
+  // 1. Set CxxConfig
+  CxxConfig config;
+  config.set_model_dir(FLAGS_model_dir);
+  config.set_preferred_place(Place{TARGET(kARM), PRECISION(kFloat)});
+  config.set_valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
+
+  // 2. Create PaddlePredictor by CxxConfig
+  std::shared_ptr predictor =
+      CreatePaddlePredictor(config);
+
+  // 3. Prepare input data
+  std::unique_ptr input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize(shape_t({1, 3, 224, 224}));
+  auto* data = input_tensor->mutable_data();
+  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+    data[i] = 1;
+  }
+
+  // 4. Run predictor
+  predictor->Run();
+
+  // 5. Get output
+  std::unique_ptr output_tensor(
+      std::move(predictor->GetOutput(0)));
+  LOG(INFO) << "Output dim: " << output_tensor->shape()[1] << std::endl;
+  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
+    LOG(INFO) << "Output[" << i << "]: " << output_tensor->data()[i]
+              << std::endl;
+  }
+
+  // 6. Save the optimized model
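+  // (The directory written here is the same one the light-API demo below
+  // points MobileConfig at via set_model_dir.)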
+  predictor->SaveOptimizedModel(FLAGS_optimized_model_dir);
+}
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  RunModel();
+  return 0;
+}
diff --git a/paddle/fluid/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/paddle/fluid/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
new file mode 100644
index 00000000000..aa71b7cc2f5
--- /dev/null
+++ b/paddle/fluid/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
@@ -0,0 +1,68 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include
+#include
+#include
+#include
+#include "paddle_api.h"          // NOLINT
+#include "paddle_use_kernels.h"  // NOLINT
+#include "paddle_use_ops.h"      // NOLINT
+#include "paddle_use_passes.h"   // NOLINT
+
+using namespace paddle::lite_api;  // NOLINT
+
+DEFINE_string(model_dir, "", "Model dir path.");
+
+int64_t ShapeProduction(const shape_t& shape) {
+  int64_t res = 1;
+  for (auto i : shape) res *= i;
+  return res;
+}
+
+void RunModel() {
+  // 1. Set MobileConfig
+  MobileConfig config;
+  config.set_model_dir(FLAGS_model_dir);
+
+  // 2. Create PaddlePredictor by MobileConfig
+  std::shared_ptr predictor =
+      CreatePaddlePredictor(config);
+
+  // 3. Prepare input data
+  std::unique_ptr input_tensor(std::move(predictor->GetInput(0)));
+  input_tensor->Resize({1, 3, 224, 224});
+  auto* data = input_tensor->mutable_data();
+  for (int i = 0; i < ShapeProduction(input_tensor->shape()); ++i) {
+    data[i] = 1;
+  }
+
+  // 4. Run predictor
+  predictor->Run();
+
+  // 5. Get output
+  std::unique_ptr output_tensor(
+      std::move(predictor->GetOutput(0)));
+  LOG(INFO) << "Output dim: " << output_tensor->shape()[1] << std::endl;
+  for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
+    LOG(INFO) << "Output[" << i << "]: " << output_tensor->data()[i]
+              << std::endl;
+  }
+}
+
+int main(int argc, char** argv) {
+  google::ParseCommandLineFlags(&argc, &argv, true);
+  RunModel();
+  return 0;
+}
diff --git a/paddle/fluid/lite/demo/java/README.md b/paddle/fluid/lite/demo/java/README.md
new file mode 100644
index 00000000000..ad97638dc3b
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/README.md
@@ -0,0 +1,79 @@
+# Java Android Demo
+
+To build and run the Android demo app PaddlePredictor under the ./android folder, you need:
+
+1. An Android phone that can run Android apps
+2. A development machine with Android Studio installed
+
+## Manual build
+
+### Build:
+First, inside the PaddleLite development Docker image, pull the latest PaddleLite code and build the inference library
+for your phone's architecture; below we use the arm8 architecture as an example. Enter the paddlelite directory and run the following cmake and make commands:
+
+```
+mkdir -p build.lite.android.arm8.gcc
+cd build.lite.android.arm8.gcc
+
+cmake .. \
+-DWITH_GPU=OFF \
+-DWITH_MKL=OFF \
+-DWITH_LITE=ON \
+-DLITE_WITH_JAVA=ON \
+-DLITE_WITH_CUDA=OFF \
+-DLITE_WITH_X86=OFF \
+-DLITE_WITH_ARM=ON \
+-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \
+-DWITH_TESTING=ON \
+-DARM_TARGET_OS=android -DARM_TARGET_ARCH_ABI=armv8 -DARM_TARGET_LANG=gcc
+
+make publish_inference_lite -j 4
+```
+
+After make finishes, check that these two files exist:
+```
+build.lite.android.arm8.gcc/paddle/fluid/lite/api/android/jni/native/libpaddle_lite_jni.so
+build.lite.android.arm8.gcc/paddle/fluid/lite/api/android/jni/PaddlePredictor.jar
+```
+They are, respectively, the PaddleLite C++ shared library and the Java jar package. They contain the PaddleLite Java API,
+which the Android Java code below will use.
+
+### Copy the .so library and the .jar into the Android demo app:
+Load demo/PaddlePredictor under this folder into Android Studio. Copy the `libpaddle_lite_jni.so` from the previous step
+into every architecture folder under `PaddlePredictor/app/src/main/jinLibs/`; for example, the arm8 folder must contain that .so file.
+Copy the `PaddlePredictor.jar` from the previous step into `PaddlePredictor/app/libs`.
+
+### Copy the model files used by the demo into the app:
+Download our 5 model files and copy them into the `PaddlePredictor/app/src/main/assets` folder.
+The model files to copy and their download addresses:
+
+    inception_v4_simple http://paddle-inference-dist.bj.bcebos.com/inception_v4_simple.tar.gz
+    lite_naive_model http://paddle-inference-dist.bj.bcebos.com/lite_naive_model.tar.gz
+    mobilenet_v1 http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz
+    mobilenet_v2_relu http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2_relu.tar.gz
+    resnet50 http://paddle-inference-dist.bj.bcebos.com/resnet50.tar.gz
+
+After downloading, the assets folder must contain the five model folders above.
+
+## Results of running the Android app
+With the preparation above done, you can build, install, and run the Android demo app. When you run PaddlePredictor,
+expect to wait about 10 seconds; you should then see output like:
+
+    lite_naive_model output: 50.213173, -28.872887
+    expected: 50.2132, -28.8729
+
+    inception_v4_simple test:true
+    time: 2078 ms
+
+    resnet50 test:true
+    time: 2078 ms
+
+    mobilenet_v1 test:true
+    time: 2078 ms
+
+    mobilenet_v2 test:true
+    time: 2078 ms
+
+The demo runs our 5 models. For the first model it prints the actual first two output numbers, with the expected correct
+values on the following line; the error should be below 0.001. For the remaining four models, the test:true text means the
+model output passed the checks the demo applies to it; time is how long that test took.
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/.gitignore b/paddle/fluid/lite/demo/java/android/PaddlePredictor/.gitignore
new file mode 100644
index 00000000000..2b75303ac58
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/.gitignore
@@ -0,0 +1,13 @@
+*.iml
+.gradle
+/local.properties
+/.idea/caches
+/.idea/libraries
+/.idea/modules.xml
+/.idea/workspace.xml
+/.idea/navEditor.xml
+/.idea/assetWizardSettings.xml
+.DS_Store
+/build
+/captures
+.externalNativeBuild
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/.gitignore b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/.gitignore
new file mode 100644
index 00000000000..796b96d1c40
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/build.gradle b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/build.gradle
new file mode 100644
index 00000000000..b86d2f8e3dd
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/build.gradle
@@ -0,0 +1,28 @@
+apply plugin: 'com.android.application'
+
+android {
+    compileSdkVersion 28
+    defaultConfig {
+        applicationId "com.baidu.paddle.lite"
+        minSdkVersion 23
+        targetSdkVersion 28
+        versionCode 1
+        versionName "1.0"
+        testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
+    }
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
+        }
+    }
+}
+
+dependencies {
implementation fileTree(dir: 'libs', include: ['*.jar']) + implementation 'com.android.support:appcompat-v7:28.0.0' + implementation 'com.android.support.constraint:constraint-layout:1.1.3' + testImplementation 'junit:junit:4.12' + androidTestImplementation 'com.android.support.test:runner:1.0.2' + androidTestImplementation 'com.android.support.test.espresso:espresso-core:3.0.2' +} diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro new file mode 100644 index 00000000000..f1b424510da --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java new file mode 100644 index 00000000000..0fe507418c6 --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/androidTest/java/com/baidu/paddle/lite/ExampleInstrumentedTest.java @@ -0,0 +1,26 @@ +package com.baidu.paddle.lite; + +import android.content.Context; +import android.support.test.InstrumentationRegistry; +import android.support.test.runner.AndroidJUnit4; + +import org.junit.Test; +import org.junit.runner.RunWith; + +import static org.junit.Assert.*; + +/** + * Instrumented test, which will execute on an Android device. + * + * @see Testing documentation + */ +@RunWith(AndroidJUnit4.class) +public class ExampleInstrumentedTest { + @Test + public void useAppContext() { + // Context of the app under test. 
+        Context appContext = InstrumentationRegistry.getTargetContext();
+
+        assertEquals("com.baidu.paddle.lite", appContext.getPackageName());
+    }
+}
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml
new file mode 100644
index 00000000000..240078a5877
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/AndroidManifest.xml
@@ -0,0 +1,21 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt
new file mode 100644
index 00000000000..2022a37d2a9
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/assets/README.txt
@@ -0,0 +1,7 @@
+After building PaddleLite in your build folder, copy the following models into this directory:
+
+inception_v4_simple
+lite_naive_model
+mobilenet_v1
+mobilenet_v2_relu
+resnet50
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java
new file mode 100644
index 00000000000..6fd541016be
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/java/com/baidu/paddle/lite/MainActivity.java
@@ -0,0 +1,244 @@
+package com.baidu.paddle.lite;
+
+import android.support.v7.app.AppCompatActivity;
+import android.os.Bundle;
+import android.widget.TextView;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Date;
+import java.util.ArrayList;
+
+public class MainActivity extends AppCompatActivity {
+
+    @Override
+    protected void onCreate(Bundle savedInstanceState) {
+        super.onCreate(savedInstanceState);
+        setContentView(R.layout.activity_main);
+
+        String textOutput = "";
+        ArrayList output;
+        output = runNaiveModel("lite_naive_model");
+        textOutput += "lite_naive_model output: " + output.get(0).getFloatData()[0] + ", "
+                + output.get(1).getFloatData()[1] + "\n";
+        textOutput += "expected: 50.2132, -28.8729\n";
+
+        Date start = new Date();
+        output = runImageModel("inception_v4_simple");
+        Date end = new Date();
+        textOutput += "\ninception_v4_simple test: " + testInceptionV4Simple(output) + "\n";
+        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
+
+        start = new Date();
+        output = runImageModel("resnet50");
+        end = new Date();
+        textOutput += "\nresnet50 test: " + testResnet50(output) + "\n";
+        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
+
+        start = new Date();
+        output = runImageModel("mobilenet_v1");
+        end = new Date();
+        textOutput += "\nmobilenet_v1 test: " + testMobileNetV1(output) + "\n";
+        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
+
+        start = new Date();
+        output = runImageModel("mobilenet_v2_relu");
+        end = new Date();
+        textOutput += "\nmobilenet_v2 test: " + testMobileNetV2Relu(output) + "\n";
+        textOutput += "time: " + (end.getTime() - start.getTime()) + " ms\n";
+
+        TextView textView = findViewById(R.id.text_view);
+        textView.setText(textOutput);
+    }
+
+    public String copyFromAssetsToCache(String modelPath) {
+        String newPath = getCacheDir() + "/" + modelPath;
+        // String newPath = "/sdcard/" + modelPath;
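+        // The native PaddleLite runtime loads models from ordinary files, so
+        // the demo first copies each model directory out of the APK's assets
+        // into the app cache directory before handing the path to a config.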
+        File desDir = new File(newPath);
+
+        try {
+            if (!desDir.exists()) {
+                desDir.mkdir();
+            }
+            for (String fileName : getAssets().list(modelPath)) {
+                InputStream stream = getAssets().open(modelPath + "/" + fileName);
+                OutputStream output = new BufferedOutputStream(new FileOutputStream(newPath + "/" + fileName));
+
+                byte data[] = new byte[1024];
+                int count;
+
+                while ((count = stream.read(data)) != -1) {
+                    output.write(data, 0, count);
+                }
+
+                output.flush();
+                output.close();
+                stream.close();
+            }
+
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+
+        return desDir.getPath();
+    }
+
+    public ArrayList runModel(String modelName, long[] dims, float[] inputBuffer) {
+        String modelPath = copyFromAssetsToCache(modelName);
+
+        // Cxx Model
+        Place[] validPlaces = new Place[2];
+        validPlaces[0] = new Place(Place.TargetType.X86, PrecisionType.FLOAT);
+        validPlaces[1] = new Place(Place.TargetType.ARM, PrecisionType.FLOAT);
+        Place preferredPlace = validPlaces[1];
+
+        CxxConfig cxxConfig = new CxxConfig();
+        cxxConfig.setModelDir(modelPath);
+        cxxConfig.setPreferredPlace(preferredPlace);
+        cxxConfig.setValidPlaces(validPlaces);
+
+        PaddlePredictor predictor = PaddlePredictor.createPaddlePredictor(cxxConfig);
+
+        Tensor input = predictor.getInput(0);
+        input.resize(dims);
+        input.setData(inputBuffer);
+
+        predictor.run();
+        Tensor cxxOutput = predictor.getOutput(0);
+
+        String optimizedModelPath = modelPath + ".opt";
+        if (!new File(optimizedModelPath).exists()) {
+            predictor.saveOptimizedModel(optimizedModelPath);
+        }
+
+        // Mobile Model
+        MobileConfig mobileConfig = new MobileConfig();
+        mobileConfig.setModelDir(optimizedModelPath);
+        predictor = PaddlePredictor.createPaddlePredictor(mobileConfig);
+
+        input = predictor.getInput(0);
+        input.resize(dims);
+        input.setData(inputBuffer);
+        predictor.run();
+
+        Tensor mobileOutput = predictor.getOutput(0);
+
+        ArrayList result = new ArrayList<>();
+        result.add(cxxOutput);
+        result.add(mobileOutput);
+        return result;
+    }
+
+
+    public ArrayList runNaiveModel(String modelName) {
+        long[] dims = {100, 100};
+        float[] inputBuffer = new float[10000];
+        for (int i = 0; i < 10000; ++i) {
+            inputBuffer[i] = i;
+        }
+        return runModel(modelName, dims, inputBuffer);
+    }
+
+    /**
+     * Input size is 3 * 224 * 224
+     *
+     * @param modelName
+     * @return
+     */
+    public ArrayList runImageModel(String modelName) {
+        long[] dims = {1, 3, 224, 224};
+        int item_size = 3 * 224 * 224;
+        float[] inputBuffer = new float[item_size];
+        for (int i = 0; i < item_size; ++i) {
+            inputBuffer[i] = 1;
+        }
+        return runModel(modelName, dims, inputBuffer);
+    }
+
+    public boolean equalsNear(float a, float b, float delta) {
+        return a >= b - delta && a <= b + delta;
+    }
+
+    public boolean expectedResult(float[] expected, ArrayList result) {
+        if (result.size() != 2) {
+            return false;
+        }
+        if (expected.length != 20) {
+            return false;
+        }
+
+        Tensor tensor = result.get(0);
+        Tensor tensor1 = result.get(1);
+
+        long[] shape = tensor.shape();
+        long[] shape1 = tensor1.shape();
+
+        if (shape.length != 2 || shape1.length != 2) {
+            return false;
+        }
+
+        if (shape[0] != 1 || shape1[0] != 1 || shape[1] != 1000 || shape1[1] != 1000) {
+            return false;
+        }
+
+        float[] output = tensor.getFloatData();
+        // Bug fix: compare the mobile output against the cxx output; the
+        // original read tensor.getFloatData() twice, comparing a tensor to
+        // itself.
+        float[] output1 = tensor1.getFloatData();
+
+        if (output.length != output1.length || output.length != 1000) {
+            return false;
+        }
+        for (int i = 0; i < output.length; ++i) {
+            if (!equalsNear(output[i], output1[i], 1e-6f)) {
+                return false;
+            }
+        }
+        int step = 50;
+        for (int i = 0; i <
expected.length; ++i) { + if (!equalsNear(output[i * step], expected[i], 1e-6f)) { + return false; + } + } + + return true; + } + + public boolean testInceptionV4Simple(ArrayList output) { + float[] expected = {0.0011684548f, 0.0010390386f, 0.0011301535f, 0.0010133048f, + 0.0010259597f, 0.0010982729f, 0.00093195855f, 0.0009141837f, + 0.00096620916f, 0.00089982944f, 0.0010064574f, 0.0010474789f, + 0.0009782845f, 0.0009230255f, 0.0010548076f, 0.0010974824f, + 0.0010612885f, 0.00089107914f, 0.0010112736f, 0.00097655767f}; + return expectedResult(expected, output); + } + + public boolean testResnet50(ArrayList output) { + float[] expected = {0.00024139918f, 0.00020566184f, 0.00022418296f, 0.00041731037f, + 0.0005366107f, 0.00016948722f, 0.00028638865f, 0.0009257241f, + 0.00072681636f, 8.531815e-05f, 0.0002129998f, 0.0021168243f, + 0.006387163f, 0.0037145028f, 0.0012812682f, 0.00045948103f, + 0.00013535398f, 0.0002483765f, 0.00076759676f, 0.0002773295f}; + return expectedResult(expected, output); + } + + public boolean testMobileNetV1(ArrayList output) { + float[] expected = {0.00019130898f, 9.467885e-05f, 0.00015971427f, 0.0003650665f, + 0.00026431272f, 0.00060884043f, 0.0002107942f, 0.0015819625f, + 0.0010323516f, 0.00010079765f, 0.00011006987f, 0.0017364529f, + 0.0048292773f, 0.0013995157f, 0.0018453331f, 0.0002428986f, + 0.00020211363f, 0.00013668182f, 0.0005855956f, 0.00025901722f}; + return expectedResult(expected, output); + } + + public boolean testMobileNetV2Relu(ArrayList output) { + float[] expected = {0.00017082224f, 5.699624e-05f, 0.000260885f, 0.00016412718f, + 0.00034818667f, 0.00015230637f, 0.00032959113f, 0.0014772735f, + 0.0009059976f, 9.5378724e-05f, 5.386537e-05f, 0.0006427285f, + 0.0070957416f, 0.0016094646f, 0.0018807327f, 0.00010506048f, + 6.823785e-05f, 0.00012269315f, 0.0007806194f, 0.00022354358f}; + return expectedResult(expected, output); + } + +} diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml new file mode 100644 index 00000000000..1f6bb290603 --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml @@ -0,0 +1,34 @@ + + + + + + + + + + + diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 00000000000..0d025f9bf6b --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml new file mode 100644 index 00000000000..0d1e60b97e1 --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml @@ -0,0 +1,19 @@ + + + + + + \ No newline at end of file diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml new file mode 100644 index 00000000000..eca70cfe52e --- 
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml
new file mode 100644
index 00000000000..1f6bb290603
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable-v24/ic_launcher_foreground.xml
@@ -0,0 +1,34 @@
[34-line vector drawable; XML markup lost in extraction]
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml
new file mode 100644
index 00000000000..0d025f9bf6b
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/drawable/ic_launcher_background.xml
@@ -0,0 +1,170 @@
[170-line vector drawable; XML markup lost in extraction]
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml
new file mode 100644
index 00000000000..0d1e60b97e1
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/layout/activity_main.xml
@@ -0,0 +1,19 @@
[19-line main-activity layout; XML markup lost in extraction]
\ No newline at end of file
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml
new file mode 100644
index 00000000000..eca70cfe52e
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml
@@ -0,0 +1,5 @@
[5-line adaptive-icon descriptor; XML markup lost in extraction]
\ No newline at end of file
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml
new file mode 100644
index 00000000000..eca70cfe52e
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml
@@ -0,0 +1,5 @@
[5-line adaptive-icon descriptor; XML markup lost in extraction]
\ No newline at end of file
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher.png b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..898f3ed59ac9f3248734a00e5902736c9367d455
GIT binary patch
[literal 2963: binary PNG data omitted]
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher_round.png b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/mipmap-hdpi/ic_launcher_round.png
new file mode 100644
index 0000000000000000000000000000000000000000..dffca3601eba7bf5f409bdd520820e2eb5122c75
GIT binary patch
[literal 4905: binary PNG data omitted]
[GIT binary patches omitted: ic_launcher.png and ic_launcher_round.png for the remaining mipmap densities (xhdpi, xxhdpi, xxxhdpi); the base85-encoded PNG data is not representable here]
zFD>(I?F<}NeZm5#`PrYw0M)P3Kz3*VPJFh2r$Th$n@AOsr`1dhA9WkD|k=MnY0PQDYtoFoJo3AVzoQ(6}uJ5 zwBXm2)hE`7bwu6b&XTa}cPj9p2ZnQpcF_$!1-P{a=mYqW?0lIKJ;w@^$6in|X0*YF`$DQZHSS134zF#>yPW_`4AM znjWs@7CMvwH&w=voOp3Nmp*fLCy%HIhrP5`8tIG_zpnAcnl=|XlAwc5huL$3P(55h z>c_yBe?U^0$VIy65!`OulJGuDnbnWNi(Y(X%(q+=wc|?Q2Wu_JnDJ&$*`0Aw!ZUIi zLNC5ADY4@dQNnc>jc?!5JbOc?nNQyEX>`M5$mfqT$&v=S?+6QQU0tZYtev?)e4p?- zY{z1l6g8L;7w5*j(|auG#MUb~C2FLD6F18@z+LutDU_~ID;*L^^u`B!#;k#f{-zo9?Ko4_oPY}^K;S}Z+?xf&NYM^|v z*pkvo9N^|^q7*<0z0x+Hj+W+}ccPQ$H(-$H-?fpVpC<>uExt9k+(1qEU9M}vo%HvX0RkxaW5 z=KK>pm4^BzfJRm1U%B1g>RZ@jDfLn$`jQ>x1y$v|mymsRDCL?c!YkXHKGa-HgE^c< z&YfRD-oQYl9&jEJOV>1l30cc7hM{sP6OEbF4?M=-nqywL<U9Y?sIr@s$(G5wcSm@dzPD$+RR=zaQD*X%5`4WL^3uN+b)z#*3hP*#P%bC@!UE zZ>`)nYW}1sbTh`W{0WJAY;H1vzX&xGt4PFK9HgIS)leN-3# literal 0 HcmV?d00001 diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml new file mode 100644 index 00000000000..69b22338c65 --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/colors.xml @@ -0,0 +1,6 @@ + + + #008577 + #00574B + #D81B60 + diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml new file mode 100644 index 00000000000..168adfb0a0c --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/strings.xml @@ -0,0 +1,3 @@ + + PaddlePredictor + diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml new file mode 100644 index 00000000000..5885930df6d --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/main/res/values/styles.xml @@ -0,0 +1,11 @@ + + + + + + diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java new file mode 100644 index 00000000000..99dc6d27b35 --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/app/src/test/java/com/baidu/paddle/lite/ExampleUnitTest.java @@ -0,0 +1,17 @@ +package com.baidu.paddle.lite; + +import org.junit.Test; + +import static org.junit.Assert.*; + +/** + * Example local unit test, which will execute on the development machine (host). + * + * @see Testing documentation + */ +public class ExampleUnitTest { + @Test + public void addition_isCorrect() { + assertEquals(4, 2 + 2); + } +} \ No newline at end of file diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/build.gradle b/paddle/fluid/lite/demo/java/android/PaddlePredictor/build.gradle new file mode 100644 index 00000000000..02199bb823f --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/build.gradle @@ -0,0 +1,27 @@ +// Top-level build file where you can add configuration options common to all sub-projects/modules. 
+
+buildscript {
+    repositories {
+        google()
+        jcenter()
+
+    }
+    dependencies {
+        classpath 'com.android.tools.build:gradle:3.4.1'
+
+        // NOTE: Do not place your application dependencies here; they belong
+        // in the individual module build.gradle files
+    }
+}
+
+allprojects {
+    repositories {
+        google()
+        jcenter()
+
+    }
+}
+
+task clean(type: Delete) {
+    delete rootProject.buildDir
+}
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradle.properties b/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradle.properties
new file mode 100644
index 00000000000..743d692ce15
--- /dev/null
+++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradle.properties
@@ -0,0 +1,13 @@
+# Project-wide Gradle settings.
+# IDE (e.g. Android Studio) users:
+# Gradle settings configured through the IDE *will override*
+# any settings specified in this file.
+# For more details on how to configure your build environment visit
+# http://www.gradle.org/docs/current/userguide/build_environment.html
+# Specifies the JVM arguments used for the daemon process.
+# The setting is particularly useful for tweaking memory settings.
+org.gradle.jvmargs=-Xmx1536m
+# When configured, Gradle will run in incubating parallel mode.
+# This option should only be used with decoupled projects. More details, visit
+# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
+# org.gradle.parallel=true
diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.jar b/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000000000000000000000000000000000000..f6b961fd5a86aa5fbfe90f707c3138408be7c718
GIT binary patch
[literal 54329: binary data for the Gradle wrapper JAR omitted]
literal 0 HcmV?d00001
diff --git
a/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties b/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 00000000000..2d135d7b25c --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Wed Jun 26 10:57:21 CST 2019 +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-5.1.1-all.zip diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradlew b/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradlew new file mode 100755 index 00000000000..cccdd3d517f --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradlew @@ -0,0 +1,172 @@ +#!/usr/bin/env sh + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS="" + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? 
-ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin, switch paths to Windows format before running java +if $cygwin ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=$((i+1)) + done + case $i in + (0) set -- ;; + (1) set -- "$args0" ;; + (2) set -- "$args0" "$args1" ;; + (3) set -- "$args0" "$args1" "$args2" ;; + (4) set -- "$args0" "$args1" "$args2" "$args3" ;; + (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; + esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=$(save "$@") + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong +if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then + cd "$(dirname "$0")" +fi + +exec "$JAVACMD" "$@" diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradlew.bat b/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradlew.bat new file mode 100644 index 00000000000..e95643d6a2c --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/gradlew.bat @@ -0,0 +1,84 @@ +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+set DEFAULT_JVM_OPTS= + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/paddle/fluid/lite/demo/java/android/PaddlePredictor/settings.gradle b/paddle/fluid/lite/demo/java/android/PaddlePredictor/settings.gradle new file mode 100644 index 00000000000..e7b4def49cb --- /dev/null +++ b/paddle/fluid/lite/demo/java/android/PaddlePredictor/settings.gradle @@ -0,0 +1 @@ +include ':app' diff --git a/paddle/fluid/lite/gen_code/CMakeLists.txt b/paddle/fluid/lite/gen_code/CMakeLists.txt index bacfc3e988e..8059378011c 100644 --- a/paddle/fluid/lite/gen_code/CMakeLists.txt +++ b/paddle/fluid/lite/gen_code/CMakeLists.txt @@ -4,24 +4,30 @@ lite_cc_library(gen_code_lite SRCS gen_code.cc HVY_DEPS operator) lite_cc_library(paddle_infer_gencode SRCS paddle_infer.cc DEPS program_lite utils_lite) -if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) - lite_cc_test(test_gen_code_lite SRCS gen_code_test.cc DEPS gen_code_lite ${tensor_lite} - mul_op_lite - compatible_pb_lite - model_parser_lite - X86_DEPS mul_compute_x86 - ARM_DEPS mul_compute_arm - ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) +lite_cc_test(test_gen_code_lite SRCS gen_code_test.cc + DEPS gen_code_lite ${tensor_lite} ${host_kernels} ${ops_lite} + compatible_pb_lite + model_parser_lite + X86_DEPS ${x86_kernels} + ARM_DEPS ${arm_kernels} + CL_DEPS ${opencl_kernels} + EXCLUDE_COMPILE_DEPS "ON" + ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) - lite_cc_library(__generated_code__ - SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/lite/gen_code/__generated_code__.cc - DEPS scope_lite op_lite kernel_lite paddle_infer_gencode - ) +lite_cc_library(__generated_code__ + SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/lite/gen_code/__generated_code__.cc + DEPS scope_lite op_lite kernel_lite paddle_infer_gencode + EXCLUDE_COMPILE_DEPS "ON" +) +add_dependencies(__generated_code__ test_gen_code_lite) +add_dependencies(__generated_code__ 
extern_lite_download_lite_naive_model_tar_gz) - lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__ - ${ops_lite} ${host_kernels} - X86_DEPS ${x86_kernels} - ) +lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__ + ${ops_lite} ${host_kernels} + X86_DEPS ${x86_kernels} + ARM_DEPS ${arm_kernels} + CL_DEPS ${opencl_kernels} + EXCLUDE_COMPILE_DEPS "ON" +) - add_dependencies(__generated_code__ test_gen_code_lite) -endif() +lite_cc_binary(paddle_code_generator SRCS paddle_code_generator.cc DEPS model_parser_lite gen_code_lite)
diff --git a/paddle/fluid/lite/gen_code/gen_code.cc b/paddle/fluid/lite/gen_code/gen_code.cc index a50241bb715..c0b219fd0f4 100644 --- a/paddle/fluid/lite/gen_code/gen_code.cc +++ b/paddle/fluid/lite/gen_code/gen_code.cc @@ -111,6 +111,15 @@ void Module::AddOpDescHelper(const std::string &op_id, return std::to_string(desc.GetAttr(name)); case AttrType::STRING: return "\"" + desc.GetAttr<std::string>(name) + "\""; +      case AttrType::FLOATS: { +        auto vals = desc.GetAttr<std::vector<float>>(name); +        return "{" + Join(vals, ",") + "}"; +      } +      case AttrType::INTS: { +        auto vals = desc.GetAttr<std::vector<int>>(name); +        return "{" + Join(vals, ",") + "}"; +      } + case AttrType::STRINGS: { std::vector<std::string> tmp; auto vals = desc.GetAttr<std::vector<std::string>>(name); @@ -137,8 +146,12 @@ return "bool"; case AttrType::STRING: return "std::string"; +    case AttrType::FLOATS: +      return "std::vector<float>"; case AttrType::STRINGS: return "std::vector<std::string>"; +    case AttrType::INTS: +      return "std::vector<int>"; default: LOG(FATAL) << "Unsupported attribute type: " << static_cast<int>(type); } @@ -160,6 +173,8 @@ void Module::AddOp(const cpp::OpDesc &op) { auto op_name = OpUniqueName(); AddOpDescHelper(op_name, op); + LOG(INFO) << "add op " << op_name; + Line(string_format("// Create Op: %s", op.Type().c_str())); Line(string_format("auto %s = lite::LiteOpRegistry::Global().Create(\"%s\");",
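The new FLOATS and INTS branches above serialize a vector attribute into a brace-enclosed literal, so the generated .cc file can pass that text straight into a SetAttr call as a brace-init list. A minimal sketch of the emitted text, assuming only that Join stringifies elements with a delimiter; the join below is a hypothetical stand-in, not the helper from gen_code.cc:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical stand-in for gen_code's Join helper: stringify each element
// and separate consecutive elements with `delim`.
template <typename T>
std::string join(const std::vector<T> &vals, const std::string &delim) {
  std::ostringstream ss;
  for (size_t i = 0; i < vals.size(); ++i) {
    if (i != 0) ss << delim;
    ss << vals[i];
  }
  return ss.str();
}

int main() {
  std::vector<float> scales{0.5f, 1.25f};
  // The FLOATS branch wraps the joined values in braces; the generated source
  // then contains text such as {0.5,1.25}, which is a valid brace-init
  // expression for a std::vector<float> argument.
  std::cout << "{" + join(scales, ",") + "}" << std::endl;  // prints {0.5,1.25}
  return 0;
}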
diff --git a/paddle/fluid/lite/gen_code/gen_code.h b/paddle/fluid/lite/gen_code/gen_code.h index 1a55483f03a..b45ff9370f5 100644 --- a/paddle/fluid/lite/gen_code/gen_code.h +++ b/paddle/fluid/lite/gen_code/gen_code.h @@ -181,9 +181,7 @@ class ProgramCodeGenerator { public: ProgramCodeGenerator(const framework::proto::ProgramDesc &program, const lite::Scope &exec_scope) - : program_(program), exec_scope_(exec_scope) { - LOG(INFO) << program.DebugString(); - } + : program_(program), exec_scope_(exec_scope) {} std::string GenCode() { Module m;
diff --git a/paddle/fluid/lite/gen_code/gen_code_test.cc b/paddle/fluid/lite/gen_code/gen_code_test.cc index c27b775c061..4d0545b5d5b 100644 --- a/paddle/fluid/lite/gen_code/gen_code_test.cc +++ b/paddle/fluid/lite/gen_code/gen_code_test.cc @@ -19,9 +19,10 @@ #include #include #include +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" #include "paddle/fluid/lite/core/compatible_tensor.h" #include "paddle/fluid/lite/core/context.h" -#include "paddle/fluid/lite/core/op_registry.h" #include "paddle/fluid/lite/core/scope.h" #include "paddle/fluid/lite/model_parser/cpp/op_desc.h" #include "paddle/fluid/lite/model_parser/model_parser.h" @@ -48,15 +49,25 @@ TEST(gen_code, manual) { // Set weights. std::vector<float> w0_data({0, 1, 2, 3}); +  std::vector<float> a_data({0, 1, 2, 3}); +#ifdef LITE_WITH_ARM +  w0->Assign<float, lite::DDim, TARGET(kARM)>( +      w0_data.data(), lite::DDim{std::vector<int64_t>({2, 2})}); +  a->Assign<float, lite::DDim, TARGET(kARM)>( +      a_data.data(), lite::DDim{std::vector<int64_t>({2, 2})}); +#else w0->Assign<float, lite::DDim, TARGET(kX86)>( w0_data.data(), lite::DDim{std::vector<int64_t>({2, 2})}); - - std::vector<float> a_data({0, 1, 2, 3}); a->Assign<float, lite::DDim, TARGET(kX86)>( a_data.data(), lite::DDim{std::vector<int64_t>({2, 2})}); +#endif std::vector<Place> valid_places({ +#ifdef LITE_WITH_ARM + Place{TARGET(kARM), PRECISION(kFloat)}, +#else Place{TARGET(kX86), PRECISION(kFloat)}, +#endif Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kHost), PRECISION(kAny)}, }); @@ -71,7 +82,11 @@ mul_op->Attach(mul_op_desc, &tmp_scope); auto mul_kernel = std::move(mul_op->CreateKernels(valid_places).front()); +#ifdef LITE_WITH_ARM + auto fc_ctx = ContextScheduler::Global().NewContext(TARGET(kARM)); +#else auto fc_ctx = ContextScheduler::Global().NewContext(TARGET(kX86)); +#endif mul_op->CheckShape(); mul_op->InferShape(); mul_kernel->SetContext(std::move(fc_ctx)); @@ -94,7 +109,11 @@ TEST(gen_code, auto_gen) { op0.SetAttr("desc", "this is a desc"); op0.SetAttr("x_col", 1); op0.SetAttr("y_col", 2); +#ifdef LITE_WITH_ARM + op0.SetAttr(kKernelTypeAttr, "arm"); +#else op0.SetAttr(kKernelTypeAttr, "x86"); +#endif gencode::Module module; module.AddHeaderIncludeGenCode(); @@ -134,12 +153,3 @@ } // namespace gencode } // namespace lite } // namespace paddle - -USE_LITE_OP(mul); -#ifdef LITE_WITH_X86 -USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); -#endif - -#ifdef LITE_WITH_ARM -USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); -#endif
diff --git a/paddle/fluid/lite/gen_code/generated_code_test.cc b/paddle/fluid/lite/gen_code/generated_code_test.cc index e5874a2e149..83bd5767ac6 100644 --- a/paddle/fluid/lite/gen_code/generated_code_test.cc +++ b/paddle/fluid/lite/gen_code/generated_code_test.cc @@ -24,7 +24,8 @@ TEST(PaddlePredictor, Init) { predictor.Init(); } -TEST(PaddlePredictor, Run) { +#ifdef LITE_WITH_X86 +TEST(PaddlePredictor, RunX86) { gencode::PaddlePredictor predictor; predictor.Init(); @@ -41,6 +42,39 @@ auto output_tensor = predictor.GetOutput(0); LOG(INFO) << "output: " << output_tensor->data<float>()[0]; } +#endif + +#ifdef LITE_WITH_ARM +TEST(PaddlePredictor, RunARM) { + gencode::PaddlePredictor predictor; + predictor.Init(); + + LOG(INFO) << "run the generated code"; + auto input_tensor = predictor.GetInput(0); + input_tensor->Resize(std::vector<int64_t>({1, 100})); + auto* data = input_tensor->mutable_data<float>(); + for (int i = 0; i < 100; i++) { + data[i] = 1; + } + + predictor.Run(); + + std::vector<float> result({0.4350058, -0.6048313, -0.29346266, 0.40377066, + -0.13400325, 0.37114543, -0.3407839, 0.14574292, + 0.4104212, 0.8938774}); + + auto output_tensor = predictor.GetOutput(0); + auto output_shape = output_tensor->shape(); + ASSERT_EQ(output_shape.size(), 2); + ASSERT_EQ(output_shape[0], 1); + ASSERT_EQ(output_shape[1], 500); + + int step = 50; + for (size_t i = 0; i < result.size(); i++) { + EXPECT_NEAR(output_tensor->data<float>()[i * step], result[i], 1e-6); + } +} +#endif } // namespace lite } // namespace paddle
diff --git a/paddle/fluid/lite/gen_code/paddle_code_generator.cc b/paddle/fluid/lite/gen_code/paddle_code_generator.cc new file mode 100644 index 00000000000..cade9206317 --- /dev/null +++ b/paddle/fluid/lite/gen_code/paddle_code_generator.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/lite/gen_code/gen_code.h" +#include "paddle/fluid/lite/model_parser/model_parser.h" + +DEFINE_string(optimized_model, "", ""); +DEFINE_string(generated_code_file, "__generated_code__.cc", ""); + +namespace paddle { +namespace lite { +namespace gencode { + +void GenCode(const std::string& model_dir, const std::string& out_file) { + lite::Scope scope; + framework::proto::ProgramDesc desc; + LoadModel(model_dir, &scope, &desc); + + ProgramCodeGenerator codegen(desc, scope); + + std::ofstream file(out_file); + + file << codegen.GenCode(); + + file.close(); +} + +} // namespace gencode +} // namespace lite +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, false); + paddle::lite::gencode::GenCode(FLAGS_optimized_model, + FLAGS_generated_code_file); + return 0; +}
diff --git a/paddle/fluid/lite/gen_code/paddle_infer.cc b/paddle/fluid/lite/gen_code/paddle_infer.cc index ac4e99cb714..42585a92b3e 100644 --- a/paddle/fluid/lite/gen_code/paddle_infer.cc +++ b/paddle/fluid/lite/gen_code/paddle_infer.cc @@ -25,6 +25,12 @@ void Tensor::Resize(const Tensor::ddim_t &shape) { tensor->Resize(shape); } +std::vector<int64_t> Tensor::shape() const { + CHECK(raw_tensor_); + auto *tensor = static_cast<const lite::Tensor *>(raw_tensor_); + return tensor->dims().Vectorize(); +} + #define FOR_EACH_TYPE(HANDLE) \ HANDLE(int); \ HANDLE(float); \
diff --git a/paddle/fluid/lite/gen_code/paddle_infer.h b/paddle/fluid/lite/gen_code/paddle_infer.h index 99158b0503c..e01ffc25e29 100644 --- a/paddle/fluid/lite/gen_code/paddle_infer.h +++ b/paddle/fluid/lite/gen_code/paddle_infer.h @@ -34,6 +34,8 @@ class Tensor { template <typename T> T *mutable_data(); + ddim_t shape() const; + private: const void *raw_tensor_; void *raw_mutable_tensor_{};
diff --git a/paddle/fluid/lite/host/CMakeLists.txt b/paddle/fluid/lite/host/CMakeLists.txt index 90812f3f3cd..0a400e6a0a9 100644 --- a/paddle/fluid/lite/host/CMakeLists.txt +++ b/paddle/fluid/lite/host/CMakeLists.txt @@ -1 +1,2 @@ -cc_library(target_wrapper_host SRCS target_wrapper.cc) +lite_cc_library(target_wrapper_host SRCS target_wrapper.cc) +
diff --git a/paddle/fluid/lite/host/target_wrapper.cc b/paddle/fluid/lite/host/target_wrapper.cc index 2e8393ef466..af4269390cb 100644 --- a/paddle/fluid/lite/host/target_wrapper.cc +++ b/paddle/fluid/lite/host/target_wrapper.cc @@ -14,15 +14,29 @@ #include "paddle/fluid/lite/core/target_wrapper.h" #include +#include namespace paddle { namespace lite { +const int MALLOC_ALIGN = 64; + void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) { - return new char[size]; + size_t offset = sizeof(void*) + MALLOC_ALIGN - 1; + char* p = static_cast<char*>(malloc(offset + size)); + if (!p) { + return nullptr; + } + void* r = reinterpret_cast<void*>(reinterpret_cast<size_t>(p + offset) & + (~(MALLOC_ALIGN - 1))); + static_cast<void**>(r)[-1] = p; + memset(r, 0, size); + return r; } void TargetWrapper<TARGET(kHost)>::Free(void* ptr) { - delete[] static_cast<char*>(ptr); + if (ptr) { + free(static_cast<void**>(ptr)[-1]); + } } void TargetWrapper<TARGET(kHost)>::MemcpySync(void* dst, const void* src, size_t size, IoDirection dir) {
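The new host Malloc over-allocates by sizeof(void*) + MALLOC_ALIGN - 1 bytes, masks the address down to a 64-byte boundary (which still leaves room below the returned pointer), and stashes the raw malloc() result in the slot immediately before that pointer so Free can recover it. A self-contained sketch of the same trick with illustrative names, not the patch's code; it also mirrors the memset zero-fill:

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>

static const size_t kMallocAlign = 64;  // must be a power of two

void *AlignedMalloc(size_t size) {
  // One pointer-sized stash slot plus worst-case alignment padding.
  size_t offset = sizeof(void *) + kMallocAlign - 1;
  char *p = static_cast<char *>(malloc(offset + size));
  if (!p) return nullptr;
  // Mask down to the alignment boundary; the result is still at least
  // sizeof(void*) bytes past p, so the stash slot below r stays inside
  // the allocation.
  void *r = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(p + offset) &
                                     ~(kMallocAlign - 1));
  static_cast<void **>(r)[-1] = p;  // remember the raw pointer for free()
  memset(r, 0, size);               // zero-fill the usable block
  return r;
}

void AlignedFree(void *ptr) {
  if (ptr) free(static_cast<void **>(ptr)[-1]);
}

int main() {
  void *buf = AlignedMalloc(100);
  assert(reinterpret_cast<uintptr_t>(buf) % kMallocAlign == 0);
  AlignedFree(buf);
  return 0;
}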
 void TargetWrapper<TARGET(kHost)>::MemcpySync(void* dst, const void* src,
                                               size_t size, IoDirection dir) {
diff --git a/paddle/fluid/lite/kernels/CMakeLists.txt b/paddle/fluid/lite/kernels/CMakeLists.txt
index ce22ba12166..abc6d65bb02 100644
--- a/paddle/fluid/lite/kernels/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/CMakeLists.txt
@@ -1,7 +1,10 @@
 message(STATUS "add lite kernels")
-set(lite_kernel_deps type_system kernel_lite op_lite op_registry_lite context_lite ${tensor_lite})
+
+set(lite_kernel_deps type_system kernel_lite op_lite op_registry_lite context_lite ${tensor_lite} CACHE INTERNAL "" FORCE)
+
 add_subdirectory(host)
 add_subdirectory(arm)
 add_subdirectory(cuda)
 add_subdirectory(x86)
+add_subdirectory(opencl)
diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
index ff3cab02ee8..d467d207b8d 100644
--- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt
@@ -4,24 +4,51 @@ endif()

 message(STATUS "compile with lite ARM kernels")

-cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
-cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
-cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
-cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
-cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
-cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(activation_compute_arm SRCS activation_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(elementwise_compute_arm SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(concat_compute_arm SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(dropout_compute_arm SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(calib_compute_arm SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm)
+lite_cc_library(transpose_compute_arm SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)

 lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)
+lite_cc_test(test_activation_compute_arm SRCS activation_compute_test.cc DEPS activation_compute_arm)
 lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
 lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
-lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm)
+lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
+lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS
batch_norm_compute_arm) +lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm) +lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm) +lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm) +lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm) +lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm) +lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm) +# lite_cc_test(test_calib_compute_arm SRCS calib_compute_test.cc DEPS calib_compute_arm) +lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm) set(arm_kernels fc_compute_arm - relu_compute_arm + activation_compute_arm mul_compute_arm scale_compute_arm softmax_compute_arm - elementwise_add_compute_arm) + conv_compute_arm + batch_norm_compute_arm + elementwise_compute_arm + pool_compute_arm + split_compute_arm + concat_compute_arm + dropout_compute_arm + transpose_compute_arm + calib_compute_arm + ) set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels") diff --git a/paddle/fluid/lite/kernels/arm/elementwise_add_compute.cc b/paddle/fluid/lite/kernels/arm/activation_compute.cc similarity index 61% rename from paddle/fluid/lite/kernels/arm/elementwise_add_compute.cc rename to paddle/fluid/lite/kernels/arm/activation_compute.cc index 310cde17bbd..79961f47417 100644 --- a/paddle/fluid/lite/kernels/arm/elementwise_add_compute.cc +++ b/paddle/fluid/lite/kernels/arm/activation_compute.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/lite/kernels/arm/elementwise_add_compute.h" +#include "paddle/fluid/lite/kernels/arm/activation_compute.h" #include "paddle/fluid/lite/arm/math/funcs.h" namespace paddle { @@ -20,13 +20,14 @@ namespace lite { namespace kernels { namespace arm { -void ElementwiseAddCompute::Run() { - auto& param = Param(); - const float* x_data = param.X->data(); - const float* y_data = param.Y->data(); - float* out_data = param.Out->mutable_data(); - int n = param.X->dims().production(); - lite::arm::math::elementwise_add(x_data, y_data, out_data, n); +void ReluCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_relu(x_data, output_data, x_dims.production(), + ctx.threads()); } } // namespace arm @@ -34,9 +35,8 @@ void ElementwiseAddCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, - paddle::lite::kernels::arm::ElementwiseAddCompute, def) +REGISTER_LITE_KERNEL(relu, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::ReluCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/activation_compute.h b/paddle/fluid/lite/kernels/arm/activation_compute.h new file mode 100644 index 00000000000..04e7127b598 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/activation_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class ReluCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~ReluCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/activation_compute_test.cc b/paddle/fluid/lite/kernels/arm/activation_compute_test.cc new file mode 100644 index 00000000000..8761fca5bad --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/activation_compute_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/activation_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +template +void activation_compute_ref(const operators::ActivationParam& param) { + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + DDim x_dims = param.X->dims(); + DDim output_dims = param.Out->dims(); + ASSERT_EQ(x_dims.data(), output_dims.data()); + for (int i = 0; i < output_dims.production(); i++) { + output_data[i] = std::max(0.f, x_data[i]); + } +} + +TEST(activation_arm, retrive_op) { + auto activation = + KernelRegistry::Global().Create("relu"); + ASSERT_FALSE(activation.empty()); + ASSERT_TRUE(activation.front()); +} + +TEST(activation_arm, init) { + ReluCompute activation; + ASSERT_EQ(activation.precision(), PRECISION(kFloat)); + ASSERT_EQ(activation.target(), TARGET(kARM)); +} + +TEST(activation_arm, compute) { + DeviceInfo::Init(); + for (auto n : {1, 2}) { + for (auto c : {6, 32 /*, 128*/}) { + for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) { + for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) { + Tensor x; + Tensor output; + Tensor output_ref; + // set the dims of input, output, ref output tensors + x.Resize({n, c, h, w}); + output.Resize({n, c, h, w}); + output_ref.Resize({n, c, h, w}); + // initialize the data of input tensors + auto* x_data = x.mutable_data(); + auto* output_data = output.mutable_data(); + for (int i = 0; i < x.dims().production(); i++) { + float sign = i % 3 == 0 ? 
-1.0f : 1.0f; + x_data[i] = sign * static_cast(i % 128) * 0.013f; + } + // prepare kernel params and run + ReluCompute activation; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + activation.SetContext(std::move(ctx)); + operators::ActivationParam param; + param.X = &x; + param.Out = &output; + activation.SetParam(param); + activation.Launch(); + // invoking ref implementation and compare results + param.Out = &output_ref; + activation_compute_ref(param); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/batch_norm_compute.cc b/paddle/fluid/lite/kernels/arm/batch_norm_compute.cc new file mode 100644 index 00000000000..0cb43dd5e04 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h" +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void BatchNormCompute::PrepareForRun() { + auto& param = this->Param(); + auto x_dims = param.x->dims(); + bool global_stats = param.is_test || param.use_global_stats; + if (global_stats) { + int64_t channel_size = 0; + switch (param.data_layout) { + case DATALAYOUT(kNCHW): + channel_size = x_dims[1]; + break; + // case DATALAYOUT(kNHWC): + // channel_size = x_dims[x_dims.size() - 1]; + // break; + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(param.data_layout); + break; + } + new_scale.Resize({channel_size}); + new_bias.Resize({channel_size}); + auto* scale_data = param.scale->mutable_data(); + auto* bias_data = param.bias->mutable_data(); + auto* mean_data = param.mean->mutable_data(); + auto* variance_data = param.variance->mutable_data(); + auto* new_scale_data = new_scale.mutable_data(); + auto* new_bias_data = new_bias.mutable_data(); + for (int c = 0; c < channel_size; c++) { + float inv_scale = 1.f / (std::sqrt(variance_data[c] + param.epsilon)); + new_bias_data[c] = + bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; + new_scale_data[c] = inv_scale * scale_data[c]; + } + } +} + +void BatchNormCompute::Run() { + auto& param = this->Param(); + auto x_dims = param.x->dims(); + auto x_data = param.x->mutable_data(); + auto y_data = param.y->mutable_data(); + bool global_stats = param.is_test || param.use_global_stats; + if (global_stats) { + auto* new_scale_data = new_scale.mutable_data(); + auto* new_bias_data = new_bias.mutable_data(); + int64_t outer_size = 0; + int64_t channel_size = 0; + int64_t 
inner_size = 0; + switch (param.data_layout) { + case DATALAYOUT(kNCHW): + outer_size = x_dims[0]; + channel_size = x_dims[1]; + inner_size = x_dims.Slice(2, x_dims.size()).production(); + lite::arm::math::scale(x_data, y_data, outer_size, channel_size, + inner_size, new_scale_data, new_bias_data); + break; + // case DATALAYOUT(kNHWC): + // outer_size = x_dims.Slice(0, x_dims.size() - 1).production(); + // channel_size = x_dims[x_dims.size() - 1]; + // lite::arm::math::scale(x_data, y_data, outer_size, channel_size, + // new_scale_data, new_bias_data); + // break; + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(param.data_layout); + break; + } + } else { + // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and + // saved_variance + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::BatchNormCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/batch_norm_compute.h b/paddle/fluid/lite/kernels/arm/batch_norm_compute.h new file mode 100644 index 00000000000..cf3ad3accde --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class BatchNormCompute : public KernelLite { + public: + using param_t = operators::BatchNormParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~BatchNormCompute() = default; + + private: + Tensor new_scale; + Tensor new_bias; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc b/paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc new file mode 100644 index 00000000000..3ca1a0b599b --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +template +void batch_norm_compute_ref(const operators::BatchNormParam& param) { + DDim x_dims = param.x->dims(); + auto x_data = param.x->mutable_data(); + auto scale_data = param.scale->mutable_data(); + auto bias_data = param.bias->mutable_data(); + auto mean_data = param.mean->mutable_data(); + auto variance_data = param.variance->mutable_data(); + auto y_data = param.y->mutable_data(); + float epsilon = param.epsilon; + float momentum = param.momentum; + DataLayoutType data_layout = param.data_layout; + + bool global_stats = param.is_test || param.use_global_stats; + if (global_stats) { + int64_t outer_size = 0; + int64_t channel_size = 0; + int64_t inner_size = 0; + switch (data_layout) { + case DATALAYOUT(kNCHW): + outer_size = x_dims[0]; + channel_size = x_dims[1]; + inner_size = x_dims.Slice(2, x_dims.size()).production(); + break; + // case DATALAYOUT(kNHWC): + // outer_size = x_dims.Slice(0, x_dims.size() - 1).production(); + // channel_size = x_dims[x_dims.size() - 1]; + // inner_size = 1; + // break; + default: + LOG(FATAL) << "Unknown storage order: " << DataLayoutToStr(data_layout); + break; + } + auto x_ptr = x_data; + auto y_ptr = y_data; + for (int o = 0; o < outer_size; o++) { + for (int c = 0; c < channel_size; c++) { + for (int i = 0; i < inner_size; i++) { + dtype norm_x = + (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); + *y_ptr = norm_x * scale_data[c] + bias_data[c]; + x_ptr++; + y_ptr++; + } + } + } + } else { + // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and + // saved_variance + } +} + +TEST(batch_norm_arm, retrive_op) { + auto batch_norm = + KernelRegistry::Global().Create( + "batch_norm"); + ASSERT_FALSE(batch_norm.empty()); + ASSERT_TRUE(batch_norm.front()); +} + +TEST(batch_norm_arm, init) { + BatchNormCompute batch_norm; + ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat)); + ASSERT_EQ(batch_norm.target(), TARGET(kARM)); +} + +TEST(batch_norm_arm, compute) { + DeviceInfo::Init(); + for (auto n : {1, 2}) { + for (auto c : {6, 32 /*, 128*/}) { + for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) { + for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) { + for (auto is_test : {/*false, */ true}) { + for (auto use_global_stats : {false, true}) { + for (auto epsilon : {1e-4f, 1e-5f}) { + for (auto momentum : {0.9f, 0.99f}) { + for (auto data_layout : + {DATALAYOUT(kNCHW) /*, DATALAYOUT(kNHWC)*/}) { + Tensor x; + Tensor scale; + Tensor bias; + Tensor mean; + Tensor variance; + Tensor y; + Tensor mean_out; + Tensor variance_out; + Tensor saved_mean; + Tensor saved_variance; + Tensor y_ref; + Tensor mean_out_ref; + Tensor variance_out_ref; + Tensor saved_mean_ref; + Tensor saved_variance_ref; + // set the dims of input, output, ref output tensors + std::vector in_out_shape; + switch (data_layout) { + case DATALAYOUT(kNCHW): + in_out_shape = {n, c, h, w}; + break; + // case 
DATALAYOUT(kNHWC): + // in_out_shape = {n, h, w, c}; + // break; + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(data_layout); + break; + } + x.Resize(in_out_shape); + scale.Resize({c}); + bias.Resize({c}); + mean.Resize({c}); + variance.Resize({c}); + y.Resize(in_out_shape); + mean_out.Resize({c}); + variance_out.Resize({c}); + saved_mean.Resize({c}); + saved_variance.Resize({c}); + y_ref.Resize(in_out_shape); + mean_out_ref.Resize({c}); + variance_out_ref.Resize({c}); + saved_mean_ref.Resize({c}); + saved_variance_ref.Resize({c}); + // initialize the data of input tensors + auto* x_data = x.mutable_data(); + auto* scale_data = scale.mutable_data(); + auto* bias_data = bias.mutable_data(); + auto* mean_data = mean.mutable_data(); + auto* variance_data = variance.mutable_data(); + auto* y_data = y.mutable_data(); + for (int i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i % 64); + } + for (int i = 0; i < scale.dims().production(); i++) { + scale_data[i] = static_cast(i) * 0.01f + 0.03f; + } + for (int i = 0; i < bias.dims().production(); i++) { + bias_data[i] = static_cast(i) * 0.065f + 0.1f; + } + for (int i = 0; i < mean.dims().production(); i++) { + mean_data[i] = static_cast(i) * 0.0565f; + } + for (int i = 0; i < variance.dims().production(); i++) { + variance_data[i] = static_cast(i) * 2.08f + 1.5f; + } + // prepare kernel params and run + BatchNormCompute batch_norm; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + batch_norm.SetContext(std::move(ctx)); + operators::BatchNormParam param; + param.x = &x; + param.scale = &scale; + param.bias = &bias; + param.mean = &mean; + param.variance = &variance; + param.is_test = is_test; + param.use_global_stats = use_global_stats; + param.epsilon = epsilon; + param.momentum = momentum; + param.data_layout = data_layout; + param.y = &y; + param.mean_out = &mean_out; + param.variance_out = &variance_out; + param.saved_mean = &saved_mean; + param.saved_variance = &saved_variance; + batch_norm.SetParam(param); + batch_norm.Launch(); + // invoking ref implementation and compare results + param.y = &y_ref; + param.mean_out = &mean_out_ref; + param.variance_out = &variance_out_ref; + param.saved_mean = &saved_mean_ref; + param.saved_variance = &saved_variance_ref; + batch_norm_compute_ref(param); + auto* y_ref_data = y_ref.mutable_data(); + for (int i = 0; i < y.dims().production(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } + } + } + } + } + } + } + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/calib_compute.cc b/paddle/fluid/lite/kernels/arm/calib_compute.cc new file mode 100644 index 00000000000..47141d5b773 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/calib_compute.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
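+
+// The calib kernels below convert between fp32 and int8 tensors using a
+// single per-tensor scale. A sketch of the math, matching the
+// int8_to_fp32_basic / fp32_to_int8_basic reference helpers in the
+// accompanying test:
+//
+//   int8 -> fp32:  dout[j] = din[j] * scale
+//   fp32 -> int8:  dout[j] = (int8_t)roundf(din[j] / scale)
+//
+// i.e. scale is the fp32 magnitude represented by one int8 step.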
+ +#include "paddle/fluid/lite/kernels/arm/calib_compute.h" +#include +#include "paddle/fluid/lite/arm/math/type_trans.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void CalibComputeFp32ToInt8::Run() { + auto& param = this->Param(); + std::vector scale = {param.scale}; + const auto* din = param.input->data(); + auto* dout = param.output->mutable_data(); + lite::arm::math::fp32_to_int8(din, dout, scale.data(), 1, 1, + param.input->numel()); + return; +} + +void CalibComputeInt8ToFp32::Run() { + auto& param = this->Param(); + const auto* din = param.input->data(); + std::vector scale = {param.scale}; + auto* dout = param.output->mutable_data(); + lite::arm::math::int8_to_fp32(din, dout, scale.data(), 1, 1, + param.input->numel()); + return; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(calib, kARM, kInt8, kNCHW, + paddle::lite::kernels::arm::CalibComputeFp32ToInt8, + fp32_to_int8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .Finalize(); + +REGISTER_LITE_KERNEL(calib, kARM, kInt8, kNCHW, + paddle::lite::kernels::arm::CalibComputeInt8ToFp32, + int8_to_fp32) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/calib_compute.h b/paddle/fluid/lite/kernels/arm/calib_compute.h new file mode 100644 index 00000000000..fa8b67eab3c --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/calib_compute.h @@ -0,0 +1,51 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/operators/calib_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class CalibComputeFp32ToInt8 + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeFp32ToInt8() override{}; + + private: +}; + +class CalibComputeInt8ToFp32 + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt8ToFp32() override{}; + + private: +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/calib_compute_test.cc b/paddle/fluid/lite/kernels/arm/calib_compute_test.cc new file mode 100644 index 00000000000..783fe464187 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/calib_compute_test.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/calib_compute.h" +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +static int get_rand(int start, int end) { + int i = rand(); // NOLINT + i = (i % (end - start)) + start; + return i; +} + +static void int8_to_fp32_basic(const int8_t* din, float* dout, + const float* scale, int axis_size, + int64_t outer_size, int64_t inner_size) { + int loop_size = axis_size * outer_size; + for (int i = 0; i < loop_size; ++i) { + float scale_in = scale[i % axis_size]; + for (int j = 0; j < inner_size; ++j) { + dout[j] = din[j] * scale_in; + } + dout += inner_size; + din += inner_size; + } +} + +static void fp32_to_int8_basic(const float* din, int8_t* dout, + const float* scale, int axis_size, + int64_t outer_size, int64_t inner_size) { + int loop_size = axis_size * outer_size; + for (int i = 0; i < loop_size; ++i) { + float inv_scale = 1.f / scale[i % axis_size]; + for (int j = 0; j < inner_size; ++j) { + dout[j] = static_cast(roundf(din[j] * inv_scale)); + } + dout += inner_size; + din += inner_size; + } +} + +void calib_ref(const operators::CalibParam& param) { + std::vector scale = {param.in_scale}; + if (param.in_dtype == PRECISION(kFloat) && + param.out_dtype == PRECISION(kInt8)) { + const auto* din = param.input->data(); + auto* dout = param.output->mutable_data(); + fp32_to_int8_basic(din, dout, scale.data(), 1, 1, param.input->numel()); + return; + } + if (param.in_dtype == PRECISION(kInt8) && + param.out_dtype == PRECISION(kFloat)) { + const auto* din = param.input->data(); + auto* dout = param.output->mutable_data(); + int8_to_fp32_basic(din, dout, scale.data(), 1, 1, param.input->numel()); + return; + } + LOG(FATAL) << "Unsupport Dtype."; +} + +TEST(calib_arm, retrive_op) { + auto calib = + KernelRegistry::Global() + .Create("calib"); + ASSERT_FALSE(calib.empty()); + ASSERT_TRUE(calib.front()); +} + +TEST(calib_arm, init) { + CalibCompute calib; + ASSERT_EQ(calib.precision(), PRECISION(kInt8)); + ASSERT_EQ(calib.target(), TARGET(kARM)); +} + +TEST(calib_arm, int8_to_fp32) { + DeviceInfo::Init(); + for (auto n : {1, 2}) { + for (auto c : {6, 32 /*, 128*/}) { + for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) { + for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) { + Tensor x; + Tensor output; + Tensor output_ref; + // set the dims of input, output, ref output tensors + x.Resize({n, c, h, w}); + output.Resize({n, c, h, w}); + output_ref.Resize({n, c, h, w}); + // initialize the data of input tensors + auto* x_data = x.mutable_data(); + auto* output_data = output.mutable_data(); + for (int i = 0; i < x.dims().production(); i++) { + float sign = i % 3 == 0 ? 
-1.0f : 1.0f; + x_data[i] = sign * static_cast(i % 128) * 0.013f; + } + // prepare kernel params and run + CalibCompute calib; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + calib.SetContext(std::move(ctx)); + operators::CalibParam param; + param.in_scale = get_rand(0, 100) * 0.1f; + param.in_dtype = PRECISION(kInt8); + param.out_dtype = PRECISION(kFloat); + param.input = &x; + param.output = &output; + calib.SetParam(param); + calib.Launch(); + // invoking ref implementation and compare results + param.output = &output_ref; + calib_ref(param); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32); +USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8); diff --git a/paddle/fluid/lite/kernels/arm/concat_compute.cc b/paddle/fluid/lite/kernels/arm/concat_compute.cc new file mode 100644 index 00000000000..70adb8fc33e --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/concat_compute.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/concat_compute.h" +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +std::vector stride_numel(const DDim& ddim) { + std::vector strides(ddim.size()); + strides[ddim.size() - 1] = ddim[ddim.size() - 1]; + for (int i = ddim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * ddim[i]; + } + return strides; +} + +void ConcatCompute::Run() { + auto& param = Param(); + std::vector inputs = param.x; + auto* out = param.output; + int axis = param.axis; + out->mutable_data(); + + /// Sometimes direct copies will be faster, this maybe need deeply analysis. + if (axis == 0 && inputs.size() < 10) { + size_t output_offset = 0; + for (auto* in : inputs) { + auto in_stride = stride_numel(in->dims()); + auto out_stride = stride_numel(out->dims()); + void* dst = out->mutable_data() + output_offset; + const void* src = in->data(); +#if 0 + LOG(INFO) << "out_stride.size():" << out_stride.size(); + LOG(INFO) << "out_stride[0]" << out_stride[0]; + for (int i=0; i < out_stride.size(); ++i) { + LOG(INFO) << "out_stride[" << i << "]:" << out_stride[i]; + } + LOG(INFO) << "in_stride.size():" << in_stride.size(); + for (int i=0; i < in_stride.size(); ++i) { + LOG(INFO) << "in_stride[" << i << "]:" << in_stride[i]; + } +#endif + // src and dst tensor should have the same dims size. 
+ CHECK(in_stride.size() == out_stride.size()); + std::memcpy(dst, src, sizeof(float) * in_stride[0]); + output_offset += in_stride[0]; + } + } else { + std::vector inputs_concat(inputs.size()); + for (int j = 0; j < inputs.size(); ++j) { + inputs_concat[j] = inputs[j]; + } + lite::arm::math::concat_func(inputs_concat, axis, out); + } + return; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(concat, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::ConcatCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/concat_compute.h b/paddle/fluid/lite/kernels/arm/concat_compute.h new file mode 100644 index 00000000000..2e1ca89841f --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/concat_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/operators/concat_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class ConcatCompute : public KernelLite { + public: + using param_t = operators::ConcatParam; + + void Run() override; + + virtual ~ConcatCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/concat_compute_test.cc b/paddle/fluid/lite/kernels/arm/concat_compute_test.cc new file mode 100644 index 00000000000..664f4ed1167 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/concat_compute_test.cc @@ -0,0 +1,235 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
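+
+// concat_compute_ref below flattens each input into a rows x cols matrix
+// around the concat axis. A small worked example with hypothetical shapes:
+// concatenating A = {2, 3} and B = {2, 5} on axis 1 gives
+//
+//   rows   = 2               (product of the dims before the axis)
+//   cols_A = 3, cols_B = 5   (numel / rows for each input)
+//   out    = {2, 8}
+//
+// and every output row is A's row followed by B's row, copied with one
+// memcpy per (input, row) pair.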
+ +#include "paddle/fluid/lite/kernels/arm/concat_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/lite_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +bool infer_shape(const operators::ConcatParam& param) { + std::vector input_dims; + for (auto p : param.x) { + input_dims.push_back(p->dims()); + } + size_t axis = static_cast(param.axis); + const size_t n = input_dims.size(); + CHECK_GT_OR_FALSE(n, 0); + auto& out_dims = input_dims[0]; + size_t in_zero_dims_size = out_dims.size(); + for (size_t i = 1; i < n; i++) { + for (size_t j = 0; j < in_zero_dims_size; j++) { + if (j == axis) { + out_dims[axis] += input_dims[i][j]; + } else { + CHECK_EQ_OR_FALSE(out_dims[j], input_dims[i][j]); + } + } + } + if (out_dims[axis] < 0) { + out_dims[axis] = -1; + } + // Set output dims + param.output->Resize(lite::DDim(out_dims)); + return true; +} + +void concat_compute_ref(const operators::ConcatParam& param) { + std::vector input = param.x; + int axis = param.axis; + infer_shape(param); + + lite::Tensor* output = param.output; + int num = input.size(); + int rows = 1; + auto dim_0 = input[0]->dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int out_rows = rows, out_cols = 0; + + std::vector input_cols(input.size()); + for (int i = 0; i < num; ++i) { + int input_i_numel = input[i]->dims().size() == 0 ? 0 : 1; + for (int didx = 0; didx < input[i]->dims().size(); ++didx) { + input_i_numel *= input[i]->dims()[didx]; + } + int t_cols = input_i_numel / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + + // computation + auto output_data = output->mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = input[j]->data(); + for (int k = 0; k < out_rows; ++k) { + memcpy(output_data + k * out_cols + col_idx, input_data + k * col_len, + sizeof(float) * col_len); + } + col_idx += col_len; + } +} + +TEST(concat_arm, init) { + ConcatCompute concat; + ASSERT_EQ(concat.precision(), PRECISION(kFloat)); + ASSERT_EQ(concat.target(), TARGET(kARM)); +} + +TEST(concat_arm, compute_input_single) { + ConcatCompute concat; + operators::ConcatParam param; + + LOG(INFO) << "test concat start"; + lite::Tensor output; + lite::Tensor output_ref; + lite::Tensor tensorA; + DDimLite ddimA({10, 4, 3, 2}); + tensorA.Resize(ddimA); + + for (int i = 0; i < ddimA.data()[0] * ddimA.data()[1] * ddimA.data()[2] * + ddimA.data()[3]; + i++) { + tensorA.mutable_data()[i] = i; + } + + param.x.push_back(&tensorA); + for (int cur_axis : {0, 1}) { + param.output = &output; + param.axis = cur_axis; + CHECK(infer_shape(param)); + concat.SetParam(param); + LOG(INFO) << "test concat start cur_axis:" << cur_axis; + + concat.Run(); + LOG(INFO) << "concat.Run end"; + param.output = &output_ref; + LOG(INFO) << "concat_compute_ref start"; + concat_compute_ref(param); + LOG(INFO) << "concat_compute_ref end"; + + auto* output_data = output.data(); + auto* output_ref_data = output_ref.data(); + for (int i = 0; i < (ddimA.data()[0]) * ddimA.data()[1] * ddimA.data()[2] * + ddimA.data()[3]; + i++) { + // LOG(INFO) << "output[" << i << "]:" << output_data[i] << " + // output_ref_data[" << i << "]:" << output_ref_data[i]; + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } +} + +TEST(concat_arm, compute_input_multi) { + ConcatCompute concat; + operators::ConcatParam param; + + LOG(INFO) << 
"test concat start"; + // init param + // x: tensorA, tensorB, tensorC, tensorD + // axis: 0 + lite::Tensor output; + lite::Tensor output_ref; + lite::Tensor tensorA; + lite::Tensor tensorB; + lite::Tensor tensorC; + lite::Tensor tensorD; + + DDimLite ddimA({10, 4, 3, 2}); + DDimLite ddimB({20, 4, 3, 2}); + DDimLite ddimC({30, 4, 3, 2}); + DDimLite ddimD({40, 4, 3, 2}); + + tensorA.Resize(ddimA); + tensorB.Resize(ddimB); + tensorC.Resize(ddimC); + tensorD.Resize(ddimD); + + for (int i = 0; i < ddimA.data()[0] * ddimA.data()[1] * ddimA.data()[2] * + ddimA.data()[3]; + i++) { + tensorA.mutable_data()[i] = i; + } + for (int i = 0; i < ddimB.data()[0] * ddimB.data()[1] * ddimB.data()[2] * + ddimB.data()[3]; + i++) { + tensorB.mutable_data()[i] = i + 1; + } + for (int i = 0; i < ddimC.data()[0] * ddimC.data()[1] * ddimC.data()[2] * + ddimC.data()[3]; + i++) { + tensorC.mutable_data()[i] = i + 2; + } + for (int i = 0; i < ddimD.data()[0] * ddimD.data()[1] * ddimD.data()[2] * + ddimD.data()[3]; + i++) { + tensorD.mutable_data()[i] = i + 3; + } + + param.x.push_back(&tensorA); + param.x.push_back(&tensorB); + param.x.push_back(&tensorC); + param.x.push_back(&tensorD); + for (int cur_axis : {0}) { + param.output = &output; + param.axis = cur_axis; + CHECK(infer_shape(param)); + concat.SetParam(param); + LOG(INFO) << "test concat start cur_axis:" << cur_axis; + + concat.Run(); + LOG(INFO) << "concat.Run end"; + param.output = &output_ref; + LOG(INFO) << "concat_compute_ref start"; + concat_compute_ref(param); + LOG(INFO) << "concat_compute_ref end"; + + auto* output_data = output.data(); + auto* output_ref_data = output_ref.data(); + int elem_num = (ddimA.data()[0] + ddimB.data()[0] + ddimC.data()[0] + + ddimD.data()[0]) * + ddimA.data()[1] * ddimA.data()[2] * ddimA.data()[3]; + for (int i = 0; i < elem_num; i++) { + // LOG(INFO) << "output[" << i << "]:" << output_data[i] << " + // output_ref_data[" << i << "]:" << output_ref_data[i]; + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } +} + +TEST(concat, retrive_op) { + auto concat = + KernelRegistry::Global().Create( + "concat"); + ASSERT_FALSE(concat.empty()); + ASSERT_TRUE(concat.front()); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/conv_compute.cc b/paddle/fluid/lite/kernels/arm/conv_compute.cc new file mode 100644 index 00000000000..0ad99530ac5 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/conv_compute.cc @@ -0,0 +1,214 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/arm/conv_compute.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void ConvCompute::PrepareForRun() { + auto& param = this->Param(); + auto x_dims = param.x->dims(); + auto w_dims = param.filter->dims(); + auto o_dims = param.output->dims(); + + auto& ctx = this->ctx_->template As(); + + int win = x_dims[3]; // nchw + int hin = x_dims[2]; + int ic = x_dims[1]; + int bs = x_dims[0]; + int ow = o_dims[3]; + int oh = o_dims[2]; + int oc = o_dims[1]; + int kh = w_dims[2]; // oihw + int kw = w_dims[3]; + int pad = param.paddings[0]; + int stride = param.strides[0]; + + const auto* i_data = param.x->data(); + const auto* w_data = param.filter->data(); + const auto* b_data = param.bias ? param.bias->data() : nullptr; + auto* o_data = param.output->mutable_data(); + + bool kps_equal = (param.paddings[0] == param.paddings[1]) && + (param.strides[0] == param.strides[1]) && (kw == kh); + bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); + bool flag_dw_3x3 = + (kw == 3 && (pad == 0 || pad == 1) && (stride == 1 || stride == 2)); + bool flag_dw_5x5 = + (kw == 5 && stride == 1) || (kw == 5 && stride == 2 && pad == 2); + bool flag_dw = flag_dw_3x3 || flag_dw_5x5; + + // select conv impl + if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + // dw conv impl + impl_ = new lite::arm::math::DepthwiseConv; + VLOG(3) << "invoking dw conv"; + } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal && + no_dilation) { + if (ic >= 32 && oc >= 32 && oh > 16 && ow > 16) { + // winograd conv impl + impl_ = new lite::arm::math::WinogradConv; + VLOG(3) << "invoking winograd conv"; + } else { + // direct conv impl + impl_ = new lite::arm::math::DirectConv; + VLOG(3) << "invoking direct conv"; + } + } else if (param.groups == 1 && kw == 3 && stride == 2 && kps_equal && + no_dilation) { + // direct conv impl + impl_ = new lite::arm::math::DirectConv; + VLOG(3) << "invoking direct conv"; + } else { + impl_ = new lite::arm::math::GemmLikeConv; + VLOG(3) << "invoking gemm like conv"; + } + CHECK(this->impl_->create(param, &ctx)); +} + +void ConvCompute::Run() { + auto& param = this->Param(); + CHECK(impl_); + impl_->run(param); + // if (this->act_ != nullptr) { + // this->act_->run(outputs, outputs, param.activation_param); + // } +} + +template +void ConvComputeInt8::PrepareForRun() { + auto& param = this->Param(); + auto x_dims = param.x->dims(); + auto w_dims = param.filter->dims(); + auto o_dims = param.output->dims(); + + auto& ctx = this->ctx_->template As(); + + int win = x_dims[3]; // nchw + int hin = x_dims[2]; + int ic = x_dims[1]; + int bs = x_dims[0]; + int ow = o_dims[3]; + int oh = o_dims[2]; + int oc = o_dims[1]; + int kh = w_dims[2]; // oihw + int kw = w_dims[3]; + int ph = param.paddings[1]; + int pw = param.paddings[0]; + int sh = param.strides[1]; + int sw = param.strides[0]; + + bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); + bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); + bool flag_dw_3x3 = (kw == 3) && (ph == 1) && (sw == 1 || sw == 2); + bool flag_dw_5x5 = (kw == 5 && sw == 1 && ph == 2); + bool flag_dw = flag_dw_3x3 || flag_dw_5x5; + + // weigth is int8 and bias is int32 so do not need trans + if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + impl_ = new lite::arm::math::DepthwiseConvInt8; + VLOG(3) << 
"Run DepthwiseConv Int8"; + } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && + kps_equal && no_dilation) { + VLOG(3) << "Run DirectConv Int8"; + impl_ = new lite::arm::math::DirectConvInt8; + } else { + VLOG(3) << "Run GemmLikeConvInt8"; + impl_ = new lite::arm::math::GemmLikeConvInt8; + } + + CHECK(this->impl_->create(param, &ctx)); +} + +template +void ConvComputeInt8::Run() { + auto& param = this->Param(); + CHECK(impl_); + impl_->run(param); +} + +template class ConvComputeInt8; +template class ConvComputeInt8; +template class ConvComputeInt8; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::ConvCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::ConvCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv2d, kARM, kInt8, kNCHW, + paddle::lite::kernels::arm::ConvComputeInt8, int8_out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Filter", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + conv2d, kARM, kInt8, kNCHW, + paddle::lite::kernels::arm::ConvComputeInt8, fp32_out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Filter", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + depthwise_conv2d, kARM, kInt8, kNCHW, + paddle::lite::kernels::arm::ConvComputeInt8, int8_out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Filter", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + depthwise_conv2d, kARM, kInt8, kNCHW, + paddle::lite::kernels::arm::ConvComputeInt8, fp32_out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Filter", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindOutput("Output", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/conv_compute.h b/paddle/fluid/lite/kernels/arm/conv_compute.h new file mode 100644 index 00000000000..28bf6ea7dff --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/conv_compute.h @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/operators/conv_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class ConvCompute : public KernelLite { + public: + using param_t = operators::ConvParam; + + void PrepareForRun() override; + + void Run() override; + + ~ConvCompute() { + if (impl_ != nullptr) { + delete impl_; + } + } + + private: + lite::arm::math::ImplBase* impl_{ + nullptr}; +}; + +template +class ConvComputeInt8 : public KernelLite { + public: + using param_t = operators::ConvParam; + + void PrepareForRun() override; + + void Run() override; + + ~ConvComputeInt8() { + if (impl_ != nullptr) { + delete impl_; + } + } + + private: + lite::arm::math::ImplBase* impl_{ + nullptr}; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/conv_compute_test.cc b/paddle/fluid/lite/kernels/arm/conv_compute_test.cc new file mode 100644 index 00000000000..9745e042383 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/conv_compute_test.cc @@ -0,0 +1,979 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
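+
+// conv_basic below is the naive reference implementation: with oc_g and
+// ic_g the per-group channel counts, every output element is
+//
+//   out[n][g*oc_g + oc][oh][ow] =
+//       bias[g*oc_g + oc]
+//     + sum over ic, kh, kw of
+//         in[n][g*ic_g + ic][oh*stride_h - pad_h + kh*dila_h]
+//                           [ow*stride_w - pad_w + kw*dila_w]
+//       * w[g*oc_g + oc][ic][kh][kw]
+//
+// where out-of-range input taps are skipped (zero padding) and an
+// optional fused relu clamps the result at zero.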
+ +#include "paddle/fluid/lite/kernels/arm/conv_compute.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/type_trans.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +static int get_rand(int start, int end) { + int i = rand(); // NOLINT + i = (i % (end - start)) + start; + return i; +} + +template +static void conv_basic(const Dtype1* din, Dtype2* dout, int num, int chout, + int hout, int wout, int chin, int hin, int win, + const Dtype1* weights, const Dtype2* bias, int group, + int kernel_w, int kernel_h, int stride_w, int stride_h, + int dila_w, int dila_h, int pad_w, int pad_h, + bool flag_bias, bool flag_relu) { + Dtype2 beta = 0; + auto src_data = din; + auto dst_data_ref = dout; + auto weights_data = weights; + auto with_bias = flag_bias; + auto bias_data = bias; + + int in_num = num; + int out_channels = chout; + int out_h = hout; + int out_w = wout; + + int in_channel = chin; + int in_h = hin; + int in_w = win; + int out_c_group = out_channels / group; + int in_c_group = in_channel / group; + + for (int n = 0; n < in_num; ++n) { + for (int g = 0; g < group; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * group * out_c_group * out_h * out_w + + g * out_c_group * out_h * out_w + oc * out_h * out_w + + oh * out_w + ow; + Dtype2 bias_d = + with_bias ? (bias_data[g * out_c_group + oc]) : (Dtype2)0; + dst_data_ref[out_idx] = bias_d; // + dst_data_ref[out_idx] * beta; + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dila_w); + int ih = oh * stride_h - pad_h + kh * (dila_h); + if (iw < 0 || iw >= in_w) continue; + if (ih < 0 || ih >= in_h) continue; + + int iidx = n * in_channel * in_h * in_w + + g * in_c_group * in_h * in_w + ic * in_h * in_w + + ih * in_w + iw; + int widx = + g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + kh * kernel_w + kw; + + dst_data_ref[out_idx] += src_data[iidx] * weights_data[widx]; + } + } + } + if (flag_relu) { + dst_data_ref[out_idx] = dst_data_ref[out_idx] > (Dtype2)0 + ? 
dst_data_ref[out_idx] + : (Dtype2)0; + } + } + } + } + } + } +} + +template +void conv_compute_ref(const operators::ConvParam& param) { + const Dtype1* din = param.x->data(); + Dtype2* dout = param.output->mutable_data(); + + int num = param.x->dims()[0]; + int chout = param.output->dims()[1]; + int hout = param.output->dims()[2]; + int wout = param.output->dims()[3]; + + int chin = param.x->dims()[1]; + int hin = param.x->dims()[2]; + int win = param.x->dims()[3]; + + const Dtype1* weights = param.filter->mutable_data(); + Dtype2* bias = nullptr; + if (param.bias != nullptr) { + bias = param.bias->mutable_data(); + } + + int group = param.groups; + int kernel_w = param.filter->dims()[2]; + int kernel_h = param.filter->dims()[3]; + int stride_w = param.strides[0]; + int stride_h = param.strides[1]; + int dila_w = param.dilations[0]; + int dila_h = param.dilations[1]; + + int pad_w = param.paddings[0]; + int pad_h = param.paddings[1]; + bool flag_bias = (param.bias != nullptr); + bool flag_relu = param.fuse_relu; + + conv_basic(din, dout, num, chout, hout, wout, chin, hin, win, weights, bias, + group, kernel_w, kernel_h, stride_w, stride_h, dila_w, dila_h, + pad_w, pad_h, flag_bias, flag_relu); +} + +TEST(conv_arm, retrive_op) { + auto conv = KernelRegistry::Global().Create( + "conv2d"); + ASSERT_FALSE(conv.empty()); + ASSERT_TRUE(conv.front()); +} + +TEST(conv_arm_int8, retrive_op) { + auto conv = + KernelRegistry::Global().Create("conv2d"); + ASSERT_FALSE(conv.empty()); + ASSERT_TRUE(conv.front()); +} + +TEST(conv_arm, init) { + ConvCompute conv; + ASSERT_EQ(conv.precision(), PRECISION(kFloat)); + ASSERT_EQ(conv.target(), TARGET(kARM)); +} + +TEST(conv_arm_int8, init) { + ConvComputeInt8 float_out; + ASSERT_EQ(float_out.precision(), PRECISION(kInt8)); + ASSERT_EQ(float_out.target(), TARGET(kARM)); + ConvComputeInt8 int8_out; + ASSERT_EQ(float_out.precision(), PRECISION(kInt8)); + ASSERT_EQ(float_out.target(), TARGET(kARM)); +} + +TEST(conv_arm_int8, int8_int32) { + DeviceInfo::Init(); + for (auto n : {2}) { + for (auto ic : {6}) { + for (auto oc : {6}) { + for (auto ih : {9}) { + for (auto iw : {9}) { + for (auto flag_bias : {false, true}) { + for (auto flag_relu : {false, true}) { + for (auto depthwise : {false, /*true*/}) { + for (auto dilation : {1}) { + for (auto stride : {1}) { + for (auto padding : {0}) { + for (auto ks : {1}) { + int group = 1; + if (depthwise) { // depthwise convolution ? 
+ group = oc = ic; + } + + const int dks = dilation * (ks - 1) + 1; + int oh = (ih + 2 * padding - dks) / stride + 1; + int ow = (iw + 2 * padding - dks) / stride + 1; + std::vector input_shape = {n, ic, ih, iw}; + std::vector filter_shape = {oc, ic / group, + ks, ks}; + std::vector output_shape({n, oc, oh, ow}); + + Tensor input_int8; + Tensor filter_int8; + Tensor output_int32, output_int32_ref; + + input_int8.Resize(input_shape); + filter_int8.Resize(filter_shape); + output_int32.Resize(output_shape); + output_int32_ref.Resize(output_shape); + + int8_t* input_int8_data = + input_int8.mutable_data(); + int8_t* filter_int8_data = + filter_int8.mutable_data(); + for (int i = 0; i < input_int8.dims().production(); + i++) { + input_int8_data[i] = i % 10 * (i % 3 - 1); + } + for (int i = 0; i < filter_int8.dims().production(); + i++) { + filter_int8_data[i] = i % 10 * (i % 3 - 1); + } + + operators::ConvParam param; + param.x = &input_int8; + param.filter = &filter_int8; + param.bias = nullptr; + param.fuse_relu = false; + param.paddings = std::vector({padding, padding}); + param.strides = std::vector({stride, stride}); + param.dilations = + std::vector({dilation, dilation}); + param.groups = group; + param.output = &output_int32_ref; + conv_compute_ref(param); + + param.output = &output_int32; + std::unique_ptr ctx(new KernelContext); + lite::arm::math::GemmLikeConvInt8 + int8gemm_int32; + int8gemm_int32.init(param, &ctx->As()); + int8gemm_int32.create(param, &ctx->As()); + int8gemm_int32.run(param); + + int* output_int32_data = + output_int32.mutable_data(); + int* output_int32_ref_data = + output_int32_ref.mutable_data(); + + for (int i = 0; i < output_int32.dims().production(); + i++) { + EXPECT_NEAR(output_int32_data[i], + output_int32_ref_data[i], 1e-3); + } + } + } + } + } + } + } + } + } + } + } + } + } +} + +TEST(conv_arm_int8, int8_fp32) { + DeviceInfo::Init(); + for (auto n : {2}) { + for (auto ic : {6}) { + for (auto oc : {6}) { + for (auto ih : {9}) { + for (auto iw : {9}) { + for (auto flag_bias : {false, true}) { + for (auto flag_relu : {false, true}) { + for (auto depthwise : {false, /*true*/}) { + for (auto dilation : {1}) { + for (auto stride : {1}) { + for (auto padding : {0}) { + for (auto ks : {1}) { + int group = 1; + if (depthwise) { // depthwise convolution ? 
+ group = oc = ic; + } + + LOG(INFO) << "flag_bias: " << flag_bias; + + const int dks = dilation * (ks - 1) + 1; + int oh = (ih + 2 * padding - dks) / stride + 1; + int ow = (iw + 2 * padding - dks) / stride + 1; + std::vector input_shape = {n, ic, ih, iw}; + std::vector filter_shape = {oc, ic / group, + ks, ks}; + std::vector bias_shape({1, oc, 1, 1}); + std::vector output_shape({n, oc, oh, ow}); + + Tensor input_fp32, input_int8; + Tensor filter_fp32, filter_int8; + Tensor bias_fp32, bias_int32; + Tensor output_int32_ref, output_int32; + Tensor output_fp32_ref, output_fp32; + Tensor output_int8_ref, output_int8; + + input_fp32.Resize(input_shape); + input_int8.Resize(input_shape); + filter_fp32.Resize(filter_shape); + filter_int8.Resize(filter_shape); + bias_fp32.Resize(bias_shape); + bias_int32.Resize(bias_shape); + output_int32.Resize(output_shape); + output_int32_ref.Resize(output_shape); + output_fp32_ref.Resize(output_shape); + output_fp32.Resize(output_shape); + output_int8_ref.Resize(output_shape); + output_int8.Resize(output_shape); + + float* input_fp32_data = + input_fp32.mutable_data(); + int8_t* input_int8_data = + input_int8.mutable_data(); + + float* filter_fp32_data = + filter_fp32.mutable_data(); + int8_t* filter_int8_data = + filter_int8.mutable_data(); + + float* bias_fp32_data = + bias_fp32.mutable_data(); + int* bias_int32_data = bias_int32.mutable_data(); + + for (int i = 0; i < input_fp32.dims().production(); + i++) { + input_fp32_data[i] = i % 10 * (i % 3 - 1); + } + for (int i = 0; i < filter_fp32.dims().production(); + i++) { + filter_fp32_data[i] = i % 10 * (i % 3 - 1); + } + for (int i = 0; i < bias_fp32.dims().production(); + i++) { + bias_fp32_data[i] = i % 10 * (i % 3 - 1); + } + + std::vector in_scale; + lite::arm::math::get_tensor_scale( + input_fp32, &in_scale, -1, 127.f); + lite::arm::math::trans_tensor_fp32_to_int8( + &input_fp32, &input_int8, in_scale[0]); + + std::vector w_scale; + lite::arm::math::get_tensor_scale( + filter_fp32, &w_scale, -1, 127.f); + int axis_size = oc; + int inner_size = ic / group * ks * ks; + w_scale = lite::arm::math::get_tensor_scale_n( + filter_fp32_data, axis_size, inner_size, 127.f); + lite::arm::math::fp32_to_int8( + filter_fp32_data, filter_int8_data, + w_scale.data(), axis_size, 1, inner_size); + + // lite::arm::math::trans_fp32_bias_to_int32_basic(&bias_fp32, + // &bias_int32, in_scale[0], w_scale); + for (int i = 0; i < bias_int32.dims().production(); + i++) { + bias_int32_data[i] = 1; + } + + operators::ConvParam param; + param.x = &input_int8; + param.filter = &filter_int8; + if (flag_bias) { + param.bias = &bias_int32; + } else { + param.bias = nullptr; + } + param.fuse_relu = false; + param.paddings = std::vector({padding, padding}); + param.strides = std::vector({stride, stride}); + param.dilations = + std::vector({dilation, dilation}); + param.groups = group; + param.output = &output_int32_ref; + conv_compute_ref(param); + + int* output_int32_ref_data = + output_int32_ref.mutable_data(); + + // ============ int8gemm_int32 ============ + /* + param.output = &output_int32; + std::unique_ptr ctx_int32( + new KernelContext); + lite::arm::math::GemmLikeConvInt8 + int8gemm_int32; + int8gemm_int32.init(param, + &ctx_int32->As()); + int8gemm_int32.create(param, + &ctx_int32->As()); + int8gemm_int32.run(param); + int* output_int32_data = + output_int32.mutable_data(); + for (int i = 0; i < output_int32.dims().production(); + i++) { + EXPECT_NEAR(output_int32_data[i], + output_int32_ref_data[i], 1e-3); + } + */ + // 
============ int8gemm_int8 ============ + int8_t* output_int8_ref_data = + output_int8_ref.mutable_data(); + lite::arm::math::trans_tensor_int32_to_int8( + &output_int32_ref, &output_int8_ref, in_scale[0], + 1, w_scale); + param.output = &output_int8; + param.input_scale = in_scale[0]; + param.output_scale = 1; + param.weight_scale = w_scale; + std::unique_ptr ctx_int8( + new KernelContext); + lite::arm::math::GemmLikeConvInt8 + int8gemm_int8; + int8gemm_int8.init(param, + &ctx_int8->As()); + int8gemm_int8.create(param, + &ctx_int8->As()); + int8gemm_int8.run(param); + int8_t* output_int8_data = + output_int8.mutable_data(); + for (int i = 0; i < output_int8.dims().production(); + i++) { + EXPECT_NEAR(output_int8_data[i], + output_int8_ref_data[i], 1e-3); + } + + // ============ int8gemm_float32 ============ + float* output_fp32_ref_data = + output_fp32_ref.mutable_data(); + lite::arm::math::trans_tensor_int32_to_fp32( + &output_int32_ref, &output_fp32_ref, in_scale[0], + w_scale); + param.output = &output_fp32; + param.input_scale = in_scale[0]; + param.output_scale = 1; + param.weight_scale = w_scale; + std::unique_ptr ctx_fp32( + new KernelContext); + lite::arm::math::GemmLikeConvInt8 + int8gemm_fp32; + int8gemm_fp32.init(param, + &ctx_fp32->As()); + int8gemm_fp32.create(param, + &ctx_fp32->As()); + int8gemm_fp32.run(param); + float* output_fp32_data = + output_fp32.mutable_data(); + for (int i = 0; i < output_fp32.dims().production(); + i++) { + EXPECT_NEAR(output_fp32_data[i], + output_fp32_ref_data[i], 1e-3); + } + } + } + } + } + } + } + } + } + } + } + } + } +} + +TEST(conv_direct_int8, compute) { + DeviceInfo::Init(); + for (auto n : {1, 2}) { + for (auto ic : {1, 3, 8}) { + for (auto oc : {1, 3, 8}) { + for (auto ih : {5, 15, 28}) { + for (auto iw : {5, 15, 28}) { + for (auto flag_bias : {false, true}) { + for (auto flag_relu : {false, true}) { + for (auto depthwise : {false, /*true*/}) { + for (auto dilation : {1}) { + for (auto stride : {1, 2}) { + for (auto padding : {1}) { + for (auto ks : {3}) { + int group = 1; + if (depthwise) { // depthwise convolution ? 
+ group = oc = ic; + } + + const int dks = dilation * (ks - 1) + 1; + int oh = (ih + 2 * padding - dks) / stride + 1; + int ow = (iw + 2 * padding - dks) / stride + 1; + std::vector input_shape = {n, ic, ih, iw}; + std::vector filter_shape = {oc, ic / group, + ks, ks}; + std::vector bias_shape({1, oc, 1, 1}); + std::vector output_shape({n, oc, oh, ow}); + + Tensor input_fp32, input_int8; + Tensor filter_fp32, filter_int8; + Tensor bias_int32; + Tensor output_int32_ref, output_int32; + Tensor output_fp32_ref, output_fp32; + Tensor output_int8_ref, output_int8; + + input_fp32.Resize(input_shape); + input_int8.Resize(input_shape); + filter_fp32.Resize(filter_shape); + filter_int8.Resize(filter_shape); + bias_int32.Resize(bias_shape); + output_int32.Resize(output_shape); + output_int32_ref.Resize(output_shape); + output_fp32_ref.Resize(output_shape); + output_fp32.Resize(output_shape); + output_int8_ref.Resize(output_shape); + output_int8.Resize(output_shape); + + float* input_fp32_data = + input_fp32.mutable_data(); + int8_t* input_int8_data = + input_int8.mutable_data(); + + float* filter_fp32_data = + filter_fp32.mutable_data(); + int8_t* filter_int8_data = + filter_int8.mutable_data(); + + int* bias_int32_data = + bias_int32.mutable_data(); + + for (int i = 0; i < input_fp32.dims().production(); + i++) { + input_fp32_data[i] = i % 10 * (i % 3 - 1); + } + for (int i = 0; i < filter_fp32.dims().production(); + i++) { + filter_fp32_data[i] = i % 10 * (i % 3 - 1); + } + for (int i = 0; i < bias_int32.dims().production(); + i++) { + bias_int32_data[i] = i % 10 * (i % 3 - 1); + } + + std::vector in_scale; + lite::arm::math::get_tensor_scale( + input_fp32, &in_scale, -1, 127.f); + lite::arm::math::trans_tensor_fp32_to_int8( + &input_fp32, &input_int8, in_scale[0]); + + std::vector w_scale; + lite::arm::math::get_tensor_scale( + filter_fp32, &w_scale, -1, 127.f); + int axis_size = oc; + int inner_size = ic / group * ks * ks; + w_scale = lite::arm::math::get_tensor_scale_n( + filter_fp32_data, axis_size, inner_size, 127.f); + lite::arm::math::fp32_to_int8( + filter_fp32_data, filter_int8_data, + w_scale.data(), axis_size, 1, inner_size); + + operators::ConvParam param; + param.x = &input_int8; + param.filter = &filter_int8; + if (flag_bias) { + param.bias = &bias_int32; + } + param.fuse_relu = false; + param.paddings = std::vector({padding, padding}); + param.strides = std::vector({stride, stride}); + param.dilations = + std::vector({dilation, dilation}); + param.groups = group; + param.output = &output_int32_ref; + conv_compute_ref(param); + + int* output_int32_ref_data = + output_int32_ref.mutable_data(); + + // ============ int8direct_int32 ============ + param.output = &output_int32; + std::unique_ptr ctx_int32( + new KernelContext); + lite::arm::math::DirectConvInt8 + int8direct_int32; + int8direct_int32.init(param, + &ctx_int32->As()); + int8direct_int32.create(param, + &ctx_int32->As()); + int8direct_int32.run(param); + int* output_int32_data = + output_int32.mutable_data(); + for (int i = 0; i < output_int32.dims().production(); + i++) { + EXPECT_NEAR(output_int32_data[i], + output_int32_ref_data[i], 1e-3); + } + + // ============ int8direct_int8 ============ + int8_t* output_int8_ref_data = + output_int8_ref.mutable_data(); + lite::arm::math::trans_tensor_int32_to_int8( + &output_int32_ref, &output_int8_ref, in_scale[0], + 1, w_scale); + param.output = &output_int8; + param.input_scale = in_scale[0]; + param.output_scale = 1; + param.weight_scale = w_scale; + std::unique_ptr ctx_int8( + 
new KernelContext); + lite::arm::math::DirectConvInt8 + int8direct_int8; + int8direct_int8.init(param, + &ctx_int8->As()); + int8direct_int8.create(param, + &ctx_int8->As()); + int8direct_int8.run(param); + int8_t* output_int8_data = + output_int8.mutable_data(); + for (int i = 0; i < output_int8.dims().production(); + i++) { + EXPECT_NEAR(output_int8_data[i], + output_int8_ref_data[i], 1e-3); + } + + // ============ int8direct_float32 ============ + float* output_fp32_ref_data = + output_fp32_ref.mutable_data(); + lite::arm::math::trans_tensor_int32_to_fp32( + &output_int32_ref, &output_fp32_ref, in_scale[0], + w_scale); + param.output = &output_fp32; + param.input_scale = in_scale[0]; + param.output_scale = 1; + param.weight_scale = w_scale; + std::unique_ptr ctx_fp32( + new KernelContext); + lite::arm::math::DirectConvInt8 + int8direct_fp32; + int8direct_fp32.init(param, + &ctx_fp32->As()); + int8direct_fp32.create(param, + &ctx_fp32->As()); + int8direct_fp32.run(param); + float* output_fp32_data = + output_fp32.mutable_data(); + for (int i = 0; i < output_fp32.dims().production(); + i++) { + EXPECT_NEAR(output_fp32_data[i], + output_fp32_ref_data[i], 1e-3); + } + } + } + } + } + } + } + } + } + } + } + } + } +} + +TEST(conv_depthwise_int8, compute) { + DeviceInfo::Init(); + for (auto n : {1, 2}) { + for (auto ic : {1, 3, 8}) { + for (auto ih : {5, 15, 28}) { + for (auto iw : {5, 15, 28}) { + for (auto flag_bias : {false, true}) { + for (auto flag_relu : {false, true}) { + for (auto dilation : {1}) { + for (auto stride : {1, 2}) { + for (auto padding : {1, 2}) { + for (auto ks : {3, /*5 */}) { + int group = ic; + int oc = ic; + + bool flag_dw_3x3 = (ks == 3) && (padding == 1) && + (stride == 1 || stride == 2); + bool flag_dw_5x5 = + (ks == 5 && stride == 1 && padding == 2); + bool flag_dw = flag_dw_3x3 || flag_dw_5x5; + if (!flag_dw) continue; + + const int dks = dilation * (ks - 1) + 1; + int oh = (ih + 2 * padding - dks) / stride + 1; + int ow = (iw + 2 * padding - dks) / stride + 1; + std::vector input_shape = {n, ic, ih, iw}; + std::vector filter_shape = {oc, ic / group, ks, + ks}; + std::vector bias_shape({1, oc, 1, 1}); + std::vector output_shape({n, oc, oh, ow}); + + Tensor input_fp32, input_int8; + Tensor filter_fp32, filter_int8; + Tensor bias_int32; + Tensor output_int32_ref, output_int32; + Tensor output_fp32_ref, output_fp32; + Tensor output_int8_ref, output_int8; + + input_fp32.Resize(input_shape); + input_int8.Resize(input_shape); + filter_fp32.Resize(filter_shape); + filter_int8.Resize(filter_shape); + bias_int32.Resize(bias_shape); + + output_int32.Resize(output_shape); + output_int32_ref.Resize(output_shape); + output_fp32_ref.Resize(output_shape); + output_fp32.Resize(output_shape); + output_int8_ref.Resize(output_shape); + output_int8.Resize(output_shape); + + float* input_fp32_data = input_fp32.mutable_data(); + int8_t* input_int8_data = + input_int8.mutable_data(); + float* filter_fp32_data = + filter_fp32.mutable_data(); + int8_t* filter_int8_data = + filter_int8.mutable_data(); + + int* bias_int32_data = bias_int32.mutable_data(); + + for (int i = 0; i < input_fp32.dims().production(); i++) { + input_fp32_data[i] = i % 10 * (i % 3 - 1); + } + for (int i = 0; i < filter_fp32.dims().production(); + i++) { + filter_fp32_data[i] = i % 10 * (i % 3 - 1); + } + for (int i = 0; i < bias_int32.dims().production(); i++) { + bias_int32_data[i] = i % 10 * (i % 3 - 1); + } + + std::vector in_scale; + lite::arm::math::get_tensor_scale( + input_fp32, &in_scale, -1, 127.f); + 
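+                    // Symmetric per-tensor quantization: in_scale[0] is the
+                    // fp32->int8 scale (presumably max|x| / 127, given the
+                    // 127.f argument); the input is quantized with it below.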
lite::arm::math::trans_tensor_fp32_to_int8( + &input_fp32, &input_int8, in_scale[0]); + + std::vector w_scale; + lite::arm::math::get_tensor_scale( + filter_fp32, &w_scale, -1, 127.f); + int axis_size = oc; + int inner_size = ic / group * ks * ks; + w_scale = lite::arm::math::get_tensor_scale_n( + filter_fp32_data, axis_size, inner_size, 127.f); + lite::arm::math::fp32_to_int8( + filter_fp32_data, filter_int8_data, w_scale.data(), + axis_size, 1, inner_size); + + operators::ConvParam param; + param.x = &input_int8; + param.filter = &filter_int8; + if (flag_bias) { + param.bias = &bias_int32; + } + param.fuse_relu = false; + param.paddings = std::vector({padding, padding}); + param.strides = std::vector({stride, stride}); + param.dilations = std::vector({dilation, dilation}); + param.groups = group; + param.output = &output_int32_ref; + conv_compute_ref(param); + + int* output_int32_ref_data = + output_int32_ref.mutable_data(); + + // ============ int8depthwise_int32 ============ + param.output = &output_int32; + std::unique_ptr ctx_int32( + new KernelContext); + lite::arm::math::DepthwiseConvInt8 + int8depthwise_int32; + int8depthwise_int32.init(param, + &ctx_int32->As()); + int8depthwise_int32.create(param, + &ctx_int32->As()); + int8depthwise_int32.run(param); + int* output_int32_data = output_int32.mutable_data(); + for (int i = 0; i < output_int32.dims().production(); + i++) { + EXPECT_NEAR(output_int32_data[i], + output_int32_ref_data[i], 1e-3); + } + + // ============ int8depthwise_int8============ + int8_t* output_int8_ref_data = + output_int8_ref.mutable_data(); + lite::arm::math::trans_tensor_int32_to_int8( + &output_int32_ref, &output_int8_ref, in_scale[0], 1, + w_scale); + param.output = &output_int8; + param.input_scale = in_scale[0]; + param.output_scale = 1; + param.weight_scale = w_scale; + std::unique_ptr ctx_int8( + new KernelContext); + lite::arm::math::DepthwiseConvInt8 + int8depthwise_int8; + int8depthwise_int8.init(param, + &ctx_int8->As()); + int8depthwise_int8.create(param, + &ctx_int8->As()); + int8depthwise_int8.run(param); + int8_t* output_int8_data = + output_int8.mutable_data(); + for (int i = 0; i < output_int8.dims().production(); + i++) { + EXPECT_NEAR(output_int8_data[i], + output_int8_ref_data[i], 1e-3); + } + + // ============int8depthwise_float32 ============ + float* output_fp32_ref_data = + output_fp32_ref.mutable_data(); + lite::arm::math::trans_tensor_int32_to_fp32( + &output_int32_ref, &output_fp32_ref, in_scale[0], + w_scale); + param.output = &output_fp32; + param.input_scale = in_scale[0]; + param.output_scale = 1; + param.weight_scale = w_scale; + std::unique_ptr ctx_fp32( + new KernelContext); + lite::arm::math::DepthwiseConvInt8 + int8depthwise_fp32; + int8depthwise_fp32.init(param, + &ctx_fp32->As()); + int8depthwise_fp32.create(param, + &ctx_fp32->As()); + int8depthwise_fp32.run(param); + float* output_fp32_data = + output_fp32.mutable_data(); + for (int i = 0; i < output_fp32.dims().production(); + i++) { + EXPECT_NEAR(output_fp32_data[i], + output_fp32_ref_data[i], 1e-3); + } + } + } + } + } + } + } + } + } + } + } +} + +TEST(conv_arm, compute) { + DeviceInfo::Init(); +#if 1 + for (auto n : {2}) { + for (auto ic : {6}) { + for (auto oc : {6}) { + for (auto ih : {9}) { + for (auto iw : {9}) { + for (auto flag_bias : {false, true}) { + for (auto flag_relu : {false, true}) { + for (auto depthwise : {false, true}) { + for (auto dilation : {1}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1, 2}) { + for (auto ks : {1, 3, 5}) { 
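+                          // NOTE: this #if 1 branch keeps the sweep small so
+                          // the test stays fast; the #else branch below holds
+                          // the exhaustive shape sweep.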
+#else + for (auto n : {1, 2}) { + for (auto ic : {6, 32 /*, 128*/}) { + for (auto oc : {6, 32 /*, 128*/}) { + for (auto ih : {9, 18 /*, 56 , 112, 224, 512*/}) { + for (auto iw : {9, 18 /*, 56, 112, 224, 512*/}) { + for (auto flag_bias : {false, true}) { + for (auto flag_relu : {false, true}) { + for (auto depthwise : {false, true}) { + for (auto dilation : {1, 2}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1, 2}) { + for (auto ks : {1, 3, 5}) { +#endif + int group = 1; + if (depthwise) { // depthwise convolution ? + group = oc = ic; + } + // get input, filter and output shape + std::vector input_shape = {n, ic, ih, iw}; + std::vector filter_shape = {oc, ic / group, + ks, ks}; + const int dks = dilation * (ks - 1) + 1; + int oh = (ih + 2 * padding - dks) / stride + 1; + int ow = (iw + 2 * padding - dks) / stride + 1; + std::vector output_shape({n, oc, oh, ow}); + // resize input, filter and output + Tensor input; + Tensor filter; + Tensor bias; + Tensor output; + Tensor output_ref; + input.Resize(input_shape); + filter.Resize(filter_shape); + output.Resize(output_shape); + output_ref.Resize(output_shape); + VLOG(3) << "input: " << input.dims(); + VLOG(3) << "filter: " << filter.dims() + << " padding:" << padding + << " stride:" << stride + << " dilation:" << dilation; + VLOG(3) << "output: " << output.dims(); + auto* input_data = input.mutable_data(); + auto* filter_data = filter.mutable_data(); + auto* output_data = output.mutable_data(); + for (int i = 0; i < input.dims().production(); i++) { + float sign = i % 3 == 0 ? -1.0f : 1.0f; + input_data[i] = sign * static_cast(i % 128); + } + for (int i = 0; i < filter.dims().production(); i++) { + filter_data[i] = + i * 0.001f / + static_cast(filter.dims().production()); + } + // prepare kernel params and run + ConvCompute conv; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + conv.SetContext(std::move(ctx)); + operators::ConvParam param; + param.x = &input; + param.filter = &filter; + param.output = &output; + param.bias = nullptr; + if (flag_bias) { + bias.Resize({oc}); + auto* bias_data = bias.mutable_data(); + for (int i = 0; i < bias.dims().production(); i++) { + bias_data[i] = static_cast(i); + } + param.bias = &bias; + } + param.fuse_relu = flag_relu; + param.paddings = std::vector({padding, padding}); + param.strides = std::vector({stride, stride}); + param.dilations = + std::vector({dilation, dilation}); + param.groups = group; + conv.SetParam(param); + conv.Launch(); + // invoking ref implementation and compare results + param.output = &output_ref; + conv_compute_ref(param); + auto* output_ref_data = + output_ref.mutable_data(); + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], + 1e-3); + } + } + } + } + } + } + } + } + } + } + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/dropout_compute.cc b/paddle/fluid/lite/kernels/arm/dropout_compute.cc new file mode 100644 index 00000000000..49e998536a4 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/dropout_compute.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/arm/dropout_compute.h"
+#include <string>
+#include "paddle/fluid/lite/arm/math/funcs.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+void DropoutCompute::Run() {
+  auto& param = Param<operators::DropoutParam>();
+  const float* x_data = param.x->data<float>();
+  float* out_data = param.output->mutable_data<float>();
+  int num = param.x->dims().production();
+  const float prob_data = param.dropout_prob;
+  if (param.dropout_implementation == "upscale_in_train") {
+    lite::arm::math::dropout_up(x_data, out_data, num);
+  } else {
+    lite::arm::math::dropout_down(x_data, out_data, num, prob_data);
+  }
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(dropout, kARM, kFloat, kNCHW,
+                     paddle::lite::kernels::arm::DropoutCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
diff --git a/paddle/fluid/lite/kernels/arm/elementwise_add_compute.h b/paddle/fluid/lite/kernels/arm/dropout_compute.h
similarity index 87%
rename from paddle/fluid/lite/kernels/arm/elementwise_add_compute.h
rename to paddle/fluid/lite/kernels/arm/dropout_compute.h
index 9939509d0be..a421f848873 100644
--- a/paddle/fluid/lite/kernels/arm/elementwise_add_compute.h
+++ b/paddle/fluid/lite/kernels/arm/dropout_compute.h
@@ -22,12 +22,11 @@ namespace lite {
 namespace kernels {
 namespace arm {
 
-class ElementwiseAddCompute
-    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+class DropoutCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
  public:
   void Run() override;
 
-  virtual ~ElementwiseAddCompute() = default;
+  virtual ~DropoutCompute() = default;
 };
 
 }  // namespace arm
diff --git a/paddle/fluid/lite/kernels/arm/dropout_compute_test.cc b/paddle/fluid/lite/kernels/arm/dropout_compute_test.cc
new file mode 100644
index 00000000000..960d47442b6
--- /dev/null
+++ b/paddle/fluid/lite/kernels/arm/dropout_compute_test.cc
@@ -0,0 +1,106 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
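+
+// Tests for the ARM dropout kernel: at inference time "downgrade_in_infer"
+// scales activations by (1 - dropout_prob), while "upscale_in_train" is a
+// pass-through, since the scaling already happened during training.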
+ +#include "paddle/fluid/lite/kernels/arm/dropout_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +TEST(dropout_arm, init) { + DropoutCompute dropout; + ASSERT_EQ(dropout.precision(), PRECISION(kFloat)); + ASSERT_EQ(dropout.target(), TARGET(kARM)); +} + +TEST(dropout, retrive_op) { + auto dropout = + KernelRegistry::Global().Create( + "dropout"); + ASSERT_FALSE(dropout.empty()); + ASSERT_TRUE(dropout.front()); +} + +template +void dropout_compute_ref(const operators::DropoutParam& param) { + const float* x_data = param.x->data(); + float* output_data = param.output->mutable_data(); + int num = param.x->dims().production(); + const float prob_data = param.dropout_prob; + if (param.dropout_implementation.compare( + std::string({"downgrade_in_infer"})) == 0) { + float scale = 1.0 - prob_data; + for (int i = 0; i < num; i++) { + output_data[i] = x_data[i] * scale; + } + } else { + for (int i = 0; i < num; i++) { + output_data[i] = x_data[i]; + } + } +} + +TEST(dropout_arm, compute) { + DropoutCompute dropout; + operators::DropoutParam param; + + lite::Tensor x; + lite::Tensor output; + lite::Tensor output_ref; + + for (auto n : {1, 3, 4}) { + for (auto c : {1, 3, 4}) { + for (auto h : {1, 3, 4}) { + for (auto w : {1, 3, 4}) { + for (auto prob : {0.2f, 0.8f}) + for (auto impl : {std::string({"downgrade_in_infer"}), + std::string({"upscale_in_train"})}) { + x.Resize(DDim(std::vector({n, c, h, w}))); + output.Resize(DDim(std::vector({n, c, h, w}))); + output_ref.Resize(DDim(std::vector({n, c, h, w}))); + auto* x_data = x.mutable_data(); + auto* output_data = output.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < x.dims().production(); i++) { + x_data[i] = i; + } + param.x = &x; + param.output = &output; + param.dropout_prob = prob; + param.dropout_implementation = impl; + dropout.SetParam(param); + dropout.Run(); + param.output = &output_ref; + dropout_compute_ref(param); + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } + } + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/elementwise_add_compute_test.cc b/paddle/fluid/lite/kernels/arm/elementwise_add_compute_test.cc deleted file mode 100644 index 7156d08ce77..00000000000 --- a/paddle/fluid/lite/kernels/arm/elementwise_add_compute_test.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/lite/kernels/arm/elementwise_add_compute.h" -#include -#include -#include "paddle/fluid/lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -TEST(elementwise_add_arm, retrive_op) { - auto elementwise_add = - KernelRegistry::Global().Create( - "elementwise_add"); - ASSERT_FALSE(elementwise_add.empty()); - ASSERT_TRUE(elementwise_add.front()); -} - -TEST(elementwise_add_arm, init) { - ElementwiseAddCompute elementwise_add; - ASSERT_EQ(elementwise_add.precision(), PRECISION(kFloat)); - ASSERT_EQ(elementwise_add.target(), TARGET(kARM)); -} - -template -void elementwise_add_compute_ref(const operators::ElementwiseParam& param) { - const dtype* x_data = param.X->data(); - const dtype* y_data = param.Y->data(); - dtype* out_data = param.Out->mutable_data(); - DDim dim = param.X->dims(); - ASSERT_EQ(dim.data(), param.Out->dims().data()); - for (int i = 0; i < dim.production(); i++) { - out_data[i] = x_data[i] + y_data[i]; - } -} - -TEST(elementwise_add, compute) { - ElementwiseAddCompute elementwise_add; - operators::ElementwiseParam param; - - lite::Tensor x, y, out, out_ref; - x.Resize(DDim(std::vector({2, 3, 4, 5}))); - y.Resize(DDim(std::vector({2, 3, 4, 5}))); - out.Resize(DDim(std::vector({2, 3, 4, 5}))); - out_ref.Resize(DDim(std::vector({2, 3, 4, 5}))); - auto* x_data = x.mutable_data(); - auto* y_data = y.mutable_data(); - auto* out_data = out.mutable_data(); - auto* out_ref_data = out_ref.mutable_data(); - for (int i = 0; i < x.dims().production(); i++) { - x_data[i] = y_data[i] = i; - } - - param.X = &x; - param.Y = &y; - param.Out = &out; - elementwise_add.SetParam(param); - elementwise_add.Run(); - - param.Out = &out_ref; - elementwise_add_compute_ref(param); - for (int i = 0; i < out.dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/elementwise_compute.cc b/paddle/fluid/lite/kernels/arm/elementwise_compute.cc new file mode 100644 index 00000000000..c3b9b41cde1 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/elementwise_compute.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/arm/elementwise_compute.h" +#include +#include "paddle/fluid/lite/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +inline bool is_broadcast(const DDim& x_dims, const DDim& y_dims, int axis, + int* pre, int* n, int* post) { + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + if (x_dims.size() == y_dims.size()) { + return false; + } + *pre = 1; + *n = 1; + *post = 1; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch."; + (*n) *= y_dims[i]; + } + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } + return true; +} + +void ElementwiseAddCompute::Run() { + auto& param = Param(); + const float* x_data = param.X->data(); + const float* y_data = param.Y->data(); + float* out_data = param.Out->mutable_data(); + int axis = param.axis; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_add_broadcast(x_data, y_data, out_data, pre, n, + post); + } else { + lite::arm::math::elementwise_add(x_data, y_data, out_data, + x_dims.production()); + } +} + +void ElementwiseAddActivationCompute::Run() { + auto& param = Param(); + const float* x_data = param.X->data(); + const float* y_data = param.Y->data(); + float* out_data = param.Out->mutable_data(); + int axis = param.axis; + std::string act_type = param.act_type; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + if (act_type == "relu") { + lite::arm::math::elementwise_add_relu_broadcast(x_data, y_data, out_data, + pre, n, post); + } else { + LOG(FATAL) << "unsupported Activation type: " << act_type; + } + } else { + if (act_type == "relu") { + lite::arm::math::elementwise_add_relu(x_data, y_data, out_data, + x_dims.production()); + } else { + LOG(FATAL) << "unsupported Activation type: " << act_type; + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::ElementwiseAddCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + fusion_elementwise_add_activation, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::ElementwiseAddActivationCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/elementwise_compute.h b/paddle/fluid/lite/kernels/arm/elementwise_compute.h new file mode 100644 index 00000000000..bb80c61221e --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/elementwise_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+class ElementwiseAddCompute
+    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  void Run() override;
+
+  virtual ~ElementwiseAddCompute() = default;
+};
+
+class ElementwiseAddActivationCompute
+    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  void Run() override;
+
+  virtual ~ElementwiseAddActivationCompute() = default;
+};
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/kernels/arm/elementwise_compute_test.cc b/paddle/fluid/lite/kernels/arm/elementwise_compute_test.cc
new file mode 100644
index 00000000000..9f87e6628bc
--- /dev/null
+++ b/paddle/fluid/lite/kernels/arm/elementwise_compute_test.cc
@@ -0,0 +1,292 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/arm/elementwise_compute.h"
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+TEST(elementwise_add_arm, retrive_op) {
+  auto elementwise_add =
+      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
+          "elementwise_add");
+  ASSERT_FALSE(elementwise_add.empty());
+  ASSERT_TRUE(elementwise_add.front());
+}
+
+TEST(elementwise_add_arm, init) {
+  ElementwiseAddCompute elementwise_add;
+  ASSERT_EQ(elementwise_add.precision(), PRECISION(kFloat));
+  ASSERT_EQ(elementwise_add.target(), TARGET(kARM));
+}
+
+template <typename dtype>
+void elementwise_compute_ref(const operators::ElementwiseParam& param,
+                             const std::string elt_type,
+                             const std::string act_type) {
+  const dtype* x_data = param.X->data<dtype>();
+  const dtype* y_data = param.Y->data<dtype>();
+  dtype* out_data = param.Out->mutable_data<dtype>();
+  auto x_dims = param.X->dims();
+  auto y_dims = param.Y->dims();
+  int axis = param.axis;
+  if (axis < 0) {
+    axis = x_dims.size() - y_dims.size();
+  }
+  int batch = 1;
+  int channels = 1;
+  int num = 1;
+  for (int i = 0; i < axis; ++i) {
+    batch *= x_dims[i];
+  }
+  for (int i = 0; i < y_dims.size(); ++i) {
+    channels *= y_dims[i];
+  }
+  for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
+    num *= x_dims[i];
+  }
+  // do elementwise add/sub/max...
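+  // e.g. with x_dims = {2, 3, 4, 5}, y_dims = {3} and axis = 1 the loops
+  // above give batch = 2, channels = 3, num = 20: each of y's 3 values is
+  // applied to a contiguous run of 4 * 5 = 20 elements.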
+ if (elt_type == "add") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr + diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "sub") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr - diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Elementwise type: " << elt_type; + } + // do activation relu/sigmod... + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? *dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Activation type: " << elt_type; + } + } +} + +TEST(elementwise_add, compute) { + ElementwiseAddCompute elementwise_add; + operators::ElementwiseParam param; + lite::Tensor x, y, output, output_ref; + +#if 1 + for (auto n : {1, 3, 4}) { + for (auto c : {1, 3, 4}) { + for (auto h : {1, 3, 4}) { + for (auto w : {1, 3, 4}) { + for (auto axis : {-1, 0, 1, 3}) { + for (auto yd : + {std::vector({n}), std::vector({c}), + std::vector({h}), std::vector({w}), + std::vector({n, c}), std::vector({c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { +#else + for (auto n : {1, 3, 4, 11}) { + for (auto c : {1, 3, 4, 11}) { + for (auto h : {1, 3, 4, 11}) { + for (auto w : {1, 3, 4, 11}) { + for (auto axis : {-1, 0, 1, 2, 3}) { + for (auto yd : + {std::vector({n}), std::vector({c}), + std::vector({h}), std::vector({w}), + std::vector({n, c}), std::vector({c, h}), + std::vector({h, w}), std::vector({n, c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { +#endif + auto x_dim = DDim(std::vector({n, c, h, w})); + auto y_dim = DDim(yd); + int axis_t = axis < 0 ? 
x_dim.size() - y_dim.size() : axis; + + if (axis_t + y_dim.size() > 4) continue; + bool flag = false; + for (int i = 0; i < y_dim.size(); i++) { + if (x_dim[i + axis_t] != y_dim[i]) flag = true; + } + if (flag) continue; + + x.Resize(x_dim); + y.Resize(y_dim); + output.Resize(x_dim); + output_ref.Resize(x_dim); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* output_data = output.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < x_dim.production(); i++) { + x_data[i] = i; + } + for (int i = 0; i < y_dim.production(); i++) { + y_data[i] = i; + } + param.X = &x; + param.Y = &y; + param.axis = axis; + param.Out = &output; + elementwise_add.SetParam(param); + elementwise_add.Run(); + param.Out = &output_ref; + elementwise_compute_ref(param, "add", ""); + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } + } + } + } + } + } +} + +TEST(fusion_elementwise_add_activation_arm, retrive_op) { + auto fusion_elementwise_add_activation = + KernelRegistry::Global().Create( + "fusion_elementwise_add_activation"); + ASSERT_FALSE(fusion_elementwise_add_activation.empty()); + ASSERT_TRUE(fusion_elementwise_add_activation.front()); +} + +TEST(fusion_elementwise_add_activation_arm, init) { + ElementwiseAddActivationCompute fusion_elementwise_add_activation; + ASSERT_EQ(fusion_elementwise_add_activation.precision(), PRECISION(kFloat)); + ASSERT_EQ(fusion_elementwise_add_activation.target(), TARGET(kARM)); +} + +TEST(fusion_elementwise_add_activation_arm, compute) { + ElementwiseAddActivationCompute fusion_elementwise_add_activation; + operators::FusionElementwiseActivationParam param; + lite::Tensor x, y, output, output_ref; + +#if 1 + for (auto act_type : {"relu"}) { + for (auto n : {1, 3, 4}) { + for (auto c : {1, 3, 4}) { + for (auto h : {1, 3, 4}) { + for (auto w : {1, 3, 4}) { + for (auto axis : {-1, 0, 1, 3}) { + for (auto yd : + {std::vector({n}), std::vector({c}), + std::vector({h}), std::vector({w}), + std::vector({n, c}), std::vector({h, w}), + std::vector({n, c, h}), + std::vector({n, c, h, w})}) { +#else + for (auto act_type : {"relu"}) { + for (auto n : {1, 3, 4, 11}) { + for (auto c : {1, 3, 4, 11}) { + for (auto h : {1, 3, 4, 11}) { + for (auto w : {1, 3, 4, 11}) { + for (auto axis : {-1, 0, 1, 2, 3}) { + for (auto yd : + {std::vector({n}), std::vector({c}), + std::vector({h}), std::vector({w}), + std::vector({n, c}), std::vector({c, h}), + std::vector({h, w}), + std::vector({n, c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { +#endif + auto x_dim = DDim(std::vector({n, c, h, w})); + auto y_dim = DDim(yd); + int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis; + + if (axis_t + y_dim.size() > 4) continue; + bool flag = false; + for (int i = 0; i < y_dim.size(); i++) { + if (x_dim[i + axis_t] != y_dim[i]) flag = true; + } + if (flag) continue; + + x.Resize(x_dim); + y.Resize(y_dim); + output.Resize(x_dim); + output_ref.Resize(x_dim); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* output_data = output.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < x_dim.production(); i++) { + float sign = i % 3 == 0 ? -1.0f : 1.0f; + x_data[i] = i * sign; + } + for (int i = 0; i < y_dim.production(); i++) { + float sign = i % 2 == 0 ? 
0.5f : -0.5f; + y_data[i] = i * sign; + } + param.X = &x; + param.Y = &y; + param.axis = axis; + param.Out = &output; + param.act_type = act_type; + fusion_elementwise_add_activation.SetParam(param); + fusion_elementwise_add_activation.Run(); + param.Out = &output_ref; + elementwise_compute_ref(param, "add", act_type); + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } + } + } + } + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.cc b/paddle/fluid/lite/kernels/arm/fc_compute.cc index b26551e0533..41bd914c9d2 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute.cc +++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc @@ -13,7 +13,11 @@ // limitations under the License. #include "paddle/fluid/lite/kernels/arm/fc_compute.h" +#include +#include "paddle/fluid/lite/api/paddle_place.h" #include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/arm/math/gemm_prepacked_int8.h" +#include "paddle/fluid/lite/arm/math/gemv_arm_int8.h" #include "paddle/fluid/lite/core/op_registry.h" #include "paddle/fluid/lite/core/type_system.h" @@ -22,47 +26,163 @@ namespace lite { namespace kernels { namespace arm { -void FcCompute::Run() { +void FcCompute::PrepareForRun() { auto& param = this->Param(); auto x_dims = param.input->dims(); auto w_dims = param.w->dims(); + auto& ctx = this->ctx_->template As(); + CHECK_GE(x_dims.size(), 2UL); CHECK_EQ(w_dims.size(), 2UL); CHECK_EQ(param.output->dims().size(), 2UL); + m_ = x_dims.Slice(0, param.in_num_col_dims).production(); + k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production(); + n_ = w_dims[1]; + CHECK_EQ(k_, static_cast(w_dims[0])); + + if (m_ == 1) { + if (!transed_weight_) { + transed_weight_ = new Tensor; + } + transed_weight_->Resize({n_, k_}); + const auto* w_data = param.w->data(); + auto* t_data = transed_weight_->mutable_data(); + int i = 0; + + for (int nn = 0; nn < n_; ++nn) { + for (int kk = 0; kk < k_; ++kk) { + t_data[i++] = w_data[kk * n_ + nn]; + } + } + } + + if (m_ > 1) { + int hblock = lite::arm::math::get_hblock(ctx.arch()); + int m_round = hblock * ((m_ + hblock - 1) / hblock); + ctx.ExtendWorkspace(DDimLite(std::vector({m_round * k_}))); + } +} + +void FcCompute::Run() { + auto& param = this->Param(); + const auto* i_data = param.input->data(); const auto* w_data = param.w->data(); const auto* b_data = param.bias ? 
param.bias->data() : nullptr; auto* o_data = param.output->mutable_data(); - int x_h = x_dims.Slice(0, param.in_num_col_dims).production(); - int x_w = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production(); - int n = w_dims[1]; - CHECK_EQ(x_w, static_cast(w_dims[0])); auto& ctx = this->ctx_->template As(); - if (x_h > 1) { - float* packed_in = static_cast(ctx.workspace_data()) + - ctx.l2_cache_size() / sizeof(float); - lite::arm::math::prepackA(packed_in, i_data, x_w, 0, x_h, 0, x_w, false, - &ctx); - lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, x_h, n, - x_w, false, false, false, &ctx); - + if (m_ > 1) { + float* packed_in = + ctx.workspace_data() + ctx.l2_cache_size() / sizeof(float); + lite::arm::math::prepackA(packed_in, i_data, k_, 0, m_, 0, k_, false, &ctx); + lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, m_, n_, + k_, false, false, false, &ctx); if (param.bias) { - CHECK_EQ(param.bias->numel(), n); - lite::arm::math::fill_bias_fc(o_data, b_data, x_h, n); + CHECK_EQ(param.bias->numel(), n_); + lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_); } } else { - // use sgemmv - // sgemv((const float*)weights, (const float*)din, (float*)dout, - // false, n, x_w, _param->_flag_bias, (float*)bias, false); + CHECK(transed_weight_); + const auto* t_data = transed_weight_->data(); + + lite::arm::math::sgemv(t_data, i_data, o_data, false, n_, k_, + b_data != nullptr, b_data, false); } } -TargetType FcCompute::target() const { return TARGET(kARM); } +template +void FcComputeInt8::PrepareForRun() { + auto& param = this->Param(); + auto x_dims = param.input->dims(); + auto w_dims = param.w->dims(); + + auto& ctx = this->ctx_->template As(); + if (!tmp_int32_out_) { + tmp_int32_out_ = new Tensor; + tmp_int32_out_->Resize(param.output->dims()); + } + + CHECK_GE(x_dims.size(), 2UL); + CHECK_EQ(w_dims.size(), 2UL); + CHECK_EQ(param.output->dims().size(), 2UL); -PrecisionType FcCompute::precision() const { return PRECISION(kFloat); } + this->m_ = x_dims.Slice(0, param.in_num_col_dims).production(); + this->k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production(); + this->n_ = w_dims[1]; + CHECK_EQ(k_, static_cast(w_dims[0])); + + if (this->m_ == 1) { + if (!this->transed_weight_) { + this->transed_weight_ = new Tensor; + } + this->transed_weight_->Resize({this->n_, this->k_}); + const auto* w_data = param.w->template data(); + auto* t_data = this->transed_weight_->template mutable_data(); + int i = 0; + + for (int nn = 0; nn < this->n_; ++nn) { + for (int kk = 0; kk < this->k_; ++kk) { + t_data[i++] = w_data[kk * this->n_ + nn]; + } + } + } + + if (this->m_ > 1) { + int hblock = lite::arm::math::get_hblock(ctx.arch()); + int m_round = hblock * ((this->m_ + hblock - 1) / hblock); + ctx.ExtendWorkspace(DDimLite(std::vector({m_round * this->k_}))); + } +} + +template +void FcComputeInt8::Run() { + auto& param = this->Param(); + + const auto* i_data = param.input->template data(); + const auto* w_data = param.w->template data(); + const auto* b_data = param.bias ? 
param.bias->template data() : nullptr; + int* o_data = nullptr; + + auto& ctx = this->ctx_->template As(); + + o_data = this->tmp_int32_out_->template mutable_data(); + if (m_ > 1) { + int8_t* packed_in = + static_cast(ctx.template workspace_data()) + + ctx.l2_cache_size() / sizeof(int8_t); + lite::arm::math::prepackA_int8(packed_in, i_data, k_, 0, m_, 0, k_, false); + lite::arm::math::gemm_prepack_int8(packed_in, w_data, b_data, o_data, m_, + n_, k_, false, false, false, nullptr, + &ctx); + if (param.bias) { + CHECK_EQ(param.bias->numel(), n_); + lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_); + } + } else { + CHECK(transed_weight_); + const auto* t_data = transed_weight_->template data(); + lite::arm::math::gemv_int8(t_data, i_data, o_data, false, n_, k_, nullptr, + b_data != nullptr, b_data, false); + } + + float i_scale = param.input_scale; + std::vector weight_scale = param.weight_scale; + if (Ptype_out == PRECISION(kInt8)) { + float o_scale = param.output_scale; + param.output->template mutable_data(); + lite::arm::math::trans_tensor_dtype( + tmp_int32_out_, param.output, i_scale, o_scale, weight_scale); + } else if (Ptype_out == PRECISION(kFloat)) { + param.output->template mutable_data(); + lite::arm::math::trans_tensor_dtype( + tmp_int32_out_, param.output, i_scale, 1.f, weight_scale); + } else { + LOG(ERROR) << "unsupported precision type!!"; + } +} } // namespace arm } // namespace kernels @@ -76,3 +196,21 @@ REGISTER_LITE_KERNEL(fc, kARM, kFloat, kNCHW, .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +REGISTER_LITE_KERNEL( + fc, kARM, kInt8, kNCHW, + paddle::lite::kernels::arm::FcComputeInt8, int8out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + fc, kARM, kInt8, kNCHW, + paddle::lite::kernels::arm::FcComputeInt8, fp32out) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.h b/paddle/fluid/lite/kernels/arm/fc_compute.h index 41451784335..cfbcaa6939b 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute.h +++ b/paddle/fluid/lite/kernels/arm/fc_compute.h @@ -13,6 +13,8 @@ // limitations under the License. 
#pragma once +#include +#include "paddle/fluid/lite/arm/math/type_trans.h" #include "paddle/fluid/lite/core/kernel.h" #include "paddle/fluid/lite/operators/fc_op.h" @@ -25,12 +27,40 @@ class FcCompute : public KernelLite { public: using param_t = operators::FcParam; + void PrepareForRun() override; + + void Run() override; + + ~FcCompute() override { + if (transed_weight_) { + delete transed_weight_; + } + }; + + private: + lite::Tensor* transed_weight_{nullptr}; + int m_, n_, k_; +}; + +template +class FcComputeInt8 : public KernelLite { + public: + using param_t = operators::FcParam; + + void PrepareForRun() override; + void Run() override; - TargetType target() const override; - PrecisionType precision() const override; + ~FcComputeInt8() override { + if (transed_weight_) { + delete transed_weight_; + } + }; - virtual ~FcCompute() = default; + private: + lite::Tensor* transed_weight_{nullptr}; + Tensor* tmp_int32_out_{nullptr}; + int m_, n_, k_; }; } // namespace arm diff --git a/paddle/fluid/lite/kernels/arm/fc_compute_test.cc b/paddle/fluid/lite/kernels/arm/fc_compute_test.cc index 2e85fccf7d6..65ccde50dbe 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute_test.cc +++ b/paddle/fluid/lite/kernels/arm/fc_compute_test.cc @@ -14,6 +14,11 @@ #include "paddle/fluid/lite/kernels/arm/fc_compute.h" #include +#include +#include +#include +#include +#include #include #include "paddle/fluid/lite/arm/math/funcs.h" #include "paddle/fluid/lite/core/op_registry.h" @@ -23,6 +28,46 @@ namespace lite { namespace kernels { namespace arm { +#define A(i, j) a[i * lda + j] +#define B(i, j) b[i * ldb + j] +#define C(i, j) c[i * ldc + j] + +template +void gemm_bias(const T* a, const int M, const int K, const T* b, const int K_, + const int N, T* biases, T* c) { + EXPECT_TRUE(K_ == K && M > 0 && N > 0 && K > 0); + EXPECT_TRUE(a && b && c); + const int lda = K; + const int ldb = N; + const int ldc = N; + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + C(m, n) = 0.0f; + for (int k = 0; k < K; ++k) { + C(m, n) += A(m, k) * B(k, n); + } + } + } + if (biases) { + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + C(m, n) += biases[n]; + } + } + } +} + +template +void FillData(T* a, const int n, const T lower = static_cast(-2.f), + const T upper = static_cast(2.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + TEST(fc_arm, retrive_op) { auto fc = KernelRegistry::Global().Create("fc"); @@ -37,108 +82,117 @@ TEST(fc_arm, init) { } TEST(fc_arm, compare_test) { - lite::Tensor x, w, b, out, ref; - constexpr int batch_size = 2; - x.Resize({batch_size, 3}); - w.Resize({3, 4}); - b.Resize({1, 4}); - out.Resize({batch_size, 4}); - ref.Resize({batch_size, 4}); - - auto x_data = x.mutable_data(); - auto w_data = w.mutable_data(); - auto b_data = b.mutable_data(); - auto out_data = out.mutable_data(); - auto ref_data = ref.mutable_data(); - - for (int64_t i = 0; i < x.dims().product(); i++) { - x_data[i] = static_cast(i); - } - for (int64_t i = 0; i < w.dims().product(); i++) { - w_data[i] = static_cast(i); - } - for (int64_t i = 0; i < b.dims().product(); i++) { - b_data[i] = static_cast(i); - } - - lite::arm::math::fc_compute_eigen(x_data, batch_size, 3, // - w_data, 3, 4, // - b_data, ref_data); - - // fc compute kernel - FcCompute fc; - operators::FcParam param; - - param.in_num_col_dims = 1; - param.input = &x; - 
param.w = &w; - param.bias = &b; - param.output = &out; - param.in_mat_dims = x.dims(); - - DeviceInfo::Init(); - std::unique_ptr ctx(new KernelContext); - ctx->As(); - fc.SetParam(param); - fc.SetContext(std::move(ctx)); - fc.Run(); - - VLOG(3) << "output vs ref"; - for (int i = 0; i < out.dims().product(); i++) { - VLOG(3) << out_data[i] << " vs " << ref_data[i]; - } - - for (int i = 0; i < out.dims().product(); ++i) { - EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + using T = float; + + for (int m : {1, 2, 3, 4}) { + for (int n : {1, 2, 3, 4}) { + for (int k : {1, 2, 3, 4}) { + for (bool with_bias : {true, false}) { + VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k + << (with_bias ? ", with bias" : ""); + lite::Tensor x, w, b, out, ref; + + x.Resize({m, k}); + w.Resize({k, n}); + b.Resize({1, n}); + out.Resize({m, n}); + ref.Resize({m, n}); + + auto* x_data = x.mutable_data(); + auto* w_data = w.mutable_data(); + auto* b_data = with_bias ? b.mutable_data() : nullptr; + + auto* out_data = out.mutable_data(); + auto* ref_data = ref.mutable_data(); + + FillData(x_data, x.dims().production()); + FillData(w_data, w.dims().production()); + FillData(out_data, out.dims().production(), 0, 0); + FillData(ref_data, ref.dims().production(), 0, 0); + + if (with_bias) { + FillData(b_data, b.dims().production()); + } + + FcCompute fc; + operators::FcParam param; + + param.input = &x; + param.w = &w; + param.bias = with_bias ? &b : nullptr; + param.output = &out; + param.in_num_col_dims = 1; + param.in_mat_dims = x.dims(); + + DeviceInfo::Init(); + std::unique_ptr ctx(new KernelContext); + ctx->As(); + fc.SetParam(param); + fc.SetContext(std::move(ctx)); + fc.PrepareForRun(); + fc.Run(); + + gemm_bias(x_data, m, k, w_data, k, n, b_data, ref_data); + + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-3); + } + } + } + } } } TEST(fc_arm, num_col_dims) { - FcCompute fc; - operators::FcParam param; - - lite::Tensor x; - lite::Tensor w; - lite::Tensor bias; - lite::Tensor output; - - x.Resize({1, 2, 3}); - w.Resize({3, 4}); - bias.Resize({1, 4}); - output.Resize({2, 4}); - - auto* x_data = x.mutable_data(); - auto* w_data = w.mutable_data(); - auto* bias_data = bias.mutable_data(); - auto* output_data = output.mutable_data(); - - for (int64_t i = 0; i < x.dims().product(); i++) { - x_data[i] = static_cast(i); + using T = float; + + for (bool with_bias : {true, false}) { + lite::Tensor x, w, b, out, ref; + + x.Resize({1, 2, 3}); + w.Resize({3, 4}); + b.Resize({1, 4}); + out.Resize({2, 4}); + ref.Resize({2, 4}); + + auto* x_data = x.mutable_data(); + auto* w_data = w.mutable_data(); + auto* b_data = with_bias ? b.mutable_data() : nullptr; + + auto* out_data = out.mutable_data(); + auto* ref_data = ref.mutable_data(); + + FillData(x_data, x.dims().production()); + FillData(w_data, w.dims().production()); + FillData(out_data, out.dims().production(), 0, 0); + FillData(ref_data, ref.dims().production(), 0, 0); + if (with_bias) { + FillData(b_data, b.dims().production()); + } + FcCompute fc; + operators::FcParam param; + param.input = &x; + param.w = &w; + param.bias = with_bias ? 
&b : nullptr; + param.output = &out; + param.in_num_col_dims = 2; + param.in_mat_dims = x.dims(); + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + DeviceInfo::Init(); + + fc.SetParam(param); + fc.SetContext(std::move(ctx)); + fc.PrepareForRun(); + fc.Run(); + + gemm_bias(x_data, 2, 3, w_data, 3, 4, b_data, ref_data); + + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-3); + } } - for (int64_t i = 0; i < w.dims().product(); i++) { - w_data[i] = static_cast(i); - } - for (int64_t i = 0; i < bias.dims().product(); i++) { - bias_data[i] = static_cast(i); - } - for (int64_t i = 0; i < output.dims().product(); i++) { - output_data[i] = static_cast(i); - } - - param.in_num_col_dims = 2; - param.input = &x; - param.w = &w; - param.bias = &bias; - param.output = &output; - param.in_mat_dims = x.dims(); - - std::unique_ptr ctx(new KernelContext); - ctx->As(); - DeviceInfo::Init(); - - fc.SetParam(param); - fc.SetContext(std::move(ctx)); - fc.Run(); } } // namespace arm diff --git a/paddle/fluid/lite/kernels/arm/mul_compute.cc b/paddle/fluid/lite/kernels/arm/mul_compute.cc index ff12b236031..c721e8046e7 100644 --- a/paddle/fluid/lite/kernels/arm/mul_compute.cc +++ b/paddle/fluid/lite/kernels/arm/mul_compute.cc @@ -12,57 +12,61 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/kernels/arm/mul_compute.h" +#include +#include "paddle/fluid/lite/arm/math/funcs.h" #include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/types.h" +#include "paddle/fluid/lite/core/type_system.h" namespace paddle { namespace lite { namespace kernels { namespace arm { -template -void mul_compute_eigen(const T* x, int x_h, int x_w, const T* y, int y_h, - int y_w, T* out) { - using matrix_t = - Eigen::Matrix; +void MulCompute::PrepareForRun() { + auto& ctx = this->ctx_->template As(); +} - Eigen::Map X(x, x_h, x_w); - Eigen::Map Y(y, y_h, y_w); - Eigen::Map Out(out, x_h, y_w); +void MulCompute::Run() { + auto& param = Param(); - Out = X * Y; -} + const auto* x_data = param.x->data(); + const auto* y_data = param.y->data(); + auto* o_data = param.output->mutable_data(); -class MulCompute : public KernelLite { - public: - using param_t = operators::MulParam; + m_ = static_cast( + param.x->dims().Slice(0, param.x_num_col_dims).production()); + int x_w = + static_cast(param.x->dims() + .Slice(param.x_num_col_dims, param.x->dims().size()) + .production()); + int y_h = static_cast( + param.y->dims().Slice(0, param.y_num_col_dims).production()); + n_ = static_cast(param.y->dims() + .Slice(param.y_num_col_dims, param.y->dims().size()) + .production()); - void Run() override { - auto& param = Param(); - core::dim2 x_shape( - {static_cast( - param.x->dims().Slice(0, param.x_num_col_dims).production()), - static_cast( - param.x->dims() - .Slice(param.x_num_col_dims, param.x->dims().size()) - .production())}); - core::dim2 y_shape( - {static_cast( - param.y->dims().Slice(0, param.y_num_col_dims).production()), - static_cast( - param.y->dims() - .Slice(param.y_num_col_dims, param.y->dims().size()) - .production())}); + CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; + k_ = x_w; - mul_compute_eigen(param.x->data(), x_shape.x, x_shape.y, // - param.y->data(), y_shape.x, y_shape.y, // - param.output->mutable_data()); - } + if (n_ == 1) { + lite::arm::math::sgemv(x_data, y_data, o_data, false, m_, k_, false, + nullptr, 
false); - virtual ~MulCompute() = default; -}; + } else { + constexpr bool is_tranposed_y = false; + auto& ctx = this->ctx_->template As(); + int hblock = lite::arm::math::get_hblock(ctx.arch()); + int m_round = hblock * ((m_ + hblock - 1) / hblock); + ctx.ExtendWorkspace(DDimLite(std::vector({m_round * k_}))); + + float* packed_x = static_cast(ctx.workspace_data()) + + ctx.l2_cache_size() / sizeof(float); + lite::arm::math::prepackA(packed_x, x_data, k_, 0, m_, 0, k_, false, &ctx); + lite::arm::math::sgemm_prepack(packed_x, y_data, nullptr, o_data, m_, n_, + k_, false, false, is_tranposed_y, &ctx); + } +} } // namespace arm } // namespace kernels diff --git a/paddle/fluid/lite/kernels/arm/mul_compute.h b/paddle/fluid/lite/kernels/arm/mul_compute.h new file mode 100644 index 00000000000..64c8f813d4e --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/mul_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class MulCompute : public KernelLite { + public: + using param_t = operators::MulParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~MulCompute() = default; + + private: + int m_, n_, k_; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/mul_compute_test.cc b/paddle/fluid/lite/kernels/arm/mul_compute_test.cc new file mode 100644 index 00000000000..9a0deec2a1d --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/mul_compute_test.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/arm/mul_compute.h" +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +#define A(i, j) a[i * lda + j] +#define B(i, j) b[i * ldb + j] +#define C(i, j) c[i * ldc + j] + +template +void mul_gemm(const T* a, const int M, const int K, const T* b, const int K_, + const int N, T* c) { + EXPECT_TRUE(K_ == K && M > 0 && N > 0 && K > 0); + EXPECT_TRUE(a && b && c); + const int lda = K; + const int ldb = N; + const int ldc = N; + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + C(m, n) = 0.0f; + for (int k = 0; k < K; ++k) { + C(m, n) += A(m, k) * B(k, n); + } + } + } +} + +template +void FillData(T* a, const int n, const T lower = static_cast(-2.f), + const T upper = static_cast(2.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +TEST(mul_arm, retrive_op) { + auto mul = + KernelRegistry::Global().Create("mul"); + ASSERT_FALSE(mul.empty()); + ASSERT_TRUE(mul.front()); +} + +TEST(mul_arm, init) { + MulCompute mul; + ASSERT_EQ(mul.precision(), PRECISION(kFloat)); + ASSERT_EQ(mul.target(), TARGET(kARM)); +} + +TEST(mul_arm, compare_test) { + using T = float; + + for (int m : {1, 2, 3, 4}) { + for (int n : {1, 2, 3, 4}) { + for (int k : {1, 2, 3, 4}) { + VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k; + lite::Tensor x, y, out, ref; + x.Resize({m, k}); + y.Resize({k, n}); + out.Resize({m, n}); + ref.Resize({m, n}); + + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_data = out.mutable_data(); + auto* ref_data = ref.mutable_data(); + + FillData(x_data, x.dims().production()); + FillData(y_data, y.dims().production()); + FillData(out_data, out.dims().production(), 0, 0); + FillData(ref_data, ref.dims().production(), 0, 0); + + MulCompute mul; + operators::MulParam param; + + param.x = &x; + param.y = &y; + param.output = &out; + + DeviceInfo::Init(); + std::unique_ptr ctx(new KernelContext); + ctx->As(); + mul.SetParam(param); + mul.SetContext(std::move(ctx)); + mul.PrepareForRun(); + + mul.Run(); + + mul_gemm(x_data, m, k, y_data, k, n, ref_data); + + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-3); + } + } + } + } +} + +TEST(mul_arm, num_col_dims) { + using T = float; + + lite::Tensor x, y, out, ref; + x.Resize({2, 3, 4}); + y.Resize({3, 4, 5}); + out.Resize({2, 5}); + ref.Resize({2, 5}); + + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_data = out.mutable_data(); + auto* ref_data = ref.mutable_data(); + + FillData(x_data, x.dims().production()); + FillData(y_data, y.dims().production()); + FillData(out_data, out.dims().production()); + FillData(ref_data, out.dims().production()); + + MulCompute mul; + operators::MulParam param; + + param.x = &x; + param.y = &y; + param.output = &out; + param.x_num_col_dims = 1; + param.y_num_col_dims = 2; + + DeviceInfo::Init(); + std::unique_ptr ctx(new KernelContext); + ctx->As(); + mul.SetParam(param); + mul.SetContext(std::move(ctx)); + mul.PrepareForRun(); + + mul.Run(); + + mul_gemm(x_data, 2, 12, y_data, 12, 5, ref_data); + + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 
1e-3); + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/pool_compute.cc b/paddle/fluid/lite/kernels/arm/pool_compute.cc new file mode 100644 index 00000000000..0b5eb6ac847 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/pool_compute.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/pool_compute.h" +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void PoolCompute::PrepareForRun() { + auto& ctx = this->ctx_->template As(); +} + +void PoolCompute::Run() { + auto& param = Param(); + auto& in_dims = param.x->dims(); + auto& out_dims = param.output->dims(); + + const float* din = param.x->data(); + float* dout = param.output->mutable_data(); + + std::vector& ksize = param.ksize; + std::vector& strides = param.strides; + std::vector& paddings = param.paddings; + + std::string& pooling_type = param.pooling_type; + bool global_pooling = param.global_pooling; + bool exclusive = param.exclusive; + bool adaptive = param.adaptive; + bool ceil_mode = param.ceil_mode; + bool use_quantizer = param.use_quantizer; + std::string& data_format = param.data_format; + + bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && + (paddings[0] == paddings[1]); + + if (global_pooling) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_dims[i + 2]); + } + if (pooling_type == "max") { + lite::arm::math::pooling_global_max(din, dout, out_dims[0], out_dims[1], + out_dims[2], out_dims[3], in_dims[1], + in_dims[2], in_dims[3]); + VLOG(3) << "invoking pooling_global_max"; + return; + } else if (pooling_type == "avg") { + lite::arm::math::pooling_global_avg(din, dout, out_dims[0], out_dims[1], + out_dims[2], out_dims[3], in_dims[1], + in_dims[2], in_dims[3]); + VLOG(3) << "invoking pooling_global_avg"; + return; + } + } else { + if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && kps_equal) { + if (pooling_type == "max") { + lite::arm::math::pooling2x2s2_max(din, dout, out_dims[0], out_dims[1], + out_dims[2], out_dims[3], in_dims[1], + in_dims[2], in_dims[3]); + VLOG(3) << "invoking pooling2x2s2_max"; + return; + } else if (pooling_type == "avg") { + lite::arm::math::pooling2x2s2_avg(din, dout, out_dims[0], out_dims[1], + out_dims[2], out_dims[3], in_dims[1], + in_dims[2], in_dims[3], exclusive); + VLOG(3) << "invoking pooling2x2s2_avg"; + return; + } + } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 && + kps_equal) { + if (pooling_type == "max") { + lite::arm::math::pooling3x3s1p1_max(din, dout, out_dims[0], out_dims[1], + out_dims[2], out_dims[3], + in_dims[1],
in_dims[2], in_dims[3]); + VLOG(3) << "invoking pooling3x3s1p1_max"; + return; + } else if (pooling_type == "avg") { + lite::arm::math::pooling3x3s1p1_avg( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], exclusive); + VLOG(3) << "invoking pooling3x3s1p1_avg"; + return; + } + } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 && + kps_equal) { + if (pooling_type == "max") { + lite::arm::math::pooling3x3s2p0_max(din, dout, out_dims[0], out_dims[1], + out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3]); + VLOG(3) << "invoking pooling3x3s2p0_max"; + return; + } else if (pooling_type == "avg") { + lite::arm::math::pooling3x3s2p0_avg( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], exclusive); + VLOG(3) << "invoking pooling3x3s2p0_avg"; + return; + } + } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 && + kps_equal) { + if (pooling_type == "max") { + lite::arm::math::pooling3x3s2p1_max(din, dout, out_dims[0], out_dims[1], + out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3]); + VLOG(3) << "invoking pooling3x3s2p1_max"; + return; + } else if (pooling_type == "avg") { + lite::arm::math::pooling3x3s2p1_avg( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], + in_dims[1], in_dims[2], in_dims[3], exclusive); + VLOG(3) << "invoking pooling3x3s2p1_avg"; + return; + } + } + } + lite::arm::math::pooling_basic( + din, dout, out_dims[0], out_dims[1], out_dims[2], out_dims[3], in_dims[1], + in_dims[2], in_dims[3], ksize, strides, paddings, global_pooling, + exclusive, adaptive, ceil_mode, use_quantizer, pooling_type); + VLOG(3) << "invoking pooling_basic"; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::PoolCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/pool_compute.h b/paddle/fluid/lite/kernels/arm/pool_compute.h new file mode 100644 index 00000000000..1cb4e6db1bb --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/pool_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
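+// Declares the float NCHW pooling kernel for ARM; the specialized global/2x2/3x3 paths live in pool_compute.cc.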
+ +#pragma once +#include +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/operators/pool_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class PoolCompute : public KernelLite { + public: + using param_t = operators::PoolParam; + + void PrepareForRun() override; + void Run() override; + + virtual ~PoolCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/pool_compute_test.cc b/paddle/fluid/lite/kernels/arm/pool_compute_test.cc new file mode 100644 index 00000000000..8371568d2f0 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/pool_compute_test.cc @@ -0,0 +1,284 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/pool_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +int PoolOutputSize(int input_size, int filter_size, int padding, int stride, + bool ceil_mode) { + int output_size; + if (!ceil_mode) { + output_size = (input_size - filter_size + 2 * padding) / stride + 1; + } else { + output_size = + (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + } + return output_size; +} + +std::vector compute_output_shape(operators::PoolParam* param_) { + const auto x_dims = param_->x->dims(); + std::vector& ksize = param_->ksize; + if (param_->global_pooling) { + ksize.resize(static_cast(x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + param_->paddings[i] = 0; + ksize[i] = static_cast(x_dims[i + 2]); + } + } + + std::vector output_shape({x_dims[0], x_dims[1]}); + if (param_->adaptive) { + output_shape.insert(output_shape.end(), param_->ksize.begin(), + param_->ksize.end()); + } else { + for (size_t i = 0; i < param_->ksize.size(); ++i) { + output_shape.push_back( + PoolOutputSize(x_dims[i + 2], param_->ksize[i], param_->paddings[i], + param_->strides[i], param_->ceil_mode)); + } + } + return output_shape; +} + +void pool_compute_ref(const operators::PoolParam& param) { + auto& in_dims = param.x->dims(); + auto& out_dims = param.output->dims(); + + const float* src_ptr = param.x->data(); + float* dst_ptr = param.output->mutable_data(); + + std::vector ksize = param.ksize; + std::vector strides = param.strides; + std::vector paddings = param.paddings; + + std::string pooling_type = param.pooling_type; + bool global_pooling = param.global_pooling; + bool exclusive = param.exclusive; + bool adaptive = param.adaptive; + bool ceil_mode = param.ceil_mode; + bool use_quantizer = param.use_quantizer; + std::string data_format = param.data_format; + + int in_n = in_dims[0]; + int in_c = in_dims[1]; + int in_h = in_dims[2]; + int in_w = in_dims[3]; + int size_in_n = in_c * in_h * in_w; + int size_in_c = in_h * in_w; + + 
int out_h = out_dims[2]; + int out_w = out_dims[3]; + int size_out_n = in_c * out_h * out_w; + int size_out_c = out_h * out_w; + + int window_h = ksize[0]; + int window_w = ksize[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; + + if (global_pooling == true) { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + const float* src = src_ptr + n * size_in_n + c * size_in_c; + float res = src[0]; + if (pooling_type == "max") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res = cur_val > res ? cur_val : res; + } + } else if (pooling_type == "avg") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res += cur_val; + } + res /= size_in_c; + } + dst_ptr[n * size_out_n + c] = res; + } + } + } else { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + for (int h = 0; h < out_h; ++h) { + int sh = h * stride_h; + int eh = sh + window_h; + sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; + eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; + for (int w = 0; w < out_w; ++w) { + int sw = w * stride_w; + int ew = sw + window_w; + sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; + ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; + int pooling_size = (ew - sw) * (eh - sh); + if (pooling_size == 0) continue; + float res = 0.f; + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; + if (kh == sh && kw == sw) { + res = src_ptr[src_idx]; + } else { + if (pooling_type == "max") { + res = res >= src_ptr[src_idx] ? res : src_ptr[src_idx]; + } + if (pooling_type == "avg") { + res += src_ptr[src_idx]; + } + } + } + } + if (pooling_type == "avg") { + if (exclusive) { + res /= pooling_size; + } else { + res /= window_h * window_w; + } + } + dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; + } + } + } + } + } +} + +TEST(pool_arm, init) { + PoolCompute pool; + ASSERT_EQ(pool.precision(), PRECISION(kFloat)); + ASSERT_EQ(pool.target(), TARGET(kARM)); +} + +TEST(pool_arm, compute) { + PoolCompute pool; + operators::PoolParam param; + + lite::Tensor x; + lite::Tensor output; + lite::Tensor output_ref; + + // speedup for ci + for (auto pooling_type : {"max", "avg"}) { + for (auto ceil_mode : {true, false}) { + for (auto global_pooling : {true, false}) { + for (auto exclusive : {true, false}) { + for (auto ksize : {2, 3}) { + for (auto stride : {1, 2}) { + for (auto pad : {0, 1}) { + for (auto n : {1, 2}) { + for (auto c : {1, 3}) { +#if 1 + for (auto h : {2, 3, 4, 11}) { + for (auto w : {2, 3, 4, 11}) { +#else + for (int h = 2; h < 25; h++) { + for (int w = 2; w < 25; w++) { +#endif + VLOG(3) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride << " pad:" << pad + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; + + // init x, output + x.Resize(DDim(std::vector({n, c, h, w}))); + auto* x_data = x.mutable_data(); + for (int i = 0; i < x.dims().production(); ++i) { + float sign = i % 3 == 0 ? 
-0.03 : 0.05f; + x_data[i] = sign * (i % 128); + } + + // fill param + param.x = &x; + param.output = &output; + param.pooling_type = pooling_type; + if (global_pooling) { + param.ksize = {h, w}; + } else { + param.ksize = {ksize, ksize}; + } + param.global_pooling = global_pooling; + param.strides = {stride, stride}; + param.paddings = {pad, pad}; + param.exclusive = exclusive; + param.ceil_mode = ceil_mode; + param.adaptive = false; + param.use_quantizer = false; + + const std::vector& output_shape = + compute_output_shape(¶m); + output.Resize(DDim(output_shape)); + output_ref.Resize(DDim(output_shape)); + + auto* output_data = output.mutable_data(); + auto* output_ref_data = + output_ref.mutable_data(); + for (int i = 0; i < output.dims().production(); ++i) { + output_data[i] = -2; + output_ref_data[i] = -2; + } + + // compute + pool.SetParam(param); + pool.Run(); + + // compute ref + param.output = &output_ref; + pool_compute_ref(param); + + // compare + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4); + } + VLOG(3) << "compare pass"; + } + } + } + } + } + } + } + } + } + } + } +} + +TEST(pool_arm, retrive_op) { + auto pool = KernelRegistry::Global().Create( + "pool2d"); + ASSERT_FALSE(pool.empty()); + ASSERT_TRUE(pool.front()); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/scale_compute_test.cc b/paddle/fluid/lite/kernels/arm/scale_compute_test.cc index fee47d7eb7a..b1277792286 100644 --- a/paddle/fluid/lite/kernels/arm/scale_compute_test.cc +++ b/paddle/fluid/lite/kernels/arm/scale_compute_test.cc @@ -54,6 +54,15 @@ TEST(scale_arm, compute) { lite::Tensor output; lite::Tensor output_ref; +#if 1 // for ci speedup + for (auto n : {1, 3}) { + for (auto c : {1, 3}) { + for (auto h : {3, 4}) { + for (auto w : {4, 3}) { + for (auto bias_after_scale : {true, false}) { + for (auto s : {-1.0f, 0.13f}) { + for (auto b : {-15.f, 0.11234f}) { +#else for (auto n : {1, 3, 4, 11}) { for (auto c : {1, 3, 11, 4}) { for (auto h : {3, 1, 11, 4}) { @@ -61,6 +70,8 @@ TEST(scale_arm, compute) { for (auto bias_after_scale : {true, false}) { for (auto s : {-100.25f, -1.0f, 0.13f, 3840.975f}) { for (auto b : {-3075.495f, -15.f, 0.11234f, 128.15f}) { +#endif + x.Resize(DDim(std::vector({n, c, h, w}))); output.Resize(DDim(std::vector({n, c, h, w}))); output_ref.Resize(DDim(std::vector({n, c, h, w}))); diff --git a/paddle/fluid/lite/kernels/arm/softmax_compute_test.cc b/paddle/fluid/lite/kernels/arm/softmax_compute_test.cc index 80a64f4eaf7..a984a5b3ade 100644 --- a/paddle/fluid/lite/kernels/arm/softmax_compute_test.cc +++ b/paddle/fluid/lite/kernels/arm/softmax_compute_test.cc @@ -80,12 +80,19 @@ TEST(softmax_arm, compute) { lite::Tensor x; lite::Tensor output; lite::Tensor output_ref; - +#if 1 + for (auto n : {1, 3}) { + for (auto c : {1, 4}) { + for (auto h : {5, 1}) { + for (auto w : {1, 6}) { + for (auto axis : {-2, -1, 0, 1, 2}) { +#else for (auto n : {1, 3, 4, 11}) { for (auto c : {1, 3, 11, 4}) { for (auto h : {3, 1, 11, 4}) { for (auto w : {1, 3, 4, 12}) { for (auto axis : {-4, -3, -2, -1, 0, 1, 2, 3}) { +#endif x.Resize(DDim(std::vector({n, c, h, w}))); output.Resize(DDim(std::vector({n, c, h, w}))); output_ref.Resize(DDim(std::vector({n, c, h, w}))); diff --git a/paddle/fluid/lite/kernels/arm/split_compute.cc b/paddle/fluid/lite/kernels/arm/split_compute.cc new file mode 100644 index 
00000000000..3c2416bd690 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/split_compute.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/split_compute.h" +#include +#include "paddle/fluid/lite/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void SplitCompute::Run() { + auto& param = Param(); + const float* din = param.x->data(); + auto& dout = param.output; + auto in_dim = param.x->dims(); + std::vector in_strides(in_dim.size()); + in_strides[in_dim.size() - 1] = in_dim[in_dim.size() - 1]; + for (int i = in_dim.size() - 2; i >= 0; --i) { + in_strides[i] = in_strides[i + 1] * in_dim[i]; + } + lite::arm::math::split(din, dout, param.axis, in_strides); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(split, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::SplitCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/operators/batch_norm.cc b/paddle/fluid/lite/kernels/arm/split_compute.h similarity index 71% rename from paddle/fluid/lite/operators/batch_norm.cc rename to paddle/fluid/lite/kernels/arm/split_compute.h index 80388e13050..22701ba0fd9 100644 --- a/paddle/fluid/lite/operators/batch_norm.cc +++ b/paddle/fluid/lite/kernels/arm/split_compute.h @@ -12,20 +12,24 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/lite/operators/batch_norm.h" -#include +#pragma once +#include +#include "paddle/fluid/lite/core/kernel.h" #include "paddle/fluid/lite/core/op_registry.h" namespace paddle { namespace lite { -namespace operators { +namespace kernels { +namespace arm { -bool BatchNormOpLite::CheckShape() const { return true; } +class SplitCompute : public KernelLite { + public: + void Run() override; -bool BatchNormOpLite::InferShape() const { return true; } + virtual ~SplitCompute() = default; +}; -} // namespace operators +} // namespace arm +} // namespace kernels } // namespace lite } // namespace paddle - -REGISTER_LITE_OP(batch_norm, paddle::lite::operators::BatchNormOpLite); diff --git a/paddle/fluid/lite/kernels/arm/split_compute_test.cc b/paddle/fluid/lite/kernels/arm/split_compute_test.cc new file mode 100644 index 00000000000..35e2c7cdeda --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/split_compute_test.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/split_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void splite_resize_out(const lite::Tensor* din, + const std::vector& dout, int axis, + int num, const std::vector& sections) { + auto in_dims = din->dims(); + int outs_number = dout.size(); + + std::vector outs_dims; + outs_dims.reserve(outs_number); + + if (num > 0) { + int out_axis_dim = in_dims[axis] / num; + for (int i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = sections[i]; + outs_dims.push_back(dim); + } + } + + for (int j = 0; j < outs_dims.size(); ++j) { + dout[j]->Resize(outs_dims[j]); + } +} + +template +void split_compute_ref(const operators::SplitParam& param) { + const dtype* din = param.x->mutable_data(); + auto& dout = param.output; + auto in_dim = param.x->dims(); + int axis = param.axis; + std::vector in_strides(in_dim.size()); + in_strides[in_dim.size() - 1] = in_dim[in_dim.size() - 1]; + for (int i = in_dim.size() - 2; i >= 0; --i) { + in_strides[i] = in_strides[i + 1] * in_dim[i]; + } + + int input_offset = 0; + for (auto out : dout) { + auto out_dim = out->dims(); + std::vector out_strides(out_dim.size()); + out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1]; + for (int i = out_dim.size() - 2; i >= 0; --i) { + out_strides[i] = out_strides[i + 1] * out_dim[i]; + } + + dtype* out_data = out->mutable_data(); + int before = out_strides[0] / out_strides[axis]; + int in_after = in_strides[axis]; + int out_after = out_strides[axis]; + + for (int i = 0; i < before; ++i) { + std::memcpy(out_data + i * out_after, din + input_offset + i * in_after, + sizeof(dtype) * out_after); + } + input_offset += out_strides[axis]; + } +} + +TEST(split_arm, init) { + SplitCompute split; + ASSERT_EQ(split.precision(), PRECISION(kFloat)); + ASSERT_EQ(split.target(), TARGET(kARM)); +} + +TEST(split_arm, compute) { + SplitCompute split; + operators::SplitParam param; + + lite::Tensor x; + std::vector output; + std::vector output_ref; + + for (auto n : {1, 3, 4}) { + for (auto c : {1, 3, 4}) { + for (auto h : {1, 3, 4}) { + for (auto w : {1, 3, 4}) { + for (auto axis : {0, 1, 2, 3}) { + for (auto num : {0, 1, 2, 3}) { + for (auto sections : + {std::vector{1, 1, 1}, std::vector{2, 2}, + std::vector{1, 2}}) { + auto x_dim = DDim(std::vector({n, c, h, w})); + x.Resize(x_dim); + if ((num != 0 && x_dim[axis] % num != 0) || + (num == 0 && x_dim[axis] % sections.size() != 0)) + continue; + auto* x_data = x.mutable_data(); + for (int i = 0; i < x.dims().production(); i++) { + x_data[i] = i; + } + for (auto out : output) delete out; + for (auto out : output_ref) delete out; + output.clear(); + output_ref.clear(); + + int outs_number; + if (num > 0) { + outs_number = num; + } else { + outs_number = sections.size(); + } + for (int i = 0; i < outs_number; i++) { + 
output.push_back(new lite::Tensor); + output_ref.push_back(new lite::Tensor); + } + splite_resize_out(&x, output, axis, num, sections); + splite_resize_out(&x, output_ref, axis, num, sections); + param.x = &x; + param.axis = axis; + param.num = num; + param.sections = sections; + param.output = output; + split.SetParam(param); + split.Run(); + param.output = output_ref; + split_compute_ref(param); + for (int i = 0; i < output.size(); i++) { + float* output_data = output[i]->mutable_data(); + float* output_ref_data = output_ref[i]->mutable_data(); + for (int j = 0; j < output[i]->dims().production(); j++) { + EXPECT_NEAR(output_data[j], output_ref_data[j], 1e-5); + } + } + } + } + } + } + } + } + } +} + +TEST(split, retrive_op) { + auto split = + KernelRegistry::Global().Create("split"); + ASSERT_FALSE(split.empty()); + ASSERT_TRUE(split.front()); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/transpose_compute.cc b/paddle/fluid/lite/kernels/arm/transpose_compute.cc new file mode 100644 index 00000000000..ff32ac242b9 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/transpose_compute.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/arm/transpose_compute.h" +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +bool IsShuffleChannel(const std::vector &axis) { + bool is_shuffle_channel = true; + if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) { + for (int i = 3; i < axis.size(); ++i) { + if (axis[i] != i) { + is_shuffle_channel = false; + break; + } + } + } else { + return false; + } + return is_shuffle_channel; +} + +template +void ShuffleChannelCompute(const std::vector &axis, + const lite::Tensor *input, lite::Tensor *output) { + const Dtype *input_ptr = input->data(); + Dtype *output_ptr = output->mutable_data(); + // input and output's shape dimension must >= 2 && <= 6. 
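+  // Axes 1 and 2 are swapped; the trailing dims form one contiguous block of `offset` elements per memcpy.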
+ const DDim &in_dim = input->dims(); + const DDim &out_dim = output->dims(); + size_t offset = 1; + for (int i = 3; i < axis.size(); ++i) { + offset *= in_dim[i]; + } + +#pragma omp parallel for collapse(3) + for (int batch = 0; batch < out_dim[0]; ++batch) { + for (int c1 = 0; c1 < out_dim[1]; ++c1) { + for (int c2 = 0; c2 < out_dim[2]; ++c2) { + size_t out_offset = + ((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset; + size_t in_offset = ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset; + memcpy(output_ptr + out_offset, input_ptr + in_offset, + offset * sizeof(Dtype)); + } + } + } +} + +template +void TransposeCompute_(const std::vector &axis, const lite::Tensor *input, + lite::Tensor *output) { + // const Dtype *input_ptr = input->data(); + const Dtype *input_ptr = input->data(); + Dtype *output_ptr = output->mutable_data(); + + // input and output's shape dimension must >= 2 && <= 6. + const DDim &in_dim = input->dims(); + const DDim &out_dim = output->dims(); + + // precompute inverted output dim and strides + size_t rout_dim[6], strides[6]; + int permute = axis.size(); // permute must >=2 && <= 6. + for (int i = 0; i < permute; ++i) { + int k = permute - 1 - i; + strides[k] = 1; + for (int j = axis[i] + 1; j < permute; ++j) { + strides[k] *= in_dim[j]; + } + rout_dim[k] = out_dim[i]; + } + + // unroll the first 2 dimensions + int reamin_dim = 1; + for (int i = 2; i < out_dim.size(); ++i) { + reamin_dim *= out_dim[i]; + } + +#pragma omp parallel for collapse(2) + for (int batch = 0; batch < out_dim[0]; ++batch) { + for (int j = 0; j < out_dim[1]; ++j) { + size_t offset = batch * strides[permute - 1] + j * strides[permute - 2]; + Dtype *out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim; + int indics[4] = {0, 0, 0, 0}; + for (int k = 0; k < reamin_dim; ++k) { + out_ptr[k] = input_ptr[offset]; + indics[0] += 1; + offset += strides[0]; + for (int p = 0; p < permute - 3; ++p) { + if (indics[p] == rout_dim[p]) { + indics[p + 1] += 1; + indics[p] = 0; + offset += strides[p + 1]; + offset -= rout_dim[p] * strides[p]; + } else { + break; + } + } + } + } + } +} + +// Transpose +void TransposeCompute::Run() { + auto ¶m = Param(); + auto *input = param.x; + auto *output = param.output; + const std::vector axis = param.axis; + + bool shuffle_channel = IsShuffleChannel(axis); + if (shuffle_channel) { + ShuffleChannelCompute(axis, input, output); + } else { + TransposeCompute_(axis, input, output); + } + return; +} + +// Transpose2 +void Transpose2Compute::Run() { + auto ¶m = Param(); + auto *input = param.x; + auto *output = param.output; + const std::vector axis = param.axis; + + bool shuffle_channel = IsShuffleChannel(axis); + if (shuffle_channel) { + ShuffleChannelCompute(axis, input, output); + } else { + TransposeCompute_(axis, input, output); + } + return; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +// Transpose +REGISTER_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::TransposeCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +// Transpose2 +REGISTER_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::Transpose2Compute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git 
a/paddle/fluid/lite/kernels/arm/transpose_compute.h b/paddle/fluid/lite/kernels/arm/transpose_compute.h new file mode 100644 index 00000000000..d8ebb761ec4 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/transpose_compute.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/operators/transpose_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +// Transpose +class TransposeCompute : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void Run() override; + + virtual ~TransposeCompute() = default; +}; + +// Transpose2 +class Transpose2Compute : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void Run() override; + + virtual ~Transpose2Compute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/transpose_compute_test.cc b/paddle/fluid/lite/kernels/arm/transpose_compute_test.cc new file mode 100644 index 00000000000..1315556e3dd --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/transpose_compute_test.cc @@ -0,0 +1,205 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
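+// Tests the ARM transpose/transpose2 kernels with axis {0, 2, 3, 1}, comparing against the naive NCHW reference (transpose_compute_ref) below.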
+ +#include "paddle/fluid/lite/kernels/arm/transpose_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/lite_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +#define IN(n, c, h, w) \ + input_data[w + h * input_w + c * input_h * input_w + \ + n * input_c * input_h * input_w] +#define OUT(n, c, h, w) \ + output_data[w + h * output_w + c * output_h * output_w + \ + n * output_c * output_h * output_w] +void transpose_compute_ref(const operators::TransposeParam& param) { + const lite::Tensor* input = param.x; + lite::Tensor* output = param.output; + std::vector axis = param.axis; + + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_c = input->dims()[1]; + int input_h = input->dims()[2]; + int input_w = input->dims()[3]; + int output_n = output->dims()[0]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, h, w, c) = IN(n, c, h, w); + } + } + } + } +} + +// Transpose +TEST(transpose_arm, init) { + TransposeCompute transpose; + ASSERT_EQ(transpose.precision(), PRECISION(kFloat)); + ASSERT_EQ(transpose.target(), TARGET(kARM)); +} + +TEST(transpose_arm, compute_shape_nchw) { + TransposeCompute transpose; + operators::TransposeParam param; + + std::vector axis{0, 2, 3, 1}; + param.axis = axis; + + lite::Tensor input; + lite::Tensor output; + lite::Tensor output_ref; + + const std::vector input_shape{1, 24, 2, 2}; + const std::vector output_shape{1, 2, 2, 24}; + + DDimLite ddimInput(input_shape); + DDimLite ddimOutput(output_shape); + + input.Resize(ddimInput); + output.Resize(ddimOutput); + output_ref.Resize(ddimOutput); + + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + input.mutable_data()[i] = i; + input.mutable_data()[i + 1] = i + 1; + input.mutable_data()[i + 2] = i + 2; + input.mutable_data()[i + 3] = i + 3; + } + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + } + param.x = &input; + param.output = &output; + + // run transpose_compute + transpose.SetParam(param); + transpose.Run(); + + // run transpose_compute_ref + param.output = &output_ref; + transpose_compute_ref(param); + + auto* output_data = output.data(); + auto* output_ref_data = output_ref.data(); + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } +} + +TEST(transpose, retrive_op) { + auto transpose = + KernelRegistry::Global().Create( + "transpose"); + ASSERT_FALSE(transpose.empty()); + ASSERT_TRUE(transpose.front()); +} + +// Transpose2 +TEST(transpose2_arm, init) { + Transpose2Compute transpose2; + ASSERT_EQ(transpose2.precision(), PRECISION(kFloat)); + ASSERT_EQ(transpose2.target(), TARGET(kARM)); +} + +TEST(transpose2_arm, compute_shape_nchw) { + Transpose2Compute transpose2; + operators::TransposeParam param; + + std::vector axis{0, 2, 3, 1}; + param.axis = axis; + + lite::Tensor input; + lite::Tensor output; + lite::Tensor output_ref; + + const std::vector input_shape{1, 24, 2, 2}; + const std::vector output_shape{1, 2, 2, 24}; + + DDimLite 
ddimInput(input_shape); + DDimLite ddimOutput(output_shape); + + input.Resize(ddimInput); + output.Resize(ddimOutput); + output_ref.Resize(ddimOutput); + + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + input.mutable_data()[i] = i; + input.mutable_data()[i + 1] = i + 1; + input.mutable_data()[i + 2] = i + 2; + input.mutable_data()[i + 3] = i + 3; + } + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + } + param.x = &input; + param.output = &output; + + // run transpose_compute + transpose2.SetParam(param); + transpose2.Run(); + + // run transpose_compute_ref + param.output = &output_ref; + transpose_compute_ref(param); + + auto* output_data = output.data(); + auto* output_ref_data = output_ref.data(); + for (int i = 0; + i < input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3]; + i += 4) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } +} + +TEST(transpose2, retrive_op) { + auto transpose2 = + KernelRegistry::Global().Create( + "transpose2"); + ASSERT_FALSE(transpose2.empty()); + ASSERT_TRUE(transpose2.front()); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/use_kernels.h b/paddle/fluid/lite/kernels/arm/use_kernels.h deleted file mode 100644 index d856950f3a1..00000000000 --- a/paddle/fluid/lite/kernels/arm/use_kernels.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include "paddle/fluid/lite/core/op_registry.h" - -USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(feed, kARM, kAny, kAny, def); -USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def); diff --git a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt index f35f634a217..fd56d9df669 100644 --- a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt @@ -5,7 +5,8 @@ endif() message(STATUS "compile with lite CUDA kernels") nv_library(mul_compute_cuda SRCS mul_compute.cc DEPS ${tensor_lite}) -cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite}) +lite_cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite}) nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite) + diff --git a/paddle/fluid/lite/kernels/host/CMakeLists.txt b/paddle/fluid/lite/kernels/host/CMakeLists.txt index d1f33477aaa..7766cb7dc59 100644 --- a/paddle/fluid/lite/kernels/host/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/host/CMakeLists.txt @@ -1,8 +1,8 @@ message(STATUS "compile with lite host kernels") -cc_library(feed_compute_host SRCS feed_compute.cc DEPS ${lite_kernel_deps}) -cc_library(fetch_compute_host SRCS fetch_compute.cc DEPS ${lite_kernel_deps}) -cc_library(reshape_compute_host SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op_lite) +lite_cc_library(feed_compute_host SRCS feed_compute.cc DEPS ${lite_kernel_deps}) +lite_cc_library(fetch_compute_host SRCS fetch_compute.cc DEPS ${lite_kernel_deps}) +lite_cc_library(reshape_compute_host SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op_lite) lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host) @@ -10,4 +10,7 @@ set(host_kernels feed_compute_host fetch_compute_host reshape_compute_host - CACHE INTERNAL "host kernels") + ) + +set(host_kernels "${host_kernels}" CACHE GLOBAL "host kernels") + diff --git a/paddle/fluid/lite/kernels/host/feed_compute.cc b/paddle/fluid/lite/kernels/host/feed_compute.cc index 7bbd648c20d..f594b6d2047 100644 --- a/paddle/fluid/lite/kernels/host/feed_compute.cc +++ b/paddle/fluid/lite/kernels/host/feed_compute.cc @@ -29,10 +29,8 @@ class FeedCompute auto ¶m = Param(); VLOG(4) << "feed_list.size: " << param.feed_list->size(); VLOG(4) << "col " << param.col; - const lite::Tensor &feed_item = (*param.feed_list)[0]; + const lite::Tensor &feed_item = (*param.feed_list)[param.col]; param.out->ShareDataWith(feed_item); - VLOG(4) << "FEED input " << feed_item << " col " << param.col; - VLOG(4) << "FEED output " << *param.out; } }; diff --git a/paddle/fluid/lite/kernels/opencl/CMakeLists.txt b/paddle/fluid/lite/kernels/opencl/CMakeLists.txt new file mode 100644 index 00000000000..65877816e4b --- /dev/null +++ b/paddle/fluid/lite/kernels/opencl/CMakeLists.txt @@ -0,0 +1,25 @@ +if (NOT LITE_WITH_OPENCL) + return () +endif() + +set(cl_kernel_deps op_params_lite cl_caller cl_engine cl_context cl_wrapper) + +lite_cc_library(elementwise_add_opencl SRCS elementwise_add_compute.cc DEPS ${cl_kernel_deps}) +lite_cc_library(pool_opencl SRCS pool_compute.cc DEPS ${cl_kernel_deps}) + +lite_cc_test(test_elementwise_add_opencl SRCS elementwise_add_compute_test.cc DEPS elementwise_add_opencl + op_registry_lite program_lite + context_lite + ARGS 
--cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/opencl + ) + +lite_cc_test(test_pool_opencl SRCS pool_compute_test.cc DEPS pool_opencl + op_registry_lite program_lite + context_lite + ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/opencl + ) + +set(opencl_kernels + elementwise_add_opencl + pool_opencl + CACHE INTERNAL "opencl_kernels") diff --git a/paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc new file mode 100644 index 00000000000..4213e2a81e7 --- /dev/null +++ b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/operators/op_params.h" +// NOTE ugly here, hide these. +#include "paddle/fluid/lite/opencl/cl_caller.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +class ElementwiseAddCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + CHECK(context.cl_helper() != nullptr); + + elementwise_add( + context.cl_helper(), static_cast(param.X->raw_data()), + param.X->dims(), static_cast(param.Y->raw_data()), + param.Y->dims(), param.Out->mutable_data(), param.Out->dims()); + } +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, + paddle::lite::kernels::opencl::ElementwiseAddCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc new file mode 100644 index 00000000000..1040c8bd547 --- /dev/null +++ b/paddle/fluid/lite/kernels/opencl/elementwise_add_compute_test.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
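+// Tests the OpenCL elementwise_add kernel on a 4x3x10x10 tensor: with x[i] = 1.1 * i and y[i] = 2.3 * i, expects out[i] = 3.4 * i.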
+ +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +TEST(elementwise_add, init) { + LOG(INFO) << "to get kernel ..."; + auto kernels = KernelRegistry::Global().Create( + "elementwise_add", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)); + ASSERT_FALSE(kernels.empty()); + + auto kernel = std::move(kernels.front()); + + LOG(INFO) << "get kernel"; + + lite::Tensor X, Y, Out; + operators::ElementwiseParam param; + param.X = &X; + param.Y = &Y; + param.Out = &Out; + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + kernel->SetParam(param); + kernel->SetContext(std::move(context)); + + X.Resize({4, 3, 10, 10}); + Y.Resize({4, 3, 10, 10}); + Out.Resize({4, 3, 10, 10}); + + auto* x_data = X.mutable_data(); + auto* y_data = Y.mutable_data(); + auto* out_data = Out.mutable_data(); + + for (int i = 0; i < 4 * 3 * 10 * 10; i++) { + x_data[i] = 1.1 * i; + y_data[i] = 2.3 * i; + } + + kernel->Launch(); + + for (int i = 0; i < 4 * 3 * 10 * 10; i++) { + EXPECT_NEAR(out_data[i], static_cast(3.4 * i), 1e-6); + } +} + +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(elementwise_add, kOpenCL, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/opencl/pool_compute.cc b/paddle/fluid/lite/kernels/opencl/pool_compute.cc new file mode 100644 index 00000000000..cfbf674b098 --- /dev/null +++ b/paddle/fluid/lite/kernels/opencl/pool_compute.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/operators/op_params.h" +// NOTE ugly here, hide these. 
+#include "paddle/fluid/lite/opencl/cl_caller.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +class PoolCompute + : public KernelLite { + public: + using param_t = operators::PoolParam; + + void Run() override { + const auto& param = *param_.get_mutable(); + const auto& in_dims = param.x->dims(); + const auto& out_dims = param.output->dims(); + const std::string pooling_type = param.pooling_type; + const bool global_pooling = param.global_pooling; + std::vector paddings = param.paddings; + std::vector strides = param.strides; + std::vector ksize = param.ksize; + if (global_pooling) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_dims[i + 2]); + } + } + + auto& context = ctx_->As(); + CHECK(context.cl_helper() != nullptr); + + pool(context.cl_helper(), pooling_type, paddings[0], paddings[1], + strides[0], strides[1], ksize[0], ksize[1], + static_cast(param.x->raw_data()), in_dims, + param.output->mutable_data(), out_dims); + } +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, + paddle::lite::kernels::opencl::PoolCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/opencl/pool_compute_test.cc b/paddle/fluid/lite/kernels/opencl/pool_compute_test.cc new file mode 100644 index 00000000000..fde3caae84e --- /dev/null +++ b/paddle/fluid/lite/kernels/opencl/pool_compute_test.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
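One behavior of PoolCompute above that is easy to miss: when global_pooling is set, the configured window and paddings are discarded and the kernel pools over the entire feature map. Isolated as a standalone helper (GlobalPoolRewrite is a hypothetical name for illustration; the loop body is the same as in Run):

    #include <cstdint>
    #include <vector>

    // With the NCHW input {4, 1024, 7, 7} used by the pool test below, ksize
    // becomes {7, 7}, paddings become {0, 0}, and the output is {4, 1024, 1, 1}.
    inline void GlobalPoolRewrite(const std::vector<int64_t>& in_dims,
                                  std::vector<int>* ksize,
                                  std::vector<int>* paddings) {
      for (size_t i = 0; i < ksize->size(); ++i) {
        (*paddings)[i] = 0;
        (*ksize)[i] = static_cast<int>(in_dims[i + 2]);  // input H, then W
      }
    }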
+ +#include +#include +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { + +void pool_avg(const int padding_height, const int padding_width, + const int stride_height, const int stride_width, + const int ksize_height, const int ksize_width, + const float* input_data, const DDim& in_dim, float* output_data, + const DDim& out_dim) { + const int batch_size = in_dim[0]; + const int input_height = in_dim[2]; + const int input_width = in_dim[3]; + const int output_channels = out_dim[1]; + const int output_height = out_dim[2]; + const int output_width = out_dim[3]; + + const size_t input_spatial_size = input_height * input_width; + const size_t output_spatial_size = output_height * output_width; + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + int channel = i * output_channels + c; + const float* input_ptr = input_data + channel * input_spatial_size; + float* output_ptr = output_data + channel * output_spatial_size; + + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + float val = 0.f; + int count = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + val += input_ptr[h * input_width + w]; + ++count; + } + } + output_ptr[ph * output_width + pw] = + (count > 0) ? val * (1.f / count) : 0.f; + } + } + } + } +} + +TEST(pool2d, init) { + LOG(INFO) << "to get kernel ..."; + auto kernels = KernelRegistry::Global().Create( + "pool2d", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)); + ASSERT_FALSE(kernels.empty()); + + auto kernel = std::move(kernels.front()); + + LOG(INFO) << "get kernel"; + + lite::Tensor x, out; + operators::PoolParam param; + param.x = &x; + param.output = &out; + param.global_pooling = true; + param.pooling_type = "avg"; + param.paddings = std::vector{0, 0}; + param.strides = std::vector{1, 1}; + param.ksize = std::vector{7, 7}; + + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + kernel->SetParam(param); + kernel->SetContext(std::move(context)); + + const DDim in_dim = DDim(std::vector{4, 1024, 7, 7}); + const DDim out_dim = DDim(std::vector{4, 1024, 1, 1}); + x.Resize(in_dim); + out.Resize(out_dim); + + auto* x_data = x.mutable_data(); + auto* out_data = out.mutable_data(); + + std::default_random_engine engine; + std::uniform_real_distribution dist(-5, 5); + + for (int i = 0; i < 4 * 1024 * 7 * 7; i++) { + x_data[i] = dist(engine); + } + + kernel->Launch(); + + std::unique_ptr out_ref(new float[4 * 1024 * 1 * 1]); + pool_avg(0, 0, 1, 1, 7, 7, x_data, in_dim, out_ref.get(), out_dim); + + for (int i = 0; i < 4 * 1024 * 1 * 1; i++) { + EXPECT_NEAR(out_data[i], out_ref[i], 1e-6); + } +} + +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(pool2d, kOpenCL, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/CMakeLists.txt b/paddle/fluid/lite/kernels/x86/CMakeLists.txt index 6309267dd06..700b44656e5 100644 --- a/paddle/fluid/lite/kernels/x86/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/x86/CMakeLists.txt @@ -2,21 +2,41 @@ if(NOT LITE_WITH_X86) return() endif() -cc_library(activation_compute_x86 SRCS activation_compute.cc DEPS 
${lite_kernel_deps} activation_op)
-cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps})
-cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps})
-cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(activation_compute_x86 SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_op)
+lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps})
+
+lite_cc_library(fc_compute_x86 SRCS fc_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(mul_compute_x86 SRCS mul_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(relu_compute_x86 SRCS relu_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(scale_compute_x86 SRCS scale_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(elementwise_compute_x86 SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} elementwise_sub_op elementwise_add_op)
+lite_cc_library(softmax_compute_x86 SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax)
+lite_cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
+lite_cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
+lite_cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(uniform_random_compute_x86 SRCS uniform_random_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(reshape_compute_x86 SRCS reshape_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(gru_compute_x86 SRCS gru_compute.cc DEPS ${lite_kernel_deps} sequence2batch gru_compute)
+lite_cc_library(reduce_compute_x86 SRCS reduce_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(lookup_table_compute_x86 SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
+lite_cc_library(sequence_reshape_compute_x86 SRCS sequence_reshape_compute.cc DEPS ${lite_kernel_deps})
+
+lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
+lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
+lite_cc_test(test_pool2d_compute_x86 SRCS pool_compute_test.cc DEPS pool_compute_x86)
+lite_cc_test(test_concat_compute_x86 SRCS concat_compute_test.cc DEPS concat_compute_x86)
+lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86)
+lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86)
+lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x86)
+lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator)
+lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
+lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
+lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
-cc_library(fc_compute_x86 SRCS fc_compute.cc DEPS ${lite_kernel_deps})
-cc_library(mul_compute_x86 SRCS mul_compute.cc DEPS ${lite_kernel_deps})
-cc_library(relu_compute_x86 SRCS relu_compute.cc DEPS ${lite_kernel_deps})
-cc_library(scale_compute_x86 SRCS scale_compute.cc DEPS ${lite_kernel_deps})
-cc_library(elementwise_compute_x86 SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} elementwise_sub_op elementwise_add_op) -cc_library(softmax_compute_x86 SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax) -cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps} ) -cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps} ) -cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col) -cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling) set(x86_kernels activation_compute_x86 @@ -32,4 +52,13 @@ set(x86_kernels concat_compute_x86 conv_compute_x86 pool_compute_x86 + batch_norm_compute_x86 + uniform_random_compute_x86 + sgd_compute_x86 + reshape_compute_x86 + gru_compute_x86 + reduce_compute_x86 + lookup_table_compute_x86 + sequence_reshape_compute_x86 CACHE INTERNAL "x86 kernels") + diff --git a/paddle/fluid/lite/kernels/x86/activation_compute.cc b/paddle/fluid/lite/kernels/x86/activation_compute.cc index a07a69af2d1..1098ebf1f00 100644 --- a/paddle/fluid/lite/kernels/x86/activation_compute.cc +++ b/paddle/fluid/lite/kernels/x86/activation_compute.cc @@ -93,6 +93,25 @@ class SquareGradCompute : public KernelLite { virtual ~SquareGradCompute() = default; }; +template +class SoftsignCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + CHECK(context.x86_device_context()); + param.Out->template mutable_data(); + + Activate>( + *context.x86_device_context(), ¶m.X->raw_tensor(), + ¶m.Out->raw_tensor()); + } + + virtual ~SoftsignCompute() = default; +}; + } // namespace x86 } // namespace kernels } // namespace lite @@ -114,3 +133,9 @@ REGISTER_LITE_KERNEL(square_grad, kX86, kFloat, kNCHW, .BindOutput(paddle::framework::GradVarName("X"), {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); + +REGISTER_LITE_KERNEL(softsign, kX86, kFloat, kNCHW, + paddle::lite::kernels::x86::SoftsignCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/x86/batch_norm_compute.cc b/paddle/fluid/lite/kernels/x86/batch_norm_compute.cc new file mode 100644 index 00000000000..008d2398014 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/batch_norm_compute.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
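SoftsignCompute above delegates the math to fluid's SoftsignFunctor via Activate. For reference, the activation it applies elementwise is out = x / (1 + |x|); a scalar restatement (SoftsignRef is an illustrative name, not part of this patch):

    #include <cmath>

    // softsign(x) = x / (1 + |x|): smooth, zero-centered, bounded in (-1, 1).
    inline float SoftsignRef(float x) { return x / (1.0f + std::fabs(x)); }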
+
+#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
+
+REGISTER_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW,
+                     paddle::lite::kernels::x86::BatchNormCompute<float>, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
diff --git a/paddle/fluid/lite/kernels/x86/batch_norm_compute.h b/paddle/fluid/lite/kernels/x86/batch_norm_compute.h
new file mode 100644
index 00000000000..c509fded0e0
--- /dev/null
+++ b/paddle/fluid/lite/kernels/x86/batch_norm_compute.h
@@ -0,0 +1,159 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <memory>
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+
+template <typename T>
+class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::BatchNormParam;
+  void Run() override {
+    auto &param = *param_.get_mutable<param_t>();
+    bool global_stats = param.is_test || param.use_global_stats;
+
+    const auto *x = param.x;
+    const auto &x_dims = x->dims();
+    CHECK(x_dims.size() >= 2 && x_dims.size() <= 5);
+    const int N = x_dims[0];
+    const int C = param.data_layout == DATALAYOUT(kNCHW)
+                      ? x_dims[1]
+                      : x_dims[x_dims.size() - 1];
+    const int sample_size = x->dims().production() / N / C;
+
+    // alloc memory
+    param.y->template mutable_data<T>();
+    if (!param.is_test) {
+      param.mean_out->template mutable_data<T>();
+      param.variance_out->template mutable_data<T>();
+      param.saved_mean->template mutable_data<T>();
+      param.saved_variance->template mutable_data<T>();
+    }
+    if (!global_stats) {
+      // saved_xx is used just in this batch of data
+      EigenVectorArrayMap<T> saved_mean_e(param.saved_mean->mutable_data<T>(),
+                                          C);
+      EigenVectorArrayMap<T> saved_variance_e(
+          param.saved_variance->mutable_data<T>(), C);
+      saved_mean_e.setZero();
+      saved_variance_e.setZero();
+
+      EigenVectorArrayMap<T> running_mean_arr(
+          param.mean_out->mutable_data<T>(), C);
+      EigenVectorArrayMap<T> running_var_arr(
+          param.variance_out->mutable_data<T>(), C);
+
+      if ((N * sample_size) == 1) {
+        LOG(WARNING) << "Only 1 element in normalization dimension, "
+                     << "we skip the batch norm calculation, let y = x.";
+        framework::TensorCopy(x->raw_tensor(), platform::CPUPlace(),
+                              &param.y->raw_tensor());
+        return;
+      }
+
+      switch (param.data_layout) {
+        case DATALAYOUT(kNCHW): {
+          ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
+          for (int nc = 0; nc < N * C; ++nc) {
+            saved_mean_e(nc % C) += x_arr.col(nc).sum();
+          }
+          saved_mean_e /= N * sample_size;
+          for (int nc = 0; nc < N * C; ++nc) {
+            saved_variance_e(nc % C) +=
+                (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
+          }
+          saved_variance_e /= N * sample_size;
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unknown storage order: "
+                     << DataLayoutToStr(param.data_layout);
+          break;
+      }
+      running_mean_arr = running_mean_arr * param.momentum +
+                         saved_mean_e * (1. - param.momentum);
+      running_var_arr = running_var_arr * param.momentum +
+                        saved_variance_e * (1. - param.momentum);
+    }
+
+    // use SavedMean and SavedVariance to do normalize
+    Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
+    if (global_stats) {
+      ConstEigenVectorArrayMap<T> var_arr(param.variance->data<T>(), C);
+      inv_std = (var_arr + param.epsilon).sqrt().inverse();
+    } else {
+      EigenVectorArrayMap<T> saved_inv_std(
+          param.saved_variance->mutable_data<T>(), C);
+      // inverse SavedVariance first, gradient will use it too.
+      saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt();
+      inv_std = saved_inv_std;
+    }
+
+    ConstEigenVectorArrayMap<T> mean_arr(
+        global_stats ? param.mean->data<T>() : param.saved_mean->data<T>(), C);
+
+    //   ((x - est_mean) * (inv_var) * scale + bias
+    //   formula transform ====>
+    //   (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
+    ConstEigenVectorArrayMap<T> scale_arr(param.scale->data<T>(), C);
+    ConstEigenVectorArrayMap<T> bias_arr(param.bias->data<T>(), C);
+    Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
+    Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
+        bias_arr - mean_arr * inv_std * scale_arr;
+
+    switch (param.data_layout) {
+      case DATALAYOUT(kNCHW): {
+        EigenArrayMap<T> y_arr(param.y->mutable_data<T>(), sample_size, N * C);
+        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
+        for (int nc = 0; nc < N * C; ++nc) {
+          y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
+        }
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unknown storage order: "
+                   << DataLayoutToStr(param.data_layout);
+        break;
+    }
+  }
+  virtual ~BatchNormCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc b/paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc
new file mode 100644
index 00000000000..d9c53035db1
--- /dev/null
+++ b/paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
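The normalization loop in BatchNormCompute::Run is built around the fold spelled out in its comment: instead of evaluating scale * (x - mean) / sqrt(var + eps) + bias per element, the per-channel constants are collapsed so the inner loop is one multiply-add. In scalar form (BatchNormRef is an illustrative name; the algebra matches the code above):

    #include <cmath>

    // y = scale * (x - mean) * inv_std + bias  ==  x * new_scale + new_bias
    inline float BatchNormRef(float x, float mean, float var, float scale,
                              float bias, float epsilon) {
      float inv_std = 1.0f / std::sqrt(var + epsilon);
      float new_scale = scale * inv_std;
      float new_bias = bias - mean * new_scale;
      return x * new_scale + new_bias;
    }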
+
+#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
+#include <gtest/gtest.h>
+#include <memory>
+#include <vector>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+TEST(batch_norm_x86, retrive_op) {
+  auto batch_norm =
+      KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW)>("batch_norm");
+  ASSERT_FALSE(batch_norm.empty());
+  ASSERT_TRUE(batch_norm.front());
+}
+
+TEST(batch_norm_x86, init) {
+  BatchNormCompute<float> batch_norm;
+  ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat));
+  ASSERT_EQ(batch_norm.target(), TARGET(kX86));
+}
+
+TEST(batch_norm_x86, run_test) {
+  lite::Tensor x, scale, bias, mean, variance, y, mean_out, variance_out,
+      saved_mean, saved_variance;
+  constexpr int batch_size = 2;
+  std::vector<int64_t> x_shape{batch_size, 3, 64, 64};
+  x.Resize(lite::DDim(x_shape));
+
+  std::vector<int64_t> scale_shape{3};
+  scale.Resize(lite::DDim(scale_shape));
+
+  std::vector<int64_t> bias_shape{3};
+  bias.Resize(lite::DDim(bias_shape));
+
+  std::vector<int64_t> mean_shape{3};
+  mean.Resize(lite::DDim(mean_shape));
+
+  std::vector<int64_t> variance_shape{3};
+  variance.Resize(lite::DDim(variance_shape));
+
+  std::vector<int64_t> y_shape{batch_size, 3, 64, 64};
+  y.Resize(lite::DDim(y_shape));
+
+  std::vector<int64_t> mean_out_shape{3};
+  mean_out.Resize(lite::DDim(mean_out_shape));
+
+  std::vector<int64_t> variance_out_shape{3};
+  variance_out.Resize(lite::DDim(variance_out_shape));
+
+  std::vector<int64_t> saved_mean_shape{3};
+  saved_mean.Resize(lite::DDim(saved_mean_shape));
+
+  std::vector<int64_t> saved_variance_shape{3};
+  saved_variance.Resize(lite::DDim(saved_variance_shape));
+
+  auto x_data = x.mutable_data<float>();
+  auto scale_data = scale.mutable_data<float>();
+  auto bias_data = bias.mutable_data<float>();
+  auto mean_data = mean.mutable_data<float>();
+  auto variance_data = variance.mutable_data<float>();
+  y.mutable_data<float>();
+  mean_out.mutable_data<float>();
+  variance_out.mutable_data<float>();
+  saved_mean.mutable_data<float>();
+  saved_variance.mutable_data<float>();
+
+  for (int64_t i = 0; i < x.dims().production(); i++) {
+    x_data[i] = static_cast<float>(i);
+  }
+  for (int i = 0; i < scale.dims().production(); i++) {
+    scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
+  }
+  for (int i = 0; i < bias.dims().production(); i++) {
+    bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
+  }
+  for (int i = 0; i < mean.dims().production(); i++) {
+    mean_data[i] = static_cast<float>(i) * 0.0565f;
+  }
+  for (int i = 0; i < variance.dims().production(); i++) {
+    variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
+  }
+
+  BatchNormCompute<float> batch_norm;
+  operators::BatchNormParam param;
+
+  param.x = &x;
+  param.is_test = false;
+  param.scale = &scale;
+  param.bias = &bias;
+  param.mean = &mean;
+  param.variance = &variance;
+  param.use_global_stats = false;
+  param.epsilon = 1e-4f;
+  param.momentum = 0.9f;
+  param.y = &y;
+  param.mean_out = &mean_out;
+  param.variance_out = &variance_out;
+  param.saved_mean = &saved_mean;
+  param.saved_variance = &saved_variance;
+
+  batch_norm.SetParam(param);
+  batch_norm.Run();
+
+  LOG(INFO) << "output: " << y;
+  LOG(INFO) << "mean_out: " << mean_out;
+  LOG(INFO) << "variance_out: " << variance_out;
+  LOG(INFO) << "saved_mean: " << saved_mean;
+  LOG(INFO) << "saved_variance: " << saved_variance;
+}
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def);
diff --git a/paddle/fluid/lite/kernels/x86/concat_compute.cc
b/paddle/fluid/lite/kernels/x86/concat_compute.cc index 23ae8ca5055..8976ed9675b 100644 --- a/paddle/fluid/lite/kernels/x86/concat_compute.cc +++ b/paddle/fluid/lite/kernels/x86/concat_compute.cc @@ -12,91 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/types.h" -#include "paddle/fluid/operators/strided_memcpy.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -class ConcatCompute : public KernelLite { - public: - using param_t = operators::ConcatParam; - - void Run() override { - auto& param = *param_.get_mutable(); - int64_t axis = static_cast(param.axis); - auto out = param.output; - - if (axis == 0 && param.x.size() < 10) { - size_t output_offset = 0; - for (auto* in : param.x) { - if (!in || in->dims().production() == 0UL) { - continue; - } - auto in_stride = framework::stride_numel(in->dims().data()); - auto out_stride = framework::stride_numel(out->dims().data()); - paddle::operators::StridedNumelCopyWithAxis( - platform::CPUDeviceContext(), axis, - out->mutable_data() + output_offset, out_stride, in->data(), - in_stride, in_stride[axis]); - - output_offset += in_stride[axis]; - } - } else { - std::vector inputs; - for (size_t j = 0; j < param.x.size(); ++j) { - if (param.x[j] && param.x[j]->dims().production() > 0) { - inputs.push_back(*param.x[j]); - } else { - continue; - } - } - - int num = inputs.size(); - int rows = 1; - auto dim_0 = inputs[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int out_rows = rows, out_cols = 0; - - std::vector input_cols(inputs.size()); - for (int i = 0; i < num; ++i) { - int t_cols = inputs[i].dims().production() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - // computation - auto output_data = param.output->template mutable_data(); - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; - auto input_data = inputs[j].data(); - for (int k = 0; k < out_rows; ++k) { - std::memcpy(output_data + k * out_cols + col_idx, - input_data + k * col_len, sizeof(T) * col_len); - } - col_idx += col_len; - } - } - } - - virtual ~ConcatCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/concat_compute.h" REGISTER_LITE_KERNEL(concat, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::ConcatCompute, def) - .BindInput("X", {LiteType::GetTensorListTy(TARGET(kX86))}) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); diff --git a/paddle/fluid/lite/kernels/x86/concat_compute.h b/paddle/fluid/lite/kernels/x86/concat_compute.h new file mode 100644 index 00000000000..ce6cfeba066 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/concat_compute.h @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/types.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +inline int count(int start_axis, int end_axis, const lite::DDim& dim) { + int count = 1; + for (int i = start_axis; i < end_axis; ++i) { + count *= dim[i]; + } + return count; +} + +template +class ConcatCompute : public KernelLite { + public: + using param_t = operators::ConcatParam; + + void Run() override { + auto& param = *param_.get_mutable(); + int64_t axis = static_cast(param.axis); + auto x_dims = param.x[0]->dims(); + auto out = param.output; + if (param.x.size() == 1) return; + + auto output_data = param.output->template mutable_data(); + int offset_concat_axis = 0; + int num_concat = count(0, axis, x_dims); + int concat_input_size = count(axis + 1, x_dims.size(), x_dims); + const int top_concat_axis = out->dims()[axis]; + for (size_t i = 0; i < param.x.size(); ++i) { + auto bottom_data = param.x[i]->data(); + const int64_t bottom_concat_axis = param.x[i]->dims()[axis]; + for (int n = 0; n < num_concat; ++n) { + std::memcpy( + output_data + + (n * top_concat_axis + offset_concat_axis) * concat_input_size, + bottom_data + n * bottom_concat_axis * concat_input_size, + (bottom_concat_axis * concat_input_size) * sizeof(T)); + } + offset_concat_axis += bottom_concat_axis; + } + } + virtual ~ConcatCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/concat_compute_test.cc b/paddle/fluid/lite/kernels/x86/concat_compute_test.cc new file mode 100644 index 00000000000..aa50dae9eb9 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/concat_compute_test.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
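The index arithmetic in ConcatCompute::Run above becomes concrete with the shapes the test below feeds it, two {1, 1, 3, 3} inputs concatenated on axis 1:

    // num_concat        = count(0, axis, dims)     = 1  (dims before the axis)
    // concat_input_size = count(axis + 1, 4, dims) = 9  (dims after the axis)
    // top_concat_axis   = out->dims()[axis]        = 2
    // Input 0 is copied to offset (0 * 2 + 0) * 9 = 0, input 1 to
    // (0 * 2 + 1) * 9 = 9: the first nine output floats come from x1,
    // the next nine from x2.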
+ +#include "paddle/fluid/lite/kernels/x86/concat_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(concat_x86, retrive_op) { + auto concat = + KernelRegistry::Global().Create( + "concat"); + ASSERT_FALSE(concat.empty()); + ASSERT_TRUE(concat.front()); +} + +TEST(concat_x86, init) { + ConcatCompute concat; + ASSERT_EQ(concat.precision(), PRECISION(kFloat)); + ASSERT_EQ(concat.target(), TARGET(kX86)); +} + +TEST(concat_x86, run_test) { + lite::Tensor x1, x2, out; + constexpr int batch_size = 1; + std::vector x1_shape{batch_size, 1, 3, 3}; + x1.Resize(lite::DDim(x1_shape)); + std::vector x2_shape{batch_size, 1, 3, 3}; + x2.Resize(lite::DDim(x2_shape)); + + std::vector x = {&x1, &x2}; + + std::vector out_shape{batch_size, 2, 3, 3}; + out.Resize(lite::DDim(out_shape)); + + auto x1_data = x1.mutable_data(); + auto x2_data = x2.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x1.dims().production(); i++) { + x1_data[i] = 1; + x2_data[i] = 2; + } + + ConcatCompute concat; + operators::ConcatParam param; + param.x = x; + param.output = &out; + param.axis = 1; + + concat.SetParam(param); + concat.Run(); + + std::cout << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + std::cout << out_data[i] << " "; + } + std::cout << std::endl; +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(concat, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/conv_compute.cc b/paddle/fluid/lite/kernels/x86/conv_compute.cc index b29161c1c60..7b674a038de 100644 --- a/paddle/fluid/lite/kernels/x86/conv_compute.cc +++ b/paddle/fluid/lite/kernels/x86/conv_compute.cc @@ -12,144 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
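The conv kernel removed below reappears verbatim in conv_compute.h later in this diff; its structure is the classic im2col-plus-GEMM lowering, applied per batch sample and per group. Summarizing the shape bookkeeping under the NCHW layout the code assumes:

    // Input {N, C, H, W}, filter {M, C/g, kh, kw}, g groups:
    //   col matrix    : {(C/g) * kh * kw, out_h * out_w}  (one sample, one group)
    //   filter matrix : {M, (C/g) * kh * kw}, sliced to M/g rows per group
    //   GEMM          : filter_slice * col_matrix -> {M/g, out_h * out_w}
    // When IsExpand() is false (1x1 filter, stride 1, no padding or dilation),
    // im2col is skipped and the input slice itself serves as the col matrix.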
-#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/types.h" -#include "paddle/fluid/lite/operators/conv_op.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/vol2col.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -inline bool IsExpand(const std::vector& filter_dim, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations) { - bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; - for (size_t j = 0; j < strides.size(); ++j) { - filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); - strides_1 = strides_1 && (strides[j] == 1); - padding_0 = padding_0 && (paddings[j] == 0); - dilation_1 = dilation_1 && (dilations[j] == 1); - } - return !(filter_1 && strides_1 && padding_0 && dilation_1); -} - -template -class Conv2dCompute : public KernelLite { - public: - using param_t = operators::ConvParam; - void Run() override { - auto& param = *param_.get_mutable(); - lite::Tensor filter = *param.filter; - param.output->template mutable_data(); - - const int batch_size = static_cast(param.x->dims()[0]); - - std::vector filter_shape_vec(filter.dims().Vectorize()); - std::vector output_shape_vec(param.output->dims().Vectorize()); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = param.x->dims()[1] / param.groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - lite::DDim col_shape(col_shape_vec); - lite::DDim col_matrix_shape = col_shape.Flattern2D(data_dim + 1); - bool is_expand = IsExpand(filter_shape_vec, param.strides, param.paddings, - param.dilations); - - lite::Tensor col; - lite::Tensor col_matrix; - if (is_expand) { - col.Resize(col_shape); - col.mutable_data(); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - lite::DDim input_shape = param.x->dims().Slice(1, param.x->dims().size()); - - lite::DDim filter_matrix_shape(std::vector{ - filter.dims()[0], filter.dims().production() / filter.dims()[0]}); - filter.Resize(filter_matrix_shape); - - lite::DDim output_matrix_shape(std::vector{ - param.output->dims()[1], - param.output->dims().production() / - (param.output->dims()[0] * param.output->dims()[1])}); - - int in_step = static_cast(param.x->dims()[1]) / param.groups; - int out_step = static_cast(param.output->dims()[1]) / param.groups; - - paddle::operators::math::Vol2ColFunctor - vol2col; - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, platform::CPUDeviceContext, T> - im2col; - auto blas = paddle::operators::math::GetBlas( - platform::CPUDeviceContext()); - for (int i = 0; i < batch_size; i++) { - lite::Tensor in_batch; - in_batch.ShareDataWith( - param.x->raw_tensor().Slice(i, i + 1).Resize(input_shape.data())); - lite::Tensor out_batch; - out_batch.ShareDataWith(param.output->raw_tensor().Slice(i, i + 1).Resize( - output_matrix_shape.data())); - - for (int g = 0; g < param.groups; g++) { - lite::Tensor in_slice; - in_slice.ShareDataWith( - in_batch.raw_tensor().Slice(g * in_step, (g + 1) * in_step)); - - if (!is_expand) { - col.ShareDataWith(in_slice); - 
col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - // im2col - im2col(platform::CPUDeviceContext(), in_slice.raw_tensor(), - param.dilations, param.strides, - std::vector{param.paddings[0], param.paddings[1], - param.paddings[0], param.paddings[1]}, - &(col.raw_tensor())); - } else if (data_dim == 3U) { - // vol2col - vol2col(platform::CPUDeviceContext(), in_slice.raw_tensor(), - param.dilations, param.strides, param.paddings, - &(col.raw_tensor())); - } - - // gemm - lite::Tensor out_slice; - out_slice.ShareDataWith( - out_batch.raw_tensor().Slice(g * out_step, (g + 1) * out_step)); - lite::Tensor filter_slice; - filter_slice.ShareDataWith( - filter.raw_tensor().Slice(g * out_step, (g + 1) * out_step)); - blas.MatMul(filter_slice.raw_tensor(), false, col_matrix.raw_tensor(), - false, T(1.0), &(out_slice.raw_tensor()), T(0.0)); - } - } - } - - virtual ~Conv2dCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/conv_compute.h" REGISTER_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::Conv2dCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/conv_compute.h b/paddle/fluid/lite/kernels/x86/conv_compute.h new file mode 100644 index 00000000000..4b308779292 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/conv_compute.h @@ -0,0 +1,153 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/types.h" +#include "paddle/fluid/lite/operators/conv_op.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +inline bool IsExpand(const std::vector& filter_dim, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + +template +class Conv2dCompute : public KernelLite { + public: + using param_t = operators::ConvParam; + void Run() override { + auto& param = *param_.get_mutable(); + lite::Tensor filter = *param.filter; + param.output->template mutable_data(); + + const int batch_size = static_cast(param.x->dims()[0]); + + std::vector filter_shape_vec(filter.dims().Vectorize()); + std::vector output_shape_vec(param.output->dims().Vectorize()); + + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = param.x->dims()[1] / param.groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + lite::DDim col_shape(col_shape_vec); + lite::DDim col_matrix_shape = col_shape.Flattern2D(data_dim + 1); + bool is_expand = IsExpand(filter_shape_vec, param.strides, param.paddings, + param.dilations); + + lite::Tensor col; + lite::Tensor col_matrix; + if (is_expand) { + col.Resize(col_shape); + col.mutable_data(); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + lite::DDim input_shape = param.x->dims().Slice(1, param.x->dims().size()); + + lite::DDim filter_matrix_shape(std::vector{ + filter.dims()[0], filter.dims().production() / filter.dims()[0]}); + filter.Resize(filter_matrix_shape); + + lite::DDim output_matrix_shape(std::vector{ + param.output->dims()[1], + param.output->dims().production() / + (param.output->dims()[0] * param.output->dims()[1])}); + + int in_step = static_cast(param.x->dims()[1]) / param.groups; + int out_step = static_cast(param.output->dims()[1]) / param.groups; + + paddle::operators::math::Vol2ColFunctor + vol2col; + paddle::operators::math::Im2ColFunctor< + paddle::operators::math::ColFormat::kCFO, platform::CPUDeviceContext, T> + im2col; + auto blas = paddle::operators::math::GetBlas( + platform::CPUDeviceContext()); + for (int i = 0; i < batch_size; i++) { + lite::Tensor in_batch; + in_batch.ShareDataWith( + param.x->raw_tensor().Slice(i, i + 1).Resize(input_shape.data())); + lite::Tensor out_batch; + out_batch.ShareDataWith(param.output->raw_tensor().Slice(i, i + 1).Resize( + output_matrix_shape.data())); + + for (int g = 0; g < param.groups; g++) { + lite::Tensor in_slice; + in_slice.ShareDataWith( + in_batch.raw_tensor().Slice(g * in_step, (g + 1) * in_step)); + + if (!is_expand) { + 
col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(platform::CPUDeviceContext(), in_slice.raw_tensor(), + param.dilations, param.strides, + std::vector{param.paddings[0], param.paddings[1], + param.paddings[0], param.paddings[1]}, + &(col.raw_tensor())); + } else if (data_dim == 3U) { + // vol2col + vol2col(platform::CPUDeviceContext(), in_slice.raw_tensor(), + param.dilations, param.strides, param.paddings, + &(col.raw_tensor())); + } + + // gemm + lite::Tensor out_slice; + out_slice.ShareDataWith( + out_batch.raw_tensor().Slice(g * out_step, (g + 1) * out_step)); + lite::Tensor filter_slice; + filter_slice.ShareDataWith( + filter.raw_tensor().Slice(g * out_step, (g + 1) * out_step)); + blas.MatMul(filter_slice.raw_tensor(), false, col_matrix.raw_tensor(), + false, T(1.0), &(out_slice.raw_tensor()), T(0.0)); + } + } + } + + virtual ~Conv2dCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/conv_compute_test.cc b/paddle/fluid/lite/kernels/x86/conv_compute_test.cc new file mode 100644 index 00000000000..be57153b4b5 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/conv_compute_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
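The conv2d test below only logs its result, but the expected value is easy to derive by hand: every input and filter element is 1, padding is 0, stride is 1, and the single 1x3x3x3 filter exactly covers the 3x3x3 input volume:

    // Expected output of the conv2d test: one element, 3 * 3 * 3 = 27
    // (the bias tensor is all zeros, so it does not change the result).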
+ +#include "paddle/fluid/lite/kernels/x86/conv_compute.h" +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(conv_x86, retrive_op) { + auto conv2d = + KernelRegistry::Global().Create( + "conv2d"); + ASSERT_FALSE(conv2d.empty()); + ASSERT_TRUE(conv2d.front()); +} + +TEST(conv2d_x86, init) { + Conv2dCompute conv2d; + ASSERT_EQ(conv2d.precision(), PRECISION(kFloat)); + ASSERT_EQ(conv2d.target(), TARGET(kX86)); +} + +TEST(conv2d_x86, run_test) { + lite::Tensor x, filter, b, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 3, 3}; + x.Resize(lite::DDim(x_shape)); + std::vector filter_shape{1, 3, 3, 3}; + filter.Resize(lite::DDim(filter_shape)); + std::vector b_shape{1, 3, 1, 1}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{batch_size, 1, 1, 1}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto filter_data = filter.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = 1; + } + for (int64_t i = 0; i < filter.dims().production(); i++) { + filter_data[i] = 1; + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = 0; + } + + Conv2dCompute conv2d; + operators::ConvParam param; + + param.x = &x; + param.filter = &filter; + param.bias = &b; + param.output = &out; + param.strides = {1, 1}; + param.paddings = {0, 0}; + param.groups = 1; + param.dilations = {1, 1}; + + conv2d.SetParam(param); + conv2d.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i] << " "; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(conv2d, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/dropout_compute.cc b/paddle/fluid/lite/kernels/x86/dropout_compute.cc index d762ec2a06f..6b68e1da310 100644 --- a/paddle/fluid/lite/kernels/x86/dropout_compute.cc +++ b/paddle/fluid/lite/kernels/x86/dropout_compute.cc @@ -12,72 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -using EigenMatrix = framework::EigenMatrix; - -template -class DropoutCompute : public KernelLite { - public: - using param_t = operators::DropoutParam; - void Run() override { - auto& param = *param_.get_mutable(); - const auto* x_data = param.x->data(); - auto* out_data = param.output->template mutable_data(); - if (!param.is_test) { - auto* mask_data = param.mask->template mutable_data(); - std::random_device rnd; - std::minstd_rand engine; - int seed = param.fix_seed ? 
param.seed : rnd(); - engine.seed(seed); - std::uniform_real_distribution dist(0, 1); - - size_t size = framework::product(param.mask->dims().data()); - for (size_t i = 0; i < size; ++i) { - if (dist(engine) < param.dropout_prob) { - mask_data[i] = 0; - out_data[i] = 0; - } else { - if (param.dropout_implementation == "upscale_in_train") { - mask_data[i] = 1.0f / static_cast(1.0f - param.dropout_prob); - out_data[i] = x_data[i] / static_cast(1.0f - param.dropout_prob); - } else { - mask_data[i] = 1; - out_data[i] = x_data[i]; - } - } - } - } else { - auto X = EigenMatrix::Reshape(param.x->raw_tensor(), 1); - auto Y = EigenMatrix::Reshape(param.output->raw_tensor(), 1); - auto& place = *platform::CPUDeviceContext().eigen_device(); - if (param.dropout_implementation == "upscale_in_train") { - Y.device(place) = X; - } else { - Y.device(place) = X * static_cast(1.0f - param.dropout_prob); - } - } - } - - virtual ~DropoutCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/dropout_compute.h" REGISTER_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::DropoutCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/dropout_compute.h b/paddle/fluid/lite/kernels/x86/dropout_compute.h new file mode 100644 index 00000000000..ee8b51619a5 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/dropout_compute.h @@ -0,0 +1,81 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +using EigenMatrix = framework::EigenMatrix; + +template +class DropoutCompute : public KernelLite { + public: + using param_t = operators::DropoutParam; + void Run() override { + auto& param = *param_.get_mutable(); + const auto* x_data = param.x->data(); + auto* out_data = param.output->template mutable_data(); + if (!param.is_test) { + auto* mask_data = param.mask->template mutable_data(); + std::random_device rnd; + std::minstd_rand engine; + int seed = param.fix_seed ? 
param.seed : rnd(); + engine.seed(seed); + std::uniform_real_distribution dist(0, 1); + + size_t size = framework::product(param.mask->dims().data()); + for (size_t i = 0; i < size; ++i) { + if (dist(engine) < param.dropout_prob) { + mask_data[i] = 0; + out_data[i] = 0; + } else { + if (param.dropout_implementation == "upscale_in_train") { + mask_data[i] = 1.0f / static_cast(1.0f - param.dropout_prob); + out_data[i] = x_data[i] / static_cast(1.0f - param.dropout_prob); + } else { + mask_data[i] = 1; + out_data[i] = x_data[i]; + } + } + } + } else { + auto X = EigenMatrix::Reshape(param.x->raw_tensor(), 1); + auto Y = EigenMatrix::Reshape(param.output->raw_tensor(), 1); + auto& place = *platform::CPUDeviceContext().eigen_device(); + if (param.dropout_implementation == "upscale_in_train") { + Y.device(place) = X; + } else { + Y.device(place) = X * static_cast(1.0f - param.dropout_prob); + } + } + } + + virtual ~DropoutCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/dropout_compute_test.cc b/paddle/fluid/lite/kernels/x86/dropout_compute_test.cc new file mode 100644 index 00000000000..522877857c7 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/dropout_compute_test.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
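The dropout test that follows exercises only the inference path: is_test is true and dropout_implementation is left at its default, so DropoutCompute::Run above takes the else-branch and rescales deterministically:

    // out[i] = x[i] * (1 - dropout_prob) = x[i] * 0.75f   (prob = 0.25)
    // With x[i] = i, the logged output should read 0, 0.75, 1.5, 2.25, ...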
+ +#include "paddle/fluid/lite/kernels/x86/dropout_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(dropout_x86, retrive_op) { + auto dropout = + KernelRegistry::Global().Create( + "dropout"); + ASSERT_FALSE(dropout.empty()); + ASSERT_TRUE(dropout.front()); +} + +TEST(dropout_x86, init) { + DropoutCompute dropout; + ASSERT_EQ(dropout.precision(), PRECISION(kFloat)); + ASSERT_EQ(dropout.target(), TARGET(kX86)); +} + +TEST(dropout_x86, run_test) { + lite::Tensor x, y, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + // DropoutCompute dropout; + DropoutCompute dropout; + operators::DropoutParam param; + + param.x = &x; + param.dropout_prob = 0.25; + param.is_test = true; + param.fix_seed = true; + param.output = &out; + + dropout.SetParam(param); + dropout.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(dropout, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/elementwise_compute.cc b/paddle/fluid/lite/kernels/x86/elementwise_compute.cc index 8e2ea92d6de..06802070634 100644 --- a/paddle/fluid/lite/kernels/x86/elementwise_compute.cc +++ b/paddle/fluid/lite/kernels/x86/elementwise_compute.cc @@ -12,113 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
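Both elementwise kernels in the next file forward to fluid's ElementwiseComputeEx with param.axis, which controls how Y broadcasts against X. As a reminder of those fluid semantics (pre-existing behavior, not something this patch changes):

    // axis picks where Y's dimensions align inside X's shape; axis == -1
    // aligns Y against X's trailing dimensions. Example:
    //   X: {2, 3, 4, 5},  Y: {3, 4},  axis = 1
    //   -> Y is broadcast across dims 0 and 3 of X.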
-#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/lite/kernels/x86/elementwise_compute.h" -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -struct SubFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } -}; - -template -struct AddFunctor { - inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } -}; - -template -class ElementwiseSubCompute - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - - void Run() override { - auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK(context.x86_device_context()); - - param.Out->template mutable_data(); - paddle::operators::ElementwiseComputeEx, - platform::CPUDeviceContext, T>( - *context.x86_execution_context(), ¶m.X->raw_tensor(), - ¶m.Y->raw_tensor(), param.axis, SubFunctor(), - ¶m.Out->raw_tensor()); - } - - virtual ~ElementwiseSubCompute() = default; -}; - -template -struct SubGradDX { - T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -struct SubGradDY { - T operator()(T x, T y, T out, T dout) const { return -dout; } -}; - -template -class ElementwiseSubGradCompute - : public KernelLite { - public: - using param_t = operators::ElementwiseGradParam; - void Run() override { - auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK(context.x86_device_context()); - - param.X_grad->template mutable_data(); - param.Y_grad->template mutable_data(); - // skip out, x, y - auto dout = param.Out_grad->raw_tensor(); - auto dx = param.X_grad->raw_tensor(); - auto dy = param.Y_grad->raw_tensor(); - auto& skip = dout; - paddle::operators::ElemwiseExplicitGradCompute< - platform::CPUDeviceContext, T, SubGradDX, SubGradDY>( - *context.x86_execution_context(), skip, skip, skip, dout, param.axis, - &dx, &dy, SubGradDX(), SubGradDY()); - } - - virtual ~ElementwiseSubGradCompute() = default; -}; - -template -class ElementwiseAddCompute - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - void Run() override { - auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK(context.x86_device_context()); - param.Out->template mutable_data(); - paddle::operators::ElementwiseComputeEx, - platform::CPUDeviceContext, T>( - *context.x86_execution_context(), ¶m.X->raw_tensor(), - ¶m.Y->raw_tensor(), param.axis, AddFunctor(), - ¶m.Out->raw_tensor()); - } - - virtual ~ElementwiseAddCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle - -// float REGISTER_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::ElementwiseSubCompute, def) @@ -127,9 +22,19 @@ REGISTER_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); -REGISTER_LITE_KERNEL(elementwise_sub_grad, kX86, kFloat, kNCHW, - paddle::lite::kernels::x86::ElementwiseSubCompute, +REGISTER_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, + paddle::lite::kernels::x86::ElementwiseAddCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", 
{LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); + +#ifdef LITE_WITH_X86 +REGISTER_LITE_KERNEL( + elementwise_sub_grad, kX86, kFloat, kNCHW, + paddle::lite::kernels::x86::ElementwiseSubGradCompute, def) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) .BindInput(paddle::framework::GradVarName("Out"), {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput(paddle::framework::GradVarName("X"), @@ -137,11 +42,4 @@ REGISTER_LITE_KERNEL(elementwise_sub_grad, kX86, kFloat, kNCHW, .BindOutput(paddle::framework::GradVarName("Y"), {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); - -REGISTER_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, - paddle::lite::kernels::x86::ElementwiseAddCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) - .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) - .Finalize(); +#endif diff --git a/paddle/fluid/lite/kernels/x86/elementwise_compute.h b/paddle/fluid/lite/kernels/x86/elementwise_compute.h new file mode 100644 index 00000000000..de976e52622 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/elementwise_compute.h @@ -0,0 +1,126 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
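The gradient functors declared in the header below follow directly from differentiating out = x - y; stated once, SubGradDX and SubGradDY need no further comment:

    // d(x - y)/dx =  1  =>  dx[i] =  dout[i]   (SubGradDX returns dout)
    // d(x - y)/dy = -1  =>  dy[i] = -dout[i]   (SubGradDY returns -dout)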
+#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +template +class ElementwiseSubCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + CHECK(context.x86_device_context()); + + param.Out->template mutable_data(); + paddle::operators::ElementwiseComputeEx, + platform::CPUDeviceContext, T>( + *context.x86_execution_context(), ¶m.X->raw_tensor(), + ¶m.Y->raw_tensor(), param.axis, SubFunctor(), + ¶m.Out->raw_tensor()); + } + + virtual ~ElementwiseSubCompute() = default; +}; + +template +struct SubGradDX { + T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template +struct SubGradDY { + T operator()(T x, T y, T out, T dout) const { return -dout; } +}; + +#ifdef LITE_WITH_X86 +template +class ElementwiseSubGradCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseGradParam; + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + CHECK(context.x86_device_context()); + + param.X_grad->template mutable_data(); + // skip out, x, y + auto dout = param.Out_grad->raw_tensor(); + auto dx = param.X_grad->raw_tensor(); + + framework::Tensor* dy = nullptr; + if (param.Y_grad) { + param.Y_grad->template mutable_data(); + dy = ¶m.Y_grad->raw_tensor(); + } + auto& skip = dout; + paddle::operators::ElemwiseExplicitGradCompute< + platform::CPUDeviceContext, T, SubGradDX, SubGradDY>( + *context.x86_execution_context(), skip, skip, skip, dout, param.axis, + &dx, dy, SubGradDX(), SubGradDY()); + } + + virtual ~ElementwiseSubGradCompute() = default; +}; +#endif + +template +class ElementwiseAddCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + CHECK(context.x86_device_context()); + param.Out->template mutable_data(); + paddle::operators::ElementwiseComputeEx, + platform::CPUDeviceContext, T>( + *context.x86_execution_context(), ¶m.X->raw_tensor(), + ¶m.Y->raw_tensor(), param.axis, AddFunctor(), + ¶m.Out->raw_tensor()); + } + + virtual ~ElementwiseAddCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/elementwise_compute_test.cc b/paddle/fluid/lite/kernels/x86/elementwise_compute_test.cc new file mode 100644 index 00000000000..8bade95f58c --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/elementwise_compute_test.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/x86/elementwise_compute.h"
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+TEST(elementwise_add_x86, retrive_op) {
+  auto elementwise_add =
+      KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
+          "elementwise_add");
+  ASSERT_FALSE(elementwise_add.empty());
+  ASSERT_TRUE(elementwise_add.front());
+}
+
+TEST(elementwise_add_x86, init) {
+  ElementwiseAddCompute<float> elementwise_add;
+  ASSERT_EQ(elementwise_add.precision(), PRECISION(kFloat));
+  ASSERT_EQ(elementwise_add.target(), TARGET(kX86));
+}
+
+TEST(elementwise_add_x86, run_test) {
+  lite::Tensor x, y, out;
+  constexpr int batch_size = 1;
+  std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
+  x.Resize(lite::DDim(x_shape));
+  std::vector<int64_t> y_shape{batch_size, 3, 2, 2};
+  y.Resize(lite::DDim(y_shape));
+  std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
+  out.Resize(lite::DDim(out_shape));
+
+  auto x_data = x.mutable_data<float>();
+  auto y_data = y.mutable_data<float>();
+  auto out_data = out.mutable_data<float>();
+
+  for (int64_t i = 0; i < x.dims().production(); i++) {
+    x_data[i] = 1;
+  }
+  for (int64_t i = 0; i < y.dims().production(); i++) {
+    y_data[i] = 2;
+  }
+
+  ElementwiseAddCompute<float> elementwise_add;
+  operators::ElementwiseParam param;
+
+  param.X = &x;
+  param.Y = &y;
+  param.Out = &out;
+
+  std::unique_ptr<KernelContext> ctx(new KernelContext);
+  ctx->As<X86Context>();
+  elementwise_add.SetParam(param);
+  elementwise_add.SetContext(std::move(ctx));
+  elementwise_add.Run();
+
+  LOG(INFO) << "output: ";
+  for (int i = 0; i < out.dims().production(); i++) {
+    LOG(INFO) << out_data[i];
+  }
+}
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(elementwise_add, kX86, kFloat, kNCHW, def);
diff --git a/paddle/fluid/lite/kernels/x86/fc_compute.cc b/paddle/fluid/lite/kernels/x86/fc_compute.cc
index dad37febc80..4d5399a90b2 100644
--- a/paddle/fluid/lite/kernels/x86/fc_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/fc_compute.cc
@@ -12,89 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
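The elementwise kernels above hand X, Y, axis, and Out to paddle::operators::ElementwiseComputeEx, which implements Paddle's axis-broadcast rule. A rough standalone sketch of that rule (plain C++; the pre/n/post split is the usual decomposition, the helper name is this edit's own, and Y is assumed to need no further broadcast):

#include <cstdint>
#include <vector>

// Sketch: add y (whose dims match x dims [axis, axis + y_rank)) into every
// matching slice of x. pre/n/post mirror the standard decomposition.
void broadcast_add(const float* x, const std::vector<int64_t>& x_dims,
                   const float* y, const std::vector<int64_t>& y_dims,
                   int axis, float* out) {
  if (axis < 0) axis = static_cast<int>(x_dims.size() - y_dims.size());
  int64_t pre = 1, n = 1, post = 1;
  for (int i = 0; i < axis; ++i) pre *= x_dims[i];
  for (size_t i = 0; i < y_dims.size(); ++i) n *= y_dims[i];
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i)
    post *= x_dims[i];
  for (int64_t p = 0; p < pre; ++p)
    for (int64_t i = 0; i < n; ++i)
      for (int64_t q = 0; q < post; ++q) {
        int64_t idx = (p * n + i) * post + q;
        out[idx] = x[idx] + y[i];  // sub is identical with '-' here
      }
}

In the elementwise_add test above both shapes are equal, so pre == post == 1 and the loops degenerate to a plain per-element add.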
-#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_lite.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/type_system.h" -#include "paddle/fluid/lite/operators/fc_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -void fc_compute_eigen(const T* x, int x_h, int x_w, // - const T* w, int w_h, int w_w, // - const T* b, // - T* out) { - using matrix_t = - Eigen::Matrix; - - Eigen::Map X(x, x_h, x_w); - Eigen::Map W(w, w_h, w_w); - Eigen::Map Out(out, x_h, w_w); - - Out = X * W; - - if (b) { - Eigen::Map> B(b, w_w); - Out = Out.array().rowwise() + B.transpose().array(); - } -} - -template -void fc_compute_naive(const T* x, int x_h, int x_w, // - const T* w, int w_h, int w_w, // - const T* b, // - T* out) { - CHECK_EQ(x_w, w_h); - // out shape: (x_h, w_w) - memset(out, 0, x_h * w_w * sizeof(T)); - for (int i = 0; i < x_h; i++) { - for (int j = 0; j < w_w; j++) { - T tmp = static_cast(0); - for (int k = 0; k < x_w; k++) { - tmp += x[i * x_w + k] * w[k * w_w + j]; - } - out[i * w_w + j] = tmp + b[j]; - } - } -} - -template -class FcCompute : public KernelLite { - public: - using param_t = operators::FcParam; - - void Run() override { - auto& param = *param_.get_mutable(); - CHECK_GE(param.input->dims().size(), 2UL); - CHECK_EQ(param.output->dims().size(), 2UL); - - fc_compute_eigen( - param.input->data(), // x - param.input->dims().Slice(0, param.in_num_col_dims).production(), - param.input->dims() - .Slice(param.in_num_col_dims, param.input->dims().size()) - .production(), - param.w->data(), // w - param.w->dims()[0], // w_h - param.w->dims()[1], // w_w - param.bias->data(), // b - param.output->mutable_data()); - } - - virtual ~FcCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/fc_compute.h" REGISTER_LITE_KERNEL(fc, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::FcCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/fc_compute.h b/paddle/fluid/lite/kernels/x86/fc_compute.h new file mode 100644 index 00000000000..45a4227cd7b --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/fc_compute.h @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once
+
+#include <Eigen/Core>
+#include <cstring>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_lite.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/type_system.h"
+#include "paddle/fluid/lite/operators/fc_op.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/fc_compute.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+template <typename T>
+void fc_compute_eigen(const T* x, int x_h, int x_w,  //
+                      const T* w, int w_h, int w_w,  //
+                      const T* b,                    //
+                      T* out) {
+  using matrix_t =
+      Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+
+  Eigen::Map<const matrix_t> X(x, x_h, x_w);
+  Eigen::Map<const matrix_t> W(w, w_h, w_w);
+  Eigen::Map<matrix_t> Out(out, x_h, w_w);
+
+  Out = X * W;
+
+  if (b) {
+    Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> B(b, w_w);
+    Out = Out.array().rowwise() + B.transpose().array();
+  }
+}
+
+template <typename T>
+void fc_compute_naive(const T* x, int x_h, int x_w,  //
+                      const T* w, int w_h, int w_w,  //
+                      const T* b,                    //
+                      T* out) {
+  CHECK_EQ(x_w, w_h);
+  // out shape: (x_h, w_w)
+  memset(out, 0, x_h * w_w * sizeof(T));
+  for (int i = 0; i < x_h; i++) {
+    for (int j = 0; j < w_w; j++) {
+      T tmp = static_cast<T>(0);
+      for (int k = 0; k < x_w; k++) {
+        tmp += x[i * x_w + k] * w[k * w_w + j];
+      }
+      out[i * w_w + j] = tmp + b[j];
+    }
+  }
+}
+
+template <typename T>
+class FcCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::FcParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    auto& ctx = ctx_->As<X86Context>();
+    // CHECK_GE(param.input->dims().size(), 2UL);
+    // CHECK_EQ(param.output->dims().size(), 2UL);
+
+    auto w_dims = param.w->dims();
+    auto out_dims = param.output->dims();
+    int M = out_dims.production() / w_dims[1];
+
+    auto bias = param.bias;
+    const T* input_data = param.input->data<T>();
+    const T* w_data = param.w->data<T>();
+    T* output_data = param.output->mutable_data<T>();
+
+    auto blas = paddle::operators::math::GetBlas<platform::CPUDeviceContext,
+                                                 T>(*ctx.x86_device_context());
+    paddle::operators::math::FCCompute<platform::CPUDeviceContext, T>(
+        blas, M, w_dims[1], w_dims[0], input_data, w_data, output_data,
+        bias ? bias->data<T>() : nullptr);
+  }
+
+  virtual ~FcCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/kernels/x86/fc_compute.h.bak b/paddle/fluid/lite/kernels/x86/fc_compute.h.bak
new file mode 100644
index 00000000000..dc71ca25601
--- /dev/null
+++ b/paddle/fluid/lite/kernels/x86/fc_compute.h.bak
@@ -0,0 +1,98 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
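The rewritten FcCompute above no longer flattens the input via in_num_col_dims; it infers the GEMM row count from the output size. A sketch of that shape arithmetic (hypothetical helper, not part of the patch):

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// FCCompute evaluates out[M, N] = in[M, K] * w[K, N] (+ bias broadcast over
// rows). With w dims {K, N} and a flat output of M * N elements, M follows:
int64_t fc_rows(const std::vector<int64_t>& out_dims, int64_t n) {
  int64_t total = std::accumulate(out_dims.begin(), out_dims.end(),
                                  static_cast<int64_t>(1),
                                  std::multiplies<int64_t>());
  return total / n;  // e.g. out {2, 4}, w {3, 4}: M = 8 / 4 = 2, K = 3, N = 4
}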
+#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" +#include "paddle/fluid/lite/operators/fc_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +void fc_compute_eigen(const T* x, int x_h, int x_w, // + const T* w, int w_h, int w_w, // + const T* b, // + T* out) { + using matrix_t = + Eigen::Matrix; + + Eigen::Map X(x, x_h, x_w); + Eigen::Map W(w, w_h, w_w); + Eigen::Map Out(out, x_h, w_w); + + Out = X * W; + + if (b) { + Eigen::Map> B(b, w_w); + Out = Out.array().rowwise() + B.transpose().array(); + } +} + +template +void fc_compute_naive(const T* x, int x_h, int x_w, // + const T* w, int w_h, int w_w, // + const T* b, // + T* out) { + CHECK_EQ(x_w, w_h); + // out shape: (x_h, w_w) + memset(out, 0, x_h * w_w * sizeof(T)); + for (int i = 0; i < x_h; i++) { + for (int j = 0; j < w_w; j++) { + T tmp = static_cast(0); + for (int k = 0; k < x_w; k++) { + tmp += x[i * x_w + k] * w[k * w_w + j]; + } + out[i * w_w + j] = tmp + b[j]; + } + } +} + +template +class FcCompute : public KernelLite { + public: + using param_t = operators::FcParam; + + void Run() override { + auto& param = *param_.get_mutable(); + CHECK_GE(param.input->dims().size(), 2UL); + CHECK_EQ(param.output->dims().size(), 2UL); + + fc_compute_eigen( + param.input->data(), // x + param.input->dims().Slice(0, param.in_num_col_dims).production(), + param.input->dims() + .Slice(param.in_num_col_dims, param.input->dims().size()) + .production(), + param.w->data(), // w + param.w->dims()[0], // w_h + param.w->dims()[1], // w_w + param.bias->data(), // b + param.output->mutable_data()); + } + + virtual ~FcCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/fc_compute_test.cc b/paddle/fluid/lite/kernels/x86/fc_compute_test.cc new file mode 100644 index 00000000000..ed6016d341e --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/fc_compute_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/fluid/lite/kernels/x86/fc_compute.h" +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(fc_x86, retrive_op) { + auto fc = + KernelRegistry::Global().Create("fc"); + ASSERT_FALSE(fc.empty()); + ASSERT_TRUE(fc.front()); +} + +TEST(fc_x86, init) { + FcCompute fc; + ASSERT_EQ(fc.precision(), PRECISION(kFloat)); + ASSERT_EQ(fc.target(), TARGET(kX86)); +} + +TEST(fc_x86, run_test) { + lite::Tensor x, w, b, out; + constexpr int batch_size = 2; + std::vector x_shape{batch_size, 3}; + x.Resize(lite::DDim(x_shape)); + std::vector w_shape{3, 4}; + w.Resize(lite::DDim(w_shape)); + std::vector b_shape{1, 4}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{1, 4}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto w_data = w.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < w.dims().production(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = static_cast(i); + } + + /* lite::x86::math::fc_compute_eigen(x_data, batch_size, 3, // + w_data, 3, 4, // + b_data, ref_data); */ + + // FcCompute fc; + FcCompute fc; + operators::FcParam param; + + param.in_num_col_dims = 1; + param.input = &x; + param.w = &w; + param.bias = &b; + param.output = &out; + param.in_mat_dims = x.dims(); + + // std::unique_ptr ctx(new KernelContext); + // ctx->As(); + fc.SetParam(param); + // fc.SetContext(std::move(ctx)); + fc.Run(); + + VLOG(3) << "output vs ref"; + for (int i = 0; i < out.dims().production(); i++) { + VLOG(3) << out_data[i]; + } + + /* for (int i = 0; i < out.dims().product(); ++i) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + }*/ +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/gru_compute.cc b/paddle/fluid/lite/kernels/x86/gru_compute.cc new file mode 100644 index 00000000000..b75ef1d0062 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/gru_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/x86/gru_compute.h" + +REGISTER_LITE_KERNEL(gru, kX86, kFloat, kNCHW, + paddle::lite::kernels::x86::GruCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("H0", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("BatchGate", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("BatchResetHiddenPrev", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("BatchHidden", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Hidden", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/x86/gru_compute.h b/paddle/fluid/lite/kernels/x86/gru_compute.h new file mode 100644 index 00000000000..93bdd9f37b2 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/gru_compute.h @@ -0,0 +1,150 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/detail/activation_functions.h" +#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" +#include "paddle/fluid/operators/math/detail/gru_kernel.h" +#include "paddle/fluid/operators/math/gru_compute.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence2batch.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +inline void ReorderInitState(const lite::Tensor& src, + framework::Vector index_lod, + lite::Tensor* dst, bool indexed_src) { + paddle::operators::math::CopyMatrixRowsFunctor + row_shuffle; + dst->Resize(src.dims()); + dst->mutable_data(); + row_shuffle(platform::CPUDeviceContext(), src.raw_tensor(), index_lod, + &dst->raw_tensor(), indexed_src); +} + +template +class GruCompute : public KernelLite { + public: + using param_t = operators::GruParam; + + void Run() override { + auto& param = *param_.get_mutable(); + // auto& context = context_->As(); + bool origin_mode = param.origin_mode; + auto* input = param.x; + auto* h0 = param.h0; + auto weight = param.weight; + const T* weight_data = weight->data(); + auto* bias = param.bias; + auto* batch_gate = param.batchGate; + batch_gate->mutable_data(); + auto* batch_reset_hidden_prev = param.batchResetHiddenPrev; + batch_reset_hidden_prev->mutable_data(); + auto* batch_hidden = param.batchHidden; + batch_hidden->mutable_data(); + auto* hidden = param.hidden; + hidden->mutable_data(); + auto hidden_dims = hidden->dims(); + bool is_reverse = param.is_reverse; + paddle::operators::math::LoDTensor2BatchFunctor + to_batch; + to_batch(platform::CPUDeviceContext(), input->raw_tensor(), + 
&batch_gate->raw_tensor(), true, is_reverse); + + if (bias) { + paddle::operators::math::RowwiseAdd + add_bias; + add_bias(platform::CPUDeviceContext(), batch_gate->raw_tensor(), + bias->raw_tensor(), &batch_gate->raw_tensor()); + } + + int frame_size = hidden_dims[1]; + paddle::operators::math::GRUMetaValue gru_value; + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = + const_cast(weight_data + 2 * frame_size * frame_size); + lite::Tensor ordered_h0; + + framework::Vector order(batch_gate->raw_tensor().lod()[2]); + + if (h0) { + // Since the batch computing for GRU reorders the input sequences + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState(*h0, order, &ordered_h0, + true); + gru_value.prev_out_value = const_cast(ordered_h0.data()); + } else { + gru_value.prev_out_value = nullptr; + } + auto batch_starts = batch_gate->raw_tensor().lod()[0]; + size_t seq_len = batch_starts.size() - 1; + auto active_node = + paddle::operators::math::detail::GetActivationType(param.activation); + auto active_gate = paddle::operators::math::detail::GetActivationType( + param.gate_activation); + + for (size_t n = 0; n < seq_len; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + int cur_batch_size = bend - bstart; + + lite::Tensor gate_t; + gate_t.ShareDataWith(batch_gate->raw_tensor().Slice(bstart, bend)); + lite::Tensor reset_hidden_prev_t; + reset_hidden_prev_t.ShareDataWith( + batch_reset_hidden_prev->raw_tensor().Slice(bstart, bend)); + Tensor hidden_t; + hidden_t.ShareDataWith(batch_hidden->raw_tensor().Slice(bstart, bend)); + gru_value.output_value = const_cast(hidden_t.data()); + gru_value.gate_value = const_cast(gate_t.data()); + gru_value.reset_output_value = + const_cast(reset_hidden_prev_t.data()); + + paddle::operators::math::GRUUnitFunctor< + platform::CPUDeviceContext, T>::compute(platform::CPUDeviceContext(), + gru_value, frame_size, + cur_batch_size, active_node, + active_gate, origin_mode); + + gru_value.prev_out_value = gru_value.output_value; + } + + paddle::operators::math::Batch2LoDTensorFunctor + to_seq; + batch_hidden->raw_tensor().set_lod(batch_gate->raw_tensor().lod()); + to_seq(platform::CPUDeviceContext(), batch_hidden->raw_tensor(), + &hidden->raw_tensor()); + } + + virtual ~GruCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/lookup_table_compute.cc b/paddle/fluid/lite/kernels/x86/lookup_table_compute.cc new file mode 100644 index 00000000000..7b0ecffc5a2 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/lookup_table_compute.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/x86/lookup_table_compute.h" + +// REGISTER_LITE_KERNEL(lookup_table, kX86, kFloat, kNCHW, +// paddle::lite::kernels::x86::LookupTableCompute, +// def) +// .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) +// .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kX86))}) +// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) +// .Finalize(); +REGISTER_LITE_KERNEL(lookup_table, kX86, kInt64, kNCHW, + paddle::lite::kernels::x86::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/x86/lookup_table_compute.h b/paddle/fluid/lite/kernels/x86/lookup_table_compute.h new file mode 100644 index 00000000000..67f769883ab --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/lookup_table_compute.h @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +/*struct LookupTableTimer { + std::chrono::time_point timer_{}; + uint64_t total_{}; + + void Start() { timer_ = std::chrono::high_resolution_clock::now(); } + void Stop() { + auto duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - timer_); + Log(duration.count()); + } + void Log(uint32_t timespan) { total_ += timespan; } + ~LookupTableTimer() { + LOG(INFO) << "lookup table timer: [" << total_ << "us]"; + } +};*/ + +template +class LookupTableCompute : public KernelLite { + public: + using param_t = operators::LookupTableParam; + + void Run() override { + auto ¶m = *param_.get_mutable(); + // auto& context = context_->As(); + auto *ids_t = param.ids; + auto *output_t = param.output; + + int64_t padding_idx = param.padding_idx; + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->dims().production(); + + auto *table_t = param.w; + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(); + memset(output, 0, output_t->dims().production() * sizeof(T)); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != -1 && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(float)); + } else { + CHECK_LT(ids[i], row_number); + CHECK_GE(ids[i], 0); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(float)); + } + } + } + + virtual ~LookupTableCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git 
a/paddle/fluid/lite/kernels/x86/mul_compute.cc b/paddle/fluid/lite/kernels/x86/mul_compute.cc index ad009893c8a..01dd2171061 100644 --- a/paddle/fluid/lite/kernels/x86/mul_compute.cc +++ b/paddle/fluid/lite/kernels/x86/mul_compute.cc @@ -12,122 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/types.h" -#include "paddle/fluid/operators/math/blas.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -using Tensor = framework::Tensor; - -template -class MulCompute : public KernelLite { - public: - using param_t = operators::MulParam; - - void Run() override { - auto& context = ctx_->As(); - auto& param = *param_.get_mutable(); - CHECK(context.x86_device_context()); - - param.output->template mutable_data(); - - auto* x = ¶m.x->raw_tensor(); - auto* y = ¶m.y->raw_tensor(); - - const Tensor x_matrix = x->dims().size() > 2 ? framework::ReshapeToMatrix( - *x, param.x_num_col_dims) - : *x; - const Tensor y_matrix = y->dims().size() > 2 ? framework::ReshapeToMatrix( - *y, param.y_num_col_dims) - : *y; - - auto* z = ¶m.output->raw_tensor(); - auto z_dim = z->dims(); - if (z_dim.size() != 2) { - z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - - auto blas = paddle::operators::math::GetBlas( - *context.x86_device_context()); - - blas.MatMul(x_matrix, y_matrix, z); - if (z_dim.size() != 2) { - z->Resize(z_dim); - } - } - - virtual ~MulCompute() = default; -}; - -template -class MulGradCompute : public KernelLite { - public: - void Run() override { - auto& context = ctx_->As(); - auto& param = *param_.get_mutable(); - CHECK(context.x86_device_context()); - - auto* x = ¶m.x->raw_tensor(); - auto* y = ¶m.y->raw_tensor(); - auto x_matrix = x->dims().size() > 2 - ? framework::ReshapeToMatrix(*x, param.x_num_col_dims) - : static_cast(*x); - auto y_matrix = y->dims().size() > 2 - ? framework::ReshapeToMatrix(*y, param.y_num_col_dims) - : static_cast(*y); - auto* dout = ¶m.output_grad->raw_tensor(); - - Tensor dout_mat; - dout_mat.ShareDataWith(*dout); - dout_mat.Resize( - {framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0], - framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]}); - - auto* dx = ¶m.x_grad->raw_tensor(); - auto* dy = ¶m.y_grad->raw_tensor(); - - if (dx != nullptr) { - dx->set_lod(x->lod()); - } - if (dy != nullptr) { - dy->set_lod(y->lod()); - } - - auto blas = paddle::operators::math::GetBlas( - *context.x86_device_context()); - if (dx) { - // dx->mutable_data(context.x86_device_context->GetPlace()); - param.x_grad->template mutable_data(); - Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix( - *dx, param.x_num_col_dims) - : *dx; - - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); - } - if (dy) { - // dy->yutable_data(context.x86_device_context->GetPlace()); - param.y_grad->template mutable_data(); - Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix( - *dy, param.y_num_col_dims) - : *dy; - // dy = x' * dout. 
dy K x N, dout : M x N, x : M x K - blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); - } - } - - virtual ~MulGradCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/mul_compute.h" REGISTER_LITE_KERNEL(mul, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::MulCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/mul_compute.h b/paddle/fluid/lite/kernels/x86/mul_compute.h new file mode 100644 index 00000000000..0f95fea934a --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/mul_compute.h @@ -0,0 +1,149 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/types.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +using Tensor = framework::Tensor; + +template +class MulCompute : public KernelLite { + public: + using param_t = operators::MulParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + CHECK(context.x86_device_context()); + + param.output->template mutable_data(); + + auto* x = ¶m.x->raw_tensor(); + auto* y = ¶m.y->raw_tensor(); + + Tensor x_matrix, y_matrix; + + if (x->dims().size() > 2) { + x_matrix = framework::ReshapeToMatrix(*x, param.x_num_col_dims); + } else { + x_matrix = *x; + } + + if (y->dims().size() > 2) { + y_matrix = framework::ReshapeToMatrix(*y, param.y_num_col_dims); + + } else { + y_matrix = *y; + } + + auto* z = ¶m.output->raw_tensor(); + auto z_dim = z->dims(); + if (z_dim.size() != 2) { + z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + + auto blas = paddle::operators::math::GetBlas( + *context.x86_device_context()); + + blas.MatMul(x_matrix, y_matrix, z); + if (z_dim.size() != 2) { + z->Resize(z_dim); + } + } + + virtual ~MulCompute() = default; +}; + +template +class MulGradCompute : public KernelLite { + public: + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + CHECK(context.x86_device_context()); + + auto* x = ¶m.x->raw_tensor(); + auto* y = ¶m.y->raw_tensor(); + + Tensor x_matrix, y_matrix; + + if (x->dims().size() > 2) { + x_matrix = framework::ReshapeToMatrix(*x, param.x_num_col_dims); + } else { + x_matrix = *x; + } + + if (y->dims().size() > 2) { + y_matrix = framework::ReshapeToMatrix(*y, param.y_num_col_dims); + + } else { + y_matrix = *y; + } + + auto* dout = ¶m.output_grad->raw_tensor(); + + Tensor dout_mat; + dout_mat.ShareDataWith(*dout); + dout_mat.Resize( + {framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]}); + + auto* dx = ¶m.x_grad->raw_tensor(); + auto* dy = ¶m.y_grad->raw_tensor(); + + if (dx != nullptr) { + dx->set_lod(x->lod()); + } + if (dy != nullptr) { + 
dy->set_lod(y->lod()); + } + + auto blas = paddle::operators::math::GetBlas( + *context.x86_device_context()); + if (dx) { + // dx->mutable_data(context.x86_device_context->GetPlace()); + param.x_grad->template mutable_data(); + Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix( + *dx, param.x_num_col_dims) + : *dx; + + // dx = dout * y'. dx: M x K, dout : M x N, y : K x N + blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); + } + if (dy) { + // dy->yutable_data(context.x86_device_context->GetPlace()); + param.y_grad->template mutable_data(); + Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix( + *dy, param.y_num_col_dims) + : *dy; + // dy = x' * dout. dy K x N, dout : M x N, x : M x K + blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); + } + } + + virtual ~MulGradCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/mul_compute.h.bak b/paddle/fluid/lite/kernels/x86/mul_compute.h.bak new file mode 100644 index 00000000000..21267f3ddc6 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/mul_compute.h.bak @@ -0,0 +1,143 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/types.h" +#include "paddle/fluid/operators/math/blas.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +using Tensor = framework::Tensor; + +template +void fc_compute_eigen(const T* x, int x_h, int x_w, // + const T* w, int w_h, int w_w, // + T* out) { + using matrix_t = + Eigen::Matrix; + + Eigen::Map X(x, x_h, x_w); + Eigen::Map W(w, w_h, w_w); + Eigen::Map Out(out, x_h, w_w); + + Out = X * W; + +} + +template +class MulCompute : public KernelLite { + public: + using param_t = operators::MulParam; + void Run() override { + auto& param = *param_.get_mutable(); + auto out_dims = param.output->dims(); + fc_compute_eigen( + param.x->data(), // x + param.x->dims().Slice(0, param.x_num_col_dims).production(), + param.x->dims() + .Slice(param.x_num_col_dims, param.x->dims().size()) + .production(), + param.y->data(), // w + param.y->dims().Slice(0, param.x_num_col_dims).production(), // w_h + param.y->dims() + .Slice(param.y_num_col_dims, param.y->dims().size()) + .production(), // w_w + param.output->mutable_data()); + + if (param.output->dims().size() != 2) { + param.output->Resize(out_dims); + } + } + + virtual ~MulCompute() = default; +}; + +template +class MulGradCompute : public KernelLite { + public: + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + CHECK(context.x86_device_context()); + + auto* x = ¶m.x->raw_tensor(); + auto* y = ¶m.y->raw_tensor(); + + Tensor x_matrix, y_matrix; + + if (x->dims().size() > 2) { + x_matrix = framework::ReshapeToMatrix(*x, param.x_num_col_dims); + } else { + x_matrix = *x; + } + + if (y->dims().size() > 2) { + y_matrix = framework::ReshapeToMatrix(*y, param.y_num_col_dims); + + } else { + y_matrix = *y; + } + + auto* dout = ¶m.output_grad->raw_tensor(); + + Tensor dout_mat; + dout_mat.ShareDataWith(*dout); + dout_mat.Resize( + {framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0], + framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]}); + + auto* dx = ¶m.x_grad->raw_tensor(); + auto* dy = ¶m.y_grad->raw_tensor(); + + if (dx != nullptr) { + dx->set_lod(x->lod()); + } + if (dy != nullptr) { + dy->set_lod(y->lod()); + } + + auto blas = paddle::operators::math::GetBlas( + *context.x86_device_context()); + if (dx) { + // dx->mutable_data(context.x86_device_context->GetPlace()); + param.x_grad->template mutable_data(); + Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix( + *dx, param.x_num_col_dims) + : *dx; + + // dx = dout * y'. dx: M x K, dout : M x N, y : K x N + blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); + } + if (dy) { + // dy->yutable_data(context.x86_device_context->GetPlace()); + param.y_grad->template mutable_data(); + Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix( + *dy, param.y_num_col_dims) + : *dy; + // dy = x' * dout. dy K x N, dout : M x N, x : M x K + blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); + } + } + + virtual ~MulGradCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/mul_compute_test.cc b/paddle/fluid/lite/kernels/x86/mul_compute_test.cc new file mode 100644 index 00000000000..c551754328e --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/mul_compute_test.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/x86/mul_compute.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(mul_x86, retrive_op) { + auto mul = + KernelRegistry::Global().Create("mul"); + ASSERT_FALSE(mul.empty()); + ASSERT_TRUE(mul.front()); +} + +TEST(mul_x86, init) { + MulCompute mul; + ASSERT_EQ(mul.precision(), PRECISION(kFloat)); + ASSERT_EQ(mul.target(), TARGET(kX86)); +} + +TEST(mul_x86, run_test) { + lite::Tensor x, y, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3}; + x.Resize(lite::DDim(x_shape)); + std::vector y_shape{3, 4}; + y.Resize(lite::DDim(y_shape)); + std::vector out_shape{batch_size, 4}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto y_data = y.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < y.dims().production(); i++) { + y_data[i] = static_cast(i); + } + // MulCompute mul; + MulCompute mul; + operators::MulParam param; + + param.x = &x; + param.y = &y; + param.output = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + mul.SetContext(std::move(ctx)); + mul.SetParam(param); + mul.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/pool_compute.cc b/paddle/fluid/lite/kernels/x86/pool_compute.cc index 745c2a78789..7c188db3c73 100644 --- a/paddle/fluid/lite/kernels/x86/pool_compute.cc +++ b/paddle/fluid/lite/kernels/x86/pool_compute.cc @@ -12,66 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
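For the mul_x86 run_test above, x = [0, 1, 2] and y[k][j] = 4k + j, so out[j] = (4 + j) + 2 * (8 + j) = 20 + 3j. A check one could append after mul.Run() (reference values hand-computed, not from the patch):

// expected: {20, 23, 26, 29}
const float ref[] = {20, 23, 26, 29};
for (int i = 0; i < out.dims().production(); ++i) {
  EXPECT_NEAR(out_data[i], ref[i], 1e-5);
}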
-#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/types.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/pooling.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -class PoolCompute : public KernelLite { - public: - using param_t = operators::PoolParam; - void Run() override { - auto& param = *param_.get_mutable(); - if (param.global_pooling) { - for (size_t i = 0; i < param.ksize.size(); ++i) { - param.paddings[i] = 0; - param.ksize[i] = static_cast(param.x->dims()[i + 2]); - } - } - switch (param.ksize.size()) { - case 2: { - if (param.pooling_type == "max") { - paddle::operators::math::Pool2dFunctor< - platform::CPUDeviceContext, paddle::operators::math::MaxPool, - T> - pool2d_forward; - paddle::operators::math::MaxPool pool_process; - pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(), - param.ksize, param.strides, param.paddings, - pool_process, true, false, - &(param.output->raw_tensor())); - } else if (param.pooling_type == "avg") { - paddle::operators::math::Pool2dFunctor< - platform::CPUDeviceContext, paddle::operators::math::AvgPool, - T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(), - param.ksize, param.strides, param.paddings, - pool_process, param.exclusive, param.adaptive, - &(param.output->raw_tensor())); - } - } break; - case 3: { - } break; - } - } - virtual ~PoolCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/pool_compute.h" REGISTER_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::PoolCompute, def) diff --git a/paddle/fluid/lite/kernels/x86/pool_compute.h b/paddle/fluid/lite/kernels/x86/pool_compute.h new file mode 100644 index 00000000000..d024c5b84e3 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/pool_compute.h @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/types.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class PoolCompute : public KernelLite { + public: + using param_t = operators::PoolParam; + void Run() override { + auto& param = *param_.get_mutable(); + if (param.global_pooling) { + for (size_t i = 0; i < param.ksize.size(); ++i) { + param.paddings[i] = 0; + param.ksize[i] = static_cast(param.x->dims()[i + 2]); + } + } + switch (param.ksize.size()) { + case 2: { + if (param.pooling_type == "max") { + paddle::operators::math::Pool2dFunctor< + platform::CPUDeviceContext, paddle::operators::math::MaxPool, + T> + pool2d_forward; + paddle::operators::math::MaxPool pool_process; + pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(), + param.ksize, param.strides, param.paddings, + pool_process, true, false, + &(param.output->raw_tensor())); + } else if (param.pooling_type == "avg") { + paddle::operators::math::Pool2dFunctor< + platform::CPUDeviceContext, paddle::operators::math::AvgPool, + T> + pool2d_forward; + paddle::operators::math::AvgPool pool_process; + pool2d_forward(platform::CPUDeviceContext(), param.x->raw_tensor(), + param.ksize, param.strides, param.paddings, + pool_process, param.exclusive, param.adaptive, + &(param.output->raw_tensor())); + } + } break; + case 3: { + } break; + } + } + virtual ~PoolCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/pool_compute_test.cc b/paddle/fluid/lite/kernels/x86/pool_compute_test.cc new file mode 100644 index 00000000000..b3d83350910 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/pool_compute_test.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
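PoolCompute above dispatches to Paddle's Pool2dFunctor. As a plain reference for the 2-D "max" branch, a naive NCHW max pooling sketch (this edit's own, zero padding assumed):

#include <algorithm>
#include <cfloat>

// Naive NCHW max pooling: output size oh x ow with kernel kh x kw and
// strides sh x sw; n * c channel planes are handled as one flat loop.
void max_pool2d(const float* in, int n, int c, int h, int w, int kh, int kw,
                int sh, int sw, float* out) {
  int oh = (h - kh) / sh + 1, ow = (w - kw) / sw + 1;
  for (int i = 0; i < n * c; ++i)
    for (int y = 0; y < oh; ++y)
      for (int x = 0; x < ow; ++x) {
        float m = -FLT_MAX;
        for (int ky = 0; ky < kh; ++ky)
          for (int kx = 0; kx < kw; ++kx)
            m = std::max(m, in[(i * h + y * sh + ky) * w + x * sw + kx]);
        out[(i * oh + y) * ow + x] = m;
      }
}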
+ +#include "paddle/fluid/lite/kernels/x86/pool_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(pool_x86, retrive_op) { + auto pool2d = + KernelRegistry::Global().Create( + "pool2d"); + ASSERT_FALSE(pool2d.empty()); + ASSERT_TRUE(pool2d.front()); +} + +TEST(pool2d_x86, init) { + PoolCompute pool2d; + ASSERT_EQ(pool2d.precision(), PRECISION(kFloat)); + ASSERT_EQ(pool2d.target(), TARGET(kX86)); +} + +TEST(pool2d_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 4, 4}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + + PoolCompute pool2d; + operators::PoolParam param; + + param.x = &x; + param.output = &out; + param.strides = {2, 2}; + param.paddings = {0, 0}; + param.ksize = {2, 2}; + param.pooling_type = "max"; + + pool2d.SetParam(param); + pool2d.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/reduce_compute.cc b/paddle/fluid/lite/kernels/x86/reduce_compute.cc new file mode 100644 index 00000000000..9a45fabebd1 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/reduce_compute.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/kernels/x86/reduce_compute.h" + +REGISTER_LITE_KERNEL(reduce_sum, kX86, kFloat, kNCHW, + paddle::lite::kernels::x86::ReduceSumCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/x86/reduce_compute.h b/paddle/fluid/lite/kernels/x86/reduce_compute.h new file mode 100644 index 00000000000..d465a5a54d5 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/reduce_compute.h @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+struct SumFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->sum(dim);
+  }
+};
+
+#define HANDLE_DIM(NDIM, RDIM)                                            \
+  if (ndim == NDIM && rdim == RDIM) {                                     \
+    paddle::operators::ReduceFunctor<platform::CPUDeviceContext, T, NDIM, \
+                                     RDIM, SumFunctor>(                   \
+        platform::CPUDeviceContext(), input->raw_tensor(),                \
+        &output->raw_tensor(), dims, keep_dim);                           \
+  }
+
+template <typename T>
+class ReduceSumCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ReduceParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    // auto& context = ctx_->As<X86Context>();
+    bool reduce_all = param.reduce_all;
+    auto* input = param.x;
+    auto* output = param.output;
+    param.output->mutable_data<T>();
+
+    auto dims = param.dim;
+    bool keep_dim = param.keep_dim;
+    if (reduce_all) {
+      // Flatten to a 1-D tensor and reduce over dimension 0.
+      auto x = paddle::operators::EigenVector<T>::Flatten(input->raw_tensor());
+      auto out = paddle::operators::EigenScalar<T>::From(output->raw_tensor());
+      auto& place = *platform::CPUDeviceContext().eigen_device();
+      auto reduce_dim = Eigen::array<int, 1>({{0}});
+      SumFunctor functor;
+      functor(place, &x, &out, reduce_dim);
+    } else {
+      int ndim = input->dims().size();
+      int rdim = dims.size();
+      HANDLE_DIM(4, 3);
+      HANDLE_DIM(4, 2);
+      HANDLE_DIM(4, 1);
+      HANDLE_DIM(3, 2);
+      HANDLE_DIM(3, 1);
+      HANDLE_DIM(2, 1);
+      HANDLE_DIM(1, 1);
+    }
+  }
+
+  virtual ~ReduceSumCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/kernels/x86/relu_compute.cc b/paddle/fluid/lite/kernels/x86/relu_compute.cc
index 52fffb57981..326df35beff 100644
--- a/paddle/fluid/lite/kernels/x86/relu_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/relu_compute.cc
@@ -12,42 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
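The HANDLE_DIM macro above exists because Eigen reductions need the tensor rank and the reduced rank at compile time, so the kernel enumerates the supported (ndim, rdim) pairs. The same dispatch pattern in isolation (illustrative only):

#include <iostream>

// Map runtime ranks onto compile-time template instantiations.
template <int NDIM, int RDIM>
void reduce_impl() {
  std::cout << "reduce rank-" << NDIM << " over " << RDIM << " dims\n";
}

void dispatch(int ndim, int rdim) {
  if (ndim == 4 && rdim == 3) reduce_impl<4, 3>();
  if (ndim == 4 && rdim == 2) reduce_impl<4, 2>();
  if (ndim == 2 && rdim == 1) reduce_impl<2, 1>();
  // ... one branch per supported pair, exactly like HANDLE_DIM above
}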
-#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_lite.h" -#include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/type_system.h" -#include "paddle/fluid/lite/operators/relu_op.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -class ReluCompute : public KernelLite { - public: - using param_t = operators::ReluParam; - - void Run() override { - auto& param = *param_.get_mutable(); - auto n = param.input->dims().production(); - const float* input = param.input->data(); - float* output = param.output->mutable_data(); - for (int i = 0; i < n; i++) { - output[i] = std::max(0.f, input[i]); - } - } - - virtual ~ReluCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle +#include "paddle/fluid/lite/kernels/x86/relu_compute.h" REGISTER_LITE_KERNEL(relu, kX86, kFloat, kNCHW, paddle::lite::kernels::x86::ReluCompute, def) diff --git a/paddle/fluid/lite/kernels/arm/relu_compute.h b/paddle/fluid/lite/kernels/x86/relu_compute.h similarity index 60% rename from paddle/fluid/lite/kernels/arm/relu_compute.h rename to paddle/fluid/lite/kernels/x86/relu_compute.h index 29d17bf5918..0976ff80f48 100644 --- a/paddle/fluid/lite/kernels/arm/relu_compute.h +++ b/paddle/fluid/lite/kernels/x86/relu_compute.h @@ -11,38 +11,42 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #pragma once + +#include #include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_lite.h" #include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" +#include "paddle/fluid/lite/operators/relu_op.h" namespace paddle { namespace lite { namespace kernels { -namespace arm { +namespace x86 { -class ReluCompute : public KernelLite { +template +class ReluCompute : public KernelLite { public: + using param_t = operators::ActivationParam; + void Run() override { - auto& param = Param(); - auto n = param.input->dims().production(); - const float* input = param.input->data(); - float* output = param.output->mutable_data(); + auto& param = *param_.get_mutable(); + auto n = param.X->dims().production(); + const float* input = param.X->data(); + float* output = param.Out->mutable_data(); for (int i = 0; i < n; i++) { output[i] = std::max(0.f, input[i]); } } - TargetType target() const override { return TARGET(kARM); } - PrecisionType precision() const override { return PRECISION(kFloat); } + virtual ~ReluCompute() = default; }; -} // namespace arm +} // namespace x86 } // namespace kernels } // namespace lite } // namespace paddle - -REGISTER_LITE_KERNEL(relu, kARM, kFloat, kNCHW, - paddle::lite::kernels::arm::ReluCompute, def) - .Finalize(); diff --git a/paddle/fluid/lite/kernels/x86/relu_compute_test.cc b/paddle/fluid/lite/kernels/x86/relu_compute_test.cc new file mode 100644 index 00000000000..f91cba535e0 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/relu_compute_test.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/x86/relu_compute.h"
+#include <gtest/gtest.h>
+#include <iostream>
+#include <vector>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+TEST(relu_x86, retrive_op) {
+  auto relu =
+      KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>("relu");
+  ASSERT_FALSE(relu.empty());
+  ASSERT_TRUE(relu.front());
+}
+
+TEST(relu_x86, init) {
+  ReluCompute<float> relu;
+  ASSERT_EQ(relu.precision(), PRECISION(kFloat));
+  ASSERT_EQ(relu.target(), TARGET(kX86));
+}
+
+TEST(relu_x86, run_test) {
+  lite::Tensor x, out;
+  constexpr int batch_size = 1;
+  std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
+  x.Resize(lite::DDim(x_shape));
+  std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
+  out.Resize(lite::DDim(out_shape));
+
+  auto x_data = x.mutable_data<float>();
+  auto out_data = out.mutable_data<float>();
+
+  for (int64_t i = 0; i < x.dims().production(); i++) {
+    int sign = i % 2 == 0 ? 1 : -1;
+    x_data[i] = static_cast<float>(i * sign);
+  }
+  // ReluCompute relu;
+  ReluCompute<float> relu;
+  operators::ActivationParam param;
+
+  param.X = &x;
+  param.Out = &out;
+
+  relu.SetParam(param);
+  relu.Run();
+
+  LOG(INFO) << "output: ";
+  for (int i = 0; i < out.dims().production(); i++) {
+    LOG(INFO) << out_data[i];
+  }
+}
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
diff --git a/paddle/fluid/lite/kernels/x86/reshape_compute.cc b/paddle/fluid/lite/kernels/x86/reshape_compute.cc
new file mode 100644
index 00000000000..85c94caff9b
--- /dev/null
+++ b/paddle/fluid/lite/kernels/x86/reshape_compute.cc
@@ -0,0 +1,27 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/fluid/lite/kernels/x86/reshape_compute.h" + +REGISTER_LITE_KERNEL(reshape, kX86, kFloat, kNCHW, + paddle::lite::kernels::x86::ReshapeCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); +REGISTER_LITE_KERNEL(reshape2, kX86, kFloat, kNCHW, + paddle::lite::kernels::x86::ReshapeCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/x86/reshape_compute.h b/paddle/fluid/lite/kernels/x86/reshape_compute.h new file mode 100644 index 00000000000..007e98141ec --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/reshape_compute.h @@ -0,0 +1,61 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/operators/reshape_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class ReshapeCompute : public KernelLite { + public: + using param_t = operators::ReshapeParam; + + void Run() override { + auto ¶m = *param_.get_mutable(); + // auto& context = context_->As(); + CHECK(param.output); + CHECK(param.x); + + auto *shape_tensor = param.actual_shape; + lite::DDim out_dims = param.output->dims(); + if (shape_tensor) { + auto *shape_data = shape_tensor->data(); + auto shape = std::vector( + shape_data, shape_data + shape_tensor->dims().production()); + out_dims = paddle::lite::operators::ValidateShape(shape, param.x->dims()); + } + + param.output->mutable_data(); + framework::TensorCopy(param.x->raw_tensor(), platform::CPUPlace(), + platform::CPUDeviceContext(), + ¶m.output->raw_tensor()); + param.output->Resize(out_dims); + } + virtual ~ReshapeCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/scale_compute.cc b/paddle/fluid/lite/kernels/x86/scale_compute.cc index 0135a6f614e..9a71750cf1e 100644 --- a/paddle/fluid/lite/kernels/x86/scale_compute.cc +++ b/paddle/fluid/lite/kernels/x86/scale_compute.cc @@ -12,48 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include <Eigen/Core>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/lite/core/kernel.h"
-#include "paddle/fluid/lite/core/op_lite.h"
-#include "paddle/fluid/lite/core/op_registry.h"
-#include "paddle/fluid/lite/core/type_system.h"
-#include "paddle/fluid/lite/operators/relu_op.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace x86 {
-
-template <typename T>
-void scale_compute(const T* x, T* out, int size, float scale, float bias,
-                   bool bias_before) {
-  if (bias_before) bias *= scale;
-  for (int i = 0; i < size; i++) {
-    out[i] = x[i] * scale + bias;
-  }
-}
-
-template <typename T>
-class ScaleCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
- public:
-  using param_t = operators::ScaleParam;
-
-  void Run() override {
-    auto& param = *param_.get_mutable<param_t>();
-    scale_compute(param.x->data<T>(), param.output->mutable_data<T>(),
-                  param.x->dims().production(), param.scale, param.bias,
-                  param.bias_after_scale);
-  }
-
-  virtual ~ScaleCompute() = default;
-};
-
-}  // namespace x86
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
+#include "paddle/fluid/lite/kernels/x86/scale_compute.h"
 
 REGISTER_LITE_KERNEL(scale, kX86, kFloat, kNCHW,
                      paddle::lite::kernels::x86::ScaleCompute<float>, def)
diff --git a/paddle/fluid/lite/kernels/x86/scale_compute.h b/paddle/fluid/lite/kernels/x86/scale_compute.h
new file mode 100644
index 00000000000..dc54cc07bd8
--- /dev/null
+++ b/paddle/fluid/lite/kernels/x86/scale_compute.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <Eigen/Core>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_lite.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/type_system.h"
+#include "paddle/fluid/lite/operators/relu_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+template <typename T>
+void scale_compute(const T* x, T* out, int size, float scale, float bias,
+                   bool bias_before) {
+  if (bias_before) bias *= scale;
+  for (int i = 0; i < size; i++) {
+    out[i] = x[i] * scale + bias;
+  }
+}
+
+template <typename T>
+class ScaleCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ScaleParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    scale_compute(param.x->data<T>(), param.output->mutable_data<T>(),
+                  param.x->dims().production(), param.scale, param.bias,
+                  param.bias_after_scale);
+  }
+
+  virtual ~ScaleCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/kernels/x86/scale_compute_test.cc b/paddle/fluid/lite/kernels/x86/scale_compute_test.cc
new file mode 100644
index 00000000000..68d0e67cdf6
--- /dev/null
+++ b/paddle/fluid/lite/kernels/x86/scale_compute_test.cc
@@ -0,0 +1,76 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/kernels/x86/scale_compute.h"
+#include <gtest/gtest.h>
+#include <iostream>
+#include <vector>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+TEST(scale_x86, retrive_op) {
+  auto scale =
+      KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>("scale");
+  ASSERT_FALSE(scale.empty());
+  ASSERT_TRUE(scale.front());
+}
+
+TEST(scale_x86, init) {
+  ScaleCompute<float> scale;
+  ASSERT_EQ(scale.precision(), PRECISION(kFloat));
+  ASSERT_EQ(scale.target(), TARGET(kX86));
+}
+
+TEST(scale_x86, run_test) {
+  lite::Tensor x, y, out;
+  constexpr int batch_size = 1;
+  std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
+  x.Resize(lite::DDim(x_shape));
+  std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
+  out.Resize(lite::DDim(out_shape));
+
+  auto x_data = x.mutable_data<float>();
+  auto out_data = out.mutable_data<float>();
+
+  for (int64_t i = 0; i < x.dims().production(); i++) {
+    x_data[i] = static_cast<float>(i);
+  }
+  // ScaleCompute scale;
+  ScaleCompute<float> scale;
+  operators::ScaleParam param;
+
+  param.x = &x;
+  param.scale = 0.5;
+  param.bias = 0;
+  param.output = &out;
+
+  scale.SetParam(param);
+  scale.Run();
+
+  LOG(INFO) << "output: ";
+  for (int i = 0; i < out.dims().production(); i++) {
+    LOG(INFO) << out_data[i];
+  }
+}
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(scale, kX86, kFloat, kNCHW, def);
diff --git a/paddle/fluid/lite/kernels/x86/sequence_reshape_compute.cc b/paddle/fluid/lite/kernels/x86/sequence_reshape_compute.cc
new file mode 100644
index 00000000000..13042d8d867
--- /dev/null
+++ b/paddle/fluid/lite/kernels/x86/sequence_reshape_compute.cc
@@ -0,0 +1,22 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/fluid/lite/kernels/x86/sequence_reshape_compute.h" + +REGISTER_LITE_KERNEL(sequence_reshape, kX86, kFloat, kNCHW, + paddle::lite::kernels::x86::SequenceReshapeCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/x86/sequence_reshape_compute.h b/paddle/fluid/lite/kernels/x86/sequence_reshape_compute.h new file mode 100644 index 00000000000..785eaa05360 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/sequence_reshape_compute.h @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceReshapeCompute + : public KernelLite { + public: + using param_t = operators::SequenceReshapeParam; + + void Run() override { + auto& param = *param_.get_mutable(); + // auto& context = context_->As(); + auto* in = param.x; + auto* out = param.output; + int out_width = param.new_dim; + + auto in_dims = in->dims(); + int64_t in_width = in_dims[1]; + // LOG(INFO)<<"sequence_reshape in tensor:"<<*in; + auto& in_lod = in->raw_tensor().lod(); + + CHECK_EQ(in_lod.size(), 1UL); + CHECK_EQ((uint64_t)in_dims[0], in_lod[0].back()); + + auto in_lod_l0 = in_lod[0]; + int seq_num = in_lod_l0.size() - 1; + + if (in_width == out_width) { + out->raw_tensor().set_lod(in->lod()); + } else { + auto& out_lod = *out->raw_tensor().mutable_lod(); + out_lod.resize(1); + out_lod[0].resize(seq_num + 1); + out_lod[0][0] = 0; + for (int i = 0; i < seq_num; ++i) { + size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i]; + size_t offset = 0; + offset = (seq_len * in_width) / out_width; + CHECK_EQ(offset * out_width, seq_len * in_width); + out_lod[0][i + 1] = out_lod[0][i] + offset; + } + } + + // out->mutable_data(); + framework::TensorCopy(in->raw_tensor(), platform::CPUPlace(), + &out->raw_tensor()); + std::vector out_shape{static_cast(out->lod()[0].back()), + out_width}; + out->Resize(lite::DDim(out_shape)); + } + + virtual ~SequenceReshapeCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/x86/sgd_compute.cc b/paddle/fluid/lite/kernels/x86/sgd_compute.cc index 2b50c9172a0..593b14eb52d 100644 --- a/paddle/fluid/lite/kernels/x86/sgd_compute.cc +++ b/paddle/fluid/lite/kernels/x86/sgd_compute.cc @@ -49,6 +49,7 @@ class SGDCompute : public KernelLite { const T *param_data = param->template data(); const T *grad_data = grad->template data(); int64_t rows_idx = 0; + T *out_data = param_out->template mutable_data( 
         context.x86_device_context()->GetPlace());
diff --git a/paddle/fluid/lite/kernels/x86/softmax_compute.cc b/paddle/fluid/lite/kernels/x86/softmax_compute.cc
index fe408aa3c84..5bdb58b6887 100644
--- a/paddle/fluid/lite/kernels/x86/softmax_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/softmax_compute.cc
@@ -12,76 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/lite/core/kernel.h"
-#include "paddle/fluid/lite/core/op_registry.h"
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace x86 {
-
-static inline int CanonicalAxis(const int axis, const int rank) {
-  if (axis < 0) {
-    return axis + rank;
-  }
-  return axis;
-}
-
-static inline int SizeToAxis(const int axis, lite::DDim dims) {
-  int size = 1;
-  for (int i = 0; i < axis; i++) {
-    size *= dims[i];
-  }
-  return size;
-}
-
-static inline int SizeFromAxis(const int axis, lite::DDim dims) {
-  int size = 1;
-  for (int i = axis; i < dims.size(); i++) {
-    size *= dims[i];
-  }
-  return size;
-}
-
-template <typename T>
-class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
- public:
-  using param_t = operators::SoftmaxParam;
-
-  void Run() override {
-    auto& param = *param_.get_mutable<param_t>();
-    // auto& context = context_->As<X86Context>();
-    CHECK(param.output);
-    CHECK(param.x);
-    const int rank = param.x->dims().size();
-    const int axis = CanonicalAxis(param.axis, rank);
-    int axis_dim = param.x->dims()[axis];
-    const int n = SizeToAxis(axis, param.x->dims());
-    const int d = SizeFromAxis(axis, param.x->dims());
-    std::vector<int64_t> shape{n, d};
-
-    lite::Tensor input_2d, out_2d;
-    input_2d.ShareDataWith(*param.x);
-    input_2d.Resize(lite::DDim(shape));
-    out_2d.ShareDataWith(*param.output);
-    out_2d.Resize(lite::DDim(shape));
-
-    paddle::operators::math::SoftmaxFunctor<platform::CPUDeviceContext, T,
-                                            true>()(
-        platform::CPUDeviceContext(), axis_dim, &input_2d.raw_tensor(),
-        &out_2d.raw_tensor());
-  }
-
-  virtual ~SoftmaxCompute() = default;
-};
-
-}  // namespace x86
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
+#include "paddle/fluid/lite/kernels/x86/softmax_compute.h"
 
 REGISTER_LITE_KERNEL(softmax, kX86, kFloat, kNCHW,
                      paddle::lite::kernels::x86::SoftmaxCompute<float>, def)
diff --git a/paddle/fluid/lite/kernels/x86/softmax_compute.h b/paddle/fluid/lite/kernels/x86/softmax_compute.h
new file mode 100644
index 00000000000..e51162095d9
--- /dev/null
+++ b/paddle/fluid/lite/kernels/x86/softmax_compute.h
@@ -0,0 +1,87 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/operators/math/softmax.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+static inline int CanonicalAxis(const int axis, const int rank) {
+  if (axis < 0) {
+    return axis + rank;
+  }
+  return axis;
+}
+
+static inline int SizeToAxis(const int axis, lite::DDim dims) {
+  int size = 1;
+  for (int i = 0; i < axis; i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+static inline int SizeFromAxis(const int axis, lite::DDim dims) {
+  int size = 1;
+  for (size_t i = axis; i < dims.size(); i++) {
+    size *= dims[i];
+  }
+  return size;
+}
+
+template <typename T>
+class SoftmaxCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::SoftmaxParam;
+
+  void Run() override {
+    auto& param = *param_.get_mutable<param_t>();
+    // auto& context = context_->As<X86Context>();
+    CHECK(param.output);
+    CHECK(param.x);
+    param.output->mutable_data<T>();
+    const int rank = param.x->dims().size();
+    const int axis = CanonicalAxis(param.axis, rank);
+    int axis_dim = param.x->dims()[axis];
+    const int n = SizeToAxis(axis, param.x->dims());
+    const int d = SizeFromAxis(axis, param.x->dims());
+    std::vector<int64_t> shape{n, d};
+
+    lite::Tensor input_2d, out_2d;
+    input_2d.ShareDataWith(*param.x);
+    input_2d.Resize(lite::DDim(shape));
+    out_2d.ShareDataWith(*param.output);
+    out_2d.Resize(lite::DDim(shape));
+
+    paddle::operators::math::SoftmaxFunctor<platform::CPUDeviceContext, T,
+                                            true>()(
+        platform::CPUDeviceContext(), axis_dim, &input_2d.raw_tensor(),
+        &out_2d.raw_tensor());
+  }
+
+  virtual ~SoftmaxCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/kernels/x86/softmax_compute_test.cc b/paddle/fluid/lite/kernels/x86/softmax_compute_test.cc
new file mode 100644
index 00000000000..daab7e82a53
--- /dev/null
+++ b/paddle/fluid/lite/kernels/x86/softmax_compute_test.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/fluid/lite/kernels/x86/softmax_compute.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(softmax_x86, retrive_op) { + auto softmax = + KernelRegistry::Global().Create( + "softmax"); + ASSERT_FALSE(softmax.empty()); + ASSERT_TRUE(softmax.front()); +} + +TEST(softmax_x86, init) { + SoftmaxCompute softmax; + ASSERT_EQ(softmax.precision(), PRECISION(kFloat)); + ASSERT_EQ(softmax.target(), TARGET(kX86)); +} + +TEST(softmax_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 3, 3}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 3, 3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + SoftmaxCompute softmax; + operators::SoftmaxParam param; + + param.x = &x; + param.output = &out; + + softmax.SetParam(param); + softmax.Run(); + + LOG(INFO) << "output: "; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(softmax, kX86, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/x86/uniform_random_compute.cc b/paddle/fluid/lite/kernels/x86/uniform_random_compute.cc new file mode 100644 index 00000000000..58e0d1693a0 --- /dev/null +++ b/paddle/fluid/lite/kernels/x86/uniform_random_compute.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/operators/jit/kernels.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class UniformRandomCompute + : public KernelLite { + public: + void Run() override { + auto &context = ctx_->As(); + auto ¶m = *param_.get_mutable(); + CHECK(context.x86_device_context()); + + auto *param_out = ¶m.Out->raw_tensor(); + + T *data = + param_out->mutable_data(context.x86_device_context()->GetPlace()); + + unsigned int seed = static_cast(param.seed); + std::minstd_rand engine; + if (seed == 0) { + seed = std::random_device()(); + } + engine.seed(seed); + std::uniform_real_distribution dist(static_cast(param.min), + static_cast(param.max)); + int64_t size = param_out->numel(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + } + + virtual ~UniformRandomCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +// float +REGISTER_LITE_KERNEL(uniform_random, kX86, kFloat, kNCHW, + paddle::lite::kernels::x86::UniformRandomCompute, + def) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/paddle/fluid/lite/model_parser/CMakeLists.txt b/paddle/fluid/lite/model_parser/CMakeLists.txt index 63fe21abdaf..85f69ec9b74 100644 --- a/paddle/fluid/lite/model_parser/CMakeLists.txt +++ b/paddle/fluid/lite/model_parser/CMakeLists.txt @@ -1,7 +1,7 @@ -#cc_library(runtime_lite SRCS runtime.cc) +#lite_cc_library(runtime_lite SRCS runtime.cc) #TODO(Superjomn) enable it again. -if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) +if(NOT LITE_ON_MOBILE) lite_cc_test(test_model_parser_lite SRCS model_parser_test.cc DEPS model_parser_lite framework_proto_lite ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model) @@ -10,18 +10,15 @@ if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) endif(WITH_TESTING) endif() +lite_cc_library(compatible_pb_lite SRCS compatible_pb.cc + DEPS op_desc_lite framework_proto_lite var_desc_lite cpp_op_desc_lite) -cc_library(compatible_pb_lite SRCS compatible_pb.cc DEPS op_desc_lite framework_proto_lite var_desc_lite) - -set(model_parser_deps variable_lite scope_lite ${tensor_lite} scope_lite - target_wrapper_host - compatible_pb_lite - memory_lite - ) -if (LITE_WITH_CUDA) - set(model_parser_deps ${model_parser_deps} target_wrapper_cuda) -endif() -cc_library(model_parser_lite SRCS model_parser.cc DEPS ${model_parser_deps}) +lite_cc_library(model_parser_lite SRCS model_parser.cc DEPS + variable_lite scope_lite ${tensor_lite} scope_lite + target_wrapper_host + compatible_pb_lite + memory_lite + CUDA_DEPS target_wrapper_cuda) lite_cc_test(test_op_desc_lite SRCS op_desc_test.cc DEPS cpp_op_desc_lite op_desc_lite compatible_pb_lite) diff --git a/paddle/fluid/lite/model_parser/compatible_pb.cc b/paddle/fluid/lite/model_parser/compatible_pb.cc index 23a09f8afbf..21461e3685d 100644 --- a/paddle/fluid/lite/model_parser/compatible_pb.cc +++ b/paddle/fluid/lite/model_parser/compatible_pb.cc @@ -72,6 +72,13 @@ void AttrsPbToCpp(const pb::OpDesc &pb_desc, cpp::OpDesc *cpp_desc) { cpp_desc->SetAttr>( name, pb_desc.GetAttr>(name)); break; + case AttrType::LONGS: + cpp_desc->SetAttr>( + name, pb_desc.GetAttr>(name)); + break; + case AttrType::LONG: + cpp_desc->SetAttr(name, pb_desc.GetAttr(name)); + break; default: LOG(FATAL) << "Unsupported attr type found " << static_cast(type); } 
diff --git a/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt b/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt
index 71073179991..b13dc442753 100644
--- a/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt
+++ b/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt
@@ -1 +1 @@
-cc_library(cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite)
+lite_cc_library(cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite)
diff --git a/paddle/fluid/lite/model_parser/cpp/op_desc.cc b/paddle/fluid/lite/model_parser/cpp/op_desc.cc
index b6b854d72af..010aae84e3e 100644
--- a/paddle/fluid/lite/model_parser/cpp/op_desc.cc
+++ b/paddle/fluid/lite/model_parser/cpp/op_desc.cc
@@ -28,12 +28,14 @@ namespace cpp {
 }
 
 SET_ATTR_IMPL(int32_t, INT);
+SET_ATTR_IMPL(int64_t, LONG);
 SET_ATTR_IMPL(float, FLOAT);
 SET_ATTR_IMPL(std::string, STRING);
 SET_ATTR_IMPL(bool, BOOLEAN);
 SET_ATTR_IMPL(std::vector<int>, INTS);
 SET_ATTR_IMPL(std::vector<float>, FLOATS);
 SET_ATTR_IMPL(std::vector<std::string>, STRINGS);
+SET_ATTR_IMPL(std::vector<int64_t>, LONGS);
 
 std::pair<OpDesc::attrs_t::const_iterator, OpDesc::attr_types_t::const_iterator>
 FindAttr(const cpp::OpDesc& desc, const std::string& name) {
@@ -55,6 +57,7 @@ FindAttr(const cpp::OpDesc& desc, const std::string& name) {
 }
 
 GET_IMPL_ONE(int32_t, INT);
+GET_IMPL_ONE(int64_t, LONG);
 GET_IMPL_ONE(float, FLOAT);
 GET_IMPL_ONE(std::string, STRING);
 GET_IMPL_ONE(bool, BOOLEAN);
diff --git a/paddle/fluid/lite/model_parser/cpp/op_desc.h b/paddle/fluid/lite/model_parser/cpp/op_desc.h
index b70c1692659..ac001f3e775 100644
--- a/paddle/fluid/lite/model_parser/cpp/op_desc.h
+++ b/paddle/fluid/lite/model_parser/cpp/op_desc.h
@@ -58,6 +58,12 @@ class OpDesc : public OpDescAPI {
   std::map<std::string, std::vector<std::string>>* mutable_outputs() {
     return &outputs_;
   }
+
+  bool HasInput(const std::string& param) const {
+    auto it = inputs_.find(param);
+    return it != inputs_.end();
+  }
+
   std::vector<std::string> Input(const std::string& param) const override {
     auto it = inputs_.find(param);
     CHECK(it != inputs_.end());
@@ -75,6 +81,11 @@ class OpDesc : public OpDescAPI {
     return res;
   }
 
+  bool HasOutput(const std::string& param) const {
+    auto it = outputs_.find(param);
+    return it != outputs_.end();
+  }
+
   std::vector<std::string> Output(const std::string& param) const override {
     auto it = outputs_.find(param);
     CHECK(it != outputs_.end());
diff --git a/paddle/fluid/lite/model_parser/desc_apis.h b/paddle/fluid/lite/model_parser/desc_apis.h
index d28f82a0e73..5981b873f7c 100644
--- a/paddle/fluid/lite/model_parser/desc_apis.h
+++ b/paddle/fluid/lite/model_parser/desc_apis.h
@@ -14,6 +14,7 @@
 #pragma once
 #include <map>
+#include <sstream>
 #include <string>
 #include <vector>
 
@@ -79,6 +80,27 @@ class OpDescAPI {
   /// Get an attribute.
template T GetAttr(const std::string& name) const; + + std::string Repr() const { + std::stringstream ss; + ss << Type(); + ss << "("; + for (auto& arg : InputArgumentNames()) { + ss << arg << ":"; + for (auto val : Input(arg)) { + ss << val << " "; + } + } + ss << ") -> ("; + for (auto& arg : OutputArgumentNames()) { + ss << arg << ":"; + for (auto val : Output(arg)) { + ss << val << " "; + } + } + ss << ")"; + return ss.str(); + } }; } // namespace lite diff --git a/paddle/fluid/lite/model_parser/model_parser.cc b/paddle/fluid/lite/model_parser/model_parser.cc index c829259ee42..d69fe4d7f7f 100644 --- a/paddle/fluid/lite/model_parser/model_parser.cc +++ b/paddle/fluid/lite/model_parser/model_parser.cc @@ -91,7 +91,7 @@ void LoadLoDTensor(std::istream &is, Variable *var) { auto *tensor = var->GetMutable(); uint32_t version{}; is.read(reinterpret_cast(&version), sizeof(version)); - LOG(INFO) << "model version " << version; + VLOG(3) << "model version " << version; // Load LoD information uint64_t lod_level{}; @@ -154,7 +154,7 @@ void LoadModel(const std::string &model_dir, Scope *scope, continue; std::string file_path = model_dir + "/" + var.name(); - LOG(INFO) << "reading weight " << var.name(); + VLOG(4) << "reading weight " << var.name(); std::ifstream file(file_path); switch (var.type().type()) { @@ -209,7 +209,7 @@ void TensorToStream(std::ostream &os, const lite::Tensor &tensor) { os.write(out.data(), size); } { // the 3rd field, tensor data - uint64_t size = tensor.data_size(); + uint64_t size = tensor.memory_size(); CHECK_LT(size, std::numeric_limits::max()) << "Index overflow when writing tensor"; diff --git a/paddle/fluid/lite/model_parser/pb/CMakeLists.txt b/paddle/fluid/lite/model_parser/pb/CMakeLists.txt index 22d88aeabf4..95e025c963f 100644 --- a/paddle/fluid/lite/model_parser/pb/CMakeLists.txt +++ b/paddle/fluid/lite/model_parser/pb/CMakeLists.txt @@ -1,2 +1,3 @@ -cc_library(var_desc_lite SRCS var_desc.cc DEPS framework_proto_lite) -cc_library(op_desc_lite SRCS op_desc.cc DEPS framework_proto_lite) +lite_cc_library(var_desc_lite SRCS var_desc.cc DEPS framework_proto_lite) +lite_cc_library(op_desc_lite SRCS op_desc.cc DEPS framework_proto_lite) + diff --git a/paddle/fluid/lite/model_parser/pb/op_desc.cc b/paddle/fluid/lite/model_parser/pb/op_desc.cc index 7f84510a3fa..a97df0f88ac 100644 --- a/paddle/fluid/lite/model_parser/pb/op_desc.cc +++ b/paddle/fluid/lite/model_parser/pb/op_desc.cc @@ -43,6 +43,7 @@ FindAttr(framework::proto::OpDesc *desc, const std::string &name) { it->set_##pb_f__(v); \ } SET_IMPL_ONE(int, INT, i); +SET_IMPL_ONE(int64_t, LONG, l); SET_IMPL_ONE(float, FLOAT, f); SET_IMPL_ONE(bool, BOOLEAN, b); @@ -116,11 +117,13 @@ GetFindAttr(const framework::proto::OpDesc &desc, const std::string &name) { } GET_ATTR_IMPL(int32_t, i); GET_ATTR_IMPL(float, f); +GET_ATTR_IMPL(int64_t, l); GET_ATTR_IMPL(bool, b); GET_ATTRS_IMPL(std::vector, ints); GET_ATTRS_IMPL(std::vector, floats); GET_ATTRS_IMPL(std::vector, strings); GET_ATTR_IMPL(std::string, s); +GET_ATTRS_IMPL(std::vector, longs); } // namespace pb } // namespace lite diff --git a/paddle/fluid/lite/model_parser/pb/op_desc.h b/paddle/fluid/lite/model_parser/pb/op_desc.h index e8772e162a5..b64ba5452d6 100644 --- a/paddle/fluid/lite/model_parser/pb/op_desc.h +++ b/paddle/fluid/lite/model_parser/pb/op_desc.h @@ -141,6 +141,8 @@ class OpDesc : public OpDescAPI { template T GetAttr(const std::string &name) const; + std::string DebugString() const { return desc_.DebugString(); } + private: std::vector GetArguments( const 
      google::protobuf::RepeatedPtrField<framework::proto::OpDesc_Var> &xs)
      const;
diff --git a/paddle/fluid/lite/opencl/CMakeLists.txt b/paddle/fluid/lite/opencl/CMakeLists.txt
new file mode 100644
index 00000000000..bff8050ce6f
--- /dev/null
+++ b/paddle/fluid/lite/opencl/CMakeLists.txt
@@ -0,0 +1,16 @@
+if (NOT LITE_WITH_OPENCL)
+    return()
+endif()
+
+lite_cc_library(cl_wrapper SRCS cl_wrapper.cxx)
+lite_cc_library(cl_tool SRCS cl_tool.cc DEPS cl_wrapper)
+lite_cc_library(cl_engine SRCS cl_engine.cc DEPS cl_tool)
+lite_cc_library(cl_context SRCS cl_context.cc DEPS cl_engine)
+lite_cc_library(cl_helper SRCS cl_helper.cc DEPS cl_context)
+lite_cc_library(cl_image_converter SRCS cl_image_converter.cc DEPS lite_tensor)
+lite_cc_library(cl_image SRCS cl_image.cc DEPS lite_tensor cl_image_converter cl_engine)
+lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_helper cl_image)
+lite_cc_test(test_cl_runtime SRCS cl_test.cc DEPS cl_helper cl_image cl_caller cl_wrapper
+    ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/opencl)
+
+add_dependencies(cl_wrapper opencl_clhpp)
diff --git a/paddle/fluid/lite/opencl/cl_caller.cc b/paddle/fluid/lite/opencl/cl_caller.cc
new file mode 100644
index 00000000000..50394df3883
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_caller.cc
@@ -0,0 +1,156 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/lite/opencl/cl_caller.h"
+#include <string>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl_engine.h"
+#include "paddle/fluid/lite/opencl/cl_helper.h"
+#include "paddle/fluid/lite/opencl/cl_image.h"
+#include "paddle/fluid/lite/opencl/cl_tool.h"
+#include "paddle/fluid/lite/utils/string.h"
+
+namespace paddle {
+namespace lite {
+static void CopyImageData(CLHelper* helper, const CLImage& cl_image,
+                          float* out) {
+  int width = cl_image.image_dims()[0];
+  int height = cl_image.image_dims()[1];
+
+  float* image_data = new float[height * width * 4];
+  cl::Image* image = cl_image.cl_image();
+  const std::array<size_t, 3> origin{0, 0, 0};
+  const std::array<size_t, 3> region{static_cast<size_t>(width),
+                                     static_cast<size_t>(height), 1};
+  cl_int err = helper->OpenCLCommandQueue().enqueueReadImage(
+      *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr);
+  CL_CHECK_ERRORS(err);
+
+  auto* converter = cl_image.image_converter();
+  converter->ImageToNCHW(image_data, out, cl_image.image_dims(),
+                         cl_image.tensor_dims());
+
+  delete[] image_data;
+}
+
+bool InitOpenCLEngine(std::string cl_path) {
+  auto* engine = CLEngine::Global();
+  engine->set_cl_path(cl_path);
+  return engine->IsInitSuccess();
+}
+
+void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
+                     const float* bias, const DDim& bias_dim, float* out,
+                     const DDim& out_dim) {
+  if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) {
+    LOG(FATAL) << "Error: bias dims are invalid";
+    return;
+  }
+  auto kernel = bias_dim.size() == 1
helper->GetKernel("channel_add") + : helper->GetKernel("elementwise_add"); + CLImage in_image; + in_image.set_tensor_data(in, in_dim); + in_image.InitNormalCLImage(helper->OpenCLContext()); + VLOG(3) << " --- Inpu image: " << in_image << " --- "; + CLImage bias_image; + bias_image.set_tensor_data(bias, bias_dim); + bias_image.InitCLImage(helper->OpenCLContext()); + VLOG(3) << " --- Bias image: " << bias_image << " --- "; + CLImage out_image; + out_image.InitEmptyImage(helper->OpenCLContext(), out_dim); + cl_int status; + status = kernel.setArg(0, *in_image.cl_image()); + CL_CHECK_ERRORS(status); + status = kernel.setArg(1, *bias_image.cl_image()); + CL_CHECK_ERRORS(status); + status = kernel.setArg(2, *out_image.cl_image()); + CL_CHECK_ERRORS(status); + + if (bias_dim.size() == 1) { + int tensor_w = in_dim[3]; + status = kernel.setArg(3, tensor_w); + CL_CHECK_ERRORS(status); + } + size_t width = in_image.ImageWidth(); + size_t height = in_image.ImageHeight(); + auto global_work_size = cl::NDRange{width, height}; + status = helper->OpenCLCommandQueue().enqueueNDRangeKernel( + kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); + CL_CHECK_ERRORS(status); + + status = helper->OpenCLCommandQueue().finish(); + CL_CHECK_ERRORS(status); + VLOG(3) << " --- Out image: " << out_image << " --- "; + CopyImageData(helper, out_image, out); +} + +void pool(CLHelper* helper, const std::string pooling_type, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int ksize_h, const int ksize_w, const float* in, + const DDim& in_dim, float* out, const DDim& out_dim) { + auto kernel = + helper->GetKernel(string_format("pool_%s", pooling_type.c_str())); + CLImage in_image; + in_image.set_tensor_data(in, in_dim); + in_image.InitNormalCLImage(helper->OpenCLContext()); + VLOG(3) << " --- Inpu image: " << in_image << " --- "; + CLImage out_image; + out_image.InitEmptyImage(helper->OpenCLContext(), out_dim); + auto global_work_size = helper->DefaultWorkSize(out_image); + auto* in_converter = + dynamic_cast(in_image.image_converter()); + auto* out_converter = + dynamic_cast(out_image.image_converter()); + const int in_height = in_converter->HeightOfOneBlock(); + const int in_width = in_converter->WidthOfOneBlock(); + const int out_height = out_converter->HeightOfOneBlock(); + const int out_width = out_converter->WidthOfOneBlock(); + cl_int status; + status = kernel.setArg(0, in_height); + CL_CHECK_ERRORS(status); + status = kernel.setArg(1, in_width); + CL_CHECK_ERRORS(status); + status = kernel.setArg(2, out_height); + CL_CHECK_ERRORS(status); + status = kernel.setArg(3, out_width); + CL_CHECK_ERRORS(status); + status = kernel.setArg(4, pad_h); + CL_CHECK_ERRORS(status); + status = kernel.setArg(5, pad_w); + CL_CHECK_ERRORS(status); + status = kernel.setArg(6, stride_h); + CL_CHECK_ERRORS(status); + status = kernel.setArg(7, stride_w); + CL_CHECK_ERRORS(status); + status = kernel.setArg(8, ksize_h); + CL_CHECK_ERRORS(status); + status = kernel.setArg(9, ksize_w); + CL_CHECK_ERRORS(status); + status = kernel.setArg(10, *in_image.cl_image()); + CL_CHECK_ERRORS(status); + status = kernel.setArg(11, *out_image.cl_image()); + CL_CHECK_ERRORS(status); + + status = helper->OpenCLCommandQueue().enqueueNDRangeKernel( + kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); + CL_CHECK_ERRORS(status); + + status = helper->OpenCLCommandQueue().finish(); + CL_CHECK_ERRORS(status); + VLOG(3) << " --- Out image: " << out_image << " --- "; + 
+  CopyImageData(helper, out_image, out);
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_caller.h b/paddle/fluid/lite/opencl/cl_caller.h
new file mode 100644
index 00000000000..0d53574e17a
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_caller.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/opencl/cl_helper.h"
+
+namespace paddle {
+namespace lite {
+
+bool InitOpenCLEngine(std::string cl_path);
+
+/// An elementwise_add method to embed OpenCL logic inside, it is used as a
+/// black box so that the framework can remain simple.
+/// NOTE Currently, these methods are quite expensive, we will optimize them
+/// later.
+void elementwise_add(CLHelper* helper, const float* in, const DDim& in_dim,
+                     const float* bias, const DDim& bias_dim, float* out,
+                     const DDim& out_dim);
+
+void pool(CLHelper* helper, const std::string pooling_type, const int pad_h,
+          const int pad_w, const int stride_h, const int stride_w,
+          const int ksize_h, const int ksize_w, const float* in,
+          const DDim& in_dim, float* out, const DDim& out_dim);
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_context.cc b/paddle/fluid/lite/opencl/cl_context.cc
new file mode 100644
index 00000000000..d8265d17d8e
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_context.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/lite/opencl/cl_context.h"
+#include "paddle/fluid/lite/opencl/cl_engine.h"
+#include "paddle/fluid/lite/opencl/cl_tool.h"
+
+namespace paddle {
+namespace lite {
+
+cl::CommandQueue &CLContext::GetCommandQueue() {
+  return CLEngine::Global()->command_queue();
+}
+
+cl::Context &CLContext::GetContext() { return CLEngine::Global()->context(); }
+
+cl::Program &CLContext::GetProgram(const std::string &file_name,
+                                   const std::string &options) {
+  std::string program_key = file_name;
+  if (!options.empty()) {
+    program_key += options;
+  }
+  auto it = programs_.find(program_key);
+  if (it != programs_.end()) {
+    VLOG(3) << " --- program -> " << program_key << " has been built --- ";
+    return *(it->second);
+  }
+
+  auto program = CLEngine::Global()->CreateProgram(
+      GetContext(), CLEngine::Global()->cl_path() + "/cl_kernel/" + file_name);
+
+  VLOG(3) << " --- begin build program -> " << program_key << " --- ";
+  CLEngine::Global()->BuildProgram(program.get(), options);
+  VLOG(3) << " --- end build program -> " << program_key << " --- ";
+
+  programs_[program_key] = std::move(program);
+
+  return *(programs_[program_key]);
+}
+
+std::unique_ptr<cl::Kernel> CLContext::GetKernel(const std::string &kernel_name,
+                                                 const std::string &file_name,
+                                                 const std::string &options) {
+  cl_int status{CL_SUCCESS};
+  VLOG(3) << " --- to get program " << file_name << " --- ";
+  auto program = GetProgram(file_name, options);
+  VLOG(3) << " --- end get program --- ";
+  VLOG(3) << " --- to create kernel: " << kernel_name << " --- ";
+  std::unique_ptr<cl::Kernel> kernel(
+      new cl::Kernel(program, kernel_name.c_str(), &status));
+  CL_CHECK_ERRORS(status);
+  VLOG(3) << " --- end create kernel --- ";
+  return std::move(kernel);
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_context.h b/paddle/fluid/lite/opencl/cl_context.h
new file mode 100644
index 00000000000..5c28b720a02
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_context.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/lite/opencl/cl_include.h"
+
+namespace paddle {
+namespace lite {
+
+class CLContext {
+ public:
+  cl::CommandQueue &GetCommandQueue();
+
+  cl::Context &GetContext();
+
+  cl::Program &GetProgram(const std::string &file_name,
+                          const std::string &options);
+
+  std::unique_ptr<cl::Kernel> GetKernel(const std::string &kernel_name,
+                                        const std::string &file_name,
+                                        const std::string &options);
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
+};
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_engine.cc b/paddle/fluid/lite/opencl/cl_engine.cc
new file mode 100644
index 00000000000..bcf39992c2a
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_engine.cc
@@ -0,0 +1,170 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/lite/opencl/cl_engine.h"
+#include <fstream>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+
+CLEngine* CLEngine::Global() {
+  static CLEngine cl_engine_;
+  cl_engine_.Init();
+  return &cl_engine_;
+}
+
+CLEngine::~CLEngine() {
+  if (command_queue_ != nullptr) {
+    command_queue_->finish();
+  }
+  // For controlling the destruction order:
+  command_queue_.reset();
+  context_.reset();
+  device_.reset();
+  platform_.reset();
+}
+
+bool CLEngine::Init() {
+  if (initialized_) {
+    return true;
+  }
+  bool is_platform_init = InitializePlatform();
+  bool is_device_init = InitializeDevice();
+  is_init_success_ = is_platform_init && is_device_init;
+  initialized_ = true;
+  return initialized_;
+}
+
+cl::Platform& CLEngine::platform() {
+  CHECK(platform_ != nullptr) << "platform_ is not initialized!";
+  return *platform_;
+}
+
+cl::Context& CLEngine::context() {
+  if (context_ == nullptr) {
+    context_ = CreateContext();
+  }
+  return *context_;
+}
+
+cl::Device& CLEngine::device() {
+  CHECK(device_ != nullptr) << "device_ is not initialized!";
+  return *device_;
+}
+
+cl::CommandQueue& CLEngine::command_queue() {
+  if (command_queue_ == nullptr) {
+    command_queue_ = CreateCommandQueue(context());
+  }
+  return *command_queue_;
+}
+
+std::unique_ptr<cl::Program> CLEngine::CreateProgram(const cl::Context& context,
+                                                     std::string file_name) {
+  std::ifstream file{file_name, std::ios::binary | std::ios::ate};
+  CHECK(file.is_open()) << "Can't open file from " << file_name;
+  auto size = file.tellg();
+  CHECK(size > 0) << "size is too small.";
+  std::string content(size, '\0');
+  file.seekg(0);
+  file.read(&content[0], size);
+  cl::Program::Sources sources;
+  sources.push_back(content);
+  auto prog = std::unique_ptr<cl::Program>(
+      new cl::Program(context, sources, &status_));
+  LOG(INFO) << "OpenCL kernel file name: " << file_name;
+  LOG(INFO) << "Program source size: " << content.size();
+  CL_CHECK_ERRORS(status_);
+  return std::move(prog);
+}
+
+std::unique_ptr<cl::UserEvent> CLEngine::CreateEvent(
+    const cl::Context& context) {
+  auto event =
+      std::unique_ptr<cl::UserEvent>(new cl::UserEvent(context, &status_));
+  CL_CHECK_ERRORS(status_);
+  return std::move(event);
+}
+
+bool CLEngine::BuildProgram(cl::Program* program, const std::string& options) {
+  std::string build_option = options + " -cl-fast-relaxed-math -I " +
+                             CLEngine::Global()->cl_path() + "/cl_kernel";
+  status_ = program->build({*device_}, build_option.c_str());
+  CL_CHECK_ERRORS(status_);
+
+  if (status_ != CL_SUCCESS) {
+    if (program->getBuildInfo<CL_PROGRAM_BUILD_STATUS>(device()) ==
+        CL_BUILD_ERROR) {
+      std::string log = program->getBuildInfo<CL_PROGRAM_BUILD_LOG>(device());
+      LOG(INFO) << "Program build error: " << log;
+    }
+    return false;
+  }
+
+  return true;
+}
+
+bool CLEngine::InitializePlatform() {
+  std::vector<cl::Platform> all_platforms;
+  status_ = cl::Platform::get(&all_platforms);
+  CL_CHECK_ERRORS(status_);
+  if (all_platforms.empty()) {
+    LOG(ERROR) << "No OpenCL platform found!";
+    return false;
+  }
+  platform_ = std::make_shared<cl::Platform>();
+  *platform_ = all_platforms[0];
+  return true;
+}
+
+bool CLEngine::InitializeDevice() {
+  std::vector<cl::Device> all_devices;
+  status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
+  CL_CHECK_ERRORS(status_);
+  if (all_devices.empty()) {
+    LOG(ERROR) << "No OpenCL GPU device found!";
+    return false;
+  }
+  device_ = std::make_shared<cl::Device>();
+  *device_ = all_devices[0];
+
+  auto device_name = device_->getInfo<CL_DEVICE_NAME>();
+  LOG(INFO) << "Using device: " << device_name;
+  auto image_support = device_->getInfo<CL_DEVICE_IMAGE_SUPPORT>();
+  if (image_support) {
+    LOG(INFO) << "The chosen device supports image processing.";
+  } else {
+    LOG(ERROR) << "The chosen device doesn't support image processing!";
+    return false;
+  }
+  auto ext_data = device_->getInfo<CL_DEVICE_EXTENSIONS>();
+  LOG(INFO) << "The extensions supported by this device: " << ext_data;
+  if (ext_data.find("cl_khr_fp16") != std::string::npos) {
+    LOG(INFO) << "The chosen device supports the half data type.";
+  } else {
+    LOG(INFO) << "The chosen device doesn't support the half data type!";
+  }
+  auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+  LOG(INFO) << "The chosen device has " << max_units << " compute units.";
+  auto local_mem = device_->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
+  LOG(INFO) << "The local memory size of the chosen device is "
+            << static_cast<float>(local_mem) / 1024 << " KB.";
+  return true;
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_engine.h b/paddle/fluid/lite/opencl/cl_engine.h
new file mode 100644
index 00000000000..cf0939d3709
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_engine.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/lite/opencl/cl_include.h"
+#include "paddle/fluid/lite/opencl/cl_tool.h"
+
+namespace paddle {
+namespace lite {
+
+class CLEngine {
+ public:
+  static CLEngine* Global();
+
+  bool Init();
+
+  cl::Platform& platform();
+
+  cl::Context& context();
+
+  cl::Device& device();
+
+  cl::CommandQueue& command_queue();
+
+  std::unique_ptr<cl::Program> CreateProgram(const cl::Context& context,
+                                             std::string file_name);
+
+  std::unique_ptr<cl::UserEvent> CreateEvent(const cl::Context& context);
+
+  bool BuildProgram(cl::Program* program, const std::string& options = "");
+
+  bool IsInitSuccess() { return is_init_success_; }
+
+  std::string cl_path() { return cl_path_; }
+
+  void set_cl_path(std::string cl_path) { cl_path_ = cl_path; }
+
+ private:
+  CLEngine() = default;
+
+  ~CLEngine();
+
+  bool InitializePlatform();
+
+  bool InitializeDevice();
+
+  std::shared_ptr<cl::Context> CreateContext() {
+    auto context = std::make_shared<cl::Context>(
+        std::vector<cl::Device>{device()}, nullptr, nullptr, nullptr,
+        &status_);
+    CL_CHECK_ERRORS(status_);
+    return context;
+  }
+
+  std::shared_ptr<cl::CommandQueue> CreateCommandQueue(
+      const cl::Context& context) {
+    auto queue =
+        std::make_shared<cl::CommandQueue>(context, device(), 0, &status_);
+    CL_CHECK_ERRORS(status_);
+    return queue;
+  }
+
+  std::string cl_path_;
+
+  std::shared_ptr<cl::Platform> platform_{nullptr};
+
+  std::shared_ptr<cl::Context> context_{nullptr};
+
+  std::shared_ptr<cl::Device> device_{nullptr};
+
+  std::shared_ptr<cl::CommandQueue> command_queue_{nullptr};
+
+  cl_int status_{CL_SUCCESS};
+
+  bool initialized_{false};
+
+  bool is_init_success_{false};
+};
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/opencl/cl_helper.cc b/paddle/fluid/lite/opencl/cl_helper.cc
new file mode 100644
index 00000000000..7e8ffa62341
--- /dev/null
+++ b/paddle/fluid/lite/opencl/cl_helper.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/lite/opencl/cl_helper.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { + +void CLHelper::set_context(CLContext *context) { context_ = context; } + +void CLHelper::AddKernel(const std::string &kernel_name, + const std::string &file_name, + const std::string &options) { + CHECK(context_ != nullptr) << "Please use set_context first!"; + VLOG(3) << " --- begin to add kernel ---"; + auto kernel = context_->GetKernel(kernel_name, file_name, options); + kernels_.emplace_back(std::move(kernel)); + kernel_offset_[kernel_name] = kernels_.size() - 1; + VLOG(3) << " --- end to add kernel --- "; +} + +cl::Kernel &CLHelper::GetKernel(const int index) { + VLOG(3) << " --- kernel count: " << kernels_.size() << " --- "; + CHECK(static_cast(index) < kernels_.size()) + << "The index must be less than the size of kernels."; + CHECK(kernels_[index] != nullptr) + << "The target kernel pointer cannot be null."; + return *(kernels_[index]); +} + +cl::CommandQueue &CLHelper::OpenCLCommandQueue() { + CHECK(context_ != nullptr) << "Please use set_context first!"; + return context_->GetCommandQueue(); +} + +cl::Context &CLHelper::OpenCLContext() { + CHECK(context_ != nullptr) << "Please use set_context first!"; + return context_->GetContext(); +} + +cl::NDRange CLHelper::DefaultWorkSize(const CLImage &image) { + // n c h w + auto image_dim = image.tensor_dims(); + if (image_dim.size() == 4) { + auto n = image_dim[0]; + auto h = image_dim[2]; + auto w = image_dim[3]; + auto image_width = image.ImageWidth(); + auto work_size_0 = image_width / w; + auto work_size_1 = w; + auto work_size_2 = n * h; + return cl::NDRange{static_cast(work_size_0), + static_cast(work_size_1), + static_cast(work_size_2)}; + } else if (image_dim.size() == 2) { + return cl::NDRange{static_cast(1), + static_cast(image.ImageWidth()), + static_cast(image.ImageHeight())}; + } else if (image_dim.size() == 1) { + return cl::NDRange{static_cast(1), + static_cast(image.ImageWidth()), + static_cast(1)}; + } else if (image_dim.size() == 3) { + auto c = image_dim[0]; + auto h = image_dim[1]; + auto w = image_dim[2]; + return cl::NDRange{static_cast((c + 3) / 4), static_cast(w), + static_cast(h)}; + } else { + LOG(FATAL) << "Not support this dimension, need to be implemented!"; + return cl::NDRange{}; + } +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_helper.h b/paddle/fluid/lite/opencl/cl_helper.h new file mode 100644 index 00000000000..1732196ccf3 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_helper.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/lite/opencl/cl_context.h" +#include "paddle/fluid/lite/opencl/cl_image.h" +#include "paddle/fluid/lite/opencl/cl_include.h" + +namespace paddle { +namespace lite { + +class CLHelper { + public: + CLHelper() = default; + + explicit CLHelper(CLContext *context) : context_(context) {} + + void set_context(CLContext *context); + + void AddKernel(const std::string &kernel_name, const std::string &file_name, + const std::string &options = ""); + + cl::Kernel &GetKernel(const int index); + cl::Kernel &GetKernel(const std::string &name) { + auto it = kernel_offset_.find(name); + CHECK(it != kernel_offset_.end()); + return GetKernel(it->second); + } + + cl::CommandQueue &OpenCLCommandQueue(); + + cl::Context &OpenCLContext(); + + cl::NDRange DefaultWorkSize(const CLImage &image); + + private: + CLContext *context_{nullptr}; + std::map kernel_offset_; + std::vector> kernels_; +}; + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_image.cc b/paddle/fluid/lite/opencl/cl_image.cc new file mode 100644 index 00000000000..40da48ba48a --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_image.cc @@ -0,0 +1,163 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/lite/opencl/cl_image.h" +#include +#include +#include "paddle/fluid/lite/opencl/cl_engine.h" +#include "paddle/fluid/lite/opencl/cl_tool.h" + +namespace paddle { +namespace lite { + +std::ostream& operator<<(std::ostream& os, const CLImage& cl_image) { + int width = cl_image.image_dims_[0]; + int height = cl_image.image_dims_[1]; + + float* image_data = new float[height * width * 4]; + cl::Image* image = cl_image.cl_image(); + const std::array origin{0, 0, 0}; + const std::array region{static_cast(width), + static_cast(height), 1}; + cl_int err = CLEngine::Global()->command_queue().enqueueReadImage( + *image, CL_TRUE, origin, region, 0, 0, image_data, nullptr, nullptr); + CL_CHECK_ERRORS(err); + + float* tensor_data = new float[cl_image.numel()]; + auto* converter = cl_image.image_converter(); + converter->ImageToNCHW(image_data, tensor_data, cl_image.image_dims_, + cl_image.tensor_dims_); + int stride = cl_image.numel() / 20; + stride = stride > 0 ? 
stride : 1; + + os << " dims: " << cl_image.tensor_dims_ << "\n"; + for (int i = 0; i < cl_image.numel(); i += stride) { + os << tensor_data[i] << " "; + } + + delete[] tensor_data; + delete[] image_data; + + return os; +} + +void CLImage::set_tensor_data(const float* tensor_data, const DDim& dim) { +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + auto numel = dim.product(); +#else + auto numel = dim.production(); +#endif + tensor_data_.reset(new float[numel]); + memcpy(tensor_data_.get(), tensor_data, numel * sizeof(float)); + tensor_dims_ = dim; +} + +void CLImage::InitCLImage(const cl::Context& context) { + CHECK(tensor_data_ != nullptr) << " Please call " + "set_tensohelper->DefaultWorkSize(out_" + "image)r_data first!"; + image_converter_.reset(new CLImageConverterFolder); + InitCLImage(context, image_converter_.get()); +} + +void CLImage::InitNormalCLImage(const cl::Context& context) { + CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; + image_converter_.reset(new CLImageConverterNormal); + InitCLImage(context, image_converter_.get()); +} + +void CLImage::InitNImage(const cl::Context& context) { + CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; + CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4."; + image_converter_.reset(new CLImageConverterNWBlock); + InitCLImage(context, image_converter_.get()); +} + +void CLImage::InitDWImage(const cl::Context& context) { + CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; + CHECK(tensor_dims_.size() == 4) << " Tensor dim is not 4."; + image_converter_.reset(new CLImageConverterDWBlock); + InitCLImage(context, image_converter_.get()); +} + +void CLImage::InitEmptyImage(const cl::Context& context, const DDim& dim) { + CHECK(tensor_data_ == nullptr) + << " Empty image tensor data shouldn't have value"; + + tensor_dims_ = dim; + image_converter_.reset(new CLImageConverterNormal); + + VLOG(3) << " to get image dims "; + image_dims_ = image_converter_->InitImageDimInfoWith(tensor_dims_); + VLOG(3) << " end get image dims " << image_dims_; + + InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); + + cl_event_ = CLEngine::Global()->CreateEvent(context); + initialized_ = true; + VLOG(3) << " end init cl image "; +} + +void CLImage::InitEmptyWithImageDim(const cl::Context& context, + const DDim& image_dims) { + VLOG(3) << " to get image dims "; + image_dims_ = image_dims; + VLOG(3) << " end get image dims " << image_dims_; + + InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); + + cl_event_ = CLEngine::Global()->CreateEvent(context); + initialized_ = true; + VLOG(3) << " end init cl image"; +} + +void CLImage::InitCLImage(const cl::Context& context, + CLImageConverterBase* converter) { + CHECK(tensor_data_ != nullptr) << " Please call set_tensor_data first!"; + + VLOG(3) << " begin init cl image "; + image_dims_ = converter->InitImageDimInfoWith(tensor_dims_); + +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + float* image_data = new float[image_dims_.product() * 4]; +#else + float* image_data = new float[image_dims_.production() * 4]; +#endif + + VLOG(3) << " convert to image "; + converter->NCHWToImage(tensor_data_.get(), image_data, tensor_dims_); + VLOG(3) << " end convert to image "; + + InitCLImage(context, image_dims_[0], image_dims_[1], image_data); + + delete[] image_data; + tensor_data_ = nullptr; + cl_event_ = CLEngine::Global()->CreateEvent(context); + initialized_ = true; + VLOG(3) << " end init cl image "; +} + +void CLImage::InitCLImage(const 
cl::Context& context, int width, int height, + void* data) { + cl::ImageFormat img_format(CL_RGBA, CL_FLOAT); + cl_int err; + cl_image_.reset(new cl::Image2D( + context, CL_MEM_READ_WRITE | (data ? CL_MEM_COPY_HOST_PTR : 0), + img_format, width, height, 0, data, &err)); + CL_CHECK_ERRORS(err); + CHECK(err == CL_SUCCESS) << " Create image 2d error."; +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_image.h b/paddle/fluid/lite/opencl/cl_image.h new file mode 100644 index 00000000000..500646658ad --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_image.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/opencl/cl_image_converter.h" +#include "paddle/fluid/lite/opencl/cl_include.h" + +namespace paddle { +namespace lite { + +class CLImage { + // For debug + friend std::ostream& operator<<(std::ostream& os, const CLImage& image); + + public: + CLImage() = default; + /* + * Will not hold input tensor data, memcpy in this method. + * */ + void set_tensor_data(const float* tensor_data, const DDim& dim); + + bool IsInit() { return initialized_; } + /* + * Need call set_tensor_data first. + * Folder when one dim or two dim. + * */ + void InitCLImage(const cl::Context& context); + + void InitNormalCLImage(const cl::Context& context); + + void InitNImage(const cl::Context& context); + + void InitDWImage(const cl::Context& context); + + void InitEmptyImage(const cl::Context& context, const DDim& dim); + + void InitEmptyWithImageDim(const cl::Context& context, + const DDim& image_dims); + + cl::Image* cl_image() const { return cl_image_.get(); } + + const DDim& image_dims() const { return image_dims_; } + + inline size_t ImageWidth() const { return image_dims_[0]; } + + inline size_t ImageHeight() const { return image_dims_[1]; } + + const DDim& tensor_dims() const { return tensor_dims_; } + + /*with_da + * Resize original tensor dim. 
+ * */ + inline CLImage& Resize(const DDim& dims) { + tensor_dims_ = dims; + return *this; + } + + template + T* data() const { + CHECK(!initialized_) << "CL image has initialized, tensor data has been " + "deleted, can't use tensor data!"; + return reinterpret_cast(tensor_data_); + } + + /* + * Numel of tensor dim + * */ + inline int64_t numel() const { +#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + return tensor_dims_.product(); +#else + return tensor_dims_.production(); +#endif + } + + /* + * Original tensor dim + * */ + + cl::UserEvent& cl_event() const { return *cl_event_; } + + CLImageConverterBase* image_converter() const { + return image_converter_.get(); + } + + private: + void InitCLImage(const cl::Context& context, CLImageConverterBase* converter); + + void InitCLImage(const cl::Context& context, int width, int height, + void* data); + + bool initialized_ = false; + std::unique_ptr cl_image_{nullptr}; + std::unique_ptr cl_event_{nullptr}; + DDim tensor_dims_; + DDim image_dims_; + std::unique_ptr tensor_data_{nullptr}; + std::unique_ptr image_converter_{nullptr}; +}; + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_image_converter.cc b/paddle/fluid/lite/opencl/cl_image_converter.cc new file mode 100644 index 00000000000..03b3edd7a1c --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_image_converter.cc @@ -0,0 +1,449 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/lite/opencl/cl_image_converter.h" +#include +#include + +namespace paddle { +namespace lite { + +DDim CLImageConverterDefault::InitImageDimInfoWith(const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + return DDim( + std::vector({static_cast(width), + static_cast(height)})); +} + +void CLImageConverterDefault::NCHWToImage(float *nchw, float *image, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + DDim in_image_dim = InitImageDimInfoWith(tensor_dim); + + VLOG(3) << " tensor dim: " << tensor_dim; + VLOG(3) << " image dim: " << in_image_dim; + + size_t width = in_image_dim[0]; + size_t w_block = width / W; + + float *p = nchw; + size_t i0 = 0; + for (size_t n = 0; n < N; n++) { + for (size_t c = 0; c < w_block * 4; c++) { + size_t i1 = i0 + (c / 4) * W; + for (size_t h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (size_t w = 0; w < W; w++) { + if (c < C) { + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + image[i2] = *p; + i2 += 4; + p++; + } else { + image[i2] = 0.0; + i2 += 4; + } + } + i1 += width; + } + } + i0 += width * H; + } +} + +void CLImageConverterDefault::ImageToNCHW(float *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + + size_t width = image_dim[0]; + float *p = tensor; + + size_t i0 = 0; + for (size_t n = 0; n < N; n++) { + for (size_t c = 0; c < C; c++) { + size_t i1 = i0 + (c / 4) * W; + for (size_t h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (size_t w = 0; w < W; w++) { + *p = image[i2]; + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } +} + +DDim CLImageConverterFolder::InitImageDimInfoWith(const DDim &tensor_dim) { + if (tensor_dim.size() <= 2) { + size_t tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + size_t width = (tdim[1] + 3) / 4; + size_t height = tdim[0]; + + width_of_one_block_ = width; + height_of_one_block_ = height; + c_block_ = 1; + + return DDim( + std::vector({static_cast(width), + static_cast(height)})); + + } else { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + width_of_one_block_ = W; + height_of_one_block_ = H; + c_block_ = width / W; + + return DDim( + std::vector({static_cast(width), + static_cast(height)})); + } +} + +void CLImageConverterFolder::NCHWToImage(float *tensor, float *image, + const DDim &tensor_dim) { + CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) + << " Tensor dim is not support!"; + + if 
(tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + default_converter.NCHWToImage(tensor, image, tensor_dim); + + } else { + size_t tdim[2] = {1, 1}; + if (tensor_dim.size() == 1) { + tdim[1] = tensor_dim[0]; + } else { + tdim[0] = tensor_dim[0]; + tdim[1] = tensor_dim[1]; + } + + DDim image_dim = InitImageDimInfoWith(tensor_dim); + size_t width = image_dim[0]; + + for (size_t h = 0; h < tdim[0]; h++) { + for (size_t w = 0; w < tdim[1]; w++) { + image[(h * width + w / 4) * 4 + (w % 4)] = tensor[h * tdim[1] + w]; + } + } + } +} + +void CLImageConverterFolder::ImageToNCHW(float *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + if (tensor_dim.size() > 2) { + CLImageConverterDefault default_converter; + default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); + + } else { + size_t width = image_dim[0]; + size_t H = 1, W = 1; + + if (tensor_dim.size() == 2) { + H = tensor_dim[0]; + W = tensor_dim[1]; + } else if (tensor_dim.size() == 1) { + W = tensor_dim[0]; + } + + float *p = tensor; + + for (size_t h = 0; h < H; h++) { + for (size_t w = 0; w < W; w++) { + p[h * W + w] = image[(h * width + w / 4) * 4 + (w % 4)]; + } + } + } +} + +DDim CLImageConverterNWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + size_t N, C, H, W; + N = tensor_dim[0]; + C = tensor_dim[1]; + H = tensor_dim[2]; + W = tensor_dim[3]; + size_t width = W * ((N + 3) / 4); + size_t height = C * H; + return DDim( + std::vector({static_cast(width), + static_cast(height)})); +} + +void CLImageConverterNWBlock::NCHWToImage(float *tensor, float *image, + const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + auto image_dim = InitImageDimInfoWith(tensor_dim); + float *p = tensor; + size_t N = tensor_dim[0]; + size_t C = tensor_dim[1]; + size_t H = tensor_dim[2]; + size_t W = tensor_dim[3]; + size_t width = image_dim[0]; + size_t height = image_dim[1]; + size_t block = image_dim[0] / tensor_dim[3]; + + for (size_t n = 0; n < block * 4; n++) { + for (size_t c = 0; c < C; c++) { + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; ++w) { + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + + w * 4 + n % 4; + if (n < N) { + image[index] = *p; + p++; + } else { + image[index] = 0.0; + } + if (index >= (width * height * 4)) { + LOG(INFO) << " index out of range "; + } + } + } + } + } + VLOG(3) << " init done"; +} + +void CLImageConverterNWBlock::ImageToNCHW(float *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + float *p = tensor; + size_t N = tensor_dim[0]; + size_t C = tensor_dim[1]; + size_t H = tensor_dim[2]; + size_t W = tensor_dim[3]; + size_t width = image_dim[0]; + size_t height = image_dim[1]; + + for (size_t n = 0; n < N; n++) { + for (size_t c = 0; c < C; c++) { + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; ++w) { + size_t index = 4 * c * (width * H) + 4 * h * width + 4 * W * (n / 4) + + w * 4 + n % 4; + *p = image[index]; + p++; + if (index >= (width * height * 4)) { + LOG(INFO) << " index out of range "; + } + } + } + } + } + VLOG(3) << " init done"; +} + +DDim CLImageConverterDWBlock::InitImageDimInfoWith(const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + size_t N, C, H, W; + N = tensor_dim[0]; + C = tensor_dim[1]; + H = tensor_dim[2]; + W = tensor_dim[3]; + size_t width = W * ((N + 3) / 4); + size_t 
height = C * H; + return DDim( + std::vector({static_cast(width), + static_cast(height)})); +} + +void CLImageConverterDWBlock::NCHWToImage(float *tensor, float *image, + const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + + size_t N, C, H, W; + N = new_dims[1]; + C = new_dims[0]; + H = new_dims[2]; + W = new_dims[3]; + + DDim in_image_dim = InitImageDimInfoWith(tensor_dim); + + VLOG(3) << " tensor dim: " << tensor_dim; + VLOG(3) << " image dim: " << in_image_dim; + + size_t width = in_image_dim[0]; + size_t w_block = width / W; + + float *p = tensor; + size_t i0 = 0; + for (size_t n = 0; n < N; n++) { + for (size_t c = 0; c < w_block * 4; c++) { + size_t i1 = i0 + (c / 4) * W; + for (size_t h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (size_t w = 0; w < W; w++) { + if (c < C) { + // size_t x = (n * width * H + h * width + (c / 4) * W + w) * 4 + + // (c % 4); + image[i2] = *p; + i2 += 4; + p++; + } else { + image[i2] = 0.0; + i2 += 4; + } + } + i1 += width; + } + } + i0 += width * H; + } +} + +void CLImageConverterDWBlock::ImageToNCHW(float *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + float *p = tensor; + size_t N = tensor_dim[1]; + size_t C = tensor_dim[0]; + size_t H = tensor_dim[2]; + size_t W = tensor_dim[3]; + size_t width = image_dim[0]; + + size_t i0 = 0; + for (size_t n = 0; n < N; n++) { + for (size_t c = 0; c < C; c++) { + size_t i1 = i0 + (c / 4) * W; + for (size_t h = 0; h < H; h++) { + size_t i2 = (i1 << 2) + c % 4; + for (size_t w = 0; w < W; w++) { + *p = image[i2]; + i2 += 4; + p++; + } + i1 += width; + } + } + i0 += width * H; + } +} + +DDim CLImageConverterNormal::InitImageDimInfoWith(const DDim &tensor_dim) { + size_t new_dims[] = {1, 1, 1, 1}; + for (size_t j = 0; j < tensor_dim.size(); ++j) { + new_dims[4 - tensor_dim.size() + j] = tensor_dim[j]; + } + size_t N, C, H, W; + N = new_dims[0]; + C = new_dims[1]; + H = new_dims[2]; + W = new_dims[3]; + size_t width = W * ((C + 3) / 4); + size_t height = H * N; + + width_of_one_block_ = W; + height_of_one_block_ = H; + c_block_ = width / W; + + return DDim( + std::vector({static_cast(width), + static_cast(height)})); +} + +void CLImageConverterNormal::NCHWToImage(float *tensor, float *image, + const DDim &tensor_dim) { + CHECK(tensor_dim.size() <= 4 && tensor_dim.size() > 0) + << " Tensor dim is not support!"; + + CLImageConverterDefault default_converter; + default_converter.NCHWToImage(tensor, image, tensor_dim); +} + +void CLImageConverterNormal::ImageToNCHW(float *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) { + CLImageConverterDefault default_converter; + default_converter.ImageToNCHW(image, tensor, image_dim, tensor_dim); +} + +DDim CLImageConverterWinoTransWeight::InitImageDimInfoWith( + const DDim &tensor_dim) { + CHECK(tensor_dim.size() == 4) << " Tensor dim is not 4."; + size_t N, C; + N = tensor_dim[0]; + C = tensor_dim[1]; + size_t width = (C + 3) / 4; + size_t height = N * 16; // N * (wino_blk_size + 2) * (wino_blk_size + 2) + return DDim( + std::vector({static_cast(width), + static_cast(height)})); +} + +void CLImageConverterWinoTransWeight::NCHWToImage(float *tensor, float *image, + const DDim &tensor_dim) {} + +void CLImageConverterWinoTransWeight::ImageToNCHW(float *image, float *tensor, + const DDim &image_dim, + const DDim &tensor_dim) {} + +} // 
namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_image_converter.h b/paddle/fluid/lite/opencl/cl_image_converter.h new file mode 100644 index 00000000000..1eab93f61be --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_image_converter.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/lite/core/compatible_tensor.h" + +namespace paddle { +namespace lite { + +class CLImageConverterBase { + public: + virtual ~CLImageConverterBase() {} + + virtual void NCHWToImage(float *nchw, float *image, + const DDim &tensor_dim) = 0; + + virtual void ImageToNCHW(float *image, float *nchw, const DDim &image_dim, + const DDim &tensor_dim) = 0; + virtual DDim InitImageDimInfoWith(const DDim &tensor_dim) = 0; +}; + +class CLImageConverterDefault : public CLImageConverterBase { + public: + DDim InitImageDimInfoWith(const DDim &tensor_dim) override; + void NCHWToImage(float *nchw, float *image, const DDim &tensor_dim) override; + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim) override; +}; + +class CLImageConverterFolder : public CLImageConverterBase { + public: + DDim InitImageDimInfoWith(const DDim &tensor_dim) override; + void NCHWToImage(float *tensor, float *image, + const DDim &tensor_dim) override; + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim) override; + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + int GetCBlock() const { return c_block_; } + + private: + int c_block_; + int width_of_one_block_; + int height_of_one_block_; +}; + +class CLImageConverterNormal : public CLImageConverterBase { + public: + DDim InitImageDimInfoWith(const DDim &tensor_dim) override; + void NCHWToImage(float *tensor, float *image, + const DDim &tensor_dim) override; + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim) override; + + /* + * width of original tensor + * */ + inline size_t WidthOfOneBlock() const { return width_of_one_block_; } + + /* + * height of original tensor + * */ + inline size_t HeightOfOneBlock() const { return height_of_one_block_; } + + int GetCBlock() const { return c_block_; } + + private: + int c_block_; + int width_of_one_block_; + int height_of_one_block_; +}; + +class CLImageConverterNWBlock : public CLImageConverterBase { + DDim InitImageDimInfoWith(const DDim &tensor_dim) override; + void NCHWToImage(float *tensor, float *image, + const DDim &tensor_dim) override; + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim) override; +}; +class CLImageConverterDWBlock : public CLImageConverterBase { + DDim InitImageDimInfoWith(const DDim &tensor_dim) override; + void NCHWToImage(float *tensor, float *image, 
+ const DDim &tensor_dim) override; + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim) override; +}; + +class CLImageConverterWinoTransWeight : public CLImageConverterBase { + public: + DDim InitImageDimInfoWith(const DDim &tensor_dim) override; + void NCHWToImage(float *tensor, float *image, + const DDim &tensor_dim) override; + void ImageToNCHW(float *image, float *tensor, const DDim &image_dim, + const DDim &tensor_dim) override; +}; + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_include.h b/paddle/fluid/lite/opencl/cl_include.h new file mode 100644 index 00000000000..254782d6296 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_include.h @@ -0,0 +1,21 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#define CL_TARGET_OPENCL_VERSION 200 +#define CL_HPP_TARGET_OPENCL_VERSION 200 +#define CL_HPP_MINIMUM_OPENCL_VERSION 110 + +#include diff --git a/paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl b/paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl new file mode 100644 index 00000000000..c1063778300 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_kernel/channel_add_kernel.cl @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +__kernel void channel_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage, __private const int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x/w; + coords_bias.y = 0; + float4 in = read_imagef(input, sampler, coords); + float4 biase = read_imagef(bias, sampler, coords_bias); + float4 output = in + biase; + write_imagef(outputImage, coords, output); + } diff --git a/paddle/fluid/lite/opencl/cl_kernel/cl_common.h b/paddle/fluid/lite/opencl/cl_kernel/cl_common.h new file mode 100644 index 00000000000..dedd05a13c5 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_kernel/cl_common.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +inline float4 activation(float4 in +#ifdef PRELU + , + float4 prelu_alpha +#endif + ) { + float4 output; +#ifdef PRELU + output = select(prelu_alpha * in, in, in >= (float4)0.0); +#endif + +#ifdef RELU + output = fmax(in, (float4)(0.0f)); +#endif + return output; +} diff --git a/paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl b/paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl new file mode 100644 index 00000000000..ecf719ae931 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_kernel/elementwise_add_kernel.cl @@ -0,0 +1,26 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +__kernel void elementwise_add(__read_only image2d_t input, __read_only image2d_t bias, __write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + float4 in = read_imagef(input, sampler, coords); + float4 biase = read_imagef(bias, sampler, coords); + float4 output = in + biase; + write_imagef(outputImage,coords,output); + } diff --git a/paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl b/paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl new file mode 100644 index 00000000000..0ca3b9141da --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_kernel/pool_kernel.cl @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#define MIN_VALUE -FLT_MAX + +__kernel void pool_max( + __private const int in_height, __private const int in_width, + __private const int out_height, __private const int out_width, + __private const int pad_top, __private const int pad_left, + __private const int stride_h, __private const int stride_w, + __private const int ksize_h, __private const int ksize_w, + __read_only image2d_t input, __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int start_h = out_h * stride_h - pad_top; + int end_h = min(start_h + ksize_h, in_height); + start_h = max(start_h,0); + + int start_w = out_w * stride_w - pad_left; + int end_w = min(start_w + ksize_w, in_width); + start_w = max(start_w,0); + + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + float4 max_value = (float4)(MIN_VALUE); + for (int y = start_h; y < end_h; ++y) { + for (int x = start_w; x < end_w; ++x) { + float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + max_value = max(max_value, tmp); + } + } + + const int pos_out_x = mad24(out_c, out_width, out_w); + write_imagef(output, (int2)(pos_out_x, out_nh), max_value); +} + +__kernel void pool_avg( + __private const int in_height, __private const int in_width, + __private const int out_height, __private const int out_width, + __private const int pad_top, __private const int pad_left, + __private const int stride_h, __private const int stride_w, + __private const int ksize_h, __private const int ksize_w, + __read_only image2d_t input, __write_only image2d_t output) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_height; + const int out_h = out_nh % out_height; + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int start_h = max(out_h * stride_h - pad_top, 0); + int end_h = min(start_h + ksize_h, in_height); + + int start_w = max(out_w * stride_w - pad_left, 0); + int end_w = min(start_w + ksize_w, in_width); + + const int pos_in_x = out_c * in_width; + const int pos_in_y = out_n * in_height; + float4 sum = (float4)(0.0f); + int num = 0; + for (int y = start_h; y < end_h; ++y) { + for (int x = start_w; x < end_w; ++x) { + sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y)); + num++; + } + } + float4 avg = sum / num; + const int pos_out_x = mad24(out_c, out_width, out_w); + write_imagef(output, (int2)(pos_out_x, out_nh), avg); +} diff --git a/paddle/fluid/lite/opencl/cl_test.cc b/paddle/fluid/lite/opencl/cl_test.cc new file mode 100644 index 00000000000..ea02b0c4fed --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_test.cc @@ -0,0 +1,290 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/opencl/cl_caller.h" +#include "paddle/fluid/lite/opencl/cl_context.h" +#include "paddle/fluid/lite/opencl/cl_engine.h" +#include "paddle/fluid/lite/opencl/cl_helper.h" +#include "paddle/fluid/lite/opencl/cl_image.h" + +DEFINE_string(cl_path, "/data/local/tmp/opencl", "The OpenCL kernels path."); + +namespace paddle { +namespace lite { + +TEST(cl_test, engine_test) { + auto* engine = CLEngine::Global(); + CHECK(engine->IsInitSuccess()); + engine->set_cl_path(FLAGS_cl_path); + engine->platform(); + engine->device(); + engine->command_queue(); + auto& context = engine->context(); + auto program = engine->CreateProgram( + context, engine->cl_path() + "/cl_kernel/" + "elementwise_add_kernel.cl"); + auto event = engine->CreateEvent(context); + CHECK(engine->BuildProgram(program.get())); +} + +TEST(cl_test, context_test) { + auto* engine = CLEngine::Global(); + CHECK(engine->IsInitSuccess()); + engine->set_cl_path(FLAGS_cl_path); + CLContext context; + context.GetKernel("pool_max", "pool_kernel.cl", ""); + context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", ""); + context.GetKernel("elementwise_add", "elementwise_add_kernel.cl", ""); +} + +TEST(cl_test, kernel_test) { + auto* engine = CLEngine::Global(); + CHECK(engine->IsInitSuccess()); + engine->set_cl_path(FLAGS_cl_path); + std::unique_ptr context(new CLContext); + // std::unique_ptr helper(new CLHelper(context.get())); + std::unique_ptr helper(new CLHelper); + helper->set_context(context.get()); + helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + helper->AddKernel("pool_max", "pool_kernel.cl"); + helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + auto kernel = helper->GetKernel(2); + + std::unique_ptr in_data(new float[4 * 3 * 256 * 512]); + for (int i = 0; i < 4 * 3 * 256 * 512; i++) { + in_data[i] = 1.f; + } + const DDim in_dim = DDim(std::vector{4, 3, 256, 512}); + CLImage in_image; + in_image.set_tensor_data(in_data.get(), in_dim); + in_image.InitNormalCLImage(helper->OpenCLContext()); + LOG(INFO) << in_image; + + std::unique_ptr bias_data(new float[4 * 3 * 256 * 512]); + for (int i = 0; i < 4 * 3 * 256 * 512; i++) { + bias_data[i] = 2.f; + } + const DDim bias_dim = DDim(std::vector{4, 3, 256, 512}); + CLImage bias_image; + bias_image.set_tensor_data(bias_data.get(), bias_dim); + bias_image.InitNormalCLImage(helper->OpenCLContext()); + LOG(INFO) << bias_image; + + CLImage out_image; + const DDim out_dim = DDim(std::vector{4, 3, 256, 512}); + out_image.InitEmptyImage(helper->OpenCLContext(), out_dim); + LOG(INFO) << out_image; + + cl_int status; + status = kernel.setArg(0, *in_image.cl_image()); + CL_CHECK_ERRORS(status); + status = kernel.setArg(1, *bias_image.cl_image()); + CL_CHECK_ERRORS(status); + status = kernel.setArg(2, *out_image.cl_image()); + CL_CHECK_ERRORS(status); + + // auto global_work_size = helper->DefaultWorkSize(out_image); + size_t width = in_image.ImageWidth(); + size_t height = in_image.ImageHeight(); + auto global_work_size = cl::NDRange{width, height}; + cl::Event event; + status = helper->OpenCLCommandQueue().enqueueNDRangeKernel( + kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); + CL_CHECK_ERRORS(status); + status = helper->OpenCLCommandQueue().finish(); + CL_CHECK_ERRORS(status); + double 
start_nanos = event.getProfilingInfo(); + double stop_nanos = event.getProfilingInfo(); + double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; + LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; + LOG(INFO) << out_image; +} + +TEST(cl_test, channel_add_test) { + std::default_random_engine engine; + std::uniform_real_distribution dist(-5, 5); + + const DDim in_dim = DDim(std::vector{4, 16, 256, 512}); + std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { + in_data[i] = dist(engine); + } + + const DDim bias_dim = DDim(std::vector{16}); + std::unique_ptr bias_data(new float[16]); + for (int i = 0; i < 16; i++) { + bias_data[i] = dist(engine); + } + + std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 16; j++) { + float b = bias_data[j]; + for (int k = 0; k < 256 * 512; k++) { + int index = (i * 16 + j) * 256 * 512 + k; + out_ref[index] = in_data[index] + b; + } + } + } + + const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); + std::unique_ptr out(new float[4 * 16 * 256 * 512]); + + bool status = InitOpenCLEngine(FLAGS_cl_path); + CHECK(status) << "Fail to initialize OpenCL engine."; + std::unique_ptr context(new CLContext); + std::unique_ptr helper(new CLHelper(context.get())); + helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + helper->AddKernel("channel_add", "channel_add_kernel.cl"); + elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(), + bias_dim, out.get(), out_dim); + + int stride = 4 * 16 * 256 * 512 / 20; + for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { + std::cout << out[i] << " "; + } + std::cout << std::endl; + + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { + EXPECT_NEAR(out[i], out_ref[i], 1e-6); + } +} + +TEST(cl_test, elementwise_add_test) { + std::default_random_engine engine; + std::uniform_real_distribution dist(-5, 5); + + const DDim in_dim = DDim(std::vector{4, 16, 256, 512}); + std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { + in_data[i] = dist(engine); + } + + const DDim bias_dim = DDim(std::vector{4, 16, 256, 512}); + std::unique_ptr bias_data(new float[4 * 16 * 256 * 512]); + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { + bias_data[i] = dist(engine); + } + + std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { + out_ref[i] = in_data[i] + bias_data[i]; + } + + const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); + std::unique_ptr out(new float[4 * 16 * 256 * 512]); + + bool status = InitOpenCLEngine(FLAGS_cl_path); + CHECK(status) << "Fail to initialize OpenCL engine."; + std::unique_ptr context(new CLContext); + std::unique_ptr helper(new CLHelper(context.get())); + helper->AddKernel("elementwise_add", "elementwise_add_kernel.cl"); + helper->AddKernel("channel_add", "channel_add_kernel.cl"); + elementwise_add(helper.get(), in_data.get(), in_dim, bias_data.get(), + bias_dim, out.get(), out_dim); + + int stride = 4 * 16 * 256 * 512 / 20; + for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { + std::cout << out[i] << " "; + } + std::cout << std::endl; + + for (int i = 0; i < 4 * 16 * 256 * 512; i++) { + EXPECT_NEAR(out[i], out_ref[i], 1e-6); + } +} + +void pool_avg(const int padding_height, const int padding_width, + const int stride_height, const int stride_width, + const int ksize_height, const int ksize_width, + const float* input_data, const DDim& in_dim, float* 
output_data, + const DDim& out_dim) { + const int batch_size = in_dim[0]; + const int input_height = in_dim[2]; + const int input_width = in_dim[3]; + const int output_channels = out_dim[1]; + const int output_height = out_dim[2]; + const int output_width = out_dim[3]; + + const size_t input_spatial_size = input_height * input_width; + const size_t output_spatial_size = output_height * output_width; + + for (int i = 0; i < batch_size; i++) { + for (int c = 0; c < output_channels; ++c) { + int channel = i * output_channels + c; + const float* input_ptr = input_data + channel * input_spatial_size; + float* output_ptr = output_data + channel * output_spatial_size; + + for (int ph = 0; ph < output_height; ++ph) { + int hstart = ph * stride_height - padding_height; + int hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + for (int pw = 0; pw < output_width; ++pw) { + int wstart = pw * stride_width - padding_width; + int wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + + float val = 0.f; + int count = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + val += input_ptr[h * input_width + w]; + ++count; + } + } + output_ptr[ph * output_width + pw] = + (count > 0) ? val * (1.f / count) : 0.f; + } + } + } + } +} + +TEST(cl_test, pool_test) { + std::default_random_engine engine; + std::uniform_real_distribution dist(-5, 5); + + const DDim in_dim = DDim(std::vector{4, 1024, 7, 7}); + std::unique_ptr in_data(new float[4 * 1024 * 7 * 7]); + for (int i = 0; i < 4 * 1024 * 7 * 7; i++) { + in_data[i] = dist(engine); + } + + const DDim out_dim = DDim(std::vector{4, 1024, 1, 1}); + std::unique_ptr out(new float[4 * 1024 * 1 * 1]); + std::unique_ptr out_ref(new float[4 * 1024 * 1 * 1]); + + bool status = InitOpenCLEngine(FLAGS_cl_path); + CHECK(status) << "Fail to initialize OpenCL engine."; + std::unique_ptr context(new CLContext); + std::unique_ptr helper(new CLHelper(context.get())); + helper->AddKernel("pool_max", "pool_kernel.cl"); + helper->AddKernel("pool_avg", "pool_kernel.cl"); + pool(helper.get(), "avg", 0, 0, 1, 1, 7, 7, in_data.get(), in_dim, out.get(), + out_dim); + pool_avg(0, 0, 1, 1, 7, 7, in_data.get(), in_dim, out_ref.get(), out_dim); + + for (int i = 0; i < 4 * 1024 * 1 * 1; i++) { + EXPECT_NEAR(out[i], out_ref[i], 1e-6); + } +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_tool.cc b/paddle/fluid/lite/opencl/cl_tool.cc new file mode 100644 index 00000000000..d09642ff553 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_tool.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/lite/opencl/cl_tool.h" + +namespace paddle { +namespace lite { + +const char *opencl_error_to_str(cl_int error) { +#define CASE_CL_CONSTANT(NAME) \ + case NAME: \ + return #NAME; + // Suppose that no combinations are possible. 
+ switch (error) { + CASE_CL_CONSTANT(CL_SUCCESS) + CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND) + CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE) + CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES) + CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY) + CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE) + CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP) + CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH) + CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED) + CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE) + CASE_CL_CONSTANT(CL_MAP_FAILURE) + CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET) + CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) + CASE_CL_CONSTANT(CL_INVALID_VALUE) + CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE) + CASE_CL_CONSTANT(CL_INVALID_PLATFORM) + CASE_CL_CONSTANT(CL_INVALID_DEVICE) + CASE_CL_CONSTANT(CL_INVALID_CONTEXT) + CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES) + CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE) + CASE_CL_CONSTANT(CL_INVALID_HOST_PTR) + CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT) + CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) + CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE) + CASE_CL_CONSTANT(CL_INVALID_SAMPLER) + CASE_CL_CONSTANT(CL_INVALID_BINARY) + CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS) + CASE_CL_CONSTANT(CL_INVALID_PROGRAM) + CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION) + CASE_CL_CONSTANT(CL_INVALID_KERNEL) + CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX) + CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE) + CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE) + CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS) + CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION) + CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE) + CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE) + CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET) + CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST) + CASE_CL_CONSTANT(CL_INVALID_EVENT) + CASE_CL_CONSTANT(CL_INVALID_OPERATION) + CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT) + CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE) + CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL) + CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE) + CASE_CL_CONSTANT(CL_INVALID_PROPERTY) + + default: + return "UNKNOWN ERROR CODE"; + } +#undef CASE_CL_CONSTANT +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_tool.h b/paddle/fluid/lite/opencl/cl_tool.h new file mode 100644 index 00000000000..49ad9d4e770 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_tool.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/lite/opencl/cl_include.h" + +namespace paddle { +namespace lite { + +const char* opencl_error_to_str(cl_int error); + +#define CL_CHECK_ERRORS(ERR) \ + if (ERR != CL_SUCCESS) { \ + printf( \ + "OpenCL error with code %s happened in file %s at line %d. 
" \ + "Exiting.\n", \ + opencl_error_to_str(ERR), __FILE__, __LINE__); \ + } +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/opencl/cl_wrapper.cxx b/paddle/fluid/lite/opencl/cl_wrapper.cxx new file mode 100644 index 00000000000..18a51d9ddb2 --- /dev/null +++ b/paddle/fluid/lite/opencl/cl_wrapper.cxx @@ -0,0 +1,964 @@ +// Copyright 2018 The MACE Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// This file is borrowed from MACE, and we will refactor it +// in the near future. + +#include +#include +#include +#include +#include "paddle/fluid/lite/opencl/cl_include.h" + +/** + * Wrapper of OpenCL 2.0, based on file opencl20/CL/cl.h + */ + +#if CL_HPP_TARGET_OPENCL_VERSION < 200 +#define CL_API_SUFFIX__VERSION_2_0 +#endif + +namespace paddle { +namespace lite { + +class OpenCLLibrary final { + private: + OpenCLLibrary(); + OpenCLLibrary(const OpenCLLibrary &) = delete; + OpenCLLibrary &operator=(const OpenCLLibrary &) = delete; + + bool Load(); + void *LoadFromPath(const std::string &path); + + public: + static OpenCLLibrary *Get(); + + using clGetPlatformIDsFunc = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *); + using clGetPlatformInfoFunc = cl_int (*)(cl_platform_id, cl_platform_info, + size_t, void *, size_t *); + using clBuildProgramFunc = cl_int (*)(cl_program, cl_uint, + const cl_device_id *, const char *, + void (*pfn_notify)(cl_program, void *), + void *); + using clEnqueueNDRangeKernelFunc = cl_int (*)(cl_command_queue, cl_kernel, + cl_uint, const size_t *, + const size_t *, const size_t *, + cl_uint, const cl_event *, + cl_event *); + using clSetKernelArgFunc = cl_int (*)(cl_kernel, cl_uint, size_t, + const void *); + using clRetainMemObjectFunc = cl_int (*)(cl_mem); + using clReleaseMemObjectFunc = cl_int (*)(cl_mem); + using clEnqueueUnmapMemObjectFunc = cl_int (*)(cl_command_queue, cl_mem, + void *, cl_uint, + const cl_event *, cl_event *); + using clRetainCommandQueueFunc = cl_int (*)(cl_command_queue command_queue); + using clCreateContextFunc = cl_context (*)( + const cl_context_properties *, cl_uint, const cl_device_id *, + void(CL_CALLBACK *)( // NOLINT(readability/casting) + const char *, const void *, size_t, void *), + void *, cl_int *); + using clCreateContextFromTypeFunc = + cl_context (*)(const cl_context_properties *, cl_device_type, + void(CL_CALLBACK *)( // NOLINT(readability/casting) + const char *, const void *, size_t, void *), + void *, cl_int *); + using clReleaseContextFunc = cl_int (*)(cl_context); + using clWaitForEventsFunc = cl_int (*)(cl_uint, const cl_event *); + using clReleaseEventFunc = cl_int (*)(cl_event); + using clEnqueueWriteBufferFunc = cl_int (*)(cl_command_queue, cl_mem, cl_bool, + size_t, size_t, const void *, + cl_uint, const cl_event *, + cl_event *); + using clEnqueueReadBufferFunc = cl_int (*)(cl_command_queue, cl_mem, cl_bool, + size_t, size_t, void *, cl_uint, + const cl_event *, cl_event *); + using clEnqueueReadImageFunc = cl_int (*)(cl_command_queue, cl_mem, cl_bool, 
+ const size_t *, const size_t *, + size_t, size_t, void *, cl_uint, + const cl_event *, cl_event *); + using clGetProgramBuildInfoFunc = cl_int (*)(cl_program, cl_device_id, + cl_program_build_info, size_t, + void *, size_t *); + using clRetainProgramFunc = cl_int (*)(cl_program program); + using clEnqueueMapBufferFunc = void *(*)(cl_command_queue, cl_mem, cl_bool, + cl_map_flags, size_t, size_t, + cl_uint, const cl_event *, + cl_event *, cl_int *); + using clEnqueueMapImageFunc = void *(*)(cl_command_queue, cl_mem, cl_bool, + cl_map_flags, const size_t *, + const size_t *, size_t *, size_t *, + cl_uint, const cl_event *, cl_event *, + cl_int *); + using clCreateCommandQueueFunc = cl_command_queue(CL_API_CALL *)( // NOLINT + cl_context, cl_device_id, cl_command_queue_properties, cl_int *); + using clCreateCommandQueueWithPropertiesFunc = cl_command_queue (*)( + cl_context, cl_device_id, const cl_queue_properties *, cl_int *); + using clReleaseCommandQueueFunc = cl_int (*)(cl_command_queue); + using clCreateProgramWithBinaryFunc = cl_program (*)(cl_context, cl_uint, + const cl_device_id *, + const size_t *, + const unsigned char **, + cl_int *, cl_int *); + using clRetainContextFunc = cl_int (*)(cl_context context); + using clGetContextInfoFunc = cl_int (*)(cl_context, cl_context_info, size_t, + void *, size_t *); + using clReleaseProgramFunc = cl_int (*)(cl_program program); + using clFlushFunc = cl_int (*)(cl_command_queue command_queue); + using clFinishFunc = cl_int (*)(cl_command_queue command_queue); + using clGetProgramInfoFunc = cl_int (*)(cl_program, cl_program_info, size_t, + void *, size_t *); + using clCreateKernelFunc = cl_kernel (*)(cl_program, const char *, cl_int *); + using clRetainKernelFunc = cl_int (*)(cl_kernel kernel); + using clCreateBufferFunc = cl_mem (*)(cl_context, cl_mem_flags, size_t, + void *, cl_int *); + using clCreateImage2DFunc = cl_mem(CL_API_CALL *)(cl_context, // NOLINT + cl_mem_flags, + const cl_image_format *, + size_t, size_t, size_t, + void *, cl_int *); + using clCreateImageFunc = cl_mem (*)(cl_context, cl_mem_flags, + const cl_image_format *, + const cl_image_desc *, void *, cl_int *); + using clCreateUserEventFunc = cl_event (*)(cl_context, cl_int *); + using clCreateProgramWithSourceFunc = cl_program (*)(cl_context, cl_uint, + const char **, + const size_t *, + cl_int *); + using clReleaseKernelFunc = cl_int (*)(cl_kernel kernel); + using clGetDeviceInfoFunc = cl_int (*)(cl_device_id, cl_device_info, size_t, + void *, size_t *); + using clGetDeviceIDsFunc = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, + cl_device_id *, cl_uint *); + using clRetainDeviceFunc = cl_int (*)(cl_device_id); + using clReleaseDeviceFunc = cl_int (*)(cl_device_id); + using clRetainEventFunc = cl_int (*)(cl_event); + using clGetKernelWorkGroupInfoFunc = cl_int (*)(cl_kernel, cl_device_id, + cl_kernel_work_group_info, + size_t, void *, size_t *); + using clGetEventInfoFunc = cl_int (*)(cl_event event, + cl_event_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + using clGetImageInfoFunc = cl_int (*)(cl_mem, cl_image_info, size_t, void *, + size_t *); + +#define CL_DEFINE_FUNC_PTR(func) func##Func func = nullptr + + CL_DEFINE_FUNC_PTR(clGetPlatformIDs); + CL_DEFINE_FUNC_PTR(clGetPlatformInfo); + CL_DEFINE_FUNC_PTR(clBuildProgram); + 
CL_DEFINE_FUNC_PTR(clEnqueueNDRangeKernel); + CL_DEFINE_FUNC_PTR(clSetKernelArg); + CL_DEFINE_FUNC_PTR(clReleaseKernel); + CL_DEFINE_FUNC_PTR(clCreateProgramWithSource); + CL_DEFINE_FUNC_PTR(clCreateBuffer); + CL_DEFINE_FUNC_PTR(clCreateImage); + CL_DEFINE_FUNC_PTR(clCreateImage2D); + CL_DEFINE_FUNC_PTR(clCreateUserEvent); + CL_DEFINE_FUNC_PTR(clRetainKernel); + CL_DEFINE_FUNC_PTR(clCreateKernel); + CL_DEFINE_FUNC_PTR(clGetProgramInfo); + CL_DEFINE_FUNC_PTR(clFlush); + CL_DEFINE_FUNC_PTR(clFinish); + CL_DEFINE_FUNC_PTR(clReleaseProgram); + CL_DEFINE_FUNC_PTR(clRetainContext); + CL_DEFINE_FUNC_PTR(clGetContextInfo); + CL_DEFINE_FUNC_PTR(clCreateProgramWithBinary); + CL_DEFINE_FUNC_PTR(clCreateCommandQueue); + CL_DEFINE_FUNC_PTR(clCreateCommandQueueWithProperties); + CL_DEFINE_FUNC_PTR(clReleaseCommandQueue); + CL_DEFINE_FUNC_PTR(clEnqueueMapBuffer); + CL_DEFINE_FUNC_PTR(clEnqueueMapImage); + CL_DEFINE_FUNC_PTR(clRetainProgram); + CL_DEFINE_FUNC_PTR(clGetProgramBuildInfo); + CL_DEFINE_FUNC_PTR(clEnqueueReadBuffer); + CL_DEFINE_FUNC_PTR(clEnqueueReadImage); + CL_DEFINE_FUNC_PTR(clEnqueueWriteBuffer); + CL_DEFINE_FUNC_PTR(clWaitForEvents); + CL_DEFINE_FUNC_PTR(clReleaseEvent); + CL_DEFINE_FUNC_PTR(clCreateContext); + CL_DEFINE_FUNC_PTR(clCreateContextFromType); + CL_DEFINE_FUNC_PTR(clReleaseContext); + CL_DEFINE_FUNC_PTR(clRetainCommandQueue); + CL_DEFINE_FUNC_PTR(clEnqueueUnmapMemObject); + CL_DEFINE_FUNC_PTR(clRetainMemObject); + CL_DEFINE_FUNC_PTR(clReleaseMemObject); + CL_DEFINE_FUNC_PTR(clGetDeviceInfo); + CL_DEFINE_FUNC_PTR(clGetDeviceIDs); + CL_DEFINE_FUNC_PTR(clRetainDevice); + CL_DEFINE_FUNC_PTR(clReleaseDevice); + CL_DEFINE_FUNC_PTR(clRetainEvent); + CL_DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo); + CL_DEFINE_FUNC_PTR(clGetEventInfo); + CL_DEFINE_FUNC_PTR(clGetEventProfilingInfo); + CL_DEFINE_FUNC_PTR(clGetImageInfo); + +#undef CL_DEFINE_FUNC_PTR + + private: + void *handle_ = nullptr; +}; + +OpenCLLibrary *OpenCLLibrary::Get() { + static OpenCLLibrary library; + return &library; +} + +OpenCLLibrary::OpenCLLibrary() { + this->Load(); + // Do not call dlclose which may unload all OpenCL symbols. + // If close the OpenCL library, the static OpenCLlite destructor may fail. + // If there is no dlclose, the library will be closed when the program exist. + // Besides, the library will not be load repeatedly even dlopen many times. 
+}
+
+bool OpenCLLibrary::Load() {
+  if (handle_ != nullptr) {
+    return true;
+  }
+
+  // Add customized OpenCL search paths here
+  const std::vector<std::string> paths = {
+      "libOpenCL.so",
+#if defined(__aarch64__)
+      // Qualcomm Adreno with Android
+      "/system/vendor/lib64/libOpenCL.so",
+      "/system/lib64/libOpenCL.so",
+      // Mali with Android
+      "/system/vendor/lib64/egl/libGLES_mali.so",
+      "/system/lib64/egl/libGLES_mali.so",
+      // Typical Linux board
+      "/usr/lib/aarch64-linux-gnu/libOpenCL.so",
+#else
+      // Qualcomm Adreno with Android
+      "/system/vendor/lib/libOpenCL.so",
+      "/system/lib/libOpenCL.so",
+      // Mali with Android
+      "/system/vendor/lib/egl/libGLES_mali.so",
+      "/system/lib/egl/libGLES_mali.so",
+      // Typical Linux board
+      "/usr/lib/arm-linux-gnueabihf/libOpenCL.so",
+#endif
+  };
+
+  for (const auto &path : paths) {
+    VLOG(3) << "Loading OpenCL from " << path;
+    void *handle = LoadFromPath(path);
+    if (handle != nullptr) {
+      handle_ = handle;
+      break;
+    }
+  }
+
+  if (handle_ == nullptr) {
+    LOG(ERROR)
+        << "Failed to load the OpenCL library. Please make sure an OpenCL "
+           "library exists on your device and that the application has "
+           "permission to access it.";
+    return false;
+  }
+
+  return true;
+}
+
+void *OpenCLLibrary::LoadFromPath(const std::string &path) {
+  void *handle = dlopen(path.c_str(), RTLD_LAZY | RTLD_LOCAL);
+
+  if (handle == nullptr) {
+    VLOG(3) << "Failed to load OpenCL library from path " << path
+            << ", error: " << dlerror();
+    return nullptr;
+  }
+
+#define CL_ASSIGN_FROM_DLSYM(func)                                \
+  do {                                                            \
+    void *ptr = dlsym(handle, #func);                             \
+    if (ptr == nullptr) {                                         \
+      VLOG(1) << "Failed to load " << #func << " from " << path;  \
+      continue;                                                   \
+    }                                                             \
+    func = reinterpret_cast<func##Func>(ptr);                     \
+    VLOG(3) << "Loaded " << #func << " from " << path;            \
+  } while (false)
+
+  CL_ASSIGN_FROM_DLSYM(clGetPlatformIDs);
+  CL_ASSIGN_FROM_DLSYM(clGetPlatformInfo);
+  CL_ASSIGN_FROM_DLSYM(clBuildProgram);
+  CL_ASSIGN_FROM_DLSYM(clEnqueueNDRangeKernel);
+  CL_ASSIGN_FROM_DLSYM(clSetKernelArg);
+  CL_ASSIGN_FROM_DLSYM(clReleaseKernel);
+  CL_ASSIGN_FROM_DLSYM(clCreateProgramWithSource);
+  CL_ASSIGN_FROM_DLSYM(clCreateBuffer);
+  CL_ASSIGN_FROM_DLSYM(clCreateImage);
+  CL_ASSIGN_FROM_DLSYM(clCreateImage2D);
+  CL_ASSIGN_FROM_DLSYM(clCreateUserEvent);
+  CL_ASSIGN_FROM_DLSYM(clRetainKernel);
+  CL_ASSIGN_FROM_DLSYM(clCreateKernel);
+  CL_ASSIGN_FROM_DLSYM(clGetProgramInfo);
+  CL_ASSIGN_FROM_DLSYM(clFlush);
+  CL_ASSIGN_FROM_DLSYM(clFinish);
+  CL_ASSIGN_FROM_DLSYM(clReleaseProgram);
+  CL_ASSIGN_FROM_DLSYM(clRetainContext);
+  CL_ASSIGN_FROM_DLSYM(clGetContextInfo);
+  CL_ASSIGN_FROM_DLSYM(clCreateProgramWithBinary);
+  CL_ASSIGN_FROM_DLSYM(clCreateCommandQueue);
+  CL_ASSIGN_FROM_DLSYM(clCreateCommandQueueWithProperties);
+  CL_ASSIGN_FROM_DLSYM(clReleaseCommandQueue);
+  CL_ASSIGN_FROM_DLSYM(clEnqueueMapBuffer);
+  CL_ASSIGN_FROM_DLSYM(clEnqueueMapImage);
+  CL_ASSIGN_FROM_DLSYM(clRetainProgram);
+  CL_ASSIGN_FROM_DLSYM(clGetProgramBuildInfo);
+  CL_ASSIGN_FROM_DLSYM(clEnqueueReadBuffer);
+  CL_ASSIGN_FROM_DLSYM(clEnqueueReadImage);
+  CL_ASSIGN_FROM_DLSYM(clEnqueueWriteBuffer);
+  CL_ASSIGN_FROM_DLSYM(clWaitForEvents);
+  CL_ASSIGN_FROM_DLSYM(clReleaseEvent);
+  CL_ASSIGN_FROM_DLSYM(clCreateContext);
+  CL_ASSIGN_FROM_DLSYM(clCreateContextFromType);
+  CL_ASSIGN_FROM_DLSYM(clReleaseContext);
+  CL_ASSIGN_FROM_DLSYM(clRetainCommandQueue);
+  CL_ASSIGN_FROM_DLSYM(clEnqueueUnmapMemObject);
+  CL_ASSIGN_FROM_DLSYM(clRetainMemObject);
+  CL_ASSIGN_FROM_DLSYM(clReleaseMemObject);
+  CL_ASSIGN_FROM_DLSYM(clGetDeviceInfo);
+
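+  // For reference, a hand-expanded sketch of what one entry of this table
+  // amounts to after preprocessing (taking clGetPlatformIDs as the example;
+  // every other symbol follows the same pattern):
+  //
+  //   using clGetPlatformIDsFunc = cl_int (*)(cl_uint, cl_platform_id *,
+  //                                           cl_uint *);
+  //   clGetPlatformIDsFunc clGetPlatformIDs = nullptr;  // member slot
+  //
+  //   void *ptr = dlsym(handle, "clGetPlatformIDs");    // #func stringized
+  //   if (ptr != nullptr) {
+  //     clGetPlatformIDs = reinterpret_cast<clGetPlatformIDsFunc>(ptr);
+  //   }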
CL_ASSIGN_FROM_DLSYM(clGetDeviceIDs); + CL_ASSIGN_FROM_DLSYM(clRetainDevice); + CL_ASSIGN_FROM_DLSYM(clReleaseDevice); + CL_ASSIGN_FROM_DLSYM(clRetainEvent); + CL_ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo); + CL_ASSIGN_FROM_DLSYM(clGetEventInfo); + CL_ASSIGN_FROM_DLSYM(clGetEventProfilingInfo); + CL_ASSIGN_FROM_DLSYM(clGetImageInfo); + +#undef CL_ASSIGN_FROM_DLSYM + + return handle; +} + +} // namespace lite +} // namespace paddle + +CL_API_ENTRY cl_event clCreateUserEvent(cl_context context, cl_int *errcode_ret) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateUserEvent; + if (func != nullptr) { + return func(context, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +// Platform APIs +CL_API_ENTRY cl_int +clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, + cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetPlatformIDs; + if (func != nullptr) { + return func(num_entries, platforms, num_platforms); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int +clGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetPlatformInfo; + if (func != nullptr) { + return func(platform, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Device APIs +CL_API_ENTRY cl_int clGetDeviceIDs( + cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, + cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetDeviceIDs; + if (func != nullptr) { + return func(platform, device_type, num_entries, devices, num_devices); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int +clGetDeviceInfo(cl_device_id device, cl_device_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetDeviceInfo; + if (func != nullptr) { + return func(device, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clRetainDevice(cl_device_id device) + CL_API_SUFFIX__VERSION_1_2 { + auto func = paddle::lite::OpenCLLibrary::Get()->clRetainDevice; + if (func != nullptr) { + return func(device); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clReleaseDevice(cl_device_id device) + CL_API_SUFFIX__VERSION_1_2 { + auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseDevice; + if (func != nullptr) { + return func(device); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Context APIs +CL_API_ENTRY cl_context clCreateContext( + const cl_context_properties *properties, cl_uint num_devices, + const cl_device_id *devices, + void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateContext; + if (func != nullptr) { + return func(properties, num_devices, devices, pfn_notify, user_data, + errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_context 
clCreateContextFromType( + const cl_context_properties *properties, cl_device_type device_type, + void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), + void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateContextFromType; + if (func != nullptr) { + return func(properties, device_type, pfn_notify, user_data, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_int clRetainContext(cl_context context) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clRetainContext; + if (func != nullptr) { + return func(context); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clReleaseContext(cl_context context) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseContext; + if (func != nullptr) { + return func(context); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int +clGetContextInfo(cl_context context, cl_context_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetContextInfo; + if (func != nullptr) { + return func(context, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Program Object APIs +CL_API_ENTRY cl_program clCreateProgramWithSource( + cl_context context, cl_uint count, const char **strings, + const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateProgramWithSource; + if (func != nullptr) { + return func(context, count, strings, lengths, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_program clCreateProgramWithBinary( + cl_context context, cl_uint num_devices, const cl_device_id *device_list, + const size_t *lengths, const unsigned char **binaries, + cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateProgramWithBinary; + if (func != nullptr) { + return func(context, num_devices, device_list, lengths, binaries, + binary_status, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_int +clGetProgramInfo(cl_program program, cl_program_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetProgramInfo; + if (func != nullptr) { + return func(program, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clGetProgramBuildInfo( + cl_program program, cl_device_id device, cl_program_build_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetProgramBuildInfo; + if (func != nullptr) { + return func(program, device, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clRetainProgram(cl_program program) + CL_API_SUFFIX__VERSION_1_0 { + auto func = 
paddle::lite::OpenCLLibrary::Get()->clRetainProgram; + if (func != nullptr) { + return func(program); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clReleaseProgram(cl_program program) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseProgram; + if (func != nullptr) { + return func(program); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clBuildProgram( + cl_program program, cl_uint num_devices, const cl_device_id *device_list, + const char *options, + void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), + void *user_data) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clBuildProgram; + if (func != nullptr) { + return func(program, num_devices, device_list, options, pfn_notify, + user_data); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Kernel Object APIs +CL_API_ENTRY cl_kernel +clCreateKernel(cl_program program, const char *kernel_name, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateKernel; + if (func != nullptr) { + return func(program, kernel_name, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_int clRetainKernel(cl_kernel kernel) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clRetainKernel; + if (func != nullptr) { + return func(kernel); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clReleaseKernel(cl_kernel kernel) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseKernel; + if (func != nullptr) { + return func(kernel); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, + size_t arg_size, const void *arg_value) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clSetKernelArg; + if (func != nullptr) { + return func(kernel, arg_index, arg_size, arg_value); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Memory Object APIs +CL_API_ENTRY cl_mem +clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, + void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateBuffer; + if (func != nullptr) { + return func(context, flags, size, host_ptr, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_mem clCreateImage( + cl_context context, cl_mem_flags flags, const cl_image_format *image_format, + const cl_image_desc *image_desc, void *host_ptr, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2 { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateImage; + if (func != nullptr) { + return func(context, flags, image_format, image_desc, host_ptr, + errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_int clRetainMemObject(cl_mem memobj) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clRetainMemObject; + if (func != nullptr) { + return func(memobj); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clReleaseMemObject(cl_mem memobj) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseMemObject; + if (func != nullptr) { + return func(memobj); + } else { + return 
CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clGetImageInfo(cl_mem image, cl_image_info param_name,
+                                   size_t param_value_size, void *param_value,
+                                   size_t *param_value_size_ret)
+    CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clGetImageInfo;
+  if (func != nullptr) {
+    return func(image, param_name, param_value_size, param_value,
+                param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+// Command Queue APIs
+CL_API_ENTRY cl_command_queue clCreateCommandQueueWithProperties(
+    cl_context context, cl_device_id device,
+    const cl_queue_properties *properties,
+    cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_0 {
+  auto func =
+      paddle::lite::OpenCLLibrary::Get()->clCreateCommandQueueWithProperties;
+  if (func != nullptr) {
+    return func(context, device, properties, errcode_ret);
+  } else {
+    // Work around drivers that lack this entry point (e.g. the MediaTek
+    // MT6771 OpenCL driver).
+    VLOG(3) << "Fallback to clCreateCommandQueue";
+    // A NULL properties list requests a default queue; otherwise, when
+    // called through OpenCL-CLHPP, the list is {CL_QUEUE_PROPERTIES,
+    // <bitfield>, 0}, i.e. the 2nd element is provided by the caller.
+    if (properties == nullptr || properties[0] == CL_QUEUE_PROPERTIES) {
+      cl_command_queue_properties props =
+          (properties == nullptr) ? 0 : properties[1];
+#pragma GCC diagnostic push  // disable the warning for both clang and gcc
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+      return clCreateCommandQueue(context, device, props, errcode_ret);
+#pragma GCC diagnostic pop
+    } else {
+      LOG(FATAL) << "Unexpected properties list; check the calling code";
+      if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM;
+      return nullptr;
+    }
+  }
+}
+
+CL_API_ENTRY cl_int clRetainCommandQueue(cl_command_queue command_queue)
+    CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clRetainCommandQueue;
+  if (func != nullptr) {
+    return func(command_queue);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clReleaseCommandQueue(cl_command_queue command_queue)
+    CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseCommandQueue;
+  if (func != nullptr) {
+    return func(command_queue);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+// Enqueued Commands APIs
+CL_API_ENTRY cl_int clEnqueueReadBuffer(
+    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
+    size_t offset, size_t size, void *ptr, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueReadBuffer;
+  if (func != nullptr) {
+    return func(command_queue, buffer, blocking_read, offset, size, ptr,
+                num_events_in_wait_list, event_wait_list, event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clEnqueueReadImage(
+    cl_command_queue command_queue, cl_mem image, cl_bool blocking_read,
+    const size_t *origin, const size_t *region, size_t row_pitch,
+    size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
+  auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueReadImage;
+  if (func != nullptr) {
+    return func(command_queue, image, blocking_read, origin, region, row_pitch,
+                slice_pitch, ptr, num_events_in_wait_list, event_wait_list,
+                event);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
+CL_API_ENTRY cl_int clEnqueueWriteBuffer(
+    cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
+    size_t offset, size_t size, const void *ptr,
+    cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
+    cl_event *event) CL_API_SUFFIX__VERSION_1_0 {
+  auto func =
paddle::lite::OpenCLLibrary::Get()->clEnqueueWriteBuffer; + if (func != nullptr) { + return func(command_queue, buffer, blocking_write, offset, size, ptr, + num_events_in_wait_list, event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY void *clEnqueueMapBuffer( + cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, + cl_map_flags map_flags, size_t offset, size_t size, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueMapBuffer; + if (func != nullptr) { + return func(command_queue, buffer, blocking_map, map_flags, offset, size, + num_events_in_wait_list, event_wait_list, event, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY void *clEnqueueMapImage( + cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, + cl_map_flags map_flags, const size_t *origin, const size_t *region, + size_t *image_row_pitch, size_t *image_slice_pitch, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueMapImage; + if (func != nullptr) { + return func(command_queue, image, blocking_map, map_flags, origin, region, + image_row_pitch, image_slice_pitch, num_events_in_wait_list, + event_wait_list, event, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +CL_API_ENTRY cl_int clEnqueueUnmapMemObject( + cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, + cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueUnmapMemObject; + if (func != nullptr) { + return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, + event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clGetKernelWorkGroupInfo( + cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetKernelWorkGroupInfo; + if (func != nullptr) { + return func(kernel, device, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clEnqueueNDRangeKernel( + cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, + const size_t *global_work_offset, const size_t *global_work_size, + const size_t *local_work_size, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clEnqueueNDRangeKernel; + if (func != nullptr) { + return func(command_queue, kernel, work_dim, global_work_offset, + global_work_size, local_work_size, num_events_in_wait_list, + event_wait_list, event); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Event Object APIs +CL_API_ENTRY cl_int clWaitForEvents( + cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clWaitForEvents; + if (func != nullptr) { + return func(num_events, event_list); + } else { + return 
CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clRetainEvent; + if (func != nullptr) { + return func(event); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clReleaseEvent; + if (func != nullptr) { + return func(event); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Event API +CL_API_ENTRY cl_int clGetEventInfo(cl_event event, cl_event_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetEventInfo; + if (func != nullptr) { + return func(event, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Profiling APIs +CL_API_ENTRY cl_int clGetEventProfilingInfo( + cl_event event, cl_profiling_info param_name, size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clGetEventProfilingInfo; + if (func != nullptr) { + return func(event, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Flush and Finish APIs +CL_API_ENTRY cl_int clFlush(cl_command_queue command_queue) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clFlush; + if (func != nullptr) { + return func(command_queue); + } else { + return CL_INVALID_PLATFORM; + } +} + +CL_API_ENTRY cl_int clFinish(cl_command_queue command_queue) + CL_API_SUFFIX__VERSION_1_0 { + auto func = paddle::lite::OpenCLLibrary::Get()->clFinish; + if (func != nullptr) { + return func(command_queue); + } else { + return CL_INVALID_PLATFORM; + } +} + +// Deprecated OpenCL 1.1 APIs +CL_API_ENTRY /* CL_EXT_PREFIX__VERSION_1_1_DEPRECATED */ cl_mem clCreateImage2D( + cl_context context, cl_mem_flags flags, const cl_image_format *image_format, + size_t image_width, size_t image_height, size_t image_row_pitch, + void *host_ptr, + cl_int *errcode_ret) /* CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED */ { + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateImage2D; + if (func != nullptr) { + return func(context, flags, image_format, image_width, image_height, + image_row_pitch, host_ptr, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} + +// Deprecated OpenCL 2.0 APIs +CL_API_ENTRY /*CL_EXT_PREFIX__VERSION_1_2_DEPRECATED*/ cl_command_queue +clCreateCommandQueue(cl_context context, cl_device_id device, + cl_command_queue_properties properties, + cl_int *errcode_ret) +/* CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED */ { // NOLINT + auto func = paddle::lite::OpenCLLibrary::Get()->clCreateCommandQueue; + if (func != nullptr) { + return func(context, device, properties, errcode_ret); + } else { + if (errcode_ret != nullptr) *errcode_ret = CL_INVALID_PLATFORM; + return nullptr; + } +} diff --git a/paddle/fluid/lite/operators/CMakeLists.txt b/paddle/fluid/lite/operators/CMakeLists.txt index 09c05ecb6f5..484f7c8f4ac 100644 --- a/paddle/fluid/lite/operators/CMakeLists.txt +++ b/paddle/fluid/lite/operators/CMakeLists.txt @@ -1,52 +1,86 @@ set(op_DEPS ${tensor_lite} op_lite op_params_lite) -cc_library(fc_op_lite SRCS fc_op.cc DEPS ${op_DEPS}) -cc_library(relu_op_lite SRCS relu_op.cc DEPS 
${op_DEPS}) -cc_library(mul_op_lite SRCS mul_op.cc DEPS ${op_DEPS}) -cc_library(scale_op_lite SRCS scale_op.cc DEPS ${op_DEPS}) -cc_library(softmax_op_lite SRCS softmax_op.cc DEPS ${op_DEPS}) -cc_library(reshape_op_lite SRCS reshape_op.cc DEPS ${op_DEPS} ) -cc_library(feed_op_lite SRCS feed_op.cc DEPS ${op_DEPS}) -cc_library(fetch_op_lite SRCS fetch_op.cc DEPS ${op_DEPS}) -cc_library(io_copy_op_lite SRCS io_copy_op.cc DEPS ${op_DEPS}) -cc_library(activation_ops_lite SRCS activation_ops.cc DEPS ${op_DEPS}) -cc_library(elementwise_ops_lite SRCS elementwise_ops.cc DEPS ${op_DEPS}) -cc_library(mean_op_lite SRCS mean_op.cc DEPS ${op_DEPS}) -cc_library(fill_constant_op_lite SRCS fill_constant_op.cc DEPS ${op_DEPS}) -#cc_library(sgd_op_lite SRCS sgd_op.cc DEPS ${op_DEPS}) -cc_library(op_params_lite SRCS op_params.cc DEPS ${tensor_lite} any_lite framework_proto_lite) -cc_library(dropout_op_lite SRCS dropout_op.cc DEPS ${op_DEPS}) -cc_library(concat_op_lite SRCS concat_op.cc DEPS ${op_DEPS}) -cc_library(conv_op_lite SRCS conv_op.cc DEPS ${op_DEPS}) -cc_library(pool_op_lite SRCS pool_op.cc DEPS ${op_DEPS}) -cc_library(batch_norm_op_lite SRCS batch_norm.cc DEPS ${op_DEPS}) +lite_cc_library(conv_op_lite SRCS conv_op.cc DEPS ${op_DEPS}) +lite_cc_library(pool_op_lite SRCS pool_op.cc DEPS ${op_DEPS}) +lite_cc_library(fc_op_lite SRCS fc_op.cc DEPS ${op_DEPS}) +lite_cc_library(relu_op_lite SRCS relu_op.cc DEPS ${op_DEPS}) +lite_cc_library(mul_op_lite SRCS mul_op.cc DEPS ${op_DEPS}) +lite_cc_library(scale_op_lite SRCS scale_op.cc DEPS ${op_DEPS}) +lite_cc_library(softmax_op_lite SRCS softmax_op.cc DEPS ${op_DEPS}) +lite_cc_library(reshape_op_lite SRCS reshape_op.cc DEPS ${op_DEPS} ) +lite_cc_library(batch_norm_op_lite SRCS batch_norm_op.cc DEPS ${op_DEPS}) +lite_cc_library(feed_op_lite SRCS feed_op.cc DEPS ${op_DEPS}) +lite_cc_library(fetch_op_lite SRCS fetch_op.cc DEPS ${op_DEPS}) +lite_cc_library(io_copy_op_lite SRCS io_copy_op.cc DEPS ${op_DEPS}) +lite_cc_library(activation_ops_lite SRCS activation_ops.cc DEPS ${op_DEPS}) +lite_cc_library(elementwise_ops_lite SRCS elementwise_ops.cc DEPS ${op_DEPS}) +lite_cc_library(fusion_elementwise_activation_ops_lite SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops_lite ${op_DEPS}) +lite_cc_library(mean_op_lite SRCS mean_op.cc DEPS ${op_DEPS}) +lite_cc_library(fill_constant_op_lite SRCS fill_constant_op.cc DEPS ${op_DEPS}) +lite_cc_library(sgd_op_lite SRCS sgd_op.cc DEPS ${op_DEPS}) +lite_cc_library(uniform_random_op_lite SRCS uniform_random_op.cc DEPS ${op_DEPS}) +lite_cc_library(gru_op_lite SRCS gru_op.cc DEPS ${op_DEPS}) +#lite_cc_library(fusion_gru_op_lite SRCS fusion_gru_op.cc DEPS ${op_DEPS}) +lite_cc_library(reduce_ops_lite SRCS reduce_ops.cc DEPS ${op_DEPS}) +lite_cc_library(lookup_table_op_lite SRCS lookup_table_op.cc DEPS ${op_DEPS}) +lite_cc_library(sequence_reshape_op_lite SRCS sequence_reshape_op.cc DEPS ${op_DEPS}) + +lite_cc_library(op_params_lite SRCS op_params.cc DEPS ${tensor_lite} any_lite framework_proto_lite) +lite_cc_library(dropout_op_lite SRCS dropout_op.cc DEPS ${op_DEPS}) +lite_cc_library(concat_op_lite SRCS concat_op.cc DEPS ${op_DEPS}) +lite_cc_library(calib_op_lite SRCS calib_op.cc DEPS ${op_DEPS}) +lite_cc_library(split_op_lite SRCS split_op.cc DEPS ${op_DEPS}) +lite_cc_library(transpose_op_lite SRCS transpose_op.cc DEPS ${op_DEPS}) +lite_cc_library(fake_quant SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) +lite_cc_library(fake_dequant SRCS fake_dequantize_max_abs.cc DEPS ${op_DEPS}) set(ops_lite - fc_op_lite - 
relu_op_lite - mul_op_lite - scale_op_lite - softmax_op_lite - reshape_op_lite - feed_op_lite - fetch_op_lite - io_copy_op_lite - elementwise_ops_lite - mean_op_lite - fill_constant_op_lite - activation_ops_lite - dropout_op_lite - concat_op_lite - conv_op_lite - pool_op_lite - batch_norm_op_lite - CACHE INTERNAL "ops lite") + conv_op_lite + pool_op_lite + fc_op_lite + relu_op_lite + mul_op_lite + scale_op_lite + softmax_op_lite + reshape_op_lite + batch_norm_op_lite + feed_op_lite + fetch_op_lite + io_copy_op_lite + elementwise_ops_lite + fusion_elementwise_activation_ops_lite + mean_op_lite + fill_constant_op_lite + activation_ops_lite + dropout_op_lite + concat_op_lite + calib_op_lite + split_op_lite + transpose_op_lite + fake_quant + fake_dequant + sgd_op_lite + uniform_random_op_lite + gru_op_lite + reduce_ops_lite + lookup_table_op_lite + sequence_reshape_op_lite + CACHE INTERNAL "ops lite") lite_cc_test(test_fc_op_lite SRCS fc_op_test.cc DEPS fc_op_lite memory_lite X86_DEPS fc_compute_x86 - ARM_DEPS fc_compute_arm) + ARM_DEPS fc_compute_arm) +lite_cc_test(test_pool_op_lite SRCS pool_op_test.cc + DEPS pool_op_lite memory_lite + ARM_DEPS pool_compute_arm) lite_cc_test(test_scale_op_lite SRCS scale_op_test.cc DEPS scale_op_lite memory_lite) lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite memory_lite) lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite) +lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite) lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite) +lite_cc_test(test_calib_op_lite SRCS calib_op_test.cc DEPS calib_op_lite memory_lite ARM_DEPS calib_compute_arm) +lite_cc_test(test_fusion_elementwise_activation_ops_lite + SRCS fusion_elementwise_activation_ops_test.cc + DEPS fusion_elementwise_activation_ops_lite memory_lite) +lite_cc_test(test_transpose_op_lite SRCS transpose_op_test.cc DEPS transpose_op_lite memory_lite) diff --git a/paddle/fluid/lite/operators/activation_ops.cc b/paddle/fluid/lite/operators/activation_ops.cc index 8cda67af14a..cea2855eb72 100644 --- a/paddle/fluid/lite/operators/activation_ops.cc +++ b/paddle/fluid/lite/operators/activation_ops.cc @@ -30,6 +30,7 @@ class ActivationOp : public OpLite { bool InferShape() const override { param_.Out->Resize(param_.X->dims()); + param_.Out->raw_tensor().set_lod(param_.X->raw_tensor().lod()); return true; } @@ -72,6 +73,21 @@ class ActivationGradOp : public OpLite { param_.Out_grad = GetVar(scope, Out_grad_name); param_.X_grad = GetMutableVar(scope, X_grad_name); + + if (opdesc.HasInput("X")) { + auto X_name = opdesc.Input("X").front(); + param_.X = GetVar(scope, X_name); + } else { + param_.X = param_.X_grad; + } + + if (opdesc.HasInput("Out")) { + auto Out_name = opdesc.Input("Out").front(); + param_.Out = GetVar(scope, Out_name); + } else { + param_.Out = param_.Out_grad; + } + return true; } @@ -89,6 +105,7 @@ class ActivationGradOp : public OpLite { } // namespace paddle REGISTER_LITE_OP(square, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp); #ifdef LITE_WITH_X86 REGISTER_LITE_OP(square_grad, paddle::lite::operators::ActivationGradOp); #endif diff --git a/paddle/fluid/lite/operators/batch_norm.h b/paddle/fluid/lite/operators/batch_norm.h deleted file mode 100644 index 90815768e6b..00000000000 --- a/paddle/fluid/lite/operators/batch_norm.h +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2019 
PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "paddle/fluid/lite/core/kernel.h" -#include "paddle/fluid/lite/core/op_lite.h" -#include "paddle/fluid/lite/core/scope.h" -#include "paddle/fluid/lite/operators/op_params.h" -#include "paddle/fluid/lite/utils/all.h" - -namespace paddle { -namespace lite { -namespace operators { - -class BatchNormOpLite : public OpLite { - public: - BatchNormOpLite() {} - - explicit BatchNormOpLite(const std::string &type) : OpLite(type) {} - - bool CheckShape() const override; - - bool InferShape() const override; - - void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } - // TODO(Superjomn) replace framework::OpDesc with a lite one. - bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { - auto x = op_desc.Input("X").front(); - auto bias = op_desc.Input("Bias").front(); - auto mean = op_desc.Input("Mean").front(); - auto scale = op_desc.Input("Scale").front(); - auto variance = op_desc.Input("Variance").front(); - - auto out = op_desc.Output("Y").front(); - auto mean_out = op_desc.Output("MeanOut").front(); - auto var_out = op_desc.Output("VarianceOut").front(); - auto saved_mean = op_desc.Output("SavedMean").front(); - auto saved_var = op_desc.Output("SavedVariance").front(); - - auto *var = scope->FindVar(x); - param_.x = var->GetMutable(); - var = scope->FindVar(bias); - param_.bias = var->GetMutable(); - var = scope->FindVar(mean); - param_.mean = var->GetMutable(); - var = scope->FindVar(scale); - param_.scale = var->GetMutable(); - var = scope->FindVar(variance); - param_.var = var->GetMutable(); - var = scope->FindVar(out); - param_.out = var->GetMutable(); - var = scope->FindVar(mean_out); - param_.mean_out = var->GetMutable(); - var = scope->FindVar(var_out); - param_.var_out = var->GetMutable(); - var = scope->FindVar(saved_mean); - param_.saved_mean = var->GetMutable(); - var = scope->FindVar(saved_var); - param_.saved_var = var->GetMutable(); - - param_.eps = op_desc.GetAttr("epsilon"); - - return true; - } - - std::string DebugString() const override { return "batch_norm"; } - - private: - mutable BatchNormParam param_; -}; - -} // namespace operators -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/operators/batch_norm_op.cc b/paddle/fluid/lite/operators/batch_norm_op.cc new file mode 100644 index 00000000000..b6ef87732de --- /dev/null +++ b/paddle/fluid/lite/operators/batch_norm_op.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/operators/batch_norm_op.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool BatchNormOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.x);
+  CHECK_OR_FALSE(param_.bias);
+  CHECK_OR_FALSE(param_.scale);
+  CHECK_OR_FALSE(param_.mean);
+  CHECK_OR_FALSE(param_.variance);
+  CHECK_OR_FALSE(param_.y);
+  if (!param_.is_test) {
+    CHECK_OR_FALSE(param_.mean_out);
+    CHECK_OR_FALSE(param_.variance_out);
+    CHECK_OR_FALSE(param_.saved_mean);
+    CHECK_OR_FALSE(param_.saved_variance);
+  }
+  auto x_dims = param_.x->dims();
+  auto scale_dims = param_.scale->dims();
+  auto bias_dims = param_.bias->dims();
+  auto mean_dims = param_.mean->dims();
+  auto variance_dims = param_.variance->dims();
+  CHECK(x_dims.size() >= 2 && x_dims.size() <= 5)
+      << "Input X must have 2 to 5 dimensions.";
+  CHECK_EQ(scale_dims.size(), 1UL) << "Input Scale must have 1 dimension.";
+  CHECK_EQ(bias_dims.size(), 1UL) << "Input Bias must have 1 dimension.";
+  CHECK_EQ(mean_dims.size(), 1UL) << "Input Mean must have 1 dimension.";
+  CHECK_EQ(variance_dims.size(), 1UL)
+      << "Input Variance must have 1 dimension.";
+  return true;
+}
+
+bool BatchNormOp::InferShape() const {
+  auto x_dims = param_.x->dims();
+  int64_t channel_size = 0;
+  switch (param_.data_layout) {
+    case DATALAYOUT(kNCHW):
+      channel_size = x_dims[1];
+      break;
+    // case DATALAYOUT(kNHWC):
+    //   channel_size = x_dims[x_dims.size() - 1];
+    //   break;
+    default:
+      LOG(FATAL) << "Unknown storage order: "
+                 << DataLayoutToStr(param_.data_layout);
+      break;
+  }
+  if (!param_.is_test) {
+    param_.mean_out->Resize({channel_size});
+    param_.variance_out->Resize({channel_size});
+    param_.saved_mean->Resize({channel_size});
+    param_.saved_variance->Resize({channel_size});
+  }
+  param_.y->Resize(x_dims);
+  return true;
+}
+
+bool BatchNormOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+  param_.x =
+      scope->FindVar(op_desc.Input("X").front())->GetMutable<lite::Tensor>();
+  param_.bias = scope->FindVar(op_desc.Input("Bias").front())
+                    ->GetMutable<lite::Tensor>();
+  param_.scale = scope->FindVar(op_desc.Input("Scale").front())
+                     ->GetMutable<lite::Tensor>();
+  param_.mean = scope->FindVar(op_desc.Input("Mean").front())
+                    ->GetMutable<lite::Tensor>();
+  param_.variance = scope->FindVar(op_desc.Input("Variance").front())
+                        ->GetMutable<lite::Tensor>();
+  param_.y =
+      scope->FindVar(op_desc.Output("Y").front())->GetMutable<lite::Tensor>();
+  param_.is_test = op_desc.GetAttr<int>("is_test");
+  param_.use_global_stats = op_desc.GetAttr<bool>("use_global_stats");
+  if (!param_.is_test) {
+    param_.mean_out = scope->FindVar(op_desc.Output("MeanOut").front())
+                          ->GetMutable<lite::Tensor>();
+    param_.variance_out = scope->FindVar(op_desc.Output("VarianceOut").front())
+                              ->GetMutable<lite::Tensor>();
+    param_.saved_mean = scope->FindVar(op_desc.Output("SavedMean").front())
+                            ->GetMutable<lite::Tensor>();
+    param_.saved_variance =
+        scope->FindVar(op_desc.Output("SavedVariance").front())
+            ->GetMutable<lite::Tensor>();
+  }
+  param_.epsilon = op_desc.GetAttr<float>("epsilon");
+  param_.momentum = op_desc.GetAttr<float>("momentum");
+  std::string data_layout = op_desc.GetAttr<std::string>("data_layout");
+  CHECK_EQ(data_layout, "NCHW") <<
"TODO(hong19860320): Only support NCHW."; + // param_.data_layout = StringToDataLayout(data_layout); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(batch_norm, paddle::lite::operators::BatchNormOp); diff --git a/paddle/fluid/lite/operators/batch_norm_op.h b/paddle/fluid/lite/operators/batch_norm_op.h new file mode 100644 index 00000000000..30e8747319b --- /dev/null +++ b/paddle/fluid/lite/operators/batch_norm_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class BatchNormOp : public OpLite { + public: + BatchNormOp() {} + explicit BatchNormOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "batch_norm"; } + + private: + mutable BatchNormParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/batch_norm_op_test.cc b/paddle/fluid/lite/operators/batch_norm_op_test.cc new file mode 100644 index 00000000000..9fb02759722 --- /dev/null +++ b/paddle/fluid/lite/operators/batch_norm_op_test.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/operators/batch_norm_op.h" +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +TEST(batch_norm_op_lite, test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* scale = scope.Var("scale")->GetMutable(); + auto* bias = scope.Var("bias")->GetMutable(); + auto* mean = scope.Var("mean")->GetMutable(); + auto* variance = scope.Var("variance")->GetMutable(); + auto* y = scope.Var("y")->GetMutable(); + x->Resize({2, 32, 10, 20}); + auto x_dims = x->dims(); + const int64_t channel_size = x_dims[1]; // NCHW + scale->Resize({channel_size}); + bias->Resize({channel_size}); + mean->Resize({channel_size}); + variance->Resize({channel_size}); + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("batch_norm"); + desc.SetInput("X", {"x"}); + desc.SetInput("Scale", {"scale"}); + desc.SetInput("Bias", {"bias"}); + desc.SetInput("Mean", {"mean"}); + desc.SetInput("Variance", {"variance"}); + desc.SetOutput("Y", {"y"}); + desc.SetAttr("is_test", static_cast(1)); + desc.SetAttr("use_global_stats", false); + desc.SetAttr("epsilon", 1e-5f); + desc.SetAttr("momentum", 0.9f); + desc.SetAttr("data_layout", std::string("NCHW")); + + BatchNormOp batch_norm("batch_norm"); + + batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}}); + batch_norm.Attach(desc, &scope); + batch_norm.CheckShape(); + batch_norm.InferShape(); + + // check output dims + auto y_dims = y->dims(); + CHECK_EQ(y_dims.size(), x_dims.size()); + for (size_t i = 0; i < y_dims.size(); i++) { + CHECK_EQ(y_dims[i], x_dims[i]); + } +} + +TEST(batch_norm_op_lite, test_enable_is_test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* scale = scope.Var("scale")->GetMutable(); + auto* bias = scope.Var("bias")->GetMutable(); + auto* mean = scope.Var("mean")->GetMutable(); + auto* variance = scope.Var("variance")->GetMutable(); + auto* y = scope.Var("y")->GetMutable(); + auto* mean_out = scope.Var("mean_out")->GetMutable(); + auto* variance_out = scope.Var("variance_out")->GetMutable(); + auto* saved_mean = scope.Var("saved_mean")->GetMutable(); + auto* saved_variance = scope.Var("saved_variance")->GetMutable(); + x->Resize({2, 32, 10, 20}); + auto x_dims = x->dims(); + const int64_t channel_size = x_dims[1]; // NCHW + scale->Resize({channel_size}); + bias->Resize({channel_size}); + mean->Resize({channel_size}); + variance->Resize({channel_size}); + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("batch_norm"); + desc.SetInput("X", {"x"}); + desc.SetInput("Scale", {"scale"}); + desc.SetInput("Bias", {"bias"}); + desc.SetInput("Mean", {"mean"}); + desc.SetInput("Variance", {"variance"}); + desc.SetOutput("Y", {"y"}); + desc.SetOutput("MeanOut", {"mean_out"}); + desc.SetOutput("VarianceOut", {"variance_out"}); + desc.SetOutput("SavedMean", {"saved_mean"}); + desc.SetOutput("SavedVariance", {"saved_variance"}); + desc.SetAttr("is_test", static_cast(0)); + desc.SetAttr("use_global_stats", false); + desc.SetAttr("epsilon", 1e-5f); + desc.SetAttr("momentum", 0.9f); + desc.SetAttr("data_layout", std::string("NCHW")); + + BatchNormOp batch_norm("batch_norm"); + + batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}}); + batch_norm.Attach(desc, &scope); + batch_norm.CheckShape(); + batch_norm.InferShape(); + + // check output dims + auto y_dims = y->dims(); + CHECK_EQ(y_dims.size(), x_dims.size()); + for (size_t i = 0; i < y_dims.size(); i++) { + 
CHECK_EQ(y_dims[i], x_dims[i]); + } + auto mean_out_dims = mean_out->dims(); + auto variance_out_dims = variance_out->dims(); + auto saved_mean_dims = saved_mean->dims(); + auto saved_variance_dims = saved_variance->dims(); + CHECK_EQ(mean_out_dims.size(), 1UL); + CHECK_EQ(variance_out_dims.size(), 1UL); + CHECK_EQ(saved_mean_dims.size(), 1UL); + CHECK_EQ(saved_variance_dims.size(), 1UL); + CHECK_EQ(mean_out_dims[0], channel_size); + CHECK_EQ(variance_out_dims[0], channel_size); + CHECK_EQ(saved_mean_dims[0], channel_size); + CHECK_EQ(saved_variance_dims[0], channel_size); +} + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/calib_op.cc b/paddle/fluid/lite/operators/calib_op.cc new file mode 100644 index 00000000000..289ef40e179 --- /dev/null +++ b/paddle/fluid/lite/operators/calib_op.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/operators/calib_op.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool CalibOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.input); + CHECK_OR_FALSE(param_.output); + return true; +} +bool CalibOpLite::InferShape() const { + param_.output->Resize(param_.input->dims()); + return true; +} + +bool CalibOpLite::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + auto x_var = scope->FindVar(opdesc.Input("Input").front()); + auto output_var = scope->FindVar(opdesc.Output("Out").front()); + CHECK(x_var); + CHECK(output_var); + param_.input = const_cast(&(x_var->Get())); + param_.output = output_var->GetMutable(); + std::vector input_arg_names = opdesc.InputArgumentNames(); + if (opdesc.HasAttr("scale")) { + param_.scale = opdesc.GetAttr("scale"); + } + CHECK(param_.input) << "Input(X) of CalibOp should not be null."; + CHECK(param_.output) << "Output(Out) of CalibOp should not be null."; + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(calib, paddle::lite::operators::CalibOpLite); diff --git a/paddle/fluid/lite/operators/calib_op.h b/paddle/fluid/lite/operators/calib_op.h new file mode 100644 index 00000000000..1d93f6ea9a5 --- /dev/null +++ b/paddle/fluid/lite/operators/calib_op.h @@ -0,0 +1,59 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/operators/op_params.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +/* + * The data types used by the two adjacent layers in the model should + * be the same. When the two operators accept different data types, + * we may need to implicitly add a data type conversion operator. + * Currently, this operator only supports mutual conversion of int8 + * and float32 types. + */ +class CalibOpLite : public OpLite { + public: + CalibOpLite() {} + + explicit CalibOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope); + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "calib"; } + + private: + mutable CalibParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/calib_op_test.cc b/paddle/fluid/lite/operators/calib_op_test.cc new file mode 100644 index 00000000000..deab7368b4b --- /dev/null +++ b/paddle/fluid/lite/operators/calib_op_test.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
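+// A sketch of the conversions the calib kernels exercised below are assumed
+// to perform, taking `scale` as the dequantization factor (an assumption;
+// the kernel implementations are outside this patch):
+//
+//   int8_to_fp32:  y_f32 = scale * x_i8
+//   fp32_to_int8:  y_i8  = clamp(round(x_f32 / scale), -127, 127)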
+#include "paddle/fluid/lite/operators/calib_op.h" +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +#ifdef LITE_WITH_ARM +TEST(calib_op_lite, TestARM) { + // prepare variables + Scope scope; + auto* x = scope.Var("Input")->GetMutable(); + auto* output = scope.Var("output")->GetMutable(); + x->Resize(DDim(std::vector({1, 10, 20}))); + output->Resize(DDim(std::vector{1, 10, 20})); + + // set data + for (int i = 0; i < 10 * 20; i++) { + x->mutable_data()[i] = i; + } + for (int i = 0; i < 10 * 20; i++) { + output->mutable_data()[i] = 0.; + } + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("calib"); + desc.SetInput("Input", {"Input"}); + desc.SetOutput("Out", {"output"}); + desc.SetAttr("scale", 10.0f); + + CalibOpLite calib("calib"); + + calib.SetValidPlaces({Place{TARGET(kARM), PRECISION(kInt8)}}); + calib.Attach(desc, &scope); + auto kernels = calib.CreateKernels({Place{TARGET(kARM), PRECISION(kInt8)}}); + ASSERT_FALSE(kernels.empty()); +} +#endif + +} // namespace operators +} // namespace lite +} // namespace paddle + +#ifdef LITE_WITH_ARM +USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8); +USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32); +#endif diff --git a/paddle/fluid/lite/operators/concat_op.cc b/paddle/fluid/lite/operators/concat_op.cc index e51d6e0d349..f09b3b32d62 100644 --- a/paddle/fluid/lite/operators/concat_op.cc +++ b/paddle/fluid/lite/operators/concat_op.cc @@ -21,7 +21,7 @@ namespace lite { namespace operators { bool ConcatOpLite::CheckShape() const { - CHECK_GT_OR_FALSE(param_.x.size(), 1UL); + CHECK_GE_OR_FALSE(param_.x.size(), 1UL); CHECK_OR_FALSE(param_.output); return true; } @@ -50,6 +50,10 @@ bool ConcatOpLite::InferShape() const { } // Set output dims param_.output->Resize(lite::DDim(out_dims)); + if (n == 1) { + param_.output->ShareDataWith(param_.x[0]->raw_tensor()); + } + param_.output->raw_tensor().set_lod(param_.x[0]->raw_tensor().lod()); return true; } diff --git a/paddle/fluid/lite/operators/conv_op.cc b/paddle/fluid/lite/operators/conv_op.cc index 63838efd6fe..948e2a0641c 100644 --- a/paddle/fluid/lite/operators/conv_op.cc +++ b/paddle/fluid/lite/operators/conv_op.cc @@ -24,31 +24,49 @@ bool ConvOpLite::CheckShape() const { CHECK_OR_FALSE(param_.x); CHECK_OR_FALSE(param_.output); CHECK_OR_FALSE(param_.filter); - return true; -} + // bias is optional. 
-bool ConvOpLite::InferShape() const { - auto in_dims = param_.x->dims(); - auto filter_dims = param_.filter->dims(); - std::vector strides = param_.strides; - std::vector paddings = param_.paddings; - int groups = param_.groups; - std::vector dilations = param_.dilations; + const auto in_dims = param_.x->dims(); + const auto filter_dims = param_.filter->dims(); CHECK_OR_FALSE(in_dims.size() == 4 || in_dims.size() == 5); + CHECK_EQ_OR_FALSE(in_dims.size(), filter_dims.size()); - CHECK_OR_FALSE(in_dims.size() - strides.size() == 2U); - CHECK_EQ_OR_FALSE(paddings.size(), strides.size()); - CHECK_EQ_OR_FALSE(in_dims[1], filter_dims[1] * groups); - CHECK_EQ_OR_FALSE(filter_dims[0] % groups, 0); + CHECK_OR_FALSE(in_dims.size() - param_.strides.size() == 2U); + CHECK_EQ_OR_FALSE(param_.paddings.size(), param_.strides.size()); + + CHECK_EQ_OR_FALSE(in_dims[1], filter_dims[1] * param_.groups); + CHECK_EQ_OR_FALSE(filter_dims[0] % param_.groups, 0); + CHECK_EQ_OR_FALSE(filter_dims.size(), 4UL); + + return true; +} + +inline int ConvOutputSize(int input_size, int filter_size, int dilation, + int padding, int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + CHECK_GT_OR_FALSE(output_size, 0); + + return output_size; +} + +bool ConvOpLite::InferShape() const { + const auto in_dims = param_.x->dims(); + const auto filter_dims = param_.filter->dims(); std::vector output_shape({in_dims[0], filter_dims[0]}); - for (size_t i = 0; i < strides.size(); ++i) { - output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - dilations[i], paddings[i], - strides[i])); + for (size_t i = 0; i < param_.strides.size(); ++i) { + output_shape.push_back( + ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], param_.dilations[i], + param_.paddings[i], param_.strides[i])); } + + // Set output dims param_.output->Resize(lite::DDim(output_shape)); + + // share LoD + // param_.output->set_lod(param_.x->lod()); return true; } diff --git a/paddle/fluid/lite/operators/conv_op.h b/paddle/fluid/lite/operators/conv_op.h index 79726e0284b..567bc97130f 100644 --- a/paddle/fluid/lite/operators/conv_op.h +++ b/paddle/fluid/lite/operators/conv_op.h @@ -26,29 +26,6 @@ namespace paddle { namespace lite { namespace operators { -inline int ConvOutputSize(int input_size, int filter_size, int dilation, - int padding, int stride) { - const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; - CHECK_OR_FALSE(output_size > 0); - - return output_size; -} - -inline bool IsExpand(const std::vector& filter_dim, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations) { - bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; - for (size_t j = 0; j < strides.size(); ++j) { - filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); - strides_1 = strides_1 && (strides[j] == 1); - padding_0 = padding_0 && (paddings[j] == 0); - dilation_1 = dilation_1 && (dilations[j] == 1); - } - return !(filter_1 && strides_1 && padding_0 && dilation_1); -} - class ConvOpLite : public OpLite { public: ConvOpLite() {} @@ -59,7 +36,6 @@ class ConvOpLite : public OpLite { bool InferShape() const override; - void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } // TODO(Superjomn) replace framework::OpDesc with a lite one. 
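+  // A minimal sketch of the op_desc the AttachImpl below expects (argument
+  // names as read by the code; the variable names and attribute values are
+  // illustrative only):
+  //
+  //   cpp::OpDesc desc;
+  //   desc.SetType("conv2d");
+  //   desc.SetInput("Input", {"x"});
+  //   desc.SetInput("Filter", {"w"});
+  //   desc.SetOutput("Output", {"y"});
+  //   desc.SetAttr("strides", std::vector<int>{1, 1});
+  //   desc.SetAttr("paddings", std::vector<int>{1, 1});
+  //   desc.SetAttr("groups", 1);
+  //   desc.SetAttr("dilations", std::vector<int>{1, 1});
+  //   desc.SetAttr("fuse_relu", false);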
bool AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) override {
     auto X = op_desc.Input("Input").front();
@@ -70,36 +46,52 @@ class ConvOpLite : public OpLite {
     param_.filter = scope->FindVar(Filter)->GetMutable<lite::Tensor>();
     param_.output = scope->FindVar(Out)->GetMutable<lite::Tensor>();
+    param_.strides = op_desc.GetAttr<std::vector<int>>("strides");
+    param_.paddings = op_desc.GetAttr<std::vector<int>>("paddings");
+    param_.groups = op_desc.GetAttr<int>("groups");
+    param_.dilations = op_desc.GetAttr<std::vector<int>>("dilations");
+
+    // optional params
     std::vector<std::string> input_arg_names = op_desc.InputArgumentNames();
     if (std::find(input_arg_names.begin(), input_arg_names.end(), "Bias") !=
         input_arg_names.end()) {
       auto bias_arguments = op_desc.Input("Bias");
-      if (bias_arguments.size() != 0) {
+      if (bias_arguments.size() > 0) {
         auto bias_var = scope->FindVar(bias_arguments.front());
         if (bias_var != nullptr) {
-          param_.bias = &bias_var->Get<lite::Tensor>();
+          param_.bias =
+              const_cast<lite::Tensor*>(&(bias_var->Get<lite::Tensor>()));
         }
       }
     }
     if (std::find(input_arg_names.begin(), input_arg_names.end(),
                   "ResidualData") != input_arg_names.end()) {
-      auto res_argument = op_desc.Input("ResidualData");
-      if (res_argument.size() != 0) {
-        auto residual_data_var = scope->FindVar(res_argument.front());
+      auto res_data_arguments = op_desc.Input("ResidualData");
+      if (res_data_arguments.size() > 0) {
+        auto residual_data_var = scope->FindVar(res_data_arguments.front());
         if (residual_data_var != nullptr) {
-          param_.residualData = &residual_data_var->Get<lite::Tensor>();
+          param_.residualData = const_cast<lite::Tensor*>(
+              &(residual_data_var->Get<lite::Tensor>()));
         }
       }
     }
-
-    param_.strides = op_desc.GetAttr<std::vector<int>>("strides");
-    param_.paddings = op_desc.GetAttr<std::vector<int>>("paddings");
-    param_.groups = op_desc.GetAttr<int>("groups");
-    param_.dilations = op_desc.GetAttr<std::vector<int>>("dilations");
-
+    param_.fuse_relu = op_desc.GetAttr<bool>("fuse_relu");
+    // For Int8
+    if (op_desc.HasAttr("enable_int8")) {
+      param_.enable_int8 = op_desc.GetAttr<bool>("enable_int8");
+      if (op_desc.HasAttr("input_scale"))
+        param_.input_scale = op_desc.GetAttr<float>("input_scale");
+      if (op_desc.HasAttr("weight_scale"))
+        param_.weight_scale =
+            op_desc.GetAttr<std::vector<float>>("weight_scale");
+      if (op_desc.HasAttr("output_scale"))
+        param_.output_scale = op_desc.GetAttr<float>("output_scale");
+    }
     return true;
   }

+  void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
+
   std::string DebugString() const override { return "conv2d"; }

  private:
diff --git a/paddle/fluid/lite/operators/dropout_op.cc b/paddle/fluid/lite/operators/dropout_op.cc
index b5b50dc3d16..cf31b90c9f1 100644
--- a/paddle/fluid/lite/operators/dropout_op.cc
+++ b/paddle/fluid/lite/operators/dropout_op.cc
@@ -52,13 +52,16 @@ class DropoutOpLite : public OpLite {
     param_.mask = GetMutableVar<lite::Tensor>(scope, Mask);

     param_.dropout_prob = op_desc.GetAttr<float>("dropout_prob");
-    if (op_desc.HasAttr("axis")) {
-      param_.is_test = op_desc.GetAttr("is_test");
-    }
+    param_.is_test = true;
+    // TODO(sangoly): `is_test` has different attr types on x86 and ARM; set
+ // if (op_desc.HasAttr("is_test")) { + // param_.is_test = op_desc.GetAttr("is_test"); + // } param_.fix_seed = op_desc.GetAttr("fix_seed"); param_.seed = op_desc.GetAttr("seed"); param_.dropout_implementation = - op_desc.GetAttr("dropout_implementation"); + op_desc.GetAttr("dropout_implementation"); return true; } diff --git a/paddle/fluid/lite/operators/elementwise_ops.cc b/paddle/fluid/lite/operators/elementwise_ops.cc index b400b1ab26c..11ca4a4acce 100644 --- a/paddle/fluid/lite/operators/elementwise_ops.cc +++ b/paddle/fluid/lite/operators/elementwise_ops.cc @@ -12,92 +12,72 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/operators/elementwise_ops.h" #include "paddle/fluid/lite/core/op_registry.h" namespace paddle { namespace lite { namespace operators { -class ElementwiseOp : public OpLite { - public: - explicit ElementwiseOp(const std::string& type) : OpLite(type) {} - - bool CheckShape() const override { - CHECK_OR_FALSE(param_.X); - CHECK_OR_FALSE(param_.Y); - CHECK_OR_FALSE(param_.Out); - return true; - } - - bool InferShape() const override { - CHECK_OR_FALSE(param_.X->dims().size() >= param_.Y->dims().size()); - param_.Out->Resize(param_.X->dims()); - return true; - } - - bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override { - auto X_name = opdesc.Input("X").front(); - auto Y_name = opdesc.Input("Y").front(); - auto Out_name = opdesc.Output("Out").front(); - - param_.X = GetVar(scope, X_name); - param_.Y = GetVar(scope, Y_name); - param_.Out = GetMutableVar(scope, Out_name); - param_.axis = opdesc.GetAttr("axis"); - return true; - } - - void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } - - std::string DebugString() const override { return "elementwise_op"; } - - private: - mutable operators::ElementwiseParam param_; -}; +bool ElementwiseOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + return true; +} + +bool ElementwiseOp::InferShape() const { + CHECK_OR_FALSE(param_.X->dims().size() >= param_.Y->dims().size()); + param_.Out->Resize(param_.X->dims()); + param_.Out->raw_tensor().set_lod(param_.X->lod()); + return true; +} + +bool ElementwiseOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { + auto X_name = opdesc.Input("X").front(); + auto Y_name = opdesc.Input("Y").front(); + auto Out_name = opdesc.Output("Out").front(); + + param_.X = GetVar(scope, X_name); + param_.Y = GetVar(scope, Y_name); + param_.Out = GetMutableVar(scope, Out_name); + param_.axis = opdesc.GetAttr("axis"); + return true; +} #ifdef LITE_WITH_X86 -class ElementwiseGradExplicitOp : public OpLite { - public: - explicit ElementwiseGradExplicitOp(const std::string& type) : OpLite(type) {} - - bool CheckShape() const override { - CHECK_OR_FALSE(param_.Y); - CHECK_OR_FALSE(param_.X_grad); - CHECK_OR_FALSE(param_.Y_grad); - CHECK_OR_FALSE(param_.Out_grad); - return true; - } - - bool InferShape() const override { - param_.X_grad->Resize(param_.Out_grad->dims()); - param_.Y_grad->Resize(param_.Y->dims()); - return true; +bool ElementwiseGradExplicitOp::CheckShape() const { + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.X_grad); + CHECK_OR_FALSE(param_.Out_grad); + return true; +} + +bool ElementwiseGradExplicitOp::InferShape() const { + param_.X_grad->Resize(param_.Out_grad->dims()); + if (param_.Y_grad) param_.Y_grad->Resize(param_.Y->dims()); + return 
true; +} + +bool ElementwiseGradExplicitOp::AttachImpl(const cpp::OpDesc& opdesc, + lite::Scope* scope) { + CHECK_EQ(opdesc.InputArgumentNames().size(), 2UL); + auto Y_name = opdesc.Input("Y").front(); + auto Out_name = opdesc.Input(framework::GradVarName("Out")).front(); + auto X_grad = opdesc.Output(framework::GradVarName("X")).front(); + + if (opdesc.Output(framework::GradVarName("Y")).size() > 0) { + auto Y_grad = opdesc.Output(framework::GradVarName("Y")).front(); + param_.Y_grad = GetMutableVar(scope, Y_grad); } + param_.Y = GetVar(scope, Y_name); + param_.Out_grad = GetVar(scope, Out_name); + param_.X_grad = GetMutableVar(scope, X_grad); + param_.axis = opdesc.GetAttr("axis"); - bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override { - CHECK_EQ(opdesc.InputArgumentNames().size(), 1UL); - auto Out_name = opdesc.Input(framework::GradVarName("Out")).front(); - auto X_name = opdesc.Output(framework::GradVarName("X")).front(); - auto Y_name = opdesc.Output(framework::GradVarName("Y")).front(); - - param_.Out_grad = GetVar(scope, Out_name); - param_.X_grad = GetMutableVar(scope, X_name); - param_.Y_grad = GetMutableVar(scope, Y_name); - param_.axis = opdesc.GetAttr("axis"); - - return true; - } - - void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } - - std::string DebugString() const override { - return "elementwise_grad_explicit_op"; - } + return true; +} - private: - mutable operators::ElementwiseGradParam param_; -}; #endif } // namespace operators diff --git a/paddle/fluid/lite/operators/elementwise_ops.h b/paddle/fluid/lite/operators/elementwise_ops.h new file mode 100644 index 00000000000..3a0199fab0e --- /dev/null +++ b/paddle/fluid/lite/operators/elementwise_ops.h @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
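`ElementwiseGradExplicitOp::AttachImpl` above now treats `Y@GRAD` as optional: the output is wired only when the op desc actually declares a non-empty argument for it, and `MulGradOpLite` later in this patch applies the same guard. A toy sketch of the idea, using a plain map as a hypothetical stand-in for the op-desc output table:

    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    using OutputMap = std::map<std::string, std::vector<std::string>>;

    // Returns the variable name for `arg` if declared, or "" when absent.
    std::string OptionalOutput(const OutputMap& outputs, const std::string& arg) {
      auto it = outputs.find(arg);
      if (it == outputs.end() || it->second.empty()) return "";
      return it->second.front();
    }

    int main() {
      OutputMap outputs{{"X@GRAD", {"x_grad"}}};  // Y@GRAD intentionally absent
      std::cout << "X@GRAD -> " << OptionalOutput(outputs, "X@GRAD") << "\n";
      std::cout << "Y@GRAD declared: "
                << (OptionalOutput(outputs, "Y@GRAD").empty() ? "no" : "yes")
                << "\n";
      return 0;
    }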
+ +#pragma once +#include +#include +#include "paddle/fluid/lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class ElementwiseOp : public OpLite { + public: + explicit ElementwiseOp(const std::string& op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; + + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "elementwise_op"; } + + private: + mutable operators::ElementwiseParam param_; +}; + +#ifdef LITE_WITH_X86 +class ElementwiseGradExplicitOp : public OpLite { + public: + explicit ElementwiseGradExplicitOp(const std::string& type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; + + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "elementwise_grad_explicit_op"; + } + + private: + mutable operators::ElementwiseGradParam param_; +}; +#endif + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/arm/math/scale.h b/paddle/fluid/lite/operators/fake_dequantize_max_abs.cc similarity index 72% rename from paddle/fluid/lite/arm/math/scale.h rename to paddle/fluid/lite/operators/fake_dequantize_max_abs.cc index 97a5f79fc6b..8c3c8c7fd79 100644 --- a/paddle/fluid/lite/arm/math/scale.h +++ b/paddle/fluid/lite/operators/fake_dequantize_max_abs.cc @@ -12,17 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once +#include "paddle/fluid/lite/operators/fake_dequantize_max_abs.h" +#include "paddle/fluid/lite/core/op_registry.h" namespace paddle { namespace lite { -namespace arm { -namespace math { - -template -void scale(const T* din, T* dout, int num, float scale, float bias); - -} // namespace math -} // namespace arm +namespace operators {} // namespace operators } // namespace lite } // namespace paddle + +REGISTER_LITE_OP(fake_dequantize_max_abs, + paddle::lite::operators::FakeDequantizeMaxAbsOpLite); diff --git a/paddle/fluid/lite/operators/fake_dequantize_max_abs.h b/paddle/fluid/lite/operators/fake_dequantize_max_abs.h new file mode 100644 index 00000000000..de48c413041 --- /dev/null +++ b/paddle/fluid/lite/operators/fake_dequantize_max_abs.h @@ -0,0 +1,64 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
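The `fake_dequantize_max_abs` op registered above deliberately keeps `CheckShape`/`InferShape` as no-ops and only wires `X`, `Scale`, `Out` and `max_range` (the class body follows in the new header). For orientation, the corresponding fluid kernel is generally understood to compute `out = scale * x / max_range` elementwise; a standalone sketch under that assumption, not the kernel shipped here:

    #include <cstdio>
    #include <vector>

    // Assumed dequantization rule: out[i] = scale * x[i] / max_range.
    std::vector<float> fake_dequantize_max_abs(const std::vector<float>& x,
                                               float scale, float max_range) {
      std::vector<float> out(x.size());
      for (size_t i = 0; i < x.size(); ++i) out[i] = scale * x[i] / max_range;
      return out;
    }

    int main() {
      // Int8 case: max_range = 127, per-tensor scale recovered from training.
      auto out = fake_dequantize_max_abs({127.f, -64.f, 0.f}, 0.5f, 127.f);
      for (float v : out) std::printf("%g ", v);  // 0.5 -0.251969 0
      std::printf("\n");
      return 0;
    }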
+ +#pragma once + +#include +#include +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/operators/op_params.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class FakeDequantizeMaxAbsOpLite : public OpLite { + public: + FakeDequantizeMaxAbsOpLite() {} + + explicit FakeDequantizeMaxAbsOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override { return true; } + + bool InferShape() const override { return true; } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { + auto x = op_desc.Input("X").front(); + auto in_scale = op_desc.Input("Scale").front(); + + auto out = op_desc.Output("Out").front(); + + param_.x = scope->FindVar(x)->GetMutable(); + param_.in_scale = scope->FindVar(in_scale)->GetMutable(); + + param_.out = scope->FindVar(out)->GetMutable(); + param_.max_range = op_desc.GetAttr("max_range"); + return true; + } + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "fake_dequantize_max_abs"; } + + private: + mutable FakeDequantizeMaxAbsParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/fake_quantize_moving_avg_max_abs.cc b/paddle/fluid/lite/operators/fake_quantize_moving_avg_max_abs.cc new file mode 100644 index 00000000000..59f48d4380f --- /dev/null +++ b/paddle/fluid/lite/operators/fake_quantize_moving_avg_max_abs.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/operators/fake_quantize_moving_avg_max_abs.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators {} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(fake_quantize_moving_average_abs_max, + paddle::lite::operators::FakeQuantizeMovingAvgMaxAbsOpLite); diff --git a/paddle/fluid/lite/operators/fake_quantize_moving_avg_max_abs.h b/paddle/fluid/lite/operators/fake_quantize_moving_avg_max_abs.h new file mode 100644 index 00000000000..547584e1651 --- /dev/null +++ b/paddle/fluid/lite/operators/fake_quantize_moving_avg_max_abs.h @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/operators/op_params.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class FakeQuantizeMovingAvgMaxAbsOpLite : public OpLite { + public: + FakeQuantizeMovingAvgMaxAbsOpLite() {} + + explicit FakeQuantizeMovingAvgMaxAbsOpLite(const std::string &type) + : OpLite(type) {} + + bool CheckShape() const override { return true; } + + bool InferShape() const override { return true; } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { + auto x = op_desc.Input("X").front(); + auto in_scale = op_desc.Input("InScale").front(); + + auto out = op_desc.Output("Out").front(); + auto out_scale = op_desc.Output("OutScale").front(); + + param_.x = scope->FindVar(x)->GetMutable(); + param_.in_scale = scope->FindVar(in_scale)->GetMutable(); + + param_.out = scope->FindVar(out)->GetMutable(); + param_.out_scale = scope->FindVar(out_scale)->GetMutable(); + param_.bit_length = op_desc.GetAttr("bit_length"); + return true; + } + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "fake_quantize_moving_avg_max_abs"; + } + + private: + mutable FakeQuantizeMovingAvgMaxAbsParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/fc_op.cc b/paddle/fluid/lite/operators/fc_op.cc index 87d7f35c771..d132b94b63c 100644 --- a/paddle/fluid/lite/operators/fc_op.cc +++ b/paddle/fluid/lite/operators/fc_op.cc @@ -61,7 +61,7 @@ bool FcOpLite::InferShape() const { param_.output->Resize(lite::DDim(output_dims)); // share LoD - // param_.output->set_lod(param_.input->lod()); + param_.output->raw_tensor().set_lod(param_.input->lod()); return true; } diff --git a/paddle/fluid/lite/operators/fc_op.h b/paddle/fluid/lite/operators/fc_op.h index 0e738018322..47d4293dfe1 100644 --- a/paddle/fluid/lite/operators/fc_op.h +++ b/paddle/fluid/lite/operators/fc_op.h @@ -59,6 +59,17 @@ class FcOpLite : public OpLite { param_.output = scope->FindVar(out)->GetMutable(); param_.in_num_col_dims = op_desc.GetAttr("in_num_col_dims"); + // For Int8 + if (op_desc.HasAttr("enable_int8")) { + param_.enable_int8 = op_desc.GetAttr("enable_int8"); + if (op_desc.HasAttr("input_scale")) + param_.input_scale = op_desc.GetAttr("input_scale"); + if (op_desc.HasAttr("weight_scale")) + param_.weight_scale = + op_desc.GetAttr>("weight_scale"); + if (op_desc.HasAttr("output_scale")) + param_.output_scale = op_desc.GetAttr("output_scale"); + } return true; } diff --git a/paddle/fluid/lite/operators/fill_constant_op.cc b/paddle/fluid/lite/operators/fill_constant_op.cc index b762f0d3c92..10079d20a25 100644 --- a/paddle/fluid/lite/operators/fill_constant_op.cc +++ b/paddle/fluid/lite/operators/fill_constant_op.cc @@ -36,7 +36,7 @@ class FillConstantOp : public OpLite { bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override { auto Out_name = opdesc.Output("Out").front(); - param_.Out = GetMutableVar(scope, Out_name); + param_.Out = GetMutableVar(scope, Out_name); param_.dtype = opdesc.GetAttr("dtype"); param_.shape = opdesc.GetAttr>("shape"); param_.value = 
opdesc.GetAttr("value"); diff --git a/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.cc b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.cc new file mode 100644 index 00000000000..2364d179774 --- /dev/null +++ b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h" +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool FusionElementwiseActivationOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + return true; +} + +bool FusionElementwiseActivationOp::InferShape() const { + CHECK_OR_FALSE(param_.X->dims().size() >= param_.Y->dims().size()); + param_.Out->Resize(param_.X->dims()); + return true; +} + +bool FusionElementwiseActivationOp::AttachImpl(const cpp::OpDesc& opdesc, + lite::Scope* scope) { + auto X_name = opdesc.Input("X").front(); + auto Y_name = opdesc.Input("Y").front(); + auto Out_name = opdesc.Output("Out").front(); + + param_.X = GetVar(scope, X_name); + param_.Y = GetVar(scope, Y_name); + param_.Out = GetMutableVar(scope, Out_name); + param_.axis = opdesc.GetAttr("axis"); + param_.act_type = opdesc.GetAttr("act_type"); + // TODO(sangoly): support more activation types. + CHECK(param_.act_type == "relu") << "Only relu activation be supported now"; + + return true; +} + +#ifdef LITE_WITH_X86 +bool FusionElementwiseActivationGradExplicitOp::CheckShape() const { + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.X_grad); + CHECK_OR_FALSE(param_.Y_grad); + CHECK_OR_FALSE(param_.Out_grad); + return true; +} + +bool FusionElementwiseActivationGradExplicitOp::InferShape() const { + param_.X_grad->Resize(param_.Out_grad->dims()); + param_.Y_grad->Resize(param_.Y->dims()); + return true; +} + +bool FusionElementwiseActivationGradExplicitOp::AttachImpl( + const cpp::OpDesc& opdesc, lite::Scope* scope) { + CHECK_EQ(opdesc.InputArgumentNames().size(), 1UL); + auto Out_name = opdesc.Input(framework::GradVarName("Out")).front(); + auto X_name = opdesc.Output(framework::GradVarName("X")).front(); + auto Y_name = opdesc.Output(framework::GradVarName("Y")).front(); + + param_.Out_grad = GetVar(scope, Out_name); + param_.X_grad = GetMutableVar(scope, X_name); + param_.Y_grad = GetMutableVar(scope, Y_name); + param_.axis = opdesc.GetAttr("axis"); + param_.act_type = opdesc.GetAttr("act_type"); + // TODO(sangoly): support more activation types. 
+ CHECK(param_.act_type == "relu") << "Only relu activation be supported now"; + + return true; +} +#endif + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(fusion_elementwise_sub_activation, + paddle::lite::operators::FusionElementwiseActivationOp); +#ifdef LITE_WITH_X86 +REGISTER_LITE_OP( + fusion_elementwise_sub_activation_grad, + paddle::lite::operators::FusionElementwiseActivationGradExplicitOp); +#endif +REGISTER_LITE_OP(fusion_elementwise_add_activation, + paddle::lite::operators::FusionElementwiseActivationOp); diff --git a/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h new file mode 100644 index 00000000000..1a759c35e80 --- /dev/null +++ b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/operators/elementwise_ops.h" + +namespace paddle { +namespace lite { +namespace operators { + +class FusionElementwiseActivationOp : public OpLite { + public: + explicit FusionElementwiseActivationOp(const std::string& type) + : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; + + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "fusion_elementwise_activation_op"; + } + + private: + mutable operators::FusionElementwiseActivationParam param_; +}; + +#ifdef LITE_WITH_X86 +class FusionElementwiseActivationGradExplicitOp : public OpLite { + public: + explicit FusionElementwiseActivationGradExplicitOp(const std::string& type) + : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override; + + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "fusion_elementwise_activation_grad_explicit_op"; + } + + private: + mutable operators::FusionElementwiseActivationGradParam param_; +}; +#endif + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/fusion_elementwise_activation_ops_test.cc b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops_test.cc new file mode 100644 index 00000000000..07566e25fc1 --- /dev/null +++ b/paddle/fluid/lite/operators/fusion_elementwise_activation_ops_test.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h" +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +TEST(fusion_elementwise_activation_op_lite, test) { + // prepare variables + lite::Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* y = scope.Var("y")->GetMutable(); + auto* out = scope.Var("out")->GetMutable(); + x->Resize(lite::DDim(std::vector({10, 20}))); + y->Resize(lite::DDim(std::vector({10, 20}))); + out->Resize(lite::DDim(std::vector{10, 20})); + + // set data + for (int i = 0; i < 10 * 20; i++) { + x->mutable_data()[i] = i; + } + for (int i = 0; i < 10 * 20; i++) { + y->mutable_data()[i] = i; + } + for (int i = 0; i < 10 * 20; i++) { + out->mutable_data()[i] = 0.; + } + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("fusion_elementwise_add_activation"); + desc.SetInput("X", {"x"}); + desc.SetInput("Y", {"y"}); + desc.SetOutput("Out", {"out"}); + desc.SetAttr("axis", static_cast(1)); + desc.SetAttr("act_type", std::string("relu")); + + FusionElementwiseActivationOp fuse_op("fusion_elementwise_add_activation"); + + fuse_op.SetValidPlaces({Place{TARGET(kX86), PRECISION(kFloat)}}); + fuse_op.Attach(desc, &scope); +} + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/gru_op.cc b/paddle/fluid/lite/operators/gru_op.cc new file mode 100644 index 00000000000..7f936242c1a --- /dev/null +++ b/paddle/fluid/lite/operators/gru_op.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
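Since the fusion op above pins `act_type` to `relu`, the fused kernel for `fusion_elementwise_add_activation` is expected to behave like `out = relu(x + y)` under the usual elementwise broadcasting. A toy sketch of that contract, assuming same-shape inputs:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Assumed fused semantics for act_type == "relu": out[i] = max(x[i]+y[i], 0).
    std::vector<float> fused_add_relu(const std::vector<float>& x,
                                      const std::vector<float>& y) {
      assert(x.size() == y.size());
      std::vector<float> out(x.size());
      for (size_t i = 0; i < x.size(); ++i) out[i] = std::max(x[i] + y[i], 0.f);
      return out;
    }

    int main() {
      auto out = fused_add_relu({1.f, -3.f}, {1.f, 2.f});  // {2, 0}
      assert(out[0] == 2.f && out[1] == 0.f);
      return 0;
    }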
+ +#include "paddle/fluid/lite/operators/gru_op.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool GruOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.weight); + CHECK_OR_FALSE(param_.batchGate); + CHECK_OR_FALSE(param_.batchResetHiddenPrev); + CHECK_OR_FALSE(param_.batchHidden); + CHECK_OR_FALSE(param_.hidden); + + auto weight_dims = param_.weight->dims(); + int frame_size = weight_dims[0]; + CHECK_EQ_OR_FALSE(weight_dims[1], frame_size * 3); + if (param_.h0) { + auto h0_dims = param_.h0->dims(); + CHECK_EQ_OR_FALSE(h0_dims[1], frame_size); + } + if (param_.bias) { + auto bias_dims = param_.bias->dims(); + int bias_height = bias_dims[0]; + int bias_width = bias_dims[1]; + CHECK_EQ_OR_FALSE(bias_height, 1); + CHECK_EQ_OR_FALSE(bias_width, frame_size * 3); + } + return true; +} + +bool GruOp::InferShape() const { + auto input_dims = param_.x->dims(); + auto weight_dims = param_.weight->dims(); + int frame_size = weight_dims[0]; + std::vector outShape{input_dims[0], frame_size}; + param_.batchGate->Resize(input_dims); + param_.batchResetHiddenPrev->Resize(lite::DDim(outShape)); + param_.batchHidden->Resize(lite::DDim(outShape)); + param_.hidden->Resize(lite::DDim(outShape)); + param_.hidden->raw_tensor().set_lod(param_.x->lod()); + return true; +} + +bool GruOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.x = + scope->FindVar(opdesc.Input("Input").front())->GetMutable(); + param_.weight = scope->FindVar(opdesc.Input("Weight").front()) + ->GetMutable(); + + param_.batchGate = scope->FindVar(opdesc.Output("BatchGate").front()) + ->GetMutable(); + param_.batchResetHiddenPrev = + scope->FindVar(opdesc.Output("BatchResetHiddenPrev").front()) + ->GetMutable(); + param_.batchHidden = scope->FindVar(opdesc.Output("BatchHidden").front()) + ->GetMutable(); + param_.hidden = scope->FindVar(opdesc.Output("Hidden").front()) + ->GetMutable(); + + std::vector input_arg_names = opdesc.InputArgumentNames(); + if (std::find(input_arg_names.begin(), input_arg_names.end(), "H0") != + input_arg_names.end()) { + if (opdesc.Input("H0").size() != 0) { + param_.h0 = scope->FindVar(opdesc.Input("H0").front()) + ->GetMutable(); + } + } + if (std::find(input_arg_names.begin(), input_arg_names.end(), "Bias") != + input_arg_names.end()) { + param_.bias = scope->FindVar(opdesc.Input("Bias").front()) + ->GetMutable(); + } + + param_.activation = opdesc.GetAttr("activation"); + param_.gate_activation = opdesc.GetAttr("gate_activation"); + param_.is_reverse = opdesc.GetAttr("is_reverse"); + param_.origin_mode = opdesc.GetAttr("origin_mode"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(gru, paddle::lite::operators::GruOp); diff --git a/paddle/fluid/lite/operators/gru_op.h b/paddle/fluid/lite/operators/gru_op.h new file mode 100644 index 00000000000..e292e2a00c0 --- /dev/null +++ b/paddle/fluid/lite/operators/gru_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class GruOp : public OpLite { + public: + GruOp() {} + explicit GruOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "gru"; } + + private: + mutable GruParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/lookup_table_op.cc b/paddle/fluid/lite/operators/lookup_table_op.cc new file mode 100644 index 00000000000..5f7b1c0ceec --- /dev/null +++ b/paddle/fluid/lite/operators/lookup_table_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
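The GRU shape checks in `gru_op.cc` above all derive from one `frame_size` taken from the weight tensor: the weight is `[frame_size, 3 * frame_size]` (update gate, reset gate and candidate stacked side by side), an optional bias is `[1, 3 * frame_size]`, and the hidden output comes out as `[input_rows, frame_size]` with the input LoD shared. A small sketch of that bookkeeping with hypothetical plain structs:

    #include <cassert>
    #include <vector>

    struct Dims { std::vector<long> d; };

    // Mirrors the GRU shape rules above: all gate widths derive from the weight.
    Dims gru_hidden_dims(const Dims& input, const Dims& weight) {
      const long frame_size = weight.d[0];
      assert(weight.d[1] == 3 * frame_size);  // three stacked gates
      return Dims{{input.d[0], frame_size}};
    }

    int main() {
      Dims x{{30, 48}};  // 30 time steps, 3 * frame_size input columns
      Dims w{{16, 48}};  // frame_size = 16
      Dims hidden = gru_hidden_dims(x, w);
      assert(hidden.d[0] == 30 && hidden.d[1] == 16);
      return 0;
    }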
+ +#include "paddle/fluid/lite/operators/lookup_table_op.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool LookupTableOp::CheckShape() const { + CHECK_OR_FALSE(param_.w); + CHECK_OR_FALSE(param_.output); + CHECK_OR_FALSE(param_.ids); + auto table_dims = param_.w->dims(); + auto ids_dims = param_.ids->dims(); + int ids_rank = ids_dims.size(); + CHECK_EQ_OR_FALSE(table_dims.size(), 2UL); + CHECK_EQ_OR_FALSE(ids_dims[ids_rank - 1], 1UL); + + return true; +} + +bool LookupTableOp::InferShape() const { + auto table_dims = param_.w->dims(); + auto ids_dims = param_.ids->dims(); + int ids_rank = ids_dims.size(); + auto output_dims = framework::vectorize( + framework::slice_ddim(ids_dims.data(), 0, ids_rank - 1)); + output_dims.push_back(table_dims[1]); + param_.output->Resize(lite::DDim(output_dims)); + param_.output->raw_tensor().set_lod(param_.ids->raw_tensor().lod()); + return true; +} + +bool LookupTableOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.w = + scope->FindVar(opdesc.Input("W").front())->GetMutable(); + param_.ids = + scope->FindVar(opdesc.Input("Ids").front())->GetMutable(); + param_.output = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + + if (opdesc.HasAttr("is_sparse")) + param_.is_sparse = opdesc.GetAttr("is_sparse"); + if (opdesc.HasAttr("is_distributed")) + param_.is_distributed = opdesc.GetAttr("is_distributed"); + param_.padding_idx = opdesc.GetAttr("padding_idx"); + if (opdesc.HasAttr("remote_prefetch")) + param_.remote_prefetch = opdesc.GetAttr("remote_prefetch"); + if (opdesc.HasAttr("trainer_id")) + param_.trainer_id = opdesc.GetAttr("trainer_id"); + if (opdesc.HasAttr("height_sections")) + param_.height_sections = + opdesc.GetAttr>("height_sections"); + if (opdesc.HasAttr("epmap")) + param_.epmap = opdesc.GetAttr>("epmap"); + if (opdesc.HasAttr("table_names")) + param_.table_names = + opdesc.GetAttr>("table_names"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(lookup_table, paddle::lite::operators::LookupTableOp); diff --git a/paddle/fluid/lite/operators/lookup_table_op.h b/paddle/fluid/lite/operators/lookup_table_op.h new file mode 100644 index 00000000000..bb21ed4544a --- /dev/null +++ b/paddle/fluid/lite/operators/lookup_table_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class LookupTableOp : public OpLite { + public: + LookupTableOp() {} + explicit LookupTableOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "lookup_table"; } + + private: + mutable LookupTableParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/mean_op.cc b/paddle/fluid/lite/operators/mean_op.cc index 411dcbb735a..596f4bda010 100644 --- a/paddle/fluid/lite/operators/mean_op.cc +++ b/paddle/fluid/lite/operators/mean_op.cc @@ -51,7 +51,7 @@ class MeanOp : public OpLite { std::string DebugString() const override { return "mean"; } private: - mutable operators::ElementwiseParam param_; + mutable operators::MeanParam param_; }; #ifdef LITE_WITH_X86 @@ -73,7 +73,7 @@ class MeanGradOp : public OpLite { } bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override { - CHECK_EQ(opdesc.InputArgumentNames().size(), 3UL); + CHECK_EQ(opdesc.InputArgumentNames().size(), 2UL); auto X_name = opdesc.Input("X").front(); auto Out_grad_name = opdesc.Input(framework::GradVarName("Out")).front(); auto X_grad_name = opdesc.Output(framework::GradVarName("X")).front(); diff --git a/paddle/fluid/lite/operators/mul_op.cc b/paddle/fluid/lite/operators/mul_op.cc index 70eb37dd09b..a6bb47c6476 100644 --- a/paddle/fluid/lite/operators/mul_op.cc +++ b/paddle/fluid/lite/operators/mul_op.cc @@ -31,16 +31,18 @@ bool MulOpLite::CheckShape() const { CHECK_GT_OR_FALSE(x_dims.size(), static_cast(param_.x_num_col_dims)); CHECK_GT_OR_FALSE(y_dims.size(), static_cast(param_.y_num_col_dims)); - // auto x_mat_dims = - // framework::flatten_to_2d(x_dims.data(), param_.x_num_col_dims); - // auto y_mat_dims = - // framework::flatten_to_2d(y_dims.data(), param_.y_num_col_dims); - - // PADDLE_ENFORCE_EQ(x_mat_dims[1], y_mat_dims[0], - // "First matrix's width must be equal with second matrix's - // " - // "height. %s, %s", - // x_mat_dims[1], y_mat_dims[0]); +#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK + auto x_mat_dims = + framework::flatten_to_2d(x_dims.data(), param_.x_num_col_dims); + auto y_mat_dims = + framework::flatten_to_2d(y_dims.data(), param_.y_num_col_dims); + + PADDLE_ENFORCE_EQ(x_mat_dims[1], y_mat_dims[0], + "First matrix's width must be equal with second matrix's" + "height. 
%s, %s", + x_mat_dims[1], y_mat_dims[0]); +#endif + return true; } @@ -63,7 +65,7 @@ bool MulOpLite::InferShape() const { param_.output->Resize(lite::DDim(out_dims)); // share LoD - // param_.output->set_lod(param_.input->lod()); + param_.output->raw_tensor().set_lod(param_.x->lod()); return true; } @@ -73,30 +75,34 @@ bool MulGradOpLite::CheckShape() const { CHECK_OR_FALSE(param_.x); CHECK_OR_FALSE(param_.y); CHECK_OR_FALSE(param_.output_grad); - CHECK_OR_FALSE(param_.x_grad); - CHECK_OR_FALSE(param_.y_grad); return true; } bool MulGradOpLite::InferShape() const { - param_.x_grad->Resize(param_.x->dims()); - param_.y_grad->Resize(param_.y->dims()); + if (param_.x_grad) param_.x_grad->Resize(param_.x->dims()); + if (param_.y_grad) param_.y_grad->Resize(param_.y->dims()); return true; } bool MulGradOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { auto X_name = op_desc.Input("X").front(); auto Y_name = op_desc.Input("Y").front(); - auto Out_grad_name = op_desc.Output(framework::GradVarName("Out")).front(); - auto X_grad_name = op_desc.Output(framework::GradVarName("X")).front(); - auto Y_grad_name = op_desc.Output(framework::GradVarName("Y")).front(); + auto Out_grad_name = op_desc.Input(framework::GradVarName("Out")).front(); + + if (op_desc.Output(framework::GradVarName("X")).size()) { + auto X_grad_name = op_desc.Output(framework::GradVarName("X")).front(); + param_.x_grad = GetMutableVar(scope, X_grad_name); + } + + if (op_desc.Output(framework::GradVarName("Y")).size()) { + auto Y_grad_name = op_desc.Output(framework::GradVarName("Y")).front(); + param_.y_grad = GetMutableVar(scope, Y_grad_name); + } param_.x = GetVar(scope, X_name); param_.y = GetVar(scope, Y_name); param_.output_grad = GetVar(scope, Out_grad_name); - param_.x_grad = GetMutableVar(scope, X_grad_name); - param_.y_grad = GetMutableVar(scope, Y_grad_name); return true; } @@ -107,3 +113,6 @@ bool MulGradOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { } // namespace paddle REGISTER_LITE_OP(mul, paddle::lite::operators::MulOpLite); +#ifdef LITE_WITH_X86 +REGISTER_LITE_OP(mul_grad, paddle::lite::operators::MulGradOpLite); +#endif diff --git a/paddle/fluid/lite/operators/mul_op.h b/paddle/fluid/lite/operators/mul_op.h index 7aa1581bb2a..05c3a276131 100644 --- a/paddle/fluid/lite/operators/mul_op.h +++ b/paddle/fluid/lite/operators/mul_op.h @@ -38,15 +38,19 @@ class MulOpLite : public OpLite { void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } // TODO(Superjomn) replace framework::OpDesc with a lite one. 
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { + CHECK(!op_desc.Input("X").empty()); + CHECK(!op_desc.Input("Y").empty()); + CHECK(!op_desc.Output("Out").empty()); + auto input = op_desc.Input("X").front(); auto W = op_desc.Input("Y").front(); auto out = op_desc.Output("Out").front(); auto *var = scope->FindVar(input); CHECK(var); - param_.x = var->GetMutable(); + param_.x = &var->Get(); var = scope->FindVar(W); CHECK(var) << "no var called " << W; - param_.y = var->GetMutable(); + param_.y = &var->Get(); var = scope->FindVar(out); CHECK(var) << "no var called " << out; param_.output = var->GetMutable(); @@ -62,6 +66,7 @@ class MulOpLite : public OpLite { mutable MulParam param_; }; +#ifdef LITE_WITH_X86 class MulGradOpLite : public OpLite { public: MulGradOpLite() {} @@ -81,6 +86,7 @@ class MulGradOpLite : public OpLite { private: mutable MulGradParam param_; }; +#endif } // namespace operators } // namespace lite diff --git a/paddle/fluid/lite/operators/op_params.h b/paddle/fluid/lite/operators/op_params.h index 78df0ce8a7a..0744c6a4bb2 100644 --- a/paddle/fluid/lite/operators/op_params.h +++ b/paddle/fluid/lite/operators/op_params.h @@ -28,10 +28,15 @@ namespace lite { namespace operators { using param_t = Any; +#define WITH_INT8_CONFIG \ + bool enable_int8{false}; \ + float input_scale{1.0}; \ + std::vector weight_scale{}; \ + float output_scale{1.0}; /// ----------------------- Functional operators ------------------------------ struct FeedParam { - const std::vector* feed_list{}; + std::vector* feed_list{}; lite::Tensor* out{}; int col; }; @@ -48,6 +53,12 @@ struct IoCopyParam { lite::Tensor* y{}; }; +struct CalibParam { + const lite::Tensor* input{}; + lite::Tensor* output{}; + float scale; +}; + /// -------------------------- NN operators ------------------------------------ struct FcParam { @@ -57,21 +68,21 @@ struct FcParam { lite::Tensor* output{}; lite::DDim in_mat_dims; int in_num_col_dims{1}; -}; - -struct ReluParam { - lite::Tensor* input{}; - lite::Tensor* output{}; + bool weight_transposed{false}; + // for int8 + WITH_INT8_CONFIG }; // For Mul Op struct MulParam { - lite::Tensor* x{}; - lite::Tensor* y{}; + const lite::Tensor* x{}; + const lite::Tensor* y{}; lite::Tensor* output{}; int x_num_col_dims{1}; int y_num_col_dims{1}; + // for int8 + WITH_INT8_CONFIG }; struct MulGradParam { @@ -124,8 +135,8 @@ struct ConcatParam { struct ConvParam { lite::Tensor* x{}; lite::Tensor* filter{}; - const lite::Tensor* bias{}; - const lite::Tensor* residualData{}; + lite::Tensor* bias{nullptr}; + lite::Tensor* residualData{nullptr}; lite::Tensor* output{}; std::vector strides{1, 1}; std::vector paddings{0, 0}; @@ -143,6 +154,27 @@ struct ConvParam { float scale_weights{1.0f}; // only used with mkl-dnn int8 bool force_fp32_output{false}; // only used in mkl-dnn int8 std::string data_format{"Anylayout"}; + // for int8 + WITH_INT8_CONFIG +}; + +// For BatchNorm op +struct BatchNormParam { + lite::Tensor* x{}; + lite::Tensor* bias{}; + lite::Tensor* scale{}; + lite::Tensor* mean{}; + lite::Tensor* variance{}; + lite::Tensor* y{}; + lite::Tensor* mean_out{}; + lite::Tensor* variance_out{}; + lite::Tensor* saved_mean{}; + lite::Tensor* saved_variance{}; + bool is_test{true}; + bool use_global_stats{false}; + float epsilon; + float momentum; + DataLayoutType data_layout{DATALAYOUT(kNCHW)}; }; // For Pooling op @@ -174,6 +206,85 @@ struct DropoutParam { std::string dropout_implementation{"downgrade_in_infer"}; }; +// For Split op +struct SplitParam { + lite::Tensor* 
x{}; + std::vector output{}; + int axis{-1}; + int num{0}; + std::vector sections; +}; + +// For Transpose op +struct TransposeParam { + const lite::Tensor* x{}; + lite::Tensor* output{}; + std::vector axis; + bool use_mkldnn{false}; + std::string data_format{"AnyLayout"}; +}; + +struct GruParam { + lite::Tensor* x{}; + lite::Tensor* h0{}; + lite::Tensor* weight{}; + lite::Tensor* bias{}; + lite::Tensor* batchGate{}; + lite::Tensor* batchResetHiddenPrev{}; + lite::Tensor* batchHidden{}; + lite::Tensor* hidden{}; + std::string activation{"tanh"}; + std::string gate_activation{"sigmoid"}; + bool is_reverse{false}; + bool origin_mode{false}; +}; + +struct FusionGruParam { + lite::Tensor* x{}; + lite::Tensor* h0{}; + lite::Tensor* weightX{}; + lite::Tensor* weightH{}; + lite::Tensor* bias{}; + lite::Tensor* reorderedH0{}; + lite::Tensor* xx{}; + lite::Tensor* batchedInput{}; + lite::Tensor* batchedOut{}; + lite::Tensor* hidden{}; + std::string activation{"tanh"}; + std::string gate_activation{"sigmoid"}; + bool is_reverse{false}; + bool use_seq{true}; +}; + +struct LookupTableParam { + lite::Tensor* w{}; + lite::Tensor* ids{}; + lite::Tensor* output{}; + bool is_sparse{false}; + bool is_distributed{false}; + int64_t padding_idx{-1}; + bool remote_prefetch{false}; + int trainer_id{0}; + std::vector epmap{}; + std::vector height_sections{}; + std::vector table_names{}; +}; + +struct SequenceReshapeParam { + lite::Tensor* x{}; + lite::Tensor* output{}; + int new_dim; +}; + +///----------------------- reduce operators ----------------------------- +struct ReduceParam { + lite::Tensor* x{}; + lite::Tensor* output{}; + std::vector dim{0}; + bool keep_dim{false}; + bool reduce_all{false}; +}; + /// ----------------------- element wise operators ---------------------- struct ElementwiseParam { const lite::Tensor* X{}; @@ -190,6 +301,14 @@ struct ElementwiseGradParam { int axis{-1}; // for broadcasting. 
}; +struct FusionElementwiseActivationParam : public ElementwiseParam { + std::string act_type; +}; + +struct FusionElementwiseActivationGradParam : public ElementwiseGradParam { + std::string act_type; +}; + /// ----------------------- activation operators ---------------------- struct ActivationParam { const lite::Tensor* X{}; @@ -227,6 +346,28 @@ struct FillConstantParam { lite::Tensor* Out{}; }; +// +struct FakeQuantizeMovingAvgMaxAbsParam { + const lite::Tensor* x{}; + const lite::Tensor* in_scale{}; + const lite::Tensor* in_accum{}; + const lite::Tensor* in_state{}; + lite::Tensor* out{}; + lite::Tensor* out_scale{}; + lite::Tensor* out_state{}; + lite::Tensor* out_accum{}; + int bit_length; + bool is_test{true}; + float moving_rate{0.9}; +}; + +struct FakeDequantizeMaxAbsParam { + const lite::Tensor* x{}; + const lite::Tensor* in_scale{}; + lite::Tensor* out{}; + float max_range; +}; + /// ----------------------- sgd operators ---------------------- struct SGDParam { int dtype{framework::proto::VarType::FP32}; @@ -237,20 +378,14 @@ struct SGDParam { lite::Tensor* ParamOut{}; }; -// -struct BatchNormParam { - lite::Tensor* x{}; - lite::Tensor* bias{}; - lite::Tensor* mean{}; - lite::Tensor* scale{}; - lite::Tensor* var{}; - lite::Tensor* out{}; - lite::Tensor* mean_out{}; - lite::Tensor* var_out{}; - lite::Tensor* saved_mean{}; - lite::Tensor* saved_var{}; - - float eps{1e-5}; +/// ----------------------- uniform_random operators ---------------------- +struct UniformRandomParam { + std::vector shape{}; + float min{-1.0f}; + float max{1.0f}; + int seed{0}; + int dtype{framework::proto::VarType::FP32}; + lite::Tensor* Out{}; }; } // namespace operators diff --git a/paddle/fluid/lite/operators/pool_op.cc b/paddle/fluid/lite/operators/pool_op.cc index 055f00f90a4..3faf2bf0fa4 100644 --- a/paddle/fluid/lite/operators/pool_op.cc +++ b/paddle/fluid/lite/operators/pool_op.cc @@ -19,6 +19,27 @@ namespace paddle { namespace lite { namespace operators { +bool PoolOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + + const auto& x_dims = param_.x->dims(); + const auto& ksize = param_.ksize; + const auto& strides = param_.strides; + const auto& paddings = param_.paddings; + + // "Pooling intput should be 4-D or 5-D tensor." + CHECK_OR_FALSE(x_dims.size() == 4 || x_dims.size() == 5); + // Input size and pooling size should be consistent. + CHECK_OR_FALSE(x_dims.size() - ksize.size() == 2U); + // Strides size and pooling size should be the same. + CHECK_OR_FALSE(ksize.size() == strides.size()); + // Paddings size and pooling size should be the same. 
+ CHECK_OR_FALSE(ksize.size() == paddings.size()); + + return true; +} + int PoolOutputSize(int input_size, int filter_size, int padding, int stride, bool ceil_mode) { int output_size; @@ -28,46 +49,35 @@ int PoolOutputSize(int input_size, int filter_size, int padding, int stride, output_size = (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; } - CHECK_OR_FALSE(output_size > 0); return output_size; } -bool PoolOpLite::CheckShape() const { - CHECK_OR_FALSE(param_.x); - CHECK_OR_FALSE(param_.output); - return true; -} - bool PoolOpLite::InferShape() const { - const auto input_dims = param_.x->dims(); - CHECK_OR_FALSE(input_dims.size() == 4 || input_dims.size() == 5); - + const auto x_dims = param_.x->dims(); + std::vector& ksize = param_.ksize; if (param_.global_pooling) { - param_.ksize.resize(static_cast(input_dims.size()) - 2); - for (size_t i = 0; i < param_.ksize.size(); ++i) { + ksize.resize(static_cast(x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { param_.paddings[i] = 0; - param_.ksize[i] = static_cast(input_dims[i + 2]); + ksize[i] = static_cast(x_dims[i + 2]); } } - CHECK_OR_FALSE(input_dims.size() - param_.ksize.size() == 2U); - CHECK_EQ_OR_FALSE(param_.ksize.size(), param_.strides.size()); - CHECK_EQ_OR_FALSE(param_.ksize.size(), param_.paddings.size()); - - std::vector output_shape({input_dims[0], input_dims[1]}); + std::vector output_shape({x_dims[0], x_dims[1]}); if (param_.adaptive) { output_shape.insert(output_shape.end(), param_.ksize.begin(), param_.ksize.end()); } else { for (size_t i = 0; i < param_.ksize.size(); ++i) { output_shape.push_back( - PoolOutputSize(input_dims[i + 2], param_.ksize[i], param_.paddings[i], + PoolOutputSize(x_dims[i + 2], param_.ksize[i], param_.paddings[i], param_.strides[i], param_.ceil_mode)); } } - // share LoD - // param_.output->set_lod(param_.input->lod()); param_.output->Resize(lite::DDim(output_shape)); + + // ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + // ctx->ShareLoD("X", "Out"); return true; } diff --git a/paddle/fluid/lite/operators/pool_op.h b/paddle/fluid/lite/operators/pool_op.h index 64c15ccf1db..29946ed92a4 100644 --- a/paddle/fluid/lite/operators/pool_op.h +++ b/paddle/fluid/lite/operators/pool_op.h @@ -13,8 +13,10 @@ // limitations under the License. #pragma once + #include #include +#include "paddle/fluid/lite/core/compatible_tensor.h" #include "paddle/fluid/lite/core/kernel.h" #include "paddle/fluid/lite/core/op_lite.h" #include "paddle/fluid/lite/core/scope.h" @@ -35,25 +37,41 @@ class PoolOpLite : public OpLite { bool InferShape() const override; - void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } // TODO(Superjomn) replace framework::OpDesc with a lite one. 
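`PoolOutputSize` above is the convolution formula without dilation; `ceil_mode` adds `stride - 1` to the numerator so that a final, partially covered window still yields an output element. A standalone check of both branches:

    #include <cassert>

    // Mirrors PoolOutputSize above for both rounding modes.
    int pool_output_size(int input, int filter, int padding, int stride,
                         bool ceil_mode) {
      if (!ceil_mode) return (input - filter + 2 * padding) / stride + 1;
      return (input - filter + 2 * padding + stride - 1) / stride + 1;
    }

    int main() {
      // 224 input, 2x2 window, stride 2: both modes agree -> 112.
      assert(pool_output_size(224, 2, 0, 2, false) == 112);
      assert(pool_output_size(224, 2, 0, 2, true) == 112);
      // 6 input, 3x3 window, stride 2: floor drops the ragged window, ceil keeps it.
      assert(pool_output_size(6, 3, 0, 2, false) == 2);
      assert(pool_output_size(6, 3, 0, 2, true) == 3);
      return 0;
    }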
bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override { - auto input = op_desc.Input("X").front(); + auto x = op_desc.Input("X").front(); auto out = op_desc.Output("Out").front(); - param_.x = scope->FindVar(input)->GetMutable(); - param_.output = scope->FindVar(out)->GetMutable(); + CHECK(scope->FindVar(x)); + CHECK(scope->FindVar(out)); + param_.x = scope->FindVar(x)->GetMutable(); + param_.output = scope->FindVar(out)->GetMutable(); + param_.pooling_type = op_desc.GetAttr("pooling_type"); param_.ksize = op_desc.GetAttr>("ksize"); + param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); param_.paddings = op_desc.GetAttr>("paddings"); - param_.ceil_mode = op_desc.GetAttr("ceil_mode"); - param_.adaptive = op_desc.GetAttr("adaptive"); - param_.global_pooling = op_desc.GetAttr("global_pooling"); + + if (op_desc.HasAttr("exclusive")) { + param_.exclusive = op_desc.GetAttr("exclusive"); + } + if (op_desc.HasAttr("adaptive")) { + param_.adaptive = op_desc.GetAttr("adaptive"); + } + if (op_desc.HasAttr("ceil_mode")) { + param_.ceil_mode = op_desc.GetAttr("ceil_mode"); + } + if (op_desc.HasAttr("use_quantizer")) { + param_.use_quantizer = op_desc.GetAttr("use_quantizer"); + } + // param_.data_format = op_desc.GetAttr("data_format"); return true; } - std::string DebugString() const override { return "pool"; } + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "pool2d"; } private: mutable PoolParam param_; diff --git a/paddle/fluid/lite/operators/pool_op_test.cc b/paddle/fluid/lite/operators/pool_op_test.cc new file mode 100644 index 00000000000..e9616ede5a4 --- /dev/null +++ b/paddle/fluid/lite/operators/pool_op_test.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
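`PoolOpLite::AttachImpl` above reads `exclusive`, `adaptive`, `ceil_mode` and `use_quantizer` only behind `HasAttr` guards, so programs exported without those attributes still attach and keep the defaults from `PoolParam`. The same guarded-read idiom, sketched against a hypothetical attribute map:

    #include <iostream>
    #include <map>
    #include <string>

    struct PoolAttrs {
      bool exclusive{true};  // defaults mirror the param struct
      bool adaptive{false};
      bool ceil_mode{false};
    };

    // Overwrite a field only when the serialized op carries the attribute.
    void ReadOptional(const std::map<std::string, bool>& attrs,
                      const std::string& name, bool* field) {
      auto it = attrs.find(name);
      if (it != attrs.end()) *field = it->second;
    }

    int main() {
      std::map<std::string, bool> attrs{{"ceil_mode", true}};  // old model
      PoolAttrs p;
      ReadOptional(attrs, "exclusive", &p.exclusive);
      ReadOptional(attrs, "adaptive", &p.adaptive);
      ReadOptional(attrs, "ceil_mode", &p.ceil_mode);
      std::cout << p.exclusive << p.adaptive << p.ceil_mode << "\n";  // 101
      return 0;
    }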
+ +#include "paddle/fluid/lite/operators/pool_op.h" +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +TEST(pool_op_lite, test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* output = scope.Var("output")->GetMutable(); + x->Resize(DDim(std::vector({1, 3, 224, 224}))); + output->Resize(DDim(std::vector{1, 3, 112, 112})); + + // set data + for (int i = 0; i < 1 * 3 * 224 * 224; i++) { + x->mutable_data()[i] = i; + } + for (int i = 0; i < 1 * 3 * 112 * 112; i++) { + output->mutable_data()[i] = 0.; + } + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("pool2d"); + desc.SetInput("X", {"x"}); + desc.SetOutput("Out", {"output"}); + + std::string pooling_type("max"); + desc.SetAttr("pooling_type", pooling_type); + // desc.SetAttr("ksize", static_cast>({2, 2})); + std::vector ksize{2, 2}; + desc.SetAttr("ksize", ksize); + + bool global_pooling{false}; + desc.SetAttr("global_pooling", global_pooling); + + std::vector strides{1, 1}; + desc.SetAttr("strides", strides); + + std::vector paddings{0, 0}; + desc.SetAttr("paddings", paddings); + + bool exclusive{true}; + desc.SetAttr("exclusive", exclusive); + + bool adaptive{false}; + desc.SetAttr("adaptive", adaptive); + + bool ceil_mode{false}; + desc.SetAttr("ceil_mode", ceil_mode); + + bool use_quantizer{false}; + desc.SetAttr("use_quantizer", use_quantizer); + + PoolOpLite pool("pool2d"); + pool.SetValidPlaces({Place{TARGET(kARM), PRECISION(kFloat)}}); + pool.Attach(desc, &scope); + auto kernels = pool.CreateKernels({Place{TARGET(kARM), PRECISION(kFloat)}}); + LOG(INFO) << "kernels.size(): " << kernels.size(); +#ifdef LITE_WITH_ARM + ASSERT_FALSE(kernels.empty()); +#else + ASSERT_TRUE(kernels.empty()); +#endif +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +#ifdef LITE_WITH_ARM +USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); +#endif diff --git a/paddle/fluid/lite/operators/reduce_ops.cc b/paddle/fluid/lite/operators/reduce_ops.cc new file mode 100644 index 00000000000..7c17f9caeb3 --- /dev/null +++ b/paddle/fluid/lite/operators/reduce_ops.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/operators/reduce_ops.h" +#include +#include "paddle/fluid/lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace operators { + +bool ReduceOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + CHECK_LE(x_rank, 6UL) << "Tensors with rank at most 6 are supported."; + return true; +} + +bool ReduceOp::InferShape() const { + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + auto dims = param_.dim; + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + CHECK_LT(dims[i], x_rank) + << "The dim should be in the range [-rank(input), rank(input)."; + } + sort(dims.begin(), dims.end()); + bool reduce_all = param_.reduce_all; + bool keep_dim = param_.keep_dim; + + if (reduce_all) { + if (keep_dim) + param_.output->Resize(lite::DDim(std::vector(x_rank, 1))); + else + param_.output->Resize(lite::DDim(std::vector{1})); + } else { + auto dims_vector = x_dims.Vectorize(); + if (keep_dim) { + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = 1; + } + } else { + const int kDelFlag = -2; + for (size_t i = 0; i < dims.size(); ++i) { + dims_vector[dims[i]] = kDelFlag; + } + dims_vector.erase( + remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + } + auto out_dims = lite::DDim(dims_vector); + param_.output->Resize(out_dims); + if (dims[0] != 0) { + param_.output->raw_tensor().set_lod(param_.x->lod()); + } + } + return true; +} + +bool ReduceOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.x = + scope->FindVar(opdesc.Input("X").front())->GetMutable(); + param_.output = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + + param_.dim = opdesc.GetAttr>("dim"); + param_.reduce_all = opdesc.GetAttr("reduce_all"); + param_.keep_dim = opdesc.GetAttr("keep_dim"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(reduce_sum, paddle::lite::operators::ReduceOp); diff --git a/paddle/fluid/lite/operators/reduce_ops.h b/paddle/fluid/lite/operators/reduce_ops.h new file mode 100644 index 00000000000..75d1866587d --- /dev/null +++ b/paddle/fluid/lite/operators/reduce_ops.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include <string> +#include <vector> +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class ReduceOp : public OpLite { + public: + ReduceOp() {} + explicit ReduceOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "reduce"; } + + private: + mutable ReduceParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/relu_op.cc b/paddle/fluid/lite/operators/relu_op.cc index 47251c72dfa..e52ceb3603d 100644 --- a/paddle/fluid/lite/operators/relu_op.cc +++ b/paddle/fluid/lite/operators/relu_op.cc @@ -21,22 +21,22 @@ namespace operators { bool ReluOp::CheckShape() const { return true; } bool ReluOp::InferShape() const { - CHECK_OR_FALSE(param_.input); - CHECK_OR_FALSE(param_.output); + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Out); // TODO(Superjomn) Enable data sharing. - param_.output->Resize(param_.input->dims()); + param_.Out->Resize(param_.X->dims()); // share lod - // param_.output->set_lod(param_.input->lod()); + param_.Out->raw_tensor().set_lod(param_.X->lod()); return true; } bool ReluOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { - param_.input = const_cast<lite::Tensor *>( + param_.X = const_cast<lite::Tensor *>( &scope->FindVar(opdesc.Input("X").front())->Get<lite::Tensor>()); - param_.output = + param_.Out = scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>(); - CHECK(param_.input); - CHECK(param_.output); + CHECK(param_.X); + CHECK(param_.Out); return true; } diff --git a/paddle/fluid/lite/operators/relu_op.h b/paddle/fluid/lite/operators/relu_op.h index 945a9680a75..9724686b42d 100644 --- a/paddle/fluid/lite/operators/relu_op.h +++ b/paddle/fluid/lite/operators/relu_op.h @@ -38,7 +38,7 @@ class ReluOp : public OpLite { std::string DebugString() const override { return "relu"; } private: - mutable ReluParam param_; + mutable ActivationParam param_; }; } // namespace operators diff --git a/paddle/fluid/lite/operators/reshape_op.cc b/paddle/fluid/lite/operators/reshape_op.cc index 6fc9c1af1e6..7b9675f68ef 100644 --- a/paddle/fluid/lite/operators/reshape_op.cc +++ b/paddle/fluid/lite/operators/reshape_op.cc @@ -30,6 +30,7 @@ bool ReshapeOp::InferShape() const { auto x_dims = param_.x->dims(); auto output_dims = ValidateShape(param_.shape, x_dims); param_.output->Resize(output_dims); + param_.output->raw_tensor().set_lod(param_.x->lod()); return true; } @@ -43,10 +44,12 @@ bool ReshapeOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { std::vector<std::string> input_arg_names = opdesc.InputArgumentNames(); if (std::find(input_arg_names.begin(), input_arg_names.end(), "Shape") != input_arg_names.end()) { - auto actual_shape_var = scope->FindVar(opdesc.Input("Shape").front()); - if (actual_shape_var != nullptr) { - param_.actual_shape = - const_cast<lite::Tensor *>(&(actual_shape_var->Get<lite::Tensor>())); + if (opdesc.Input("Shape").size() != 0) { + auto actual_shape_var = scope->FindVar(opdesc.Input("Shape").front()); + if (actual_shape_var != nullptr) { + param_.actual_shape = const_cast<lite::Tensor *>( + &(actual_shape_var->Get<lite::Tensor>())); + } } } param_.shape = (opdesc.GetAttr<std::vector<int>>("shape")); @@ -74,6 +77,7 @@ bool Reshape2Op::InferShape() const {
xshape_dims[i + 1] = x_dims[i]; } param_.xshape->Resize(DDim(xshape_dims)); + param_.xshape->raw_tensor().set_lod(param_.x->lod()); return true; } diff --git a/paddle/fluid/lite/operators/scale_op.cc b/paddle/fluid/lite/operators/scale_op.cc index fb55366488c..fdede60675f 100644 --- a/paddle/fluid/lite/operators/scale_op.cc +++ b/paddle/fluid/lite/operators/scale_op.cc @@ -26,6 +26,7 @@ bool ScaleOp::CheckShape() const { bool ScaleOp::InferShape() const { param_.output->Resize(param_.x->dims()); + param_.output->raw_tensor().set_lod(param_.x->lod()); return true; } diff --git a/paddle/fluid/lite/operators/sequence_reshape_op.cc b/paddle/fluid/lite/operators/sequence_reshape_op.cc new file mode 100644 index 00000000000..2ad16fc0e14 --- /dev/null +++ b/paddle/fluid/lite/operators/sequence_reshape_op.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/operators/sequence_reshape_op.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceReshapeOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + auto x_dims = param_.x->dims(); + CHECK_EQ_OR_FALSE(x_dims.size(), 2U); + return true; +} + +bool SequenceReshapeOp::InferShape() const { + int new_dim = param_.new_dim; + auto x_numel = param_.x->dims().production(); + std::vector<int64_t> out_shape{x_numel / new_dim, + static_cast<int64_t>(new_dim)}; + param_.output->Resize(lite::DDim(out_shape)); + return true; +} + +bool SequenceReshapeOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.x = + scope->FindVar(opdesc.Input("X").front())->GetMutable<lite::Tensor>(); + param_.output = + scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>(); + + param_.new_dim = opdesc.GetAttr<int>("new_dim"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_reshape, paddle::lite::operators::SequenceReshapeOp); diff --git a/paddle/fluid/lite/operators/sequence_reshape_op.h b/paddle/fluid/lite/operators/sequence_reshape_op.h new file mode 100644 index 00000000000..75c6501668d --- /dev/null +++ b/paddle/fluid/lite/operators/sequence_reshape_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
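+ +// Worked example for the shape inference declared below: an input x of +// shape {4, 6} holds 24 elements, so new_dim = 8 resizes the output to +// {24 / 8, 8} = {3, 8}; the input's element count is assumed to be +// divisible by new_dim.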
+ +#pragma once +#include <string> +#include <vector> +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceReshapeOp : public OpLite { + public: + SequenceReshapeOp() {} + explicit SequenceReshapeOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_reshape"; } + + private: + mutable SequenceReshapeParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/sgd_op.cc b/paddle/fluid/lite/operators/sgd_op.cc index 2571ad0b102..666ca798013 100644 --- a/paddle/fluid/lite/operators/sgd_op.cc +++ b/paddle/fluid/lite/operators/sgd_op.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "/paddle/paddle/fluid/lite/operators/sgd_op.h" +#include "paddle/fluid/lite/operators/sgd_op.h" #include "paddle/fluid/lite/core/op_lite.h" #include "paddle/fluid/lite/core/op_registry.h" @@ -30,13 +30,14 @@ bool SGDOpLite::CheckShape() const { bool SGDOpLite::InferShape() const { auto lr_dims = param_.LearningRate->dims().data(); +#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK CHECK_EQ_OR_FALSE(framework::product(lr_dims), 1); +#endif param_.ParamOut->Resize(param_.Param->dims()); return true; } -bool SGDOpLite::AttachImpl(const OpDesc& opdesc, lite::Scope* scope) { - CHECK_EQ(opdesc.Inputs().size(), 3UL); +bool SGDOpLite::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { auto Param_name = opdesc.Input("Param").front(); auto LearningRate_name = opdesc.Input("LearningRate").front(); auto Grad_name = opdesc.Input("Grad").front(); diff --git a/paddle/fluid/lite/operators/sgd_op.h b/paddle/fluid/lite/operators/sgd_op.h index dea045c0b67..5847a2cc59d 100644 --- a/paddle/fluid/lite/operators/sgd_op.h +++ b/paddle/fluid/lite/operators/sgd_op.h @@ -37,7 +37,7 @@ class SGDOpLite : public OpLite { void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } - bool AttachImpl(const OpDesc &op_desc, lite::Scope *scope) override; + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; std::string DebugString() const override { return "sgd"; } diff --git a/paddle/fluid/lite/operators/softmax_op.cc b/paddle/fluid/lite/operators/softmax_op.cc index 41d7b335e80..dbb78d39906 100644 --- a/paddle/fluid/lite/operators/softmax_op.cc +++ b/paddle/fluid/lite/operators/softmax_op.cc @@ -31,6 +31,7 @@ bool SoftmaxOp::CheckShape() const { bool SoftmaxOp::InferShape() const { param_.output->Resize(param_.x->dims()); + param_.output->raw_tensor().set_lod(param_.x->lod()); return true; } @@ -39,7 +40,12 @@ bool SoftmaxOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { &scope->FindVar(opdesc.Input("X").front())->Get<lite::Tensor>()); param_.output = scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>(); - param_.axis = opdesc.GetAttr<int>("axis"); + + if (opdesc.HasAttr("axis")) { + param_.axis = opdesc.GetAttr<int>("axis"); + } else { + param_.axis = -1; + } CHECK(param_.x); CHECK(param_.output); return true; diff --git a/paddle/fluid/lite/operators/split_op.cc b/paddle/fluid/lite/operators/split_op.cc new file mode 100644 index
00000000000..1f220819db6 --- /dev/null +++ b/paddle/fluid/lite/operators/split_op.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/operators/split_op.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SplitOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_GT_OR_FALSE(param_.output.size(), 1UL); + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + CHECK_OR_FALSE(param_.axis >= -static_cast<int>(x_rank) && + param_.axis < static_cast<int>(x_rank)); + return true; +} + +bool SplitOp::InferShape() const { + const auto &outs = param_.output; + auto in_dims = param_.x->dims(); + int axis = param_.axis; + int num = param_.num; + const auto &sections = param_.sections; + + const int outs_number = outs.size(); + std::vector<lite::DDim> outs_dims; + outs_dims.reserve(outs_number); + + if (num > 0) { + int out_axis_dim = in_dims[axis] / num; + for (int i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + for (int i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = sections[i]; + outs_dims.push_back(dim); + } + } + + for (size_t j = 0; j < outs_dims.size(); ++j) { + outs[j]->Resize(outs_dims[j]); + } + + return true; +} + +bool SplitOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.axis = opdesc.GetAttr<int>("axis"); + param_.num = opdesc.GetAttr<int>("num"); + param_.sections = opdesc.GetAttr<std::vector<int>>("sections"); + auto input = opdesc.Input("Input").front(); + auto outs = opdesc.Output("Out"); + param_.x = scope->FindVar(input)->GetMutable<lite::Tensor>(); + for (auto var : outs) { + param_.output.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>()); + } + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(split, paddle::lite::operators::SplitOp); diff --git a/paddle/fluid/lite/operators/split_op.h b/paddle/fluid/lite/operators/split_op.h new file mode 100644 index 00000000000..20dc4b1028c --- /dev/null +++ b/paddle/fluid/lite/operators/split_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
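+ +// Worked example for SplitOp::InferShape: an input of shape {3, 9, 5} split +// along axis = 1 with num = 3 yields three outputs of shape {3, 3, 5}; with +// num = 0 and sections = {2, 3, 4} it instead yields {3, 2, 5}, {3, 3, 5} +// and {3, 4, 5}.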
+ +#pragma once +#include <string> +#include <vector> +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SplitOp : public OpLite { + public: + SplitOp() {} + explicit SplitOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "split"; } + + private: + mutable SplitParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/transpose_op.cc b/paddle/fluid/lite/operators/transpose_op.cc new file mode 100644 index 00000000000..6b422bbb277 --- /dev/null +++ b/paddle/fluid/lite/operators/transpose_op.cc @@ -0,0 +1,165 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/lite/operators/transpose_op.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +// Transpose +bool TransposeOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + std::vector<int> axis = param_.axis; + size_t axis_size = axis.size(); + // "The input tensor's rank(%d) should be equal to the axis's size(%d)", + // x_rank, axis_size + CHECK_OR_FALSE(x_rank == axis_size); + + std::vector<int> count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + // Each element of Attribute axis should be a unique value + // ranging from 0 to (dims - 1), + // where the dims is the axis's size + CHECK_OR_FALSE(axis[i] < static_cast<int>(axis_size) && + ++count[axis[i]] == 1); + } + return true; +} + +bool TransposeOp::InferShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + std::vector<int> axis = param_.axis; + size_t axis_size = axis.size(); + // "The input tensor's rank(%d) should be equal to the axis's size(%d)", + // x_rank, axis_size + CHECK_OR_FALSE(x_rank == axis_size); + + std::vector<int> count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + // Each element of Attribute axis should be a unique value + // ranging from 0 to (dims - 1), + // where the dims is the axis's size + CHECK_OR_FALSE(axis[i] < static_cast<int>(axis_size) && + ++count[axis[i]] == 1); + } + lite::DDim out_dims(x_dims); + for (size_t i = 0; i < axis_size; i++) { + out_dims[i] = x_dims[axis[i]]; + } + param_.output->Resize(out_dims); + return true; +} + +bool TransposeOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + auto x = op_desc.Input("X").front(); + auto out = op_desc.Output("Out").front(); + + CHECK(scope->FindVar(x)); +
CHECK(scope->FindVar(out)); + param_.x = GetVar<lite::Tensor>(scope, x); + param_.output = GetMutableVar<lite::Tensor>(scope, out); + + param_.axis = op_desc.GetAttr<std::vector<int>>("axis"); + if (op_desc.HasAttr("use_mkldnn")) { + param_.use_mkldnn = op_desc.GetAttr<bool>("use_mkldnn"); + } + if (op_desc.HasAttr("data_format")) { + param_.data_format = op_desc.GetAttr<std::string>("data_format"); + } + return true; +} + +// Transpose2 +bool Transpose2Op::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + std::vector<int> axis = param_.axis; + size_t axis_size = axis.size(); + // "The input tensor's rank(%d) should be equal to the axis's size(%d)", + // x_rank, axis_size + CHECK_OR_FALSE(x_rank == axis_size); + + std::vector<int> count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + // Each element of Attribute axis should be a unique value + // ranging from 0 to (dims - 1), + // where the dims is the axis's size + CHECK_OR_FALSE(axis[i] < static_cast<int>(axis_size) && + ++count[axis[i]] == 1); + } + return true; +} + +bool Transpose2Op::InferShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + auto x_dims = param_.x->dims(); + auto x_rank = x_dims.size(); + std::vector<int> axis = param_.axis; + size_t axis_size = axis.size(); + // "The input tensor's rank(%d) should be equal to the axis's size(%d)", + // x_rank, axis_size + CHECK_OR_FALSE(x_rank == axis_size); + + std::vector<int> count(axis_size, 0); + for (size_t i = 0; i < axis_size; i++) { + // Each element of Attribute axis should be a unique value + // ranging from 0 to (dims - 1), + // where the dims is the axis's size + CHECK_OR_FALSE(axis[i] < static_cast<int>(axis_size) && + ++count[axis[i]] == 1); + } + lite::DDim out_dims(x_dims); + for (size_t i = 0; i < axis_size; i++) { + out_dims[i] = x_dims[axis[i]]; + } + param_.output->Resize(out_dims); + return true; +} + +bool Transpose2Op::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + auto x = op_desc.Input("X").front(); + auto out = op_desc.Output("Out").front(); + + CHECK(scope->FindVar(x)); + CHECK(scope->FindVar(out)); + param_.x = GetVar<lite::Tensor>(scope, x); + param_.output = GetMutableVar<lite::Tensor>(scope, out); + + param_.axis = op_desc.GetAttr<std::vector<int>>("axis"); + if (op_desc.HasAttr("use_mkldnn")) { + param_.use_mkldnn = op_desc.GetAttr<bool>("use_mkldnn"); + } + if (op_desc.HasAttr("data_format")) { + param_.data_format = op_desc.GetAttr<std::string>("data_format"); + } + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(transpose, paddle::lite::operators::TransposeOp); +REGISTER_LITE_OP(transpose2, paddle::lite::operators::Transpose2Op); diff --git a/paddle/fluid/lite/operators/transpose_op.h b/paddle/fluid/lite/operators/transpose_op.h new file mode 100644 index 00000000000..f51acb61e1b --- /dev/null +++ b/paddle/fluid/lite/operators/transpose_op.h @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
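+ +// Worked example for the transpose ops declared below: with axis = +// {0, 2, 3, 1}, an input of shape {1, 24, 2, 2} is inferred to have output +// shape {1, 2, 2, 24}, since out_dims[i] = x_dims[axis[i]] for each i (the +// mobilenetssd case noted in the tests).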
+ +#pragma once +#include <string> +#include <vector> +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +// Transpose +class TransposeOp : public OpLite { + public: + TransposeOp() {} + explicit TransposeOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "transpose"; } + + private: + mutable TransposeParam param_; +}; + +// Transpose2 +class Transpose2Op : public OpLite { + public: + Transpose2Op() {} + explicit Transpose2Op(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "transpose2"; } + + private: + mutable TransposeParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/transpose_op_test.cc b/paddle/fluid/lite/operators/transpose_op_test.cc new file mode 100644 index 00000000000..8962c1e4921 --- /dev/null +++ b/paddle/fluid/lite/operators/transpose_op_test.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/fluid/lite/operators/transpose_op.h" +#include <gtest/gtest.h> +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +// Transpose +TEST(transpose_op_lite, test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable<lite::Tensor>(); + auto* output = scope.Var("output")->GetMutable<lite::Tensor>(); + const int h = 10; + const int w = 20; + x->Resize(DDim(std::vector<int64_t>({h, w}))); + output->Resize(DDim(std::vector<int64_t>{w, h})); + + // set data + for (int i = 0; i < h * w; i++) { + x->mutable_data<float>()[i] = i; + } + for (int i = 0; i < w * h; i++) { + output->mutable_data<float>()[i] = 0.; + } + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("transpose"); + desc.SetInput("X", {"x"}); + desc.SetOutput("Out", {"output"}); + // axis change for shape in mobilenetssd: [1, 24, 2, 2] => [1, 2, 2, 24] + std::vector<int> axis{0, 2, 3, 1}; + desc.SetAttr("axis", axis); + + TransposeOp transpose("transpose"); + + transpose.SetValidPlaces({Place{TARGET(kARM), PRECISION(kFloat)}}); + transpose.Attach(desc, &scope); +} + +// Transpose2 +TEST(transpose2_op_lite, test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable<lite::Tensor>(); + auto* output = scope.Var("output")->GetMutable<lite::Tensor>(); + const int h = 10; + const int w = 20; + x->Resize(DDim(std::vector<int64_t>({h, w}))); + output->Resize(DDim(std::vector<int64_t>{w, h})); + + // set data + for (int i = 0; i < h * w; i++) { + x->mutable_data<float>()[i] = i; + } + for (int i = 0; i < w * h; i++) { + output->mutable_data<float>()[i] = 0.; + } + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("transpose2"); + desc.SetInput("X", {"x"}); + desc.SetOutput("Out", {"output"}); + // axis change for shape in mobilenetssd: [1, 24, 2, 2] => [1, 2, 2, 24] + std::vector<int> axis{0, 2, 3, 1}; + desc.SetAttr("axis", axis); + + Transpose2Op transpose2("transpose2"); + + transpose2.SetValidPlaces({Place{TARGET(kARM), PRECISION(kFloat)}}); + transpose2.Attach(desc, &scope); +} + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/uniform_random_op.cc b/paddle/fluid/lite/operators/uniform_random_op.cc new file mode 100644 index 00000000000..5f38b9ed124 --- /dev/null +++ b/paddle/fluid/lite/operators/uniform_random_op.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
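+ +// Note: InferShape below only resizes Out to the "shape" attribute (e.g. +// shape = {2, 3} gives a {2, 3} output); sampling values between "min" and +// "max" is assumed to be done by the target kernel.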
+ +#include "paddle/fluid/lite/operators/uniform_random_op.h" +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool UniformRandomOpLite::CheckShape() const { return true; } + +bool UniformRandomOpLite::InferShape() const { + param_.Out->Resize(param_.shape); + return true; +} + +bool UniformRandomOpLite::AttachImpl(const cpp::OpDesc& opdesc, + lite::Scope* scope) { + param_.shape = opdesc.GetAttr<std::vector<int64_t>>("shape"); + param_.min = opdesc.GetAttr<float>("min"); + param_.max = opdesc.GetAttr<float>("max"); + param_.seed = opdesc.GetAttr<int>("seed"); + param_.dtype = opdesc.GetAttr<int>("dtype"); + param_.Out = GetMutableVar<lite::Tensor>(scope, opdesc.Output("Out").front()); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(uniform_random, paddle::lite::operators::UniformRandomOpLite); diff --git a/paddle/fluid/lite/operators/uniform_random_op.h b/paddle/fluid/lite/operators/uniform_random_op.h new file mode 100644 index 00000000000..0d85baf59aa --- /dev/null +++ b/paddle/fluid/lite/operators/uniform_random_op.h @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include <string> +#include <vector> +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_lite.h" +#include "paddle/fluid/lite/core/scope.h" +#include "paddle/fluid/lite/operators/op_params.h" +#include "paddle/fluid/lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class UniformRandomOpLite : public OpLite { + public: + UniformRandomOpLite() {} + + explicit UniformRandomOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + + std::string DebugString() const override { return "uniform_random"; } + + private: + mutable UniformRandomParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/python/lite_test.py b/paddle/fluid/lite/python/lite_test.py new file mode 100644 index 00000000000..5ef3548832e --- /dev/null +++ b/paddle/fluid/lite/python/lite_test.py @@ -0,0 +1,103 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid as fluid +import paddle.fluid.compiler as compiler +import paddle.fluid.core as core +import paddle.fluid.core.lite as lite +import paddle.fluid.layers as layers +import numpy as np +import unittest + +from paddle.fluid.cxx_trainer import add_feed_fetch_op + + +def _as_lodtensor(data, place): + # single tensor case + tensor = core.LoDTensor() + tensor.set(data, place) + return tensor + + +data_label = [[ + 0.753544, 0.772977, 0.646915, 0.747543, 0.528923, 0.0517749, 0.248678, + 0.75932, 0.960376, 0.606618 +]] +data_a = [[ + 0.874445, 0.21623, 0.713262, 0.702672, 0.396977, 0.828285, 0.932995, + 0.442674, 0.0321735, 0.484833, 0.045935, 0.21276, 0.556421, 0.131825, + 0.285626, 0.741409, 0.257467, 0.975958, 0.444006, 0.114553 +]] + +data_loss = [0.9876687] + + +class NaiveModelTest(unittest.TestCase): + def test_model(self): + + start_prog = fluid.Program() + main_prog = fluid.Program() + + start_prog.random_seed = 100 + main_prog.random_seed = 100 + + with fluid.program_guard(main_prog, start_prog): + a = fluid.layers.data(name="a", shape=[1, 20], dtype='float32') + label = fluid.layers.data(name="label", shape=[10], dtype='float32') + a1 = fluid.layers.fc(input=a, size=10, act=None, bias_attr=False) + cost = fluid.layers.square_error_cost(a1, label) + avg_cost = fluid.layers.mean(cost) + + optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer.minimize(avg_cost) + + x86_place = lite.Place(lite.TargetType.kX86, + lite.PrecisionType.kFloat, + lite.DataLayoutType.kNCHW, 0) + host_place = lite.Place(lite.TargetType.kHost, + lite.PrecisionType.kFloat, + lite.DataLayoutType.kNCHW, 0) + scope = lite.Scope() + + trainer = lite.CXXTrainer(scope, x86_place, [x86_place, host_place]) + trainer.run_startup_program(start_prog.desc) + + cpu = fluid.core.CPUPlace() + main_prog = add_feed_fetch_op( + main_prog, + feed=['a', 'label'], + fetch_list={avg_cost}, + scope=scope, + place=cpu) + # print(main_prog) + exe = trainer.build_main_program_executor(main_prog.desc) + + feed_data = [ + _as_lodtensor(np.array(data_a, object), cpu), + _as_lodtensor(np.array(data_label, object), cpu) + ] + + exe.run(feed_data) + # print(np.array(exe.get_output(0).raw_tensor())) + self.assertTrue( + np.allclose( + np.array(data_loss), + np.array(exe.get_output(0).raw_tensor()), + atol=1e-8), + "lite result not equal to offline result") + + +if __name__ == '__main__': + unittest.main() diff --git a/paddle/fluid/lite/tools/CMakeLists.txt b/paddle/fluid/lite/tools/CMakeLists.txt new file mode 100644 index 00000000000..71bebdf6f8c --- /dev/null +++ b/paddle/fluid/lite/tools/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(debug) diff --git a/paddle/fluid/lite/tools/Dockerfile.mobile b/paddle/fluid/lite/tools/Dockerfile.mobile index e48af122751..e1094afe677 100644 --- a/paddle/fluid/lite/tools/Dockerfile.mobile +++ b/paddle/fluid/lite/tools/Dockerfile.mobile @@ -33,13 +33,18 @@ RUN apt-get install -y --no-install-recommends \ vim \ wget +# timezone +RUN apt install -y tzdata +RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && echo 'Asia/Shanghai' >/etc/timezone + # for android simulator RUN apt-get install -y --no-install-recommends \ libc6-i386 \ lib32stdc++6 \ redir \ iptables \ - openjdk-8-jre + openjdk-8-jre \ + default-jdk # for cmake 3.10 RUN curl -O https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \ @@ -88,3 +93,4 @@ RUN pip install -i
https://pypi.tuna.tsinghua.edu.cn/simple wheel RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit RUN apt-get autoremove -y && apt-get clean RUN rm -rf /sdk-tools-linux-4333796.zip /tmp/android-ndk-r17c-linux-x86_64.zip /cmake-3.10.3-Linux-x86_64.tar.gz + diff --git a/paddle/fluid/lite/tools/build.sh b/paddle/fluid/lite/tools/build.sh index 2a31f8d1ff9..c13ca95ec72 100755 --- a/paddle/fluid/lite/tools/build.sh +++ b/paddle/fluid/lite/tools/build.sh @@ -4,34 +4,42 @@ set -ex TESTS_FILE="./lite_tests.txt" LIBS_FILE="./lite_libs.txt" +readonly ADB_WORK_DIR="/data/local/tmp" readonly common_flags="-DWITH_LITE=ON -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF -DWITH_PYTHON=OFF -DWITH_TESTING=ON -DLITE_WITH_ARM=OFF" +NUM_CORES_FOR_COMPILE=8 + # for code gen, a source file is generated after a test, but is depended on by some targets in cmake. # here we fake an empty file to make cmake work. -function prepare_for_codegen { +function prepare_workspace { # in build directory - mkdir -p ./paddle/fluid/lite/gen_code - touch ./paddle/fluid/lite/gen_code/__generated_code__.cc -} -function cmake_x86 { - prepare_for_codegen - cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags} + # 1. Prepare gen_code file + GEN_CODE_PATH_PREFIX=paddle/fluid/lite/gen_code + mkdir -p ./${GEN_CODE_PATH_PREFIX} + touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc + + # 2. Prepare debug tool + DEBUG_TOOL_PATH_PREFIX=paddle/fluid/lite/tools/debug + mkdir -p ./${DEBUG_TOOL_PATH_PREFIX} + cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/ } -function cmake_x86_for_CI { - prepare_for_codegen - cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags} -DLITE_WITH_PROFILE=ON +function check_need_ci { + git log -1 --oneline | grep "test=develop" || exit -1 } -function cmake_gpu { - prepare_for_codegen - cmake .. " -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON" +function cmake_x86 { + prepare_workspace + cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_MKL=ON -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON -DLITE_WITH_PROFILE=OFF ${common_flags} } -function cmake_arm { +function cmake_opencl { + prepare_workspace # $1: ARM_TARGET_OS in "android" , "armlinux" - # $2: ARM_TARGET_ARCH_ABI in "arm64-v8a", "armeabi-v7a" ,"armeabi-v7a-hf" + # $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7" ,"armv7hf" + # $3: ARM_TARGET_LANG in "gcc" "clang" cmake .. \ + -DLITE_WITH_OPENCL=ON \ -DWITH_GPU=OFF \ -DWITH_MKL=OFF \ -DWITH_LITE=ON \ @@ -40,14 +48,120 @@ function cmake_arm { -DLITE_WITH_ARM=ON \ -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DWITH_TESTING=ON \ - -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2 + -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2 -DARM_TARGET_LANG=$3 +} + +function run_gen_code_test { + local port=$1 + local gen_code_file_name="__generated_code__.cc" + local gen_code_file_path="./paddle/fluid/lite/gen_code/${gen_code_file_name}" + local adb_work_dir="/data/local/tmp" + + # 1. build test_cxx_api_lite + make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE + + # 2. run test_cxx_api_lite in emulator to get __generated_code__.cc + local test_cxx_api_lite_path=$(find ./paddle/fluid -name test_cxx_api_lite) + adb -s emulator-${port} push "./third_party/install/lite_naive_model" ${adb_work_dir} + adb -s emulator-${port} push ${test_cxx_api_lite_path} ${adb_work_dir} + adb -s emulator-${port} shell "${adb_work_dir}/test_cxx_api_lite --model_dir=${adb_work_dir}/lite_naive_model --optimized_model=${adb_work_dir}/lite_naive_model_opt" + + # 3.
build test_gen_code_lite + make test_gen_code_lite -j$NUM_CORES_FOR_COMPILE + + # 4. run test_gen_code_lite + local test_gen_code_lite_path=$(find ./paddle/fluid -name test_gen_code_lite) + adb -s emulator-${port} push ${test_gen_code_lite_path} ${adb_work_dir} + adb -s emulator-${port} shell "${adb_work_dir}/test_gen_code_lite --optimized_model=${adb_work_dir}/lite_naive_model_opt --generated_code_file=${adb_work_dir}/${gen_code_file_name}" + + # 5. pull __generated_code__.cc down and mv it to the real build path + adb -s emulator-${port} pull "${adb_work_dir}/${gen_code_file_name}" . + mv ${gen_code_file_name} ${gen_code_file_path} + + # 6. build and test test_generated_code + make test_generated_code -j$NUM_CORES_FOR_COMPILE +} + +# $1: ARM_TARGET_OS in "android" , "armlinux" +# $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7" ,"armv7hf" +# $3: ARM_TARGET_LANG in "gcc" "clang" +function build_opencl { + os=$1 + abi=$2 + lang=$3 + + cur_dir=$(pwd) + if [[ ${os} == "armlinux" ]]; then + # TODO(hongming): enable armv7 and armv7hf compilation on armlinux, and clang compilation + if [[ ${lang} == "clang" ]]; then + echo "clang is not enabled on armlinux yet" + return 0 + fi + if [[ ${abi} == "armv7hf" ]]; then + echo "armv7hf is not supported on armlinux yet" + return 0 + fi + if [[ ${abi} == "armv7" ]]; then + echo "armv7 is not supported on armlinux yet" + return 0 + fi + fi + + if [[ ${os} == "android" && ${abi} == "armv7hf" ]]; then + echo "android does not need armv7hf" + return 0 + fi + + build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}.opencl + mkdir -p $build_dir + cd $build_dir + + cmake_opencl ${os} ${abi} ${lang} + build $TESTS_FILE + + # test publish inference lib + make publish_inference_lite +} + +# This method is only called in CI. +function cmake_x86_for_CI { + prepare_workspace # fake an empty __generated_code__.cc to pass cmake. + cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags} -DLITE_WITH_PROFILE=OFF + + # Compile and execute the gen_code related tests first, so they generate the code that the later compilation depends on. + make test_gen_code_lite -j$NUM_CORES_FOR_COMPILE + make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE + ctest -R test_cxx_api_lite + ctest -R test_gen_code_lite + make test_generated_code -j #$NUM_CORES_FOR_COMPILE +} + +function cmake_gpu { + prepare_workspace + cmake .. " -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON" +} + +function check_style { + export PATH=/usr/bin:$PATH + #pre-commit install + clang-format --version + + if ! pre-commit run -a ; then + git diff + exit 1 + fi +} + +function build_single { + #make $1 -j$(expr $(nproc) - 2) + make $1 -j$NUM_CORES_FOR_COMPILE } function build { - file=$1 - for _test in $(cat $file); do - make $_test -j$(expr $(nproc) - 2) - done + make lite_compile_deps -j$NUM_CORES_FOR_COMPILE + + # test publish inference lib + # make publish_inference_lite } # It will eagerly test all lite related unittests. @@ -56,107 +170,380 @@ function test_lite { echo "file: ${file}" for _test in $(cat $file); do - # We move the build phase here to make the 'gen_code' test compiles after the - # corresponding test is executed and the C++ code generates. - make $_test -j$(expr $(nproc) - 2) ctest -R $_test -V done } -port_armv8=5554 -port_armv7=5556 +# Build the code and run lite server tests. This is executed in the CI system.
+function build_test_server { + mkdir -p ./build + cd ./build + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/paddle/build/third_party/install/mklml/lib" + cmake_x86_for_CI + build -# Run test on android -function test_lite_android { - local file=$1 - local adb_abi=$2 - local port= - if [[ ${adb_abi} == "armeabi-v7a" ]]; then - port=${port_armv7} - fi + test_lite $TESTS_FILE +} - if [[ ${adb_abi} == "arm64-v8a" ]]; then - port=${port_armv8} +function build_test_train { + mkdir -p ./build + cd ./build + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/paddle/build/third_party/install/mklml/lib" + prepare_workspace # fake an empty __generated_code__.cc to pass cmake. + cmake .. -DWITH_LITE=ON -DWITH_GPU=OFF -DWITH_PYTHON=ON -DLITE_WITH_X86=ON -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF -DWITH_TESTING=ON -DWITH_MKL=OFF + + make test_gen_code_lite -j$NUM_CORES_FOR_COMPILE + make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE + ctest -R test_cxx_api_lite + ctest -R test_gen_code_lite + make test_generated_code -j$NUM_CORES_FOR_COMPILE + + make -j$NUM_CORES_FOR_COMPILE + + find -name "*.whl" | xargs pip2 install + python ../paddle/fluid/lite/python/lite_test.py + +} + +# test_arm_android +function test_arm_android { + local test_name=$1 + local port=$2 + if [[ "${test_name}x" == "x" ]]; then + echo "test_name can not be empty" + exit 1 fi if [[ "${port}x" == "x" ]]; then echo "Port can not be empty" exit 1 fi - echo "file: ${file}" - # push all to adb and test + echo "test name: ${test_name}" adb_work_dir="/data/local/tmp" - skip_list="test_model_parser_lite" - for _test in $(cat $file); do - [[ $skip_list =~ (^|[[:space:]])$_test($|[[:space:]]) ]] && continue || echo 'skip $_test' - testpath=$(find ./paddle/fluid -name ${_test}) - adb -s emulator-${port} push ${testpath} ${adb_work_dir} - adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${_test}" - adb -s emulator-${port} shell "./${adb_work_dir}/${_test}" + + skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite" "test_light_api_lite" "test_apis_lite" "test_paddle_api_lite" "test_cxx_api_lite" "test_gen_code_lite") + for skip_name in ${skip_list[@]} ; do + [[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return done + + local testpath=$(find ./paddle/fluid -name ${test_name}) + + adb -s emulator-${port} push ${testpath} ${adb_work_dir} + adb -s emulator-${port} shell "${adb_work_dir}/${test_name}" } -# Build the code and run lite server tests. This is executed in the CI system. -function build_test_server { - mkdir -p ./build - cd ./build - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/paddle/build/third_party/install/mklml/lib" - cmake_x86_for_CI - # compile the tests and execute them. - test_lite $TESTS_FILE - # build the remaining libraries to check compiling error. - build $LIBS_FILE +# test the inference high level api +function test_arm_api { + local port=$1 + local test_name="test_paddle_api_lite" + + make $test_name -j$NUM_CORES_FOR_COMPILE + + local model_path=$(find . 
-name "lite_naive_model") + local remote_model=${adb_work_dir}/paddle_api + local testpath=$(find ./paddle/fluid -name ${test_name}) + + arm_push_necessary_file $port $model_path $remote_model + adb -s emulator-${port} shell mkdir -p $remote_model + adb -s emulator-${port} push ${testpath} ${adb_work_dir} + adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}" + adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --model_dir $remote_model" } -# Build the code and run lite server tests. This is executed in the CI system. -function build_test_arm { +function test_arm_model { + local test_name=$1 + local port=$2 + local model_dir=$3 + + if [[ "${test_name}x" == "x" ]]; then + echo "test_name can not be empty" + exit 1 + fi + if [[ "${port}x" == "x" ]]; then + echo "Port can not be empty" + exit 1 + fi + if [[ "${model_dir}x" == "x" ]]; then + echo "Model dir can not be empty" + exit 1 + fi + + echo "test name: ${test_name}" + adb_work_dir="/data/local/tmp" + + testpath=$(find ./paddle/fluid -name ${test_name}) + adb -s emulator-${port} push ${model_dir} ${adb_work_dir} + adb -s emulator-${port} push ${testpath} ${adb_work_dir} + adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}" + local adb_model_path="${adb_work_dir}/`basename ${model_dir}`" + adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --model_dir=$adb_model_path" +} + +function _test_model_optimize_tool { + local port=$1 + local remote_model_path=$ADB_WORK_DIR/lite_naive_model + local remote_test=$ADB_WORK_DIR/model_optimize_tool + local adb="adb -s emulator-${port}" + + make model_optimize_tool -j$NUM_CORES_FOR_COMPILE + local test_path=$(find . -name model_optimize_tool | head -n1) + local model_path=$(find . -name lite_naive_model | head -n1) + $adb push ${test_path} ${ADB_WORK_DIR} + $adb shell mkdir -p $remote_model_path + $adb push $model_path/* $remote_model_path + $adb shell $remote_test --model_dir $remote_model_path --optimize_out ${remote_model_path}.opt \ + --valid_targets "arm" +} + +function _test_paddle_code_generator { + local port=$1 + local test_name=paddle_code_generator + local remote_test=$ADB_WORK_DIR/$test_name + local remote_model=$ADB_WORK_DIR/lite_naive_model.opt + local adb="adb -s emulator-${port}" + + make paddle_code_generator -j$NUM_CORES_FOR_COMPILE + local test_path=$(find . -name $test_name | head -n1) + + $adb push $test_path $remote_test + $adb shell $remote_test --optimized_model $remote_model --generated_code_file $ADB_WORK_DIR/gen_code.cc +} + +function cmake_arm { + prepare_workspace + # $1: ARM_TARGET_OS in "android" , "armlinux" + # $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7" ,"armv7hf" + # $3: ARM_TARGET_LANG in "gcc" "clang" + cmake .. 
\ + -DWITH_GPU=OFF \ + -DWITH_MKL=OFF \ + -DWITH_LITE=ON \ + -DLITE_WITH_CUDA=OFF \ + -DLITE_WITH_X86=OFF \ + -DLITE_WITH_ARM=ON \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ + -DWITH_TESTING=ON \ + -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2 -DARM_TARGET_LANG=$3 +} + +# $1: ARM_TARGET_OS in "android" , "armlinux" +# $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7" ,"armv7hf" +# $3: ARM_TARGET_LANG in "gcc" "clang" +function build_arm { + os=$1 + abi=$2 + lang=$3 + + cur_dir=$(pwd) + # TODO(xxx): enable armlinux clang compile + if [[ ${os} == "armlinux" && ${lang} == "clang" ]]; then + echo "clang is not enabled on armlinux yet" + return 0 + fi + + if [[ ${os} == "android" && ${abi} == "armv7hf" ]]; then + echo "android does not need armv7hf" + return 0 + fi + + build_dir=$cur_dir/build.lite.${os}.${abi}.${lang} + mkdir -p $build_dir + cd $build_dir + + cmake_arm ${os} ${abi} ${lang} + build $TESTS_FILE + + # test publish inference lib + make publish_inference_lite +} + +# $1: ARM_TARGET_OS in "android" , "armlinux" +# $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7" ,"armv7hf" +# $3: ARM_TARGET_LANG in "gcc" "clang" +# $4: android test port +# Note: test must be in build dir +function test_arm { + os=$1 + abi=$2 + lang=$3 + port=$4 + + if [[ ${os} == "armlinux" ]]; then + # TODO(hongming): enable test armlinux on armv8, armv7 and armv7hf + echo "Skip test arm linux yet. armlinux must be in another docker" + return 0 + fi + + if [[ ${os} == "android" && ${abi} == "armv7hf" ]]; then + echo "android does not need armv7hf" + return 0 + fi + + echo "test file: ${TESTS_FILE}" + for _test in $(cat $TESTS_FILE); do + test_arm_android $_test $port + done + + # test finally + test_arm_api $port + + _test_model_optimize_tool $port + _test_paddle_code_generator $port +} + +function prepare_emulator { + local port_armv8=$1 + local port_armv7=$2 + + adb kill-server adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done - # start android arm64-v8a armeabi-v7a emulators first + # start android armv8 and armv7 emulators first echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a" - echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -verbose -port ${port_armv8} & + echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port ${port_armv8} & sleep 1m echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a" - echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} & + echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port ${port_armv7} & sleep 1m +} + +function arm_push_necessary_file { + local port=$1 + local testpath=$2 + local adb_work_dir=$3 + + adb -s emulator-${port} push ${testpath} ${adb_work_dir} +} + +function build_test_arm_opencl { + ######################################################################## + cur=$PWD + + # job 1 + build_opencl "android" "armv8" "gcc" + cd $cur + + # job 2 + build_opencl "android" "armv7" "gcc" + cd $cur + + echo "Done" +} + +# We split the arm unittests into several sub-tasks to parallelize them and reduce the overall CI time.
+# sub-task1 +function build_test_arm_subtask_android { + ######################################################################## + # job 1-4 must be in one runner + port_armv8=5554 + port_armv7=5556 + + prepare_emulator $port_armv8 $port_armv7 + + # job 1 + build_arm "android" "armv8" "gcc" + run_gen_code_test ${port_armv8} + test_arm "android" "armv8" "gcc" ${port_armv8} + cd - + + # job 2 + #build_arm "android" "armv8" "clang" + #run_gen_code_test ${port_armv8} + #test_arm "android" "armv8" "clang" ${port_armv8} + #cd - + + # job 3 + build_arm "android" "armv7" "gcc" + run_gen_code_test ${port_armv7} + test_arm "android" "armv7" "gcc" ${port_armv7} + cd - + + # job 4 + #build_arm "android" "armv7" "clang" + #run_gen_code_test ${port_armv7} + #test_arm "android" "armv7" "clang" ${port_armv7} + #cd - - for os in "android" "armlinux" ; do - for abi in "arm64-v8a" "armeabi-v7a" "armeabi-v7a-hf" ; do - if [[ ${abi} == "armeabi-v7a-hf" ]]; then - echo "armeabi-v7a-hf is not supported on both android and armlinux" - continue - fi - - if [[ ${os} == "armlinux" && ${abi} == "armeabi-v7a" ]]; then - echo "armeabi-v7a is not supported on armlinux yet" - continue - fi - - build_dir=build.lite.${os}.${abi} - mkdir -p $build_dir - cd $build_dir - cmake_arm ${os} ${abi} - build $TESTS_FILE - - if [[ ${os} == "android" ]]; then - adb_abi=${abi} - if [[ ${adb_abi} == "armeabi-v7a-hf" ]]; then - adb_abi="armeabi-v7a" - fi - if [[ ${adb_abi} == "armeabi-v7a" ]]; then - # skip v7 tests - continue - fi - test_lite_android $TESTS_FILE ${adb_abi} - # armlinux need in another docker - fi - cd - - done - done adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done echo "Done" } +# sub-task2 +function build_test_arm_subtask_armlinux { + cur=$PWD + # job 5 + build_arm "armlinux" "armv8" "gcc" + test_arm "armlinux" "armv8" "gcc" $port_armv8 + cd $cur + + # job 6 + build_arm "armlinux" "armv7" "gcc" + test_arm "armlinux" "armv7" "gcc" $port_armv8 + cd $cur + + # job 7 + build_arm "armlinux" "armv7hf" "gcc" + test_arm "armlinux" "armv7hf" "gcc" $port_armv8 + cd $cur + + echo "Done" +} + +# sub-task-model +function build_test_arm_subtask_model { + local port_armv8=5554 + local port_armv7=5556 + # We test just the following single environment to limit the CI time. + local os=android + local abi=armv8 + local lang=gcc + + local test_name=$1 + local model_name=$2 + + cur_dir=$(pwd) + build_dir=$cur_dir/build.lite.${os}.${abi}.${lang} + mkdir -p $build_dir + cd $build_dir + cmake_arm $os $abi $lang + make $test_name -j$NUM_CORES_FOR_COMPILE + + prepare_emulator $port_armv8 $port_armv7 + + # just test the model on armv8 + test_arm_model $test_name $port_armv8 "./third_party/install/$model_name" + + adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done + echo "Done" +} + + +# This test loads a model, optimizes it and checks the prediction results of both the cxx and light APIs. +function test_arm_predict_apis { + local port=$1 + local workspace=$2 + local naive_model_path=$3 + local api_test_path=$(find . -name "test_apis_lite") + # the model is pushed to ./lite_naive_model + adb -s emulator-${port} push ${naive_model_path} ${workspace} + adb -s emulator-${port} push $api_test_path ${workspace} + + # test cxx_api first to store the optimized model. + adb -s emulator-${port} shell ./test_apis_lite --model_dir ./lite_naive_model --optimized_model ./lite_naive_model_opt +} + + +# Build the code and run lite arm tests. This is executed in the CI system.
+function build_test_arm { + ######################################################################## + # job 1-4 must be in one runner + port_armv8=5554 + port_armv7=5556 + + build_test_arm_subtask_android + build_test_arm_subtask_armlinux +} + + ############################# MAIN ################################# function print_usage { echo -e "\nUSAGE:" @@ -164,12 +551,13 @@ function print_usage { echo "----------------------------------------" echo -e "cmake_x86: run cmake with X86 mode" echo -e "cmake_cuda: run cmake with CUDA mode" - echo -e "cmake_arm: run cmake with ARM mode" + echo -e "--arm_os= --arm_abi= cmake_arm: run cmake with ARM mode" echo echo -e "build: compile the tests" + echo -e "--test_name= build_single: compile single test" echo echo -e "test_server: run server tests" - echo -e "test_mobile: run mobile tests" + echo -e "--test_name= --adb_port_number= test_arm_android: run arm test" echo "----------------------------------------" echo } @@ -182,39 +570,119 @@ function main { TESTS_FILE="${i#*=}" shift ;; + --test_name=*) + TEST_NAME="${i#*=}" + shift + ;; + --arm_os=*) + ARM_OS="${i#*=}" + shift + ;; + --arm_abi=*) + ARM_ABI="${i#*=}" + shift + ;; + --arm_lang=*) + ARM_LANG="${i#*=}" + shift + ;; + --arm_port=*) + ARM_PORT="${i#*=}" + shift + ;; build) build $TESTS_FILE build $LIBS_FILE shift ;; + build_single) + build_single $TEST_NAME + shift + ;; cmake_x86) cmake_x86 shift ;; + cmake_opencl) + cmake_opencl $ARM_OS $ARM_ABI $ARM_LANG + shift + ;; cmake_cuda) cmake_cuda shift ;; cmake_arm) - cmake_arm $2 $3 + cmake_arm $ARM_OS $ARM_ABI $ARM_LANG + shift + ;; + build_opencl) + build_opencl $ARM_OS $ARM_ABI $ARM_LANG + shift + ;; + build_arm) + build_arm $ARM_OS $ARM_ABI $ARM_LANG shift ;; test_server) test_lite $TESTS_FILE shift ;; - test_mobile) - test_lite $TESTS_FILE + test_arm) + test_arm $ARM_OS $ARM_ABI $ARM_LANG $ARM_PORT + shift + ;; + test_arm_android) + test_arm_android $TEST_NAME $ARM_PORT shift ;; build_test_server) build_test_server shift ;; + build_test_train) + build_test_train + shift + ;; build_test_arm) build_test_arm shift ;; + build_test_arm_opencl) + build_test_arm_opencl + shift + ;; + build_test_arm_subtask_android) + build_test_arm_subtask_android + shift + ;; + build_test_arm_subtask_armlinux) + build_test_arm_subtask_armlinux + shift + ;; + build_test_arm_model_mobilenetv1) + build_test_arm_subtask_model test_mobilenetv1_lite mobilenet_v1 + shift + ;; + build_test_arm_model_mobilenetv2) + build_test_arm_subtask_model test_mobilenetv2_lite mobilenet_v2_relu + shift + ;; + build_test_arm_model_resnet50) + build_test_arm_subtask_model test_resnet50_lite resnet50 + shift + ;; + build_test_arm_model_inceptionv4) + build_test_arm_subtask_model test_inceptionv4_lite inception_v4_simple + shift + ;; + check_style) + check_style + shift + ;; + check_need_ci) + check_need_ci + shift + ;; *) # unknown option print_usage @@ -224,6 +692,4 @@ function main { done } -print_usage - main $@ diff --git a/paddle/fluid/lite/tools/debug/CMakeLists.txt b/paddle/fluid/lite/tools/debug/CMakeLists.txt new file mode 100644 index 00000000000..fec4633d6a0 --- /dev/null +++ b/paddle/fluid/lite/tools/debug/CMakeLists.txt @@ -0,0 +1,13 @@ +lite_cc_library(debug_utils_lite SRCS debug_utils.cc DEPS op_params_lite) + +lite_cc_binary(lite_model_debug_tool SRCS model_debug_tool.cc + DEPS + cxx_api_lite + debug_utils_lite + model_parser_lite + target_wrapper_host + mir_passes + ${ops_lite} ${host_kernels} + X86_DEPS ${x86_kernels} + ARM_DEPS ${arm_kernels} + CL_DEPS 
${opencl_kernels}) diff --git a/paddle/fluid/lite/tools/debug/analysis_tool.py b/paddle/fluid/lite/tools/debug/analysis_tool.py new file mode 100644 index 00000000000..3de8ed92496 --- /dev/null +++ b/paddle/fluid/lite/tools/debug/analysis_tool.py @@ -0,0 +1,401 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Fluid model analysis tools +''' + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import logging +import os +import subprocess +import sys +from collections import OrderedDict +from operator import mul + +# Simple logging config +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +import numpy as np +import paddle.fluid as fluid +from paddle.fluid import debugger +from paddle.fluid import core + +# Command arguments +parser = argparse.ArgumentParser() +parser.add_argument( + "--model_dir", type=str, required=True, help="Model dir path") +parser.add_argument( + "--input_file", default="", type=str, help="Input data file path") +parser.add_argument( + "--topo_file", + type=str, + required=True, + help="Runtime topology order output file path") +parser.add_argument( + "--tensor_file", + default="", + type=str, + required=True, + help="Tensor file path") +parser.add_argument( + "--tensor_names", + default="", + type=str, + help="If tensor_names is not empty, then only these tensors will be compared") +parser.add_argument( + "--separator", + default=",", + type=str, + help="Default separator, used in string split") +parser.add_argument( + "--output_tensor", + default=0, + type=int, + help="dump fluid runtime tensors or not") +parser.add_argument( + "--tensor_output_file", + default="./tensor_output_py", + type=str, + help="dump fluid runtime tensors filepath") +parser.add_argument( + "--tensor_output_length", + default=-1, + type=int, + help="Output tensor data length, dims size will be used if tensor_output_length < 0" +) +parser.add_argument( + "--only_first", + default=1, + type=int, + help="Whether to only output the first mismatched var's info") +parser.add_argument( + "--output_file", + default="./diff.txt", + type=str, + help="dump diff info filepath") +parser.add_argument( + "--threshold", default=1e-5, type=float, help="float value diff threshold") + + +# Help functions +def load_file(filename, delim=None): + """ + Load file help function + """ + with open(filename) as fd: + for line in fd: + line = line.strip() + assert line != "" + if delim: + line = line.split(delim) + yield line + + +class FluidModelExecutor(object): + """ + A fluid inference model executor + """ + + def __init__(self, model_dir, input_file): + self.model_dir = model_dir + self.place = fluid.CPUPlace() + self.exe = fluid.Executor(self.place) + self.scope = fluid.core.Scope() + self.input_data = self._load_input_file(input_file) + + self.program,
+ + +# Help functions +def load_file(filename, delim=None): + """ + Load file helper function + """ + with open(filename) as fd: + for line in fd: + line = line.strip() + assert line != "" + if delim: + line = line.split(delim) + yield line + + +class FluidModelExecutor(object): + """ + A Fluid inference model executor + """ + + def __init__(self, model_dir, input_file): + self.model_dir = model_dir + self.place = fluid.CPUPlace() + self.exe = fluid.Executor(self.place) + self.scope = fluid.core.Scope() + self.input_data = self._load_input_file(input_file) + + self.program, self.feed_target_names, self.fetch_targets = self._load_inference_model() + + def infer_var_list(self, + arg_names=None, + out_data_len=-1, + dump_tensor=False, + dump_tensor_file=''): + """ + Get variables' tensor in var_list + """ + with fluid.scope_guard(self.scope): + global_block = self.program.global_block() + feed_list = self._prepare_feed_data(global_block, + self.feed_target_names) + fetch_targets = self._fetch_tmp_vars(global_block, arg_names) + results = self.exe.run(program=self.program, + feed=feed_list, + fetch_list=fetch_targets, + return_numpy=False) + return self._get_results( + results, + fetch_targets, + arg_names=arg_names, + need_save=dump_tensor, + save_path=dump_tensor_file, + out_data_len=out_data_len) + + def draw_graph(self, output_path='./', filename='debug'): + """ + Draw graph with graphviz + """ + dot_path = os.path.join(output_path, filename + '.dot') + pdf_path = os.path.join(output_path, filename + '.pdf') + debugger.draw_block_graphviz(self.program.global_block(), path=dot_path) + cmd = ["dot", "-Tpdf", dot_path, "-o", pdf_path] + subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + def _prepare_feed_data(self, block, feed_target_names): + feed_dict = dict() + + def fill_data(np_dtype, col, shape): + if self.input_data: + input_size = reduce(mul, shape) + assert len(self.input_data[0]) > col + data = self.input_data[0][col].split(' ') + assert len(data) == input_size + return np.array( + map(np_dtype, data), dtype=np_dtype).reshape(shape) + else: + return np.ones(shape, dtype=np_dtype) + + # TODO(sangoly): support multiple feed fields + assert len(feed_target_names) == 1 + for idx, name in enumerate(feed_target_names): + var = block.var(name) + np_shape = list(var.shape) + # TODO(sangoly): support batch + if np_shape[0] == -1: + np_shape[0] = 1 + if var.dtype == core.VarDesc.VarType.INT32: + feed_dict[name] = fill_data(np.int32, idx, np_shape) + elif var.dtype == core.VarDesc.VarType.INT64: + feed_dict[name] = fill_data(np.int64, idx, np_shape) + elif var.dtype == core.VarDesc.VarType.FP16: + feed_dict[name] = fill_data(np.float16, idx, np_shape) + elif var.dtype == core.VarDesc.VarType.FP32: + feed_dict[name] = fill_data(np.float32, idx, np_shape) + elif var.dtype == core.VarDesc.VarType.FP64: + feed_dict[name] = fill_data(np.float64, idx, np_shape) + else: + raise TypeError("Data type is not supported") + return feed_dict + + def _load_input_file(self, input_file=None): + input_data = [] + if not input_file: + return input_data + logger.info("Loading input file %s ..."
% input_file) + for line in load_file(input_file, "\t"): + input_data.append(line) + return input_data + + def _load_inference_model(self): + with fluid.scope_guard(self.scope): + model_abs_path = os.path.join(self.model_dir, 'model') + param_abs_path = os.path.join(self.model_dir, 'params') + if os.path.exists(model_abs_path) and os.path.exists( + param_abs_path): + return fluid.io.load_inference_model(self.model_dir, self.exe, + 'model', 'params') + else: + return fluid.io.load_inference_model(self.model_dir, self.exe) + + def _fetch_tmp_vars(self, block, var_names_list=None): + fetch_var = block.var('fetch') + old_fetch_names = set([var.name for var in self.fetch_targets]) + new_fetch_vars = [block.var(name) for name in old_fetch_names] + i = len(new_fetch_vars) + if var_names_list is None: + var_names_list = block.vars.keys() + for var_name in var_names_list: + if var_name in old_fetch_names: continue + new_fetch_vars.append(block.var(var_name)) + block.append_op( + type='fetch', + inputs={'X': [var_name]}, + outputs={'Out': [fetch_var]}, + attrs={'col': i}) + i = i + 1 + return new_fetch_vars + + def _get_results(self, + results, + new_fetch_targets, + need_save=False, + arg_names=None, + save_path='', + out_data_len=10): + res = OrderedDict() + old_fetch_names = set([var.name for var in self.fetch_targets]) + if need_save: + out_fd = open(save_path, 'w') + for idx, result in enumerate(results): + name = new_fetch_targets[idx].name + dim = [v if v >= 0 else 1 for v in new_fetch_targets[idx].shape] + size = min(reduce(mul, dim), + out_data_len) if out_data_len > 0 else reduce(mul, dim) + values = list(np.array(result).flatten())[:size] + res[name] = {"dim": dim, "values": values} + if need_save: + if arg_names and name not in arg_names: continue + dim_str = '{' + ','.join(map(str, dim)) + '}' + out_fd.write('\t'.join( + [name, dim_str, ' '.join(map(str, values))]) + '\n') + if need_save: + out_fd.close() + return res + + +class Analyser(object): + """ + A Fluid model analysis tool + """ + + def __init__(self, args): + self.args = args + self.tensors = OrderedDict() + self.topo = {} + self.input = [] + logger.info("Loading fluid inference model %s ..." % args.model_dir) + self.predictor = FluidModelExecutor(args.model_dir, args.input_file) + + def analysis(self): + """ + Analyser work function + """ + self._load_topo_file() + self._load_tensor_file() + arg_names = self.args.tensor_names.split(',') if self.args.tensor_names != "" \ + else self.tensors.keys() + infer_results = self.predictor.infer_var_list( + out_data_len=self.args.tensor_output_length, + arg_names=arg_names, + dump_tensor=self.args.output_tensor, + dump_tensor_file=self.args.tensor_output_file) + if self.args.tensor_names == "": + self._check_diff_nodes(infer_results) + + def _parse_topo_field(self, field): + params = [item.split(':')[1].strip() for item in field[1:-1].split(' ')] + params = [item.split('#') for item in params if item != ""] + return [item for lst in params for item in lst] + + def _load_topo_file(self): + if self.args.topo_file == "": + raise ValueError("Topo file path is empty") + logger.info("Loading topo file %s ..."
% self.args.topo_file) + for line in load_file(self.args.topo_file, '\t'): + op_type, inputs, outputs = line + for name in self._parse_topo_field(outputs): + if name not in self.topo: + self.topo[name] = [] + self.topo[name].append(line) + + def _load_tensor_file(self): + if self.args.tensor_file == "": + raise ValueError("Tensor file path is empty") + logger.info("Loading tensor file %s ..." % self.args.tensor_file) + for line in load_file(self.args.tensor_file, "\t"): + name, dim, values = line + dim = map(int, dim[1:-1].split(',')) + values = map(float, values.split(' ')) + + dim_size = reduce(mul, dim) + value_size = len(values) + assert dim_size == value_size, \ + "Dim size mismatch with data: %d vs %d" % (dim_size, value_size) + + self.tensors[name] = {"dim": dim, "values": values} + + def _check_diff_nodes(self, results): + """ + NOTE: the tensors output by the C++ debug tool follow runtime topology + order, so we can find the first op (or ops) with erroneous results + """ + assert len(self.tensors) == len(results), \ + "Fluid output tensors' size mismatches `tensor_file`" + diff_vars = [] + flag = False + for k in self.tensors: + if k not in results: + raise KeyError("Have not found infer result for `%s`" % k) + if len(self.tensors[k]['values']) != len(results[k]['values']): + raise ValueError( + "Argname: %s size mismatch with `tensor_file`: %d vs %d" % + (k, len(self.tensors[k]['values']), + len(results[k]['values']))) + for i in range(len(self.tensors[k]['values'])): + if abs(self.tensors[k]['values'][i] - results[k]['values'][ + i]) > self.args.threshold: + diff_vars.append(k) + if self.args.only_first: + flag = True + break + if flag: break + self._output_diff_nodes(results, diff_vars) + + def _output_diff_nodes(self, results, diff_vars): + def output_param_info(inputs, outputs, infos, fd): + def tensor_repr(name): + return '\t'.join([ + name, '{' + ','.join(map(str, infos[name]['dim'])) + '}', + ' '.join(map(str, infos[name]['values'])) + ]) + + for name in self._parse_topo_field(inputs): + if name not in infos: continue + fd.write(tensor_repr(name) + '\n') + for name in self._parse_topo_field(outputs): + if name not in infos: continue + fd.write(tensor_repr(name) + '\n') + + if len(diff_vars) == 0: + logger.info("No diff found.
Congratulation!") + return + logger.info("Total diff vars: %d" % len(diff_vars)) + with open(self.args.output_file, 'w') as fd: + for var in diff_vars: + if var not in self.topo: + raise KeyError("%s not in any op's output params, " % var + + "please check your model and input") + fd.write( + '>>>>>>>>>>>>>>>>>>DIFF VARIABLE: %s<<<<<<<<<<<<<<<<<<<\n' % + var) + for idx, (op_type, inputs, + outputs) in enumerate(self.topo[var]): + op_repr = '\t'.join([op_type, inputs, outputs]) + logger.info("dump diff info: ------------ %s" % op_repr) + fd.write(op_repr + '\n') + fd.write( + "--------------- Tensor File info ---------------\n") + output_param_info(inputs, outputs, self.tensors, fd) + fd.write( + "--------------- Fluid Tensor info ---------------\n") + output_param_info(inputs, outputs, results, fd) + fd.write("\n\n") + + +if __name__ == "__main__": + args = parser.parse_args() + analyser = Analyser(args) + analyser.analysis() diff --git a/paddle/fluid/lite/tools/debug/check_model.sh b/paddle/fluid/lite/tools/debug/check_model.sh new file mode 100755 index 00000000000..d730de72036 --- /dev/null +++ b/paddle/fluid/lite/tools/debug/check_model.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +############################# Arguments ############################ +# For both cpp & python +BUILD_ROOT_DIR="" # Cmake build root path, for LD_LIBRARY_PATH +MODEL_DIR="" # Model dir path +INPUT_FILE="" # Input data file, only the first record will be used. + # If the path is empty, then all-ones input will be used. +CPP_TOPO_FILE=./topo_file.txt # Runtime program topology info. Write by Cpp-debug-tool and Read by Py-debug-tool +CPP_TENSOR_FILE=./tensor_cpp.txt # Store Cpp-debug-tool's tensor outputs int runtime topology order. + # Write by Cpp-debug-tool and Read by Py-debug-tool +TENSOR_NAMES="" # If is not empty, then only dump the tensor fo arguments whoes name is + # in tensor names. Separate by ','. +TENSOR_OUTPUT_LENGTH=-1 # Output tensor data length. Tensor's dim size will be used if this value < 0. + +# For Cpp debug tools +CPP_OUTPUT_TOPO=1 # If output topology info or not. +CPP_OUTPUT_VARS=1 # If output TmpVar' tensor or not. +CPP_OUTPUT_WEIGHTS=1 # If output WeightVar' tensor or not. +CPP_ARM_THREAD_NUM=1 # ARM thread num. Used by ARM device info. + # Only be used by compile option - LITE_WITH_ARM + +# For python debug tools +PY_THRESHOLD=0.00001 # The numerical lower bound be used to judge [Cpp vs Py] runtime model diff. +PY_TENSOR_FILE=./tensor_py.txt # Store Py-debug-tool's tensor outputs. +PY_OUTPUT_FILE=./diff.txt # Store model different op/var info for debug. +PY_ONLY_OUTPUT_FIRST_DIFF=1 # If only output the first different var's info in runtime topology order or not. +PY_OUTPUT_TENSOR=1 # If output var' tensor in CPP_TENSOR_FILE/TENSOR_NAMES or not. 
+ +############################# MAIN ################################# +function print_usage { + echo -e "\nUSAGE:" + echo -e "debug_cpp_stage -> debug_py_stage" + echo + echo "----------------------------------------" + echo -e "debug_cpp_stage:" + echo -e "check_model.sh [--option=value]* debug_cpp_stage" + echo -e "See check_model.sh#run_cpp_debug_tool for details" + echo + echo -e "debug_py_stage:" + echo -e "check_model.sh [--option=value]* debug_py_stage" + echo -e "See check_model.sh#run_py_debug_tool for details" + echo "----------------------------------------" +} + +function check_environment { + if [ "X${BUILD_ROOT_DIR}" == "X" ]; then + echo -e "\nOption: --build_root_dir=xxx is required.\n"; + exit 1 + fi + if [ "X${MODEL_DIR}" == "X" ]; then + echo -e "\nOption: --model_dir=xxx is required.\n"; + exit 1 + fi +} + +function run_cpp_debug_tool { + check_environment + + local tool_name="lite_model_debug_tool" + local tool_path=$(find ${BUILD_ROOT_DIR} -type f -name ${tool_name}) + if [ "X${tool_path}" == "X" ]; then + echo -e "\nERROR: ${tool_name} not found in ${BUILD_ROOT_DIR}.\n" + exit 1 + fi + echo "Found Cpp-debug-tool path: ${tool_path}" + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$BUILD_ROOT_DIR/third_party/install/mklml/lib" + ${tool_path} \ + --model_dir=$MODEL_DIR \ + --input_file=$INPUT_FILE \ + --topo_output_file=$CPP_TOPO_FILE \ + --output_topo=$CPP_OUTPUT_TOPO \ + --tensor_output_file=$CPP_TENSOR_FILE \ + --output_vars=$CPP_OUTPUT_VARS \ + --output_weights=$CPP_OUTPUT_WEIGHTS \ + --tensor_names=$TENSOR_NAMES \ + --tensor_output_length=$TENSOR_OUTPUT_LENGTH \ + --arm_thread_num=$CPP_ARM_THREAD_NUM +} + +function run_py_debug_tool { + check_environment + + local tool_name="analysis_tool.py" + local tool_path=$(find ${BUILD_ROOT_DIR} -type f -name ${tool_name}) + if [ "X${tool_path}" == "X" ]; then + echo -e "\nERROR: ${tool_name} not found in ${BUILD_ROOT_DIR}.\n" + return + fi + echo "Found Py-debug-tool path: ${tool_path}" + python ${tool_path} \ + --model_dir=$MODEL_DIR \ + --input_file=$INPUT_FILE \ + --topo_file=$CPP_TOPO_FILE \ + --tensor_file=$CPP_TENSOR_FILE \ + --tensor_names=$TENSOR_NAMES \ + --output_tensor=$PY_OUTPUT_TENSOR \ + --tensor_output_file=$PY_TENSOR_FILE \ + --tensor_output_length=$TENSOR_OUTPUT_LENGTH \ + --only_first=$PY_ONLY_OUTPUT_FIRST_DIFF \ + --output_file=$PY_OUTPUT_FILE \ + --threshold=$PY_THRESHOLD +} + +function main { + # Parse command line.
+ for i in "$@"; do + case $i in + --model_dir=*) + MODEL_DIR="${i#*=}" + shift + ;; + --input_file=*) + INPUT_FILE="${i#*=}" + shift + ;; + --cpp_topo_file=*) + CPP_TOPO_FILE="${i#*=}" + shift + ;; + --cpp_tensor_file=*) + CPP_TENSOR_FILE="${i#*=}" + shift + ;; + --tensor_names=*) + TENSOR_NAMES="${i#*=}" + shift + ;; + --tensor_output_length=*) + TENSOR_OUTPUT_LENGTH="${i#*=}" + shift + ;; + --cpp_output_vars=*) + CPP_OUTPUT_VARS="${i#*=}" + shift + ;; + --cpp_output_weights=*) + CPP_OUTPUT_WEIGHTS="${i#*=}" + shift + ;; + --py_threshold=*) + PY_THRESHOLD="${i#*=}" + shift + ;; + --py_tensor_file=*) + PY_TENSOR_FILE="${i#*=}" + shift + ;; + --py_output_file=*) + PY_OUTPUT_FILE="${i#*=}" + shift + ;; + --py_only_output_first_diff=*) + PY_ONLY_OUTPUT_FIRST_DIFF="${i#*=}" + shift + ;; + --py_output_tensor=*) + PY_OUTPUT_TENSOR="${i#*=}" + shift + ;; + --build_root_dir=*) + BUILD_ROOT_DIR="${i#*=}" + shift + ;; + debug_cpp_stage) + run_cpp_debug_tool + shift + ;; + debug_py_stage) + run_py_debug_tool + shift + ;; + *) + # unknown option + print_usage + exit 1 + ;; + esac + done +} + +main $@ diff --git a/paddle/fluid/lite/kernels/arm/relu_compute.cc b/paddle/fluid/lite/tools/debug/debug_utils.cc similarity index 91% rename from paddle/fluid/lite/kernels/arm/relu_compute.cc rename to paddle/fluid/lite/tools/debug/debug_utils.cc index 6e27e8ec669..1dd8a599860 100644 --- a/paddle/fluid/lite/kernels/arm/relu_compute.cc +++ b/paddle/fluid/lite/tools/debug/debug_utils.cc @@ -12,4 +12,4 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/lite/kernels/arm/relu_compute.h" +#include "paddle/fluid/lite/tools/debug/debug_utils.h" diff --git a/paddle/fluid/lite/tools/debug/debug_utils.h b/paddle/fluid/lite/tools/debug/debug_utils.h new file mode 100644 index 00000000000..9f1843b09ed --- /dev/null +++ b/paddle/fluid/lite/tools/debug/debug_utils.h @@ -0,0 +1,329 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/core/compatible_tensor.h" +#include "paddle/fluid/lite/utils/string.h" + +DEFINE_string(model_dir, "", "Model dir path"); +DEFINE_string(input_file, "", "Input data file path"); +DEFINE_string(topo_output_file, "", "Runtime topology order output file path"); +DEFINE_bool(output_topo, true, "Dump runtime topology or not"); +DEFINE_string(tensor_output_file, "", "Tensor output file path"); +DEFINE_bool(output_vars, true, "Dump vars or not"); +DEFINE_bool(output_weights, true, "Dump weight tensors or not"); +DEFINE_string( + tensor_names, "", + "If tensor_names is not empty, only these tensors will be dumped"); +DEFINE_int32(tensor_output_length, -1, + "Output tensor data length, dims size will be used if " + "tensor_output_length < 0"); +DEFINE_int32(arm_thread_num, 1, "Arm thread nums, 1 as default"); +DEFINE_string(separator, ",", "Default separator, used in string split"); + +namespace paddle { +namespace lite { +namespace tools { +namespace debug { + +struct DebugConfig { + // arguments + std::string model_dir; + std::string topo_output_file; + std::string tensor_output_file; + std::string input_file; + std::vector<std::string> tensor_names; + bool output_weights; + bool output_topo; + bool output_vars; + int tensor_output_length; + int arm_thread_num; + + std::unordered_map<std::string, lite::VarDesc> var_descs; + std::vector<std::vector<std::string>> input_values; +}; + +template <typename T> +std::vector<T> Split2Vector(const std::string& input, + const std::string& separator) { + std::vector<T> tgt; + std::vector<std::string> inputs = Split(input, separator); + tgt.resize(inputs.size()); + std::stringstream ss; + for (size_t i = 0; i < inputs.size(); ++i) { + ss << inputs[i] << " "; + } + for (size_t i = 0; i < inputs.size(); ++i) { + ss >> tgt[i]; + } + return tgt; +}
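+ +// Usage sketch (illustrative values): +//   Split2Vector<float>("0.5 0.25", " ")  -> std::vector<float>{0.5f, 0.25f} +//   Split2Vector<int64_t>("1 2 3", " ")   -> std::vector<int64_t>{1, 2, 3}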
field_len: " + << conf.input_values[0].size() << " col: " << col; + std::vector input_data( + std::move(Split2Vector(conf.input_values[0][col], " "))); + CHECK(input_data.size() == dim_size) + << "Input data field[" << col + << "] mismatch TensorDim: " << input_data.size() << " vs " << dim_size; + for (int i = 0; i < dim_size; i++) { + data[i] = input_data[i]; + } + } else { + LOG(INFO) << "------------> Use all-ones input"; + for (int i = 0; i < dim_size; i++) { + data[i] = 1; + } + } +} + +void CheckDim(std::vector* dim) { + CHECK(dim); + for (int i = 0; i < dim->size(); ++i) { + if ((*dim)[i] < 0) (*dim)[i] = -(*dim)[i]; + } +} + +void PrepareModelInputTensor(const DebugConfig& conf, lite::Scope* scope, + const framework::proto::ProgramDesc& desc) { + CHECK(scope); + + std::unordered_map feed_vars_info; + CollectFeedVarsInfo(&feed_vars_info, desc); + auto* feed_var = + scope->FindVar("feed")->GetMutable>(); + feed_var->resize(feed_vars_info.size()); + + for (auto& item : feed_vars_info) { + auto& var_desc = conf.var_descs.at(item.second); + auto val_type = var_desc.GetDataType(); + auto dim = var_desc.GetShape(); + CheckDim(&dim); + auto* input_tensor = &feed_var->at(item.first); + input_tensor->Resize(DDim(dim)); + switch (val_type) { +#define FILL_TENSOR_BY_TYPE_ONCE(pb_type__, type__) \ + case framework::proto::VarType::pb_type__: \ + FillTensorData(input_tensor, conf, item.first); \ + break + + FILL_TENSOR_BY_TYPE_ONCE(UINT8, uint8_t); + FILL_TENSOR_BY_TYPE_ONCE(INT8, int8_t); + FILL_TENSOR_BY_TYPE_ONCE(INT16, int16_t); + FILL_TENSOR_BY_TYPE_ONCE(INT32, int32_t); + FILL_TENSOR_BY_TYPE_ONCE(INT64, int64_t); + FILL_TENSOR_BY_TYPE_ONCE(FP32, float); + FILL_TENSOR_BY_TYPE_ONCE(FP64, double); + + default: + LOG(FATAL) << "Unsupported data type: " << static_cast(val_type); +#undef FILL_TENSOR_BY_TYPE_ONCE + } + } +} + +void ParseInputFile(DebugConfig* conf) { + CHECK(conf); + if (conf->input_file.empty()) return; + auto& inputs = conf->input_values; + std::ifstream fd(conf->input_file); + CHECK(fd.is_open()) << "Open input file: " << conf->input_file << " failed!"; + std::string line; + while (std::getline(fd, line)) { + inputs.emplace_back(std::move(Split(line, FLAGS_separator))); + } + LOG(INFO) << "Load data:" << inputs.size() << " items"; +} + +void ParseConfig(DebugConfig* conf) { + CHECK(conf); +#define CHECK_NON_EMPTY(name__) \ + CHECK(!FLAGS_##name__.empty()) << "Option " << #name__ << " can't be empty." 
+ +void ParseConfig(DebugConfig* conf) { + CHECK(conf); +#define CHECK_NON_EMPTY(name__) \ + CHECK(!FLAGS_##name__.empty()) << "Option " << #name__ << " can't be empty." + CHECK_NON_EMPTY(model_dir); + if (FLAGS_output_topo) { + CHECK_NON_EMPTY(topo_output_file); + } + if (FLAGS_output_vars || FLAGS_output_weights) { + CHECK_NON_EMPTY(tensor_output_file); + } +#undef CHECK_NON_EMPTY + conf->model_dir = FLAGS_model_dir; + conf->topo_output_file = FLAGS_topo_output_file; + conf->tensor_output_file = FLAGS_tensor_output_file; + conf->input_file = FLAGS_input_file; + conf->output_weights = FLAGS_output_weights; + conf->output_vars = FLAGS_output_vars; + conf->output_topo = FLAGS_output_topo; + conf->tensor_output_length = FLAGS_tensor_output_length; + conf->arm_thread_num = FLAGS_arm_thread_num; + + if (!FLAGS_tensor_names.empty()) { + conf->tensor_names = Split(FLAGS_tensor_names, FLAGS_separator); + } + + ParseInputFile(conf); +} + +void CollectAndDumpTopoInfo(const std::vector<lite::Instruction>& instructions, + const DebugConfig& conf) { + if (!conf.output_topo) return; + LOG(INFO) << "----------------- dump topo file"; + std::ofstream os(conf.topo_output_file); + CHECK(os.is_open()); + for (auto& inst : instructions) { + auto* op_info = inst.op()->op_info(); + CHECK(op_info); + os << op_info->Type() << "\t"; + os << "("; +#define DUMP_TOPO_INFO_ONCE(name__) \ + { \ + auto argnames = op_info->name__##ArgumentNames(); \ + for (size_t i = 0; i < argnames.size(); ++i) { \ + os << argnames[i] << ":"; \ + auto vars = op_info->name__(argnames[i]); \ + for (size_t j = 0; j < vars.size(); ++j) { \ + os << vars[j]; \ + if (j != vars.size() - 1) os << "#"; \ + } \ + if (i != argnames.size() - 1) os << " "; \ + } \ + } + DUMP_TOPO_INFO_ONCE(Input); + os << ")\t("; + DUMP_TOPO_INFO_ONCE(Output); + os << ")\n"; +#undef DUMP_TOPO_INFO_ONCE + } + os.close(); +}
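+ +// Topo file layout (a sketch of what CollectAndDumpTopoInfo emits): +//   op_type\t(InArg:var0#var1 ...)\t(OutArg:var2 ...) +// e.g. a hypothetical line "fc\t(Input:x W:w Bias:b)\t(Out:fc_0.tmp_1)" +// describes one op; multiple vars under one argument name are joined by '#'.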
+ +void CollectVarDescs(std::unordered_map<std::string, lite::VarDesc>* var_descs, + const framework::proto::ProgramDesc& desc) { + CHECK(var_descs); + CHECK(!desc.blocks().empty()); + for (auto proto_var_desc : desc.blocks(0).vars()) { + lite::VarDesc var_desc(proto_var_desc); + (*var_descs).emplace(var_desc.Name(), std::move(var_desc)); + } +} + +std::unordered_set<std::string> CollectUnusedVars( + const std::vector<lite::Instruction>& instructions) { + std::unordered_set<std::string> unused; + std::unordered_set<std::string> all_inputs; + for (auto& inst : instructions) { + for (const auto& name : inst.op()->op_info()->input_names()) { + all_inputs.insert(name); + } + } + + for (auto& inst : instructions) { + for (const auto& name : inst.op()->op_info()->output_names()) { + if (all_inputs.count(name) == 0) unused.insert(name); + } + } + + return unused; +} + +std::string GetTensorRepr(const lite::Tensor& tensor, int out_data_len) { + std::stringstream ss; + auto size = tensor.dims().production(); + if (out_data_len >= 0) { + size = std::min(size, static_cast<int64_t>(out_data_len)); + } + for (int i = 0; i < size; i++) { + ss << tensor.template data<float>()[i]; + if (i != size - 1) ss << " "; + } + return ss.str(); +} + +void CollectAndDumpTensorInfo(const std::vector<lite::Instruction>& instructions, + const framework::proto::ProgramDesc& desc, + const DebugConfig& conf) { + CHECK(instructions.size() > 0) << "No instruction found"; + const auto* scope = const_cast<lite::OpLite*>(instructions[0].op())->scope(); + std::ofstream os(conf.tensor_output_file); + CHECK(os.is_open()); + + std::unordered_set<std::string> dump_vars; +#define DUMP_TENSOR_ONCE(name__) \ + LOG(INFO) << "----------------- dump tensor: " << name__; \ + auto& tensor = scope->FindVar(name__)->Get<lite::Tensor>(); \ + os << name__ << "\t" << tensor.dims() << "\t" \ + << GetTensorRepr(tensor, conf.tensor_output_length) << "\n"; \ + dump_vars.insert(name__) + +#define DUMP_OP_TENSOR_ONCE(name__, skip__) \ + for (const auto& name : inst.op()->op_info()->name__##_names()) { \ + bool is_weight = conf.var_descs.at(name).Persistable(); \ + if (unused.count(name) != 0 || name == #skip__ || \ + (!conf.output_weights && is_weight) || \ + (!conf.output_vars && !is_weight) || dump_vars.count(name) != 0) \ + continue; \ + DUMP_TENSOR_ONCE(name); \ + } + + if (conf.tensor_names.size() == 0) { + std::unordered_set<std::string> unused( + std::move(CollectUnusedVars(instructions))); + + for (auto& inst : instructions) { + DUMP_OP_TENSOR_ONCE(input, feed); + DUMP_OP_TENSOR_ONCE(output, fetch); + } + } else { + for (const auto& name : conf.tensor_names) { + DUMP_TENSOR_ONCE(name); + } + } +#undef DUMP_OP_TENSOR_ONCE +#undef DUMP_TENSOR_ONCE + os.close(); +} + +} // namespace debug +} // namespace tools +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/tools/debug/model_debug_tool.cc b/paddle/fluid/lite/tools/debug/model_debug_tool.cc new file mode 100644 index 00000000000..7e8556a1579 --- /dev/null +++ b/paddle/fluid/lite/tools/debug/model_debug_tool.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/api/paddle_use_kernels.h" +#include "paddle/fluid/lite/api/paddle_use_ops.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/tools/debug/debug_utils.h" + +namespace paddle { +namespace lite { +namespace tools { +namespace debug { + +void Run(DebugConfig* conf) { + CHECK(conf); +#ifdef LITE_WITH_ARM + DeviceInfo::Init(); + DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, conf->arm_thread_num); +#endif + lite::Predictor predictor; + std::vector<Place> valid_places({ + Place{TARGET(kHost), PRECISION(kFloat)}, +#ifdef LITE_WITH_ARM + Place{TARGET(kARM), PRECISION(kFloat)}, +#endif +#ifdef LITE_WITH_X86 + Place{TARGET(kX86), PRECISION(kFloat)}, +#endif + }); + + std::vector<std::string> passes{{ + "static_kernel_pick_pass", "variable_place_inference_pass", + "type_target_cast_pass", "variable_place_inference_pass", + "io_copy_kernel_pick_pass", "variable_place_inference_pass", + "runtime_context_assign_pass", + }}; + + predictor.Build(conf->model_dir, +#ifdef LITE_WITH_ARM + Place{TARGET(kARM), PRECISION(kFloat)}, +#endif +#ifdef LITE_WITH_X86 + Place{TARGET(kX86), PRECISION(kFloat)}, +#endif + valid_places, passes); + + auto& instructions = predictor.runtime_program().instructions(); + auto& program_desc = predictor.program_desc(); + auto* scope = const_cast<lite::OpLite*>(instructions[0].op())->scope(); + + CollectVarDescs(&(conf->var_descs), program_desc); + PrepareModelInputTensor(*conf, scope, program_desc); + predictor.Run(); + + CollectAndDumpTopoInfo(instructions, *conf); + CollectAndDumpTensorInfo(instructions, program_desc, *conf); + + // TODO(sangoly): Maybe add some profile info here + auto* out = predictor.GetOutput(0); + LOG(INFO) << out <<
" memory size " << out->data_size(); + LOG(INFO) << "out " << out->data()[0]; + LOG(INFO) << "dims " << out->dims(); + LOG(INFO) << "out data size: " << out->data_size(); +} + +} // namespace debug +} // namespace tools +} // namespace lite +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::lite::tools::debug::DebugConfig conf; + paddle::lite::tools::debug::ParseConfig(&conf); + paddle::lite::tools::debug::Run(&conf); + + return 0; +} diff --git a/paddle/fluid/lite/tools/mobile_readme.md b/paddle/fluid/lite/tools/mobile_readme.md index 2069de2af26..08bd7b0f5d6 100644 --- a/paddle/fluid/lite/tools/mobile_readme.md +++ b/paddle/fluid/lite/tools/mobile_readme.md @@ -17,8 +17,16 @@ $ git checkout incubate/lite ### 主要cmake选项 -- `ARM_TARGET_OS` 代表目标操作系统, 目前支持 "android" "armlinux", 模型是Android -- `ARM_TARGET_ARCH_ABI` 代表ARCH, 目前支持 "arm64-v8a" "armeabi-v7a"。 模型是arm64-v8a +- `ARM_TARGET_OS` 代表目标操作系统, 目前支持 "android" "armlinux", 默认是Android +- `ARM_TARGET_ARCH_ABI` 代表ARCH,支持输入"armv8"和"armv7",针对OS不一样选择不一样。 + - `-DARM_TARGET_OS="android"` 时 + - "armv8", 等效于 "arm64-v8a"。 default值为这个。 + - "armv7", 等效于 "armeabi-v7a"。 + - `-DARM_TARGET_OS="armlinux"` 时 + - "armv8", 等效于 "arm64"。 default值为这个。 + - "armv7hf", 等效于使用`eabihf`且`-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 `。 + - "armv7", 等效于使用`eabi`且`-march=armv7-a -mfloat-abi=softfp -mfpu=neon-vfpv4`。 +- `ARM_TARGET_LANG` 代表目标编译的语言, 默认为gcc,支持 gcc和clang两种。 ### 编译 @@ -124,3 +132,4 @@ $ adb devices List of devices attached 5cb00b6 device ``` + diff --git a/paddle/fluid/lite/utils/CMakeLists.txt b/paddle/fluid/lite/utils/CMakeLists.txt index 08eeaa54f8e..c7e9e8e8782 100644 --- a/paddle/fluid/lite/utils/CMakeLists.txt +++ b/paddle/fluid/lite/utils/CMakeLists.txt @@ -7,5 +7,6 @@ set(utils_DEPS glog) lite_cc_test(test_varient SRCS varient_test.cc DEPS utils_lite) -cc_library(any_lite SRCS any.cc) -cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite) +lite_cc_library(any_lite SRCS any.cc) +lite_cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite) + diff --git a/paddle/fluid/lite/utils/any.h b/paddle/fluid/lite/utils/any.h index 466deae3de9..2a8c68063f0 100644 --- a/paddle/fluid/lite/utils/any.h +++ b/paddle/fluid/lite/utils/any.h @@ -34,7 +34,6 @@ class Any { CHECK(type_ == typeid(T).hash_code()); } else { type_ = typeid(T).hash_code(); - data_ = new T; deleter_ = [&] { delete static_cast(data_); }; } data_ = new T; @@ -55,10 +54,16 @@ class Any { bool valid() const { return data_; } + // ~Any() { + // if (valid()) { + // deleter_(); + // } + // } + private: static size_t kInvalidType; size_t type_{kInvalidType}; - void* data_{}; + void* data_{nullptr}; std::function deleter_; }; diff --git a/paddle/fluid/lite/utils/io.h b/paddle/fluid/lite/utils/io.h index 4dba6f98429..86161a4b1ab 100644 --- a/paddle/fluid/lite/utils/io.h +++ b/paddle/fluid/lite/utils/io.h @@ -18,11 +18,12 @@ #include #include #include "paddle/fluid/lite/utils/cp_logging.h" +#include "paddle/fluid/lite/utils/string.h" namespace paddle { namespace lite { -static bool IsFileExists(const std::string &path) { +static bool IsFileExists(const std::string& path) { std::ifstream file(path); bool res = file.is_open(); if (res) { @@ -31,5 +32,15 @@ static bool IsFileExists(const std::string &path) { return res; } +// ARM mobile not support mkdir in C++ +static void MkDirRecur(const std::string& path) { +#ifndef LITE_WITH_ARM + CHECK_EQ(system(string_format("mkdir -p %s", 
 ### Build @@ -124,3 +132,4 @@ $ adb devices List of devices attached 5cb00b6 device + diff --git a/paddle/fluid/lite/utils/CMakeLists.txt b/paddle/fluid/lite/utils/CMakeLists.txt index 08eeaa54f8e..c7e9e8e8782 100644 --- a/paddle/fluid/lite/utils/CMakeLists.txt +++ b/paddle/fluid/lite/utils/CMakeLists.txt @@ -7,5 +7,6 @@ set(utils_DEPS glog) lite_cc_test(test_varient SRCS varient_test.cc DEPS utils_lite) -cc_library(any_lite SRCS any.cc) -cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite) +lite_cc_library(any_lite SRCS any.cc) +lite_cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite) + diff --git a/paddle/fluid/lite/utils/any.h b/paddle/fluid/lite/utils/any.h index 466deae3de9..2a8c68063f0 100644 --- a/paddle/fluid/lite/utils/any.h +++ b/paddle/fluid/lite/utils/any.h @@ -34,7 +34,6 @@ class Any { CHECK(type_ == typeid(T).hash_code()); } else { type_ = typeid(T).hash_code(); - data_ = new T; deleter_ = [&] { delete static_cast<T*>(data_); }; } data_ = new T; @@ -55,10 +54,16 @@ class Any { bool valid() const { return data_; } + // ~Any() { + // if (valid()) { + // deleter_(); + // } + // } + private: static size_t kInvalidType; size_t type_{kInvalidType}; - void* data_{}; + void* data_{nullptr}; std::function<void()> deleter_; }; diff --git a/paddle/fluid/lite/utils/io.h b/paddle/fluid/lite/utils/io.h index 4dba6f98429..86161a4b1ab 100644 --- a/paddle/fluid/lite/utils/io.h +++ b/paddle/fluid/lite/utils/io.h @@ -18,11 +18,12 @@ #include #include #include "paddle/fluid/lite/utils/cp_logging.h" +#include "paddle/fluid/lite/utils/string.h" namespace paddle { namespace lite { -static bool IsFileExists(const std::string &path) { +static bool IsFileExists(const std::string& path) { std::ifstream file(path); bool res = file.is_open(); if (res) { @@ -31,5 +32,15 @@ return res; } +// ARM mobile does not support mkdir in C++ +static void MkDirRecur(const std::string& path) { +#ifndef LITE_WITH_ARM + CHECK_EQ(system(string_format("mkdir -p %s", path.c_str()).c_str()), 0) + << "Can't mkdir " << path; +#else // On ARM + CHECK_NE(mkdir(path.c_str(), S_IRWXU), -1) << "Can't mkdir " << path; +#endif +} + } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/utils/string.h b/paddle/fluid/lite/utils/string.h index 31b131276bf..797fb4be3e5 100644 --- a/paddle/fluid/lite/utils/string.h +++ b/paddle/fluid/lite/utils/string.h @@ -52,8 +52,8 @@ static std::string to_string_with_precision(const T& v, const int n = 6) { return ss.str(); } -static std::string Join(const std::vector<std::string>& vec, - const std::string& delim) { +template <typename T> +std::string Join(const std::vector<T>& vec, const std::string& delim) { if (vec.empty()) return ""; std::stringstream ss; @@ -74,5 +74,22 @@ static std::string Repr(const std::vector<std::string>& v) { return "{" + Join(tmp, ",") + "}"; } +static std::vector<std::string> Split(const std::string& original, + const std::string& separator) { + std::vector<std::string> results; + std::string::size_type pos1, pos2; + pos2 = original.find(separator); + pos1 = 0; + while (std::string::npos != pos2) { + results.push_back(original.substr(pos1, pos2 - pos1)); + pos1 = pos2 + separator.size(); + pos2 = original.find(separator, pos1); + } + if (pos1 != original.length()) { + results.push_back(original.substr(pos1)); + } + return results; +} +
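+// Usage sketch (illustrative): +//   Split("1,2,3", ",") -> {"1", "2", "3"} +//   Split("a##b", "#")  -> {"a", "", "b"}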
 } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/utils/varient.h b/paddle/fluid/lite/utils/varient.h index 2d2a3061108..52bbcffcef9 100644 --- a/paddle/fluid/lite/utils/varient.h +++ b/paddle/fluid/lite/utils/varient.h @@ -20,6 +20,7 @@ #include #include #include "paddle/fluid/lite/utils/cp_logging.h" +#include "paddle/fluid/lite/utils/string.h" // This is an equivalent implementation of boost::any. We implement this to // avoid including the whole boost library and keep the inference library small. @@ -116,9 +117,9 @@ struct variant { if (type_id == typeid(T).hash_code()) return *reinterpret_cast<T*>(&data); else - throw std::invalid_argument("unmatched type"); - // LOG(FATAL) << "unmatched type get, should be " << type_id << " but get " - // << typeid(T).name(); + throw std::invalid_argument( + string_format("unmatched type, stored as %zu, but want to get %s", + type_id, typeid(T).name())); return *reinterpret_cast<T*>(&data); } diff --git a/paddle/fluid/lite/x86/CMakeLists.txt b/paddle/fluid/lite/x86/CMakeLists.txt index 0347593e38a..eacff0204aa 100644 --- a/paddle/fluid/lite/x86/CMakeLists.txt +++ b/paddle/fluid/lite/x86/CMakeLists.txt @@ -2,5 +2,4 @@ if (NOT LITE_WITH_X86) return() endif() -cc_library(target_wrapper_x86 SRCS target_wrapper.cc) - +lite_cc_library(target_wrapper_x86 SRCS target_wrapper.cc) diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 7eb663ea280..0d4c5c37e1d 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -6,7 +6,8 @@ cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memory DEPS malloc - memcpy) + memcpy + ) #if (WITH_GPU) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory) #endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index c309febd499..3dbbea3dd0b 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -37,19 +37,30 @@ else () set(AllocatorFacadeDeps) endif() -list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator) - cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator) +cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator) cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator) -cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS + ${AllocatorFacadeDeps} + cpu_allocator + locked_allocator + best_fit_allocator + aligned_allocator + auto_increment_allocator + zero_size_allocator + conditional_allocator + retry_allocator + buffered_allocator + allocator_strategy + legacy_allocator + ) nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade) -cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade) - cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator) + if (WITH_TESTING) set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") endif() diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h index 7cedad3d66c..064acd06e71 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.h +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -89,12 +89,11 @@ class AlignedAllocator : public ThinAlignedAllocator { using ThinAlignedAllocator::ThinAlignedAllocator; protected: - Allocation* AllocateImpl(size_t size) override { - auto raw_allocation =
underlying_allocator_->Allocate(size + kAlignment); + Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override { + auto raw_allocation = + underlying_allocator_->Allocate(size + kAlignment, attr); return new AlignedAllocation(std::move(raw_allocation), size); } - - void FreeImpl(Allocation* allocation) override { delete allocation; } }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc index 4998f3dbb96..8fb8a5fb897 100644 --- a/paddle/fluid/memory/allocation/allocator.cc +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -14,14 +14,29 @@ #include "paddle/fluid/memory/allocation/allocator.h" +#include + namespace paddle { namespace memory { namespace allocation { +Allocation::~Allocation() {} + +Allocator::~Allocator() {} bool Allocator::IsAllocThreadSafe() const { return false; } -void Allocator::FreeImpl(Allocation* allocation) { - Allocator* allocator = allocation->TopDecoratedAllocator(); +AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) { + auto ptr = AllocateImpl(size, attr); + ptr->set_allocator(this); + return AllocationPtr(ptr); +} + +void Allocator::Free(Allocation* allocation) { delete allocation; } + +const char* BadAlloc::what() const noexcept { return msg_.c_str(); } + +void AllocationDeleter::operator()(Allocation* allocation) const { + auto* allocator = allocation->allocator(); allocator->Free(allocation); } diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index d31f37268d9..3465278935f 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -15,10 +15,8 @@ #pragma once #include #include -#include #include #include -#include "paddle/fluid/framework/inlined_vector.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -28,73 +26,40 @@ namespace allocation { // Exception when `Alloc`/`AllocShared` failed class BadAlloc : public std::exception { public: - inline explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {} - - inline const char* what() const noexcept override { return msg_.c_str(); } + explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {} + const char* what() const noexcept override; private: std::string msg_; }; -class Allocator; +class Allocation; +class AllocationDeleter { + public: + void operator()(Allocation* allocation) const; +}; +class Allocator; // Allocation is the object holding the actually pointer. Use // `Allocation::ptr()` will returns the pointer that allocated. // // NOTE: this is the base class of Allocation. Each allocator can use its own // allocation object. // NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0 - -/** - * Allocation is returned by Allocator::Allocate() method. - * - * An allocator may be decorated by another allocator. For example, we can - * decorate a RetryAllocator to any allocator to perform allocation retry when - * first allocation request fails. - * - * Explanations of Allocator design are as follows: - * - * Suppose we have an allocator which is decorated by several allocators: - * - * A(1) <- A(2) <- A(3) <- ... <- A(n) - * - * , and the public allocator is A(1). - * - * The allocation process would be: - * - * A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate() - * - * , and the free process would be: - * - * A(1).Free() -> A(2).Free() -> ... 
-> A(n).Free() - * - * Therefore, we should record the allocator chain when allocating, so - * that we can free the allocation in the reverse order of allocator chain. - * The field `decorated_allocators_` is used to record this chain. - * - * Another example is that we want to add additional fields in Allocation, - * e.g., something what is done in AlignedAllocator, etc. - * In this case, we should declare a derived class of Allocation, which - * contains an underlying Allocation allocated by the underlying allocator. - * Therefore, `decorated_allocators_` of the new Allocation object would - * be a new chain, differing from the underlying Allocation object. - */ class Allocation { public: - inline Allocation(void* ptr, size_t size, platform::Place place) - : ptr_(ptr), size_(size), place_(place) {} + Allocation(void* ptr, size_t size, platform::Place place) + : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {} Allocation(const Allocation& o) = delete; Allocation& operator=(const Allocation& o) = delete; - Allocation(Allocation&& o) = delete; - Allocation& operator=(Allocation&& o) = delete; // Returns the holding pointer. // NOTE: For performance consideration, it is better not to make this method // as a virtual method. If we want to implement a `defragmentation` later, // we might need to make `ptr_` field as a protected field, and add a virtual // method like `defragmentation` to change `ptr_`. - inline void* ptr() const { return ptr_; } + void* ptr() const { return ptr_; } // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the // last valid element. @@ -105,84 +70,77 @@ class Allocation { // The raw pointer might not aligned, so an offset might be added to raw // the pointer. The size of this allocation will be // `size + kAlignemnt - offset`. - inline size_t size() const { return size_; } + size_t size() const { return size_; } - inline const platform::Place& place() const { return place_; } + const platform::Place& place() const { return place_; } - virtual ~Allocation() {} + Allocator* allocator() { return allocator_; } - private: - inline void RegisterDecoratedAllocator(Allocator* allocator) { - decorated_allocators_.emplace_back(allocator); - } + void set_allocator(Allocator* allocator) { allocator_ = allocator; } - inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); } - - inline Allocator* TopDecoratedAllocator() { - return decorated_allocators_.back(); - } + virtual ~Allocation(); private: + Allocator* allocator_; void* ptr_; size_t size_; platform::Place place_; - - /** - * NOTE(zjl): Since decorated_allocators_ is usually a small vector. - * We reserve a small buffer to it to prevent frequent heap allocation - * - * Instead, we can use a std::vector here, and reserve - * kReserveAllocatorNum in constructor of Allocation. - * But using std::vector would make ocr recognition model - * fail in CE. The train duration is 8% slower than KPI. - */ - static constexpr size_t kReserveAllocatorNum = 8; - using DecoratedAllocatorStack = - framework::InlinedVector; - - DecoratedAllocatorStack decorated_allocators_; - - friend class Allocator; }; +using AllocationPtr = std::unique_ptr; + // Base interface class of memory Allocator. +// To allocate a memory, allocator needs two parameters: +// 1. size of bytes. +// 2. Attribute of memory. +// NOTE: the attribute of memory might be ignored if the allocator does not +// care it. 
class Allocator { public: - virtual ~Allocator() {} - - class AllocationDeleter { - public: - inline void operator()(Allocation* allocation) const { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); - } + enum Attr { + kDefault = 0, // Default attribute. Uses the fastest or most stable + // allocation algorithm. + + kFixedHuge = 1, // The allocation may not be freed until the program + // ends. e.g., `Parameters` and `Momentum`. + + kFluxHuge = 2, // The allocation may be created and freed frequently, + // and it is considerably huge. Like `activations` + // and gradients. + + kScratchpad = + 3, // The `Scratchpad` memory is allocated and freed very quickly, + // usually within an operator or as aux memory. + // Like CUDNN workspace, AUX memory in batch norm, etc. + // + // https://en.wikipedia.org/wiki/Scratchpad_memory + + kCrossDevice = + 4, // The memory is used for cross-device copy/communication. + // For example: + // 1. it can use a `pinned` memory for CPU-GPU + // communication. + // 2. it can use a `registered` memory for RDMA + // communication. + + NumOfAttrs = 5 // The number of all attributes. It is used internally. }; - using AllocationPtr = std::unique_ptr; + virtual ~Allocator(); // Allocate an allocation. - inline AllocationPtr Allocate(size_t size) { - auto ptr = AllocateImpl(size); - ptr->RegisterDecoratedAllocator(this); - return AllocationPtr(ptr); - } - - // This function should not be called outside Allocator class - inline void Free(Allocation* allocation) { - allocation->PopDecoratedAllocator(); - FreeImpl(allocation); - } + AllocationPtr Allocate(size_t size, Allocator::Attr attr = kDefault); // True if the `Allocate` is thread safe. virtual bool IsAllocThreadSafe() const; protected: - virtual Allocation* AllocateImpl(size_t size) = 0; - virtual void FreeImpl(Allocation* allocation); -}; + virtual void Free(Allocation* allocation); + virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0; -using AllocationDeleter = Allocator::AllocationDeleter; -using AllocationPtr = Allocator::AllocationPtr; + private: + friend class AllocationDeleter; +}; } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 440b2475f16..a3b73e3ba31 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -29,6 +29,7 @@ #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/memory/allocation/locked_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" +#include "paddle/fluid/memory/allocation/zero_size_allocator.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -48,17 +49,6 @@ namespace paddle { namespace memory { namespace allocation { -static inline std::shared_ptr<Allocator> WrapRetryAllocator( - std::shared_ptr<Allocator> allocator, int64_t retry_time) { - if (retry_time > 0) { - auto* retry_allocator = - new RetryAllocator(std::move(allocator), retry_time); - allocator.reset(retry_allocator); - } - - return allocator; -} - // TODO(yy): Dirty code here. This class should be configurable in runtime.
class CPUManagedAllocator : public Allocator { public: @@ -67,8 +57,8 @@ class CPUManagedAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation* AllocateImpl(size_t size) override { - return normal_allocator_->Allocate(size).release(); + Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override { + return normal_allocator_->Allocate(size, attr).release(); } private: @@ -101,10 +91,11 @@ class ChunkedAllocator : public Allocator { auto* cond_allocator = new ConditionalAllocator(); cond_allocator - ->AddAllocator([this](size_t size) { return size < max_chunk_size_; }, - default_allocator_) + ->AddAllocator( + [this](size_t size, Attr attr) { return size < max_chunk_size_; }, + default_allocator_) .AddAllocator( - [](size_t size) { + [](size_t size, Attr attr) { return true; // default case }, raw_allocator_); @@ -121,10 +112,14 @@ class ChunkedAllocator : public Allocator { std::shared_ptr CreateAllocatorWithChunk() { chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_)); auto* allocation = chunks_.back().get(); - std::shared_ptr allocator(new LockedAllocator( - std::shared_ptr(new BestFitAllocator(allocation)))); + std::unique_ptr allocator(new LockedAllocator( + std::unique_ptr(new BestFitAllocator(allocation)))); - allocator = WrapRetryAllocator(allocator, retry_time_); + if (retry_time_ > 0) { + auto* retry_allocator = + new RetryAllocator(std::move(allocator), retry_time_); + allocator.reset(retry_allocator); + } return std::make_shared>(std::move(allocator)); } @@ -132,8 +127,8 @@ class ChunkedAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation* AllocateImpl(size_t size) override { - return default_allocator_->Allocate(size).release(); + Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override { + return default_allocator_->Allocate(size, attr).release(); } protected: @@ -190,36 +185,19 @@ class CUDAPinnedChunkedAllocator : public ChunkedAllocator { class AllocatorFacadePrivate { public: - AllocatorFacadePrivate() { - auto strategy = GetAllocatorStrategy(); - switch (strategy) { - case AllocatorStrategy::kLegacy: { - InitLegacyAllocator(); - break; - } - case AllocatorStrategy::kNaiveBestFit: { - InitCPUAllocator(); - InitCUDAAllocator(); - InitCUDAPinnedAllocator(); - break; - } - default: { - PADDLE_THROW("Unsupported allocator strategy: %d", - static_cast(strategy)); - } - } - InitZeroSizeAllocators(); - } + std::map> allocators_; + + ~AllocatorFacadePrivate() = default; - inline const std::shared_ptr& GetAllocator( - const platform::Place& place, size_t size) { - const auto& allocators = (size > 0 ? 
allocators_ : zero_size_allocators_); - auto iter = allocators.find(place); - if (iter == allocators.end()) { - throw BadAlloc( - string::Sprintf("No such allocator for the place, %s", place)); + AllocatorFacadePrivate() { + if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) { + InitLegacyAllocator(); + } else { + InitCPUAllocator(); + InitCUDAAllocator(); + InitCUDAPinnedAllocator(); + WrapZeroSizeAllocator(); } - return iter->second; } private: @@ -257,40 +235,12 @@ class AllocatorFacadePrivate { #endif } - class ZeroSizeAllocator : public Allocator { - public: - explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} - - protected: - Allocation* AllocateImpl(size_t size) override { - return new Allocation(nullptr, 0, place_); - } - - void FreeImpl(Allocation* allocation) override { delete allocation; } - - private: - platform::Place place_; - }; - - void InitZeroSizeAllocators() { - std::vector places; - places.emplace_back(platform::CPUPlace()); -#ifdef PADDLE_WITH_CUDA - int device_count = platform::GetCUDADeviceCount(); - for (int dev_id = 0; dev_id < device_count; ++dev_id) { - places.emplace_back(platform::CUDAPlace(dev_id)); - } - places.emplace_back(platform::CUDAPinnedPlace()); -#endif - - for (auto& p : places) { - zero_size_allocators_[p] = std::make_shared(p); + void WrapZeroSizeAllocator() { + for (auto& pair : allocators_) { + pair.second = + std::make_shared(pair.second, pair.first); } } - - private: - std::map> allocators_; - std::map> zero_size_allocators_; }; // Pimpl. Make interface clean. @@ -303,13 +253,19 @@ AllocatorFacade& AllocatorFacade::Instance() { } std::shared_ptr AllocatorFacade::AllocShared( - const platform::Place& place, size_t size) { - return std::shared_ptr(Alloc(place, size)); + const platform::Place& place, size_t size, Allocator::Attr attr) { + return std::shared_ptr(Alloc(place, size, attr).release(), + AllocationDeleter()); } -AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, - size_t size) { - return m_->GetAllocator(place, size)->Allocate(size); +AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, + Allocator::Attr attr) { + auto it = m_->allocators_.find(place); + if (it == m_->allocators_.end()) { + throw BadAlloc( + string::Sprintf("No such allocator for the place, %s", place)); + } + return m_->allocators_.at(place)->Allocate(size, attr); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 64b6fe25c35..16da30bec0d 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -38,11 +38,13 @@ class AllocatorFacade { static AllocatorFacade& Instance(); // Allocate a shared allocation. - std::shared_ptr AllocShared(const platform::Place& place, - size_t size); + std::shared_ptr AllocShared( + const platform::Place& place, size_t size, + Allocator::Attr attr = Allocator::kDefault); // Allocate a unique allocation. - AllocationPtr Alloc(const platform::Place& place, size_t size); + AllocationPtr Alloc(const platform::Place& place, size_t size, + Allocator::Attr attr = Allocator::kDefault); // TODO(yy): Allocate a Copy-On-Write allocation? 
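+  // Usage sketch (hypothetical call site; Allocator::kScratchpad is one of +  // the attributes declared in allocator.h): +  //   auto buf = AllocatorFacade::Instance().Alloc( +  //       platform::CPUPlace(), 1024, Allocator::kScratchpad);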
 private: diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc index fff94c01e70..8cebda9005b 100644 --- a/paddle/fluid/memory/allocation/allocator_strategy.cc +++ b/paddle/fluid/memory/allocation/allocator_strategy.cc @@ -19,22 +19,16 @@ DEFINE_string( allocator_strategy, "legacy", "The allocation strategy. Legacy means the original allocator of Fluid." - "naive_best_fit means the experimental best fit allocator. " - "allocator. Enum in [legacy, naive_best_fit]."); + "New means the experimental allocators of Fluid. Enum in [legacy, new]."); namespace paddle { namespace memory { namespace allocation { static AllocatorStrategy GetStrategyFromFlag() { - if (FLAGS_allocator_strategy == "legacy") { - return AllocatorStrategy::kLegacy; - } else if (FLAGS_allocator_strategy == "naive_best_fit") { - return AllocatorStrategy::kNaiveBestFit; - } else { - PADDLE_THROW("Unsupported allocator strategy: %s", - FLAGS_allocator_strategy); - } + return FLAGS_allocator_strategy == "legacy" + ? AllocatorStrategy::kLegacy + : AllocatorStrategy::kNaiveBestFit; } AllocatorStrategy GetAllocatorStrategy() { diff --git a/paddle/fluid/memory/allocation/auto_increment_allocator.cc b/paddle/fluid/memory/allocation/auto_increment_allocator.cc index bafa82f18c7..c4785d20786 100644 --- a/paddle/fluid/memory/allocation/auto_increment_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_increment_allocator.cc @@ -34,13 +34,14 @@ std::shared_ptr<Allocator> AutoIncrementAllocator::CreateNewAllocator() { "bug."); return underlying_allocators_[old_size]; } -Allocation *AutoIncrementAllocator::AllocateImpl(size_t size) { +Allocation *AutoIncrementAllocator::AllocateImpl(size_t size, + Allocator::Attr attr) { auto cur = prev_success_allocator_.load(); size_t retry_count = allocator_num_.load(); size_t allocator_num = retry_count; while (retry_count-- > 0) { // until the retry count reaches zero try { - auto res = underlying_allocators_[cur]->Allocate(size); + auto res = underlying_allocators_[cur]->Allocate(size, attr); prev_success_allocator_ = cur; return res.release(); } catch (BadAlloc &) { @@ -60,7 +61,7 @@ Allocation *AutoIncrementAllocator::AllocateImpl(size_t size) { // the newly created allocator by the first allocation request.
for (cur = allocator_num; cur < allocator_num_; ++cur) { try { - auto ret = underlying_allocators_[cur]->Allocate(size); + auto ret = underlying_allocators_[cur]->Allocate(size, attr); prev_success_allocator_ = cur; return ret.release(); } catch (BadAlloc &) { @@ -69,7 +70,7 @@ Allocation *AutoIncrementAllocator::AllocateImpl(size_t size) { } } // No suitable allocator - return CreateNewAllocator()->Allocate(size).release(); + return CreateNewAllocator()->Allocate(size, attr).release(); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/auto_increment_allocator.h b/paddle/fluid/memory/allocation/auto_increment_allocator.h index 068cda473d6..c179004eacb 100644 --- a/paddle/fluid/memory/allocation/auto_increment_allocator.h +++ b/paddle/fluid/memory/allocation/auto_increment_allocator.h @@ -61,7 +61,7 @@ class AutoIncrementAllocator : public Allocator { std::shared_ptr CreateNewAllocator(); protected: - Allocation* AllocateImpl(size_t size) override; + Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: AllocatorCreator creator_; diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 72ee4e5411c..e3d6c2f511e 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const { } return num; } -void BestFitAllocator::FreeImpl(Allocation* allocation) { +void BestFitAllocator::Free(Allocation* allocation) { auto* bf_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(bf_allocation, "The input allocation is not BestFitAllocation."); @@ -140,7 +140,7 @@ void BestFitAllocator::FreeImpl(Allocation* allocation) { InsertFreeNode(chunk_it); delete allocation; } -Allocation* BestFitAllocator::AllocateImpl(size_t size) { +Allocation* BestFitAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { auto highest_set_bit = static_cast(HighestBitPos(size)); MapIt map_it; for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 64a552e4fd2..4f10f2b53e8 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -119,8 +119,8 @@ class BestFitAllocator : public Allocator { void InsertFreeNode(const ListIt& it); protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void Free(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: Allocation* allocation_; // not owned diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc index 7e5207e6345..d23a88991b6 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc @@ -35,10 +35,10 @@ class StubAllocation : public Allocation { TEST(BestFitAllocator, test_allocation) { StubAllocation stub(4UL * 1024 * 1024 * 1024); BestFitAllocator allocator(&stub); - { auto allocation = allocator.Allocate(64); } + { auto allocation = allocator.Allocate(64, allocator.kDefault); } { - auto allocation = allocator.Allocate(80); + auto allocation = allocator.Allocate(80, allocator.kDefault); { auto best_fit_allocation = @@ -50,10 +50,10 @@ TEST(BestFitAllocator, 
test_allocation) { ASSERT_EQ(allocation->ptr(), nullptr); } - auto allocation2 = allocator.Allocate(60); - auto allocation3 = allocator.Allocate(90); + auto allocation2 = allocator.Allocate(60, allocator.kDefault); + auto allocation3 = allocator.Allocate(90, allocator.kDefault); allocation2.reset(); - allocation2 = allocator.Allocate(30); + allocation2 = allocator.Allocate(30, allocator.kDefault); { auto best_fit_allocation = @@ -61,7 +61,7 @@ TEST(BestFitAllocator, test_allocation) { ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80); } allocation2.reset(); - allocation2 = allocator.Allocate(60); + allocation2 = allocator.Allocate(60, allocator.kDefault); { auto best_fit_allocation = @@ -72,7 +72,7 @@ TEST(BestFitAllocator, test_allocation) { allocation.reset(); allocation2.reset(); - allocation = allocator.Allocate(80 + 60); + allocation = allocator.Allocate(80 + 60, allocator.kDefault); { auto best_fit_allocation = dynamic_cast(allocation.get()); @@ -81,8 +81,8 @@ TEST(BestFitAllocator, test_allocation) { allocation.reset(); - allocation = allocator.Allocate(80); - allocation2 = allocator.Allocate(60); + allocation = allocator.Allocate(80, allocator.kDefault); + allocation2 = allocator.Allocate(60, allocator.kDefault); allocation = nullptr; allocation2 = nullptr; allocation3 = nullptr; @@ -93,7 +93,8 @@ TEST(BestFitAllocator, test_allocation) { TEST(BestFitAllocator, test_concurrent_cpu_allocation) { CPUAllocator allocator; - auto global_allocation = allocator.Allocate(256UL * 1024 * 1024); + auto global_allocation = + allocator.Allocate(256UL * 1024 * 1024, allocator.kDefault); std::unique_ptr best_fit_allocator( new BestFitAllocator(global_allocation.get())); @@ -107,8 +108,8 @@ TEST(BestFitAllocator, test_concurrent_cpu_allocation) { for (size_t i = 0; i < 128; ++i) { size_t allocate_size = dist(engine); - auto allocation = - locked_allocator.Allocate(sizeof(size_t) * allocate_size); + auto allocation = locked_allocator.Allocate( + sizeof(size_t) * allocate_size, locked_allocator.kDefault); size_t* data = reinterpret_cast(allocation->ptr()); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index eb24ba84c88..acace011c7a 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -37,7 +37,8 @@ struct ForEachFill { TEST(BestFitAllocator, concurrent_cuda) { CUDAAllocator allocator(platform::CUDAPlace(0)); // 256 MB - auto cuda_allocation = allocator.Allocate(256U * 1024 * 1024); + auto cuda_allocation = + allocator.Allocate(256U * 1024 * 1024, allocator.kDefault); LockedAllocator concurrent_allocator( std::unique_ptr(new BestFitAllocator(cuda_allocation.get()))); @@ -50,8 +51,8 @@ TEST(BestFitAllocator, concurrent_cuda) { for (size_t i = 0; i < 128; ++i) { size_t allocate_size = dist(engine); - auto allocation = - concurrent_allocator.Allocate(sizeof(size_t) * allocate_size); + auto allocation = concurrent_allocator.Allocate( + sizeof(size_t) * allocate_size, concurrent_allocator.kDefault); size_t* data = reinterpret_cast(allocation->ptr()); diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc index d80616b7a8d..fc75abc9dfe 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -16,16 +16,17 @@ #include #include #include +#include 
"paddle/fluid/memory/allocation/allocation_with_underlying.h" namespace paddle { namespace memory { namespace allocation { -BufferedAllocator::BufferedAllocator(std::shared_ptr allocator) +BufferedAllocator::BufferedAllocator(std::unique_ptr &&allocator) : underlying_allocator_(std::move(allocator)) { PADDLE_ENFORCE_NOT_NULL( underlying_allocator_, - "Underlying allocator of BufferedAllocator must not be null"); + "Underlying allocator of BufferedAllocator must be unmanaged"); if (underlying_allocator_->IsAllocThreadSafe()) { mtx_.reset(new std::mutex()); } @@ -40,35 +41,37 @@ void BufferedAllocator::FreeCache(size_t size) { while (!allocations_.empty()) { // free the largest auto it = --allocations_.end(); cur += it->second->size(); - underlying_allocator_->Free(it->second.release()); + delete it->second.release(); allocations_.erase(it); if (cur >= size) return; } } -bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } - -void BufferedAllocator::FreeImpl(Allocation *allocation) { +bool BufferedAllocator::IsAllocThreadSafe() const { + return this->underlying_allocator_->IsAllocThreadSafe(); +} +void BufferedAllocator::Free(Allocation *allocation) { platform::LockGuardPtr guard(mtx_); allocations_.emplace(allocation->size(), AllocationPtr(allocation)); } - -Allocation *BufferedAllocator::AllocateImpl(size_t size) { +Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { { platform::LockGuardPtr guard(mtx_); auto it = allocations_.lower_bound(size); if (it != allocations_.end() && it->first < size * 2) { AllocationPtr result(std::move(it->second)); allocations_.erase(it); - return result.release(); + return new AllocationWithUnderlying(std::move(result)); } } try { - return underlying_allocator_->Allocate(size).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); } catch (BadAlloc &) { FreeCache(size); - return underlying_allocator_->Allocate(size).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); } } diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h index fd0996f7748..d44a3f85beb 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.h +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -31,7 +31,7 @@ namespace allocation { // underlying_allocator_ class BufferedAllocator : public Allocator { public: - explicit BufferedAllocator(std::shared_ptr allocator); + explicit BufferedAllocator(std::unique_ptr &&allocator); ~BufferedAllocator(); @@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator { void FreeCache(size_t size); protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void Free(Allocation *allocation) override; + Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr underlying_allocator_; + std::unique_ptr underlying_allocator_; std::multimap allocations_; std::unique_ptr mtx_; }; diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc index e4825233d58..c8bd5292ca0 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/buffered_allocator.h" #include +#include #include #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include 
"paddle/fluid/memory/allocation/cpu_allocator.h" @@ -36,7 +37,7 @@ inline std::unique_ptr GetBufferedAllocator( TEST(buffered_allocator, thread_safety) { std::unique_ptr allocator(new CPUAllocator()); - auto chunk = allocator->Allocate(1 << 20); + auto chunk = allocator->Allocate(1 << 20, allocator->kDefault); { auto buf_allocator = GetBufferedAllocator(chunk.get(), true); ASSERT_EQ(buf_allocator->IsAllocThreadSafe(), true); @@ -65,14 +66,14 @@ class StubAllocator : public Allocator { size_t GetFreeCount() const { return destruct_count_; } protected: - void FreeImpl(Allocation *allocation) override { + void Free(Allocation *allocation) override { auto *alloc = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(alloc); if (alloc->ptr()) delete[] static_cast(alloc->ptr()); ++destruct_count_; delete allocation; } - Allocation *AllocateImpl(size_t size) override { + Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override { ++construct_count_; if (size == 0) { return new StubAllocation(nullptr, 0, platform::CPUPlace()); @@ -98,7 +99,7 @@ TEST(buffered_allocator, lazy_free) { { underlying_allocator->ResetCounter(); - auto x = allocator->Allocate(1025); + auto x = allocator->Allocate(1025, allocator->kDefault); ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne); ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero); x = nullptr; @@ -107,10 +108,10 @@ TEST(buffered_allocator, lazy_free) { { underlying_allocator->ResetCounter(); - auto x = allocator->Allocate(900); + auto x = allocator->Allocate(900, allocator->kDefault); ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero); ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero); - auto y = allocator->Allocate(2048); + auto y = allocator->Allocate(2048, allocator->kDefault); ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne); ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero); x = nullptr; @@ -129,13 +130,13 @@ TEST(buffered_allocator, lazy_free) { TEST(buffered_allocator, garbage_collection) { std::unique_ptr cpu_allocator(new CPUAllocator()); - auto chunk = cpu_allocator->Allocate(2048); + auto chunk = cpu_allocator->Allocate(2048, cpu_allocator->kDefault); auto allocator = GetBufferedAllocator(chunk.get(), false); - auto x1 = allocator->Allocate(1600); - auto x2 = allocator->Allocate(400); + auto x1 = allocator->Allocate(1600, allocator->kDefault); + auto x2 = allocator->Allocate(400, allocator->kDefault); x1 = nullptr; x2 = nullptr; - auto x3 = allocator->Allocate(1600); + auto x3 = allocator->Allocate(1600, allocator->kDefault); ASSERT_NE(x3, nullptr); ASSERT_NE(x3->ptr(), nullptr); } diff --git a/paddle/fluid/memory/allocation/conditional_allocator.cc b/paddle/fluid/memory/allocation/conditional_allocator.cc index 373afb1bd6e..59e30f16fa0 100644 --- a/paddle/fluid/memory/allocation/conditional_allocator.cc +++ b/paddle/fluid/memory/allocation/conditional_allocator.cc @@ -20,7 +20,8 @@ namespace memory { namespace allocation { ConditionalAllocator& ConditionalAllocator::AddAllocator( - std::function func, std::shared_ptr allocator) { + std::function func, + std::shared_ptr allocator) { underlying_allocators_.emplace_back(std::move(func), std::move(allocator)); return *this; } @@ -33,10 +34,11 @@ bool ConditionalAllocator::IsAllocThreadSafe() const { }); } -Allocation* ConditionalAllocator::AllocateImpl(size_t size) { +Allocation* ConditionalAllocator::AllocateImpl(size_t size, + Allocator::Attr attr) { for (auto& pair : underlying_allocators_) { - if (pair.first(size)) { - return 
pair.second->Allocate(size).release(); + if (pair.first(size, attr)) { + return pair.second->Allocate(size, attr).release(); } } throw BadAlloc("No suitable allocator"); diff --git a/paddle/fluid/memory/allocation/conditional_allocator.h b/paddle/fluid/memory/allocation/conditional_allocator.h index 61c3670803a..73ec006d142 100644 --- a/paddle/fluid/memory/allocation/conditional_allocator.h +++ b/paddle/fluid/memory/allocation/conditional_allocator.h @@ -29,10 +29,13 @@ namespace allocation { // For example: // // auto* cond_allocator = new ConditionalAllocator(); -// cond_allocator->AddAllocator([](size_t size){ +// cond_allocator->AddAllocator([](size_t size, Attr attr){ // // if size > 10 // return size > 10; -// }, allocator_b).AddAllocator([](size_t size){ +// }, allocator_a).AddAllocator([](size_t size, Attr attr){ +// // elif attr is kDefault +// return attr == kDefault; +// }, allocator_b).AddAllocator([](size_t size, Attr attr){ // // else // return true; // }, allocator_c); @@ -40,17 +43,17 @@ class ConditionalAllocator : public Allocator { public: ConditionalAllocator() = default; - ConditionalAllocator& AddAllocator(std::function func, + ConditionalAllocator& AddAllocator(std::function func, std::shared_ptr allocator); bool IsAllocThreadSafe() const override; protected: - Allocation* AllocateImpl(size_t size) override; + Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: using AllocatorWithCond = - std::pair, std::shared_ptr>; + std::pair, std::shared_ptr>; std::vector underlying_allocators_; }; diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index 580cf1af56a..cc81a6f7b8b 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -20,27 +20,25 @@ namespace paddle { namespace memory { namespace allocation { +CPUAllocation::CPUAllocation(void *ptr, size_t size) + : Allocation(ptr, size, platform::CPUPlace()) {} + bool CPUAllocator::IsAllocThreadSafe() const { return true; } -void CPUAllocator::FreeImpl(Allocation *allocation) { - void *p = allocation->ptr(); -#ifdef _WIN32 - _aligned_free(p); -#else - free(p); -#endif +void CPUAllocator::Free(Allocation *allocation) { + PADDLE_ENFORCE_NOT_NULL(dynamic_cast(allocation)); + free(allocation->ptr()); delete allocation; } -Allocation *CPUAllocator::AllocateImpl(size_t size) { - void *p; -#ifdef _WIN32 - p = _aligned_malloc(size, kAlignment); -#else - PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!", - size); -#endif - return new Allocation(p, size, platform::CPUPlace()); +Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { + void *ptr; + auto status = posix_memalign(&ptr, kAlignment, size); + if (UNLIKELY(status) != 0) { + throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d", + size, status)); + } + return new CPUAllocation(ptr, size); } } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 058ff633816..26d3643f4ed 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -31,14 +31,20 @@ namespace allocation { // // NOTE(yy): There is no need to use `BestFitAllocator` on CPU. We can import // an open-sourced allocator into Paddle.
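Two details of the cpu_allocator.cc hunk above are easy to miss: the Windows-specific `_aligned_malloc`/`_aligned_free` branches are deleted, leaving `posix_memalign` as the only path, and allocation failure now throws BadAlloc instead of aborting through PADDLE_ENFORCE_EQ. For reference, a hedged sketch of the portable pattern the removed branches implemented (the helper name is illustrative):

#include <cstdlib>

// Returns `size` bytes aligned to `alignment`, or nullptr on failure.
// `alignment` must be a power of two and a multiple of sizeof(void*).
void* AlignedAlloc(size_t size, size_t alignment) {
#ifdef _WIN32
  return _aligned_malloc(size, alignment);  // release with _aligned_free()
#else
  void* p = nullptr;
  return posix_memalign(&p, alignment, size) == 0 ? p : nullptr;  // release with free()
#endif
}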
+class CPUAllocator; +class CPUAllocation : public Allocation { + public: + CPUAllocation(void* ptr, size_t size); +}; + class CPUAllocator : public Allocator { public: - constexpr static size_t kAlignment = 4096UL; + constexpr static size_t kAlignment = 64u; bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void Free(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; }; } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 349c71cece1..430bf0be98e 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -23,15 +23,16 @@ namespace paddle { namespace memory { namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } -void CUDAAllocator::FreeImpl(Allocation* allocation) { +void CUDAAllocator::Free(Allocation* allocation) { platform::CUDADeviceGuard guard(place_.device); - PADDLE_ENFORCE_EQ(boost::get(allocation->place()), + auto* cuda_allocation = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(cuda_allocation); + PADDLE_ENFORCE_EQ(boost::get(cuda_allocation->place()), place_); PADDLE_ENFORCE(cudaFree(allocation->ptr())); delete allocation; } - -Allocation* CUDAAllocator::AllocateImpl(size_t size) { +Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { platform::CUDADeviceGuard guard(place_.device); void* ptr; auto status = cudaMalloc(&ptr, size); @@ -40,9 +41,8 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device, status, cudaGetErrorString(status))); } - return new Allocation(ptr, size, platform::Place(place_)); + return new CUDAAllocation(ptr, size, platform::Place(place_)); } - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 886f6e7a327..63726f5820b 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -20,6 +20,13 @@ namespace paddle { namespace memory { namespace allocation { +// CUDA System allocator and allocation. +// Just a flag type. 
+class CUDAAllocation : public Allocation { + public: + using Allocation::Allocation; +}; + class CUDAAllocator : public Allocator { public: explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {} @@ -28,8 +35,8 @@ class CUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void Free(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 4adc0aabf4f..2ecb44ff15f 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -200,12 +200,12 @@ void *Alloc(const platform::CUDAPlace &place, platform::GpuMemoryUsage(&avail, &total); LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size) << " in GPU " << place.device << ", available " - << string::HumanReadableSize(avail) << ", total " - << string::HumanReadableSize(total) << ", GpuMinChunkSize " + << string::HumanReadableSize(avail) << "total " << total + << "GpuMinChunkSize " << string::HumanReadableSize(buddy_allocator->GetMinChunkSize()) - << ", GpuMaxChunkSize " + << "GpuMaxChunkSize " << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()) - << ", GPU memory used: " + << "GPU memory used: " << string::HumanReadableSize(Used(place)); } else { if (FLAGS_benchmark) { @@ -339,7 +339,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { namespace allocation { LegacyMemMonitor GPUMemMonitor; -Allocation *LegacyAllocator::AllocateImpl(size_t size) { +Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); auto *tmp_alloc = new Allocation(ptr, size, place_); platform::MemEvenRecorder::Instance().PushMemRecord( @@ -347,7 +347,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size) { return tmp_alloc; } -void LegacyAllocator::FreeImpl(Allocation *allocation) { +void LegacyAllocator::Free(Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h index c7efb5fd2e5..d9bdae153da 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.h +++ b/paddle/fluid/memory/allocation/legacy_allocator.h @@ -72,8 +72,8 @@ class LegacyAllocator : public Allocator { explicit LegacyAllocator(const platform::Place &p) : place_(p) {} protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; + void Free(Allocation *allocation) override; private: platform::Place place_; diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc index a912807645b..62d768c5806 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.cc +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -15,8 +15,8 @@ #include "paddle/fluid/memory/allocation/locked_allocator.h" #include // NOLINT #include +#include "paddle/fluid/memory/allocation/allocation_with_underlying.h" #include "paddle/fluid/platform/lock_guard_ptr.h" - namespace paddle { namespace memory { 
namespace allocation { @@ -24,24 +24,26 @@ namespace allocation { bool LockedAllocator::IsAllocThreadSafe() const { return true; } LockedAllocator::LockedAllocator( - std::shared_ptr underlying_allocator) + std::unique_ptr &&underlying_allocator) : underlying_allocator_(std::move(underlying_allocator)) { PADDLE_ENFORCE_NOT_NULL(underlying_allocator_); if (!underlying_allocator_->IsAllocThreadSafe()) { mtx_.reset(new std::mutex()); } } - -void LockedAllocator::FreeImpl(Allocation *allocation) { - platform::LockGuardPtr guard(mtx_); - underlying_allocator_->Free(allocation); +void LockedAllocator::Free(Allocation *allocation) { + { + platform::LockGuardPtr guard(mtx_); + reinterpret_cast(allocation) + ->allocation_.reset(); // Destroy inner allocation + } + delete allocation; } - -Allocation *LockedAllocator::AllocateImpl(size_t size) { +Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { platform::LockGuardPtr guard(mtx_); - return underlying_allocator_->Allocate(size).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); } - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h index 4af77e6e057..4967b9bb8d3 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.h +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -24,15 +24,15 @@ namespace allocation { // An allocator to make the underlying allocator thread safe. class LockedAllocator : public Allocator { public: - explicit LockedAllocator(std::shared_ptr underlying_allocator); + explicit LockedAllocator(std::unique_ptr &&underlying_allocator); bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void Free(Allocation *allocation) override; + Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr underlying_allocator_; + std::unique_ptr underlying_allocator_; std::unique_ptr mtx_; }; diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 35391167fe6..de81d12cca6 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -20,14 +20,20 @@ namespace paddle { namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } -void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void CPUPinnedAllocator::Free(Allocation *allocation) { + PADDLE_ENFORCE_NOT_NULL(dynamic_cast(allocation)); PADDLE_ENFORCE(cudaFreeHost(allocation->ptr())); delete allocation; } -Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { +Allocation *CPUPinnedAllocator::AllocateImpl(size_t size, + Allocator::Attr attr) { + // PADDLE_ENFORCE_EQ( + // attr, kCrossDevice, + // "CPUPinnedAllocator should be used for Cross-Device Communication"); + void *ptr; PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); - return new Allocation(ptr, size, platform::CUDAPinnedPlace()); + return new CPUPinnedAllocation(ptr, size); } } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index 4f535ef3373..42d0938f2af 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@
-20,13 +20,19 @@ namespace memory { namespace allocation { // Allocator uses `cudaHostAlloc` +class CPUPinnedAllocation : public Allocation { + public: + CPUPinnedAllocation(void *ptr, size_t size) + : Allocation(ptr, size, platform::CUDAPinnedPlace()) {} +}; + class CPUPinnedAllocator : public Allocator { public: bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void Free(Allocation *allocation) override; + Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index bf14ed5db10..981705051b4 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -13,19 +13,30 @@ // limitations under the License. #include "paddle/fluid/memory/allocation/retry_allocator.h" +#include "paddle/fluid/memory/allocation/allocation_with_underlying.h" namespace paddle { namespace memory { namespace allocation { -void RetryAllocator::FreeImpl(Allocation* allocation) { +bool RetryAllocator::IsAllocThreadSafe() const { + return underlying_allocator_->IsAllocThreadSafe(); +} + +void RetryAllocator::Free(Allocation* allocation) { // Delete underlying allocation first. - underlying_allocator_->Free(allocation); - cv_.notify_all(); + reinterpret_cast(allocation)->allocation_.reset(); + { + // notify all waiting allocators; they can try to allocate memory after the free. + std::lock_guard lock(mutex_); + cv_.notify_all(); + } + delete allocation; } -Allocation* RetryAllocator::AllocateImpl(size_t size) { +Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { auto alloc_func = [&, this]() { - return underlying_allocator_->Allocate(size).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); }; // In fact, we can unify the code of allocation success and failure // But it would add a lock even when allocation succeeds the first time diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 7840a834472..6ab8ca8fbec 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -25,25 +25,32 @@ namespace paddle { namespace memory { namespace allocation { +class RetryAllocator; + class RetryAllocator : public Allocator { public: - RetryAllocator(std::shared_ptr allocator, size_t retry_ms) + RetryAllocator(std::unique_ptr&& allocator, size_t retry_ms) : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) { + EnforceCheck(); + } + + bool IsAllocThreadSafe() const override; + + private: + void EnforceCheck() { PADDLE_ENFORCE_NOT_NULL( - underlying_allocator_, - "UnderlyingAllocator of RetryAllocator must not be null"); + underlying_allocator_.get(), + "UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator"); PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(), "UnderlyingAllocator of RetryAllocator must be thread-safe"); } - bool IsAllocThreadSafe() const override { return true; } - protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void Free(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr underlying_allocator_; + std::unique_ptr underlying_allocator_;
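// NOTE(review): the Free() hunk above takes mutex_ and calls
// cv_.notify_all(), but the matching wait lives outside this hunk. A hedged
// sketch of the usual shape of that retry path, reusing the members declared
// here (the real AllocateImpl body may differ):
//
//   try {
//     return new AllocationWithUnderlying(
//         underlying_allocator_->Allocate(size, attr));
//   } catch (BadAlloc&) {
//     // Block until a Free() signals cv_ or retry_time_ expires,
//     // then try exactly once more before propagating BadAlloc.
//     std::unique_lock<std::mutex> lock(mutex_);
//     cv_.wait_for(lock, retry_time_);
//     return new AllocationWithUnderlying(
//         underlying_allocator_->Allocate(size, attr));
//   }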
std::chrono::milliseconds retry_time_; std::mutex mutex_; std::condition_variable cv_; @@ -51,6 +58,8 @@ class RetryAllocator : public Allocator { // For debug, We can add an atomic integer to record how many memory sizes are // waited to allocate // std::atomic waited_allocate_size_{0}; + + friend class RetryAllocation; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index 4ac08d442d4..345b5f44d3d 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -32,7 +32,7 @@ TEST(RetryAllocator, RetryAllocator) { CPUAllocator cpu_allocator; size_t size = (1 << 20); - auto cpu_allocation = cpu_allocator.Allocate(size); + auto cpu_allocation = cpu_allocator.Allocate(size, cpu_allocator.kDefault); std::unique_ptr best_fit_allocator( new BestFitAllocator(cpu_allocation.get())); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 5884433aaff..e414ad657a9 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -21,12 +21,13 @@ limitations under the License. */ namespace paddle { namespace memory { std::shared_ptr AllocShared(const platform::Place& place, - size_t size) { - return allocation::AllocatorFacade::Instance().AllocShared(place, size); + size_t size, Allocator::Attr attr) { + return allocation::AllocatorFacade::Instance().AllocShared(place, size, attr); } -AllocationPtr Alloc(const platform::Place& place, size_t size) { - return allocation::AllocatorFacade::Instance().Alloc(place, size); +AllocationPtr Alloc(const platform::Place& place, size_t size, + Allocator::Attr attr) { + return allocation::AllocatorFacade::Instance().Alloc(place, size, attr); } } // namespace memory diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 6731203fccb..916538b2a65 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -23,10 +23,12 @@ using allocation::Allocation; using allocation::Allocator; using allocation::AllocationPtr; -extern std::shared_ptr AllocShared(const platform::Place& place, - size_t size); +extern std::shared_ptr AllocShared( + const platform::Place& place, size_t size, + Allocator::Attr attr = Allocator::kDefault); -extern AllocationPtr Alloc(const platform::Place& place, size_t size); +extern AllocationPtr Alloc(const platform::Place& place, size_t size, + Allocator::Attr attr = Allocator::kDefault); } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index c08d86eb213..1408163e4b5 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -15,7 +15,6 @@ limitations under the License. 
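Since the malloc.h hunk above gives Allocator::Attr a default of Allocator::kDefault, existing call sites keep compiling unchanged. A small usage sketch (the CPU place is arbitrary):

// Equivalent calls after this change; attr defaults to Allocator::kDefault.
auto a1 = paddle::memory::Alloc(paddle::platform::CPUPlace(), 1024);
auto a2 = paddle::memory::Alloc(paddle::platform::CPUPlace(), 1024,
                                paddle::memory::Allocator::kDefault);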
*/ #include "paddle/fluid/memory/memcpy.h" #include // for memcpy -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -25,7 +24,6 @@ template <> void Copy(platform::CPUPlace, void* dst, platform::CPUPlace, const void* src, size_t num) { - if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -42,7 +40,6 @@ template <> void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, const void* src, size_t num, cudaStream_t stream) { - if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); if (stream) { @@ -62,8 +59,6 @@ template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num, cudaStream_t stream) { - if (UNLIKELY(num == 0)) return; - platform::SetDeviceId(dst_place.device); if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); @@ -82,8 +77,6 @@ template <> void Copy( platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, const void* src, size_t num, cudaStream_t stream) { - if (UNLIKELY(num == 0)) return; - if (dst_place == src_place) { platform::SetDeviceId(src_place.device); if (stream) { @@ -110,7 +103,6 @@ template <> void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPinnedPlace src_place, const void* src, size_t num) { - if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -118,7 +110,6 @@ template <> void Copy( platform::CUDAPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num) { - if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -126,7 +117,6 @@ template <> void Copy( platform::CUDAPinnedPlace dst_place, void* dst, platform::CUDAPinnedPlace src_place, const void* src, size_t num) { - if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -135,7 +125,6 @@ void Copy( platform::CUDAPinnedPlace dst_place, void* dst, platform::CUDAPlace src_place, const void* src, size_t num, cudaStream_t stream) { - if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); @@ -151,8 +140,6 @@ void Copy( platform::CUDAPlace dst_place, void* dst, platform::CUDAPinnedPlace src_place, const void* src, size_t num, cudaStream_t stream) { - if (UNLIKELY(num == 0)) return; - platform::SetDeviceId(dst_place.device); if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec index a2355d2deee..403be1fc2c9 100644 --- a/paddle/fluid/op_use_default_grad_op_maker.spec +++ b/paddle/fluid/op_use_default_grad_op_maker.spec @@ -29,6 +29,7 @@ prelu quantize rank_loss reduce_max +reduce_mean reduce_min reduce_prod reduce_sum diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 3356c1e669d..6e8d6f459c5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -8,6 +8,7 @@ file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists. 
add_subdirectory(math) add_subdirectory(controlflow) +add_subdirectory(csp) add_subdirectory(detection) add_subdirectory(elementwise) add_subdirectory(fused) @@ -33,7 +34,7 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -if (ANAKIN_SUBGRAPH) +if (ANAKIN_FOUND) add_subdirectory(anakin) endif() @@ -47,8 +48,7 @@ if (WITH_DISTRIBUTE) SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) endif() -register_operators(EXCLUDES py_func_op warpctc_op dgc_op conv_fusion_op - sync_batch_norm_op deformable_conv_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) +register_operators(EXCLUDES py_func_op warpctc_op dgc_op conv_fusion_op sync_batch_norm_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) if (WITH_GPU) # warpctc_op needs cudnn 7 above @@ -66,8 +66,6 @@ if (WITH_GPU) op_library(sync_batch_norm_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") endif() - op_library(deformable_conv_op) - file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(deformable_conv);\n") else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 66453027596..f93474a122f 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -597,31 +597,40 @@ REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc); REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc); REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); -template class ActivationOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { - if (HasOutputs("DX") && ctx->HasOutput("DX")) { - ctx->ShareDim("X", "DX"); - ctx->ShareLoD("X", "DX"); - } - if (HasOutputs("DDOut") && ctx->HasOutput("DDOut")) { - ctx->ShareDim("X", "DDOut"); - ctx->ShareLoD("X", "DDOut"); - } + if (ctx->HasOutput("DOut")) { + ctx->ShareDim("Out", "DOut"); + ctx->ShareLoD("Out", "DOut"); + } + if (ctx->HasOutput("DDOut")) { + ctx->ShareDim("Out", "DDOut"); + ctx->ShareLoD("Out", "DDOut"); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, "Out"); + } +}; + +class LeakyReluDoubleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + if (ctx->HasOutput("DX")) { + ctx->ShareDim("X", "DX"); + ctx->ShareLoD("X", "DX"); } - if (static_cast(kDepValue) & static_cast(kDepOut)) { - if (HasOutputs("DOut") && ctx->HasOutput("DOut")) { - ctx->ShareDim("Out", "DOut"); - ctx->ShareLoD("Out", "DOut"); - } - if (HasOutputs("DDOut") && ctx->HasOutput("DDOut")) { - ctx->ShareDim("Out", "DDOut"); - ctx->ShareLoD("Out", "DDOut"); - } + if (ctx->HasOutput("DDOut")) { + ctx->ShareDim("X", "DDOut"); + ctx->ShareLoD("X", "DDOut"); } } @@ -635,6 +644,7 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { // // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 +// dy = 0 // class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker { public: @@ -649,7 +659,9 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker { // input2: ddx op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); 
op->SetAttrMap(Attrs()); - // output: ddy + // output1: dout + op->SetOutput("DOut", InputGrad("Out")); + // output2: ddy op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); return std::unique_ptr<::paddle::framework::OpDesc>(op); } @@ -672,53 +684,7 @@ class LeakyReluDoubleGradMaker op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); op->SetAttrMap(Attrs()); // Out@GRAD@GRAD: ddy - op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); - return std::unique_ptr<::paddle::framework::OpDesc>(op); - } -}; - -// sqrt Grad: dx = 0.5 * dy / y -// sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx -class SqrtDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker { - public: - using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { - auto* op = new ::paddle::framework::OpDesc(); - op->SetType("sqrt_grad_grad"); - op->SetInput("Out", Input("Out")); - op->SetInput("DX", Output(framework::GradVarName("X"))); - op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); - op->SetAttrMap(Attrs()); - op->SetOutput("DOut", InputGrad("Out")); - op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); - return std::unique_ptr<::paddle::framework::OpDesc>(op); - } -}; - -// square Grad: dx=2x*dy -// square GradGrad: ddy=2x*ddx, dx=2dy*ddx -class SquareDoubleGradMaker - : public ::paddle::framework::SingleGradOpDescMaker { - public: - using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { - auto* op = new ::paddle::framework::OpDesc(); - op->SetType("square_grad_grad"); - op->SetInput("X", Input("X")); - // Out@GRAD: dy - op->SetInput("DOut", Input(framework::GradVarName("Out"))); - // X@GRAD@GRAD: ddx - op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); - - op->SetAttrMap(Attrs()); - - // X@GRAD: dx op->SetOutput("DX", InputGrad("X")); - // Out@GRAD@GRAD: ddy op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); return std::unique_ptr<::paddle::framework::OpDesc>(op); } @@ -761,7 +727,6 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); -/* ========================== relu register ============================= */ REGISTER_OPERATOR( relu, ops::ActivationOp, ops::ReluOpMaker, ops::ActivationOpInferVarType, ops::ActivationGradOpDescMaker::FwdDeps()>, @@ -769,9 +734,7 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(relu_grad, ops::ActivationOpGrad, paddle::framework::SingleOpInplaceInToOut, ops::ReluDoubleGradMaker); -REGISTER_OPERATOR( - relu_grad_grad, - ops::ActivationOpDoubleGrad::FwdDeps()>); +REGISTER_OPERATOR(relu_grad_grad, ops::ActivationOpDoubleGrad); REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluFunctor, ReluGradFunctor); @@ -783,9 +746,7 @@ REGISTER_OP_CPU_KERNEL( ops::ReluGradGradFunctor>, ops::ActivationDoubleGradKernel>); -/* ========================================================================== */ -/* ======================== leaky relu register ============================ */ REGISTER_OPERATOR( leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker, ops::ActivationOpInferVarType, @@ -794,10 +755,7 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(leaky_relu_grad, ops::ActivationOpGrad, paddle::framework::SingleOpInplaceInToOut, ops::LeakyReluDoubleGradMaker); -REGISTER_OPERATOR( - leaky_relu_grad_grad, -
ops::ActivationOpDoubleGrad::FwdDeps()>); - +REGISTER_OPERATOR(leaky_relu_grad_grad, ops::LeakyReluDoubleGrad); REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, LeakyReluGradFunctor); REGISTER_OP_CPU_KERNEL( @@ -808,51 +766,3 @@ REGISTER_OP_CPU_KERNEL( ops::LeakyReluGradGradFunctor>, ops::ActivationDoubleGradKernel< plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor>); -/* ========================================================================== */ - -/* =========================== sqrt register ============================= */ -REGISTER_OPERATOR( - sqrt, ops::ActivationOp, ops::SqrtOpMaker, ops::ActivationOpInferVarType, - ops::ActivationGradOpDescMaker::FwdDeps()>, - paddle::framework::SingleOpInplaceInToOut); -REGISTER_OPERATOR(sqrt_grad, ops::ActivationOpGrad, - paddle::framework::SingleOpInplaceInToOut, - ops::SqrtDoubleGradMaker); -REGISTER_OPERATOR( - sqrt_grad_grad, - ops::ActivationOpDoubleGrad::FwdDeps()>); -REGISTER_ACTIVATION_CPU_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); -REGISTER_OP_CPU_KERNEL( - sqrt_grad_grad, ops::SqrtDoubleGradKernel>, - ops::SqrtDoubleGradKernel>, - ops::SqrtDoubleGradKernel>); -/* ========================================================================== */ - -/* ========================== square register ============================ */ -REGISTER_OPERATOR( - square, ops::ActivationOp, ops::SquareOpMaker, - ops::ActivationOpInferVarType, - ops::ActivationGradOpDescMaker::FwdDeps()>, - paddle::framework::SingleOpInplaceInToOut); -REGISTER_OPERATOR(square_grad, ops::ActivationOpGrad, - paddle::framework::SingleOpInplaceInToOut, - ops::SquareDoubleGradMaker); -REGISTER_OPERATOR( - square_grad_grad, - ops::ActivationOpDoubleGrad::FwdDeps()>); - -REGISTER_ACTIVATION_CPU_KERNEL(square, Square, SquareFunctor, - SquareGradFunctor); - -REGISTER_OP_CPU_KERNEL( - square_grad_grad, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>); -/* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 25514186de9..377e5a4af75 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -33,7 +33,6 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); -/* ======================== leaky relu register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, LeakyReluGradFunctor); @@ -45,9 +44,7 @@ REGISTER_OP_CUDA_KERNEL( ops::LeakyReluGradGradFunctor>, ops::ActivationDoubleGradKernel< plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor>); -/* ========================================================================== */ -/* =========================== relu register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluFunctor, ReluGradFunctor); REGISTER_OP_CUDA_KERNEL( @@ -58,31 +55,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ReluGradGradFunctor>, ops::ActivationDoubleGradKernel>); -/* ========================================================================== */ - -/* =========================== sqrt register ============================= */ -REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - sqrt_grad_grad, - ops::SqrtDoubleGradKernel>, - ops::SqrtDoubleGradKernel>, - ops::SqrtDoubleGradKernel>); -/* ========================================================================== */ - 
-/* =========================== square register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(square, Square, SquareFunctor, - SquareGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - square_grad_grad, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>, - ops::SquareDoubleGradKernel>); -/* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index b516fc8a418..5848d9dad5e 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -1,5 +1,4 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -1321,6 +1320,10 @@ struct ReluGradGradFunctor : public BaseActivationFunctor { auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); } + if (dOut) { + auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); + dout.device(*d) = dout.constant(static_cast(0)); + } } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; @@ -1347,171 +1350,14 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { (x < static_cast(0)).template cast().eval()) .template cast(); } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct SqrtGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, framework::Tensor* ddOut, - framework::Tensor* dOut, const framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); - auto out = framework::EigenVector::Flatten(detail::Ref(Out)); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); - ddout.device(*d) = ddx * static_cast(0.5) / out; - } - if (dOut) { - auto dx = framework::EigenVector::Flatten(detail::Ref(dX)); - auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); - dout.device(*d) = dx * ddx * static_cast(-1) / out; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct SquareGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* ddX, framework::Tensor* ddOut, - const framework::Tensor* dOut, framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); - auto x = framework::EigenVector::Flatten(detail::Ref(X)); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); - ddout.device(*d) = ddx * static_cast(2) * x; - } if (dX) { auto dx = framework::EigenVector::Flatten(detail::Ref(dX)); - auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); - dx.device(*d) = ddx * static_cast(2) * dout; + dx.device(*d) = dx.constant(static_cast(0)); } } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; -// TODO(dengkaipeng): double gradient calculation for Square/Sqrt need -// DOut(dy) as input(not output), tensor extraction is different from -// others. Impliment extraction kernel seperately here. 
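A brief justification for the zero-fills added to ReluGradGradFunctor and LeakyReluGradGradFunctor above, in the notation of the surrounding comments: both activations have piecewise-constant first derivatives, so differentiating the first backward pass through them again vanishes almost everywhere:

  ReluGrad:          dx = dy * 1[y > 0]
  ReluGradGrad:      ddy = ddx * 1[y > 0],             dy = 0
  LeakyReluGrad:     dx = dy * (x >= 0 ? 1 : alpha)
  LeakyReluGradGrad: ddy = ddx * (x >= 0 ? 1 : alpha), dx = 0

That is why the two added branches fill dOut (resp. dX) with an explicit constant zero instead of leaving the tensors uninitialized.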
-inline void ExtractDoubleGradTensorWithInputDOut( - const framework::ExecutionContext& ctx, const framework::Tensor** X, - const framework::Tensor** ddX, framework::Tensor** dX, - const framework::Tensor** dOut, framework::Tensor** ddOut) { - // extract ddX(output), ddOut(input) - auto ddx_var = ctx.InputVar("DDX"); - auto ddo_var = ctx.OutputVar("DDOut"); - PADDLE_ENFORCE(ddx_var != nullptr, - "Cannot get input Variable Out, variable name = %s", - ctx.op().Input("DDX")); - *ddX = ctx.Input("DDX"); - if (ddo_var) { - *ddOut = ctx.Output("DDOut"); - } - PADDLE_ENFORCE(*ddX != nullptr, - "Cannot get output tensor DDX, variable name = %s", - ctx.op().Output("DDX")); - - // extract x(input), dx(output) - auto x_var = ctx.InputVar("X"); - PADDLE_ENFORCE(x_var != nullptr, - "Cannot get input Variable Out, variable name = %s", - ctx.op().Input("X")); - auto dx_var = ctx.OutputVar("DX"); - *X = ctx.Input("X"); - if (dx_var) { - *dX = ctx.Output("DX"); - } - - // extract dOut(input) - auto dout_var = ctx.InputVar("DOut"); - if (dout_var) { - *dOut = ctx.Input("DOut"); - } -} - -template -class SquareDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *X, *ddX, *dOut; - X = ddX = dOut = nullptr; - framework::Tensor *dX, *ddOut; - dX = ddOut = nullptr; - - ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut); - - if (dX) dX->mutable_data(X->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - functor(place, X, ddX, ddOut, dOut, dX); - } -}; - -template -class SqrtDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *Out, *dX, *ddX; - Out = dX = ddX = nullptr; - framework::Tensor *ddOut, *dOut; - ddOut = dOut = nullptr; - - // extract ddx(input), ddout(output) - auto ddx_var = ctx.InputVar("DDX"); - auto ddo_var = ctx.OutputVar("DDOut"); - PADDLE_ENFORCE(ddx_var != nullptr, - "Cannot get input Variable DDX, variable name = %s", - ctx.op().Input("DDX")); - ddX = ctx.Input("DDX"); - if (ddo_var) { - ddOut = ctx.Output("DDOut"); - } - PADDLE_ENFORCE(ddX != nullptr, - "Cannot get input Variable DDX, variable name = %s", - ctx.op().Input("DDX")); - - // extract out(input), dout(output) - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE(out_var != nullptr, - "Cannot get input Variable Out, variable name = %s", - ctx.op().Input("Out")); - auto dout_var = ctx.OutputVar("DOut"); - Out = ctx.Input("Out"); - if (dout_var) { - dOut = ctx.Output("DOut"); - } - - // extract dx(input) - auto dx_var = ctx.InputVar("DX"); - PADDLE_ENFORCE(dx_var != nullptr, - "Cannot get input Variable DX, variable name = %s", - ctx.op().Input("DX")); - if (dx_var) { - dX = ctx.Input("DX"); - } - - if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - functor(place, Out, ddX, ddOut, dOut, dX); - } -}; - } // namespace operators } // namespace paddle @@ -1523,6 +1369,7 @@ class SqrtDoubleGradKernel __macro(tanh, Tanh, TanhFunctor, TanhGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ + 
__macro(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor); \ __macro(abs, Abs, AbsFunctor, AbsGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ @@ -1534,6 +1381,7 @@ class SqrtDoubleGradKernel __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log, Log, LogFunctor, LogGradFunctor); \ + __macro(square, Square, SquareFunctor, SquareGradFunctor); \ __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ __macro(pow, Pow, PowFunctor, PowGradFunctor); \ diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc index 2580c5a523e..3882bbedaa0 100644 --- a/paddle/fluid/operators/add_position_encoding_op.cc +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -33,13 +33,6 @@ class AddPositionEncodingOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", x_dims); ctx->ShareLoD("X", /*->*/ "Out"); } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - platform::CPUPlace()); - } }; class AddPositionEncodingOpGrad : public framework::OperatorWithKernel { @@ -52,14 +45,6 @@ class AddPositionEncodingOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), out_dims); } } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - ctx.Input(framework::GradVarName("Out"))->type(), - platform::CPUPlace()); - } }; class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/alloc_continuous_space_op.cc b/paddle/fluid/operators/alloc_continuous_space_op.cc index 85da8a827f7..d4bdecff62c 100644 --- a/paddle/fluid/operators/alloc_continuous_space_op.cc +++ b/paddle/fluid/operators/alloc_continuous_space_op.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -97,8 +96,6 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { // Make the outputs point to the continuous space. 
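// NOTE(review): a sketch of the arithmetic in the loop below, which is easy
// to misread in diff form. Each output i becomes a view of one fused buffer
// (the buffer's name lies outside this hunk and is assumed here):
//   out_tensors[i] <- fused.Slice(offset, offset + numel_i), then .Resize(dim)
//   offset += Alignment(numel_i * size_of_dtype, place) / size_of_dtype
// i.e. offsets advance by the *aligned* length, so every slice starts on a
// platform-aligned boundary even when raw lengths are not multiples of it.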
offset = 0; - std::stringstream ss; - ss << "alloc_space_for_vars: "; for (size_t i = 0; i < out_tensors.size(); ++i) { size_t len = static_cast(out_tensors[i]->numel()); auto dim = out_tensors[i]->dims(); @@ -108,10 +105,10 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { .Resize(dim); len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype; offset += len; - ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" - << " address: " << out_tensors[i]->data() << ", "; + VLOG(10) << "alloc_space_for_vars: output(" << out_var_names[i] + << ") ,dim:(" << dim << ")" + << " Address: " << out_tensors[i]->data(); } - VLOG(10) << ss.str(); } private: @@ -136,9 +133,6 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size()); *numel = 0; size_t size_of_dtype = 0; - - std::stringstream ss; - ss << "alloc_space_for_vars: "; for (size_t i = 0; i < var_names.size(); ++i) { PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.", var_names[i]); @@ -154,13 +148,11 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { auto size = lod_tensors[i]->numel(); PADDLE_ENFORCE_GT(size, 0); - ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() - << "), "; + VLOG(10) << "alloc_space_for_vars: input(" << var_names[i] << ") ,dim:(" + << lod_tensors[i]->dims() << ")"; *numel += Alignment(static_cast(size) * size_of_dtype, place) / size_of_dtype; } - - VLOG(10) << ss.str(); } }; diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h index b4aaa228693..11c394c76cd 100644 --- a/paddle/fluid/operators/anakin/anakin_engine_op.h +++ b/paddle/fluid/operators/anakin/anakin_engine_op.h @@ -119,15 +119,11 @@ class AnakinEngineOp : public framework::OperatorBase { engine->Execute(inputs, outputs, stream); #endif } else { -#ifdef ANAKIN_X86_PLACE auto *engine = inference::Singleton>::Global() .Get(engine_key_); engine->Execute(inputs, outputs); -#else - LOG(FATAL) << "Unknown Platform for AnakinEngine!"; -#endif } } }; diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index f6295337d1f..d583909a666 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -454,7 +454,6 @@ class BatchNormGradKernel const auto *running_mean = ctx.Input("Mean"); const auto *running_variance = ctx.Input("Variance"); mean_data = running_mean->data(); - inv_var_tensor.Resize({C}); T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); ConstEigenVectorArrayMap var_arr(running_variance->data(), C); diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index ac487223d09..fec091255f6 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -257,8 +257,7 @@ framework::VarDesc *OpTester::Var(const std::string &name) { template void OpTester::SetupTensor(framework::LoDTensor *tensor, const std::vector &shape, T lower, T upper, - const std::string &initializer, - const std::string &filename) { + const std::string &initializer) { static unsigned int seed = 100; std::mt19937 rng(seed++); std::uniform_real_distribution uniform_dist(0, 1); @@ -281,20 +280,12 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor, } } else if (initializer == "natural") { for (int i = 0; i < 
cpu_tensor.numel(); ++i) { - cpu_ptr[i] = static_cast(lower + i); + cpu_ptr[i] = lower + i; } } else if (initializer == "zeros") { for (int i = 0; i < cpu_tensor.numel(); ++i) { - cpu_ptr[i] = static_cast(0); + cpu_ptr[i] = 0; } - } else if (initializer == "file") { - std::ifstream is(filename); - for (size_t i = 0; i < cpu_tensor.numel(); ++i) { - T value; - is >> value; - cpu_ptr[i] = static_cast(value); - } - is.close(); } else { PADDLE_THROW("Unsupported initializer %s.", initializer.c_str()); } @@ -334,19 +325,15 @@ void OpTester::CreateVariables(framework::Scope *scope) { auto *tensor = var->GetMutable(); const auto &data_type = var_desc->GetDataType(); if (data_type == framework::proto::VarType::INT32) { - SetupTensor(tensor, shape, 0, 1, item.second.initializer, - item.second.filename); + SetupTensor(tensor, shape, 0, 1, item.second.initializer); } else if (data_type == framework::proto::VarType::INT64) { - SetupTensor(tensor, shape, 0, 1, item.second.initializer, - item.second.filename); + SetupTensor(tensor, shape, 0, 1, item.second.initializer); } else if (data_type == framework::proto::VarType::FP32) { SetupTensor(tensor, shape, static_cast(0.0), - static_cast(1.0), item.second.initializer, - item.second.filename); + static_cast(1.0), item.second.initializer); } else if (data_type == framework::proto::VarType::FP64) { SetupTensor(tensor, shape, static_cast(0.0), - static_cast(1.0), item.second.initializer, - item.second.filename); + static_cast(1.0), item.second.initializer); } else { PADDLE_THROW("Unsupported dtype %d.", data_type); } diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h index a6d21573a05..328389293c4 100644 --- a/paddle/fluid/operators/benchmark/op_tester.h +++ b/paddle/fluid/operators/benchmark/op_tester.h @@ -55,7 +55,7 @@ class OpTester { template void SetupTensor(framework::LoDTensor *input, const std::vector &shape, T lower, T upper, - const std::string &initializer, const std::string &filename); + const std::string &initializer); void RunImpl(); diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc index 818e5f64edc..b4878ab0424 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.cc +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -56,9 +56,6 @@ OpInputConfig::OpInputConfig(std::istream& is) { ParseDims(is); } else if (sep == "lod" || sep == "lod:") { ParseLoD(is); - } else if (sep == "filename") { - is >> filename; - EraseEndSep(&filename); } } } @@ -89,7 +86,7 @@ void OpInputConfig::ParseInitializer(std::istream& is) { EraseEndSep(&initializer_str); const std::vector supported_initializers = {"random", "natural", - "zeros", "file"}; + "zeros"}; if (!Has(supported_initializers, initializer_str)) { PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str()); } diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h index 3956bc0a8b1..5803f82ac28 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.h +++ b/paddle/fluid/operators/benchmark/op_tester_config.h @@ -35,8 +35,7 @@ struct OpInputConfig { std::string name; std::string dtype{"fp32"}; // int32/int, int64/long, fp32/float, fp64/double - std::string initializer{"random"}; // random, natural, zeros, file - std::string filename{""}; + std::string initializer{"random"}; // random, natural std::vector dims; std::vector> lod; }; diff --git a/paddle/fluid/operators/concat_op.cc 
b/paddle/fluid/operators/concat_op.cc index 7f249924f5b..029b05bb662 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using framework::Tensor; class ConcatOp : public framework::OperatorWithKernel { public: @@ -36,10 +36,7 @@ class ConcatOp : public framework::OperatorWithKernel { "Output(Out) of ConcatOp should not be null."); auto ins = ctx->GetInputsDim("X"); - size_t axis = - ComputeAxis(static_cast(ctx->Attrs().Get("axis")), - static_cast(ins[0].size())); - + size_t axis = static_cast(ctx->Attrs().Get("axis")); const size_t n = ins.size(); PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0."); @@ -83,19 +80,8 @@ class ConcatOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto inputs = ctx.MultiInput("X"); - auto input_data_type = framework::proto::VarType::Type(0); - bool flag = 0; - for (auto *input : inputs) { - if (input->IsInitialized() && input->numel() > 0) { - input_data_type = input->type(); - flag = 1; - break; - } - } - if (flag == 0) { - PADDLE_THROW("All Inputs of Concat OP are Empty!"); - } + auto input_data_type = + framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]); #ifdef PADDLE_WITH_MKLDNN if (platform::CanMKLDNNBeUsed(ctx)) { @@ -118,17 +104,8 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Indicates if MKL-DNN kernel will be used") .SetDefault(false); AddAttr("axis", - "The axis along which the input tensors will be concatenated." - "The axis could also be negative numbers. Negative axis is " - "interpreted as counting from the end of the rank." - "i.e., axis + rank(X) th dimension.") + "The axis along which the input tensors will be concatenated.") .SetDefault(0); - AddAttr("use_quantizer", - "(bool, default false) " - "Set to true for operators that should be quantized and use " - "int8 kernel. " - "Only used on CPU.") - .SetDefault(false); AddComment(R"DOC( Concat Operator. diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index 4a371de3235..0414550dd18 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -23,22 +23,13 @@ limitations under the License. */ namespace paddle { namespace operators { -static inline int64_t ComputeAxis(int64_t axis, int64_t rank) { - if (axis < 0) { - axis = axis + rank; - } - return axis > 0 ? 
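The ComputeAxis helper being removed from concat implements the usual negative-axis convention. The rule in isolation (NormalizeAxis is an illustrative name):

#include <cstdint>

// axis may be negative; -1 addresses the last dimension, so a negative
// value is shifted by the tensor rank: NormalizeAxis(-1, 4) == 3,
// NormalizeAxis(2, 4) == 2.
static int64_t NormalizeAxis(int64_t axis, int64_t rank) {
  return axis < 0 ? axis + rank : axis;
}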
axis : 0; -} - template class ConcatKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto ins = ctx.MultiInput("X"); framework::Tensor* out = ctx.Output("Out"); - PADDLE_ENFORCE(ins[0], "The input should not be null."); - auto axis = ComputeAxis(static_cast(ctx.Attr("axis")), - static_cast(ins[0]->dims().size())); + int64_t axis = static_cast(ctx.Attr("axis")); auto place = ctx.GetPlace(); out->mutable_data(place); @@ -92,9 +83,8 @@ class ConcatGradKernel : public framework::OpKernel { } } } - PADDLE_ENFORCE(ins[0], "The input should not be null."); - auto axis = ComputeAxis(static_cast(ctx.Attr("axis")), - static_cast(ins[0]->dims().size())); + + int64_t axis = static_cast(ctx.Attr("axis")); // get output tensor that the name is not kEmptyVarName std::vector outputs; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index 054deeaa710..158d6ced274 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -136,7 +136,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { } // ------------------- cudnn conv algorithm --------------------- - cudnnConvolutionFwdAlgo_t algo{}; + cudnnConvolutionFwdAlgo_t algo; bool half_float = false; #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) @@ -165,43 +165,11 @@ class CUDNNConvOpKernel : public framework::OpKernel { // TODO(dangqingqing) simplify the following code by SearchAlgorithm in // conv_cudnn_helper.h - bool has_got_workspace_size = false; if ((!exhaustive_search) && (!half_float)) { -#if CUDNN_VERSION >= 7001 - using perf_t = cudnnConvolutionFwdAlgoPerf_t; - int perf_count; - int best_algo_idx = 0; - std::unique_ptr perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]); - CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_output_desc, kNUM_CUDNN_FWD_ALGS, &perf_count, - perf_results.get())); - algo = (perf_results.get())[best_algo_idx].algo; - - // get workspace size able to allocate - CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( - handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_output_desc, algo, &workspace_size_in_bytes)); - - // NOTE(zjl): cudnnGetConvolutionForwardAlgorithm_v7 cannot limit - // workspace size. If the workspace size found by v7 exceeds the limit, - // we should fallback to non-v7 method to find another algorithm. 
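The hunk above removes the cuDNN 7 strategy of querying the ranked algorithm list and falling back when the best algorithm's workspace exceeds the limit. A compressed sketch of that pattern, assuming the descriptors are already configured and eliding status checks; it targets cuDNN 7, where the legacy cudnnGetConvolutionForwardAlgorithm still exists:

#include <cudnn.h>

cudnnConvolutionFwdAlgo_t ChooseFwdAlgo(
    cudnnHandle_t handle, cudnnTensorDescriptor_t x,
    cudnnFilterDescriptor_t w, cudnnConvolutionDescriptor_t conv,
    cudnnTensorDescriptor_t y, size_t workspace_limit) {
  // v7 query: results come back ranked, so requesting one gives the best.
  int returned = 0;
  cudnnConvolutionFwdAlgoPerf_t perf;
  cudnnGetConvolutionForwardAlgorithm_v7(handle, x, w, conv, y,
                                         /*requested=*/1, &returned, &perf);
  size_t needed = 0;
  cudnnGetConvolutionForwardWorkspaceSize(handle, x, w, conv, y, perf.algo,
                                          &needed);
  if (needed <= workspace_limit) return perf.algo;
  // The v7 query cannot cap workspace, so fall back to the legacy call
  // that accepts an explicit memory limit (removed in cuDNN 8).
  cudnnConvolutionFwdAlgo_t algo;
  cudnnGetConvolutionForwardAlgorithm(
      handle, x, w, conv, y, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
      workspace_limit, &algo);
  return algo;
}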
- if (workspace_size_in_bytes > workspace_size_limit) { - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << workspace_size_in_bytes << ") exceeds the limit(" - << workspace_size_limit << ")"; -#endif - CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( - handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &algo)); -#if CUDNN_VERSION >= 7001 - } else { - has_got_workspace_size = true; - } -#endif - + cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); VLOG(3) << "cuDNN forward algo " << algo; } else if (exhaustive_search && (!half_float)) { AlgorithmsCache& algo_cache = @@ -238,13 +206,10 @@ class CUDNNConvOpKernel : public framework::OpKernel { "cuDNN exhaustive search doesn't support half float."); } - if (!has_got_workspace_size) { - // get workspace size able to allocate - CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( - handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, - cudnn_output_desc, algo, &workspace_size_in_bytes)); - } - + // get workspace size able to allocate + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, algo, &workspace_size_in_bytes)); // It is possible for float16 on Volta GPU to allocate more memory than // the limit because the algo is overrided to use tensor core. PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, @@ -361,8 +326,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { int group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = filter->numel() / groups; // ------------------- cudnn backward algorithm --------------------- - cudnnConvolutionBwdDataAlgo_t data_algo{}; - cudnnConvolutionBwdFilterAlgo_t filter_algo{}; + cudnnConvolutionBwdDataAlgo_t data_algo; + cudnnConvolutionBwdFilterAlgo_t filter_algo; size_t workspace_size_in_bytes = 0, tmp_size = 0; size_t workspace_size_limit = 0; if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { @@ -388,8 +353,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); auto handle = dev_ctx.cudnn_handle(); - - bool has_got_bwd_data_ws_size = false; if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); if (exhaustive_search) { @@ -425,14 +388,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } else if (FLAGS_cudnn_deterministic) { data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } else { -#if CUDNN_VERSION >= 7001 - using perf_t = cudnnConvolutionBwdDataAlgoPerf_t; - int perf_count; - int best_algo_idx = 0; - std::unique_ptr perf_results( - new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]); CUDNN_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, // dyDesc: Handle to the previously initialized input // differential @@ -440,64 +397,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { cudnn_output_grad_desc, cudnn_conv_desc, // dxDesc: Handle to the previously initialized output tensor // descriptor. 
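The exhaustive-search branches nearby funnel through an AlgorithmsCache so the costly benchmark runs once per input configuration. A minimal sketch of that memoization (AlgoCache is an illustrative name, not the real class):

#include <cstdint>
#include <functional>
#include <map>
#include <vector>

template <typename AlgoT>
class AlgoCache {
 public:
  // Typical use: cache.GetOrSearch(shape_key, [&] { return Benchmark(); });
  AlgoT GetOrSearch(const std::vector<int64_t>& key,
                    const std::function<AlgoT()>& search) {
    auto it = cache_.find(key);
    if (it != cache_.end()) return it->second;
    AlgoT algo = search();  // expensive: times every candidate algorithm
    cache_[key] = algo;
    return algo;
  }

 private:
  std::map<std::vector<int64_t>, AlgoT> cache_;
};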
- cudnn_input_desc, kNUM_CUDNN_BWD_DATA_ALGS, &perf_count, - perf_results.get())); - data_algo = (perf_results.get())[best_algo_idx].algo; - int stride_dim = input->dims().size() - 2; - bool blacklist = - std::any_of(strides.begin(), strides.begin() + stride_dim, - [=](int n) { return n != 1; }); - if (blacklist && (static_cast( - perf_results[best_algo_idx].algo) == - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || - static_cast( - perf_results[best_algo_idx].algo) == - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { - data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; - } - - CUDNN_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( - handle, cudnn_filter_desc, cudnn_output_grad_desc, - cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); - auto new_workspace_size = std::max(workspace_size_in_bytes, tmp_size); - - if (new_workspace_size > workspace_size_limit) { - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << new_workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; -#endif - CUDNN_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( - handle, cudnn_filter_desc, - // dyDesc: Handle to the previously initialized input - // differential - // tensor descriptor. - cudnn_output_grad_desc, cudnn_conv_desc, - // dxDesc: Handle to the previously initialized output tensor - // descriptor. - cudnn_input_desc, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &data_algo)); -#if CUDNN_VERSION >= 7001 - } else { - workspace_size_in_bytes = new_workspace_size; - has_got_bwd_data_ws_size = true; - } -#endif - } - - if (!has_got_bwd_data_ws_size) { - CUDNN_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( - handle, cudnn_filter_desc, cudnn_output_grad_desc, - cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); - workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + cudnn_input_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); } + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, cudnn_filter_desc, cudnn_output_grad_desc, + cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } - bool has_got_bwd_filter_ws_size = false; if (filter_grad) { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); if (exhaustive_search) { @@ -527,58 +437,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } else if (FLAGS_cudnn_deterministic) { filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } else { -#if CUDNN_VERSION >= 7001 - using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; - int perf_count; - int best_algo_idx = 0; - std::unique_ptr perf_results( - new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]); - CUDNN_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, - cudnn_conv_desc, cudnn_filter_desc, kNUM_CUDNN_BWD_FILTER_ALGS, - &perf_count, perf_results.get())); - filter_algo = (perf_results.get())[best_algo_idx].algo; - - CUDNN_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle, cudnn_input_desc, cudnn_output_grad_desc, - cudnn_conv_desc, cudnn_filter_desc, filter_algo, &tmp_size)); - auto new_workspace_size = std::max(workspace_size_in_bytes, tmp_size); - - if 
(new_workspace_size > workspace_size_limit) { - VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " - "the workspace size request(" - << new_workspace_size << ") exceeds the limit(" - << workspace_size_limit << ")"; -#endif - CUDNN_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - handle, cudnn_input_desc, cudnn_output_grad_desc, - cudnn_conv_desc, cudnn_filter_desc, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, &filter_algo)); -#if CUDNN_VERSION >= 7001 - } else { - workspace_size_in_bytes = new_workspace_size; - has_got_bwd_filter_ws_size = true; - } -#endif - } - - if (!has_got_bwd_filter_ws_size) { - CUDNN_ENFORCE( - platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( - handle, cudnn_input_desc, cudnn_output_grad_desc, - cudnn_conv_desc, cudnn_filter_desc, filter_algo, &tmp_size)); - workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); + cudnn_conv_desc, cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); } + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, + cudnn_filter_desc, filter_algo, &tmp_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } - PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, - "workspace_size to be allocated exceeds the limit"); - // ------------------- cudnn conv workspace --------------------- if (!cudnn_workspace_ptr) { cudnn_workspace = diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index ee37585a709..5b923f8a5eb 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -209,12 +209,6 @@ void Conv2DOpMaker::Make() { .SetDefault(false); AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr("fuse_brelu", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddAttr("fuse_brelu_threshold", - "(float, default false 6.0) Only used in mkldnn kernel") - .SetDefault(6.0f); AddAttr("fuse_residual_connection", "(bool, default false) Only used in mkldnn kernel. Used " "whenever convolution output is as an input to residual " @@ -533,16 +527,9 @@ class Conv2DDoubleGradMaker : public framework::SingleGradOpDescMaker { // ddO, dI, dW // Unlike grad op, double grad op does not use name@GRAD@GRAD // as key of ops' inputs and outputs. - auto ddx = OutputGrad(framework::GradVarName("Input")); - auto ddw = OutputGrad(framework::GradVarName("Filter")); - std::vector empty_str = {}; - - op->SetOutput( - "DDOutput", - ddx.empty() ? empty_str : InputGrad(framework::GradVarName("Output"))); - op->SetOutput("DFilter", ddx.empty() ? empty_str : InputGrad("Filter")); - op->SetOutput("DInput", ddw.empty() ? 
empty_str : InputGrad("Input")); - + op->SetOutput("DDOutput", InputGrad(framework::GradVarName("Output"))); + op->SetOutput("DFilter", InputGrad("Filter")); + op->SetOutput("DInput", InputGrad("Input")); op->SetAttrMap(Attrs()); return std::unique_ptr(op); @@ -554,13 +541,13 @@ void ConvOpDoubleGrad::InferShape(framework::InferShapeContext* ctx) const { auto w_dims = ctx->GetInputDim("Filter"); auto do_dims = ctx->GetInputDim("DOutput"); - if (ctx->HasOutput("DDOutput") && ctx->HasInput("DDInput")) { + if (ctx->HasOutput("DDOutput")) { ctx->SetOutputDim("DDOutput", do_dims); } - if (ctx->HasOutput("DFilter") && ctx->HasInput("DDInput")) { + if (ctx->HasOutput("DFilter")) { ctx->SetOutputDim("DFilter", w_dims); } - if (ctx->HasOutput("DInput") && ctx->HasInput("DDFilter")) { + if (ctx->HasOutput("DInput")) { ctx->SetOutputDim("DInput", x_dims); } } diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 309ba46cfa3..89bacfc33ed 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -154,9 +154,9 @@ struct HardLabelCrossEntropyForwardFunctor { HOSTDEVICE void operator()(int64_t idx) const { auto label = label_[idx]; + PADDLE_ASSERT_MSG(label >= 0 && label < feature_size_, + "The label is out of the range.", label); if (label != ignore_index_) { - PADDLE_ASSERT_MSG(label >= 0 && label < feature_size_, - "The label is out of the range.", label); auto match_x = x_[idx * feature_size_ + label]; y_[idx] = -math::TolerableValue()(real_log(match_x)); match_x_[idx] = match_x; diff --git a/paddle/fluid/operators/cvm_op.h b/paddle/fluid/operators/cvm_op.h index c6140483ff5..77cb7e446b7 100644 --- a/paddle/fluid/operators/cvm_op.h +++ b/paddle/fluid/operators/cvm_op.h @@ -73,8 +73,8 @@ class CVMOpKernel : public framework::OpKernel { } } else { auto lod = x->lod()[0]; - for (size_t i = 0; i < lod.size() - 1; ++i) { - for (size_t j = 0; j < lod[i + 1] - lod[i]; ++j) { + for (int i = 0; i < lod.size() - 1; ++i) { + for (int j = 0; j < lod[i + 1] - lod[i]; ++j) { CvmComputeKernel(use_cvm, item_size, &x_data, &y_data); } } @@ -113,7 +113,7 @@ class CVMGradOpKernel : public framework::OpKernel { auto lod = dx->lod()[0]; int seq_num = static_cast(lod.size()) - 1; for (int i = 0; i < seq_num; ++i) { - for (size_t j = 0; j < lod[i + 1] - lod[i]; ++j) { + for (int j = 0; j < lod[i + 1] - lod[i]; ++j) { CvmGradComputeKernel(use_cvm, item_size, *cvm_data, &dout_data, &dx_data); } diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index f1c504d6e4b..94a2016aa53 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -35,17 +35,13 @@ detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) -detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu) -detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) if(WITH_GPU) detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub) - 
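For the cross-entropy functor below: the hunk moves the range assertion in front of the ignore_index test. A standalone CPU sketch of the hard-label forward, kept in the ignore-first order so a sentinel label outside [0, C) is tolerated (function and parameter names are illustrative):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <vector>

// probs is row-major [batch, num_classes]; labels entries equal to
// ignore_index contribute zero loss and skip validation.
std::vector<float> HardLabelCrossEntropy(const std::vector<float>& probs,
                                         const std::vector<int64_t>& labels,
                                         int64_t num_classes,
                                         int64_t ignore_index) {
  std::vector<float> loss(labels.size(), 0.0f);
  for (size_t i = 0; i < labels.size(); ++i) {
    int64_t label = labels[i];
    if (label == ignore_index) continue;
    assert(label >= 0 && label < num_classes && "label out of range");
    loss[i] = -std::log(probs[i * num_classes + label]);
  }
  return loss;
}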
detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub) else() detection_library(generate_proposals_op SRCS generate_proposals_op.cc) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) - detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc) endif() detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index afc39c1db9f..d4cf9a326cc 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -22,10 +22,10 @@ namespace paddle { namespace operators { struct RangeInitFunctor { - int start; - int delta; - int* out; - HOSTDEVICE void operator()(size_t i) { out[i] = start + i * delta; } + int start_; + int delta_; + int* out_; + HOSTDEVICE void operator()(size_t i) { out_[i] = start_ + i * delta_; } }; template diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index f34866360f9..598510870a6 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -140,7 +140,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { target_lvls_data, keys_out, idx_in, idx_out, roi_num); // Allocate temporary storage - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes, + memory::Allocator::kScratchpad); // Run sorting operation // sort target level to get corresponding index diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index 38eafa5fe8f..46727c29de1 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -323,10 +323,6 @@ class GenerateMaskLabelsKernel : public framework::OpKernel { auto gt_segms_lod = gt_segms->lod(); for (int i = 0; i < n; ++i) { - if (rois_lod[i] == rois_lod[i + 1]) { - lod0.emplace_back(num_mask); - continue; - } Tensor im_info_slice = im_info->Slice(i, i + 1); Tensor gt_classes_slice = gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 451e0ca8550..b9b8a5a53ae 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -109,18 +109,17 @@ std::vector> SampleFgBgGt( const platform::CPUDeviceContext& context, Tensor* iou, const Tensor& is_crowd, const int batch_size_per_im, const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, - const float bg_thresh_lo, std::minstd_rand engine, const bool use_random, - const bool is_cascade_rcnn, const Tensor& rpn_rois) { + const float bg_thresh_lo, std::minstd_rand engine, const bool use_random) { std::vector fg_inds; std::vector bg_inds; - std::vector mapped_gt_inds; + std::vector gt_inds; int64_t gt_num = is_crowd.numel(); const int* crowd_data = is_crowd.data(); T* proposal_to_gt_overlaps = iou->data(); int64_t row = iou->dims()[0]; int64_t col = iou->dims()[1]; float epsilon = 0.00001; - const T* rpn_rois_dt = 
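The CUB sorts in these detection kernels use the library's two-phase protocol: a first call with a null temp pointer only reports the scratch size, the caller allocates, and a second identical call does the work. Sketch, with d_alloc standing in for the framework allocator and all data pointers assumed to be device memory:

#include <cub/cub.cuh>

void SortDescendingSketch(const float* keys_in, float* keys_out,
                          const int* vals_in, int* vals_out, int num,
                          void* (*d_alloc)(size_t), cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Pass 1: null temp storage means "just tell me how many bytes".
  cub::DeviceRadixSort::SortPairsDescending(
      nullptr, temp_bytes, keys_in, keys_out, vals_in, vals_out, num,
      0, sizeof(float) * 8, stream);
  void* temp = d_alloc(temp_bytes);
  // Pass 2: same arguments, now with real scratch space.
  cub::DeviceRadixSort::SortPairsDescending(
      temp, temp_bytes, keys_in, keys_out, vals_in, vals_out, num,
      0, sizeof(float) * 8, stream);
}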
rpn_rois.data(); + // Follow the Faster RCNN's implementation for (int64_t i = 0; i < row; ++i) { const T* v = proposal_to_gt_overlaps + i * col; @@ -128,82 +127,64 @@ std::vector> SampleFgBgGt( if ((i < gt_num) && (crowd_data[i])) { max_overlap = -1.0; } - if (is_cascade_rcnn && - ((rpn_rois_dt[i * 4 + 2] - rpn_rois_dt[i * 4 + 0] + 1) <= 0 || - (rpn_rois_dt[i * 4 + 3] - rpn_rois_dt[i * 4 + 1] + 1) <= 0)) { - continue; - } - if (max_overlap >= fg_thresh) { - // fg mapped gt label index + if (max_overlap > fg_thresh) { for (int64_t j = 0; j < col; ++j) { T val = proposal_to_gt_overlaps[i * col + j]; auto diff = std::abs(max_overlap - val); if (diff < epsilon) { fg_inds.emplace_back(i); - mapped_gt_inds.emplace_back(j); + gt_inds.emplace_back(j); break; } } - } else if ((max_overlap >= bg_thresh_lo) && (max_overlap < bg_thresh_hi)) { - bg_inds.emplace_back(i); } else { - continue; + if ((max_overlap >= bg_thresh_lo) && (max_overlap < bg_thresh_hi)) { + bg_inds.emplace_back(i); + } } } - std::vector> res; - if (is_cascade_rcnn) { - res.emplace_back(fg_inds); - res.emplace_back(bg_inds); - res.emplace_back(mapped_gt_inds); - } else { - // Reservoir Sampling - // sampling fg - std::uniform_real_distribution uniform(0, 1); - int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction); - int fg_rois_this_image = fg_inds.size(); - int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image); - if (use_random) { - const int64_t fg_size = static_cast(fg_inds.size()); - if (fg_size > fg_rois_per_this_image) { - for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < fg_rois_per_this_image) { - std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); - std::iter_swap(mapped_gt_inds.begin() + rng_ind, - mapped_gt_inds.begin() + i); - } + // Reservoir Sampling + std::uniform_real_distribution uniform(0, 1); + int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction); + int fg_rois_this_image = fg_inds.size(); + int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image); + if (use_random) { + const int64_t fg_size = static_cast(fg_inds.size()); + if (fg_size > fg_rois_per_this_image) { + for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) { + std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); + std::iter_swap(gt_inds.begin() + rng_ind, gt_inds.begin() + i); } } } - std::vector new_fg_inds(fg_inds.begin(), - fg_inds.begin() + fg_rois_per_this_image); - std::vector new_gt_inds( - mapped_gt_inds.begin(), - mapped_gt_inds.begin() + fg_rois_per_this_image); - // sampling bg - int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image; - int bg_rois_this_image = bg_inds.size(); - int bg_rois_per_this_image = - std::min(bg_rois_per_image, bg_rois_this_image); - if (use_random) { - const int64_t bg_size = static_cast(bg_inds.size()); - if (bg_size > bg_rois_per_this_image) { - for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { - int rng_ind = std::floor(uniform(engine) * i); - if (rng_ind < fg_rois_per_this_image) - std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); - } + } + std::vector new_fg_inds(fg_inds.begin(), + fg_inds.begin() + fg_rois_per_this_image); + std::vector new_gt_inds(gt_inds.begin(), + gt_inds.begin() + fg_rois_per_this_image); + + int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image; + int bg_rois_this_image = 
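The sampling loops here are reservoir sampling: after the first sample_num candidates, element i replaces a random earlier slot with probability sample_num/i, so every candidate has a roughly equal chance to survive. The scheme in isolation, mirroring the floor(uniform * i) draw used above:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <random>
#include <vector>

void ReservoirSample(int sample_num, std::vector<int>* inds,
                     std::minstd_rand* engine) {
  const int64_t size = static_cast<int64_t>(inds->size());
  if (size <= sample_num) return;
  std::uniform_real_distribution<float> uniform(0, 1);
  for (int64_t i = sample_num; i < size; ++i) {
    int rng_ind = static_cast<int>(std::floor(uniform(*engine) * i));
    if (rng_ind < sample_num) {
      std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
    }
  }
  inds->resize(sample_num);  // the reservoir is the first sample_num slots
}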
bg_inds.size(); + int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image); + if (use_random) { + const int64_t bg_size = static_cast(bg_inds.size()); + if (bg_size > bg_rois_per_this_image) { + for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) + std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); } } - std::vector new_bg_inds(bg_inds.begin(), - bg_inds.begin() + bg_rois_per_this_image); - // - res.emplace_back(new_fg_inds); - res.emplace_back(new_bg_inds); - res.emplace_back(new_gt_inds); } - + std::vector new_bg_inds(bg_inds.begin(), + bg_inds.begin() + bg_rois_per_this_image); + std::vector> res; + res.emplace_back(new_fg_inds); + res.emplace_back(new_bg_inds); + res.emplace_back(new_gt_inds); return res; } @@ -250,50 +231,35 @@ std::vector SampleRoisForOneImage( const Tensor& im_info, const int batch_size_per_im, const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo, const std::vector& bbox_reg_weights, const int class_nums, - std::minstd_rand engine, bool use_random, bool is_cascade_rcnn, - bool is_cls_agnostic) { - // 1.1 map to original image + std::minstd_rand engine, bool use_random) { auto im_scale = im_info.data()[2]; - Tensor rpn_rois_slice; - Tensor rpn_rois; - if (is_cascade_rcnn) { - // slice rpn_rois from gt_box_num refer to detectron - rpn_rois_slice = - rpn_rois_in.Slice(gt_boxes.dims()[0], rpn_rois_in.dims()[0]); - rpn_rois.mutable_data(rpn_rois_slice.dims(), context.GetPlace()); - const T* rpn_rois_in_dt = rpn_rois_slice.data(); - T* rpn_rois_dt = rpn_rois.data(); - for (int i = 0; i < rpn_rois.numel(); ++i) { - rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale; - } - } else { - rpn_rois.mutable_data(rpn_rois_in.dims(), context.GetPlace()); - const T* rpn_rois_in_dt = rpn_rois_in.data(); - T* rpn_rois_dt = rpn_rois.data(); - for (int i = 0; i < rpn_rois.numel(); ++i) { - rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale; - } + Tensor rpn_rois; + rpn_rois.mutable_data(rpn_rois_in.dims(), context.GetPlace()); + T* rpn_rois_dt = rpn_rois.data(); + const T* rpn_rois_in_dt = rpn_rois_in.data(); + for (int i = 0; i < rpn_rois.numel(); ++i) { + rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale; } - // 1.2 compute overlaps - int proposals_num = gt_boxes.dims()[0] + rpn_rois.dims()[0]; Tensor boxes; + int proposals_num = gt_boxes.dims()[0] + rpn_rois.dims()[0]; boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); Concat(context, gt_boxes, rpn_rois, &boxes); + + // Overlaps Tensor proposal_to_gt_overlaps; proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes.dims()[0]}, context.GetPlace()); BboxOverlaps(boxes, gt_boxes, &proposal_to_gt_overlaps); // Generate proposal index - std::vector> fg_bg_gt = - SampleFgBgGt(context, &proposal_to_gt_overlaps, is_crowd, - batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, - bg_thresh_lo, engine, use_random, is_cascade_rcnn, boxes); + std::vector> fg_bg_gt = SampleFgBgGt( + context, &proposal_to_gt_overlaps, is_crowd, batch_size_per_im, + fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random); std::vector fg_inds = fg_bg_gt[0]; std::vector bg_inds = fg_bg_gt[1]; - std::vector mapped_gt_inds = fg_bg_gt[2]; // mapped_gt_labels + std::vector gt_inds = fg_bg_gt[2]; // Gather boxes and labels Tensor sampled_boxes, sampled_labels, sampled_gts; @@ -305,8 +271,7 @@ std::vector SampleRoisForOneImage( sampled_labels.mutable_data({boxes_num}, 
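SampleRoisForOneImage first maps proposals back to the original image by dividing by im_scale (the third entry of im_info). In isolation:

#include <vector>

// Proposals arrive in the resized image's coordinates; dividing every
// coordinate by im_scale expresses them on the original image.
void MapRoisToOrigScale(std::vector<float>* rois, float im_scale) {
  for (float& v : *rois) v /= im_scale;
}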
context.GetPlace()); sampled_gts.mutable_data({fg_num, kBoxDim}, context.GetPlace()); GatherBoxesLabels(context, boxes, gt_boxes, gt_classes, fg_inds, bg_inds, - mapped_gt_inds, &sampled_boxes, &sampled_labels, - &sampled_gts); + gt_inds, &sampled_boxes, &sampled_labels, &sampled_gts); // Compute targets Tensor bbox_targets_single; @@ -340,9 +305,6 @@ std::vector SampleRoisForOneImage( for (int64_t i = 0; i < boxes_num; ++i) { int label = sampled_labels_data[i]; if (label > 0) { - if (is_cls_agnostic) { - label = 1; - } int dst_idx = i * width + kBoxDim * label; int src_idx = kBoxDim * i; bbox_targets_data[dst_idx] = bbox_targets_single_data[src_idx]; @@ -394,8 +356,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { context.Attr>("bbox_reg_weights"); int class_nums = context.Attr("class_nums"); bool use_random = context.Attr("use_random"); - bool is_cascade_rcnn = context.Attr("is_cascade_rcnn"); - bool is_cls_agnostic = context.Attr("is_cls_agnostic"); + PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL, "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD"); PADDLE_ENFORCE_EQ( @@ -450,7 +411,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { dev_ctx, rpn_rois_slice, gt_classes_slice, is_crowd_slice, gt_boxes_slice, im_info_slice, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, - engine, use_random, is_cascade_rcnn, is_cls_agnostic); + engine, use_random); Tensor sampled_rois = tensor_output[0]; Tensor sampled_labels_int32 = tensor_output[1]; Tensor sampled_bbox_targets = tensor_output[2]; @@ -552,13 +513,6 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "use_random", "Use random sampling to choose foreground and background boxes.") .SetDefault(true); - AddAttr("is_cascade_rcnn", - "cascade rcnn sampling policy changed from stage 2.") - .SetDefault(false); - AddAttr( - "is_cls_agnostic", - "the box regress will only include fg and bg locations if set true ") - .SetDefault(false); AddComment(R"DOC( This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth, diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 43deb5f9f38..2dfd9befdb7 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -70,7 +70,8 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); // Allocate temporary storage auto place = boost::get(ctx.GetPlace()); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + auto d_temp_storage = + memory::Alloc(place, temp_storage_bytes, memory::Allocator::kScratchpad); // Run sorting operation cub::DeviceRadixSort::SortPairsDescending( diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 338954346c5..0b8053e8d03 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -202,32 +202,21 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data, } // Reservoir Sampling - int fg_num = 0; - if (rpn_fg_fraction > 0 && rpn_batch_size_per_im > 0) { - fg_num = static_cast(rpn_fg_fraction * rpn_batch_size_per_im); - ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random); - } else { - fg_num = 
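BboxOverlaps above fills a proposals-by-gt IoU matrix. A plain sketch of one pairwise IoU with boxes as [x1, y1, x2, y2], using the pixel-inclusive (+1) width convention this family of detection ops uses:

#include <algorithm>

float IoU(const float* a, const float* b) {
  float ix = std::max(a[0], b[0]), iy = std::max(a[1], b[1]);
  float ax = std::min(a[2], b[2]), ay = std::min(a[3], b[3]);
  float iw = std::max(0.0f, ax - ix + 1), ih = std::max(0.0f, ay - iy + 1);
  float inter = iw * ih;
  float area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return inter / (area_a + area_b - inter);
}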
static_cast(fg_inds_fake.size()); - } + int fg_num = static_cast(rpn_fg_fraction * rpn_batch_size_per_im); + ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random); int fg_fake_num = static_cast(fg_inds_fake.size()); for (int64_t i = 0; i < fg_fake_num; ++i) { target_label[fg_inds_fake[i]] = 1; } + int bg_num = rpn_batch_size_per_im - fg_fake_num; for (int64_t i = 0; i < anchor_num; ++i) { if (anchor_to_gt_max_data[i] < rpn_negative_overlap) { bg_inds_fake.push_back(i); } } - int bg_num = 0; - if (rpn_fg_fraction > 0 && rpn_batch_size_per_im > 0) { - bg_num = rpn_batch_size_per_im - fg_fake_num; - ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random); - bg_num = static_cast(bg_inds_fake.size()); - } else { - bg_num = static_cast(bg_inds_fake.size()); - } - + ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random); + bg_num = static_cast(bg_inds_fake.size()); int fake_num = 0; for (int64_t i = 0; i < bg_num; ++i) { // fg fake found @@ -503,9 +492,9 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Anchor", "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); AddInput("GtBoxes", - "(LoDTensor) input ground-truth bbox with shape [K, 4]."); + "(LoDTensor) input groud-truth bbox with shape [K, 4]."); AddInput("IsCrowd", - "(LoDTensor) input which indicates ground-truth is crowd."); + "(LoDTensor) input which indicates groud-truth is crowd."); AddInput("ImInfo", "(LoDTensor) input image information with shape [N, 3]. " "N is the batch size, each image information includes height, " @@ -547,7 +536,7 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { "ScoreIndex", "(Tensor), The indexes of foreground and background anchors in all " "RPN anchors(The rest anchors are ignored). The shape of the " - "ScoreIndex is [F + B], F and B are sampled foreground and background " + "ScoreIndex is [F + B], F and B are sampled foreground and backgroud " " number."); AddOutput("TargetBBox", "(Tensor), The target bbox deltas with shape " @@ -555,7 +544,7 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput( "TargetLabel", "(Tensor), The target labels of each anchor with shape " - "[F + B, 1], F and B are sampled foreground and background number."); + "[F + B, 1], F and B are sampled foreground and backgroud number."); AddOutput("BBoxInsideWeight", "(Tensor), The bbox inside weight with shape " "[F, 4], F is the sampled foreground number."); @@ -584,440 +573,6 @@ negative do not contribute to the training objective. } }; -class RetinanetTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Anchor", - "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); - AddInput("GtBoxes", - "(LoDTensor) input ground-truth bbox with shape [K, 4]."); - AddInput("GtLabels", - "(LoDTensor) input ground-truth label with shape [K, 1]."); - AddInput("IsCrowd", - "(LoDTensor) input which indicates ground-truth is crowd."); - AddInput("ImInfo", - "(LoDTensor) input image information with shape [N, 3]. 
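ScoreAssign above budgets the RPN minibatch as fg = fraction * batch, with the background budget being whatever remains after the (possibly smaller) sampled foreground set. The arithmetic in isolation (names illustrative):

#include <algorithm>

struct FgBgBudget { int fg; int bg; };

FgBgBudget Budget(int batch_size_per_im, float fg_fraction,
                  int fg_candidates, int bg_candidates) {
  int fg = std::min(static_cast<int>(fg_fraction * batch_size_per_im),
                    fg_candidates);
  int bg = std::min(batch_size_per_im - fg, bg_candidates);
  return {fg, bg};
}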
" - "N is the batch size, each image information includes height, " - "width and scale."); - AddAttr( - "positive_overlap", - "Minimum overlap required between an anchor and ground-truth " - "box for the (anchor, gt box) pair to be a positive example.") - .SetDefault(0.5); - AddAttr( - "negative_overlap", - "Maximum overlap allowed between an anchor and ground-truth " - "box for the (anchor, gt box) pair to be a negative examples.") - .SetDefault(0.4); - AddOutput( - "LocationIndex", - "(Tensor), The indexes of foreground anchors in all anchors, the " - "shape of the LocationIndex is [F], F depends on the value of input " - "tensor and attributes."); - AddOutput( - "ScoreIndex", - "(Tensor), The indexes of foreground and background anchors in all " - "RPN anchors(The rest anchors are ignored). The shape of the " - "ScoreIndex is [F + B], F and B are foreground and background " - " number."); - AddOutput("TargetBBox", - "(Tensor), The target bbox deltas with shape " - "[F, 4], F is the foreground number."); - AddOutput("TargetLabel", - "(Tensor), The target labels of each anchor with shape " - "[F + B, 1], F and B are foreground and background number."); - AddOutput("BBoxInsideWeight", - "(Tensor), The bbox inside weight with shape " - "[F, 4], F is the foreground number."); - AddOutput("ForegroundNumber", - "(Tensor), The foreground number. " - "[1, 1]."); - AddComment(R"DOC( - This layer can be, for given the Intersection-over-Union (IoU) overlap - between anchors and ground truth boxes, to assign classification and - regression targets to each anchor, these target labels are used for - train retinanet. - - Every anchor is assigned with a length C one-hot vector of - classification targets, and a 4-vector of box regression targets, - where C is the class number. The assignment rules are as followed: - - 1. Anchors are assigned to ground-truth boxes when: (i) it has the highest - IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher - than positive_overlap(0.5) with any ground-truth box. - - 2. Anchors are assigned to background when its IoU ratio is lower than - negative_overlap (0.4) for all ground-truth boxes. - - When an anchor is assigned with a ground-truth box which is the i-th category, - the i-th entry in its C vector of targets is set to 1 and all other entries - are set to 0. When an anchor is assigned with background, all entries are set - to 0. Anchors that are not assigned do not contribute to the training - objective. The regression targets are the encoded ground-truth boxes - associated with the assigned anchors. 
- -)DOC"); - } -}; - -class RetinanetTargetAssignOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE( - ctx->HasInput("Anchor"), - "Input(Anchor) of RetinanetTargetAssignOp should not be null"); - PADDLE_ENFORCE( - ctx->HasInput("GtBoxes"), - "Input(GtBoxes) of RetinanetTargetAssignOp should not be null"); - PADDLE_ENFORCE( - ctx->HasInput("GtLabels"), - "Input(GtLabels) of RetinanetTargetAssignOp should not be null"); - PADDLE_ENFORCE( - ctx->HasInput("IsCrowd"), - "Input(Anchor) of RetinanetTargetAssignOp should not be null"); - PADDLE_ENFORCE( - ctx->HasInput("ImInfo"), - "Input(ImInfo) of RetinanetTargetAssignOp should not be null"); - - PADDLE_ENFORCE( - ctx->HasOutput("LocationIndex"), - "Output(LocationIndex) of RetinanetTargetAssignOp should not be null"); - PADDLE_ENFORCE( - ctx->HasOutput("ScoreIndex"), - "Output(ScoreIndex) of RetinanetTargetAssignOp should not be null"); - PADDLE_ENFORCE( - ctx->HasOutput("TargetLabel"), - "Output(TargetLabel) of RetinanetTargetAssignOp should not be null"); - PADDLE_ENFORCE( - ctx->HasOutput("TargetBBox"), - "Output(TargetBBox) of RetinanetTargetAssignOp should not be null"); - PADDLE_ENFORCE(ctx->HasOutput("BBoxInsideWeight"), - "Output(BBoxInsideWeight) of RetinanetTargetAssignOp should " - "not be null"); - PADDLE_ENFORCE(ctx->HasOutput("ForegroundNumber"), - "Output(ForegroundNumber) of RetinanetTargetAssignOp should " - "not be null"); - - auto anchor_dims = ctx->GetInputDim("Anchor"); - auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); - auto gt_labels_dims = ctx->GetInputDim("GtLabels"); - auto im_info_dims = ctx->GetInputDim("ImInfo"); - - PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, - "The rank of Input(Anchor) must be 2."); - PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, - "The rank of Input(GtBoxes) must be 2."); - PADDLE_ENFORCE_EQ(gt_labels_dims.size(), 2, - "The rank of Input(GtLabels) must be 2."); - PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, - "The rank of Input(ImInfo) must be 2."); - - ctx->SetOutputDim("LocationIndex", {gt_labels_dims[0]}); - ctx->SetOutputDim("ScoreIndex", {gt_labels_dims[0]}); - ctx->SetOutputDim("TargetBBox", {gt_labels_dims[0], 4}); - ctx->SetOutputDim("TargetLabel", {gt_labels_dims[0], 1}); - ctx->SetOutputDim("BBoxInsideWeight", {gt_labels_dims[0], 4}); - ctx->SetOutputDim("ForegroundNumber", {gt_labels_dims[0], 1}); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - ctx.Input("Anchor")->type(), - platform::CPUPlace()); - } -}; - -template -std::vector FilterCrowdGtBoxLabel( - const platform::CPUDeviceContext& context, Tensor* gt_boxes, - Tensor* gt_labels, Tensor* is_crowd) { - int gt_num = gt_boxes->dims()[0]; - std::vector not_crowd_inds; - auto* is_crowd_data = is_crowd->data(); - for (int i = 0; i < gt_num; ++i) { - if (is_crowd_data[i] == 0) { - not_crowd_inds.emplace_back(i); - } - } - int ncrowd_num = not_crowd_inds.size(); - Tensor ncrowd_gt_boxes, ncrowd_gt_labels; - T* ncrowd_gt_boxes_data = - ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); - int* ncrowd_gt_labels_data = - ncrowd_gt_labels.mutable_data({ncrowd_num, 1}, context.GetPlace()); - Gather(gt_boxes->data(), 4, not_crowd_inds.data(), ncrowd_num, - ncrowd_gt_boxes_data); - Gather(gt_labels->data(), 1, not_crowd_inds.data(), ncrowd_num, - ncrowd_gt_labels_data); - 
std::vector res; - res.emplace_back(ncrowd_gt_boxes); - res.emplace_back(ncrowd_gt_labels); - return res; -} - -template -std::vector GetAllFgBgGt(const platform::CPUDeviceContext& ctx, - const Tensor& anchor_by_gt_overlap, - const Tensor& ncrowd_gt_labels, - const float positive_overlap, - const float negative_overlap, - std::minstd_rand engine) { - auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); - int anchor_num = anchor_by_gt_overlap.dims()[0]; - int gt_num = anchor_by_gt_overlap.dims()[1]; - - std::vector fg_inds; - std::vector bg_inds; - std::vector gt_inds; - std::vector tgt_lbl; - std::vector fg_fake; - std::vector bbox_inside_weight; - // Calculate the max IoU between anchors and gt boxes - // Map from anchor to gt box that has highest overlap - auto place = ctx.GetPlace(); - Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; - anchor_to_gt_max.mutable_data({anchor_num}, place); - int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); - gt_to_anchor_max.mutable_data({gt_num}, place); - - auto anchor_by_gt_overlap_et = - framework::EigenMatrix::From(anchor_by_gt_overlap); - auto anchor_to_gt_max_et = - framework::EigenVector::Flatten(anchor_to_gt_max); - auto gt_to_anchor_max_et = - framework::EigenVector::Flatten(gt_to_anchor_max); - auto anchor_to_gt_argmax_et = - framework::EigenVector::Flatten(anchor_to_gt_argmax); - anchor_to_gt_max_et = - anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); - anchor_to_gt_argmax_et = - anchor_by_gt_overlap_et.argmax(1).template cast(); - gt_to_anchor_max_et = - anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); - - ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max, -1, - -1, positive_overlap, negative_overlap, &fg_inds, &bg_inds, - &tgt_lbl, &fg_fake, &bbox_inside_weight, engine, false); - const int* gt_labels_data = ncrowd_gt_labels.data(); - int64_t fg_num = fg_inds.size(); - for (int64_t i = 0; i < fg_num; ++i) { - int gt_idx = argmax[fg_inds[i]]; - tgt_lbl[i] = gt_labels_data[gt_idx]; - } - - int bg_num = bg_inds.size(); - int fg_fake_num = fg_fake.size(); - gt_inds.reserve(fg_fake_num); - for (int i = 0; i < fg_fake_num; ++i) { - gt_inds.emplace_back(argmax[fg_fake[i]]); - } - - Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t; - Tensor fg_num_t; - int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); - int* score_index_data = - score_index_t.mutable_data({fg_num + bg_num}, place); - int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); - int* gt_inds_data = gt_inds_t.mutable_data({fg_fake_num}, place); - int* fg_num_data = fg_num_t.mutable_data({1}, place); - T* bbox_inside_weight_data = - bbox_inside_weight_t.mutable_data({fg_fake_num, 4}, place); - std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data); - std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); - std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num); - std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); - std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); - std::copy(bbox_inside_weight.begin(), bbox_inside_weight.end(), - bbox_inside_weight_data); - fg_num_data[0] = fg_fake.size() + 1; - std::vector loc_score_tgtlbl_gt; - loc_score_tgtlbl_gt.emplace_back(loc_index_t); - loc_score_tgtlbl_gt.emplace_back(score_index_t); - loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); - loc_score_tgtlbl_gt.emplace_back(gt_inds_t); - loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t); - loc_score_tgtlbl_gt.emplace_back(fg_num_t); - 
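The Eigen reductions in GetAllFgBgGt compute, for every anchor, its best-overlapping gt box (max and argmax over the row) and, for every gt box, the best overlap any anchor achieves (max over the column). The same result with plain loops over a row-major anchors x gts IoU matrix:

#include <vector>

void MaxOverlaps(const std::vector<float>& iou, int anchors, int gts,
                 std::vector<float>* anchor_max,
                 std::vector<int>* anchor_argmax,
                 std::vector<float>* gt_max) {
  anchor_max->assign(anchors, -1.0f);
  anchor_argmax->assign(anchors, -1);
  gt_max->assign(gts, -1.0f);
  for (int i = 0; i < anchors; ++i) {
    for (int j = 0; j < gts; ++j) {
      float v = iou[i * gts + j];
      if (v > (*anchor_max)[i]) {
        (*anchor_max)[i] = v;
        (*anchor_argmax)[i] = j;
      }
      if (v > (*gt_max)[j]) (*gt_max)[j] = v;
    }
  }
}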
- return loc_score_tgtlbl_gt; -} - -template -class RetinanetTargetAssignKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 - auto* gt_boxes = context.Input("GtBoxes"); - auto* gt_labels = context.Input("GtLabels"); - auto* is_crowd = context.Input("IsCrowd"); - auto* im_info = context.Input("ImInfo"); - - auto* loc_index = context.Output("LocationIndex"); - auto* score_index = context.Output("ScoreIndex"); - auto* tgt_bbox = context.Output("TargetBBox"); - auto* tgt_lbl = context.Output("TargetLabel"); - auto* bbox_inside_weight = context.Output("BBoxInsideWeight"); - auto* fg_num = context.Output("ForegroundNumber"); - - PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, - "RetinanetTargetAssignOp gt_boxes needs 1 level of LoD"); - PADDLE_ENFORCE_EQ(gt_labels->lod().size(), 1UL, - "RetinanetTargetAssignOp gt_boxes needs 1 level of LoD"); - PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, - "RetinanetTargetAssignOp is_crowd needs 1 level of LoD"); - - int64_t anchor_num = static_cast(anchor->dims()[0]); - int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); - - float positive_overlap = context.Attr("positive_overlap"); - float negative_overlap = context.Attr("negative_overlap"); - - int64_t max_num = batch_num * anchor_num; - auto place = context.GetPlace(); - - loc_index->mutable_data({max_num}, place); - score_index->mutable_data({max_num}, place); - tgt_bbox->mutable_data({max_num, 4}, place); - tgt_lbl->mutable_data({max_num, 1}, place); - bbox_inside_weight->mutable_data({max_num, 4}, place); - fg_num->mutable_data({batch_num, 1}, place); - auto& dev_ctx = context.device_context(); - - std::random_device rnd; - std::minstd_rand engine; - int seed = rnd(); - engine.seed(seed); - - framework::LoD lod_loc, loc_score, lod_fg; - std::vector lod0_loc(1, 0); - std::vector lod0_score(1, 0); - std::vector lod0_fg(1, 0); - - int total_loc_num = 0; - int total_score_num = 0; - int total_fg_num = 0; - auto gt_boxes_lod = gt_boxes->lod().back(); - auto gt_labels_lod = gt_labels->lod().back(); - auto is_crowd_lod = is_crowd->lod().back(); - for (int i = 0; i < batch_num; ++i) { - Tensor gt_boxes_slice = - gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); - Tensor gt_labels_slice = - gt_labels->Slice(gt_labels_lod[i], gt_labels_lod[i + 1]); - Tensor is_crowd_slice = - is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); - Tensor im_info_slice = im_info->Slice(i, i + 1); - auto* im_info_data = im_info_slice.data(); - auto im_height = im_info_data[0]; - auto im_width = im_info_data[1]; - auto im_scale = im_info_data[2]; - - // Filter straddle anchor - std::vector filter_output = - FilterStraddleAnchor(dev_ctx, anchor, -1, im_height, im_width); - Tensor inds_inside = filter_output[0]; - Tensor inside_anchor = filter_output[1]; - - // Filter crowd gt - std::vector ncrowd_output = FilterCrowdGtBoxLabel( - dev_ctx, >_boxes_slice, >_labels_slice, &is_crowd_slice); - Tensor ncrowd_gt_boxes = ncrowd_output[0]; - Tensor ncrowd_gt_labels = ncrowd_output[1]; - - auto ncrowd_gt_boxes_et = - framework::EigenTensor::From(ncrowd_gt_boxes); - ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; - - Tensor anchor_by_gt_overlap; - anchor_by_gt_overlap.mutable_data( - {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); - BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); - - auto loc_score_tgtlbl_gt = - GetAllFgBgGt(dev_ctx, anchor_by_gt_overlap, 
ncrowd_gt_labels, - positive_overlap, negative_overlap, engine); - - Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; - Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; - Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; - Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; - Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; - Tensor sampled_fg_num = loc_score_tgtlbl_gt[5]; - - int loc_num = sampled_loc_index.dims()[0]; - int score_num = sampled_score_index.dims()[0]; - // unmap to all anchor - Tensor sampled_loc_index_unmap, sampled_score_index_unmap; - sampled_loc_index_unmap.mutable_data({loc_num}, place); - sampled_score_index_unmap.mutable_data({score_num}, place); - Gather(inds_inside.data(), 1, sampled_loc_index.data(), - loc_num, sampled_loc_index_unmap.data()); - Gather(inds_inside.data(), 1, sampled_score_index.data(), - score_num, sampled_score_index_unmap.data()); - - // get target bbox deltas - Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; - auto* sampled_anchor_data = - sampled_anchor.mutable_data({loc_num, 4}, place); - auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); - Gather(anchor->data(), 4, sampled_loc_index_unmap.data(), - loc_num, sampled_anchor_data); - Gather(ncrowd_gt_boxes.data(), 4, sampled_gt_index.data(), - loc_num, sampled_gt_data); - sampled_tgt_bbox.mutable_data({loc_num, 4}, place); - BoxToDelta(loc_num, sampled_anchor, sampled_gt, nullptr, false, - &sampled_tgt_bbox); - - // Add anchor offset - int anchor_offset = i * anchor_num; - auto sampled_loc_index_unmap_et = - framework::EigenTensor::From(sampled_loc_index_unmap); - sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; - auto sampled_score_index_unmap_et = - framework::EigenTensor::From(sampled_score_index_unmap); - sampled_score_index_unmap_et = - sampled_score_index_unmap_et + anchor_offset; - AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); - AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); - AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); - AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); - AppendRpns(bbox_inside_weight, total_loc_num * 4, - &sampled_bbox_inside_weight); - AppendRpns(fg_num, total_fg_num, &sampled_fg_num); - - total_loc_num += loc_num; - total_score_num += score_num; - total_fg_num += 1; - lod0_loc.emplace_back(total_loc_num); - lod0_score.emplace_back(total_score_num); - lod0_fg.emplace_back(total_fg_num); - } - - PADDLE_ENFORCE_LE(total_loc_num, max_num); - PADDLE_ENFORCE_LE(total_score_num, max_num); - PADDLE_ENFORCE_LE(total_fg_num, batch_num); - - lod_loc.emplace_back(lod0_loc); - loc_score.emplace_back(lod0_score); - lod_fg.emplace_back(lod0_fg); - loc_index->set_lod(lod_loc); - score_index->set_lod(loc_score); - tgt_bbox->set_lod(lod_loc); - tgt_lbl->set_lod(loc_score); - bbox_inside_weight->set_lod(lod_loc); - fg_num->set_lod(lod_fg); - loc_index->Resize({total_loc_num}); - score_index->Resize({total_score_num}); - tgt_bbox->Resize({total_loc_num, 4}); - tgt_lbl->Resize({total_score_num, 1}); - bbox_inside_weight->Resize({total_loc_num, 4}); - fg_num->Resize({total_fg_num, 1}); - } -}; - } // namespace operators } // namespace paddle @@ -1027,9 +582,3 @@ REGISTER_OPERATOR(rpn_target_assign, ops::RpnTargetAssignOp, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(rpn_target_assign, ops::RpnTargetAssignKernel, ops::RpnTargetAssignKernel); -REGISTER_OPERATOR(retinanet_target_assign, ops::RetinanetTargetAssignOp, - ops::RetinanetTargetAssignOpMaker, - 
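Sampling in this kernel happens over the inside anchors only, so sampled indices are translated ("unmapped") back into the full anchor list and then shifted by i * anchor_num for image i. The translation in isolation:

#include <vector>

std::vector<int> UnmapIndices(const std::vector<int>& sampled,
                              const std::vector<int>& inds_inside,
                              int anchor_offset) {
  std::vector<int> out(sampled.size());
  for (size_t k = 0; k < sampled.size(); ++k) {
    out[k] = inds_inside[sampled[k]] + anchor_offset;
  }
  return out;
}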
paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(retinanet_target_assign, - ops::RetinanetTargetAssignKernel, - ops::RetinanetTargetAssignKernel); diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 8909135d234..f6531ec9edc 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -29,7 +29,7 @@ if(WITH_GRPC) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc - DEPS ${RPC_DEPS} scope profiler math_function) + DEPS ${RPC_DEPS} scope profiler math_function SERIAL) else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) @@ -47,12 +47,12 @@ else() set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc - DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op) + DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op SERIAL) endif() cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op) + DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) @@ -62,5 +62,5 @@ cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} - selected_rows_functor scope math_function) + selected_rows_functor scope math_function SERIAL) endif() diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index 3a185667e7a..b528bcdd32b 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -15,7 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/distributed/communicator.h" #include -#include #include // NOLINT #include // NOLINT @@ -30,7 +29,7 @@ DEFINE_bool(communicator_independent_recv_thread, true, "use an independent to recv vars from parameter server"); DEFINE_int32(communicator_send_queue_size, 20, "queue size to recv gradient before send"); -DEFINE_int32(communicator_min_send_grad_num_before_recv, 20, +DEFINE_int32(communicator_max_send_grad_num_before_recv, 20, "max grad num to send before recv parameters"); DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv"); DEFINE_int32(communicator_send_wait_times, 5, @@ -51,7 +50,8 @@ inline double GetCurrentUS() { return 1e+6 * time.tv_sec + time.tv_usec; } -std::shared_ptr Communicator::communicator_(nullptr); +std::unique_ptr Communicator::communicator_(nullptr); +std::once_flag Communicator::init_flag_; Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, const RpcCtxMap &recv_varname_to_ctx, @@ -64,8 +64,8 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, << FLAGS_communicator_independent_recv_thread; VLOG(0) << "communicator_send_queue_size: " << FLAGS_communicator_send_queue_size; - VLOG(0) << "communicator_min_send_grad_num_before_recv: " - << FLAGS_communicator_min_send_grad_num_before_recv; + VLOG(0) << "communicator_max_send_grad_num_before_recv: " + << FLAGS_communicator_max_send_grad_num_before_recv; VLOG(0) << "communicator_thread_pool_size: " << FLAGS_communicator_thread_pool_size; VLOG(0) << "communicator_send_wait_times: " @@ -84,17 +84,11 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, } Communicator::~Communicator() { - if (FLAGS_v >= 3) { - std::string msg("~Communicator"); - fwrite(msg.c_str(), msg.length(), 1, stdout); - } + VLOG(3) << "~Communicator"; running_ = false; if (send_thread_) send_thread_->join(); if (recv_thread_) recv_thread_->join(); - if (FLAGS_v >= 3) { - std::string msg("~Communicator done"); - fwrite(msg.c_str(), msg.length(), 1, stdout); - } + VLOG(3) << "~Communicator done"; } void Communicator::SendThread() { @@ -150,7 +144,7 @@ void Communicator::SendThread() { task_futures.emplace_back( send_threadpool_->enqueue(std::move(send_task))); } else { - VLOG(4) << var_name << " queue empty"; + VLOG(3) << var_name << " queue empty"; } } for (auto &task_f : task_futures) { @@ -166,19 +160,17 @@ void Communicator::SendThread() { RecvAll(); } } - VLOG(0) << "communicator stopped, send thread exit"; } void Communicator::RecvAll() { VLOG(3) << "parallel run recv graph"; - if (!running_) return; auto before_send = GetCurrentUS(); std::vector> task_futures; task_futures.reserve(recv_varname_to_ctx_.size()); for (auto &iter : recv_varname_to_ctx_) { auto recv_task = [this, &iter] { auto &var_name = iter.first; - VLOG(4) << "recv var " << var_name; + VLOG(3) << "recv var " << var_name; auto recv_functor = distributed::ParameterRecv(); if (!FLAGS_communicator_fake_rpc) { recv_functor(iter.second, *recv_scope_); @@ -197,7 +189,7 @@ void Communicator::RecvThread() { VLOG(3) << "RecvThread start!"; while (running_) { auto grad_num = grad_num_.load(); - if (grad_num > FLAGS_communicator_min_send_grad_num_before_recv) { + if (grad_num > FLAGS_communicator_max_send_grad_num_before_recv) { VLOG(1) << "current grad num " << grad_num; RecvAll(); grad_num_.store(0); @@ -205,7 +197,6 @@ void Communicator::RecvThread() { std::this_thread::sleep_for(std::chrono::milliseconds(10)); } } - VLOG(0) << "communicator stopped, recv thread exit"; } void 
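The communicator keeps one process-wide instance behind Init/GetInstance. A minimal sketch of that singleton shape; it uses std::call_once as one thread-safe variant of the null check above (Comm is an illustrative name):

#include <memory>
#include <mutex>

class Comm {
 public:
  static void Init() {
    std::call_once(init_flag_, [] { instance_.reset(new Comm()); });
  }
  static Comm* GetInstance() { return instance_.get(); }

 private:
  Comm() = default;  // construction only through Init
  static std::once_flag init_flag_;
  static std::unique_ptr<Comm> instance_;
};
std::once_flag Comm::init_flag_;
std::unique_ptr<Comm> Comm::instance_;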
Communicator::Send(const std::string &var_name, @@ -221,90 +212,17 @@ void Communicator::Send(const std::string &var_name, queue->Push(tmp_grad_var); } -void Communicator::Init(const paddle::framework::ProgramDesc &program, - Scope *param_scope) { - using RpcCtxMap = operators::distributed::RpcCtxMap; - VLOG(3) << "ProcessGraph"; - RpcCtxMap send_varname_to_ctx; - RpcCtxMap recv_varname_to_ctx; - for (auto *op : program.Block(0).AllOps()) { - VLOG(3) << "node name " << op->Type(); - if (op->Type() == "send") { - auto send_var_name = op->Input("X")[0]; - auto send_varnames = boost::get>( - op->GetNullableAttr("send_varnames")); - auto epmap = - boost::get>(op->GetNullableAttr("epmap")); - auto height_section = - boost::get>(op->GetNullableAttr("sections")); - auto trainer_id = boost::get(op->GetNullableAttr("trainer_id")); - send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext( - send_var_name, send_varnames, epmap, height_section, trainer_id); - VLOG(3) << "find and init an send op: " - << send_varname_to_ctx[send_var_name]; - } else if (op->Type() == "recv") { - auto do_not_run = boost::get(op->GetNullableAttr("do_not_run")); - PADDLE_ENFORCE_GT(do_not_run, 0, "recv should not run!"); - auto recv_var_name = op->Output("Out")[0]; - auto recv_varnames = boost::get>( - op->GetNullableAttr("recv_varnames")); - auto epmap = - boost::get>(op->GetNullableAttr("epmap")); - auto trainer_id = boost::get(op->GetNullableAttr("trainer_id")); - recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext( - recv_var_name, recv_varnames, epmap, {}, trainer_id); - } - } - - // init communicator here - if (send_varname_to_ctx.size() == 0 && recv_varname_to_ctx.size() == 0) { - LOG(WARNING) << "no var need to send and recv!!"; - } - operators::distributed::Communicator::Init(send_varname_to_ctx, - recv_varname_to_ctx, param_scope); -} - Communicator *Communicator::GetInstance() { return communicator_.get(); } -std::shared_ptr Communicator::GetInstantcePtr() { - return communicator_; -} - void Communicator::Start() { - VLOG(0) << "Communicator start"; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - VLOG(1) << "start send thread and recv thread"; - running_ = true; - // start send and recv thread - send_thread_.reset( - new std::thread(std::bind(&Communicator::SendThread, this))); - if (FLAGS_communicator_independent_recv_thread) { - recv_thread_.reset( - new std::thread(std::bind(&Communicator::RecvThread, this))); - } - } -} - -void Communicator::Stop() { - VLOG(0) << "Communicator stop"; - running_ = false; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - if (send_thread_) { - VLOG(1) << "stop send thread"; - send_thread_->join(); - send_thread_.reset(nullptr); - } - if (recv_thread_) { - VLOG(1) << "stop recv thread"; - recv_thread_->join(); - recv_thread_.reset(nullptr); - } + running_ = true; + // start send and recv thread + send_thread_.reset( + new std::thread(std::bind(&Communicator::SendThread, this))); + if (FLAGS_communicator_independent_recv_thread) { + recv_thread_.reset( + new std::thread(std::bind(&Communicator::RecvThread, this))); } - VLOG(0) << "Communicator stop done"; } } // namespace distributed diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 17f68fb4f1b..37c39eb1511 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -165,7 
+165,6 @@ class Communicator { ~Communicator(); void Start(); - void Stop(); // send grad void Send(const std::string& var_name, const framework::Scope& scope); @@ -182,8 +181,8 @@ class Communicator { send_varname_to_queue_; RpcCtxMap send_varname_to_ctx_; RpcCtxMap recv_varname_to_ctx_; - std::unique_ptr send_thread_{nullptr}; - std::unique_ptr recv_thread_{nullptr}; + std::unique_ptr send_thread_; + std::unique_ptr recv_thread_; Scope* recv_scope_; // should be global scope std::unique_ptr send_scope_; // an independent scope std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; @@ -194,21 +193,25 @@ class Communicator { public: static void Init(const RpcCtxMap& send_varname_to_ctx, const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope) { + InitImpl(send_varname_to_ctx, recv_varname_to_ctx, recv_scope); + } + + static Communicator* GetInstance(); + + private: + // Init is called by GetInstance. + static void InitImpl(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, + Scope* recv_scope) { if (communicator_ == nullptr) { communicator_.reset(new Communicator(send_varname_to_ctx, recv_varname_to_ctx, recv_scope)); } } - static void Init(const paddle::framework::ProgramDesc& program, - Scope* param_scope); - - static Communicator* GetInstance(); - - static std::shared_ptr GetInstantcePtr(); - private: - static std::shared_ptr communicator_; + static std::once_flag init_flag_; + static std::unique_ptr communicator_; }; } // namespace distributed diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 876b764a751..a41536368ab 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -104,7 +104,7 @@ bool RequestGetHandler::Handle(const std::string& varname, } else { if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { if (enable_dc_asgd_) { - // NOTE: the format is determined by distribute_transpiler.py + // NOTE: the format is determined by distributed_transpiler.py std::string param_bak_name = string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 9bd2c9928cc..f598b3780bf 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -40,7 +40,8 @@ static TensorPayload GetCommunicationAllocationFromTensor( reinterpret_cast(ctx); auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); platform::CUDAPinnedPlace cuda_pinned; - auto result = memory::AllocShared(cuda_pinned, copy_size); + auto result = memory::AllocShared( + cuda_pinned, copy_size, memory::allocation::Allocator::kCrossDevice); memory::Copy(cuda_pinned, result->ptr(), boost::get(tensor.place()), diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.h b/paddle/fluid/operators/distributed_ops/allreduce_op.h index 0275f6a9cf3..8c143867618 100644 --- a/paddle/fluid/operators/distributed_ops/allreduce_op.h +++ b/paddle/fluid/operators/distributed_ops/allreduce_op.h @@ -39,7 +39,6 @@ class AllReduceOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto in = ctx.Input("X"); auto out = ctx.Output("Out"); - int dtype = platform::ToNCCLDataType(in->type()); int64_t numel = in->numel(); auto* 
sendbuff = in->data(); @@ -67,10 +66,12 @@ class AllReduceOpKernel : public framework::OpKernel { red_type = ncclMin; break; } + VLOG(0) << "call allreduce with type: " << reduce_type; PADDLE_ENFORCE(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, static_cast(dtype), red_type, comm, stream)); if (ctx.Attr("sync_mode")) { + VLOG(0) << "sync allreduce..."; cudaError_t e_sync = cudaStreamSynchronize(stream); if (e_sync != 0) { LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync); diff --git a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc index c33842c06e4..80d712a0e02 100644 --- a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc @@ -41,132 +41,31 @@ class GenNCCLIdOp : public framework::OperatorBase { // put nccl id in CPUPlace auto& dev_ctx = *pool.Get(platform::CPUPlace()); int trainer_id = Attr("trainer_id"); - - std::vector trainers = - Attr>("trainers"); - PADDLE_ENFORCE( - trainer_id >= 0 && trainer_id < static_cast(trainers.size()), - "trainer_id:%d must be in trainers.size range", trainer_id); - std::string endpoint = trainers[trainer_id]; - framework::Scope& local_scope = scope.NewScope(); - int nccl_comm_num = Attr("nccl_comm_num"); - int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); - int inter_nranks = Attr("hierarchical_allreduce_inter_nranks"); - - int inter_trainer_id = -1; - int exter_trainer_id = -1; - if (use_hierarchical_allreduce) { - PADDLE_ENFORCE(trainers.size() > 1, "trainers.size():%llu < 1", - trainers.size()); - PADDLE_ENFORCE(inter_nranks > 1, "inter_nranks:%d < 1", inter_nranks); - PADDLE_ENFORCE((trainers.size() % inter_nranks == 0), - "trainers.size():%llu mod inter_nranks:%d != 0", - trainers.size(), inter_nranks); - - inter_trainer_id = trainer_id % inter_nranks; - - if (trainer_id % inter_nranks == 0) { - exter_trainer_id = trainer_id / inter_nranks; - } - } - - if (trainer_id != 0) { - GetIdByServer(endpoint, &local_scope, dev_ctx, nccl_comm_num, - use_hierarchical_allreduce, trainer_id, inter_trainer_id, - exter_trainer_id); - } - - std::ostringstream ss; - for (size_t i = 0; i < trainers.size(); i++) { - ss << trainers[i] << ","; - } - - VLOG(1) << "trainer_id:" << trainer_id - << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce - << ", inter_nranks:" << inter_nranks - << ", inter_trainer_id:" << inter_trainer_id - << ", exter_trainer_id:" << exter_trainer_id - << ", trainers:" << ss.str(); - - // init flat if (trainer_id == 0) { - std::vector flat_endpoints; - flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1, - trainers.end()); - // flat nccl_id - for (int i = 0; i < nccl_comm_num; i++) { - std::string var_name = platform::GetFlatNCCLVarName(i); - GenerateAndSend(&local_scope, dev_ctx, var_name, flat_endpoints); - } - } - - if (!use_hierarchical_allreduce) { - return; - } - - PADDLE_ENFORCE(trainers.size() % inter_nranks == 0, - "enpoints.size:%llu mod inter_nranks:%d should ==0", - trainers.size(), inter_nranks); - PADDLE_ENFORCE(inter_nranks > 1, "inter_nranks:%d must > 1", inter_nranks); - - // hierarchical inter ncclid - if (inter_trainer_id == 0) { - std::ostringstream ss; - ss << endpoint; - std::vector inter_endpoints; - for (int i = trainer_id + 1; i < trainer_id + inter_nranks && - i < static_cast(trainers.size()); - i++) { - ss << ","; - inter_endpoints.push_back(trainers[i]); - ss << trainers[i]; - } - VLOG(1) << "Hierarchical 
inter ring endpoints:" << ss.str(); - for (int i = 0; i < nccl_comm_num; i++) { - std::string nccl_var_name = - platform::GetHierarchicalInterNCCLVarName(i); - GenerateAndSend(&local_scope, dev_ctx, nccl_var_name, inter_endpoints); - } - } - - // hierarchical exter ncclid - if (exter_trainer_id == 0) { - std::ostringstream ss; - std::vector exter_endpoints; - ss << endpoint; - for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) { - ss << ","; - exter_endpoints.push_back(trainers[i]); - ss << trainers[i]; - } - VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str(); - for (int i = 0; i < nccl_comm_num; i++) { - std::string nccl_var_name = - platform::GetHierarchicalExterNCCLVarName(i); - GenerateAndSend(&local_scope, dev_ctx, nccl_var_name, exter_endpoints); - } + GenerateAndSend(&local_scope, dev_ctx); + } else { + GetIdByServer(&local_scope, dev_ctx); } } private: void GenerateAndSend(framework::Scope* scope, - const platform::DeviceContext& dev_ctx, - const std::string& nccl_id_name, - const std::vector& endpoint_list) const { - auto var = scope->FindVar(nccl_id_name); - PADDLE_ENFORCE_NOT_NULL(var, "can't find nccl_id_var_name:%s", - nccl_id_name); + const platform::DeviceContext& dev_ctx) const { + auto var = scope->FindVar(NCCL_ID_VARNAME); + PADDLE_ENFORCE_NOT_NULL(var); auto id = var->GetMutable(); PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(id)); + std::vector endpoint_list = + Attr>("endpoint_list"); distributed::RPCClient* client = distributed::RPCClient::GetInstance(0); for (auto& ep : endpoint_list) { - VLOG(3) << "sending nccl_id_var:" << nccl_id_name << " to " << ep; - client->AsyncSendVar(ep, dev_ctx, *scope, nccl_id_name); + VLOG(3) << "sending nccl id to " << ep; + client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME); } client->Wait(); for (auto& ep : endpoint_list) { @@ -176,11 +75,9 @@ class GenNCCLIdOp : public framework::OperatorBase { VLOG(3) << "sending completed..."; } - void GetIdByServer(const std::string& endpoint, framework::Scope* scope, - const platform::DeviceContext& dev_ctx, int nccl_comm_num, - bool use_hierarchical_allreduce, int trainer_id, - int inter_trainer_id, int exter_trainer_id) const { - // std::string endpoint = Attr("endpoint"); + void GetIdByServer(framework::Scope* scope, + const platform::DeviceContext& dev_ctx) const { + std::string endpoint = Attr("endpoint"); // NOTE: Can not use unique_ptr here because the default // deleter will call GRPC Server's base class's dtor and // that will cause a wired crash. 
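// Editorial sketch, not part of this patch: the simplified id-exchange flow
// this operator implements once the hierarchical-allreduce paths above are
// removed. Only ncclGetUniqueId() is a real API below; SendIdToPeers() and
// ReceiveIdFromRoot() are assumed stand-ins for the RPCClient/RPCServer
// plumbing in GenerateAndSend()/GetIdByServer().
#include <nccl.h>
#include <string>
#include <vector>

static void SendIdToPeers(const ncclUniqueId& id,
                          const std::vector<std::string>& peers) {}  // RPC send
static void ReceiveIdFromRoot(ncclUniqueId* id) {}  // block until the id arrives

void ExchangeNCCLId(int trainer_id, const std::vector<std::string>& peers) {
  ncclUniqueId id;
  if (trainer_id == 0) {
    ncclGetUniqueId(&id);      // the root trainer generates the id once
    SendIdToPeers(id, peers);  // ...and pushes it to every other trainer
  } else {
    ReceiveIdFromRoot(&id);    // everyone else waits on a local RPC server
  }
  // All trainers can now pass the same id to ncclCommInitRank().
}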
@@ -201,44 +98,10 @@ class GenNCCLIdOp : public framework::OperatorBase { std::thread server_thread( std::bind(&distributed::RPCServer::StartServer, rpc_service.get())); - for (int i = 0; i < nccl_comm_num; i++) { - rpc_service->SetCond(distributed::kRequestSend); - VLOG(3) << "trainer_id:" << trainer_id - << " start getting nccl id from trainer 0, nccl_comm_no:" << i; - rpc_service->WaitBarrier(distributed::kRequestSend); - rpc_service->ResetBarrierCounter(); - } - - if (use_hierarchical_allreduce) { - if (inter_trainer_id > 0) { - for (int i = 0; i < nccl_comm_num; i++) { - rpc_service->SetCond(distributed::kRequestSend); - VLOG(3) << "trainer_id:" << trainer_id - << ", inter_trainer_id:" << inter_trainer_id - << " start getting nccl id from inter_trainer:" << i; - rpc_service->WaitBarrier(distributed::kRequestSend); - rpc_service->ResetBarrierCounter(); - } - } - - if (exter_trainer_id > 0) { - for (int i = 0; i < nccl_comm_num; i++) { - rpc_service->SetCond(distributed::kRequestSend); - VLOG(3) - << "trainer_id:" << trainer_id - << ", exter_trainer_id:" << exter_trainer_id - << " start getting nccl id from exter_trainer 0, nccl_comm_no:" - << i; - rpc_service->WaitBarrier(distributed::kRequestSend); - rpc_service->ResetBarrierCounter(); - } - } - } - - VLOG(3) << "traier_id:" << trainer_id - << ", inter_trainer_id:" << inter_trainer_id - << ", exter_trainer_id:" << exter_trainer_id - << " got nccl id and stop server..."; + rpc_service->SetCond(distributed::kRequestSend); + VLOG(3) << "start getting nccl id from trainer 0..."; + rpc_service->WaitBarrier(distributed::kRequestSend); + VLOG(3) << "got nccl id and stop server..."; rpc_service->ShutDown(); VLOG(3) << "rpc server stopped"; server_thread.join(); @@ -255,26 +118,18 @@ GenNCCLId operator For trainer 0: generate a new UniqueId and send it to all the other trainers. For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. )DOC"); + AddAttr("endpoint", + "(string), e.g. 127.0.0.1:6175 " + "current listen endpoint"); AddAttr>( - "trainers", - "['trainer0_ip:port', 'trainer1_ip:port', ...] " - "list of all trainer endpoints") + "endpoint_list", + "['trainer1_ip:port', 'trainer2_ip:port', ...] " + "list of trainer endpoints start from trainer 1") .SetDefault({}); AddAttr("trainer_id", - "(int) " - "The index of the trainer in distributed training."); - AddAttr("nccl_comm_num", - "(int default 1) " - "The number of nccl communicator num.") - .SetDefault(1); - AddAttr("use_hierarchical_allreduce", - "(bool default false) " - "Wheter to use hierarchical allreduce.") - .SetDefault(false); - AddAttr("hierarchical_allreduce_inter_nranks", - "(int default 1) " - "Wheter to use hierarchical allreduce.") - .SetDefault(-1); + "(int default 0) " + "The index of the trainer in distributed training.") + .SetDefault(0); } }; diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index b871859dbb1..8e9846b1fc8 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -36,7 +36,7 @@ class RecvOp : public framework::OperatorBase { void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - int do_not_run = Attr("do_not_run"); + bool do_not_run = Attr("do_not_run"); if (do_not_run) { VLOG(3) << "recv do not run!"; return; @@ -132,7 +132,7 @@ This operator can get variables from server side. 
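// [Editorial note, not part of the patch: Attr<T>(name) resolves the
// attribute through boost::get<T> on the attribute variant, so the read type
// must match the registered type exactly. That is why the kernel-side read
// switched to bool above while the AddAttr/SetDefault declaration below
// switches from SetDefault(0) to SetDefault(false) in the same change.]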
"(vector) " "the splited parameter varnames to be recved from pserver") .SetDefault(std::vector{}); - AddAttr("do_not_run", "if recv need to really run").SetDefault(0); + AddAttr("do_not_run", "if recv need to really run").SetDefault(false); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index bf12d8a1a6d..2b3fc06dcb7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -13,48 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include -#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" - -namespace paddle { -namespace operators { - -class ElementwiseAddDoubleGradDescMaker - : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - std::unique_ptr op(new framework::OpDesc()); - op->SetType("elementwise_add_grad_grad"); - op->SetInput("Y", Input("Y")); - op->SetInput("DOut", Input(framework::GradVarName("Out"))); - op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); - op->SetInput("DDY", OutputGrad(framework::GradVarName("Y"))); - - op->SetAttrMap(Attrs()); - - op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); - return op; - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); -REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_add, "Add", - "Out = X + Y"); - namespace ops = paddle::operators; -REGISTER_OPERATOR(elementwise_add_grad, ops::ElementwiseOpExplicitGrad, - ops::ElementwiseGradOpInplace, - ops::ElementwiseGradNoBufVarsInference, - ops::ElementwiseAddDoubleGradDescMaker); -REGISTER_OPERATOR(elementwise_add_grad_grad, - ops::ElementwiseOpDoubleGradWithoutDXDY); +REGISTER_ELEMWISE_GRAD_MAKER(elementwise_add, Add); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_add, "Add", "Out = X + Y"); REGISTER_OP_CPU_KERNEL( elementwise_add, @@ -68,13 +30,3 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_add_grad_grad, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 8320272b4b6..fed12785f47 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/platform/float16.h" @@ -31,9 +30,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_add_grad_grad, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 7f8b0ffe92f..69f640ab664 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -40,26 +40,25 @@ template typename std::enable_if< std::is_floating_point::value && std::is_same::value>::type -elementwise_add_same_dims(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z) { +elementwise_add(const framework::ExecutionContext &ctx, + const framework::Tensor *x, const framework::Tensor *y, + framework::Tensor *z) { + auto eigen_x = framework::EigenVector::Flatten(*x); + auto eigen_y = framework::EigenVector::Flatten(*y); + auto eigen_z = framework::EigenVector::Flatten(*z); + auto blas = math::GetBlas(ctx); - blas.VADD(x->numel(), x->data(), y->data(), z->data()); + blas.VADD(x->numel(), eigen_x.data(), eigen_y.data(), eigen_z.data()); } template typename std::enable_if< !std::is_floating_point::value || !std::is_same::value>::type -elementwise_add_same_dims(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z) { - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_y = framework::EigenVector::Flatten(*y); - auto eigen_z = framework::EigenVector::Flatten(*z); - - auto &place = *ctx.template device_context().eigen_device(); - eigen_z.device(place) = eigen_x + eigen_y; +elementwise_add(const framework::ExecutionContext &ctx, + const framework::Tensor *x, const framework::Tensor *y, + framework::Tensor *z) { + default_elementwise_add(ctx, x, y, z); } template @@ -74,7 +73,7 @@ class ElementwiseAddKernel : public framework::OpKernel { auto dims_equal = x->dims() == y->dims(); if (dims_equal) { - elementwise_add_same_dims(ctx, x, y, z); + elementwise_add(ctx, x, y, z); } else { default_elementwise_add(ctx, x, y, z); } @@ -161,31 +160,5 @@ class ElementwiseAddGradKernel : public ElemwiseGradKernel { } }; -template -class ElementwiseAddDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - using Tensor = framework::Tensor; - - auto *y = ctx.Input("Y"); - auto *dout = ctx.Input("DOut"); - auto *ddx = ctx.Input("DDX"); - auto *ddy = ctx.Input("DDY"); - - auto *ddout = ctx.Output("DDOut"); - - // ddOut = ddx + ddy - if (ddout) { - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, dout, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - ddout->mutable_data(ctx.GetPlace()); - default_elementwise_add(ctx, &ddx_safe, &ddy_safe, - ddout); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 6689823d4a2..530a54b7ca1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -44,31 
+44,6 @@ class ElementwiseDivGradOpDescMaker : public framework::SingleGradOpDescMaker { } }; -class ElementwiseDivDoubleGradDescMaker - : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - std::unique_ptr op(new framework::OpDesc()); - op->SetType("elementwise_div_grad_grad"); - op->SetInput("Y", Input("Y")); - op->SetInput("Out", Input("Out")); - op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); - op->SetInput("DDY", OutputGrad(framework::GradVarName("Y"))); - op->SetInput("DX", Output(framework::GradVarName("X"))); - - op->SetAttrMap(Attrs()); - - op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); - op->SetOutput("DOut", InputGrad("Out")); - op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); - - return op; - } -}; - } // namespace operators } // namespace paddle @@ -78,9 +53,7 @@ REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, ops::ElementwiseOpInferVarType, ops::ElementwiseDivGradOpDescMaker); -REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad, - ops::ElementwiseDivDoubleGradDescMaker); -REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad); +REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_div, @@ -94,14 +67,3 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel); - -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index b38f84845b7..ae669f55254 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -33,13 +33,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index c604c9017ec..0f0ad863730 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -14,13 +14,8 @@ limitations under the License. 
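// Editorial sketch, not part of this patch: the std::enable_if dispatch used
// by elementwise_add_op.h above (and elementwise_mul_op.h later in the patch)
// in miniature. An exact <float/double, CPUDeviceContext> match selects the
// BLAS fast path; every other <T, Device> pair falls through to the generic
// implementation. The types and loop bodies here are simplified stand-ins.
#include <type_traits>

struct CPUDeviceContext {};   // placeholder for platform::CPUDeviceContext
struct CUDADeviceContext {};  // placeholder for platform::CUDADeviceContext

template <typename DeviceContext, typename T>
typename std::enable_if<std::is_floating_point<T>::value &&
                        std::is_same<DeviceContext, CPUDeviceContext>::value>::type
add_dispatch(const T* x, const T* y, T* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];  // blas.VADD in the real code
}

template <typename DeviceContext, typename T>
typename std::enable_if<!std::is_floating_point<T>::value ||
                        !std::is_same<DeviceContext, CPUDeviceContext>::value>::type
add_dispatch(const T* x, const T* y, T* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = x[i] + y[i];  // Eigen broadcast path
}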
*/ #pragma once -#include -#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - namespace paddle { namespace operators { @@ -56,13 +51,6 @@ struct DivGradDY { } }; -template -struct DivDoubleDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return y * out * dout - x * dout; - } -}; - template class ElementwiseDivGradKernel : public ElemwiseGradKernel { public: @@ -84,109 +72,5 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel { } }; -class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = framework::Tensor; - - void InferShape(framework::InferShapeContext* ctx) const override { - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput("DOut")) { - ctx->ShareDim("DX", "DOut"); - ctx->ShareLoD("DX", "DOut"); - } - if (ctx->HasOutput(y_grad_name)) { - ctx->ShareDim("Y", y_grad_name); - ctx->ShareLoD("Y", y_grad_name); - } - if (ctx->HasOutput("DDOut")) { - ctx->ShareDim("DX", "DDOut"); - ctx->ShareLoD("DX", "DDOut"); - } - } - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = ctx.Input("DDX")->type(); - -#ifdef PADDLE_WITH_MKLDNN - if (platform::CanMKLDNNBeUsed(ctx)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } -}; - -template -class ElementwiseDivDoubleGradKernel : public framework::OpKernel { - using Tensor = framework::Tensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Input("Out"); - auto* ddX = ctx.Input("DDX"); - auto* ddY = ctx.Input("DDY"); - auto* dX = ctx.Input("DX"); - - auto* dY = ctx.Output(framework::GradVarName("Y")); - auto* dOut = ctx.Output("DOut"); - auto* ddOut = ctx.Output("DDOut"); - - int axis = ctx.Attr("axis"); - - if (dY) dY->mutable_data(Y->dims(), ctx.GetPlace()); - if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - - // ddX_safe == null ? 0 : ddX - // ddY_safe == null ? 0 : ddY - Tensor ddX_safe, ddY_safe; - GetDoubleGradSafeTensor(ctx, Out, ddX, &ddX_safe); - GetDoubleGradSafeTensor(ctx, Y, ddY, &ddY_safe); - - if (dOut) { - // dOut = - dX * ddY - default_elementwise_mul(ctx, dX, &ddY_safe, dOut); - auto& place = - *ctx.template device_context().eigen_device(); - auto dout = framework::EigenVector::Flatten(*dOut); - dout.device(place) = static_cast(-1) * dout; - } - - if (dY) { - // dX_div_Y = dX / Y; - auto& dev_ctx = ctx.template device_context(); - Tensor dX_div_Y = - ctx.AllocateTmpTensor(Out->dims(), dev_ctx); - ElementwiseComputeEx, DeviceContext, T>( - ctx, dX, Y, axis, DivFunctor(), &dX_div_Y); - - // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
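// [Editorial note: in the ElemwiseGradCompute call below, x = ddX, y = ddY,
// out = Out and dout = dX/Y, so DivDoubleDY's y*out*dout - x*dout expands to
// Out*dX*ddY/Y - dX*ddX/Y, i.e. exactly the quotient-rule terms named in the
// next comment.]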
- - // dY = Out * dX * ddY / Y - dX * ddX / Y - ElemwiseGradCompute, DivDoubleDY>( - ctx, ddX_safe, ddY_safe, *Out, dX_div_Y, axis, nullptr, dY, - DivGradDX(), DivDoubleDY()); - } - - if (ddOut) { - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - default_elementwise_mul(ctx, Out, &ddY_safe, ddOut); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &ddX_safe, ddOut, 0, SubFunctor(), ddOut); - ElementwiseComputeEx, DeviceContext, T>( - ctx, ddOut, Y, axis, DivFunctor(), ddOut); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 0f6af96ff3d..763e0c713d4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -44,30 +44,6 @@ class ElementwiseMulOpMaker : public ElementwiseOpMaker { virtual std::string GetEquation() const { return "Out = X \\\\odot Y"; } }; -class ElementwiseMulDoubleGradDescMaker - : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - std::unique_ptr op(new framework::OpDesc()); - op->SetType("elementwise_mul_grad_grad"); - op->SetInput("X", Input("X")); - op->SetInput("Y", Input("Y")); - op->SetInput("DOut", Input(framework::GradVarName("Out"))); - op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); - op->SetInput("DDY", OutputGrad(framework::GradVarName("Y"))); - - op->SetAttrMap(Attrs()); - - op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); - op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); - return op; - } -}; - } // namespace operators } // namespace paddle @@ -75,9 +51,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, ops::ElementwiseOpInferVarType, ops::ElementwiseMulOpGradDescMaker); -REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad, - ops::ElementwiseMulDoubleGradDescMaker); -REGISTER_OPERATOR(elementwise_mul_grad_grad, ops::ElementwiseOpDoubleGrad); +REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_mul, @@ -91,13 +65,3 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index d18c7e66f10..e36cc8f9f28 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
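// [Editorial note, not part of the patch: the elementwise_mul_grad_grad
// kernels removed from this file and from elementwise_mul_op.h below
// implemented the product rule for Out = X * Y: ddOut = ddX*Y + X*ddY, with
// dX = dOut*ddY and dY = dOut*ddX, as stated in the comments of the removed
// double-grad kernel.]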
*/ - #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" #include "paddle/fluid/platform/float16.h" @@ -89,9 +88,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 105707b803e..7a7a3989c04 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -38,26 +38,22 @@ template typename std::enable_if< std::is_floating_point::value && std::is_same::value>::type -elementwise_mul_same_dims(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { +elementwise_mul(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z) { auto blas = math::GetBlas(ctx); - blas.VMUL(x->numel(), x->data(), y->data(), z->data()); + blas.VMUL(x->numel(), x->data(), y->data(), + z->mutable_data(ctx.GetPlace())); } template typename std::enable_if< !std::is_floating_point::value || !std::is_same::value>::type -elementwise_mul_same_dims(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_y = framework::EigenVector::Flatten(*y); - auto eigen_z = framework::EigenVector::Flatten(*z); - - auto& place = *ctx.template device_context().eigen_device(); - eigen_z.device(place) = eigen_x * eigen_y; +elementwise_mul(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z) { + default_elementwise_mul(ctx, x, y, z); } template @@ -92,7 +88,7 @@ class ElementwiseMulKernel : public framework::OpKernel { z->mutable_data(ctx.GetPlace()); if (x.numel() == y->numel()) { - elementwise_mul_same_dims(ctx, &x, y, z); + elementwise_mul(ctx, &x, y, z); } else { default_elementwise_mul(ctx, &x, y, z); } @@ -127,56 +123,5 @@ class ElementwiseMulGradKernel : public ElemwiseGradKernel { ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); } }; - -template -class ElementwiseMulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* ddout = ctx.Output("DDOut"); - - if (ddout) ddout->mutable_data(ctx.GetPlace()); - - // dx = dout * ddy - // dy = dout * ddx - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - - // ddout = ddx * y + x * ddy - if (ddout) { - if (ddx && ddy) { - Tensor ddout_tmp; - ddout_tmp.mutable_data(ddout->dims(), ctx.GetPlace()); - - default_elementwise_mul(ctx, ddx, y, ddout); - 
default_elementwise_mul(ctx, x, ddy, &ddout_tmp); - - auto& place = - *ctx.template device_context().eigen_device(); - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - } else { - if (ddx) default_elementwise_mul(ctx, ddx, y, ddout); - if (ddy) default_elementwise_mul(ctx, x, ddy, ddout); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index c251cc72270..5ec335972a0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -212,71 +212,6 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { } }; -class ElementwiseOpDoubleGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = framework::Tensor; - - void InferShape(framework::InferShapeContext *ctx) const override { - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(x_grad_name)) { - ctx->ShareDim("X", x_grad_name); - ctx->ShareLoD("X", x_grad_name); - } - if (ctx->HasOutput(y_grad_name)) { - ctx->ShareDim("Y", y_grad_name); - ctx->ShareLoD("Y", y_grad_name); - } - if (ctx->HasOutput("DDOut")) { - ctx->ShareDim("DOut", "DDOut"); - ctx->ShareLoD("DOut", "DDOut"); - } - } - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = ctx.Input("DOut")->type(); - -#ifdef PADDLE_WITH_MKLDNN - if (platform::CanMKLDNNBeUsed(ctx)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } -}; - -class ElementwiseOpDoubleGradWithoutDXDY - : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - using Tensor = framework::Tensor; - - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->HasOutput("DDOut")) { - ctx->ShareDim("DOut", "DDOut"); - ctx->ShareLoD("DOut", "DDOut"); - } - } - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto input_data_type = ctx.Input("DOut")->type(); - -#ifdef PADDLE_WITH_MKLDNN - if (platform::CanMKLDNNBeUsed(ctx)) { - return framework::OpKernelType(input_data_type, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, - framework::LibraryType::kMKLDNN); - } -#endif - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } -}; - // For Add, Sub op, the X, Out is not needed. 
class ElementwiseOpExplicitGrad : public ElementwiseOpGrad { public: @@ -387,16 +322,3 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "Y"); ::paddle::operators::ElementwiseOpExplicitGrad, \ ::paddle::operators::ElementwiseGradOpInplace, \ ::paddle::operators::ElementwiseGradNoBufVarsInference) - -#define REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(op_type, op_name, equation) \ - class __ElemwiseOp##op_type##Maker__ \ - : public ::paddle::operators::ElementwiseOpMaker { \ - protected: \ - virtual std::string GetName() const { return op_name; } \ - virtual std::string GetEquation() const { return equation; } \ - }; \ - REGISTER_OPERATOR(op_type, ::paddle::operators::ElementwiseOp, \ - __ElemwiseOp##op_type##Maker__, \ - ::paddle::operators::ElementwiseOpInferVarType, \ - op_type##GradMaker, \ - ::paddle::operators::ElementwiseOpInplace); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 2b108efef4a..2e91ec84848 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -1005,24 +1005,24 @@ template struct FusedElemwiseAndActGradNoBroadcast { HOSTDEVICE void operator()(size_t i) { - T x_val = x_[i]; - T y_val = y_[i]; - T out_val = out_[i]; - T dout_val = dout_[i]; - T intermediate_out_val = UseIntermediateOut - ? intermediate_out_[i] - : dx_op_.GetIntermediateOut(x_val, y_val); if (dx_ != nullptr) { - dx_[i] = dx_op_.UseIntermediateOut(x_val, y_val, intermediate_out_val, - out_val, dout_val); + dx_[i] = UseIntermediateOut + ? dx_op_.UseIntermediateOut( + x_[i], y_[i], intermediate_out_[i], out_[i], dout_[i]) + : dx_op_.Recompute(x_[i], y_[i], out_[i], dout_[i]); } if (dy_ != nullptr) { - dy_[i] = dy_op_.UseIntermediateOut(x_val, y_val, intermediate_out_val, - out_val, dout_val); + dy_[i] = UseIntermediateOut + ? dy_op_.UseIntermediateOut( + x_[i], y_[i], intermediate_out_[i], out_[i], dout_[i]) + : dy_op_.Recompute(x_[i], y_[i], out_[i], dout_[i]); } if (dintermediate_ != nullptr) { - dintermediate_[i] = dintermediate_op_.UseIntermediateOut( - x_val, intermediate_out_val, out_val, dout_val); + dintermediate_[i] = + UseIntermediateOut + ? dintermediate_op_.UseIntermediateOut( + x_[i], intermediate_out_[i], out_[i], dout_[i]) + : dintermediate_op_.Recompute(x_[i], y_[i], out_[i], dout_[i]); } } @@ -1636,21 +1636,5 @@ void FusedElemwiseAndActComputeEx(const framework::ExecutionContext &ctx, } } } - -template -static inline void GetDoubleGradSafeTensor( - const framework::ExecutionContext &ctx, const framework::Tensor *x, - const framework::Tensor *ddx, framework::Tensor *ddx_safe) { - if (ddx) { - *ddx_safe = *ddx; - } else { - auto &dev_ctx = ctx.template device_context(); - *ddx_safe = ctx.AllocateTmpTensor(x->dims(), dev_ctx); - math::SetConstant set_zero; - set_zero(ctx.template device_context(), ddx_safe, - static_cast(0)); - } -} - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index b1ec10ea86c..04c87c1b2ac 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -13,48 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
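// [Editorial note, not part of the patch: GetDoubleGradSafeTensor, removed
// just above, substituted a zero-filled temporary shaped like x whenever a
// ddX/ddY input was absent, so the double-grad kernels removed elsewhere in
// this patch could treat missing second-order inputs as zeros instead of
// branching on null pointers.]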
*/ #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" -#include -#include #include "paddle/fluid/operators/elementwise/elementwise_op.h" - -namespace paddle { -namespace operators { - -class ElementwiseSubDoubleGradDescMaker - : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - std::unique_ptr op(new framework::OpDesc()); - op->SetType("elementwise_sub_grad_grad"); - op->SetInput("Y", Input("Y")); - op->SetInput("DOut", Input(framework::GradVarName("Out"))); - op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); - op->SetInput("DDY", OutputGrad(framework::GradVarName("Y"))); - - op->SetAttrMap(Attrs()); - - op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); - return op; - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); -REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_sub, "Sub", - "Out = X - Y"); - -REGISTER_OPERATOR(elementwise_sub_grad, ops::ElementwiseOpExplicitGrad, - ops::ElementwiseGradOpInplace, - ops::ElementwiseGradNoBufVarsInference, - ops::ElementwiseSubDoubleGradDescMaker); -REGISTER_OPERATOR(elementwise_sub_grad_grad, - ops::ElementwiseOpDoubleGradWithoutDXDY); +REGISTER_ELEMWISE_EXPLICIT_OP(elementwise_sub, "Sub", "Out = X - Y"); REGISTER_OP_CPU_KERNEL( elementwise_sub, @@ -68,13 +30,3 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_sub_grad_grad, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 52fad7fd04b..f2adf1c8373 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -33,13 +33,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_sub_grad_grad, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 5049d587b58..770323fe5a8 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -68,33 +68,5 @@ class ElementwiseSubGradKernel : public ElemwiseGradKernel { ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); } }; - -template -class ElementwiseSubDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* ddout = ctx.Output("DDOut"); - - // DDOut = ddx - ddy - if (ddout) { - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, dout, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - ddout->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &ddx_safe, 
&ddy_safe, axis, SubFunctor(), ddout); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 88cda1cd668..6a6741d8fc5 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { @@ -54,44 +53,12 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { // Execute default elementwise_add operator when // broadcast operations need to performed. if (x_dims != y_dims_untrimed) { - Tensor _x; - mkldnn::memory::format format; - std::vector src_x_tz = framework::vectorize2int(x_dims); - - if ((src_x_tz.size() == 3 && - x->format() != (format = memory::format::ncw)) || - (src_x_tz.size() == 4 && - x->format() != (format = memory::format::nchw)) || - (src_x_tz.size() == 5 && - x->format() != (format = memory::format::ncdhw))) { - _x.Resize(x_dims); - auto user_x_memory_pd = memory::primitive_desc( - {{src_x_tz}, memory::data_type::f32, x->format()}, mkldnn_engine); - auto x_memory_pd = memory::primitive_desc( - {{src_x_tz}, memory::data_type::f32, format}, mkldnn_engine); - auto size = x_memory_pd.get_size(); - _x.mutable_data(ctx.GetPlace(), size); - auto user_x_memory = - memory(user_x_memory_pd, paddle::platform::to_void_cast(x_data)); - auto x_memory = memory(x_memory_pd, - paddle::platform::to_void_cast(_x.data())); - - auto x_reorder = reorder(user_x_memory, x_memory); - - std::vector pipeline; - pipeline.push_back(x_reorder); - stream(stream::kind::eager).submit(pipeline).wait(); - } else { - format = x->format(); - _x.ShareDataWith(*x); - } - auto sum_func = [](T a, T b) -> T { return a + b; }; TransformFunctor functor( - &_x, y, z, + x, y, z, ctx.template device_context(), sum_func); @@ -111,7 +78,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { functor.RunMidWise(n, pre, post); } z->set_layout(DataLayout::kMKLDNN); - z->set_format(format); + z->set_format(x->format()); } else { PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && x->format() != memory::format::format_undef, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index e15f848c23d..fcb2be93635 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/expand_op.h" #include -#include #include namespace paddle { @@ -31,12 +30,9 @@ class ExpandOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + std::vector expand_times = + ctx->Attrs().Get>("expand_times"); auto x_dims = ctx->GetInputDim("X"); - std::vector expand_times(x_dims.size(), -1); - - if (!ctx->HasInputs("expand_times_tensor")) { - expand_times = ctx->Attrs().Get>("expand_times"); - } PADDLE_ENFORCE_EQ(static_cast(x_dims.size()), expand_times.size(), "The number of Attr(expand_times)'s value must be equal " @@ -46,11 +42,15 @@ class ExpandOp : public framework::OperatorWithKernel { std::vector out_shape(x_dims.size()); for (size_t i = 0; i < expand_times.size(); ++i) { - if (x_dims[i] == -1 || expand_times[i] == -1) { - out_shape[i] = -1; - } else { - out_shape[i] = x_dims[i] * expand_times[i]; - } + PADDLE_ENFORCE_GE(expand_times[i], 1, + "Each value of Attr(expand_times) should not be " + "less than 1."); + out_shape[i] = x_dims[i] * expand_times[i]; + } + + // set the first dim to -1 in compile time + if (!ctx->IsRuntime() && x_dims[0] < 0) { + out_shape[0] = x_dims[0]; } ctx->SetOutputDim("Out", framework::make_ddim(out_shape)); @@ -58,23 +58,6 @@ class ExpandOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", "Out"); } } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); - } - - framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const override { - if (var_name == "expand_times_tensor") { - return expected_kernel_type; - } - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } }; class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { @@ -83,9 +66,6 @@ class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor, default Tensor). A tensor with rank in [1, 6]." "X is the input to be expanded."); - AddInput("expand_times_tensor", "(Tensor Tensor), epxand times for X") - .AsDuplicable() - .AsDispensable(); AddOutput("Out", "(Tensor, default Tensor). A tensor with rank in [1, 6]." "The rank of Output(Out) have the same with Input(X). " @@ -93,8 +73,7 @@ class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { "to size of the corresponding dimension of Input(X) multiplying " "the corresponding value given by Attr(expand_times)."); AddAttr>("expand_times", - "Expand times number for each dimension.") - .SetDefault({}); + "Expand times number for each dimension."); AddComment(R"DOC( Expand operator tiles the input by given times number. You should set times number for each dimension by providing attribute 'expand_times'. 
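// [Editorial example, not part of the patch: with X of shape [2, 3] and
// expand_times = [2, 2], the InferShape above yields Out of shape [4, 6].
// With a compile-time batch dimension, e.g. X = [-1, 3], the special case
// keeps the first output dimension at -1, giving Out = [-1, 6].]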
The rank of X @@ -134,7 +113,6 @@ class ExpandGradOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); std::vector expand_times = ctx->Attrs().Get>("expand_times"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); size_t start_pos = 0u; @@ -159,23 +137,6 @@ class ExpandGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); - } - - framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const override { - if (var_name == "expand_times_tensor") { - return expected_kernel_type; - } - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } }; class ExpandGradOpDescMaker : public framework::SingleGradOpDescMaker { @@ -189,7 +150,6 @@ class ExpandGradOpDescMaker : public framework::SingleGradOpDescMaker { op->SetInput("X", Input("X")); op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - op->SetInput("expand_times_tensor", Input("expand_times_tensor")); op->SetAttrMap(Attrs()); return op; } diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 8153987d6c7..33940824977 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -48,29 +48,6 @@ limitations under the License. */ namespace paddle { namespace operators { -inline std::vector get_expand_times( - const framework::ExecutionContext& ctx) { - auto list_expand_times_tensor = - ctx.MultiInput("expand_times_tensor"); - if (list_expand_times_tensor.size() > 0) { - // get tensor from - std::vector vec_epxand_times; - for (size_t i = 0; i < list_expand_times_tensor.size(); ++i) { - auto tensor = list_expand_times_tensor[i]; - if (platform::is_gpu_place(tensor->place())) { - framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); - vec_epxand_times.push_back(*temp.data()); - } else { - vec_epxand_times.push_back(*tensor->data()); - } - } - - return vec_epxand_times; - } else { - return ctx.Attr>("expand_times"); - } -} using Tensor = framework::Tensor; template { template void Expand(const framework::ExecutionContext& context) const { auto* in0 = context.Input("X"); - - auto in_dims = in0->dims(); - auto expand_times = get_expand_times(context); + auto& expand_times = context.Attr>("expand_times"); auto* out0 = context.Output("Out"); Eigen::DSizes bcast_dims; for (size_t i = 0; i < expand_times.size(); ++i) { bcast_dims[i] = expand_times[i]; } - - framework::DDim out_dims(in_dims); - for (size_t i = 0; i < expand_times.size(); ++i) { - out_dims[i] *= expand_times[i]; - } - - out0->Resize(out_dims); auto x = EigenTensor::From(*in0); out0->mutable_data(context.GetPlace()); auto y = EigenTensor::From(*out0); @@ -126,8 +94,7 @@ class ExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* in0 = context.Input("X"); - // auto& expand_times = context.Attr>("expand_times"); - auto expand_times = get_expand_times(context); + auto& expand_times = context.Attr>("expand_times"); auto x_dims = in0->dims(); // 1. reshape_dims_vec is the broadcast parameter. 
For each dimension i, // if expand_times[i] > 1 and x_dims[i] > 1, i will be splitted to two diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 034f3c7dceb..25ca1f7e0a0 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -68,23 +68,6 @@ struct ClipAndFakeQuantFunctor { template struct ClipAndFakeQuantFunctor; -template -struct ClipAndFakeQuantDequantFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, framework::Tensor* out) { - T s = scale.data()[0]; - platform::Transform trans; - trans(ctx, in.data(), in.data() + in.numel(), - out->mutable_data(ctx.GetPlace()), ClipFunctor(-s, s)); - auto out_e = framework::EigenVector::Flatten(*out); - out_e.device(*ctx.eigen_device()) = - (s / bin_cnt) * (bin_cnt / s * out_e).round(); - } -}; -template struct ClipAndFakeQuantDequantFunctor; - template struct ChannelClipAndFakeQuantFunctor { void operator()(const platform::CPUDeviceContext& ctx, @@ -341,26 +324,24 @@ $$Out = round(X/scale * range)$$ } }; -class FakeQuantOrWithDequantMovingAverageAbsMaxOp - : public framework::OperatorWithKernel { +class FakeQuantizeMovingAverageAbsMaxOp : public framework::OperatorWithKernel { public: - FakeQuantOrWithDequantMovingAverageAbsMaxOp( - const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) + FakeQuantizeMovingAverageAbsMaxOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of FakeQuantOrWithDequantMovingAverageAbsMaxOp " - "should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of FakeQuantOrWithDequantMovingAverageAbsMaxOp " - "should not be null."); PADDLE_ENFORCE( - ctx->HasOutput("OutScale"), - "Output(OutScale) of FakeQuantOrWithDequantMovingAverageAbsMaxOp " - "should not be null"); + ctx->HasInput("X"), + "Input(X) of FakeQuantizeMovingAverageAbsMaxOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeQuantizeMovingAverageAbsMaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutScale"), + "Output(OutScale) of FakeQuantizeMovingAverageAbsMaxOp " + "should not be null"); if (ctx->HasOutput("OutState")) { ctx->SetOutputDim("OutState", {1}); } @@ -380,7 +361,7 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOp } }; -class FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker +class FakeQuantizeMovingAverageAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -405,19 +386,12 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker "for training. Some layers may run faster when this is true.") .SetDefault(false); AddComment(R"DOC( -This is a Base Op which support FakeQuantMovingAverageAbsMaxOp and FakeQuantDequantMovingAverageAbsMaxOp -FakeQuantMovingAverageAbsMaxOp operator is used in static quantization. +FakeQuantize operator is used in static quantization. 
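// Editorial sketch, not part of this patch: the moving-average abs-max rule
// from the formulas that follow, in scalar form. 'rate' is moving_rate and
// 'range' is 2^(bit_length-1) - 1; FakeQuantDequant additionally maps the
// rounded integer back to float scale, matching the
// ClipAndFakeQuantDequantFunctor removed from this file above.
#include <algorithm>
#include <cmath>

struct MovingAverageState { float accum = 0.f, state = 0.f; };

float UpdateScale(MovingAverageState* s, float abs_max, float rate) {
  s->accum = rate * s->accum + abs_max;  // moving_rate * accum + max(abs(x))
  s->state = rate * s->state + 1.f;      // moving_rate * state + 1
  return s->accum / s->state;            // scale = accum / state
}

float FakeQuant(float x, float scale, int range) {
  float v = std::min(std::max(x, -scale), scale);  // clip to [-scale, scale]
  return std::round(v / scale * range);            // Out = round(X/scale*range)
}

float FakeQuantDequant(float x, float scale, int range) {
  return FakeQuant(x, scale, range) * scale / range;  // quantize, then dequantize
}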
$$scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)$$ $$range = 2^{bit\_length - 1} - 1$$ $$Out = round(X/scale * range)$$ -FakeQuantDequantMovingAverageAbsMaxOp operator do the moving_average_abs_max op quant and then dequant. - -$$scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)$$ -$$range = 2^{bit\_length - 1} - 1$$ -$$Out = round(X/scale * range) * scale / range$$ - )DOC"); } }; @@ -503,21 +477,11 @@ REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); REGISTER_OPERATOR(fake_quantize_moving_average_abs_max, - ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp, - ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker, + ops::FakeQuantizeMovingAverageAbsMaxOp, + ops::FakeQuantizeMovingAverageAbsMaxOpMaker, paddle::framework::EmptyGradOpMaker); - REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max, ops::FakeQuantizeMovingAverageAbsMaxKernel); - -REGISTER_OPERATOR(fake_quantize_dequantize_moving_average_abs_max, - ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp, - ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - fake_quantize_dequantize_moving_average_abs_max, - ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); - REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max, ops::FakeChannelWiseQuantizeAbsMaxOp, ops::FakeChannelWiseQuantizeAbsMaxOpMaker, diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index e9a7201bc08..6e1d40cac76 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -129,23 +129,6 @@ __global__ void ClipAndQuantKernel(const T* in, const T* scale, } } -template -__global__ void ClipAndQuantDequantKernel(const T* in, const T* scale, - const int bin_cnt, const int n, - T* out) { - int bid = threadIdx.x + blockIdx.x * blockDim.x; - int tid = threadIdx.x; - - T s = scale[0]; - for (int i = bid; i < n; i += blockDim.x * gridDim.x) { - T x = in[i]; - T v = x > s ? s : x; - v = v < -s ? 
-s : v; - v = bin_cnt / s * v; - out[i] = round(v) * s / bin_cnt; - } -} - template struct ClipAndFakeQuantFunctor { void operator()(const platform::CUDADeviceContext& ctx, @@ -166,27 +149,6 @@ struct ClipAndFakeQuantFunctor { template struct ClipAndFakeQuantFunctor; -template -struct ClipAndFakeQuantDequantFunctor { - void operator()(const platform::CUDADeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& scale, - const int bin_cnt, framework::Tensor* out) { - int num = in.numel(); - int block = 1024; - int grid = (block - 1 + num) / block; - - const T* in_data = in.data(); - const T* scale_data = scale.data(); - T* out_data = out->mutable_data(ctx.GetPlace()); - - ClipAndQuantDequantKernel<<>>( - in_data, scale_data, bin_cnt, num, out_data); - } -}; - -template struct ClipAndFakeQuantDequantFunctor; - template __global__ void ChannelClipAndQuantKernel(const T* in, const T* scale, const int bin_cnt, const int n, @@ -264,8 +226,8 @@ struct FindRangeAbsMaxFunctor { T* out_scale_data = out_scale->mutable_data(gpu_place); framework::Tensor need_find_max, out_size; - int* find_max = need_find_max.mutable_data({1}, gpu_place); - int* out_size_data = out_size.mutable_data({1}, gpu_place); + int* find_max = need_find_max.mutable_data(gpu_place); + int* out_size_data = out_size.mutable_data(gpu_place); FindRangeAbsMaxAndFillArray<<<1, 1, 0, ctx.stream()>>>( cur_scale.data(), last_scale.data(), iter.data(), @@ -340,6 +302,3 @@ REGISTER_OP_CUDA_KERNEL( ops::FakeQuantizeMovingAverageAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleKernel); -REGISTER_OP_CUDA_KERNEL( - fake_quantize_dequantize_moving_average_abs_max, - ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 422d99dd433..87bcece5824 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -35,13 +35,6 @@ struct ClipAndFakeQuantFunctor { framework::Tensor* out); }; -template -struct ClipAndFakeQuantDequantFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& scale, const int bin_cnt, - framework::Tensor* out); -}; - template struct FindRangeAbsMaxFunctor { void operator()(const DeviceContext& ctx, const framework::Tensor& cur_scale, @@ -157,13 +150,8 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { }; template -class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { +class FakeQuantizeMovingAverageAbsMaxKernel : public framework::OpKernel { public: - ~FakeMovingAverageAbsMaxKernelBase() {} - virtual void RunClipFunctor(const DeviceContext& dev_ctx, - const framework::Tensor& in, - const framework::Tensor& in_scale, int bin_cnt, - framework::Tensor* out) const = 0; void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); auto* in_scale = context.Input("InScale"); @@ -177,7 +165,8 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { // testing if (is_test) { - RunClipFunctor(dev_ctx, *in, *in_scale, bin_cnt, out); + ClipAndFakeQuantFunctor()(dev_ctx, *in, *in_scale, + bin_cnt, out); return; } @@ -204,31 +193,8 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state, out_accum, out_scale); - RunClipFunctor(dev_ctx, *in, *out_scale, bin_cnt, out); - } -}; - -template -class 
FakeQuantizeMovingAverageAbsMaxKernel - : public FakeMovingAverageAbsMaxKernelBase { - public: - void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in, - const framework::Tensor& in_scale, int bin_cnt, - framework::Tensor* out) const override { - ClipAndFakeQuantFunctor()(dev_ctx, in, in_scale, bin_cnt, - out); - } -}; - -template -class FakeQuantizeDequantizeMovingAverageAbsMaxKernel - : public FakeMovingAverageAbsMaxKernelBase { - public: - void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in, - const framework::Tensor& in_scale, int bin_cnt, - framework::Tensor* out) const override { - ClipAndFakeQuantDequantFunctor()(dev_ctx, in, in_scale, - bin_cnt, out); + ClipAndFakeQuantFunctor()(dev_ctx, *in, *out_scale, + bin_cnt, out); } }; diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index 1cd6c40aa05..89cdca3ec27 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -50,10 +50,7 @@ bool InputXCanBeAbsent(const std::vector &functor_list) { * out. */ static bool IsSupportedCompound(const std::vector &functors) { - PADDLE_ENFORCE_EQ(functors.size(), 2UL); - - static std::unordered_set unary_fun = {"scale", "relu", "tanh", - "sigmoid"}; + static std::unordered_set unary_fun = {"scale", "relu"}; static std::unordered_set binary_fun = {"elementwise_add", "elementwise_mul"}; diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h index 7cb753211ea..01dc2dbfd61 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h @@ -255,27 +255,6 @@ static void RunFunctors(const framework::ExecutionContext &ctx, paddle::operators::math::ScaleFunctor>( ctx, paddle::operators::math::MulFunctor(), paddle::operators::math::ScaleFunctor(scale), in_x, in_y, outputs); - } else if (funcs_str == "tanh,elementwise_add") { - // Z = Unary(Binary(X, Y)) - RunUnaryCompoundFunctors, - paddle::operators::math::AddFunctor>( - ctx, paddle::operators::math::TanhFunctor(), - paddle::operators::math::AddFunctor(), in_x, in_y, outputs); - } else if (funcs_str == "elementwise_mul,tanh") { - // Z = Binary(X, Unary(Y)) - RunBinaryCompoundFunctor, - paddle::operators::math::TanhFunctor>( - ctx, paddle::operators::math::MulFunctor(), - paddle::operators::math::TanhFunctor(), in_x, in_y, outputs); - } else if (funcs_str == "elementwise_mul,sigmoid") { - // Z = Binary(X, Unary(Y)) - RunBinaryCompoundFunctor, - paddle::operators::math::SigmoidFunctor>( - ctx, paddle::operators::math::MulFunctor(), - paddle::operators::math::SigmoidFunctor(), in_x, in_y, outputs); } else { PADDLE_THROW("%s has not been implemented.", funcs_str); } @@ -314,7 +293,6 @@ static void RunGradFunctors( paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == "elementwise_add_grad,relu_grad") { - // The backward of Z = Binary(X, Unary(Y)) RunBinaryCompoundGradFunctors< DeviceContext, T, paddle::operators::math::AddGradFunctor, paddle::operators::math::ReluFunctor, @@ -324,7 +302,6 @@ static void RunGradFunctors( paddle::operators::math::ReluGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else if (funcs_str == 
"relu_grad,elementwise_add_grad") { - // The backward of Z = Unary(Binary(X, Y)) RunUnaryCompoundGradFunctors< DeviceContext, T, paddle::operators::math::ReluGradFunctor, paddle::operators::math::AddFunctor, @@ -344,36 +321,6 @@ static void RunGradFunctors( paddle::operators::math::ScaleFunctor(scale), paddle::operators::math::ScaleGradFunctor(scale), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); - } else if (funcs_str == "tanh_grad,elementwise_add_grad") { - // The backward of Z = Unary(Binary(X, Y)) - RunUnaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::TanhGradFunctor, - paddle::operators::math::AddFunctor, - paddle::operators::math::AddGradFunctor, InPlace>( - ctx, paddle::operators::math::TanhGradFunctor(), - paddle::operators::math::AddFunctor(), - paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, - in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); - } else if (funcs_str == "elementwise_mul_grad,tanh_grad") { - // The backward of Z = Binary(X, Unary(Y)) - RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::MulGradFunctor, - paddle::operators::math::TanhFunctor, - paddle::operators::math::TanhGradFunctor, InPlace>( - ctx, paddle::operators::math::MulGradFunctor(), - paddle::operators::math::TanhFunctor(), - paddle::operators::math::TanhGradFunctor(), in_x, in_y, in_out, - in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); - } else if (funcs_str == "elementwise_mul_grad,sigmoid_grad") { - // The backward of Z = Binary(X, Unary(Y)) - RunBinaryCompoundGradFunctors< - DeviceContext, T, paddle::operators::math::MulGradFunctor, - paddle::operators::math::SigmoidFunctor, - paddle::operators::math::SigmoidGradFunctor, InPlace>( - ctx, paddle::operators::math::MulGradFunctor(), - paddle::operators::math::SigmoidFunctor(), - paddle::operators::math::SigmoidGradFunctor(), in_x, in_y, in_out, - in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else { PADDLE_THROW("%s has not been implemented.", funcs_str); } diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index fff817fbd02..5bc2e63757f 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -26,15 +26,14 @@ using platform::DeviceContext; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) -template -__global__ void GatherCUDAKernel(const T* params, const IndexT* indices, - T* output, size_t index_size, - size_t slice_size) { +template +__global__ void GatherCUDAKernel(const T* params, const int* indices, T* output, + size_t index_size, size_t slice_size) { CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) { int indices_i = i / slice_size; int slice_i = i - indices_i * slice_size; // offset inside the slice - IndexT gather_i = indices[indices_i]; - IndexT params_i = gather_i * slice_size + slice_i; + int gather_i = indices[indices_i]; + int params_i = gather_i * slice_size + slice_i; *(output + i) = *(params + params_i); } } @@ -43,10 +42,10 @@ __global__ void GatherCUDAKernel(const T* params, const IndexT* indices, * A thin wrapper on gpu tensor * Return a new tensor from source tensor, gathered according to index * input[src]: type-T source Tensor - * input[index]: type-IndexT index Tensor (1-D) + * input[index]: type-int index Tensor (1-D) * return: output tensor */ -template +template void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& 
index, Tensor* output) { // PADDLE_ENFORCE(platform::is_gpu_place(place)); @@ -65,14 +64,15 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; const T* p_src = src.data(); - const IndexT* p_index = index.data(); + // why must be int? + const int* p_index = index.data(); T* p_output = output->data(); int block = 512; int n = slice_size * index_size; int grid = (n + block - 1) / block; - GatherCUDAKernel<<< + GatherCUDAKernel<<< grid, block, 0, reinterpret_cast(ctx).stream()>>>( p_src, p_index, p_output, index_size, slice_size); diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h index 1e02c036e35..dc08ee5efac 100644 --- a/paddle/fluid/operators/gather.h +++ b/paddle/fluid/operators/gather.h @@ -30,10 +30,10 @@ using framework::Tensor; * A thin wrapper for gathering on cpu tensor * Return a new tensor from source tensor, gathered according to index * input[src]: type-T source Tensor - * input[index]: type-IndexT index Tensor (1-D) + * input[index]: type-int index Tensor (1-D) * return: output tensor */ -template +template void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); @@ -45,7 +45,7 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, auto src_dims = src.dims(); const T* p_src = src.data(); - const IndexT* p_index = index.data(); + const int* p_index = index.data(); T* p_output = output->data(); // slice size @@ -55,7 +55,7 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, const size_t slice_bytes = slice_size * sizeof(T); for (int64_t i = 0; i < index_size; ++i) { - IndexT index_ = p_index[i]; + int index_ = p_index[i]; memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); } } diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index cbabd59cf63..91f3818f216 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -74,13 +74,6 @@ class GatherOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The source input of gather op"); AddInput("Index", "The index input of gather op"); AddOutput("Out", "The output of gather op"); - AddAttr( - "overwrite", - "(bool, default: False) " - "In backward process, calc the grad when has same index," - "If true, update the grad using the overwrite mode in same index," - "If false, using the accumulate mode in same index.") - .SetDefault(true); AddComment(R"DOC( Gather Operator. 
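Out is formed by picking slices of X along its outermost dimension, i.e. Out[i] = X[Index[i]]. A minimal CPU sketch of that row copy, mirroring the CPUGather loop above (the free-standing GatherRows helper and its float/int signature are illustrative, not part of this op's API):

#include <cstring>

// out[i, :] = src[index[i], :] for a row-major [rows, cols] src.
void GatherRows(const float* src, const int* index, int index_size, int cols,
                float* out) {
  for (int i = 0; i < index_size; ++i) {
    // Copy one whole slice (row) per index entry, exactly like CPUGather.
    std::memcpy(out + static_cast<std::size_t>(i) * cols,
                src + static_cast<std::size_t>(index[i]) * cols,
                sizeof(float) * cols);
  }
}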
diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 061f92c76c3..490ba9a585e 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -32,20 +32,7 @@ class GatherOpCUDAKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE( - index_type_match, - "Index holds the wrong type, it holds %s, but desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString(framework::proto::VarType::INT32), - paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - GPUGather(ctx.device_context(), *x, *index, output); - } + GPUGather(ctx.device_context(), *x, *index, output); } }; @@ -55,7 +42,7 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "This kernel only runs on GPU device."); - auto *index = ctx.Input("Index"); + auto *Index = ctx.Input("Index"); auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); @@ -65,23 +52,7 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE( - index_type_match, - "Index holds the wrong type, it holds %s, but desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString(framework::proto::VarType::INT32), - paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); - } + GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); } }; diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 852790a4c63..2e18298cf8e 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -36,21 +36,7 @@ class GatherOpKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE( - index_type_match, - "Index holds the wrong type, it holds %s, but desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString(framework::proto::VarType::INT32), - paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGather(ctx.device_context(), *x, 
*index, output); - } + CPUGather(ctx.device_context(), *x, *index, output); } }; @@ -61,7 +47,7 @@ class GatherGradientOpKernel : public framework::OpKernel { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "This kernel only runs on CPU."); - auto *index = ctx.Input("Index"); + auto *Index = ctx.Input("Index"); auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); @@ -71,30 +57,7 @@ class GatherGradientOpKernel : public framework::OpKernel { .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - bool overwrite = ctx.Attr("overwrite"); - - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE( - index_type_match, - "Index holds the wrong type, it holds %s, but desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString(framework::proto::VarType::INT32), - paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); - } else { - ScatterAssignAdd(ctx, *dO, *index, dX); - } - } else if (index_type == framework::proto::VarType::INT64) { - if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); - } else { - ScatterAssignAdd(ctx, *dO, *index, dX); - } - } + ScatterAssign(ctx.device_context(), *dO, *Index, dX); } }; diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index bcca992e2b4..45c769ee372 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -144,7 +144,7 @@ class GRUGradKernel : public framework::OpKernel { Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart); gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } - gru_value.output_value = nullptr; + math::GRUUnitGradFunctor::compute( dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node, active_gate, origin_mode); diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 0fa7322fbd6..44fd95edef2 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -34,7 +34,7 @@ class Im2SequenceOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(in_dim.size(), 4, "Input(X) format must be 4D tensor, eg., NCHW."); - auto img_channels = in_dim[1]; + int img_channels = in_dim[1]; auto kernels = ctx->Attrs().Get>("kernels"); auto strides = ctx->Attrs().Get>("strides"); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index 9c9069b7227..4a994281941 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -113,10 +113,9 @@ class Im2SequenceKernel : public framework::OpKernel { paddings[2], strides[0]); int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]); - out->mutable_data( - {static_cast(batch_size) * output_height * output_width, - static_cast(img_channels) * kernels[0] * kernels[1]}, - ctx.GetPlace()); + out->mutable_data({batch_size * output_height * output_width, + img_channels * kernels[0] * kernels[1]}, + ctx.GetPlace()); const std::vector dilations({1, 1}); auto out_dims = out->dims(); out->Resize({batch_size, out->numel() / batch_size}); diff --git 
a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 652aec9a538..5fd42809dfe 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -11,7 +11,6 @@ #pragma once #include -#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -58,17 +57,7 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output, auto input_t = EigenTensor::From(input); auto output_t = EigenTensor::From(*output); bool align_flag = (align_mode == 0 && !align_corners); - - std::vector vy_n, vy_s; - std::vector vd_n, vd_s; - vy_n.reserve(out_h); - vy_s.reserve(out_h); - vd_n.reserve(out_h); - vd_s.reserve(out_h); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int k = 0; k < out_h; k++) { + for (int k = 0; k < out_h; k++) { // loop for images int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) : static_cast(ratio_h * k); y_n = (y_n > 0) ? y_n : 0; @@ -76,53 +65,24 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output, float d_n = align_flag ? ratio_h * (k + 0.5) - 0.5 - y_n : ratio_h * k - y_n; float d_s = 1.f - d_n; - { - vy_n[k] = y_n; - vy_s[k] = y_s; - vd_n[k] = d_n; - vd_s[k] = d_s; - } - } - std::vector vx_w, vx_e; - std::vector vd_w, vd_e; - vx_w.reserve(out_w); - vx_e.reserve(out_w); - vd_w.reserve(out_w); - vd_e.reserve(out_w); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int l = 0; l < out_w; l++) { - int x_w = (align_mode == 0 && !align_corners) - ? static_cast(ratio_w * (l + 0.5) - 0.5) - : static_cast(ratio_w * l); - x_w = (x_w > 0) ? x_w : 0; - int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); - float d_w = - align_flag ? ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w; - float d_e = 1.f - d_w; - { - vx_w[l] = x_w; - vx_e[l] = x_e; - vd_w[l] = d_w; - vd_e[l] = d_e; - } - } + for (int l = 0; l < out_w; l++) { + int x_w = (align_mode == 0 && !align_corners) + ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float d_w = + align_flag ? 
ratio_w * (l + 0.5) - 0.5 - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for collapse(4) -#endif - for (int i = 0; i < n; i++) { // loop for batches - for (int j = 0; j < c; j++) { // loop for channels - for (int k = 0; k < out_h; k++) { // loop for images - for (int l = 0; l < out_w; l++) { + for (int i = 0; i < n; i++) { // loop for batches + for (int j = 0; j < c; j++) { // loop for channels // bilinear interpolation - T out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] + - input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] + - input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] + - input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l]; - output_t(i, j, k, l) = out_t; + output_t(i, j, k, l) = input_t(i, j, y_n, x_w) * d_s * d_e + + input_t(i, j, y_s, x_w) * d_n * d_e + + input_t(i, j, y_n, x_e) * d_s * d_w + + input_t(i, j, y_s, x_e) * d_n * d_w; } } } diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h index 45a155af852..8f620ba7d2f 100644 --- a/paddle/fluid/operators/load_combine_op.h +++ b/paddle/fluid/operators/load_combine_op.h @@ -65,10 +65,7 @@ class LoadCombineOpKernel : public framework::OpKernel { auto *tensor = out_vars[i]->GetMutable(); // Error checking - PADDLE_ENFORCE( - static_cast(*buffer), - "There is a problem with loading model parameters. " - "Please check whether the model file is complete or damaged."); + PADDLE_ENFORCE(static_cast(*buffer), "Cannot read more"); // Get data from fin to tensor DeserializeFromStream(*buffer, tensor, dev_ctx); diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 962822f33e6..61e34273704 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -131,7 +131,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase { } } - std::map outputs; + auto &outputs = *const_cast(scope) + .Var() + ->GetMutable>(); for (size_t i = 0; i < max_seq_len; ++i) { auto &ranges = copy_ranges[i]; diff --git a/paddle/fluid/operators/math/compound_functors.h b/paddle/fluid/operators/math/compound_functors.h index 6a43215bf52..7aba4a917cd 100644 --- a/paddle/fluid/operators/math/compound_functors.h +++ b/paddle/fluid/operators/math/compound_functors.h @@ -74,8 +74,6 @@ struct BinaryCompoundGradDxFunctor { return dout * d_binary_fun_.Dx(x, intermediate_out); } - inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return unary_fun_(y); } - private: DBinaryFun d_binary_fun_; UnaryFun unary_fun_; @@ -107,8 +105,6 @@ struct BinaryCompoundGradDyFunctor { } } - inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return unary_fun_(y); } - private: DBinaryFun d_binary_fun_; UnaryFun unary_fun_; @@ -147,8 +143,6 @@ struct UnaryCompoundGradDxFunctor { return base * d_binary_fun_.Dx(x, y); } - inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return binary_fun_(x, y); } - private: DUnaryFun d_unary_fun_; BinaryFun binary_fun_; @@ -187,8 +181,6 @@ struct UnaryCompoundGradDyFunctor { return base * d_binary_fun_.Dy(x, y); } - inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return binary_fun_(x, y); } - private: DUnaryFun d_unary_fun_; BinaryFun binary_fun_; @@ -211,8 +203,6 @@ struct BinaryCompoundGradDIntermedaiteOutFunctor { return dout * d_binary_fun_.Dy(x, intermediate_out); } - inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return unary_fun_(y); } - private: DBinaryFun d_binary_fun_; UnaryFun unary_fun_; @@ -242,8 +232,6 @@ 
struct UnaryCompoundGradDIntermediateFunctor { } } - inline HOSTDEVICE T GetIntermediateOut(T x, T y) { return binary_fun_(x, y); } - private: DUnaryFun d_unary_fun_; BinaryFun binary_fun_; diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 153e6117227..e925e7bb591 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -24,9 +24,9 @@ namespace operators { namespace math { template -__global__ void ConcatKernel(const T** inputs, const int* input_cols, - int col_size, const int output_rows, - const int output_cols, T* output) { +__global__ void ConcatKernel(T** inputs, const int* input_cols, int col_size, + const int output_rows, const int output_cols, + T* output) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int curr_segment = 0; int curr_offset = input_cols[0]; @@ -41,7 +41,7 @@ __global__ void ConcatKernel(const T** inputs, const int* input_cols, int local_col = tid_x - curr_offset; int segment_width = curr_col_offset - curr_offset; - const T* input_ptr = inputs[curr_segment]; + T* input_ptr = inputs[curr_segment]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) output[tid_y * output_cols + tid_x] = @@ -50,14 +50,14 @@ __global__ void ConcatKernel(const T** inputs, const int* input_cols, } template -__device__ void ConcatKernelDetail(const T** inputs_data, - const int fixed_in_col, const int out_rows, - const int out_cols, T* output_data) { +__global__ void ConcatKernel(T** inputs_data, const int fixed_in_col, + const int out_rows, const int out_cols, + T* output_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) { int split = tid_x * 1.0 / fixed_in_col; int in_offset = tid_x - split * fixed_in_col; - const T* input_ptr = inputs_data[split]; + T* input_ptr = inputs_data[split]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) { output_data[tid_y * out_cols + tid_x] = @@ -66,52 +66,6 @@ __device__ void ConcatKernelDetail(const T** inputs_data, } } -template -__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, - const int fixed_in_col, const int out_rows, - const int out_cols, T* output_data) { - const T* inputs_data[2]; - inputs_data[0] = input_addr0; - inputs_data[1] = input_addr1; - ConcatKernelDetail(inputs_data, fixed_in_col, out_rows, out_cols, - output_data); -} - -template -__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, - const T* input_addr2, const int fixed_in_col, - const int out_rows, const int out_cols, - T* output_data) { - const T* inputs_data[3]; - inputs_data[0] = input_addr0; - inputs_data[1] = input_addr1; - inputs_data[2] = input_addr2; - ConcatKernelDetail(inputs_data, fixed_in_col, out_rows, out_cols, - output_data); -} - -template -__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, - const T* input_addr2, const T* input_addr3, - const int fixed_in_col, const int out_rows, - const int out_cols, T* output_data) { - const T* inputs_data[4]; - inputs_data[0] = input_addr0; - inputs_data[1] = input_addr1; - inputs_data[2] = input_addr2; - inputs_data[3] = input_addr3; - ConcatKernelDetail(inputs_data, fixed_in_col, out_rows, out_cols, - output_data); -} - -template -__global__ void ConcatKernel(const T** inputs_data, const int in_num, - const int fixed_in_col, const int out_rows, - 
const int out_cols, T* output_data) { - ConcatKernelDetail(inputs_data, fixed_in_col, out_rows, out_cols, - output_data); -} - template __global__ void SplitKernel(const T* input_data, const int in_row, const int in_col, const int* out_cols, @@ -140,9 +94,9 @@ __global__ void SplitKernel(const T* input_data, const int in_row, } template -__device__ void SplitKernelDetail(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, - T** outputs_data) { +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int fixed_out_col, + T** outputs_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { int split = tid_x / fixed_out_col; @@ -157,70 +111,6 @@ __device__ void SplitKernelDetail(const T* input_data, const int in_row, } } -template -__global__ void SplitKernel(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, - T** outputs_data) { - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, - T* outputs_addr0, T* outputs_addr1) { - T* outputs_data[2]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, - T* outputs_addr0, T* outputs_addr1, - T* outputs_addr2) { - T* outputs_data[3]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - outputs_data[2] = outputs_addr2; - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, - T* outputs_addr0, T* outputs_addr1, - T* outputs_addr2, T* outputs_addr3) { - T* outputs_data[4]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - outputs_data[2] = outputs_addr2; - outputs_data[3] = outputs_addr3; - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -static inline void GetBlockDims(const platform::CUDADeviceContext& context, - int num_rows, int num_cols, dim3* block_dims, - dim3* grid_dims) { - // Set the thread block and grid according to CurrentDeviceId - const int kThreadsPerBlock = 1024; - int block_cols = kThreadsPerBlock; - if (num_cols < kThreadsPerBlock) { // block_cols is aligned by 32. - block_cols = ((num_cols + 31) >> 5) << 5; - } - int block_rows = kThreadsPerBlock / block_cols; - *block_dims = dim3(block_cols, block_rows, 1); - - int max_threads = context.GetMaxPhysicalThreadCount(); - int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - - int grid_cols = - std::min((num_cols + block_cols - 1) / block_cols, max_blocks); - int grid_rows = - std::min(max_blocks / grid_cols, std::max(num_rows / block_rows, 1)); - *grid_dims = dim3(grid_cols, grid_rows, 1); -} - /* * All tensors' dimension should be the same and the values of * each dimension must be the same, except the axis dimension. 
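Both ConcatFunctor and SplitFunctor below size their CUDA launch with the same arithmetic: at most 1024 threads per block, block columns rounded up to a multiple of 32 (one warp), and a grid capped by the device's physical thread budget. A self-contained sketch of just that computation (the LaunchDims struct and ComputeLaunchDims name are illustrative; the device thread count is passed in rather than read from the device context):

#include <algorithm>

struct LaunchDims { int block_cols, block_rows, grid_cols, grid_rows; };

// Mirrors the block/grid computation inlined in the functors below.
// Assumes num_rows >= 1, num_cols >= 1 and max_threads >= 1024.
LaunchDims ComputeLaunchDims(int num_rows, int num_cols, int max_threads) {
  const int kThreadsPerBlock = 1024;
  int block_cols = kThreadsPerBlock;
  if (num_cols < kThreadsPerBlock) {
    block_cols = ((num_cols + 31) >> 5) << 5;  // round up to warp size
  }
  int block_rows = kThreadsPerBlock / block_cols;

  int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
  int grid_cols =
      std::min((num_cols + block_cols - 1) / block_cols, max_blocks);
  int grid_rows =
      std::min(max_blocks / grid_cols, std::max(num_rows / block_rows, 1));
  return {block_cols, block_rows, grid_cols, grid_rows};
}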
@@ -241,55 +131,53 @@ class ConcatFunctor { int in_col = input[0].numel() / in_row; int out_row = in_row, out_col = 0; - std::vector inputs_data(in_num); + std::vector inputs_data; std::vector inputs_col(in_num + 1); + inputs_data.reserve(in_num); inputs_col[0] = 0; - bool has_same_shape = true; + bool sameShape = true; for (int i = 0; i < in_num; ++i) { int t_cols = input[i].numel() / in_row; - if (has_same_shape) { - if (t_cols != in_col) has_same_shape = false; + if (sameShape) { + if (t_cols != in_col) sameShape = false; } out_col += t_cols; inputs_col[i + 1] = out_col; - inputs_data[i] = input[i].data(); + inputs_data.emplace_back(input[i].data()); } - dim3 block_dims; - dim3 grid_dims; - GetBlockDims(context, out_row, out_col, &block_dims, &grid_dims); - - memory::allocation::AllocationPtr tmp_dev_ins_data; - const T** dev_ins_data = nullptr; - if (!has_same_shape || in_num < 2 || in_num > 4) { - tmp_dev_ins_data = - platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( - inputs_data.size() * sizeof(T*)); - memory::Copy(boost::get(context.GetPlace()), - tmp_dev_ins_data->ptr(), platform::CPUPlace(), - static_cast(inputs_data.data()), - inputs_data.size() * sizeof(T*), context.stream()); - dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); + // computation + // set the thread block and grid according to CurrentDeviceId + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (out_col < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((out_col + 31) >> 5) << 5; } - - if (has_same_shape) { - if (in_num == 2) { - ConcatKernel<<>>( - inputs_data[0], inputs_data[1], in_col, out_row, out_col, - output->data()); - } else if (in_num == 3) { - ConcatKernel<<>>( - inputs_data[0], inputs_data[1], inputs_data[2], in_col, out_row, - out_col, output->data()); - } else if (in_num == 4) { - ConcatKernel<<>>( - inputs_data[0], inputs_data[1], inputs_data[2], inputs_data[3], - in_col, out_row, out_col, output->data()); - } else { - ConcatKernel<<>>( - dev_ins_data, in_num, in_col, out_row, out_col, output->data()); - } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((out_col + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + auto tmp_dev_ins_data = + platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( + inputs_data.size() * sizeof(T*)); + memory::Copy(boost::get(context.GetPlace()), + tmp_dev_ins_data->ptr(), platform::CPUPlace(), + static_cast(inputs_data.data()), + inputs_data.size() * sizeof(T*), context.stream()); + T** dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); + + if (sameShape) { + ConcatKernel<<>>( + dev_ins_data, in_col, out_row, out_col, output->data()); } else { auto tmp_dev_ins_col_data = platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( @@ -300,7 +188,7 @@ class ConcatFunctor { inputs_col.size() * sizeof(int), context.stream()); int* dev_ins_col_data = static_cast(tmp_dev_ins_col_data->ptr()); - ConcatKernel<<>>( + ConcatKernel<<>>( dev_ins_data, dev_ins_col_data, static_cast(inputs_col.size()), out_row, out_col, output->data()); } @@ -328,7 +216,7 @@ class SplitFunctor { int out0_col = ref_inputs[0]->numel() / out_row; int in_col = 0, 
in_row = out_row; - bool has_same_shape = true; + bool sameShape = true; std::vector outputs_data(o_num); std::vector outputs_cols(o_num + 1); @@ -336,8 +224,8 @@ class SplitFunctor { outputs_cols[0] = 0; for (int i = 0; i < o_num; ++i) { int t_col = ref_inputs.at(i)->numel() / out_row; - if (has_same_shape) { - if (t_col != out0_col) has_same_shape = false; + if (sameShape) { + if (t_col != out0_col) sameShape = false; } in_col += t_col; outputs_cols[i + 1] = in_col; @@ -348,40 +236,36 @@ class SplitFunctor { } } - dim3 block_dims; - dim3 grid_dims; - GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims); - - memory::allocation::AllocationPtr tmp_dev_outs_data; - T** dev_out_gpu_data = nullptr; - if (!has_same_shape || o_num < 2 || o_num > 4) { - tmp_dev_outs_data = - platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( - outputs_data.size() * sizeof(T*)); - memory::Copy(boost::get(context.GetPlace()), - tmp_dev_outs_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs_data.data()), - outputs_data.size() * sizeof(T*), context.stream()); - dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); + // computation + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (in_col < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((in_col + 31) >> 5) << 5; } - - if (has_same_shape) { - if (o_num == 2) { - SplitKernel<<>>( - input.data(), in_row, in_col, out0_col, outputs_data[0], - outputs_data[1]); - } else if (o_num == 3) { - SplitKernel<<>>( - input.data(), in_row, in_col, out0_col, outputs_data[0], - outputs_data[1], outputs_data[2]); - } else if (o_num == 4) { - SplitKernel<<>>( - input.data(), in_row, in_col, out0_col, outputs_data[0], - outputs_data[1], outputs_data[2], outputs_data[3]); - } else { - SplitKernel<<>>( - input.data(), in_row, in_col, out0_col, dev_out_gpu_data); - } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((in_col + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + auto tmp_dev_outs_data = + platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( + outputs_data.size() * sizeof(T*)); + memory::Copy(boost::get(context.GetPlace()), + tmp_dev_outs_data->ptr(), platform::CPUPlace(), + reinterpret_cast(outputs_data.data()), + outputs_data.size() * sizeof(T*), context.stream()); + T** dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); + + if (sameShape) { + SplitKernel<<>>( + input.data(), in_row, in_col, out0_col, dev_out_gpu_data); } else { auto tmp_dev_ins_col_data = platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( @@ -393,7 +277,7 @@ class SplitFunctor { int* dev_outs_col_data = reinterpret_cast(tmp_dev_ins_col_data->ptr()); - SplitKernel<<>>( + SplitKernel<<>>( input.data(), in_row, in_col, dev_outs_col_data, static_cast(outputs_cols.size()), dev_out_gpu_data); } diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 411dbca25bb..8ba9e8e8ec1 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -17,24 +17,26 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/concat_and_split.h" -/** - * case 1: - * inputs: - * t_a.shape: [2, 3, 4] - * t_b.shape: [3, 3, 4] - * output: - * out.shape: [5, 3, 4] - */ template -void ConcatCase1(DeviceContext* context) { +void testConcat() { paddle::framework::Tensor input_a_cpu; paddle::framework::Tensor input_b_cpu; paddle::framework::Tensor out_cpu; - paddle::framework::Tensor input_a; paddle::framework::Tensor input_b; paddle::framework::Tensor out; + DeviceContext* context = new DeviceContext(Place()); + // DeviceContext context(Place()); + + /** + * cast1: + * inputs: + * t_a.shape: [2, 3, 4] + * t_b.shape: [3, 3, 4] + * output: + * out.shape: [5, 3, 4] + */ auto dim_a = paddle::framework::make_ddim({2, 3, 4}); auto dim_b = paddle::framework::make_ddim({3, 3, 4}); auto dim_out = paddle::framework::make_ddim({5, 3, 4}); @@ -49,8 +51,8 @@ void ConcatCase1(DeviceContext* context) { out_cpu.mutable_data(dim_out, paddle::platform::CPUPlace()); } - int* a_ptr = nullptr; - int* b_ptr = nullptr; + int* a_ptr; + int* b_ptr; if (paddle::platform::is_gpu_place(Place())) { a_ptr = input_a_cpu.data(); b_ptr = input_b_cpu.data(); @@ -82,7 +84,7 @@ void ConcatCase1(DeviceContext* context) { PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); - int* out_ptr = nullptr; + int* out_ptr; if (paddle::platform::is_gpu_place(Place())) { paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), &out_cpu); @@ -102,42 +104,28 @@ void ConcatCase1(DeviceContext* context) { ++idx_a; } } -} - -/** - * case 2: - * inputs: - * t_a.shape: [2, 3, 4] - * t_b.shape: [2, 4, 4] - * output: - * out.shape: [2, 7, 4] - */ -template -void ConcatCase2(DeviceContext* context) { - paddle::framework::Tensor input_a_cpu; - paddle::framework::Tensor input_b_cpu; - paddle::framework::Tensor out_cpu; - - paddle::framework::Tensor input_a; - paddle::framework::Tensor input_b; - paddle::framework::Tensor out; - - auto dim_a = paddle::framework::make_ddim({2, 3, 4}); - auto dim_b = paddle::framework::make_ddim({2, 4, 4}); - auto dim_out = paddle::framework::make_ddim({2, 7, 4}); - - input_a.mutable_data(dim_a, Place()); - input_b.mutable_data(dim_b, Place()); - out.mutable_data(dim_out, Place()); - + // + /** + * cast2: + * inputs: + * t_a.shape: [2, 3, 4] + * t_b.shape: [2, 4, 4] + * output: + * out.shape: [2, 7, 4] + */ + dim_a = paddle::framework::make_ddim({2, 3, 4}); + dim_b = paddle::framework::make_ddim({2, 4, 4}); + dim_out = paddle::framework::make_ddim({2, 7, 4}); + + input_a.Resize(dim_a); + input_b.Resize(dim_b); + out.Resize(dim_out); if (paddle::platform::is_gpu_place(Place())) { - input_a_cpu.mutable_data(dim_a, paddle::platform::CPUPlace()); - input_b_cpu.mutable_data(dim_b, paddle::platform::CPUPlace()); - out_cpu.mutable_data(dim_out, paddle::platform::CPUPlace()); + input_a_cpu.Resize(dim_a); + input_b_cpu.Resize(dim_b); + out_cpu.Resize(dim_out); } - int* a_ptr = nullptr; - int* b_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { a_ptr = input_a_cpu.data(); b_ptr = input_b_cpu.data(); @@ -158,18 +146,16 @@ void ConcatCase2(DeviceContext* context) { paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } - std::vector input; + input.clear(); input.push_back(input_a); input.push_back(input_b); - paddle::operators::math::ConcatFunctor concat_functor; concat_functor(*context, input, 1, &out); // check the dim of input_a, input_b PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); 
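// (both dim checks assert that ConcatFunctor treated its inputs as read-only)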
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); - int* out_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), &out_cpu); @@ -178,8 +164,8 @@ void ConcatCase2(DeviceContext* context) { out_ptr = out.data(); } - int cols = 3 * 4; - int idx_a = 0, idx_b = 0; + cols = 3 * 4; + idx_a = 0, idx_b = 0; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 28; ++j) { if (j >= cols) { @@ -191,42 +177,28 @@ void ConcatCase2(DeviceContext* context) { } } } -} - -/** - * case 3: - * inputs: - * t_a.shape: [2, 3, 5] - * t_b.shape: [2, 3, 4] - * output: - * out.shape: [2, 3, 9] - */ -template -void ConcatCase3(DeviceContext* context) { - paddle::framework::Tensor input_a_cpu; - paddle::framework::Tensor input_b_cpu; - paddle::framework::Tensor out_cpu; - - paddle::framework::Tensor input_a; - paddle::framework::Tensor input_b; - paddle::framework::Tensor out; - - auto dim_a = paddle::framework::make_ddim({2, 3, 4}); - auto dim_b = paddle::framework::make_ddim({2, 3, 5}); - auto dim_out = paddle::framework::make_ddim({2, 3, 9}); - - input_a.mutable_data(dim_a, Place()); - input_b.mutable_data(dim_b, Place()); - out.mutable_data(dim_out, Place()); + /** + * cast3: + * inputs: + * t_a.shape: [2, 3, 5] + * t_b.shape: [2, 3, 4] + * output: + * out.shape: [2, 3, 9] + */ + dim_a = paddle::framework::make_ddim({2, 3, 4}); + dim_b = paddle::framework::make_ddim({2, 3, 5}); + dim_out = paddle::framework::make_ddim({2, 3, 9}); + + input_a.Resize(dim_a); + input_b.Resize(dim_b); + out.Resize(dim_out); if (paddle::platform::is_gpu_place(Place())) { - input_a_cpu.mutable_data(dim_a, paddle::platform::CPUPlace()); - input_b_cpu.mutable_data(dim_b, paddle::platform::CPUPlace()); - out_cpu.mutable_data(dim_out, paddle::platform::CPUPlace()); + input_a_cpu.Resize(dim_a); + input_b_cpu.Resize(dim_b); + out_cpu.Resize(dim_out); } - int* a_ptr = nullptr; - int* b_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { a_ptr = input_a_cpu.data(); b_ptr = input_b_cpu.data(); @@ -247,18 +219,16 @@ void ConcatCase3(DeviceContext* context) { paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } - std::vector input; + input.clear(); input.push_back(input_a); input.push_back(input_b); - paddle::operators::math::ConcatFunctor concat_functor; concat_functor(*context, input, 2, &out); // check the dim of input_a, input_b PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); - int* out_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), &out_cpu); @@ -268,8 +238,8 @@ void ConcatCase3(DeviceContext* context) { } // check the data - int cols = 4; - int idx_a = 0, idx_b = 0; + cols = 4; + idx_a = 0, idx_b = 0; for (int i = 0; i < 6; ++i) { for (int j = 0; j < 9; ++j) { if (j >= cols) { @@ -281,43 +251,29 @@ void ConcatCase3(DeviceContext* context) { } } } -} - -/** - * case 4: - * inputs: - * axis = 1 - * t_a.shape: [2, 3, 4] - * t_b.shape: [2, 3, 4] - * output: - * out.shape: [2, 6, 4] - */ -template -void ConcatCase4(DeviceContext* context) { - paddle::framework::Tensor input_a_cpu; - paddle::framework::Tensor input_b_cpu; - paddle::framework::Tensor out_cpu; - - paddle::framework::Tensor input_a; - paddle::framework::Tensor input_b; - paddle::framework::Tensor out; - - auto dim_a = paddle::framework::make_ddim({2, 3, 4}); - auto dim_b = paddle::framework::make_ddim({2, 3, 4}); - auto dim_out = paddle::framework::make_ddim({2, 6, 4}); - - 
input_a.mutable_data(dim_a, Place()); - input_b.mutable_data(dim_b, Place()); - out.mutable_data(dim_out, Place()); + /** + * cast4: + * inputs: + * axis = 1 + * t_a.shape: [2, 3, 4] + * t_b.shape: [2, 3, 4] + * output: + * out.shape: [2, 6, 4] + */ + dim_a = paddle::framework::make_ddim({2, 3, 4}); + dim_b = paddle::framework::make_ddim({2, 3, 4}); + dim_out = paddle::framework::make_ddim({2, 6, 4}); + + input_a.Resize(dim_a); + input_b.Resize(dim_b); + out.Resize(dim_out); if (paddle::platform::is_gpu_place(Place())) { - input_a_cpu.mutable_data(dim_a, paddle::platform::CPUPlace()); - input_b_cpu.mutable_data(dim_b, paddle::platform::CPUPlace()); - out_cpu.mutable_data(dim_out, paddle::platform::CPUPlace()); + input_a_cpu.Resize(dim_a); + input_b_cpu.Resize(dim_b); + out_cpu.Resize(dim_out); } - int* a_ptr = nullptr; - int* b_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { a_ptr = input_a_cpu.data(); b_ptr = input_b_cpu.data(); @@ -338,19 +294,16 @@ void ConcatCase4(DeviceContext* context) { paddle::framework::TensorCopySync(input_b_cpu, Place(), &input_b); } - std::vector input; + input.clear(); input.push_back(input_a); input.push_back(input_b); - paddle::operators::math::ConcatFunctor concat_functor; concat_functor(*context, input, 1, &out); - context->Wait(); // check the dim of input_a, input_b PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); - int* out_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { paddle::framework::TensorCopySync(out, paddle::platform::CPUPlace(), &out_cpu); @@ -360,8 +313,8 @@ void ConcatCase4(DeviceContext* context) { } // check the data - int cols = 12; - int idx_a = 0, idx_b = 0; + cols = 12; + idx_a = 0, idx_b = 0; for (int i = 0; i < 2; ++i) { for (int j = 0; j < 24; ++j) { if (j >= cols) { @@ -375,21 +328,10 @@ void ConcatCase4(DeviceContext* context) { } } -template -void TestConcatMain() { - DeviceContext* context = new DeviceContext(Place()); - - ConcatCase1(context); - ConcatCase2(context); - ConcatCase3(context); - ConcatCase4(context); -} - TEST(math, concat) { - TestConcatMain(); + testConcat(); #ifdef PADDLE_WITH_CUDA - TestConcatMain(); + testConcat(); #endif } diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h index e9019c6d2fe..f6094369567 100644 --- a/paddle/fluid/operators/math/context_project.h +++ b/paddle/fluid/operators/math/context_project.h @@ -144,8 +144,7 @@ class ContextProjectFunctor { sequence_height = static_cast(out_t.dims()[0]); // add up trainable data - out_t.Resize({static_cast(sequence_height) * context_length, - sequence_width}); + out_t.Resize({sequence_height * context_length, sequence_width}); if (up_pad > 0) { // add up pad int padding_rows = std::min( @@ -192,8 +191,7 @@ class ContextProjectFunctor { &out_t_sub); } } - out_t.Resize({sequence_height, - static_cast(context_length) * sequence_width}); + out_t.Resize({sequence_height, context_length * sequence_width}); } } } @@ -262,8 +260,7 @@ class ContextProjectGradFunctor { static_cast(lod_level_0[i + 1])); sequence_height = static_cast(out_t.dims()[0]); - out_t.Resize({static_cast(sequence_height) * context_length, - sequence_width}); + out_t.Resize({sequence_height * context_length, sequence_width}); if (up_pad > 0) { int padding_rows = std::min( @@ -311,8 +308,7 @@ class ContextProjectGradFunctor { w_sub.data()); } } - out_t.Resize({sequence_height, - static_cast(context_length) * sequence_width}); + out_t.Resize({sequence_height, context_length * 
sequence_width}); } } } diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 4406a558718..57726956cfb 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -54,14 +54,7 @@ inline void vec_scal(const int n, const T a, T* x) { #ifdef PADDLE_WITH_MKLML template <> inline void vec_exp(const int n, const float* x, float* y) { - constexpr int small_enough = 128; - if (n < small_enough) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } - } else { - platform::dynload::vsExp(n, x, y); - } + platform::dynload::vsExp(n, x, y); } template <> @@ -135,120 +128,6 @@ inline void vec_scal(const int n, const float a, vec_scal(n, a, x, y); } -template -inline void vec_sum(const size_t n, const T* x, T* s) { - s[0] = x[0]; - for (size_t i = 1; i < n; ++i) { - s[0] += x[i]; - } -} - -template <> -inline void vec_sum(const size_t n, const float* x, - float* s) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_sum(n, x, s); - return; - } - - unsigned int i, end; - i = end = 0; - s[0] = 0.f; - - end = n & ~(block - 1); - __m256 tmp = _mm256_setzero_ps(); - for (i = 0; i < end; i += block) { - tmp = _mm256_add_ps(tmp, _mm256_load_ps(x + i)); - } - - __m256 hsum = _mm256_hadd_ps(tmp, tmp); - hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1)); - _mm_store_ss(s, _mm_hadd_ps(_mm256_castps256_ps128(hsum), - _mm256_castps256_ps128(hsum))); - - for (; i < n; i++) { - s[0] += x[i]; - } -#else - vec_sum(n, x, s); -#endif -} - -template -inline void vec_mul(const size_t n, const T* x, const T* y, T* z) { - for (size_t i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - -template <> -inline void vec_mul(const size_t n, const float* x, - const float* y, float* z) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_mul(n, x, y, z); - return; - } - - unsigned int i = 0, end = 0; - end = n & ~(block - 1); - for (i = 0; i < end; i += block) { - _mm256_storeu_ps( - z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i))); - } - - for (; i < n; i++) { - z[i] = x[i] * y[i]; - } -#else - vec_mul(n, x, y, z); -#endif -} - -template -inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) { - z[0] = x[0] * y[0]; - for (size_t i = 1; i < n; ++i) { - z[0] += x[i] * y[i]; - } -} - -template <> -inline void vec_mul_reduce(const size_t n, const float* x, - const float* y, float* z) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_mul_reduce(n, x, y, z); - return; - } - - unsigned int i = 0, end = 0; - z[0] = 0.f; - - end = n & ~(block - 1); - __m256 tmp = _mm256_setzero_ps(); - for (i = 0; i < end; i += block) { - tmp = _mm256_add_ps( - tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i))); - } - - __m256 hsum = _mm256_hadd_ps(tmp, tmp); - hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1)); - _mm_store_ss(z, _mm_hadd_ps(_mm256_castps256_ps128(hsum), - _mm256_castps256_ps128(hsum))); - - for (; i < n; i++) { - z[0] += x[i] * y[i]; - } -#else - vec_mul_reduce(n, x, y, z); -#endif -} - template inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { @@ -363,39 +242,6 @@ inline void vec_cross(const int n, const float* x, vec_cross(n, x, y, z, out); } -template -inline void vec_clip(const size_t n, const T a, const T* x, T* y) { - for (size_t i = 0; i < n; ++i) { - y[i] = x[i] < a ? 
a : x[i]; - } -} - -template <> -inline void vec_clip(const size_t n, const float a, - const float* x, float* y) { -#ifdef __AVX__ - constexpr unsigned int block = YMM_FLOAT_BLOCK; - if (n < block) { - vec_clip(n, a, x, y); - return; - } - - unsigned int i = 0, end = 0; - end = n & ~(block - 1); - __m256 threshold = _mm256_set1_ps(a); - - for (i = 0; i < end; i += block) { - _mm256_storeu_ps(y + i, _mm256_max_ps(_mm256_loadu_ps(x + i), threshold)); - } - - for (; i < n; i++) { - y[i] = x[i] < a ? a : x[i]; - } -#else - vec_clip(n, a, x, y); -#endif -} - template inline void vec_add_bias(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index f2f80f836fd..28eb9cadc9d 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -65,11 +65,12 @@ void ref_relu(const int n, const T* x, T* y) { } template -void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), - const T upper = static_cast(20.f)) { +void RandomVec(const int n, T* a) { static unsigned int seed = 100; std::mt19937 rng(seed++); std::uniform_real_distribution uniform_dist(0, 1); + const T lower = static_cast(-20.f); + const T upper = static_cast(20.f); for (int i = 0; i < n; ++i) { a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); } @@ -143,126 +144,6 @@ TEST(CpuVecTest, relu) { TestAndBench(30, vec_relu, ref_relu); } -template -void compare_sum(size_t n, std::function tgt, - std::function ref) { - std::vector x(n); - T ytgt_data, yref_data; - RandomVec(n, x.data(), static_cast(-2), static_cast(2)); - - const T* x_data = x.data(); - tgt(n, x_data, &ytgt_data); - ref(n, x_data, &yref_data); - EXPECT_NEAR(ytgt_data, yref_data, 1e-3); -} - -TEST(CpuVecTest, vec_sum) { - namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT - for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { - compare_sum(sz, vec_sum, vec_sum); - compare_sum(sz, vec_sum, - vec_sum); - } - compare_sum(30U, vec_sum, vec_sum); -} - -template -void compare_clip( - size_t n, T threshold, - std::function tgt, - std::function ref) { - std::vector x(n); - std::vector ytgt(n), yref(n); - RandomVec(n, x.data(), static_cast(-2), static_cast(2)); - - const T* x_data = x.data(); - T* yref_data = yref.data(); - T* ytgt_data = ytgt.data(); - tgt(n, threshold, x_data, ytgt_data); - ref(n, threshold, x_data, yref_data); - for (int i = 0; i < n; ++i) { - EXPECT_NEAR(ytgt_data[i], yref_data[i], 1e-3); - } -} - -TEST(CpuVecTest, vec_clip) { - namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT - for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { - compare_clip(sz, -4.f, vec_clip, - vec_clip); - compare_clip(sz, -1.1f, vec_clip, - vec_clip); - } - compare_clip(30U, 1.0, vec_clip, - vec_clip); -} - -template -void compare_mul( - size_t n, std::function tgt, - std::function ref) { - std::vector x(n), y(n); - std::vector ztgt(n), zref(n); - - RandomVec(n, x.data(), static_cast(-2), static_cast(2)); - RandomVec(n, y.data(), static_cast(-2), static_cast(2)); - - const T* x_data = x.data(); - const T* y_data = y.data(); - T* ztgt_data = ztgt.data(); - T* zref_data = zref.data(); - - tgt(n, x_data, y_data, ztgt_data); - ref(n, x_data, y_data, zref_data); - for (size_t i = 0; i < n; ++i) { - EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3); - } -} - -TEST(CpuVecTest, vec_mul) { - namespace platform = 
paddle::platform; - using namespace paddle::operators::math; // NOLINT - for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { - compare_mul(sz, vec_mul, vec_mul); - compare_mul(sz, vec_mul, - vec_mul); - } - compare_mul(30U, vec_mul, vec_mul); -} - -template -void compare_mul_reduce( - size_t n, std::function tgt, - std::function ref) { - std::vector x(n), y(n); - T ztgt_data, zref_data; - - RandomVec(n, x.data(), static_cast(-2), static_cast(2)); - RandomVec(n, y.data(), static_cast(-2), static_cast(2)); - - const T* x_data = x.data(); - const T* y_data = y.data(); - - tgt(n, x_data, y_data, &ztgt_data); - ref(n, x_data, y_data, &zref_data); - EXPECT_NEAR(ztgt_data, zref_data, 1e-3); -} - -TEST(CpuVecTest, vec_mul_reduce) { - namespace platform = paddle::platform; - using namespace paddle::operators::math; // NOLINT - for (size_t sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { - compare_mul_reduce(sz, vec_mul_reduce, - vec_mul_reduce); - compare_mul_reduce(sz, vec_mul_reduce, - vec_mul_reduce); - } - compare_mul_reduce(30U, vec_mul_reduce, - vec_mul_reduce); -} - template void TestInplace(const int n, std::function tgt, std::function ref) { diff --git a/paddle/fluid/operators/math/functors.h b/paddle/fluid/operators/math/functors.h index e98bf82169a..955c0b6bad5 100644 --- a/paddle/fluid/operators/math/functors.h +++ b/paddle/fluid/operators/math/functors.h @@ -78,48 +78,6 @@ struct ReluGradFunctor { inline HOSTDEVICE T UseXAndOut(T x, T out) { return out > 0 ? 1 : 0; } }; -template -struct TanhFunctor { - const T kMin = static_cast(-40); - const T kMax = static_cast(13); - inline HOSTDEVICE T operator()(T x) { - // y = 2 / (1 + e^-2x) - 1 - T t0 = 2 * x; - T t1 = (t0 < kMin) ? kMin : ((t0 > kMax) ? kMax : t0); - return static_cast(2) / (static_cast(1) + std::exp(-t1)) - - static_cast(1); - } -}; - -template -struct TanhGradFunctor { - inline HOSTDEVICE T UseX(T x) { return static_cast(1) - x * x; } - inline HOSTDEVICE T UseOut(T out) { return static_cast(1) - out * out; } - inline HOSTDEVICE T UseXAndOut(T x, T out) { - return static_cast(1) - out * out; - } -}; - -template -struct SigmoidFunctor { - const T kMin = static_cast(-40); - const T kMax = static_cast(13); - inline HOSTDEVICE T operator()(T x) { - // y = 1 / (1 + e^-x) - T tmp = (x < kMin) ? kMin : ((x > kMax) ? 
kMax : x); - return static_cast(1) / (static_cast(1) + std::exp(-tmp)); - } -}; - -template -struct SigmoidGradFunctor { - inline HOSTDEVICE T UseX(T x) { return x * (static_cast(1) - x); } - inline HOSTDEVICE T UseOut(T out) { return out * (static_cast(1) - out); } - inline HOSTDEVICE T UseXAndOut(T x, T out) { - return out * (static_cast(1) - out); - } -}; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/gru_compute.cu b/paddle/fluid/operators/math/gru_compute.cu index b564f990b49..75417cced23 100644 --- a/paddle/fluid/operators/math/gru_compute.cu +++ b/paddle/fluid/operators/math/gru_compute.cu @@ -30,31 +30,25 @@ struct GRUUnitFunctor { dim3 threads; dim3 grid; if (batch_size == 1) { - if (context.GetComputeCapability() >= 70) { - constexpr int tiled_size = 16; - int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; - threads = dim3(tiled_size, 1); - grid = dim3(frame_blocks, 1); - detail::KeFastCollectiveGruGate< - T, tiled_size><<>>( - value.gate_value, value.prev_out_value, value.gate_weight, - value.reset_output_value, frame_size, active_gate); - - frame_blocks = (frame_size + tiled_size - 1) / tiled_size; - grid = dim3(frame_blocks, 1); - detail::KeFastCollectiveGruOut< - T, tiled_size><<>>( - value.state_weight, value.prev_out_value, value.output_value, - value.gate_value, value.reset_output_value, frame_size, active_node, - origin_mode); - - return; - } else { - int frame_per_block = frame_size <= 1024 ? frame_size : 1024; - int frame_blocks = (frame_size + 1024 - 1) / 1024; - threads = dim3(frame_per_block, 1); - grid = dim3(frame_blocks, 1); - } + constexpr int tiled_size = 16; + int frame_blocks = (frame_size * 2 + tiled_size - 1) / tiled_size; + threads = dim3(tiled_size, 1); + grid = dim3(frame_blocks, 1); + + detail::KeFastCollectiveGruGate<<>>( + value.gate_value, value.prev_out_value, value.gate_weight, + value.reset_output_value, frame_size, active_gate); + + frame_blocks = (frame_size + tiled_size - 1) / tiled_size; + grid = dim3(frame_blocks, 1); + detail::KeFastCollectiveGruOut<<>>( + value.state_weight, value.prev_out_value, value.output_value, + value.gate_value, value.reset_output_value, frame_size, active_node, + origin_mode); + + return; } else { threads = dim3(32, 32); grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 4630689dec1..25f06a25a06 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -59,22 +59,6 @@ void CopyValidData(framework::Tensor* dst_tensor, } } -template -static void fast_mem_init(void* dest, size_t dest_size, const T* src, - size_t num_bytes) { - if (dest == nullptr || dest_size == 0 || src == nullptr) return; - - memcpy(dest, src, num_bytes); - - dest_size *= num_bytes; - while (dest_size > num_bytes) { - size_t remaining = dest_size - num_bytes; - size_t count = (remaining > num_bytes) ? 
num_bytes : remaining; - memcpy((unsigned char*)dest + num_bytes, dest, count); - num_bytes += count; - } -} - template class PaddingLoDTensorFunctor { public: @@ -103,8 +87,9 @@ class PaddingLoDTensorFunctor { T* pad_data = pad_tensor->data(); const T* pad_value_data = pad_value.data(); if (pad_value.numel() == 1) { - fast_mem_init(pad_data, pad_tensor->numel(), pad_value_data, - sizeof(T)); + for (int i = 0; i < pad_tensor->numel(); ++i) { + pad_data[i] = *pad_value_data; + } } else { for (int i = 0; i < pad_tensor->numel(); i += step_width) { memcpy(pad_data + i, pad_value_data, step_width * sizeof(T)); diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 011d45c3965..7af44f2b2ca 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -36,8 +36,8 @@ template class MaxSeqPoolFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& input, T pad_value, - framework::Tensor* output, framework::Tensor* index) { + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); auto idx_dims = index->dims(); @@ -56,13 +56,6 @@ class MaxSeqPoolFunctor { int64_t num_seq = out_dims[0]; int64_t dim = output->numel() / num_seq; for (int64_t i = 0; i < num_seq; ++i) { - if (starts[i] == starts[i + 1]) { - for (int64_t k = 0; k < dim; ++k) { - out_data[i * dim + k] = pad_value; - max_index[i * dim + k] = -1; - } - continue; - } for (int64_t k = 0; k < dim; ++k) { out_data[i * dim + k] = in_data[starts[i] * dim + k]; max_index[i * dim + k] = starts[i]; @@ -84,8 +77,8 @@ template class MaxSeqPoolFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& input, T pad_value, - framework::Tensor* output, framework::Tensor* index) { + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); PADDLE_ENFORCE_GT(in_dims.size(), 1); @@ -101,12 +94,6 @@ class MaxSeqPoolFunctor { int64_t num_seq = out_dims[0]; int64_t dim = output->numel() / num_seq; for (int64_t i = 0; i < num_seq; ++i) { - if (starts[i] == starts[i + 1]) { - for (int64_t k = 0; k < dim; ++k) { - out_data[i * dim + k] = pad_value; - } - continue; - } std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim], dim * sizeof(T)); for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) { @@ -147,7 +134,6 @@ class MaxSeqPoolGradFunctor { for (int64_t i = 0; i < num_seq; ++i) { for (int64_t j = 0; j < dim; ++j) { int step_id = max_index[i * dim + j]; - if (step_id == -1) continue; ig_data[step_id * dim + j] = og_data[i * dim + j]; } } @@ -158,7 +144,7 @@ template class LastSeqPoolFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& input, T pad_value, + const framework::LoDTensor& input, framework::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); @@ -171,16 +157,10 @@ class LastSeqPoolFunctor { for (int i = 0; i < seq_num; ++i) { // Calculate the length of each sequence int64_t seq_len = static_cast(lod[i + 1] - lod[i]); - if (seq_len == 0) { - for (int j = 0; j < item_size; ++j) { - out_data[j] = pad_value; - } - } else { - // Point to the begin of next sequence - in_data += seq_len * item_size; - // Copy the last item of sequence to output - 
std::memcpy(out_data, (in_data - item_size), item_size * sizeof(T)); - } + // Point to the begin of next sequence + in_data += seq_len * item_size; + // Copy the last item of sequence to output + std::memcpy(out_data, (in_data - item_size), item_size * sizeof(T)); out_data += item_size; } } @@ -190,7 +170,7 @@ template class FirstSeqPoolFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& input, T pad_value, + const framework::LoDTensor& input, framework::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); @@ -203,16 +183,10 @@ class FirstSeqPoolFunctor { for (int i = 0; i < seq_num; ++i) { // Calculate the length of each sequence int64_t seq_len = static_cast(lod[i + 1] - lod[i]); - if (seq_len == 0) { - for (int j = 0; j < item_size; ++j) { - out_data[j] = pad_value; - } - } else { - // Copy the first item of sequence to output - std::memcpy(out_data, in_data, item_size * sizeof(T)); - // Point to the next sequence - in_data += seq_len * item_size; - } + // Copy the first item of sequence to output + std::memcpy(out_data, in_data, item_size * sizeof(T)); + // Point to the next sequence + in_data += seq_len * item_size; out_data += item_size; } } @@ -233,7 +207,6 @@ class SumSeqPoolGradFunctor { auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); - if (h == 0) continue; int64_t in_offset = lod[i] * in_w; const T* out_pos = out_g_data + i * out_w; T* in_pos = in_g_data + in_offset; @@ -249,27 +222,27 @@ class SequencePoolFunctor { public: /* max pool has index output */ void operator()(const platform::CPUDeviceContext& context, - const std::string pooltype, T pad_value, - const framework::LoDTensor& input, framework::Tensor* output, - bool is_test, framework::Tensor* index = nullptr) { + const std::string pooltype, const framework::LoDTensor& input, + framework::Tensor* output, bool is_test, + framework::Tensor* index = nullptr) { if (pooltype == "MAX") { if (is_test) { math::MaxSeqPoolFunctor max_pool; - max_pool(context, input, pad_value, output, index); + max_pool(context, input, output, index); } else { math::MaxSeqPoolFunctor max_pool; - max_pool(context, input, pad_value, output, index); + max_pool(context, input, output, index); } return; } if (pooltype == "LAST") { math::LastSeqPoolFunctor last_pool; - last_pool(context, input, pad_value, output); + last_pool(context, input, output); return; } if (pooltype == "FIRST") { math::FirstSeqPoolFunctor first_pool; - first_pool(context, input, pad_value, output); + first_pool(context, input, output); return; } @@ -287,13 +260,7 @@ class SequencePoolFunctor { .At(attr); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { attr.h = static_cast(lod[i + 1] - lod[i]); - if (attr.h == 0) { - for (int j = 0; j < attr.w; ++j) { - dst[j] = pad_value; - } - } else { - seqpool(src, dst, &attr); - } + seqpool(src, dst, &attr); dst += attr.w; src += attr.h * attr.w; } @@ -301,17 +268,11 @@ class SequencePoolFunctor { } auto& place = *context.eigen_device(); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - Tensor out_t = output->Slice(i, i + 1); - int64_t w = input.numel() / input.dims()[0]; - if (lod[i] == lod[i + 1]) { - for (int j = 0; j < w; ++j) { - out_t.data()[j] = pad_value; - } - continue; - } Tensor in_t = input.Slice(static_cast(lod[i]), static_cast(lod[i + 1])); + Tensor out_t = output->Slice(i, i + 1); int64_t h = static_cast(lod[i + 1] - lod[i]); + 
int64_t w = input.numel() / input.dims()[0]; auto in_e = EigenMatrix::From(in_t, framework::make_ddim({h, w})); auto out_e = EigenVector::Flatten(out_t); if (pooltype == "AVERAGE") { @@ -355,7 +316,6 @@ class SequencePoolGradFunctor { auto lod = in_grad->lod()[0]; auto& place = *context.eigen_device(); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - if (lod[i] == lod[i + 1]) continue; auto in_g_t = in_grad->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); auto out_g_t = out_grad.Slice(i, i + 1); diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 4de99ba677d..51da6de26e2 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -24,122 +24,96 @@ namespace math { template struct MaxPoolFunctor { - HOSTDEVICE void operator()(const T* input, const T pad_value, - const size_t start, const size_t end, - const size_t item_dim, T* output, int* index) { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const size_t item_dim, T* output, + int* index) { for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { T max_val = static_cast(-FLT_MAX); int max_index = -1; - if (start == end) { - output[tid] = pad_value; - index[tid] = -1; - } else { - for (int i = start; i < end; ++i) { - if (max_val < input[item_dim * i + tid]) { - max_val = input[item_dim * i + tid]; - max_index = i; - } + for (int i = start; i < end; ++i) { + if (max_val < input[item_dim * i + tid]) { + max_val = input[item_dim * i + tid]; + max_index = i; } - output[tid] = max_val; - index[tid] = max_index; } + output[tid] = max_val; + index[tid] = max_index; } } }; template struct AvgPoolFunctor { - HOSTDEVICE void operator()(const T* input, const T pad_value, - const size_t start, const size_t end, - const size_t item_dim, T* output, int* index) { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const size_t item_dim, T* output, + int* index) { for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { - if (start == end) { - output[tid] = pad_value; - } else { - T val = static_cast(0); - for (int i = start; i < end; ++i) { - val += input[item_dim * i + tid]; - } - // end, start is lod, so end - start != 0 - output[tid] = val / static_cast(end - start); + T val = static_cast(0); + for (int i = start; i < end; ++i) { + val += input[item_dim * i + tid]; } + // end, start is lod, so end - start != 0 + output[tid] = val / static_cast(end - start); } } }; template struct SumPoolFunctor { - HOSTDEVICE void operator()(const T* input, const T pad_value, - const size_t start, const size_t end, - const size_t item_dim, T* output, int* index) { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const size_t item_dim, T* output, + int* index) { for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { - if (start == end) { - output[tid] = pad_value; - } else { - T val = static_cast(0); - for (int i = start; i < end; ++i) { - val += input[item_dim * i + tid]; - } - output[tid] = val; + T val = static_cast(0); + for (int i = start; i < end; ++i) { + val += input[item_dim * i + tid]; } + output[tid] = val; } } }; template struct SqrtPoolFunctor { - HOSTDEVICE void operator()(const T* input, const T pad_value, - const size_t start, const size_t end, - const size_t item_dim, T* output, int* index) { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const 
size_t item_dim, T* output, + int* index) { for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { - if (start == end) { - output[tid] = pad_value; - } else { - T val = static_cast(0); - for (int i = start; i < end; ++i) { - val += input[item_dim * i + tid]; - } - // end, start is lod, so end - start != 0 - output[tid] = val / sqrt(end - start); + T val = static_cast(0); + for (int i = start; i < end; ++i) { + val += input[item_dim * i + tid]; } + // end, start is lod, so end - start != 0 + output[tid] = val / sqrt(end - start); } } }; template struct LastPoolFunctor { - HOSTDEVICE void operator()(const T* input, const T pad_value, - const size_t start, const size_t end, - const size_t item_dim, T* output, int* index) { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const size_t item_dim, T* output, + int* index) { for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { - if (start == end) { - output[tid] = pad_value; - } else { - output[tid] = input[item_dim * (end - 1) + tid]; - } + output[tid] = input[item_dim * (end - 1) + tid]; } } }; template struct FirstPoolFunctor { - HOSTDEVICE void operator()(const T* input, const T pad_value, - const size_t start, const size_t end, - const size_t item_dim, T* output, int* index) { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const size_t item_dim, T* output, + int* index) { for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { - if (start == end) { - output[tid] = pad_value; - } else { - output[tid] = input[item_dim * start + tid]; - } + output[tid] = input[item_dim * start + tid]; } } }; template __global__ void sequence_pool_kernel(Range_OP op, const T* input, - const T pad_value, const size_t* lod, - const size_t lod_size, + const size_t* lod, const size_t lod_size, const size_t item_dim, T* output, int* index) { int bid = blockIdx.x; @@ -150,17 +124,16 @@ __global__ void sequence_pool_kernel(Range_OP op, const T* input, if (index != nullptr) { index_offset = &index[bid * item_dim]; } - op(input, pad_value, start, end, item_dim, &output[bid * item_dim], - index_offset); + op(input, start, end, item_dim, &output[bid * item_dim], index_offset); } template class SequencePoolFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const std::string pooltype, T pad_value, - const framework::LoDTensor& input, framework::Tensor* output, - bool is_test, framework::Tensor* index = nullptr) { + const std::string pooltype, const framework::LoDTensor& input, + framework::Tensor* output, bool is_test, + framework::Tensor* index = nullptr) { auto& lod = input.lod()[0]; const size_t item_dim = output->numel() / output->dims()[0]; dim3 threads(1024, 1); @@ -168,37 +141,37 @@ class SequencePoolFunctor { if (pooltype == "MAX") { sequence_pool_kernel< T, MaxPoolFunctor><<>>( - MaxPoolFunctor(), input.data(), pad_value, + MaxPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { sequence_pool_kernel< T, AvgPoolFunctor><<>>( - AvgPoolFunctor(), input.data(), pad_value, + AvgPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { sequence_pool_kernel< T, SumPoolFunctor><<>>( - SumPoolFunctor(), input.data(), pad_value, + SumPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), 
item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { sequence_pool_kernel< T, SqrtPoolFunctor><<>>( - SqrtPoolFunctor(), input.data(), pad_value, + SqrtPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { sequence_pool_kernel< T, LastPoolFunctor><<>>( - LastPoolFunctor(), input.data(), pad_value, + LastPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { sequence_pool_kernel< T, FirstPoolFunctor><<>>( - FirstPoolFunctor(), input.data(), pad_value, + FirstPoolFunctor(), input.data(), lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else { diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h index 1dc02eae201..a1046ea2160 100644 --- a/paddle/fluid/operators/math/sequence_pooling.h +++ b/paddle/fluid/operators/math/sequence_pooling.h @@ -27,9 +27,8 @@ class SequencePoolFunctor { public: /* max pool has index output */ void operator()(const DeviceContext& context, const std::string pooltype, - T pad_value, const framework::LoDTensor& input, - framework::Tensor* output, bool is_test = false, - framework::Tensor* index = nullptr); + const framework::LoDTensor& input, framework::Tensor* output, + bool is_test = false, framework::Tensor* index = nullptr); }; template diff --git a/paddle/fluid/operators/math/softmax.h b/paddle/fluid/operators/math/softmax.h index 7a4306efef9..a7a30a71e4c 100644 --- a/paddle/fluid/operators/math/softmax.h +++ b/paddle/fluid/operators/math/softmax.h @@ -27,7 +27,7 @@ class SoftmaxFunctor { const framework::Tensor* X, framework::Tensor* Y); }; -template +template class SoftmaxGradFunctor { public: void operator()(const DeviceContext& context, const int axis_dim, diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 4fb03cdce0c..6f6f33345f5 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -17,8 +17,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -36,15 +34,16 @@ struct ValueClip { } }; -template -void SoftmaxEigen(const DeviceContext& context, const int axis_dim, - const framework::Tensor* X, framework::Tensor* Y) { - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - +template +void SoftmaxFunctor::operator()( + const DeviceContext& context, const int axis_dim, + const framework::Tensor* X, framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); auto softmax = EigenMatrix::From(*Y); + const int kBatchDim = 0; + const int kClassDim = 1; + const int batch_size = logits.dimension(kBatchDim); const int num_classes = logits.dimension(kClassDim); const int num_remain = num_classes / axis_dim; @@ -71,58 +70,12 @@ void SoftmaxEigen(const DeviceContext& context, const int axis_dim, .broadcast(one_axis)); } -template -void SoftmaxFunctor::operator()( - const DeviceContext& context, const int axis_dim, - const framework::Tensor* X, framework::Tensor* Y) { - SoftmaxEigen(context, axis_dim, X, Y); -} - template using enable_if_CPU = typename std::enable_if< std::is_same::value>::type; -template -class SoftmaxFunctor> { - public: - void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* X, framework::Tensor* Y) { - auto in_dims = X->dims(); - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - - const int num_classes = in_dims[kClassDim]; - const int batch_size = in_dims[kBatchDim]; - const int num_remain = num_classes / axis_dim; - - if (num_remain == 1 && platform::MayIUse(platform::avx)) { - const T* in_data = X->data(); - T* out_data = Y->data(); - for (int bs = 0; bs < batch_size; ++bs) { - T max_val = *std::max_element(in_data, in_data + num_classes); - max_val *= static_cast(-1); - vec_add_bias(num_classes, max_val, in_data, out_data); - vec_clip(num_classes, static_cast(-64), out_data, - out_data); - vec_exp(num_classes, out_data, out_data); - - T sum = 0; - vec_sum(num_classes, out_data, &sum); - sum = static_cast(1) / sum; - vec_scal(num_classes, sum, out_data, out_data); - - in_data += num_classes; - out_data += num_classes; - } - } else { - SoftmaxEigen(context, axis_dim, X, Y); - } - } -}; - template class SoftmaxFunctor> { - public: void operator()(const DeviceContext& context, const int axis_dim, const framework::Tensor* X, framework::Tensor* Y) { auto in_dims = X->dims(); @@ -140,16 +93,16 @@ class SoftmaxFunctor> { }; template -void SoftmaxGradEigen(const DeviceContext& context, const int axis_dim, - const framework::Tensor* y, - const framework::Tensor* y_grad, - framework::Tensor* x_grad) { +void SoftmaxGradFunctor::operator()( + const DeviceContext& context, const int axis_dim, + const framework::Tensor* y, const framework::Tensor* y_grad, + framework::Tensor* x_grad) { auto softmax = EigenMatrix::From(*y); auto softmax_grad = EigenMatrix::From(*y_grad); auto logits_grad = EigenMatrix::From(*x_grad); - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; + const int kBatchDim = 0; + const int kClassDim = 1; const int batch_size = softmax.dimension(kBatchDim); const int num_classes = softmax.dimension(kClassDim); @@ -169,48 +122,6 @@ void SoftmaxGradEigen(const DeviceContext& context, const int axis_dim, logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax; } 
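For reference, the AVX fast paths stripped from softmax_impl.h (in the hunk above and the one below) composed the vec_* primitives deleted from cpu_vec.h earlier in this patch: subtract the row max, clip the shifted logits at -64, exponentiate, sum, and scale by the reciprocal of the sum. A minimal scalar sketch of that per-row computation, assuming a free-standing helper rather than Paddle's vec_* API:

#include <algorithm>
#include <cmath>

// Numerically stable softmax for one row of n logits; mirrors the
// removed vec_add_bias -> vec_clip -> vec_exp -> vec_sum -> vec_scal
// sequence with plain scalar loops.
inline void softmax_row_ref(const float* in, float* out, int n) {
  const float max_val = *std::max_element(in, in + n);  // row max
  float sum = 0.f;
  for (int i = 0; i < n; ++i) {
    const float shifted = std::max(in[i] - max_val, -64.f);  // clip low end
    out[i] = std::exp(shifted);
    sum += out[i];
  }
  const float inv = 1.f / sum;  // normalize once, multiply per element
  for (int i = 0; i < n; ++i) out[i] *= inv;
}

The deleted AVX specializations batched these steps eight floats at a time (YMM_FLOAT_BLOCK) and fell back to the scalar loop for the tail, the same pattern as the removed vec_sum/vec_mul/vec_clip code above.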
-template -void SoftmaxGradFunctor::operator()( - const DeviceContext& context, const int axis_dim, - const framework::Tensor* y, const framework::Tensor* y_grad, - framework::Tensor* x_grad) { - SoftmaxGradEigen(context, axis_dim, y, y_grad, x_grad); -} - -template -class SoftmaxGradFunctor> { - public: - void operator()(const DeviceContext& context, const int axis_dim, - const framework::Tensor* y, const framework::Tensor* y_grad, - framework::Tensor* x_grad) { - auto out_dims = y->dims(); - constexpr int kBatchDim = 0; - constexpr int kClassDim = 1; - const int num_classes = out_dims[kClassDim]; - const int batch_size = out_dims[kBatchDim]; - const int num_remain = num_classes / axis_dim; - - if (num_remain == 1 && platform::MayIUse(platform::avx)) { - const T* out_data = y->data(); - const T* out_grad = y_grad->data(); - T* in_grad = x_grad->data(); - for (int bs = 0; bs < batch_size; ++bs) { - T scalar; - vec_mul_reduce(num_classes, out_grad, out_data, - &scalar); - scalar *= static_cast(-1); - vec_add_bias(num_classes, scalar, out_grad, in_grad); - vec_mul(num_classes, out_data, in_grad, in_grad); - out_data += num_classes; - out_grad += num_classes; - in_grad += num_classes; - } - } else { - SoftmaxGradEigen(context, axis_dim, y, y_grad, x_grad); - } - } -}; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 40f7231c125..911c4d22ee5 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -61,25 +61,20 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireBatchNormPrimitiveDescriptor(const batch_norm_fwd::desc &bn_fwd_desc, const mkldnn::engine &engine) { - // BatchNorm PD has to be passed to Grad op that - // may be executed by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_batch_norm_fwd_pd = key_common_ + "@bn_fwd_pd"; - batch_norm_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_batch_norm_fwd_pd)); - - if (batch_norm_pd_ == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - batch_norm_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_batch_norm_fwd_pd)); - if (batch_norm_pd_ == nullptr) { - batch_norm_pd_.reset( - new batch_norm_fwd::primitive_desc(bn_fwd_desc, engine)); - dev_ctx_.SetBlob(key_batch_norm_fwd_pd, batch_norm_pd_); - } + const std::string key_batch_norm_fwd_pd = key_ + "@bn_fwd_pd"; + auto batch_norm_pd = + std::static_pointer_cast( + dev_ctx_.GetBlob(key_batch_norm_fwd_pd)); + + if (batch_norm_pd == nullptr) { + batch_norm_pd_.reset( + new batch_norm_fwd::primitive_desc(bn_fwd_desc, engine)); + dev_ctx_.SetBlob(key_batch_norm_fwd_pd, batch_norm_pd_); + } else { + batch_norm_pd_ = batch_norm_pd; + is_reusing_ = true; } + return batch_norm_pd_; } @@ -92,6 +87,9 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { auto batch_norm_p = std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((batch_norm_p != nullptr) || !is_reusing_, + "Fail to find batch norm primitive in device context"); + if (batch_norm_p == nullptr) { if (is_test) { batch_norm_p = std::make_shared( @@ -106,6 +104,8 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { } dev_ctx_.SetBlob(prim_key, batch_norm_p); + } else { + is_reusing_ = true; } return batch_norm_p; 
diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index a855ba8475a..50fe2e6e4c5 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -79,8 +79,6 @@ std::string CreateKey(const paddle::framework::ExecutionContext& ctx, platform::MKLDNNHandler::AppendKey(&key, std::to_string(concat_axis)); platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Out")); platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt)); - platform::MKLDNNHandler::AppendKey(&key, - std::to_string(multi_input[0]->format())); return key; } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 647e09a9291..faf518005c8 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -119,14 +119,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector dilations = ctx.Attr>("dilations"); bool fuse_relu = ctx.Attr("fuse_relu"); bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); - bool fuse_brelu = false; - float fuse_brelu_threshold = 6.0; int groups = ctx.Attr("groups"); + bool is_conv3d = strides.size() == 3U; - if (!is_conv3d) { - fuse_brelu = ctx.Attr("fuse_brelu"); - fuse_brelu_threshold = ctx.Attr("fuse_brelu_threshold"); - } // TODO(tpatejko): add support for dilation PADDLE_ENFORCE( is_conv3d @@ -147,8 +142,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { // Get unique name for storing MKLDNN primitives const std::string key = platform::ConvMKLDNNHandler::GetHash( - src_tz, weights_tz, fuse_relu, fuse_brelu, strides, paddings, dilations, - groups, ctx.op().Input("Input") + ctx.op().Input("Filter")); + src_tz, weights_tz, strides, paddings, dilations, groups, + ctx.op().Input("Input") + ctx.op().Input("Filter")); std::vector pipeline; @@ -199,13 +194,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bias_tz, platform::MKLDNNGetDataType(), memory::format::x); conv_pd = handler.AcquireConvolutionPrimitiveDescriptor( src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, - fuse_relu, fuse_residual_conn, fuse_brelu, fuse_brelu_threshold, - fwd_prop_kind); + fuse_relu, fuse_residual_conn, fwd_prop_kind); } else { conv_pd = handler.AcquireConvolutionPrimitiveDescriptor( src_md, weights_md, boost::none, dst_md, strides, paddings, - mkldnn_engine, fuse_relu, fuse_residual_conn, fuse_brelu, - fuse_brelu_threshold, fwd_prop_kind); + mkldnn_engine, fuse_relu, fuse_residual_conn, fwd_prop_kind); } // create mkldnn memory from input tensors (data/weights) @@ -234,8 +227,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { "same dimension sizes"); if (residual_param->format() != handler.GetDstFormat()) { - auto output_data = - output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); + auto output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler.GetDstMemorySize()); auto residual_data_tz = paddle::framework::vectorize2int(residual_param->dims()); auto residual_data_type = @@ -255,8 +249,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } } else { - auto output_data = - output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); + auto output_data = output->mutable_data( + ctx.GetPlace(), paddle::memory::Allocator::kDefault, + 
handler.GetDstMemorySize()); dst_memory_p = handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); } @@ -286,7 +281,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory_p)); } - void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); @@ -323,14 +317,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { int groups = ctx.Attr("groups"); bool fuse_relu = ctx.Attr("fuse_relu"); bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); - bool fuse_brelu = ctx.Attr("fuse_brelu"); - float fuse_brelu_threshold = ctx.Attr("fuse_brelu_threshold"); + bool force_fp32_output = ctx.Attr("force_fp32_output"); - bool unsigned_output = fuse_relu || fuse_brelu; if (fuse_residual_conn) { PADDLE_ENFORCE(force_fp32_output != true, "residual fusion does not support force output with fp32"); } + bool is_conv3d = strides.size() == 3U; // TODO(tpatejko): add support for dilation PADDLE_ENFORCE( @@ -348,18 +341,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); int g = std::max(groups, 1); - GetWeightsTz(weights_tz, g, is_conv3d); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); mkldnn::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType(input->type()); - - auto dst_dt = unsigned_output - ? paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType) - : paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType); + auto dst_dt = fuse_relu ? paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType) + : paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); if (force_fp32_output) { dst_dt = paddle::framework::ToMKLDNNDataType( @@ -377,19 +367,20 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { key.reserve(MaxKeyLength); platform::ConvMKLDNNHandler::AppendKey( &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, - input->format(), fuse_relu, fuse_residual_conn, fuse_brelu, + input->format(), fuse_relu, fuse_residual_conn, ctx.op().Input("Input") + ctx.op().Input("Filter")); - const std::string key_conv_pd = key + "@conv_pd"; bool need_s8_to_u8 = false; - std::shared_ptr conv_p; - std::shared_ptr src_memory_p; - std::shared_ptr user_src_memory_p; - std::shared_ptr dst_memory_p; + + std::shared_ptr conv_p = nullptr; + std::shared_ptr src_memory_p = nullptr; + std::shared_ptr user_src_memory_p = nullptr; + std::shared_ptr dst_memory_p = nullptr; std::vector pipeline; - std::shared_ptr conv_pd; - std::shared_ptr handler; + std::shared_ptr conv_pd = + nullptr; + std::shared_ptr handler = nullptr; auto prim_key = key + "@conv_p"; auto dst_key = key + "@dst_mem_p"; @@ -426,9 +417,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { // scale couldn't be calculated else output_shift_scale[i] = - static_cast(static_cast(scale_out_data) / - (static_cast(scale_in_data) * - static_cast(scale_weights_data[i]))); + scale_out_data / (scale_in_data * scale_weights_data[i]); } auto user_src_md = @@ -456,24 +445,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); // create a conv primitive descriptor and save it for usage in backward - // TODO(lidanqing): We use relu post-op instead of brelu post-op cause - // mkldnn 
v0.18 does not support INT8 brelu post-op. Use code in /**/ when - // v0.20 is enabled - std::shared_ptr bias_md_p; if (bias) { bias_tz = paddle::framework::vectorize2int(bias->dims()); - bias_md_p = std::make_shared(platform::MKLDNNMemDesc( - bias_tz, memory::data_type::s32, memory::format::x)); + auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, + memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + fuse_relu, fuse_residual_conn, + output_shift_scale, sum_scale, is_test); + } else { + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine, fuse_relu, fuse_residual_conn, + output_shift_scale, sum_scale, is_test); } - conv_pd = ConvFwdPrimitiveDesc( - src_md, weights_md, bias_md_p, dst_md, strides, paddings, - mkldnn_engine, fuse_relu || fuse_brelu /*fuse_relu*/, - fuse_residual_conn, false /*fuse_brelu*/, fuse_brelu_threshold, - output_shift_scale, sum_scale, is_test); // Save conv_pd/src_memory/weights_memory for backward pass dev_ctx.SetBlob(key_conv_pd, conv_pd); + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, mkldnn_engine, key)); + // create mkldnn memory from input tensors (data/weights) user_src_memory_p = handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); @@ -510,7 +501,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { ctx, output, residual_param, user_residual_md, handler, &pipeline); } else { - need_s8_to_u8 = unsigned_output; + need_s8_to_u8 = fuse_relu; dst_memory_p = platform::SetDstMemory( ctx, output, residual_param, user_residual_md, handler, &pipeline); @@ -521,12 +512,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { dst_memory_p = platform::SetDstMemory(ctx, output, handler); } else { - need_s8_to_u8 = unsigned_output; + need_s8_to_u8 = fuse_relu; dst_memory_p = platform::SetDstMemory(ctx, output, handler); } } } else if (!force_fp32_output) { - if (unsigned_output) { + if (fuse_relu) { dst_memory_p = platform::SetDstMemory(ctx, output, handler); } else { dst_memory_p = platform::SetDstMemory(ctx, output, handler); @@ -598,12 +589,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::SetDstMemoryHandler(ctx, output, handler, &dst_memory_p); } else { - need_s8_to_u8 = unsigned_output; + need_s8_to_u8 = fuse_relu; platform::SetDstMemoryHandler(ctx, output, handler, &dst_memory_p); } } else if (!force_fp32_output) { - if (unsigned_output) { + if (fuse_relu) { platform::SetDstMemoryHandler(ctx, output, handler, &dst_memory_p); } else { @@ -641,13 +632,11 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { private: mkldnn::primitive_attr CreatePostOps( bool fuse_relu, bool fuse_residual_conn, - const std::vector& output_shift_scale, float sum_scale, - bool fuse_brelu, float fuse_brelu_threshold) const { + const std::vector output_shift_scale, float sum_scale) const { mkldnn::primitive_attr conv_attr; mkldnn::post_ops post_operations; int mask = output_shift_scale.size() > 1 ? 
1 << 1 : 0; conv_attr.set_output_scales(mask, output_shift_scale); - if (fuse_residual_conn) { post_operations.append_sum(sum_scale); } @@ -658,46 +647,59 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, negative_slope, placeholder); } - if (fuse_brelu) { - constexpr float scale = 1.0f; - constexpr float placeholder = 0.0f; // beta - post_operations.append_eltwise(scale, - mkldnn::algorithm::eltwise_bounded_relu, - fuse_brelu_threshold, placeholder); - } conv_attr.set_post_ops(post_operations); return conv_attr; } std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, - const std::shared_ptr bias_md_p, const memory::desc& dst, const std::vector& strides, const std::vector& paddings, const mkldnn::engine& engine, const bool fuse_relu, - const bool fuse_residual_conn, const bool fuse_brelu, - const float fuse_brelu_threshold, - const std::vector& output_shift_scale, + const bool fuse_residual_conn, + const std::vector output_shift_scale, const float sum_scale, bool is_test) const { memory::dims stride_dims = {strides[0], strides[1]}; memory::dims padding_dims = {paddings[0], paddings[1]}; auto propagation = is_test ? mkldnn::prop_kind::forward_scoring : mkldnn::prop_kind::forward_training; - auto conv_desc = - (bias_md_p != nullptr) - ? mkldnn::convolution_forward::desc( - propagation, mkldnn::convolution_direct, src, weights, - (*bias_md_p), dst, stride_dims, padding_dims, padding_dims, - mkldnn::padding_kind::zero) - : mkldnn::convolution_forward::desc( - propagation, mkldnn::convolution_direct, src, weights, dst, - stride_dims, padding_dims, padding_dims, - mkldnn::padding_kind::zero); - - mkldnn::primitive_attr conv_attr = - CreatePostOps(fuse_relu, fuse_residual_conn, output_shift_scale, - sum_scale, fuse_brelu, fuse_brelu_threshold); + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims, + padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = CreatePostOps( + fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } + + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& bias, const memory::desc& dst, + const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, const bool fuse_relu, + const bool fuse_residual_conn, + const std::vector output_shift_scale, + const float sum_scale, bool is_test) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto propagation = is_test ? 
mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, bias, dst, + stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = CreatePostOps( + fuse_relu, fuse_residual_conn, output_shift_scale, sum_scale); auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( conv_desc, conv_attr, engine); @@ -760,11 +762,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { GetWeightsTz(weights_tz, g, is_conv3d); std::vector dst_tz = paddle::framework::vectorize2int(output_grad->dims()); - bool fuse_relu = ctx.Attr("fuse_relu"); - bool fuse_brelu = false; - if (!is_conv3d) { - fuse_brelu = ctx.Attr("fuse_brelu"); - } + auto src_format = input->format(); mkldnn::memory::format weights_format = GetWeightsFormat(filter->format(), g, is_conv3d); @@ -773,8 +771,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // as well as attributes of primitive to be created // This name will be used as key when saving info into device context const std::string key = platform::ConvMKLDNNHandler::GetHash( - src_tz, weights_tz, fuse_relu, fuse_brelu, strides, paddings, dilations, - groups, ctx.op().Input("Input") + ctx.op().Input("Filter")); + src_tz, weights_tz, strides, paddings, dilations, groups, + ctx.op().Input("Input") + ctx.op().Input("Filter")); const std::string key_conv_pd = key + "@conv_pd"; std::vector pipeline; @@ -860,7 +858,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { user_diff_dst_memory_p, pipeline); const size_t size = handler.GetDiffWeightsMemorySize(); - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace(), size); + filter_grad_data = filter_grad->mutable_data( + ctx.GetPlace(), paddle::memory::Allocator::kDefault, size); auto diff_weights_memory_p = handler.AcquireDiffWeightsMemoryFromWeightsPrimitive( @@ -885,7 +884,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { pipeline); const size_t size = handler.GetDiffSourceMemorySize(); - input_grad_data = input_grad->mutable_data(ctx.GetPlace(), size); + input_grad_data = input_grad->mutable_data( + ctx.GetPlace(), paddle::memory::Allocator::kDefault, size); auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive( reinterpret_cast(input_grad_data)); diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 6d5982ab3f8..30d2469eeaf 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -166,11 +166,11 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { bias_tz, platform::MKLDNNGetDataType(), mkldnn::memory::format::x); conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor( src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, - fuse_relu, false, false, 0.0, fwd_prop_kind); + fuse_relu, false, fwd_prop_kind); } else { conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor( src_md, weights_md, boost::none, dst_md, strides, paddings, - mkldnn_engine, fuse_relu, false, false, 0.0, fwd_prop_kind); + mkldnn_engine, fuse_relu, false, fwd_prop_kind); } // create mkldnn memory from input tensors (data/weights) @@ -188,8 +188,9 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { std::shared_ptr dst_memory_p; - auto 
output_data = - output->mutable_data(ctx.GetPlace(), handler.GetDstMemorySize()); + auto output_data = output->mutable_data( + ctx.GetPlace(), paddle::memory::Allocator::kDefault, + handler.GetDstMemorySize()); dst_memory_p = handler.AcquireDstMemoryFromPrimitive( platform::to_void_cast(output_data)); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index b525eaac3ef..69c0486eb63 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -12,265 +12,299 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/fc_op.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace operators { -using framework::DataLayout; -using framework::Tensor; -using framework::LoDTensor; -using framework::DDim; -using framework::ExecutionContext; -using platform::MKLDNNDeviceContext; -using platform::to_void_cast; -using platform::GetMKLDNNFormat; -using mkldnn::memory; -using mkldnn::inner_product_forward; -using mkldnn::primitive; -using mkldnn::stream; -using mkldnn::prop_kind; +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; template -class FCPrimitiveFactory { +class MKLDNNMD { public: - explicit FCPrimitiveFactory(const mkldnn::engine& engine) : engine_(engine) {} - - inner_product_forward CreateFcPrimitive(const LoDTensor* input, - const Tensor* weights, - const Tensor* bias, LoDTensor* output, - const ExecutionContext& ctx) { - RecomputeOutputDims(ctx, input, weights, output); - if (fc_) { - UpdateDataPointers(ctx, output, input); - return *fc_; - } - auto src_desc = CreateMemDescriptor(input, input->format()); - input_ = CreateMemory(src_desc, input); - - weights_ = TransposeWeights(weights); - if (src_desc.data.ndims == 4) { - weights_ = CreateFourDimWeightsMemory(input, weights); - } + explicit MKLDNNMD(const T* in, const T* w, bool bias) + : in(paddle::framework::vectorize2int(in->dims())), + w(paddle::framework::vectorize2int(w->dims())) { + with_bias_ = bias; + } - auto dst_desc = CreateMemDescriptor(output, memory::format::any); + mkldnn::memory::desc dst() const { + return platform::MKLDNNMemDesc({in[0], w[1]}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::nc); + } - fc_ = CreateFcPrimitive(*input_, *weights_, dst_desc, bias, output, ctx); - return *fc_; + mkldnn::memory::desc src() const { + return is_spatial() + ? platform::MKLDNNMemDesc({in[0], in[1], in[2], in[3]}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::nchw) + : platform::MKLDNNMemDesc({in[0], in[1]}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::nc); } - private: - void UpdateDataPointers(const ExecutionContext& ctx, Tensor* out, - const Tensor* in) { - input_->set_data_handle(const_cast(in->data())); - output_->set_data_handle(out->mutable_data(ctx.GetPlace())); - if (out->format() == memory::format::format_undef) { - auto output_format = output_->get_primitive_desc().desc().data.format; - out->set_format((memory::format)output_format); - } + mkldnn::memory::desc weights() const { + return is_spatial() + ? 
platform::MKLDNNMemDesc({w[1], in[1], in[2], in[3]}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw) + : platform::MKLDNNMemDesc({w[1], in[1]}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::oi); } - memory::format MatchWeightFormat(memory::format fmt) { - using format = memory::format; - switch (fmt) { - case format::nChw16c: - return format::oIhw16i; - case format::nChw8c: - return format::oIhw8i; - case format::nchw: - return format::oihw; - default: - return format::format_undef; - } + mkldnn::memory::desc bias() const { + return with_bias_ + ? platform::MKLDNNMemDesc({w[1]}, mkldnn::memory::data_type::f32, + mkldnn::memory::format::format_undef) + : platform::MKLDNNMemDesc({}, mkldnn::memory::data_type::f32, + mkldnn::memory::format::format_undef); } - mkldnn::memory Reorder(const memory::desc& src_desc, - const memory::desc& dst_desc, const void* src_data) { - auto src_mem = memory({src_desc, engine_}, const_cast(src_data)); - auto dst_mem = memory({dst_desc, engine_}); + private: + bool is_spatial() const { return in.size() > 1 && w.size() > 1; } - auto reorder = mkldnn::reorder(src_mem, dst_mem); - stream(stream::kind::eager).submit({reorder}).wait(); + std::vector in; + std::vector w; + bool with_bias_; + bool is_spatial_; +}; - return dst_mem; +class MKLDNNMemory { + public: + MKLDNNMemory(MKLDNNMD* t, const mkldnn::engine& e) + : md_(t), engine_(e) {} + virtual ~MKLDNNMemory() = default; + + template + mkldnn::memory dst(const Output* out) { + return mkldnn::memory({md_->dst(), engine_}, + static_cast(const_cast(out))); } - static mkldnn::memory::desc CreateMemDescriptor(const std::vector& dims, - memory::format format) { - return platform::MKLDNNMemDesc(dims, platform::MKLDNNGetDataType(), - format); + template + mkldnn::memory dst(Output* out) { + return mkldnn::memory({md_->dst(), engine_}, out); } - static mkldnn::memory::desc CreateMemDescriptor(const Tensor* tensor, - memory::format format) { - auto dims = framework::vectorize2int(tensor->dims()); - return CreateMemDescriptor(dims, format); + template + mkldnn::memory src(const Input* in) { + return mkldnn::memory({md_->src(), engine_}, + static_cast(const_cast(in))); } - mkldnn::memory CreateMemory(const mkldnn::memory::desc& desc, - const Tensor* tensor) { - return CreateMemory(desc, tensor->data()); + template + mkldnn::memory weights(const Weight* w) { + return mkldnn::memory({md_->weights(), engine_}, + static_cast(const_cast(w))); } - mkldnn::memory CreateMemory(const mkldnn::memory::desc& desc, - const void* data) { - return memory({desc, engine_}, const_cast(data)); + mkldnn::memory bias() { + return mkldnn::memory(mkldnn::memory::primitive_desc(md_->bias(), engine_)); } - mkldnn::memory TransposeWeights(const Tensor* weights) { - auto dims = framework::vectorize2int(weights->dims()); - std::swap(dims[0], dims[1]); // Correct output dimensions - auto src_desc = CreateMemDescriptor(dims, memory::format::io); - auto dst_desc = CreateMemDescriptor(dims, memory::format::oi); - return Reorder(src_desc, dst_desc, weights->data()); - } + private: + MKLDNNMD* md_; + const mkldnn::engine& engine_; +}; - inner_product_forward CreateFcPrimitive(const memory& src_memory, - const memory& weights_memory, - const memory::desc& dst_desc, - const Tensor* bias, Tensor* output, - const ExecutionContext& ctx) { - const auto weights_desc = weights_memory.get_primitive_desc().desc(); - const auto src_desc = src_memory.get_primitive_desc().desc(); - if (bias) { - auto bias_desc = CreateMemDescriptor(bias, 
bias->format()); - bias_ = CreateMemory(bias_desc, bias); - auto fc_prim_desc = - CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc); - - output_ = CreateDstMemory(fc_prim_desc, ctx, output); - - return inner_product_forward(fc_prim_desc, src_memory, weights_memory, - *bias_, *output_); - } else { - auto fc_prim_desc = CreateFcPrimDesc(src_desc, weights_desc, dst_desc); - - output_ = CreateDstMemory(fc_prim_desc, ctx, output); - - return inner_product_forward(fc_prim_desc, src_memory, weights_memory, - *output_); - } - } +template +class FCMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); - mkldnn::inner_product_forward::primitive_desc CreateFcPrimDesc( - const mkldnn::memory::desc& input_desc, - const mkldnn::memory::desc& weights_desc, - const mkldnn::memory::desc& bias_desc, - const mkldnn::memory::desc& dst_desc) { - auto fc_desc = - inner_product_forward::desc(prop_kind::forward_scoring, input_desc, - weights_desc, bias_desc, dst_desc); + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); - return inner_product_forward::primitive_desc(fc_desc, engine_); - } + auto input = ctx.Input("Input"); + auto w = ctx.Input("W"); + auto bias = ctx.Input("Bias"); - mkldnn::inner_product_forward::primitive_desc CreateFcPrimDesc( - const mkldnn::memory::desc& input_desc, - const mkldnn::memory::desc& weights_desc, - const mkldnn::memory::desc& dst_desc) { - auto fc_desc = inner_product_forward::desc(prop_kind::forward, input_desc, - weights_desc, dst_desc); + PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4, + "Input must be with 2 or 4 dimensions, i.e. NCHW"); + // TODO(intel friends): the native weight format is io, + // but the mkldnn weight format is oihw, which may need be transposed. + PADDLE_ENFORCE(w->dims().size() == 2 || w->dims().size() == 4, + "Weights must be with 2 or 4 dimensions, i.e. 
OI or OIHW"); - return inner_product_forward::primitive_desc(fc_desc, engine_); - } + bool with_bias = bias != nullptr; + MKLDNNMD md(input, w, with_bias); - mkldnn::memory CreateFourDimWeightsMemory(const Tensor* input, - const Tensor* weights) { - auto input_dims = framework::vectorize2int(input->dims()); - auto weight_dims = framework::vectorize2int(weights->dims()); - auto dims = {weight_dims[1], input_dims[1], input_dims[2], input_dims[3]}; + std::shared_ptr pd = + FcFwdPrimitiveDesc(md.src(), md.weights(), md.dst(), md.bias(), + with_bias, mkldnn_engine); - auto dst_format = MatchWeightFormat(input->format()); - auto src_desc = CreateMemDescriptor(dims, memory::format::oihw); - auto dst_desc = CreateMemDescriptor(dims, dst_format); + const std::string key = ctx.op().Output("Out"); + const std::string key_fc_pd = key + "@fc_pd"; - return Reorder(src_desc, dst_desc, weights_->get_data_handle()); - } + dev_ctx.SetBlob(key_fc_pd, pd); - mkldnn::memory CreateDstMemory( - const mkldnn::inner_product_forward::primitive_desc& fc_prim_desc, - const ExecutionContext& ctx, Tensor* output) { - auto dst_prim_desc = fc_prim_desc.dst_primitive_desc(); - auto buffer_size = dst_prim_desc.get_size(); - T* output_data = output->mutable_data(ctx.GetPlace(), buffer_size); - output->set_format((memory::format)dst_prim_desc.desc().data.format); - return memory(dst_prim_desc, to_void_cast(output_data)); - } + MKLDNNMemory mem(&md, mkldnn_engine); - void RecomputeOutputDims(const ExecutionContext& ctx, const LoDTensor* input, - const Tensor* w, LoDTensor* output) { + const T* input_data = input->data(); + const T* w_data = w->data(); + + auto output = ctx.Output("Out"); int in_num_col_dims = ctx.Attr("in_num_col_dims"); std::vector output_dims; FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims); output->Resize(framework::make_ddim(output_dims)); output->set_lod(input->lod()); - } - private: - const mkldnn::engine& engine_; - boost::optional bias_; - boost::optional input_; - boost::optional output_; - boost::optional weights_; - boost::optional fc_; -}; + T* output_data = output->mutable_data(ctx.GetPlace()); -static std::string GetHash(const Tensor* input, const Tensor* weights, - const std::string& suffix) { - auto dim2str = [](const DDim& operand_dims) { - std::string str = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - str += std::to_string(operand_dims[i]) + "-"; - } - return str; - }; - return std::to_string((unsigned)input->format()) + dim2str(weights->dims()) + - suffix; -} + auto dst_memory = mem.dst(output_data); + auto src_memory = mem.src(input_data); + auto weights_memory = mem.weights(w_data); + // TODO(intel friends): bias memory should also be obtain from bias->data() + auto bias_memory = mem.bias(); -template -std::shared_ptr> GetPrimitiveFactory( - const MKLDNNDeviceContext& dev_ctx, const ExecutionContext& ctx, - const Tensor* input, const Tensor* weights, - const mkldnn::engine& mkldnn_engine) { - const std::string key = GetHash(input, weights, ctx.op().Output("Out")); - - auto prim_creator = - std::static_pointer_cast>(dev_ctx.GetBlob(key)); - if (prim_creator == nullptr) { - prim_creator = std::make_shared>(mkldnn_engine); - dev_ctx.SetBlob(key, prim_creator); + auto forward = with_bias ? 
mkldnn::inner_product_forward( + *pd, src_memory, weights_memory, bias_memory, + dst_memory) + : mkldnn::inner_product_forward( + *pd, src_memory, weights_memory, dst_memory); + + std::vector pipeline = {forward}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } - return prim_creator; -} + private: + std::unique_ptr + FcFwdPrimitiveDesc(const mkldnn::memory::desc& src, + const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& dst, + const mkldnn::memory::desc& bias, const bool with_bias, + const mkldnn::engine& engine) const { + auto desc = with_bias + ? mkldnn::inner_product_forward::desc( + mkldnn::prop_kind::forward, src, weights, bias, dst) + : mkldnn::inner_product_forward::desc( + mkldnn::prop_kind::forward, src, weights, dst); + + auto pd = new mkldnn::inner_product_forward::primitive_desc(desc, engine); + return std::unique_ptr(pd); + } +}; template -class FCMKLDNNOpKernel : public framework::OpKernel { +class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto input = ctx.Input("Input"); - auto w = ctx.Input("W"); + T* input_grad_data = nullptr; + T* w_grad_data = nullptr; + + Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); + Tensor* w_grad = ctx.Output(framework::GradVarName("W")); + + const Tensor* input = ctx.Input("Input"); + const T* input_data = input->data(); + + const Tensor* w = ctx.Input("W"); + const T* w_data = w->data(); + + if (input_grad) { + input_grad->Resize(input->dims()); + input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + } + if (w_grad) { + w_grad->Resize(w->dims()); + w_grad_data = w_grad->mutable_data(ctx.GetPlace()); + } + + const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); + const T* out_grad_data = out_grad->data(); + auto bias = ctx.Input("Bias"); - auto output = ctx.Output("Out"); + bool with_bias = bias != nullptr; + + MKLDNNMD md(input, w, with_bias); + MKLDNNMemory mem(&md, mkldnn_engine); + + auto dst_memory = mem.dst(out_grad_data); + auto src_memory = mem.src(input_data); + auto weights_memory = mem.weights(w_data); + auto bias_memory = mem.bias(); - auto prim_creator = - GetPrimitiveFactory(dev_ctx, ctx, input, w, mkldnn_engine); - auto fc = prim_creator->CreateFcPrimitive(input, w, bias, output, ctx); - stream(stream::kind::eager).submit({fc}).wait(); + const std::string key = ctx.op().Input("Out"); + const std::string key_fc_pd = key + "@fc_pd"; - output->set_layout(DataLayout::kMKLDNN); + auto pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_fc_pd)); + + PADDLE_ENFORCE(pd != nullptr, "Fail to find key_fc_pd in device context"); + + if (w_grad) { + auto weights_grad_memory = mem.weights(w_grad_data); + + mkldnn::inner_product_backward_weights::primitive_desc bwd_weight_pd = + FcBwdWeightsPrimitiveDesc(md.src(), md.weights(), md.dst(), md.bias(), + with_bias, *pd, mkldnn_engine); + + auto bwd_weights_prim = mkldnn::inner_product_backward_weights( + bwd_weight_pd, src_memory, dst_memory, weights_grad_memory, + bias_memory); + + std::vector pipeline{bwd_weights_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + + if (input_grad) { + auto src_grad_memory = mem.src(input_grad_data); + + 
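// A minimal sketch of the blob-cache handshake the FC kernels above rely on:
// the forward kernel stores its primitive_desc under "<Out name>@fc_pd" via
// SetBlob, and the grad kernel fetches it with GetBlob under the same key.
// SimpleBlobCache is a hypothetical stand-in for MKLDNNDeviceContext.
#include <map>
#include <memory>
#include <string>

struct SimpleBlobCache {
  std::map<std::string, std::shared_ptr<void>> blobs;
  void SetBlob(const std::string& key, std::shared_ptr<void> value) {
    blobs[key] = value;  // overwrites any stale entry for this output name
  }
  std::shared_ptr<void> GetBlob(const std::string& key) const {
    auto it = blobs.find(key);
    return it == blobs.end() ? nullptr : it->second;  // nullptr: not cached
  }
};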
mkldnn::inner_product_backward_data::primitive_desc bwd_data_pd = + FcBwdDataPrimitiveDesc(md.src(), md.weights(), md.dst(), *pd, + mkldnn_engine); + + auto bwd_data_prim = mkldnn::inner_product_backward_data( + bwd_data_pd, dst_memory, weights_memory, src_grad_memory); + + std::vector pipeline{bwd_data_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + } + + private: + mkldnn::inner_product_backward_weights::primitive_desc + FcBwdWeightsPrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights, + const mkldnn::memory::desc& diff_dst, const mkldnn::memory::desc& bias, + const bool with_bias, + const mkldnn::inner_product_forward::primitive_desc& pd, + const mkldnn::engine& engine) const { + auto bwd_weight_desc = with_bias + ? mkldnn::inner_product_backward_weights::desc( + src, diff_weights, bias, diff_dst) + : mkldnn::inner_product_backward_weights::desc( + src, diff_weights, diff_dst); + + return mkldnn::inner_product_backward_weights::primitive_desc( + bwd_weight_desc, engine, pd); + } + + mkldnn::inner_product_backward_data::primitive_desc FcBwdDataPrimitiveDesc( + const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& diff_dst, + const mkldnn::inner_product_forward::primitive_desc& pd, + const mkldnn::engine& engine) const { + auto bwd_data_desc = + mkldnn::inner_product_backward_data::desc(diff_src, weights, diff_dst); + return mkldnn::inner_product_backward_data::primitive_desc(bwd_data_desc, + engine, pd); } }; } // namespace operators @@ -278,3 +312,6 @@ class FCMKLDNNOpKernel : public framework::OpKernel { REGISTER_OP_KERNEL(fc, MKLDNN, ::paddle::platform::CPUPlace, paddle::operators::FCMKLDNNOpKernel); + +REGISTER_OP_KERNEL(fc_grad, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::FCMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index c635fd11c37..5d8e8192115 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -36,8 +36,7 @@ std::string CreateKey(const paddle::framework::ExecutionContext& ctx, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, - const memory::data_type& dt, const memory::format& fmt, - const std::string& suffix) { + const memory::data_type& dt, const std::string& suffix) { std::string key; key.reserve(platform::MKLDNNHandler::MaxKeyLength); platform::MKLDNNHandler::AppendKeyDims(&key, input_dims); @@ -46,7 +45,6 @@ std::string CreateKey(const paddle::framework::ExecutionContext& ctx, platform::MKLDNNHandler::AppendKeyVec(&key, strides); platform::MKLDNNHandler::AppendKeyVec(&key, paddings); platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt)); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(fmt)); platform::MKLDNNHandler::AppendKey(&key, suffix); return key; } @@ -117,10 +115,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn::memory::data_type dt = paddle::framework::ToMKLDNNDataType(input->type()); - auto fmt = input->format(); - const std::string key = - CreateKey(ctx, src_tz, pooling_type, ksize, strides, paddings, dt, fmt, - ctx.op().Output("Out")); + const std::string key = CreateKey(ctx, src_tz, pooling_type, ksize, strides, + paddings, dt, ctx.op().Output("Out")); const std::string key_pool_p = key + "@pool_p"; const std::string key_pool_pd = key + "@pool_pd"; const std::string key_pool_src_mem_p = key + 
"@pool_src_mem_p"; @@ -298,9 +294,9 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { // Get an unique name from "argument" name of "Out" variable // This name will be used as key when referring info from device context - const std::string key = CreateKey(ctx, diff_src_tz, pooling_type, ksize, - strides, paddings, memory::data_type::f32, - in_x->format(), ctx.op().Input("Out")); + const std::string key = + CreateKey(ctx, diff_src_tz, pooling_type, ksize, strides, paddings, + memory::data_type::f32, ctx.op().Input("Out")); const std::string key_pool_bwd_p = key + "@pool_bwd_p"; const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p"; const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p"; diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index a01dd512a37..1b3f33d345f 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -54,24 +54,18 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { std::shared_ptr AcquireSoftmaxPrimitiveDescriptor(const softmax_forward::desc& softmax_desc, const mkldnn::engine& engine) { - // Softmax PD has to be passed to Grad op that - // may be executed by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_softmax_pd = key_common_ + "@softmax_pd"; + const std::string key_softmax_pd = key_ + "@softmax_pd"; - softmax_pd_ = std::static_pointer_cast( + auto softmax_pd = std::static_pointer_cast( dev_ctx_.GetBlob(key_softmax_pd)); - if (softmax_pd_ == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - softmax_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_softmax_pd)); - if (softmax_pd_ == nullptr) { - softmax_pd_.reset( - new softmax_forward::primitive_desc(softmax_desc, engine)); - dev_ctx_.SetBlob(key_softmax_pd, softmax_pd_); - } + + if (softmax_pd == nullptr) { + softmax_pd_.reset( + new softmax_forward::primitive_desc(softmax_desc, engine)); + dev_ctx_.SetBlob(key_softmax_pd, softmax_pd_); + } else { + softmax_pd_ = softmax_pd; + is_reusing_ = true; } return softmax_pd_; @@ -85,11 +79,15 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { auto softmax_p = std::static_pointer_cast( dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((softmax_p != nullptr) || (is_reusing_ == false), + "Fail to find softmax primitive in device context"); if (softmax_p == nullptr) { softmax_p = std::make_shared( *softmax_pd_, *(static_cast(src_memory_p.get())), *(static_cast(dst_memory_p.get()))); dev_ctx_.SetBlob(prim_key, softmax_p); + } else { + is_reusing_ = true; } return softmax_p; @@ -102,11 +100,15 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { auto prim_key = key_ + "@softmax_bwd_p"; auto softmax_bwd_p = std::static_pointer_cast( dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((softmax_bwd_p != nullptr) || (is_reusing_ == false), + "Fail to find softmax backward primitive in device context"); if (softmax_bwd_p == nullptr) { softmax_bwd_p = std::make_shared( *softmax_bwd_pd_, *dst_memory_p, *diff_dst_memory_p, *diff_src_memory_p); dev_ctx_.SetBlob(prim_key, softmax_bwd_p); + } else { + is_reusing_ = true; } return softmax_bwd_p; diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index 480167f4352..95cee806ac4 100644 --- 
a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -46,8 +46,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector nchw_tz = paddle::framework::vectorize2int(input->dims()); const std::string key = platform::TransposeMKLDNNHandler::GetHash( - nchw_tz, axis, - ctx.op().Output("Out") + std::to_string(input->format())); + nchw_tz, axis, ctx.op().Output("Out")); platform::TransposeMKLDNNHandler handler(nchw_tz, axis, dev_ctx, mkldnn_engine, key); diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index bbf9fbfa1ff..32173f3f500 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -180,72 +180,16 @@ class MulOpGradMaker : public framework::SingleGradOpDescMaker { } }; -class MulDoubleGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); - PADDLE_ENFORCE(ctx->HasInput("DOut"), "Input(DOut) should not be null"); - - if (ctx->HasOutput("DDOut") && ctx->HasInput("DDX")) { - ctx->ShareDim("DOut", "DDOut"); - } - if (ctx->HasOutput("DX") && ctx->HasInput("DDY")) { - ctx->ShareDim("X", "DX"); - } - if (ctx->HasOutput("DY") && ctx->HasInput("DDX")) { - ctx->ShareDim("Y", "DY"); - } - } -}; - -class MulDoubleGradMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - std::unique_ptr retv(new framework::OpDesc()); - retv->SetType("mul_grad_grad"); - - retv->SetInput("X", Input("X")); - retv->SetInput("Y", Input("Y")); - retv->SetInput("DOut", Input(framework::GradVarName("Out"))); - retv->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); - retv->SetInput("DDY", OutputGrad(framework::GradVarName("Y"))); - - auto ddx = OutputGrad(framework::GradVarName("X")); - auto ddw = OutputGrad(framework::GradVarName("Y")); - std::vector empty_str = {}; - - retv->SetOutput("DDOut", (ddx.empty()) - ? empty_str - : InputGrad(framework::GradVarName("Out"))); - retv->SetOutput("DX", ddw.empty() ? empty_str : InputGrad("X")); - retv->SetOutput("DY", ddx.empty() ? 
empty_str : InputGrad("Y")); - - retv->SetAttrMap(Attrs()); - return retv; - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(mul, ops::MulOp, ops::MulOpMaker, ops::MulOpInferVarType, ops::MulOpGradMaker); -REGISTER_OPERATOR(mul_grad, ops::MulGradOp, ops::MulDoubleGradMaker); -REGISTER_OPERATOR(mul_grad_grad, ops::MulDoubleGradOp); +REGISTER_OPERATOR(mul_grad, ops::MulGradOp); REGISTER_OP_CPU_KERNEL( mul, ops::MulKernel, ops::MulKernel); REGISTER_OP_CPU_KERNEL( mul_grad, ops::MulGradKernel, ops::MulGradKernel); -REGISTER_OP_CPU_KERNEL( - mul_grad_grad, - ops::MulDoubleGradKernel, - ops::MulDoubleGradKernel); diff --git a/paddle/fluid/operators/mul_op.cu.cc b/paddle/fluid/operators/mul_op.cu.cc index 6e841712b9b..6c5a83c6a50 100644 --- a/paddle/fluid/operators/mul_op.cu.cc +++ b/paddle/fluid/operators/mul_op.cu.cc @@ -24,7 +24,3 @@ REGISTER_OP_CUDA_KERNEL( mul_grad, ops::MulGradKernel, ops::MulGradKernel, ops::MulGradKernel); -REGISTER_OP_CUDA_KERNEL( - mul_grad_grad, - ops::MulDoubleGradKernel, - ops::MulDoubleGradKernel); diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h index c77eb5c4ccb..f72824806ed 100644 --- a/paddle/fluid/operators/mul_op.h +++ b/paddle/fluid/operators/mul_op.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" @@ -110,95 +109,5 @@ class MulGradKernel : public framework::OpKernel { } }; -template -class MulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int x_num_col_dims = ctx.template Attr("x_num_col_dims"); - int y_num_col_dims = ctx.template Attr("y_num_col_dims"); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto x_mat = x->dims().size() > 2 - ? framework::ReshapeToMatrix(*x, x_num_col_dims) - : static_cast(*x); - auto y_mat = y->dims().size() > 2 - ? framework::ReshapeToMatrix(*y, y_num_col_dims) - : static_cast(*y); - - const int m = framework::flatten_to_2d(x->dims(), x_num_col_dims)[0]; - const int n = framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]; - - auto* dout = ctx.Input("DOut"); - Tensor dout_mat; - dout_mat.ShareDataWith(*dout); - dout_mat.Resize({m, n}); - - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output("DX"); - auto* dy = ctx.Output("DY"); - auto* ddout = ctx.Output("DDOut"); - - Tensor ddout_mat; - if (ddout) { - ddout->set_lod(dout->lod()); - // allocate and reshape ddout - ddout->mutable_data(ctx.GetPlace()); - ddout_mat.ShareDataWith(*ddout); - ddout_mat.Resize({m, n}); - } - - auto& dev_ctx = ctx.template device_context(); - auto blas = math::GetBlas(dev_ctx); - // a flag to specify whether ddout value has been set, if flag - // is false, MatMul beta should be 0 to set ddout, if flag is - // true, MatMul beta should be 1 to add result to ddout. - bool ddout_flag = false; - if (ddx) { - auto ddx_mat = ddx->dims().size() > 2 - ? framework::ReshapeToMatrix(*ddx, x_num_col_dims) - : static_cast(*ddx); - - // dy = ddx' * dout. dy : K x M, ddx' : K x M, dout : M x N - if (dy) { - dy->set_lod(y->lod()); - // allocate and reshape dy - dy->mutable_data(ctx.GetPlace()); - Tensor dy_mat = dy->dims().size() > 2 - ? 
framework::ReshapeToMatrix(*dy, y_num_col_dims) - : *dy; - blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat); - } - // ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N - if (ddout) { - blas.MatMul(ddx_mat, false, y_mat, false, static_cast(1.0), - &ddout_mat, static_cast(ddout_flag)); - ddout_flag = true; - } - } - if (ddy) { - auto ddy_mat = ddy->dims().size() > 2 - ? framework::ReshapeToMatrix(*ddy, y_num_col_dims) - : static_cast(*ddy); - // dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K - if (dx) { - dx->set_lod(x->lod()); - // allocate and reshape dx - dx->mutable_data(ctx.GetPlace()); - Tensor dx_mat = dx->dims().size() > 2 - ? framework::ReshapeToMatrix(*dx, x_num_col_dims) - : *dx; - blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat); - } - // ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N - if (ddout) { - blas.MatMul(x_mat, false, ddy_mat, false, static_cast(1.0), - &ddout_mat, static_cast(ddout_flag)); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc index db8a7ca94a5..dafc31b546e 100644 --- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include #include #include -#include #include #include "ngraph/ngraph.hpp" @@ -25,8 +24,6 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/ngraph_helper.h" -constexpr int64_t kNoPadding = -1; - namespace paddle { namespace operators { @@ -34,34 +31,6 @@ bool NgraphBridge::isRegister(const std::string& str) { return ops::NgraphSingleton::Lookup(str); } -bool NgraphBridge::isSupported( - const std::unique_ptr& op) { - static std::unordered_set skip_op_list{ - "reshape", "reshape2", "lookup_table", "lookup_table_grad"}; - bool result = true; - auto& op_type = op->Type(); - auto op_attrs = paddle::framework::AttrReader(op->Attrs()); - if (!isRegister(op_type)) { - if (skip_op_list.count(op_type)) { - if (op_type == "lookup_table" || op_type == "lookup_table_grad") { - if (op_attrs.Get("is_sparse") || - (op_attrs.Get("padding_idx") != kNoPadding)) { - result = false; - } - } else if ((op_type == "reshape") || (op_type == "reshape2")) { - if (op->Input("Shape") != paddle::framework::kEmptyVarName) { - result = false; - } - } else { - result = false; - } - } - } else { - result = false; - } - return result; -} - void NgraphBridge::BuildNgNode( const std::shared_ptr& op) { auto& op_type = op->Type(); diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.h b/paddle/fluid/operators/ngraph/ngraph_bridge.h index 0b43ec53874..b609c284959 100644 --- a/paddle/fluid/operators/ngraph/ngraph_bridge.h +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.h @@ -39,8 +39,6 @@ class NgraphBridge { static bool isRegister(const std::string& str); - static bool isSupported(const std::unique_ptr& op); - private: std::shared_ptr< std::unordered_map>> diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc index ae87687e342..5ef385d2fcb 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -38,10 +38,6 @@ namespace operators { static ngraph::Shape Ddim2Shape(const framework::DDim& dims) { ngraph::Shape sp; - if (dims.size() == 1 && dims[0] == 0) { - sp.emplace_back(0); - return sp; - } for (int i = 0; i < dims.size(); ++i) { int k = dims[i]; 
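// The MulDoubleGradKernel removed above folds two partial products into one
// DDOut buffer through the GEMM beta argument: the first MatMul runs with
// beta = 0 (overwrite), the second with beta = 1 (accumulate), tracked by
// ddout_flag. A standalone sketch of that pattern, assuming plain row-major
// float buffers instead of the BLAS wrapper:
#include <vector>

void Gemm(const std::vector<float>& A, const std::vector<float>& B,
          std::vector<float>* C, int M, int K, int N, float beta) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += A[m * K + k] * B[k * N + n];
      float prev = (beta == 0.f) ? 0.f : beta * (*C)[m * N + n];
      (*C)[m * N + n] = prev + acc;
    }
  }
}
// ddout  = ddx * y   ->  Gemm(ddx, y, &ddout, M, K, N, /*beta=*/0.f);
// ddout += x * ddy   ->  Gemm(x, ddy, &ddout, M, K, N, /*beta=*/1.f);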
k = k == 0 ? 1 : k; @@ -65,7 +61,6 @@ static std::map {framework::proto::VarType::FP64, ngraph::element::f64}, {framework::proto::VarType::INT32, ngraph::element::i32}, {framework::proto::VarType::INT64, ngraph::element::i64}, - {framework::proto::VarType::UINT8, ngraph::element::u8}, {framework::proto::VarType::BOOL, ngraph::element::boolean}}; static std::map @@ -74,7 +69,6 @@ static std::map {ngraph::element::f64, framework::proto::VarType::FP64}, {ngraph::element::i32, framework::proto::VarType::INT32}, {ngraph::element::i64, framework::proto::VarType::INT64}, - {ngraph::element::u8, framework::proto::VarType::UINT8}, {ngraph::element::boolean, framework::proto::VarType::BOOL}}; std::vector NgraphEngine::feed_vars = {}; @@ -138,11 +132,12 @@ static std::vector> NgraphOpIntervals( int pivot = left; while (pivot < right) { auto op_type = ops->at(pivot)->Type(); - if (!NgraphBridge::isSupported(ops->at(pivot))) { + if (NgraphBridge::isRegister(op_type)) { ++pivot; } else { int start = pivot, end = start; - while (pivot < right && (NgraphBridge::isSupported(ops->at(pivot)))) { + while (pivot < right && + (!NgraphBridge::isRegister(ops->at(pivot)->Type()))) { ++pivot; ++end; } @@ -161,8 +156,6 @@ static void SubstituteNgraphOp( ng_op_desc.SetAttr("interval", interval); ng_op_desc.SetAttr("engine_key", engine_key); ng_op_desc.SetAttr("graph", block_str); - ng_op_desc.SetInput("Xs", std::vector(0)); - ng_op_desc.SetOutput("Ys", std::vector(0)); ops->erase(ops->begin() + interval[0], ops->begin() + interval[1]); ops->insert(ops->begin() + interval[0], @@ -228,36 +221,20 @@ NgraphEngine::NgraphEngine(const framework::Scope& scope, const platform::Place& place, const framework::ExecutionContext& ctx) : scope_(scope), place_(place) { + std::string serialized_graph = ctx.Attr("graph"); + auto interval = ctx.Attr>("interval"); + std::string engine_key = ctx.Attr("engine_key"); + var_in_node_map_ = std::make_shared< std::unordered_map>>(); var_node_map_ = std::make_shared< std::unordered_map>>(); - GetNgFunction(ctx); + GetNgFunction(engine_key, interval); } -void NgraphEngine::Prepare(const framework::ExecutionContext& ctx) { - auto interval = ctx.Attr>("interval"); - std::string serialized_graph = ctx.Attr("graph"); - - auto input_vars = ctx.Inputs("Xs"); - if (!input_vars.empty()) { - feed_vars = input_vars; - var_in_ = input_vars; - } - auto output_vars = ctx.Outputs("Ys"); - if (!output_vars.empty()) { - var_out_ = output_vars; - } - - framework::proto::BlockDesc block_proto; - if (!serialized_graph.empty()) block_proto.ParseFromString(serialized_graph); - framework::BlockDesc block_desc(nullptr, &block_proto); - if (!serialized_graph.empty()) { - NgraphEngine::p_bdesc = &block_desc; - } - +void NgraphEngine::Prepare(const std::vector& interval) { bool has_fetch = false, is_full = false; for (auto& var : p_bdesc->AllVars()) { if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS || @@ -337,15 +314,7 @@ void NgraphEngine::Prepare(const framework::ExecutionContext& ctx) { op_state_ = OpState::UNKNOWN; } - if (var_in_.empty() && var_out_.empty()) { - BuildNgIO(ops_desc, interval); - } - for (size_t i = 0; i < var_in_.size(); ++i) { - auto var_name = var_in_[i]; - if (persistables_.find(var_name) == persistables_.end()) { - var_in_updates_.emplace_back(i); - } - } + BuildNgIO(ops_desc, interval); } void NgraphEngine::BuildNgIO(const std::vector& ops_desc, @@ -421,14 +390,12 @@ void NgraphEngine::BuildNgIO(const std::vector& ops_desc, } } } - // remove output duplicates - 
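// The loop removed just below walks var_out_ from the back and erases an
// entry whenever the same name was already seen, so the last occurrence of
// each output name survives. A self-contained sketch of that dedup:
#include <string>
#include <unordered_set>
#include <vector>

void DedupKeepLast(std::vector<std::string>* vars) {
  std::unordered_set<std::string> seen;  // names kept further to the right
  for (int i = static_cast<int>(vars->size()) - 1; i >= 0; --i) {
    if (!seen.insert((*vars)[i]).second) {
      vars->erase(vars->begin() + i);  // duplicate of a later entry
    }
  }
}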
std::unordered_set var_out_set; - for (int i = static_cast(var_out_.size()) - 1; i >= 0; --i) { - std::string var_name = var_out_.at(i); - if (var_out_set.count(var_name)) { - var_out_.erase(var_out_.begin() + i); + + for (size_t i = 0; i < var_in_.size(); ++i) { + auto var_name = var_in_[i]; + if (persistables_.find(var_name) == persistables_.end()) { + var_in_updates_.emplace_back(i); } - var_out_set.insert(var_name); } } @@ -465,17 +432,26 @@ void NgraphEngine::BuildNgNodes() { } } } + NgraphBridge ngb(var_node_map_); for (auto& op : fused_ops_) { ngb.BuildNgNode(op); } } -std::shared_ptr NgraphEngine::BuildNgFunction( - const framework::ExecutionContext& ctx) { - Prepare(ctx); +void NgraphEngine::RunInferShape() { + for (auto& op : fused_ops_) { + framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); + op->RuntimeInferShape(scope_, place_, ctx); + } +} + +void NgraphEngine::BuildNgFunction(const std::vector& interval) { + Prepare(interval); + RunInferShape(); GetNgInputShape(); BuildNgNodes(); + ngraph_function_ = nullptr; ngraph::NodeVector func_outputs; ngraph::ParameterVector func_inputs; @@ -490,105 +466,93 @@ std::shared_ptr NgraphEngine::BuildNgFunction( func_inputs.emplace_back(prm); } - return std::make_shared(func_outputs, func_inputs); -} - -void NgraphEngine::ClearNgCache() { - auto it = engine_cache.begin(); - while (it != engine_cache.end()) { - auto ng_engine = it->second; - backend_->remove_compiled_function(ng_engine.ngraph_handle); - ++it; - } - engine_cache.clear(); - auto it_tensor = t_in_cache_.begin(); - while (it_tensor != t_in_cache_.end()) { - auto t_vec = it_tensor->second; - for (auto t_in : t_vec) { - t_in.reset(); - } - ++it_tensor; - } - t_in_cache_.clear(); + ngraph_function_ = + std::make_shared(func_outputs, func_inputs); } -void NgraphEngine::GetNgFunction(const framework::ExecutionContext& ctx) { - auto interval = ctx.Attr>("interval"); - std::string engine_key = ctx.Attr("engine_key"); - - // set to flase, to debug cache or recompile everytime. 
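// GetNgFunction below derives its cache key from the shapes of at most ten
// feed variables plus the op interval and engine key, then collapses it with
// std::hash. A sketch with the scope lookup abstracted into a dims list;
// MakeFuncCacheKey is illustrative only, not part of the patch:
#include <algorithm>
#include <functional>
#include <string>
#include <vector>

std::string MakeFuncCacheKey(const std::vector<std::vector<int>>& feed_dims,
                             int interval_begin, int interval_end,
                             const std::string& engine_key) {
  std::string key;
  size_t n = std::min<size_t>(feed_dims.size(), 10);  // same bound as below
  for (size_t i = 0; i < n; ++i) {
    for (int d : feed_dims[i]) key += std::to_string(d);
  }
  key += std::to_string(interval_begin) + "_" + std::to_string(interval_end) +
         engine_key;
  return std::to_string(std::hash<std::string>()(key));
}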
+void NgraphEngine::GetNgFunction(std::string engine_key, + const std::vector& interval) { bool use_cache = true; - if (!use_cache) ClearNgCache(); - - this->func_cache_key_ = ""; - for (int i = 0; i < static_cast(feed_vars.size()); ++i) { - auto* var = scope_.FindVar(feed_vars[i]); - if (var && var->IsType()) { - auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); - auto dims = tensor_pd->dims(); - for (int j = 0; j < dims.size(); ++j) { - func_cache_key_ += std::to_string(dims[j]); + if (use_cache) { + this->func_cache_key_ = ""; + for (int i = 0; i < std::min(static_cast(feed_vars.size()), 10); ++i) { + auto* var = scope_.FindVar(feed_vars[i]); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto dims = tensor_pd->dims(); + for (int j = 0; j < dims.size(); ++j) { + func_cache_key_ += std::to_string(dims[j]); + } } } - } - func_cache_key_ += std::to_string(interval[0]) + "_" + - std::to_string(interval[1]) + engine_key; - func_cache_key_ = std::to_string(std::hash()(func_cache_key_)); - - if (engine_cache.find(func_cache_key_) != engine_cache.end()) { - if (engine_cache[func_cache_key_].persistables.size() == 0) { - ClearNgCache(); - } else { - auto var_name = engine_cache[func_cache_key_].persistables.begin(); - framework::Variable* var = scope_.FindVar(*var_name); - if (var != pre_var_ptr) { - ClearNgCache(); + func_cache_key_ += std::to_string(interval[0]) + "_" + + std::to_string(interval[1]) + engine_key; + func_cache_key_ = std::to_string(std::hash()(func_cache_key_)); + + if (engine_cache.find(func_cache_key_) != engine_cache.end()) { + if (engine_cache[func_cache_key_].persistables.size() == 0) { + engine_cache.clear(); + t_in_cache_.clear(); + } else { + auto var_name = engine_cache[func_cache_key_].persistables.begin(); + framework::Variable* var = scope_.FindVar(*var_name); + if (var != pre_var_ptr) { + engine_cache.clear(); + t_in_cache_.clear(); + } + pre_var_ptr = var; } - pre_var_ptr = var; } - } - if (engine_cache.find(func_cache_key_) == engine_cache.end()) { - if (engine_cache.size() > 5) ClearNgCache(); - auto func = BuildNgFunction(ctx); - // Due to optimization backend may produce results in other layouts, - // make sure we get default layout for results. 
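// The cache checks added above clear engine_cache and t_in_cache_ whenever
// the first recorded persistable name resolves to a different Variable*,
// which signals that the scope (and hence the weights) was swapped out.
// A condensed sketch of that guard with the cache types simplified:
template <typename EngineMap, typename TensorMap>
const void* InvalidateIfScopeChanged(const void* current_var,
                                     const void* previous_var,
                                     EngineMap* engine_cache,
                                     TensorMap* tensor_cache) {
  if (current_var != previous_var) {
    engine_cache->clear();  // compiled functions captured the old scope
    tensor_cache->clear();  // cached input tensors did too
  }
  return current_var;  // caller stores this as the new previous_var
}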
- for (auto& r : func->get_results()) { - r->set_needs_default_layout(true); + if (engine_cache.find(func_cache_key_) == engine_cache.end()) { + BuildNgFunction(interval); + engine_cache[func_cache_key_].ngraph_function = this->ngraph_function_; + engine_cache[func_cache_key_].persistables = this->persistables_; + engine_cache[func_cache_key_].var_in_updates = this->var_in_updates_; + engine_cache[func_cache_key_].var_in = this->var_in_; + engine_cache[func_cache_key_].var_out = this->var_out_; + engine_cache[func_cache_key_].is_test = this->is_test_; } - engine_cache[func_cache_key_].ngraph_handle = backend_->compile(func); - engine_cache[func_cache_key_].persistables = this->persistables_; - engine_cache[func_cache_key_].var_in_updates = this->var_in_updates_; - engine_cache[func_cache_key_].var_in = this->var_in_; - engine_cache[func_cache_key_].var_out = this->var_out_; - engine_cache[func_cache_key_].is_test = this->is_test_; + } else { + BuildNgFunction(interval); } } void NgraphEngine::Run(const framework::Scope& scope, const platform::Place& place) const { - std::shared_ptr ng_handle; + std::shared_ptr ng_func; const std::set* p_persistables; const std::vector* p_var_in_updates; const std::vector* p_var_in; const std::vector* p_var_out; bool is_test; - PADDLE_ENFORCE(engine_cache.find(func_cache_key_) != engine_cache.end(), - "Cannot find cached data to run ngraph function"); - ng_handle = engine_cache[func_cache_key_].ngraph_handle; - p_persistables = &(engine_cache[func_cache_key_].persistables); - p_var_in_updates = &(engine_cache[func_cache_key_].var_in_updates); - p_var_in = &(engine_cache[func_cache_key_].var_in); - p_var_out = &(engine_cache[func_cache_key_].var_out); - is_test = engine_cache[func_cache_key_].is_test; + bool use_cache = true; + if (use_cache) { + PADDLE_ENFORCE(engine_cache.find(func_cache_key_) != engine_cache.end(), + "Cannot find cached data to run ngraph function"); + ng_func = engine_cache[func_cache_key_].ngraph_function; + p_persistables = &(engine_cache[func_cache_key_].persistables); + p_var_in_updates = &(engine_cache[func_cache_key_].var_in_updates); + p_var_in = &(engine_cache[func_cache_key_].var_in); + p_var_out = &(engine_cache[func_cache_key_].var_out); + is_test = engine_cache[func_cache_key_].is_test; + } else { + ng_func = ngraph_function_; + p_persistables = &this->persistables_; + p_var_in_updates = &this->var_in_updates_; + p_var_in = &this->var_in_; + p_var_out = &this->var_out_; + is_test = this->is_test_; + } std::vector>* p_t_in; std::vector> t_in = {}; - auto m_parameters = ng_handle->get_parameters(); - auto m_results = ng_handle->get_results(); - if (is_test && t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) { + auto m_parameters = ng_func->get_parameters(); + auto m_results = ng_func->get_results(); + if (is_test && use_cache && + t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) { p_t_in = &(t_in_cache_[func_cache_key_]); for (size_t i = 0; i < p_var_in_updates->size(); ++i) { int index = p_var_in_updates->at(i); @@ -607,7 +571,7 @@ void NgraphEngine::Run(const framework::Scope& scope, } } } else { - if (is_test) { + if (is_test && use_cache) { p_t_in = &(t_in_cache_[func_cache_key_]); } else { p_t_in = &t_in; @@ -637,21 +601,6 @@ void NgraphEngine::Run(const framework::Scope& scope, } } - for (auto& op : fused_ops_) { - framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); - if (op->Type() == "reshape2_grad") { - auto xshape_name = op->Inputs().at("XShape").at(0); - auto* xshape_var = 
scope_.FindVar(xshape_name); - auto* xshape_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*xshape_var); - auto& xshape_ddim = xshape_tensor->dims(); - auto xgrad_name = op->Outputs().at(framework::GradVarName("X")).at(0); - auto* xgrad_var = scope_.FindVar(xgrad_name); - xgrad_var->GetMutable()->Resize(xshape_ddim); - } else { - op->RuntimeInferShape(scope_, place_, ctx); - } - } - std::vector> t_out = {}; for (size_t i = 0; i < p_var_out->size(); ++i) { auto vo = p_var_out->at(i); @@ -670,7 +619,8 @@ void NgraphEngine::Run(const framework::Scope& scope, } } - ng_handle->call(t_out, *p_t_in); + auto handle = backend_->compile(ng_func); + handle->call_with_validate(t_out, *p_t_in); } // NgraphEngine::Run } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h index 4cb14653713..19400ac5b0e 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.h +++ b/paddle/fluid/operators/ngraph/ngraph_engine.h @@ -40,7 +40,7 @@ enum class OpState { /* nGraph support state on ops */ // cache engine repetitives struct EngineCache { - std::shared_ptr ngraph_handle; + std::shared_ptr ngraph_function; std::set persistables; std::vector var_in; std::vector var_out; @@ -84,6 +84,8 @@ class NgraphEngine { // ngraph backend eg. CPU static std::shared_ptr backend_; + // ngraph function to call and execute + std::shared_ptr ngraph_function_; // var_name of inputs std::vector var_in_; // var_name of outputs from fetch in order @@ -99,7 +101,7 @@ class NgraphEngine { std::unordered_map>> var_node_map_; // prepare info for ngraph engine need - void Prepare(const framework::ExecutionContext& ctx); + void Prepare(const std::vector& interval); // get ngraph engine input and output list void BuildNgIO(const std::vector& op_descs, const std::vector& interval); @@ -107,13 +109,12 @@ class NgraphEngine { void GetNgInputShape(); // Call ngraph bridge to map ops void BuildNgNodes(); + // run paddle RuntimeInferShape to get the tensor shape + void RunInferShape(); // build ngraph function call - std::shared_ptr BuildNgFunction( - const framework::ExecutionContext& ctx); - // clear ngraph engine cache and t_in cache - void ClearNgCache(); + void BuildNgFunction(const std::vector& interval); // Check cache for ngraph function or otherwise build the function - void GetNgFunction(const framework::ExecutionContext& ctx); + void GetNgFunction(std::string engine_key, const std::vector& interval); }; } // namespace operators diff --git a/paddle/fluid/operators/ngraph/ops/activation_op.h b/paddle/fluid/operators/ngraph/ops/activation_op.h index 884ec659267..a66ec65a336 100644 --- a/paddle/fluid/operators/ngraph/ops/activation_op.h +++ b/paddle/fluid/operators/ngraph/ops/activation_op.h @@ -26,52 +26,6 @@ namespace paddle { namespace operators { namespace ngraphs { -void BuildGeluNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto input = platform::GetInputNode(op, "X", ngb_node_map); - auto half = paddle::platform::CreateConstant(input->get_element_type(), - input->get_shape(), {0.5}); - auto one = paddle::platform::CreateConstant(input->get_element_type(), - input->get_shape(), {1}); - auto sqrt_two = - std::make_shared(paddle::platform::CreateConstant( - input->get_element_type(), input->get_shape(), {2})); - auto out = half * input * - (one + std::make_shared(input / sqrt_two)); - platform::SetOutputNode(op, "Out", out, ngb_node_map); -} - -void BuildGeluGradNode( - const 
std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto input = platform::GetInputNode(op, "X", ngb_node_map); - auto dout = platform::GetInputNode(op, "Out@GRAD", ngb_node_map); - auto half = paddle::platform::CreateConstant(input->get_element_type(), - input->get_shape(), {0.5}); - auto minus_half = paddle::platform::CreateConstant( - input->get_element_type(), input->get_shape(), {-0.5}); - auto one = paddle::platform::CreateConstant(input->get_element_type(), - input->get_shape(), {1}); - auto two = paddle::platform::CreateConstant(input->get_element_type(), - input->get_shape(), {2}); - auto pi = paddle::platform::CreateConstant( - input->get_element_type(), input->get_shape(), {3.14159265359}); - auto sqrt_two = std::make_shared(two); - auto sqrt_pi = std::make_shared(pi); - - auto first = - half * (one + std::make_shared(input * one / sqrt_two)); - auto second = half * (two / sqrt_pi) * (one / sqrt_two) * input * - std::make_shared(minus_half * input * input); - auto gelu_grad = dout * (first + second); - platform::SetOutputNode(op, "X@GRAD", gelu_grad, ngb_node_map); -} - void BuildReluGradNode( const std::shared_ptr& op, std::shared_ptr< @@ -83,16 +37,6 @@ void BuildReluGradNode( platform::SetOutputNode(op, "X@GRAD", relu_grad, ngb_node_map); } -void BuildSquareNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto input = platform::GetInputNode(op, "X", ngb_node_map); - auto out = input * input; - platform::SetOutputNode(op, "Out", out, ngb_node_map); -} - void BuildTanhGradNode( const std::shared_ptr& op, std::shared_ptr< @@ -110,8 +54,5 @@ void BuildTanhGradNode( } // namespace operators } // namespace paddle -REGISTER_NG_OP(gelu, BuildGeluNode); -REGISTER_NG_OP(gelu_grad, BuildGeluGradNode); REGISTER_NG_OP(relu_grad, BuildReluGradNode); -REGISTER_NG_OP(square, BuildSquareNode); REGISTER_NG_OP(tanh_grad, BuildTanhGradNode); diff --git a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h index b8e9f3d8584..2d11775849a 100644 --- a/paddle/fluid/operators/ngraph/ops/binary_unary_op.h +++ b/paddle/fluid/operators/ngraph/ops/binary_unary_op.h @@ -51,11 +51,6 @@ static void BuildUnaryNode( } // namespace operators } // namespace paddle -REGISTER_NG_OP(abs, BuildUnaryNode); REGISTER_NG_OP(relu, BuildUnaryNode); REGISTER_NG_OP(tanh, BuildUnaryNode); REGISTER_NG_OP(sigmoid, BuildUnaryNode); - -REGISTER_NG_OP(logical_and, BuildBinaryNode); -REGISTER_NG_OP(logical_or, BuildBinaryNode); -REGISTER_NG_OP(logical_not, BuildUnaryNode); diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h index b8ad7491d57..be766ebeb47 100644 --- a/paddle/fluid/operators/ngraph/ops/conv2d_op.h +++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h @@ -239,4 +239,3 @@ void BuildConv2dGradNode( REGISTER_NG_OP(conv2d, BuildConv2dNode); REGISTER_NG_OP(conv2d_grad, BuildConv2dGradNode); -REGISTER_NG_OP(depthwise_conv2d, BuildConv2dNode); diff --git a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h index bc91be45325..c92ebb7e96f 100644 --- a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h +++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h @@ -26,82 +26,59 @@ limitations under the License. 
*/ namespace paddle { namespace operators { namespace ngraphs { -std::shared_ptr remove_trailing_one( - const std::shared_ptr& input) { - auto shape = input->get_shape(); - if (shape.back() == 1) { - shape.pop_back(); - return platform::NgReshaper(input, shape); - } else { - return input; - } -} -std::shared_ptr flatten_node( - const std::shared_ptr& input) { - auto shape = input->get_shape(); - auto rank = shape.size(); - auto output = input; - if (rank > 2) { - auto shape_2d = paddle::platform::FlattenTo2d(shape, rank - 1); - output = paddle::platform::NgReshaper(input, shape_2d); +std::shared_ptr GetCrossEntropy( + std::shared_ptr x, std::shared_ptr label, + const bool is_soft_label, int ignore_index) { + auto label_shape = label->get_shape(); + auto x_shape = x->get_shape(); + auto label_rank = label_shape.size(); + auto x_rank = x_shape.size(); + std::shared_ptr x_2d = x, label_2d = label; + auto label_2d_shape = label_shape, x_2d_shape = x_shape; + + if (label_rank > 2) { + label_2d_shape = paddle::platform::FlattenTo2d(label_shape, label_rank - 1); + label_2d = paddle::platform::NgReshaper(label, label_2d_shape); } - return output; -} - -std::shared_ptr convert_to_node_type( - const std::shared_ptr& input, - const std::shared_ptr& ref) { - auto output = input; - if (input->get_element_type() != ref->get_element_type()) { - output = - std::make_shared(input, ref->get_element_type()); + if (x_rank > 2) { + x_2d_shape = platform::FlattenTo2d(x_shape, x_rank - 1); + x_2d = platform::NgReshaper(x, x_2d_shape); } - return output; -} -std::shared_ptr create_xe( - const std::shared_ptr& one_hot, - const std::shared_ptr& x) { - auto node_log = std::make_shared(x); + auto batch_size = x_2d_shape.at(0); - auto node_mul = one_hot * node_log; - auto node_sum = std::make_shared( - node_mul, ngraph::AxisSet{x->get_shape().size() - 1}); - - auto shape = x->get_shape(); - shape.back() = 1; - return platform::NgReshaper(-node_sum, shape); -} -std::shared_ptr create_mask( - const std::shared_ptr& label, int ignore_index) { - auto ignore_node = paddle::platform::CreateConstant( - label->get_element_type(), label->get_shape(), {ignore_index}); - auto not_equal_node = - std::make_shared(label, ignore_node); - return not_equal_node; -} - -std::shared_ptr create_one_hot( - const std::shared_ptr& label, - const std::shared_ptr& x) { - auto label_shape = label->get_shape(); - return std::make_shared( - remove_trailing_one(label), x->get_shape(), x->get_shape().size() - 1); -} - -std::shared_ptr GetCrossEntropy( - std::shared_ptr x, std::shared_ptr label, - const bool is_soft_label, int ignore_index) { - std::shared_ptr node_1_hot = label; + std::shared_ptr node_1_hot = label_2d; if (!is_soft_label) { - node_1_hot = create_one_hot(label, x); + auto label_1d = + platform::NgReshaper(label_2d, ngraph::Shape{label_2d_shape.at(0)}); + node_1_hot = std::make_shared(label_1d, x_2d_shape, 1); + } + if (x->get_element_type() != node_1_hot->get_element_type()) { + node_1_hot = std::make_shared(node_1_hot, + x->get_element_type()); } - node_1_hot = convert_to_node_type(node_1_hot, x); - auto xe = create_xe(node_1_hot, x); + auto node_log = std::make_shared(x_2d); + auto high_clip = ngraph::op::Constant::create(node_log->get_element_type(), + node_log->get_shape(), {1e20}); + auto low_clip = ngraph::op::Constant::create(node_log->get_element_type(), + node_log->get_shape(), {-1e20}); + auto node_min = std::make_shared(node_log, high_clip); + auto node_max = std::make_shared(node_min, low_clip); + auto node_mul = 
node_1_hot * node_log; + auto node_sum = + std::make_shared(node_mul, ngraph::AxisSet{1}); + auto node_neg = std::make_shared(node_sum); + auto xe = platform::NgReshaper(node_neg, ngraph::Shape{batch_size, 1}); + if (!is_soft_label) { - auto mask = convert_to_node_type(create_mask(label, ignore_index), xe); + auto ignore_node = ngraph::op::Constant::create( + label->get_element_type(), label_2d_shape, {ignore_index}); + auto not_equal_node = + std::make_shared(label_2d, ignore_node); + auto mask = std::make_shared(not_equal_node, + xe->get_element_type()); xe = xe * mask; } return xe; @@ -116,17 +93,30 @@ std::shared_ptr GetCrossEntropyGrad( std::shared_ptr mask; if (!is_soft_label) { - mask = convert_to_node_type(create_mask(label, ignore_index), x); - mask = std::make_shared( - remove_trailing_one(mask), x_shape, ngraph::AxisSet{rank - 1}); - label = create_one_hot(label, x); + auto label_shape = label->get_shape(); + label_shape.pop_back(); + label = platform::NgReshaper(label, label_shape); + + auto ignore_node = ngraph::op::Constant::create( + label->get_element_type(), label_shape, {ignore_index}); + auto not_equal_node = + std::make_shared(label, ignore_node); + mask = std::make_shared(not_equal_node, + x->get_element_type()); + mask = std::make_shared(mask, x_shape, + ngraph::AxisSet{rank - 1}); + + label = std::make_shared(label, x_shape, rank - 1); } - auto dy_reshape = remove_trailing_one(dy); + auto dy_shape = dy->get_shape(); + dy_shape.pop_back(); + auto dy_reshape = platform::NgReshaper(dy, dy_shape); auto dy_bcast = std::make_shared( dy_reshape, x_shape, ngraph::AxisSet{rank - 1}); - - label = convert_to_node_type(label, x); + if (x->get_element_type() != label->get_element_type()) { + label = std::make_shared(label, x->get_element_type()); + } auto xe_grad = -label * dy_bcast / x; @@ -164,80 +154,9 @@ void BuildCrossEntropyGradNode( auto xe_grad = GetCrossEntropyGrad(x, label, dy, is_soft_label, ignore_index); paddle::platform::SetOutputNode(op, "X@GRAD", xe_grad, ngb_node_map); } - -void BuildCrossEntropy2Node( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); - auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); - auto op_attrs = paddle::framework::AttrReader(op->Attrs()); - int ignore_index = op_attrs.Get("ignore_index"); - - auto rank = x->get_shape().size(); - - auto one_hot = convert_to_node_type(create_one_hot(label, x), x); - auto xe = create_xe(one_hot, x); - auto mask = convert_to_node_type(create_mask(label, ignore_index), xe); - - xe = xe * mask; - - std::shared_ptr node_sum = - std::make_shared(one_hot * x, ngraph::AxisSet{rank - 1}); - node_sum = paddle::platform::NgReshaper(node_sum, mask->get_shape()); - auto matchx = mask * node_sum; - - paddle::platform::SetOutputNode(op, "MatchX", matchx, ngb_node_map); - platform::SetOutputNode(op, "XShape", x, ngb_node_map); - paddle::platform::SetOutputNode(op, "Y", xe, ngb_node_map); -} - -void BuildCrossEntropyGrad2Node( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto op_attrs = paddle::framework::AttrReader(op->Attrs()); - int ignore_index = op_attrs.Get("ignore_index"); - auto matchx = paddle::platform::GetInputNode(op, "MatchX", ngb_node_map); - auto label = paddle::platform::GetInputNode(op, "Label", ngb_node_map); - auto x = paddle::platform::GetInputNode(op, "XShape", ngb_node_map); - auto dy = paddle::platform::GetInputNode(op, 
framework::GradVarName("Y"), - ngb_node_map); - - matchx = remove_trailing_one(matchx); - label = remove_trailing_one(label); - x = remove_trailing_one(x); - dy = remove_trailing_one(dy); - - auto x_shape = x->get_shape(); - auto rank = x_shape.size(); - - auto one_hot = convert_to_node_type(create_one_hot(label, x), x); - auto mask = convert_to_node_type(create_mask(label, ignore_index), x); - - auto zero = paddle::platform::CreateConstant(matchx->get_element_type(), - matchx->get_shape(), {0}); - auto one = paddle::platform::CreateConstant(matchx->get_element_type(), - matchx->get_shape(), {1}); - auto is_zero = std::make_shared(matchx, zero); - matchx = std::make_shared(is_zero, one, matchx); - - auto dy_bcast = std::make_shared( - mask * dy, x_shape, ngraph::AxisSet{rank - 1}); - auto matchx_bcast = std::make_shared( - matchx, x_shape, ngraph::AxisSet{rank - 1}); - - auto xe_grad = -dy_bcast * one_hot / matchx_bcast; - paddle::platform::SetOutputNode(op, framework::GradVarName("X"), xe_grad, - ngb_node_map); -} } // namespace ngraphs } // namespace operators } // namespace paddle REGISTER_NG_OP(cross_entropy, BuildCrossEntropyNode); REGISTER_NG_OP(cross_entropy_grad, BuildCrossEntropyGradNode); -REGISTER_NG_OP(cross_entropy2, BuildCrossEntropy2Node); -REGISTER_NG_OP(cross_entropy_grad2, BuildCrossEntropyGrad2Node); diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_binary_prepare_node.h b/paddle/fluid/operators/ngraph/ops/elementwise_binary_prepare_node.h index e4e17f5bb21..872bf006c1c 100644 --- a/paddle/fluid/operators/ngraph/ops/elementwise_binary_prepare_node.h +++ b/paddle/fluid/operators/ngraph/ops/elementwise_binary_prepare_node.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include #include - #include "ngraph/ngraph.hpp" #include "paddle/fluid/platform/ngraph_helper.h" @@ -44,11 +43,11 @@ ngraph::NodeVector ElementwiseBinaryNodePrepare( if (lhs_shape == rhs_shape) { return ngraph::NodeVector{lhs, rhs}; } - axis = (rhs_shape.size() == 0) ? lhs_shape.size() - 1 : axis; axis = (axis == -1 ? lhs_shape.size() - rhs_shape.size() : axis); PADDLE_ENFORCE(axis >= 0 && axis < (int)(lhs_shape.size()), "Axis should be in range [0, lhs_shape)"); paddle::platform::TrimTrailingSingularDims(&rhs_shape); + axis = (rhs_shape.size() == 0) ? lhs_shape.size() : axis; int pre, n, post; paddle::platform::GetMidDims(lhs_shape, rhs_shape, axis, &pre, &n, &post); diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_node.h b/paddle/fluid/operators/ngraph/ops/elementwise_node.h index 2b10af4588c..1de63a9ee6e 100644 --- a/paddle/fluid/operators/ngraph/ops/elementwise_node.h +++ b/paddle/fluid/operators/ngraph/ops/elementwise_node.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,10 +17,8 @@ limitations under the License. 
*/ #include #include #include - #include "ngraph/ngraph.hpp" #include "paddle/fluid/operators/ngraph/ops/elementwise_binary_prepare_node.h" -#include "paddle/fluid/operators/ngraph/ops/op_bridge.h" #include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { @@ -61,17 +59,6 @@ void BuildElementwiseCompareNode( auto out = std::make_shared(x, y); paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map); } - } // namespace ngraphs } // namespace operators } // namespace paddle - -REGISTER_NG_OP(elementwise_max, - BuildElementwiseBinaryNode); -REGISTER_NG_OP(elementwise_pow, BuildElementwiseBinaryNode); -REGISTER_NG_OP(elementwise_sub, - BuildElementwiseBinaryNode); -REGISTER_NG_OP(elementwise_min, - BuildElementwiseBinaryNode); -REGISTER_NG_OP(less_than, BuildElementwiseCompareNode); -REGISTER_NG_OP(elementwise_div, BuildElementwiseBinaryNode); diff --git a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h index fee5f57e486..42c2df52592 100644 --- a/paddle/fluid/operators/ngraph/ops/fill_constant_op.h +++ b/paddle/fluid/operators/ngraph/ops/fill_constant_op.h @@ -38,9 +38,20 @@ void BuildFillConstantNode( shape.push_back(sp); } float value = op_attrs.Get("value"); - auto ng_dtype = - platform::GetNgType(static_cast( - op_attrs.Get("dtype"))); + ngraph::element::Type ng_dtype; + auto data_type = static_cast( + op_attrs.Get("dtype")); + if (data_type == paddle::framework::proto::VarType::FP32) { + ng_dtype = ngraph::element::f32; + } else if (data_type == paddle::framework::proto::VarType::FP64) { + ng_dtype = ngraph::element::f64; + } else if (data_type == paddle::framework::proto::VarType::INT64) { + ng_dtype = ngraph::element::i64; + } else if (data_type == paddle::framework::proto::VarType::INT32) { + ng_dtype = ngraph::element::i32; + } else { + PADDLE_THROW("unsupported data type: %s", data_type); + } auto out = ngraph::op::Constant::create(ng_dtype, shape, {value}); paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map); } diff --git a/paddle/fluid/operators/ngraph/ops/mul_op.h b/paddle/fluid/operators/ngraph/ops/mul_op.h index cb46478ee8a..d13665864b8 100644 --- a/paddle/fluid/operators/ngraph/ops/mul_op.h +++ b/paddle/fluid/operators/ngraph/ops/mul_op.h @@ -35,7 +35,6 @@ static void BuildMulNode( int y_num_col_dims = op_attrs.Get("y_num_col_dims"); auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map); - int y_rank = y->get_shape().size(); auto x_reshape = x; auto y_reshape = y; @@ -53,14 +52,10 @@ static void BuildMulNode( std::shared_ptr out = std::make_shared(x_reshape, y_reshape); - ngraph::Shape out_shape; - for (int i = 0; i < x_num_col_dims; ++i) { - out_shape.push_back(x->get_shape()[i]); + auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map); + if (dummy_out && dummy_out->get_shape() != out->get_shape()) { + out = paddle::platform::NgReshaper(out, dummy_out->get_shape()); } - for (int i = y_num_col_dims; i < y_rank; ++i) { - out_shape.push_back(y->get_shape()[i]); - } - out = paddle::platform::NgReshaper(out, out_shape); paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map); } diff --git a/paddle/fluid/operators/ngraph/ops/pool2d_op.h b/paddle/fluid/operators/ngraph/ops/pool2d_op.h index e5542d47157..c7b9c931617 100644 --- a/paddle/fluid/operators/ngraph/ops/pool2d_op.h +++ b/paddle/fluid/operators/ngraph/ops/pool2d_op.h @@ -60,20 +60,17 @@ void BuildPool2dNode( ngraph::Strides 
ng_strides{static_cast(strides.at(0)), static_cast(strides.at(1))}; - auto ComputeFlooredOutput = [](size_t in, size_t k, size_t p, size_t s) { - return (in - k + 2 * p) / s + 1; - }; auto ComputeCeiledOutput = [](size_t in, size_t k, size_t p, size_t s) { - return ceil(static_cast(in - k + 2 * p) / s) + 1; + return (in - k + 2 * p) / s + 1; }; if (op_attrs.Get("ceil_mode")) { + auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map); + auto dummy_shape = dummy_out->get_shape(); for (size_t i = 0; i < ng_padding_above.size(); ++i) { - auto ceiled_size = ComputeCeiledOutput(x_shape[i + 2], ksize[i], - paddings[i], strides[i]); - auto floored_size = ComputeFlooredOutput(x_shape[i + 2], ksize[i], - paddings[i], strides[i]); - if (ceiled_size != floored_size) { + auto desired_size = ComputeCeiledOutput(x_shape[i + 2], ksize[i], + paddings[i], strides[i]); + if (desired_size != dummy_shape[i + 2]) { ng_padding_above[i] += strides[i]; } } @@ -99,10 +96,6 @@ void BuildPool2dNode( pool2d = std::make_shared(x, ng_ksize_shape, ng_strides); } else { - if ((ng_padding_below[0] == 0) && (ng_padding_below[1] == 0) && - (ng_padding_above[0] == 0) && (ng_padding_above[1] == 0)) { - padding_exclusive = false; - } pool2d = std::make_shared( x, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above, !padding_exclusive); @@ -170,10 +163,6 @@ void BuildPool2dGradNode( x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above, !padding_exclusive); } else { - if ((ng_padding_below[0] == 0) && (ng_padding_below[1] == 0) && - (ng_padding_above[0] == 0) && (ng_padding_above[1] == 0)) { - padding_exclusive = false; - } pool2d_grad = std::make_shared( x->get_shape(), dout, ng_ksize_shape, ng_strides, ng_padding_below, ng_padding_above, !padding_exclusive); diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc index cbb0c4028b3..626895f49d8 100644 --- a/paddle/fluid/operators/one_hot_op.cc +++ b/paddle/fluid/operators/one_hot_op.cc @@ -13,8 +13,6 @@ // limitations under the License. #include "paddle/fluid/operators/one_hot_op.h" -#include -#include #include "paddle/fluid/framework/framework.pb.h" namespace paddle { @@ -36,34 +34,15 @@ class OneHotOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U, "Last dimension of Input(X) should be 1."); } - - framework::DDim out_dims(x_dims); int depth = ctx->Attrs().Get("depth"); - if (ctx->HasInput("depth_tensor")) { - depth = -1; - } + PADDLE_ENFORCE_GT(depth, 0, "Should provide a positive depth (%d).", depth); + + framework::DDim out_dims(x_dims); out_dims[out_dims.size() - 1] = depth; ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", /* --> */ "Out"); } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); - } - - framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const override { - if (var_name == "depth_tensor") { - return expected_kernel_type; - } - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } }; class OneHotOpMaker : public framework::OpProtoAndCheckerMaker { @@ -73,15 +52,11 @@ class OneHotOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor, LoDTensor) Input variable with rank at least 2. 
" "The last dimension of X should be 1. Each value of X is an index " "to indicate the position."); - AddInput("depth_tensor", "(Tensor, Tensor), Length of one-hot vector") - .AsDispensable(); AddOutput("Out", "(Tensor, Tensor) Output tensor with same rank as X. " "The tensor consists of one-hot representations of values in X."); - AddAttr("depth", - "A positive integer to specify the length of one-hot vector.") - .SetDefault(-1); + "A positive integer to specify the length of one-hot vector."); AddAttr("dtype", "An integer to specify the data type of one-hot " "vector. The default value is FP32.") diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index b9fe0bf2e9d..59d8b9b8a8d 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -62,25 +62,8 @@ class OneHotCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); auto* out = context.Output("Out"); + int depth = context.Attr("depth"); - int depth = -1; - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - if (platform::is_gpu_place(depth_tensor->place())) { - framework::Tensor temp; - TensorCopySync(*depth_tensor, platform::CPUPlace(), &temp); - depth = *temp.data(); - } else { - depth = *depth_tensor->data(); - } - - auto in_dims = in->dims(); - framework::DDim out_dims(in_dims); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } else { - depth = context.Attr("depth"); - } framework::VisitDataType( static_cast( context.Attr("dtype")), diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h index 7273080927e..1ebd2676496 100644 --- a/paddle/fluid/operators/one_hot_op.h +++ b/paddle/fluid/operators/one_hot_op.h @@ -49,7 +49,6 @@ struct OneHotOpFunctor { }; using LoDTensor = framework::LoDTensor; -using Tensor = framework::Tensor; template class OneHotKernel : public framework::OpKernel { public: @@ -57,15 +56,6 @@ class OneHotKernel : public framework::OpKernel { auto* in = context.Input("X"); auto* out = context.Output("Out"); int depth = context.Attr("depth"); - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - auto* depth_data = depth_tensor->data(); - depth = depth_data[0]; - auto in_dims = in->dims(); - framework::DDim out_dims(in_dims); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } framework::VisitDataType( static_cast( diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index dd347aa0afe..54e0f5146da 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -18,64 +18,67 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; - -void AdamOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(Param) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment1"), - "Input(Moment1) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment2"), - "Input(Moment2) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - "Input(LearningRate) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), - "Input(Beta1Pow) of AdamOp should not be null."); - 
PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), - "Input(Beta2Pow) of AdamOp should not be null."); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), - "Output(Moment1Out) of AdamOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), - "Output(Moment2Out) of AdamOp should not be null."); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning rate should have 1 dimension"); - auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, - "Beta1 power accumulator should have 1 dimension"); - auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); - PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, - "Beta2 power accumulator should have 1 dimension"); - - auto param_dims = ctx->GetInputDim("Param"); - if (ctx->GetInputsVarType("Grad")[0] == - framework::proto::VarType::LOD_TENSOR) { +class AdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(Param) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment1"), + "Input(Moment1) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Moment2"), + "Input(Moment2) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), + "Input(Beta1Pow) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"), + "Input(Beta2Pow) of AdamOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"), + "Output(Moment1Out) of AdamOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"), + "Output(Moment2Out) of AdamOp should not be null."); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning rate should have 1 dimension"); + auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1, + "Beta1 power accumulator should have 1 dimension"); + auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); + PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1, + "Beta2 power accumulator should have 1 dimension"); + + auto param_dims = ctx->GetInputDim("Param"); + if (ctx->GetInputsVarType("Grad")[0] == + framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + "Param and Grad input of AdamOp should have same dimension"); + } + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment1"), + "Param and Moment1 input of AdamOp should have same dimension"); PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - "Param and Grad input of AdamOp should have same dimension"); + param_dims, ctx->GetInputDim("Moment2"), + "Param and Moment2 input of AdamOp should have same dimension"); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("Moment1Out", param_dims); + ctx->SetOutputDim("Moment2Out", param_dims); + } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto 
input_data_type = ctx.Input("Param")->type(); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment1"), - "Param and Moment1 input of AdamOp should have same dimension"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment2"), - "Param and Moment2 input of AdamOp should have same dimension"); - - ctx->SetOutputDim("ParamOut", param_dims); - ctx->SetOutputDim("Moment1Out", param_dims); - ctx->SetOutputDim("Moment2Out", param_dims); -} - -framework::OpKernelType AdamOp::GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - auto input_data_type = ctx.Input("Param")->type(); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); -} +}; class AdamOpMaker : public framework::OpProtoAndCheckerMaker { public: diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 1cc34f11d09..6262ef0c2d3 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -29,15 +29,6 @@ namespace operators { namespace scatter = paddle::operators::math::scatter; -class AdamOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override; - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override; -}; - struct GPUAdam; struct CPUAdam; diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index f686e5293b0..6a5bf170600 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -26,19 +26,6 @@ const char kForward[] = "FORWARD"; const char kBackward[] = "BACKWARD"; const char kBoth[] = "BOTH"; -class LogGuard { - public: - inline LogGuard() { LogMutex().lock(); } - - inline ~LogGuard() { LogMutex().unlock(); } - - private: - static std::mutex &LogMutex() { - static std::mutex mtx; - return mtx; - } -}; - struct Formater { std::string message; std::string name; @@ -47,54 +34,48 @@ struct Formater { framework::LoD lod; int summarize; void *data{nullptr}; - platform::Place place; - std::stringstream logs; void operator()(size_t size) { PrintMessage(); - PrintPlaceInfo(); PrintName(); PrintDims(); PrintDtype(); PrintLod(); PrintData(size); - LogGuard guard; - CLOG << logs.str(); } private: - void PrintPlaceInfo() { logs << "The place is:" << place << std::endl; } - void PrintMessage() { logs << std::time(nullptr) << "\t" << message << "\t"; } + void PrintMessage() { CLOG << std::time(nullptr) << "\t" << message << "\t"; } void PrintName() { if (!name.empty()) { - logs << "Tensor[" << name << "]" << std::endl; + CLOG << "Tensor[" << name << "]" << std::endl; } } void PrintDims() { if (!dims.empty()) { - logs << "\tshape: ["; + CLOG << "\tshape: ["; for (auto i : dims) { - logs << i << ","; + CLOG << i << ","; } - logs << "]" << std::endl; + CLOG << "]" << std::endl; } } void PrintDtype() { if (!framework::IsType(dtype)) { - logs << "\tdtype: " << dtype.name() << std::endl; + CLOG << "\tdtype: " << dtype.name() << std::endl; } } void PrintLod() { if (!lod.empty()) { - logs << "\tLoD: ["; + CLOG << "\tLoD: ["; for (auto level : lod) { - logs << "[ "; + CLOG << "[ "; for (auto i : level) { - logs << i << ","; + CLOG << i << ","; } - logs << " ]"; + CLOG << " ]"; } - logs << "]" << std::endl; + CLOG << "]" << std::endl; } } @@ -112,57 +93,56 @@ struct Formater { } else if 
(framework::IsType(dtype)) { Display(size); } else { - logs << "\tdata: unprintable type: " << dtype.name() << std::endl; + CLOG << "\tdata: unprintable type: " << dtype.name() << std::endl; } } template void Display(size_t size) { auto *d = reinterpret_cast(data); - logs << "\tdata: "; + CLOG << "\tdata: "; if (summarize != -1) { summarize = std::min(size, (size_t)summarize); for (int i = 0; i < summarize; i++) { - logs << d[i] << ","; + CLOG << d[i] << ","; } } else { for (size_t i = 0; i < size; i++) { - logs << d[i] << ","; + CLOG << d[i] << ","; } } - logs << std::endl; + CLOG << std::endl; } }; // TODO(ChunweiYan) there should be some other printers for TensorArray -class PrintOp : public framework::OperatorBase { +class TensorPrintOp : public framework::OperatorBase { public: - PrintOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) + TensorPrintOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} + TensorPrintOp(const TensorPrintOp &o) + : framework::OperatorBase( + static_cast(o)) { + PADDLE_THROW("Not implemented."); + } + private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - const auto in_var = scope.FindVar(Input("In")); - auto out_var = scope.FindVar(Output("Out")); - PADDLE_ENFORCE_NOT_NULL(in_var, "The input should not be found in scope", - Input("In")); - PADDLE_ENFORCE_NOT_NULL(out_var, "The output should not be found in scope", - Output("Out")); - auto &in_tensor = in_var->Get(); - framework::LoDTensor *out_tensor = - out_var->GetMutable(); - - PrintValue(place, Inputs("In").front(), in_tensor); - framework::TensorCopy(in_tensor, place, out_tensor); - out_tensor->set_lod(in_tensor.lod()); - } + const framework::Variable *in_var_ptr = nullptr; + std::string printed_var_name = ""; + + in_var_ptr = scope.FindVar(Input("In")); + printed_var_name = Inputs("In").front(); + + PADDLE_ENFORCE_NOT_NULL(in_var_ptr); + + auto &in_tensor = in_var_ptr->Get(); - void PrintValue(const platform::Place &place, - const std::string &printed_var_name, - const framework::LoDTensor &in_tensor) const { std::string print_phase = Attr("print_phase"); bool is_forward = Attr("is_forward"); @@ -178,16 +158,15 @@ class PrintOp : public framework::OperatorBase { printed_tensor.set_lod(in_tensor.lod()); printed_tensor.Resize(in_tensor.dims()); - if (is_cpu_place(in_tensor.place())) { + if (platform::is_cpu_place(in_tensor.place())) { printed_tensor.ShareDataWith(in_tensor); } else { // copy data to cpu to print platform::CPUPlace place; - TensorCopy(in_tensor, place, &printed_tensor); + framework::TensorCopy(in_tensor, place, &printed_tensor); } Formater formater; - formater.place = place; formater.message = Attr("message"); if (Attr("print_tensor_name")) { formater.name = printed_var_name; @@ -216,7 +195,6 @@ class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("In", "Input tensor to be displayed."); - AddOutput("Out", "The output tensor."); AddAttr("first_n", "Only log `first_n` number of times."); AddAttr("message", "A string message to print as a prefix."); AddAttr("summarize", "Number of elements printed."); @@ -241,23 +219,10 @@ tensor `t`.)DOC"); } }; -class PrintOpInferShape : public framework::InferShapeBase { - public: - void 
operator()(framework::InferShapeContext *ctx) const override { - VLOG(10) << "PrintOpInferShape"; - PADDLE_ENFORCE(ctx->HasInput("In"), "Input(In) should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); - ctx->ShareDim("In", /*->*/ "Out"); - ctx->ShareLoD("In", /*->*/ "Out"); - } -}; - -class PrintOpVarTypeInference : public framework::VarTypeInference { +class InferShapeForward : public framework::InferShapeBase { public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto input_type = ctx->GetType(ctx->Input("In")[0]); - auto out_name = ctx->Output("Out").front(); - ctx->SetType(out_name, input_type); + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null."); } }; @@ -268,8 +233,7 @@ class PrintOpGradientMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { auto *op_desc_ptr = new framework::OpDesc(); op_desc_ptr->SetType("print"); - op_desc_ptr->SetInput("In", OutputGrad("Out")); - op_desc_ptr->SetOutput("Out", InputGrad("In")); + op_desc_ptr->SetInput("In", InputGrad("In")); op_desc_ptr->SetAttrMap(Attrs()); op_desc_ptr->SetAttr("is_forward", false); return std::unique_ptr(op_desc_ptr); @@ -281,6 +245,5 @@ class PrintOpGradientMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; -REGISTER_OPERATOR(print, ops::PrintOp, ops::PrintOpProtoAndCheckMaker, - ops::PrintOpGradientMaker, ops::PrintOpInferShape, - ops::PrintOpVarTypeInference); +REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker, + ops::PrintOpGradientMaker, ops::InferShapeForward); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 16cb08f4190..418c342c8fc 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -14,7 +14,6 @@ #include "paddle/fluid/operators/reader/buffered_reader.h" #include -#include #include #include "paddle/fluid/framework/data_type.h" @@ -168,8 +167,7 @@ void BufferedReader::ReadNextImpl(std::vector *out) { return; } - *out = std::move(platform::is_gpu_place(place_) ? gpu_buffer_[i] - : cpu_buffer_[i]); + *out = platform::is_gpu_place(place_) ? gpu_buffer_[i] : cpu_buffer_[i]; // Do not push current position into ReadAsync. 
Push the previous position // Since all computation in fluid are async, change the data of diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 4edc15a2635..43a49de5224 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -32,17 +32,17 @@ namespace reader { static inline void string_split(const std::string& s, const char delimiter, std::vector* output) { - if (s.empty()) return; - size_t start = 0; - size_t end = s.find(delimiter); - while (end != std::string::npos) { - if (end > start) output->emplace_back(s.substr(start, end - start)); + size_t end = s.find_first_of(delimiter); + + while (end <= std::string::npos) { + output->emplace_back(s.substr(start, end - start)); + if (end == std::string::npos) { + break; + } start = end + 1; - end = s.find(delimiter, start); + end = s.find_first_of(delimiter, start); } - auto term = s.substr(start); - if (!term.empty()) output->emplace_back(term); } static inline void parse_line( @@ -52,9 +52,9 @@ static inline void parse_line( std::unordered_map>* slot_to_data) { std::vector ret; string_split(line, ' ', &ret); - *label = std::stoi(ret[0]) > 0; + *label = std::stoi(ret[2]) > 0; - for (size_t i = 1; i < ret.size(); ++i) { + for (size_t i = 3; i < ret.size(); ++i) { const std::string& item = ret[i]; std::vector feasign_and_slot; string_split(item, ':', &feasign_and_slot); diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index b64c8400439..6410439816d 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -91,16 +91,16 @@ static inline void check_all_data( TEST(CTR_READER, read_data) { const std::vector ctr_data = { - "0 0:6002 1:6003 2:6004 3:6005 4:6006 \n", - "0 5:6003 6:6003 7:6003 8:6004 9:6004 \n", - "1 10:6002 11:6002 12:6002 13:6002 14:6002 \n", - "0 15:6003 16:6003 17:6003 18:6003 19:6004 \n", - "1 20:6001 21:6001 22:6001 23:6001 24:6001 \n", - "1 25:6004 26:6004 27:6004 28:6005 29:6005 \n", - "0 30:6002 31:6003 32:6004 33:6004 34:6005 \n", - "1 35:6003 36:6003 37:6005 38:6005 39:6005 \n", - "1 40:6002 41:6003 42:6004 43:6004 44:6005 \n", - "1 46:6006 45:6006 47:6003 48:6003 49:6003 \n", + "aaaa 1 0 0:6002 1:6003 2:6004 3:6005 4:6006 -1\n", + "bbbb 1 0 5:6003 6:6003 7:6003 8:6004 9:6004 -1\n", + "cccc 1 1 10:6002 11:6002 12:6002 13:6002 14:6002 -2\n", + "dddd 1 0 15:6003 16:6003 17:6003 18:6003 19:6004 -3\n", + "1111 1 1 20:6001 21:6001 22:6001 23:6001 24:6001 12\n", + "2222 1 1 25:6004 26:6004 27:6004 28:6005 29:6005 aa\n", + "3333 1 0 30:6002 31:6003 32:6004 33:6004 34:6005 er\n", + "eeee 1 1 35:6003 36:6003 37:6005 38:6005 39:6005 dd\n", + "ffff 1 1 40:6002 41:6003 42:6004 43:6004 44:6005 66\n", + "gggg 1 1 46:6006 45:6006 47:6003 48:6003 49:6003 ba\n", }; std::string gz_file_name = "test_ctr_reader_data.gz"; generatedata(ctr_data, gz_file_name); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index b3bb1abf4da..1a2feee11c9 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -37,20 +37,6 @@ constexpr char kInitStateGrads[] = "initial_states" GRAD_SUFFIX; using StepScopeVar = std::vector; -static void ClearStepScopes(const platform::DeviceContext &dev_ctx, - framework::Scope *parent_scope, - StepScopeVar *step_scopes) { - if (step_scopes->empty()) return; - - dev_ctx.Wait(); - - for (auto *sub_scope : 
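// [Sketch] The ctr_reader hunk above swaps a split helper that drops empty
// fields for one that keeps them, and moves the label/feature columns. For
// reference, the dropped-empty behavior of the deleted string_split as
// standalone C++ (function name kept, everything else illustrative):
#include <string>
#include <vector>

void string_split(const std::string& s, char delimiter,
                  std::vector<std::string>* output) {
  size_t start = 0;
  size_t end = s.find(delimiter);
  while (end != std::string::npos) {
    if (end > start) output->emplace_back(s.substr(start, end - start));
    start = end + 1;
    end = s.find(delimiter, start);
  }
  if (start < s.size()) output->emplace_back(s.substr(start));  // last field
}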
*step_scopes) { - parent_scope->DeleteScope(sub_scope); - } - - step_scopes->clear(); -} - // StepScopes manages scopes inside RNN. // StepScopes::CurScope() get the current scope // StepScopes::ExScope() get the ex-scope, or scope in previous time step. @@ -67,8 +53,7 @@ static void ClearStepScopes(const platform::DeviceContext &dev_ctx, // access scopes from begin to end. class StepScopes { public: - StepScopes(const platform::DeviceContext &dev_ctx, - const framework::Scope &parent, StepScopeVar *scopes, + StepScopes(const framework::Scope &parent, StepScopeVar *scopes, bool is_train, size_t seq_len, bool is_backward = false) : counter_(is_backward ? seq_len - 1 : 0UL), scopes_(scopes), @@ -78,7 +63,7 @@ class StepScopes { PADDLE_ENFORCE(is_train || !is_backward, "Cannot backward when is not training"); if (!is_backward_) { - ClearStepScopes(dev_ctx, const_cast(&parent), scopes); + PADDLE_ENFORCE(scopes->empty()); scopes->reserve(static_cast(num_step_scopes)); for (size_t i = 0; i < num_step_scopes; ++i) { scopes->emplace_back(&parent.NewScope()); @@ -259,22 +244,18 @@ class RecurrentOp : public RecurrentBase { const platform::Place &place) const override { bool has_state = Attr(kHasStates); auto seq_len = static_cast(this->GetSequenceLength(scope)); + VLOG(3) << "Static RNN input sequence length = " << seq_len; + StepScopes scopes = CreateStepScopes(scope, seq_len); + auto reverse = Attr(kReverse); // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); - VLOG(3) << "Static RNN input sequence length = " << seq_len; - StepScopes scopes = CreateStepScopes(dev_ctx, scope, seq_len); - auto reverse = Attr(kReverse); - framework::Executor executor(place); auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare( - *program, block->ID(), std::vector() /*skip_ref_cnt_vars*/, - true /*force_disable_gc*/); for (size_t i = 0; i < seq_len; ++i) { size_t seq_offset = reverse ? seq_len - i - 1 : i; @@ -308,9 +289,10 @@ class RecurrentOp : public RecurrentBase { } // Every inputs are linked now, execute! 
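// [Sketch] StepScopes walks one scope per time step: forward from 0 up to
// seq_len - 1, backward from seq_len - 1 down to 0, which is why the
// constructor above seeds counter_ with (is_backward ? seq_len - 1 : 0UL).
// The counter logic in isolation (illustrative, without Scope bookkeeping):
#include <cstddef>

struct StepCursor {
  size_t counter;
  bool is_backward;
  StepCursor(size_t seq_len, bool backward)
      : counter(backward ? seq_len - 1 : 0), is_backward(backward) {}
  size_t Current() const { return counter; }
  void Next() {
    if (is_backward) --counter; else ++counter;
  }
};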
- executor.RunPreparedContext(ctx.get(), &cur_scope, - false /*create_local_scope*/, - true /*create_vars*/, true /* keep_kids */); + executor.Run(*program, &cur_scope, block->ID(), + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); // Copy inside::output -> outside::output // outside::output[seq_offset: seq_offset + 1] = inside::output @@ -334,12 +316,11 @@ class RecurrentOp : public RecurrentBase { } private: - StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx, - const framework::Scope &scope, + StepScopes CreateStepScopes(const framework::Scope &scope, size_t seq_len) const { auto *var = scope.FindVar(Output(kStepScopes)); PADDLE_ENFORCE(var != nullptr); - return StepScopes(dev_ctx, scope, var->GetMutable(), + return StepScopes(scope, var->GetMutable(), Attr(kIsTrain), seq_len); } }; @@ -357,20 +338,16 @@ class RecurrentGradOp : public RecurrentBase { const platform::Place &place) const override { bool has_state = Attr(kHasStates); const size_t seq_len = static_cast(GetSequenceLength(scope)); - - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - StepScopes scopes = CreateStepScopes(dev_ctx, scope, seq_len); + StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); framework::Executor executor(place); auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare( - *program, block->ID(), std::vector() /*skip_ref_cnt_vars*/, - true /*force_disable_gc*/); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); for (size_t step_id = 0; step_id < seq_len; ++step_id) { size_t seq_offset = reverse ? 
step_id : seq_len - step_id - 1; @@ -428,9 +405,10 @@ class RecurrentGradOp : public RecurrentBase { VLOG(5) << "Recurrent memory linking finished "; // Run step block with cur_scope - executor.RunPreparedContext(ctx.get(), &cur_scope, - false /*create_local_scope*/, - true /*create_vars*/, true /* keep_kids */); + executor.Run(*program, &cur_scope, block->ID(), + false /*create_local_scope*/, true /*create_vars*/, + std::vector() /*skip_ref_cnt_vars*/, + true /*force_disable_gc*/); VLOG(5) << "executor.Run finished "; @@ -523,20 +501,21 @@ class RecurrentGradOp : public RecurrentBase { scopes.Next(); } // Delete the scope of StepScopes + dev_ctx.Wait(); auto *var = scope.FindVar(Input(kStepScopes)); PADDLE_ENFORCE(var != nullptr); - auto *step_scopes = var->GetMutable(); - ClearStepScopes(dev_ctx, const_cast(&scope), - step_scopes); + auto step_scopes = var->GetMutable(); + for (auto *sub_scope : *step_scopes) { + const_cast(scope).DeleteScope(sub_scope); + } } private: - StepScopes CreateStepScopes(const platform::DeviceContext &dev_ctx, - const framework::Scope &scope, + StepScopes CreateStepScopes(const framework::Scope &scope, size_t seq_len) const { auto *var = scope.FindVar(Input(kStepScopes)); PADDLE_ENFORCE(var != nullptr); - return StepScopes(dev_ctx, scope, var->GetMutable(), + return StepScopes(scope, var->GetMutable(), Attr(kIsTrain), seq_len, true /*is_backward*/); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index d1b508792c2..072bc34d3e2 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -13,67 +13,8 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" -#include -#include -#include -namespace paddle { -namespace operators { - -// NOTE(dengkaipeng): Input(Out) is unnecessary in reduce_mean_grad -// calcualtion, but will incur a reduce_mean_grad op after -// reduce_mean_grad_grad, delete Input(Out) here. -// This change has no effect on reduce_mean_grad calculations. 
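// [Sketch] Why reduce_mean_grad can drop Input(Out): the gradient of a mean
// is data-independent, every input element simply receives grad_out / N.
// Illustrative C++ for a full reduction over n elements:
#include <cstddef>
#include <vector>

std::vector<float> ReduceMeanGrad(float grad_out, size_t n) {
  // Each of the n inputs contributed 1/n to the mean.
  return std::vector<float>(n, grad_out / static_cast<float>(n));
}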
-class ReduceMeanOpGradDescMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - std::unique_ptr op(new framework::OpDesc()); - op->SetType("reduce_mean_grad"); - op->SetInput("X", Input("X")); - op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); - op->SetAttrMap(Attrs()); - op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - return op; - } -}; - -class ReduceMeanDoubleGradMaker : public framework::GradOpDescMakerBase { - public: - using framework::GradOpDescMakerBase::GradOpDescMakerBase; - - std::vector> operator()() const override { - std::vector> ops; - auto x_gg = OutputGrad(framework::GradVarName("X")); // input ddx - auto out_grads = InputGrad(framework::GradVarName("Out")); - if (!out_grads.empty()) { - auto* out_grad_op = new framework::OpDesc(); - out_grad_op->SetType("reduce_mean"); - out_grad_op->SetInput("X", x_gg); - out_grad_op->SetAttrMap(Attrs()); - out_grad_op->SetOutput("Out", out_grads); - ops.emplace_back(out_grad_op); - } - - return ops; - } -}; - -} // namespace operators -} // namespace paddle - -class __reduce_meanMaker__ : public ops::ReduceOpMaker { - protected: - virtual std::string GetName() const { return "reduce_mean"; } - virtual std::string GetOpType() const { return "Reduce reduce_mean"; } -}; - -REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, - ops::ReduceMeanOpGradDescMaker); -REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, - ops::ReduceMeanDoubleGradMaker); +REGISTER_REDUCE_OP(reduce_mean); REGISTER_OP_CPU_KERNEL(reduce_mean, ops::ReduceKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 67fd3e1dad4..c86591fdafa 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -88,10 +88,6 @@ class ReduceGradKernel : public framework::OpKernel { auto* output = context.Output(framework::GradVarName("X")); output->mutable_data(context.GetPlace()); - // NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and - // not be set as Input in grad Maker, use Out_grad to replace here - if (!input1) input1 = input2; - if (reduce_all) { auto x = EigenVector::Flatten(*input0); auto x_reduce = EigenVector::From(*input1); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 9750bc87b00..f3719e8f438 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -19,29 +19,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; - -inline std::vector get_new_shape( - const std::vector &list_new_shape_tensor) { - // get tensor from - std::vector vec_new_shape; - for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { - auto tensor = list_new_shape_tensor[i]; - PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}), - "shape of dim tensor should be [1]"); - if (platform::is_gpu_place(tensor->place())) { - framework::Tensor temp; - TensorCopySync(*tensor, platform::CPUPlace(), &temp); - - vec_new_shape.push_back(static_cast(*temp.data())); - } else { - vec_new_shape.push_back(static_cast(*tensor->data())); - } - } - - return vec_new_shape; -} - class ReshapeOp : public framework::OperatorWithKernel { public: ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -55,24 +32,17 @@ class ReshapeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of ReshapeOp should not be null."); - if (ctx->HasInputs("ShapeTensor")) { - // top prority shape - auto inputs_name = ctx->Inputs("ShapeTensor"); - PADDLE_ENFORCE(inputs_name.size() > 0, "shape tensor size can't be zero"); - auto out_dims = std::vector(inputs_name.size(), -1); - ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + const std::vector &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE(!shape.empty(), + "The shape information must be set by Attr(shape)."); - return; - } if (ctx->HasInput("Shape") && ctx->IsRuntime()) { // If true, set the shape of Output(Out) according to Input(Shape) in // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel. ctx->ShareLoD("X", /*->*/ "Out"); return; } - const std::vector &shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE(!shape.empty(), - "The shape information must be set by Attr(shape)."); + auto x_dims = ctx->GetInputDim("X"); auto out_dims = ValidateShape(shape, x_dims); ctx->SetOutputDim("Out", out_dims); @@ -144,16 +114,6 @@ class ReshapeOp : public framework::OperatorWithKernel { return framework::OpKernelType(ctx.Input("X")->type(), ctx.device_context()); } - - framework::OpKernelType GetKernelTypeForVar( - const std::string &var_name, const Tensor &tensor, - const framework::OpKernelType &expected_kernel_type) const override { - if (var_name == "ShapeTensor") { - return expected_kernel_type; - } - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } }; class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { @@ -166,18 +126,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { "the shape attribute, while the shape attribute still should be " "set correctly to gurantee shape inference in compile time.") .AsDispensable(); - AddInput( - "ShapeTensor", - "(vector>, optional). If provided, reshape will use this" - "The shape of the tensor in vector MUST BE [1]" - "it has the highest priority compare with Input(Shape) and " - "attr(shape).") - .AsDuplicable() - .AsDispensable(); AddOutput("Out", "(Tensor). The output tensor of reshape operator."); AddAttr>( - "shape", "(std::vector) Target shape of reshape operator.") - .SetDefault({}); + "shape", "(std::vector) Target shape of reshape operator."); AddComment(R"DOC( Reshape Operator. @@ -251,35 +202,24 @@ class ReshapeKernel { auto *out = ctx.Output("Out"); auto *in = ctx.Input("X"); - framework::DDim out_dims = out->dims(); + auto *shape_tensor = ctx.HasInput("Shape") + ? 
ctx.Input("Shape") + : nullptr; - auto list_new_shape_tensor = - ctx.MultiInput("ShapeTensor"); - if (list_new_shape_tensor.size() > 0) { - // have shape tensor - auto new_shape = get_new_shape(list_new_shape_tensor); - out_dims = ReshapeOp::ValidateShape(new_shape, in->dims()); + framework::DDim out_dims = out->dims(); - } else { - auto *shape_tensor = ctx.HasInput("Shape") - ? ctx.Input("Shape") - : nullptr; - - if (shape_tensor) { - auto *shape_data = shape_tensor->data(); - framework::Tensor cpu_shape_tensor; - if (platform::is_gpu_place(shape_tensor->place())) { - TensorCopySync(*shape_tensor, platform::CPUPlace(), - &cpu_shape_tensor); - shape_data = cpu_shape_tensor.data(); - } - auto shape = - std::vector(shape_data, shape_data + shape_tensor->numel()); - out_dims = ReshapeOp::ValidateShape(shape, in->dims()); + if (shape_tensor) { + auto *shape_data = shape_tensor->data(); + framework::Tensor cpu_shape_tensor; + if (platform::is_gpu_place(shape_tensor->place())) { + TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); + shape_data = cpu_shape_tensor.data(); } + auto shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + out_dims = ReshapeOp::ValidateShape(shape, in->dims()); } - out->Resize(out_dims); out->mutable_data(ctx.GetPlace(), in->type()); framework::TensorCopy( *in, ctx.GetPlace(), @@ -348,7 +288,6 @@ class Reshape2GradMaker : public framework::SingleGradOpDescMaker { auto *grad_op = new framework::OpDesc(); grad_op->SetType("reshape2_grad"); grad_op->SetInput("XShape", Output("XShape")); - grad_op->SetInput("ShapeTensor", Input("ShapeTensor")); grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X")); grad_op->SetAttrMap(Attrs()); @@ -381,16 +320,6 @@ class Reshape2GradOp : public framework::OperatorWithKernel { ctx.Input(framework::GradVarName("Out"))->type(), ctx.device_context()); } - - framework::OpKernelType GetKernelTypeForVar( - const std::string &var_name, const Tensor &tensor, - const framework::OpKernelType &expected_kernel_type) const override { - if (var_name == "ShapeTensor") { - return expected_kernel_type; - } - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } }; class ReshapeOpInplaceInToOut : public framework::InplaceOpInference { diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index ce4af44266e..b2e79f6c82b 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -13,10 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include -#include "math/math_function.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -27,33 +24,17 @@ using Tensor = framework::Tensor; #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) -template -__global__ void ScatterInitCUDAKernel(const IndexT* indices, T* output, - size_t index_size, size_t slice_size, - bool overwrite) { - CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) { - int indices_i = i / slice_size; - int slice_i = i - indices_i * slice_size; // offset inside the slice - IndexT scatter_i = indices[indices_i]; - IndexT out_i = scatter_i * slice_size + slice_i; - *(output + out_i) = static_cast(0); - } -} -template -__global__ void ScatterCUDAKernel(const T* params, const IndexT* indices, +template +__global__ void ScatterCUDAKernel(const T* params, const int* indices, T* output, size_t index_size, - size_t slice_size, bool overwrite) { + size_t slice_size) { CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) { int indices_i = i / slice_size; int slice_i = i - indices_i * slice_size; // offset inside the slice - IndexT scatter_i = indices[indices_i]; - IndexT out_i = scatter_i * slice_size + slice_i; - if (overwrite) { - *(output + out_i) = *(params + i); - } else { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + i)); - } + int scatter_i = indices[indices_i]; + int out_i = scatter_i * slice_size + slice_i; + *(output + out_i) = *(params + i); } } @@ -62,17 +43,14 @@ __global__ void ScatterCUDAKernel(const T* params, const IndexT* indices, * Return a new updated tensor from source tensor, scatter-assigned according to * index * input[src]: type-T source Tensor - * input[index]: type-IndexT index Tensor (1-D) + * input[index]: type-int index Tensor (1-D) * return: output tensor */ -template -void GPUScatterAssign(const framework::ExecutionContext& context, - const Tensor& src, const Tensor& index, Tensor* output, - bool overwrite = true) { +template +void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, + const Tensor& index, Tensor* output) { // PADDLE_ENFORCE(platform::is_gpu_place(place)); // check index of shape 1-D - - const auto& ctx = context.device_context(); PADDLE_ENFORCE(index.dims().size() == 1 || (index.dims().size() == 2 && index.dims()[1] == 1)); int index_size = index.dims()[0]; @@ -86,27 +64,17 @@ void GPUScatterAssign(const framework::ExecutionContext& context, for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; const T* p_src = src.data(); - const IndexT* p_index = index.data(); + const int* p_index = index.data(); T* p_output = output->data(); - const size_t& slice_bytes = slice_size * sizeof(T); - // set block and grid num int block = 512; int n = slice_size * index_size; int grid = (n + block - 1) / block; - // if not overwrite mode, init data - if (!overwrite) { - ScatterInitCUDAKernel<<< - grid, block, 0, - reinterpret_cast(ctx).stream()>>>( - p_index, p_output, index_size, slice_size, overwrite); - } - - ScatterCUDAKernel<<< + ScatterCUDAKernel<<< grid, block, 0, reinterpret_cast(ctx).stream()>>>( - p_src, p_index, p_output, index_size, slice_size, overwrite); + p_src, p_index, p_output, index_size, slice_size); } } // namespace operators diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h index 680dc282c14..8bae6606c94 100644 --- a/paddle/fluid/operators/scatter.h +++ 
b/paddle/fluid/operators/scatter.h @@ -14,14 +14,11 @@ limitations under the License. */ #pragma once #include -#include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/place.h" -#include "unordered_set" namespace paddle { namespace operators { @@ -29,48 +26,13 @@ namespace operators { using Tensor = framework::Tensor; /** - * Return the updated array pointer, use blas or eigen lib to optimize time - * cost - */ -template -typename std::enable_if::value>::type -elementwise_inner_add(const framework::ExecutionContext& ctx, - const T* src_pointer, const T* dist_pointer, - T* result_dist_pointer, const framework::Tensor& src, - framework::Tensor* dist, const int& src_index, - const IndexT& dist_index, const int& slice_size, - const size_t& slice_bytes) { - auto blas = math::GetBlas(ctx); - - blas.VADD(slice_size, src_pointer + src_index * slice_size, - dist_pointer + dist_index * slice_size, - result_dist_pointer + dist_index * slice_size); -} - -template -typename std::enable_if::value>::type -elementwise_inner_add(const framework::ExecutionContext& ctx, - const T* src_pointer, const T* dist_pointer, - T* result_dist_pointer, const framework::Tensor& src, - framework::Tensor* dist, const int& src_index, - const IndexT& dist_index, const int& slice_size, - const size_t& slice_bytes) { - auto src_slice = src.Slice(src_index, src_index + 1); - auto dist_slice = dist->Slice(dist_index, dist_index + 1); - - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dist = framework::EigenVector::Flatten(dist_slice); - - eigen_dist += eigen_src; -} -/** - * Return an updated tensor from source tensor, scattered according to index: + * Return a updated tensor from source tensor, scattered according to index: * dst[i] = src[index[i]] * input[src]: type-T source Tensor - * input[index]: type-IndexT index Tensor (1-D) + * input[index]: type-int index Tensor (1-D) * return: output tensor */ -template +template void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); @@ -83,7 +45,7 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, auto dst_dims = output->dims(); const T* p_src = src.data(); - const IndexT* p_index = index.data(); + const int* p_index = index.data(); T* p_output = output->data(); // check src shape and dst shape should match @@ -97,52 +59,10 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const size_t slice_bytes = slice_size * sizeof(T); for (int i = 0; i < index_size; ++i) { - IndexT index_ = p_index[i]; + int index_ = p_index[i]; memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes); } } -template -void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src, - const Tensor& index, Tensor* output) { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.device_context().GetPlace())); - // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1 || - (index.dims().size() == 2 && index.dims()[1] == 1)); - int index_size = index.dims()[0]; - - auto src_dims = src.dims(); - auto dst_dims = output->dims(); - - const T* p_src = src.data(); - const IndexT* p_index = index.data(); - - const T* p_output = output->data(); - T* result_p_output = output->data(); - - // check src shape and dst shape should match - for (int i 
= 1; i < src_dims.size(); i++) - PADDLE_ENFORCE(src_dims[i] == dst_dims[i]); - - // slice size - size_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; - - const size_t& slice_bytes = slice_size * sizeof(T); - - // if not in overwrite mode, need to init output data - for (int i = 0; i < index_size; ++i) { - const IndexT& index_ = p_index[i]; - memset(result_p_output + slice_size * index_, 0, slice_bytes); - } - - for (int i = 0; i < index_size; ++i) { - const IndexT& index_ = p_index[i]; - elementwise_inner_add(ctx, p_src, p_output, result_p_output, src, - output, i, index_, slice_size, - slice_bytes); - } -} - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index f5a1b32e5c2..68ad223b3c3 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -80,14 +80,6 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Ids", "The index input of scatter op where X will be updated"); AddInput("Updates", "The updated value of scatter op"); AddOutput("Out", "The output of scatter op"); - AddAttr("overwrite", - "(bool, defalut: True) " - "The mode that updating the output when has same index," - "If True, use the overwrite mode to update the output" - "of the same index, if False, use the accumulate mode to" - "update the output of the same index,Default value is True." - "You can set overwrite=False to implement scatter_add.") - .SetDefault(true); AddComment(R"DOC( Scatter Operator. diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu index e9ad3475381..a70b9091727 100644 --- a/paddle/fluid/operators/scatter_op.cu +++ b/paddle/fluid/operators/scatter_op.cu @@ -30,10 +30,10 @@ class ScatterOpCUDAKernel : public framework::OpKernel { auto *Ids = ctx.Input("Ids"); auto *Updates = ctx.Input("Updates"); auto *Out = ctx.Output("Out"); - bool overwrite = ctx.Attr("overwrite"); Out->ShareDataWith(*X); - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + + GPUScatterAssign(ctx.device_context(), *Updates, *Ids, Out); } }; diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h index 9c237dc0f1f..2eefbba9726 100644 --- a/paddle/fluid/operators/scatter_op.h +++ b/paddle/fluid/operators/scatter_op.h @@ -33,33 +33,11 @@ class ScatterOpKernel : public framework::OpKernel { auto *Ids = ctx.Input("Ids"); auto *Updates = ctx.Input("Updates"); auto *Out = ctx.Output("Out"); - double overwrite = ctx.Attr("overwrite"); // In place output: Out = X, Out[Ids] = Updates framework::TensorCopySync(*X, ctx.GetPlace(), Out); // Apply ScatterUpdate: Out[index] = Updates[:] - const auto &index_type = Ids->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE( - index_type_match, - "Index holds the wrong type, it holds %s, but desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString(framework::proto::VarType::INT32), - paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); - if (overwrite) { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); - } else { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); - } - } else { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); - } else { - 
ScatterAssignAdd(ctx, *Updates, *Ids, Out); - } - } + ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index f3193fdc556..b4923571df9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -57,9 +57,6 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.") .SetDefault("AVERAGE") .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"}); - AddAttr("pad_value", - "(float, default 0.0) The value to pad for empty sequence.") - .SetDefault(0.0); AddComment(R"DOC( Sequence Pool Operator. @@ -72,8 +69,6 @@ It supports six pooling types: 5. FIRST: Out[i] = first instance in i-th sequence X[i] 6. MAX: $$Out[i] = max(X_i)$$ -and for the empty sequence Out[i] = attr(pad_value). - The following example explains how this works: For a mini-batch of 3 variable-length sentences, containing 2, 3, and 2 time-steps: diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h index c32734808c3..f2e4a55dee4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h @@ -32,7 +32,6 @@ class SequencePoolKernel : public framework::OpKernel { auto* in = context.Input("X"); auto* out = context.Output("Out"); std::string pooltype = context.Attr("pooltype"); - T pad_value = static_cast(context.Attr("pad_value")); auto dims = in->dims(); auto lod = in->lod(); @@ -59,8 +58,8 @@ class SequencePoolKernel : public framework::OpKernel { index->mutable_data(context.GetPlace()); } math::SequencePoolFunctor pool; - pool(context.template device_context(), pooltype, pad_value, - *in, out, is_test, index); + pool(context.template device_context(), pooltype, *in, out, + is_test, index); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index 14e4fc9b0dd..44bfb1b0f8e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -110,6 +110,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(x.lod().size(), 1, "SequenceReverse Op only support one level lod."); + auto &dev_ctx = ctx.template device_context(); const size_t *lod; size_t lod_count = x.lod()[0].size(); @@ -131,24 +132,10 @@ class SequenceReverseOpKernel : public framework::OpKernel { PADDLE_ENFORCE_NE(x_data, y_data, "SequenceReverse Op does not support in-place operation"); - if (platform::is_cpu_place(ctx.GetPlace())) { - for (size_t idx = 0; idx < lod_count - 1; idx++) { - auto start_pos = lod[idx]; - auto end_pos = lod[idx + 1]; - for (auto pos = start_pos; pos < end_pos; pos++) { - auto cur_pos = end_pos - pos - 1 + start_pos; - std::memcpy(y_data + pos * row_numel, x_data + cur_pos * row_numel, - row_numel * sizeof(T)); - } - } - } else { - auto &dev_ctx = ctx.template device_context(); - - SequenceReverseFunctor functor(x_data, y_data, lod, lod_count, - row_numel); - platform::ForRange for_range(dev_ctx, limit); - for_range(functor); - } + SequenceReverseFunctor functor(x_data, y_data, lod, lod_count, + row_numel); + platform::ForRange for_range(dev_ctx, limit); + for_range(functor); } }; diff --git 
a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h index a07fc54090d..146b5cc9b3c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h @@ -135,8 +135,7 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { } auto lod = in->lod(); - // to avoid out_grad missing lod, compute lod again - auto out_lod = SequenceSliceLoD(*in, offset_data, length_data); + auto out_lod = out_grad->lod(); if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index 70f26055b7c..fe8ca41b698 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -33,6 +33,7 @@ class SequenceUnpadOpKernel : public framework::OpKernel { auto* x_t = ctx.Input("X"); auto* len_t = ctx.Input("Length"); auto* out_t = ctx.Output("Out"); + out_t->mutable_data(ctx.GetPlace()); const int64_t* seq_len_ptr = nullptr; if (platform::is_gpu_place(ctx.GetPlace())) { @@ -66,9 +67,6 @@ class SequenceUnpadOpKernel : public framework::OpKernel { } out_t->Resize(framework::make_ddim(out_dims_vec)); - // after set the lod of output, allocate the memory - out_t->mutable_data(ctx.GetPlace()); - int64_t padded_length = x_t->dims()[1]; math::UnpaddingLoDTensorFunctor()( ctx.template device_context(), *x_t, out_t, diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 08b7bf3d1e9..589c98e51e3 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -97,27 +97,27 @@ the start or end indices, it represents number of elements before the end of that dimension. If the value passed to start or end is larger than the n (the number of elements in this dimension), it represents n. For slicing to the end of a dimension with unknown size, it is recommended -to pass in INT_MAX. The size of axes must be equal to starts\' and ends\'. +to pass in INT_MAX. If axes are omitted, they are set to [0, ..., ndim-1]. Following examples will explain how slice works: -.. code-block:: text - - Case1: - Given: - data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] - axes = [0, 1] - starts = [1, 0] - ends = [2, 3] - Then: - result = [ [5, 6, 7], ] - - Case2: - Given: - data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] - starts = [0, 1] - ends = [-1, 1000] - Then: - result = [ [2, 3, 4], ] + .. 
code-block:: text
+
+        Case1:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                axes = [0, 1]
+                starts = [1, 0]
+                ends = [2, 3]
+            Then:
+                result = [ [5, 6, 7], ]
+
+        Case2:
+            Given:
+                data = [ [1, 2, 3, 4], [5, 6, 7, 8], ]
+                starts = [0, 1]
+                ends = [-1, 1000]
+            Then:
+                result = [ [2, 3, 4], ]
 )DOC");
   }
 };
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index 716826bf156..456f78d2022 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -248,6 +248,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
     grad_op->SetType("softmax_with_cross_entropy_grad");
     grad_op->SetInput("Label", Input("Label"));
     grad_op->SetInput("Softmax", Output("Softmax"));
+    grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax"));
     grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
     grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
     grad_op->SetAttrMap(Attrs());
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index e6c87726425..1eb4076d64d 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -111,7 +111,7 @@ class SumOp : public framework::OperatorWithKernel {
                      "Input var[%s] should not be nullptr", x_vars_name[idx]);
       auto tensor =
           framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_vars[idx]);
-      if (tensor->numel() <= 0 || (!tensor->IsInitialized())) {
+      if (tensor->numel() == 0) {
         continue;
       }
       if (dtype == -1) {
diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu
index 790626a59d0..5cecb7e09e7 100644
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -126,20 +126,12 @@ void SumToLoDTensor(const framework::ExecutionContext &context) {
         auto &in_1 = in_vars[1]->Get();
         auto length = in_0.numel();
-        if (length && in_0.IsInitialized() && in_1.IsInitialized()) {
+        if (length) {
           auto result = EigenVector::Flatten(*out);
           auto &place = *dev_ctx.eigen_device();
           auto in_0_e = EigenVector::Flatten(in_0);
           auto in_1_e = EigenVector::Flatten(in_1);
           result.device(place) = in_0_e + in_1_e;
-        } else if (length && in_0.IsInitialized()) {
-          auto result = EigenVector::Flatten(*out);
-          auto &place = *dev_ctx.eigen_device();
-          result.device(place) = EigenVector::Flatten(in_0);
-        } else if (length && in_1.IsInitialized()) {
-          auto result = EigenVector::Flatten(*out);
-          auto &place = *dev_ctx.eigen_device();
-          result.device(place) = EigenVector::Flatten(in_1);
         }
         return;
       }
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 21cf15cb0b0..7f470924b33 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -43,16 +43,15 @@ class TensorRTEngineOp : public framework::OperatorBase {
  private:
   std::vector input_names_;
   std::unordered_set param_names_;
-  mutable TensorRTEngine *trt_engine_{nullptr};
+  mutable std::unique_ptr trt_engine_;
   int max_batch_size_;
   int workspace_size_;
   std::unique_ptr calibrator_;
   bool enable_int8_;
-  bool use_calib_mode_;
   std::string calibration_data_;
   std::string engine_key_;
+  std::string engine_serialized_data_;
   bool calibration_mode_;
-  int predictor_id_;
   int device_id_;

  public:
@@ -66,10 +65,9 @@ class TensorRTEngineOp : public framework::OperatorBase {
     workspace_size_ = Attr("workspace_size");
     device_id_ = 
Attr("gpu_id"); enable_int8_ = Attr("enable_int8"); - use_calib_mode_ = Attr("use_calib_mode"); calibration_data_ = Attr("calibration_data"); engine_key_ = Attr("engine_key"); - predictor_id_ = Attr("predictor_id"); + engine_serialized_data_ = Attr("engine_serialized_data"); auto params = Attr>("parameters"); for (const auto ¶m : params) { @@ -77,21 +75,22 @@ class TensorRTEngineOp : public framework::OperatorBase { } // calibration_mode is ture represents we need to // generate the calibration table data. - calibration_mode_ = - (enable_int8_ && calibration_data_.size() == 0 && use_calib_mode_); + calibration_mode_ = (enable_int8_ && calibration_data_.size() == 0); VLOG(4) << "calibration_mode: " << calibration_mode_; if (enable_int8_ && calibration_data_.size()) { calibrator_.reset(new TRTInt8Calibrator(calibration_data_)); } - bool has_engine = - inference::Singleton::Global() - .Has(engine_key_ + std::to_string(predictor_id_)); - - if (!calibration_mode_ && has_engine) { - trt_engine_ = - inference::Singleton::Global() - .Get(engine_key_ + std::to_string(predictor_id_)); + + if (!calibration_mode_ && !engine_serialized_data_.empty()) { + trt_engine_.reset(new inference::tensorrt::TensorRTEngine( + max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(), + device_id_)); + PADDLE_ENFORCE(engine_serialized_data_.size(), + "TRT serialized data should not be empty here," + "there must be error when generate serialized data in TRT " + "subgraph detect pass."); + trt_engine_->Deserialize(engine_serialized_data_); } } @@ -237,14 +236,12 @@ class TensorRTEngineOp : public framework::OperatorBase { TensorRTEngine *GetEngine(const framework::Scope &scope, const platform::Place &dev_place) const { if (!trt_engine_) { - trt_engine_ = - inference::Singleton::Global() - .Create(engine_key_ + std::to_string(predictor_id_), - max_batch_size_, workspace_size_, enable_int8_, - calibrator_.get(), device_id_); - PrepareTRTEngine(scope, trt_engine_); + trt_engine_.reset(new inference::tensorrt::TensorRTEngine( + max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(), + device_id_)); + PrepareTRTEngine(scope, trt_engine_.get()); } - return trt_engine_; + return trt_engine_.get(); } void PrepareTRTEngine(const framework::Scope &scope, diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index efc50fc06f4..cc4d8d6e6f7 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -102,10 +102,8 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); engine_op_desc.SetAttr("parameters", std::vector({})); engine_op_desc.SetAttr("engine_key", std::string("a_engine")); - engine_op_desc.SetAttr("predictor_id", 1); engine_op_desc.SetAttr("calibration_data", std::string("")); engine_op_desc.SetAttr("enable_int8", static_cast(false)); - engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); engine_op_desc.SetAttr("output_name_mapping", std::vector({"z0"})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); @@ -202,10 +200,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("parameters", std::vector({"y0", "y1", "y2", "y3"})); engine_op_desc.SetAttr("engine_key", std::string("b_engine")); - engine_op_desc.SetAttr("predictor_id", 1); engine_op_desc.SetAttr("calibration_data", std::string("")); 
engine_op_desc.SetAttr("enable_int8", static_cast(false)); - engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); engine_op_desc.SetAttr("output_name_mapping", std::vector({"z3"})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h index e388ec5ae39..96abad3de9b 100644 --- a/paddle/fluid/operators/unpool_op.h +++ b/paddle/fluid/operators/unpool_op.h @@ -61,10 +61,10 @@ class UnpoolGradKernel : public framework::OpKernel { auto& device_ctx = context.template device_context(); math::SetConstant zero; - - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0)); - + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0)); + } math::Unpool2dMaxGradFunctor unpool2d_max_backward; unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); } diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index deb5681f210..217d400bb3c 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -54,15 +54,6 @@ class WarpCTCOp : public framework::OperatorWithKernel { framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { -#if CUDA_VERSION >= 9000 - LOG(WARNING) - << "The cudnnCTCLoss of CUDNN7 have some diff between " - "CUDA9/CUDA10 and CUDA8. You can close use_cudnn option to " - "use " - "baidu-research/warp-ctc(https://github.com/baidu-research/" - "warp-ctc)"; -#endif - library_ = framework::LibraryType::kCUDNN; } #endif diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 5de00db55ad..c4386689d3e 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -72,7 +72,7 @@ ENDIF() # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} - temp_allocator ${dgc_deps}) + temp_allocator ${dgc_deps} xxhash) if(WIN32) if(WITH_GPU AND NOT WITH_DSO) diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h index 39a50b3bc99..4ed51acb587 100644 --- a/paddle/fluid/platform/cudnn_desc.h +++ b/paddle/fluid/platform/cudnn_desc.h @@ -183,17 +183,15 @@ class ConvolutionDescriptor { CUDNN_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor( desc, pads.size(), pads.data(), strides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, compute_type)); + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + desc, CUDNN_DEFAULT_MATH)); #if CUDNN_VERSION_MIN(7, 0, 1) CUDNN_ENFORCE( platform::dynload::cudnnSetConvolutionGroupCount(desc, groups)); -#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( - desc, CUDNN_DEFAULT_MATH)); if (dtype == CUDNN_DATA_HALF) { CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( desc, CUDNN_TENSOR_OP_MATH)); } -#endif #endif } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 4f048d44685..61386bdf05a 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -167,7 +167,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { if (UNLIKELY(num_bytes == 0)) { return nullptr; 
} - auto buf = paddle::memory::Alloc(place_, num_bytes); + auto buf = paddle::memory::Alloc(place_, num_bytes, + memory::Allocator::kScratchpad); void* retv = buf->ptr(); { std::lock_guard lock(mtx_); @@ -231,7 +232,8 @@ void CudnnHolder::ReallocateWorkspace(size_t required_workspace_len) { PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); workspace_.reset(); } - workspace_ = paddle::memory::Alloc(place_, required_workspace_len); + workspace_ = paddle::memory::Alloc(place_, required_workspace_len, + paddle::memory::Allocator::kScratchpad); } CUDADeviceContext::CUDADeviceContext(CUDAPlace place) @@ -266,14 +268,12 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) size_t cudnn_dso_ver = dynload::cudnnGetVersion(); LOG_FIRST_N(WARNING, 1) << "device: " << place_.device << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." - << (cudnn_dso_ver % 1000) / 100 << "."; + << (cudnn_dso_ver % 100) / 10 << "."; { // Check CUDA/CUDNN version compatiblity - auto local_cuda_version = - (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10; - auto compile_cuda_version = - (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; + auto local_cuda_version = runtime_version_ / 100; + auto compile_cuda_version = CUDA_VERSION / 100; if (local_cuda_version < compile_cuda_version) { LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << place_.device @@ -316,9 +316,7 @@ CUDADeviceContext::~CUDADeviceContext() { eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); #if !defined(_WIN32) - if (nccl_comm_) { - PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_)); - } + PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_)); #endif } diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 67e2a18dd37..3008c166938 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -172,19 +172,16 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif #if CUDNN_VERSION >= 7001 -#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ - __macro(cudnnSetConvolutionGroupCount); \ - __macro(cudnnSetConvolutionMathType); \ - __macro(cudnnConvolutionBiasActivationForward); \ - __macro(cudnnCreateCTCLossDescriptor); \ - __macro(cudnnDestroyCTCLossDescriptor); \ - __macro(cudnnGetCTCLossDescriptor); \ - __macro(cudnnSetCTCLossDescriptor); \ - __macro(cudnnGetCTCLossWorkspaceSize); \ - __macro(cudnnCTCLoss); \ - __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ - __macro(cudnnGetConvolutionForwardAlgorithm_v7); +#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(cudnnSetConvolutionGroupCount); \ + __macro(cudnnSetConvolutionMathType); \ + __macro(cudnnConvolutionBiasActivationForward); \ + __macro(cudnnCreateCTCLossDescriptor); \ + __macro(cudnnDestroyCTCLossDescriptor); \ + __macro(cudnnGetCTCLossDescriptor); \ + __macro(cudnnSetCTCLossDescriptor); \ + __macro(cudnnGetCTCLossWorkspaceSize); \ + __macro(cudnnCTCLoss); CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 9aafc180b90..62f623b175e 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -107,8 +107,7 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, if (nullptr == dso_handle) { LOG(WARNING) << "Can not find library: " << dso_path - << ". The process maybe hang. 
Please try to add the lib path " "to LD_LIBRARY_PATH."; + << ". Please try to add the lib path to LD_LIBRARY_PATH."; } return dso_handle; } diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index c4d16766c80..bb22628cdfb 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -206,6 +206,9 @@ void InitGLOG(const std::string &prog_name) { // glog will not hold the ARGV[0] inside. // Use strdup to alloc a new string. google::InitGoogleLogging(strdup(prog_name.c_str())); +#ifndef _WIN32 + google::InstallFailureSignalHandler(); +#endif } #if defined(PADDLE_WITH_DGC) diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc index 33d0fe62680..07eaf42d2d3 100644 --- a/paddle/fluid/platform/lodtensor_printer.cc +++ b/paddle/fluid/platform/lodtensor_printer.cc @@ -20,6 +20,24 @@ limitations under the License. */ namespace paddle { namespace platform { +template <typename T> +void print_lod_tensor(const std::string& var_name, + const framework::LoDTensor& lod_tensor, + const std::string& print_info) { + auto inspect = lod_tensor.data<T>(); + auto element_num = lod_tensor.numel(); + + std::ostringstream sstream; + sstream << print_info << "\t"; + sstream << var_name << "\t"; + sstream << inspect[0]; + for (int j = 1; j < element_num; ++j) { + sstream << " " << inspect[j]; + } + + std::cout << sstream.str() << std::endl; +} + void PrintVar(framework::Scope* scope, const std::string& var_name, const std::string& print_info) { framework::Variable* var = scope->FindVar(var_name); @@ -34,11 +52,26 @@ void PrintVar(framework::Scope* scope, const std::string& var_name, return; } - std::ostringstream sstream; - sstream << print_info << "\t"; - sstream << var_name << "\t"; - sstream << *tensor << "\t"; - std::cout << sstream.str() << std::endl; + framework::LoDTensor printed_tensor; + printed_tensor.set_lod(tensor->lod()); + printed_tensor.Resize(tensor->dims()); + if (platform::is_cpu_place(tensor->place())) { + printed_tensor.ShareDataWith(*tensor); + } else { + platform::CPUPlace place; + framework::TensorCopy(*tensor, place, &printed_tensor); + } + +#define PrintLoDTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor->type() == proto_type) { \ + print_lod_tensor<cpp_type>(var_name, printed_tensor, print_info); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(PrintLoDTensorCallback); + VLOG(1) << "PrintVar: unrecognized data type:" << printed_tensor.type(); } } // end namespace platform diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index f1fb6b156ae..ba3a82b4b07 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -14,7 +14,6 @@ limitations under the License.
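The PrintVar rewrite above copies GPU tensors to CPU and then dispatches on the runtime dtype through the _ForEachDataType_ X-macro. A self-contained model of that dispatch (the type list and tags here are made up for illustration, not Paddle's actual _ForEachDataType_ list):

#include <cstdint>
#include <iostream>

// Expands `callback` once per (C++ type, runtime tag) pair.
#define FOR_EACH_DATA_TYPE(callback) \
  callback(float, 0);                \
  callback(double, 1);               \
  callback(int, 2);                  \
  callback(int64_t, 3)

template <typename T>
void PrintFirstElement(const void* data) {
  std::cout << *static_cast<const T*>(data) << std::endl;
}

void PrintByTag(int tag, const void* data) {
#define PRINT_CALLBACK(cpp_type, proto_type) \
  do {                                       \
    if (tag == proto_type) {                 \
      PrintFirstElement<cpp_type>(data);     \
      return;                                \
    }                                        \
  } while (0)
  FOR_EACH_DATA_TYPE(PRINT_CALLBACK);  // falls through on an unknown tag
#undef PRINT_CALLBACK
}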
*/ #pragma once #include -#include #include #include #include "boost/optional.hpp" @@ -32,13 +31,10 @@ class MKLDNNHandler { public: MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, const std::string& base_key) - : dev_ctx_(dev_ctx), engine_(engine), key_common_(base_key) { - // TODO(jczaja): Make it faster - auto tid = std::this_thread::get_id(); - std::stringstream ss; - ss << tid; - key_ = key_common_ + "-t:" + ss.str(); - } + : dev_ctx_(dev_ctx), + engine_(engine), + key_(base_key), + is_reusing_(false) {} std::shared_ptr AcquireSrcMemory( const mkldnn::memory::desc& md, void* ptr) { @@ -77,11 +73,16 @@ class MKLDNNHandler { auto local_key = key_ + suffix; auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); if (mem_p == nullptr) { mem_p = std::make_shared(mdp, ptr); dev_ctx_.SetBlob(local_key, mem_p); } else { mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; } return mem_p; } @@ -95,6 +96,8 @@ class MKLDNNHandler { auto local_key = key_ + suffix; auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); if (mem_p == nullptr) { // Call custom reorder/preprocessing func if available if (custom_func) { @@ -108,6 +111,9 @@ class MKLDNNHandler { dev_ctx_.SetBlob(local_key, mem_p); } else { mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; } return mem_p; } @@ -149,6 +155,8 @@ class MKLDNNHandler { auto target_memory_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); if (target_memory_p == nullptr) { target_memory_p = user_memory_p; std::shared_ptr reorder_p; @@ -179,6 +187,7 @@ class MKLDNNHandler { if (reorder_p != nullptr) { pipeline.push_back(*reorder_p); } + is_reusing_ = true; } return target_memory_p; } @@ -203,29 +212,25 @@ class MKLDNNHandler { dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); } - static void AppendKey( - std::string* key, const mkldnn::memory::dims& input_dims, - const mkldnn::memory::dims& weights_dims, const std::vector& strides, - const std::vector& paddings, const std::vector& dilations, - const int& groups, const mkldnn::memory::data_type& srcdt, - const mkldnn::memory::format& format, const bool& relu, - const bool& residual, const bool& brelu, const std::string& suffix) { + static void AppendKey(std::string* key, + const mkldnn::memory::dims& input_dims, + const mkldnn::memory::dims& weights_dims, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, const int& groups, + const mkldnn::memory::data_type& srcdt, + const mkldnn::memory::format& format, const bool& relu, + const bool& residual, const std::string& suffix) { AppendKeyDims(key, input_dims); - AppendKeyDims(key, weights_dims); - AppendKeyVec(key, strides); - AppendKeyVec(key, paddings); - AppendKeyVec(key, dilations); - AppendKey(key, std::to_string(groups)); AppendKey(key, std::to_string(srcdt)); AppendKey(key, std::to_string(format)); AppendKey(key, 
std::to_string(relu)); AppendKey(key, std::to_string(residual)); - AppendKey(key, std::to_string(brelu)); AppendKey(key, suffix); } @@ -259,7 +264,7 @@ class MKLDNNHandler { const MKLDNNDeviceContext& dev_ctx_; mkldnn::engine engine_; std::string key_; - std::string key_common_; + bool is_reusing_; public: static constexpr int MaxKeyLength = 256; @@ -281,6 +286,8 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { auto local_key = key_ + "@user_src_mem_p"; auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + " find mem primitive in device context"); if (mem_p == nullptr) { // Make memory descriptor using input format, unless it // cannot be trusted (nchw) then make up memory fmt manually @@ -296,6 +303,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { dev_ctx_.SetBlob(local_key, mem_p); } else { mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; } return mem_p; } @@ -305,17 +315,23 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { auto local_key = key_ + "@user_dst_mem_p"; auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + " find mem primitive in device context"); if (mem_p == nullptr) { auto dst_mdp = mkldnn::memory::primitive_desc{ Axis2MemoryDesc(dims_, axis_), engine_}; - auto dst_data = output->mutable_data(place, dst_mdp.get_size()); + auto dst_data = output->mutable_data( + place, paddle::memory::Allocator::kDefault, dst_mdp.get_size()); mem_p = std::make_shared(dst_mdp, dst_data); dev_ctx_.SetBlob(local_key, mem_p); } else { auto dst_data = output->mutable_data(place); mem_p->set_data_handle(dst_data); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. 
So we check consistency + is_reusing_ = true; } return mem_p; } @@ -326,10 +342,14 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { auto prim_key = key_ + "@transpose_p"; auto transpose_p = std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((transpose_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution primitive in device context"); if (transpose_p == nullptr) { transpose_p = std::make_shared(*(src_memory_p), *(dst_memory_p)); dev_ctx_.SetBlob(prim_key, transpose_p); + } else { + is_reusing_ = true; } return transpose_p; } @@ -376,83 +396,6 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { std::vector logical_axis_; }; -class ReorderMKLDNNHandler : public MKLDNNHandler { - public: - ReorderMKLDNNHandler(std::vector& dims, // NOLINT - framework::proto::VarType::Type vtype, - mkldnn::memory::data_type dtype, - const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - dims_(dims), - vtype_(vtype), - dtype_(dtype) {} - - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::format& fmt, void* ptr) { - auto local_key = key_ + "@user_src_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - auto src_md = platform::MKLDNNMemDesc(dims_, dtype_, fmt); - mem_p = std::make_shared( - mkldnn::memory::primitive_desc{src_md, engine_}, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireDstMemory( - framework::Tensor* output, const mkldnn::memory::format& fmt, - platform::Place place) { - auto local_key = key_ + "@user_dst_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_, fmt); - auto dst_mdp = mkldnn::memory::primitive_desc{dst_md, engine_}; - - auto dst_data = output->mutable_data(place, vtype_); - - mem_p = std::make_shared(dst_mdp, dst_data); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - auto dst_data = output->mutable_data(place, vtype_); - mem_p->set_data_handle(dst_data); - } - return mem_p; - } - - std::shared_ptr AcquireReorder( - std::shared_ptr dst_memory_p, - std::shared_ptr src_memory_p) { - auto prim_key = key_ + "@reorder_p"; - auto reorder_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (reorder_p == nullptr) { - reorder_p = - std::make_shared(*(src_memory_p), *(dst_memory_p)); - dev_ctx_.SetBlob(prim_key, reorder_p); - } - return reorder_p; - } - - static std::string GetHash(std::vector& shape, // NOLINT - mkldnn::memory::format in_fmt, - mkldnn::memory::format out_fmt, - const std::string& suffix) { - return dims2str(shape) + std::to_string(in_fmt) + "->" + - std::to_string(out_fmt) + "#" + suffix; - } - - private: - std::vector dims_; - framework::proto::VarType::Type vtype_; - mkldnn::memory::data_type dtype_; -}; - template struct convolutional_algorithm; @@ -619,9 +562,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { scale_data, mask); } - mkldnn::primitive_attr CreatePostOps(bool fuse_relu, bool fuse_residual_conn, - bool fuse_brelu, - float fuse_brelu_threshold) const { + mkldnn::primitive_attr CreatePostOps(bool fuse_relu, + bool fuse_residual_conn = false) const { mkldnn::primitive_attr conv_attr; mkldnn::post_ops post_operations; // Fusion with Elementwise layer relies on adding a sum post-operation with @@ -641,14 +583,6 @@ class 
ConvMKLDNNTemplateHandler : public MKLDNNHandler { post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, negative_slope, placeholder); } - - if (fuse_brelu) { - constexpr float scale = 1.0f; - constexpr float placeholder = 0.0f; - post_operations.append_eltwise(scale, - mkldnn::algorithm::eltwise_bounded_relu, - fuse_brelu_threshold, placeholder); - } conv_attr.set_post_ops(post_operations); return conv_attr; } @@ -660,45 +594,36 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { const mkldnn::memory::desc& dst, const std::vector& strides, const std::vector& paddings, const mkldnn::engine& engine, const bool fuse_relu, const bool fuse_residual_conn, - const bool fuse_brelu, const float fuse_brelu_threshold, mkldnn::prop_kind fwd_prop_kind) { - // Conv PD has to be passed to Grad op that - // may be exxecuted by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_conv_pd = key_common_ + "@conv_pd"; + const std::string key_conv_pd = key_ + "@conv_pd"; - conv_pd_ = std::static_pointer_cast( + auto conv_pd = std::static_pointer_cast( dev_ctx_.GetBlob(key_conv_pd)); - if (conv_pd_ == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - - conv_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_conv_pd)); - if (conv_pd_ == nullptr) { - mkldnn::memory::dims stride_dims = strides; - mkldnn::memory::dims padding_dims = paddings; - - auto conv_desc = - bias ? typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, - src, weights, *bias, dst, stride_dims, padding_dims, - padding_dims, mkldnn::padding_kind::zero) - : typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, - src, weights, dst, stride_dims, padding_dims, - padding_dims, mkldnn::padding_kind::zero); - - mkldnn::primitive_attr conv_attr = CreatePostOps( - fuse_relu, fuse_residual_conn, fuse_brelu, fuse_brelu_threshold); - - conv_pd_.reset(new typename forward_t::primitive_desc( - conv_desc, conv_attr, engine)); - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx_.SetBlob(key_conv_pd, conv_pd_); - } + if (conv_pd == nullptr) { + mkldnn::memory::dims stride_dims = strides; + mkldnn::memory::dims padding_dims = paddings; + + auto conv_desc = + bias ? 
typename forward_t::desc( + fwd_prop_kind, convolutional_algorithm::T, src, + weights, *bias, dst, stride_dims, padding_dims, + padding_dims, mkldnn::padding_kind::zero) + : typename forward_t::desc( + fwd_prop_kind, convolutional_algorithm::T, src, + weights, dst, stride_dims, padding_dims, padding_dims, + mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, fuse_residual_conn); + + conv_pd_.reset( + new typename forward_t::primitive_desc(conv_desc, conv_attr, engine)); + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx_.SetBlob(key_conv_pd, conv_pd_); + } else { + conv_pd_ = conv_pd; + is_reusing_ = true; } return conv_pd_; @@ -711,11 +636,15 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { auto prim_key = key_ + "@conv_p"; auto conv_p = std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution primitive in device context"); if (conv_p == nullptr) { conv_p = std::make_shared(*conv_pd_, *src_memory_p, *weights_memory_p, *dst_memory_p); dev_ctx_.SetBlob(prim_key, conv_p); + } else { + is_reusing_ = true; } return conv_p; } @@ -728,12 +657,16 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { auto prim_key = key_ + "@conv_p"; auto conv_p = std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution primitive in device context"); if (conv_p == nullptr) { conv_p = std::make_shared(*conv_pd_, *src_memory_p, *weights_memory_p, *bias_memory_p, *dst_memory_p); dev_ctx_.SetBlob(prim_key, conv_p); + } else { + is_reusing_ = true; } return conv_p; } @@ -745,12 +678,17 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { auto prim_key = key_ + "@conv_bwd_weights_p"; auto conv_bwd_weights_p = std::static_pointer_cast( dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE( + (conv_bwd_weights_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution bwd weights primitive in device context"); if (conv_bwd_weights_p == nullptr) { // create backward conv primitive for weights conv_bwd_weights_p = std::make_shared( *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p, *diff_weights_memory_p); dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); + } else { + is_reusing_ = true; } return conv_bwd_weights_p; } @@ -762,31 +700,20 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { auto prim_key = key_ + "@conv_bwd_data_p"; auto conv_bwd_data_p = std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE( + (conv_bwd_data_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution bwd data primitive in device context"); if (conv_bwd_data_p == nullptr) { conv_bwd_data_p = std::make_shared( *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p, *diff_src_memory_p); dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); + } else { + is_reusing_ = true; } return conv_bwd_data_p; } - // Generate keys for storing/retriving primitives for this operator - // TODO(jczaja): Make hashing function more optimial - static std::string GetHash(mkldnn::memory::dims& input_dims, // NOLINT - mkldnn::memory::dims& weights_dims, // NOLINT - const bool& fuse_relu, // NOLINT - const bool& fuse_brelu, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT - std::vector& dilations, // NOLINT - int groups, const std::string& suffix) { - return dims2str(input_dims) + dims2str(weights_dims) + - std::to_string(fuse_relu) + 
std::to_string(fuse_brelu) + - dims2str(strides) + dims2str(paddings) + dims2str(dilations) + - std::to_string(groups) + suffix; - } - // Generate keys for storing/retriving primitives for this operator // TODO(jczaja): Make hashing function more optimial static std::string GetHash(mkldnn::memory::dims& input_dims, // NOLINT @@ -821,8 +748,9 @@ template static std::shared_ptr SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output, const std::shared_ptr& handler) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); std::shared_ptr dst_memory_p = handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); return dst_memory_p; @@ -853,8 +781,9 @@ static void SetDstMemoryHandler( const framework::ExecutionContext& ctx, framework::Tensor* output, const std::shared_ptr& handler, std::shared_ptr* dst_memory_p) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); (*dst_memory_p)->set_data_handle(to_void_cast(output_data)); } diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index d79ff6e2b98..b8b14b3d15e 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -124,8 +124,8 @@ struct NCCLContextMap { } else { rank = trainer_id; } - VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks - << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; + VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks + << " gpu id: " << gpu_id; PADDLE_ENFORCE(cudaSetDevice(gpu_id)); PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( comms.get() + i, nranks, *nccl_id, rank)); @@ -160,160 +160,6 @@ struct NCCLContextMap { } }; -inline std::string GetFlatNCCLVarName(size_t pos) { - if (pos == 0) { - return NCCL_ID_VARNAME; - } - return string::Sprintf("%s_%d", NCCL_ID_VARNAME, static_cast(pos)); -} - -inline std::string GetHierarchicalExterNCCLVarName(size_t pos) { - return string::Sprintf("Hierarchical_exter_%s_%d", NCCL_ID_VARNAME, - static_cast(pos)); -} -inline std::string GetHierarchicalInterNCCLVarName(size_t pos) { - return string::Sprintf("Hierarchical_inter_%s_%d", NCCL_ID_VARNAME, - static_cast(pos)); -} - -class NCCLCommunicator { - public: - NCCLCommunicator() {} - virtual ~NCCLCommunicator() {} - - NCCLContextMap *DefaultFlatCtx() const { - if (flat_ctxs_.size() == 0) { - return nullptr; - } - - return flat_ctxs_[0].get(); - } - - std::vector> *GetFlatCtxs() { - return &flat_ctxs_; - } - - NCCLContextMap *GetFlatCtx(size_t run_order) const { - return flat_ctxs_[run_order % flat_ctxs_.size()].get(); - } - - NCCLContextMap *GetRunEnvNCCLCtx(size_t run_order, - bool use_hierarchical_allreduce) const { - if (!use_hierarchical_allreduce) { - return GetFlatCtx(run_order); - } - - return GetHierarchicalInterCtx(run_order); - } - - /* - *When nccl inits nccl comm using ncclCommInitAll, it meets error when - *allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So - *create a new nccl comm for sync_batch_norm_op. And these codes should be - *polished with a unified nccl management. 
- */ - NCCLContextMap *GetSyncBatchNormCtx( - framework::Scope *scope, const std::vector &places) { - auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); - if (nccl_id_var != nullptr) { - return DefaultFlatCtx(); - } - - if (sync_batch_norm_ctx_.get() == nullptr) { - sync_batch_norm_ctx_.reset(new NCCLContextMap(places)); - } - return sync_batch_norm_ctx_.get(); - } - - void InitFlatCtxs(const std::vector &places, - const std::vector &nccl_ids, - size_t trainers_num, size_t trainer_id) { - if (nccl_ids.size() == 0) { - auto ptr = new platform::NCCLContextMap(places); - VLOG(1) << "init local trainer"; - flat_ctxs_.emplace_back(ptr); - return; - } - - for (size_t i = 0; i < nccl_ids.size(); i++) { - auto ptr = new platform::NCCLContextMap(places, nccl_ids[i], trainers_num, - trainer_id); - VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i; - flat_ctxs_.emplace_back(ptr); - } - } - - void InitHierarchicalCtxs(const std::vector &places, - const std::vector &inter_nccl_ids, - const std::vector &exter_nccl_ids, - size_t trainers_num, size_t trainer_id, - size_t inter_trainers_num, - size_t exter_trainers_num) { - PADDLE_ENFORCE(trainers_num == inter_trainers_num * exter_trainers_num, - "trainers_num:%llu != inter_trainers_num:%llu * " - "exter_trainers_num:%llu", - trainers_num, inter_trainers_num, exter_trainers_num); - - PADDLE_ENFORCE(inter_trainers_num > 1, "inter_trainers_num:%llu must > 1", - inter_trainers_num); - - int inter_trainer_id = trainer_id % inter_trainers_num; - for (size_t i = 0; i < inter_nccl_ids.size(); i++) { - VLOG(1) << "init inter_trainer_id:" << inter_trainer_id - << ", comm no:" << i; - auto local = new NCCLContextMap(places, inter_nccl_ids[i], - inter_trainers_num, inter_trainer_id); - - h_inter_ctxs_.emplace_back(local); - } - - int exter_trainer_id = -1; - if (trainer_id % inter_trainers_num == 0) { - exter_trainer_id = trainer_id / inter_trainers_num; - } - - if (exter_trainer_id >= 0) { - for (size_t i = 0; i < exter_nccl_ids.size(); i++) { - auto ex = new NCCLContextMap(places, exter_nccl_ids[i], - exter_trainers_num, exter_trainer_id); - VLOG(1) << "init exter_trainer_id:" << exter_trainer_id - << ", comm no:" << i; - h_exter_ctxs_.emplace_back(ex); - } - } - } - - bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; } - - NCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const { - return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get(); - } - - NCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const { - return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get(); - } - - std::vector> *GetHierarchicalInterCtxs() { - return &h_inter_ctxs_; - } - - std::vector> *GetHierarchicalExterCtxs() { - return &h_exter_ctxs_; - } - - protected: - // Support multi nccl comm on default nccl ring while NCCLContextMap can't. - std::vector> flat_ctxs_; - - // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce. - // And h_exter_ctxs_ can support multi comm too. - std::vector> h_inter_ctxs_; - std::vector> h_exter_ctxs_; - - // just used for sync_batch_norm op. 
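The NCCLCommunicator being deleted above picks a communicator ring per step with run_order % ctxs.size(), so consecutive steps rotate over however many rings were initialized. A tiny sketch of that round-robin selection (toy context type; the real maps hold per-device NCCL communicators):

#include <cstddef>
#include <memory>
#include <vector>

struct CtxSketch { int ring_id; };

// Same selection rule as GetFlatCtx / GetHierarchicalInterCtx above.
CtxSketch* SelectCtx(const std::vector<std::unique_ptr<CtxSketch>>& ctxs,
                     std::size_t run_order) {
  return ctxs[run_order % ctxs.size()].get();
}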
- std::unique_ptr sync_batch_norm_ctx_; -}; - } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h index 2bacd5bd4c3..9e6521653b8 100644 --- a/paddle/fluid/platform/ngraph_helper.h +++ b/paddle/fluid/platform/ngraph_helper.h @@ -77,7 +77,9 @@ std::shared_ptr GetNode( std::unordered_map>> ngb_node_map) { auto& var_names = var_map.at(name); - if (var_names.size() == 0) return nullptr; + PADDLE_ENFORCE_EQ(var_names.size(), 1, + "op %s name %s expects one associated var", op->Type(), + name); if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { return (*ngb_node_map)[var_names[0]]; } else { @@ -130,6 +132,16 @@ void SetOutputNode( ngb_node_map) { auto& var_names = op->Outputs().at(name); if (var_names.size() == 1) { + /* */ + auto dummy_out = GetOutputNode(op, name, ngb_node_map); + if (dummy_out && dummy_out->get_shape() != node->get_shape()) { + node = NgReshaper(node, dummy_out->get_shape()); + } + if (dummy_out && + dummy_out->get_element_type() != node->get_element_type()) { + node = std::make_shared( + node, dummy_out->get_element_type()); + } (*ngb_node_map)[var_names[0]] = node; } else if (var_names.size() == 0) { (*ngb_node_map)[""] = node; @@ -177,22 +189,6 @@ inline void TrimTrailingSingularDims(ngraph::Shape* shape) { } } } - -ngraph::element::Type GetNgType(paddle::framework::proto::VarType::Type dtype) { - ngraph::element::Type ng_dtype; - if (dtype == paddle::framework::proto::VarType::FP32) { - ng_dtype = ngraph::element::f32; - } else if (dtype == paddle::framework::proto::VarType::FP64) { - ng_dtype = ngraph::element::f64; - } else if (dtype == paddle::framework::proto::VarType::INT64) { - ng_dtype = ngraph::element::i64; - } else if (dtype == paddle::framework::proto::VarType::INT32) { - ng_dtype = ngraph::element::i32; - } else { - PADDLE_THROW("unsupported data type: %s", dtype); - } - return ng_dtype; -} } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc index 6177b024f0c..d489ed5368e 100644 --- a/paddle/fluid/platform/temporary_allocator.cc +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include +#include #include "paddle/fluid/memory/allocation/allocator_facade.h" DEFINE_int64(limit_of_tmp_allocation, -1, @@ -30,31 +31,38 @@ namespace paddle { namespace platform { namespace alloc = memory::allocation; +TemporaryAllocation::TemporaryAllocation( + alloc::AllocationPtr &&underlying_allocation) + : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)) {} + TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { - temp_mem_map_.reset(new std::multimap()); + temp_mem_map_.reset(new std::multimap()); } bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } void TemporaryAllocator::Release(const std::function &callback) { - std::unique_ptr> t_allocations; + std::unique_ptr> t_allocations; { std::unique_lock lock(mtx_); callback(); t_allocations.swap(temp_mem_map_); - temp_mem_map_.reset(new std::multimap()); + temp_mem_map_.reset(new std::multimap()); wait_delete_mem_ = 0; } - alloc::AllocationDeleter deleter; for (auto tmp : *t_allocations) { VLOG(10) << "Delete temporary allocation " << tmp.second->ptr() << " size: " << tmp.second->size(); - 
deleter(tmp.second); + delete tmp.second; } } -void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { +void TemporaryAllocator::Free(alloc::Allocation *allocation) { + auto *temp_allocation = dynamic_cast<TemporaryAllocation *>(allocation); + PADDLE_ENFORCE_NOT_NULL(temp_allocation); if (platform::is_gpu_place(temp_allocation->place())) { PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_), "The place should be the same."); @@ -69,8 +77,8 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { << "wait_delete_mem: " << wait_delete_mem; } - if (FLAGS_limit_of_tmp_allocation >= 0 && - wait_delete_mem >= static_cast<int64_t>(FLAGS_limit_of_tmp_allocation)) { + if (FLAGS_limit_of_tmp_allocation > 0 && + wait_delete_mem > static_cast<int64_t>(FLAGS_limit_of_tmp_allocation)) { PADDLE_ENFORCE(callback_ != nullptr, "The callback is non-initialized."); Release(callback_); } @@ -78,7 +86,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr() << " size: " << temp_allocation->size(); - alloc::AllocationDeleter()(temp_allocation); + delete temp_allocation; } size_t TemporaryAllocator::TemporaryAllocationQueueSize() { @@ -90,7 +98,8 @@ void TemporaryAllocator::SetCallback(const std::function<void()> &callback) { callback_ = callback; } -alloc::Allocation *TemporaryAllocator::AllocateImpl(size_t size) { +alloc::Allocation *TemporaryAllocator::AllocateImpl( + size_t size, alloc::Allocator::Attr attr) { { // Find available allocation in temp_mem_map. std::unique_lock<std::mutex> lock(mtx_); @@ -112,9 +121,11 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl(size_t size) { } // If not find the the available allocation, get allocation from // AllocatorFacadeInstance. - auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size); + auto raw_allocation = + alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); + auto temp_mem = new TemporaryAllocation(std::move(raw_allocation)); VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size; - return temp_mem.release(); + return temp_mem; } } // namespace platform diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index 41f0e4a80b7..f8a43b889d5 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -23,6 +23,14 @@ namespace paddle { namespace platform { +class TemporaryAllocation : public memory::allocation::Allocation { + public: + explicit TemporaryAllocation( + memory::allocation::AllocationPtr &&underlying_allocation); + + memory::allocation::AllocationPtr underlying_allocation_; +}; + /*! \brief the TemporaryAllocator is used to alloc the temporary allocation * which used by CUDA's async operation. * @@ -49,16 +57,17 @@ class TemporaryAllocator : public memory::allocation::Allocator { void SetCallback(const std::function<void()> &callback); protected: - void FreeImpl(memory::allocation::Allocation *allocation) override; + void Free(memory::allocation::Allocation *allocation) override; - memory::allocation::Allocation *AllocateImpl(size_t size) override; + memory::allocation::Allocation *AllocateImpl( + size_t size, memory::allocation::Allocator::Attr attr) override; private: platform::Place place_; // When the allocation is not held by any variable, it should be placed // to temp_mem_map immediately.
- std::unique_ptr<std::multimap<size_t, memory::allocation::Allocation *>> - temp_mem_map_{nullptr}; + std::unique_ptr<std::multimap<size_t, TemporaryAllocation *>> temp_mem_map_{ + nullptr}; std::mutex mtx_; size_t wait_delete_mem_{0}; std::function<void()> callback_; diff --git a/paddle/fluid/platform/timer.h b/paddle/fluid/platform/timer.h index ff0e1d95c29..56019ae7cf2 100644 --- a/paddle/fluid/platform/timer.h +++ b/paddle/fluid/platform/timer.h @@ -50,7 +50,7 @@ class Timer { struct timeval _start; struct timeval _now; int _count; - int64_t _elapsed; + int _elapsed; bool _paused; // get us difference between start and now diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index eeee507110c..bee702519dc 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,33 +1,18 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper nccl_wrapper prune - feed_fetch_method pass_builder parallel_executor profiler layer scope_pool - tracer analysis_predictor imperative_profiler nccl_context) +message(STATUS "use ${x86_kernels}") +message(STATUS "use ${ops_lite}") if(WITH_PYTHON) - list(APPEND PYBIND_DEPS py_func_op) -endif() - -if (WITH_DISTRIBUTE) - list(APPEND PYBIND_DEPS communicator) -endif() + cc_library(bind_executor_lite SRCS executor_lite.cc DEPS pybind framework_proto) + set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper nccl_wrapper prune + feed_fetch_method pass_builder parallel_executor profiler layer scope_pool bind_executor_lite cxx_api_lite scope_lite ${ops_lite} ${host_kernels} ${x86_kernels} mir_passes kernel_lite op_lite optimizer_lite + tracer analysis_predictor imperative_profiler nccl_context) +endif(WITH_PYTHON) -set(PYBIND_SRCS - pybind.cc - exception.cc - protobuf.cc - const_value.cc - recordio.cc - reader_py.cc - async_executor_py.cc - fleet_wrapper_py.cc - nccl_wrapper_py.cc - data_set_py.cc - imperative.cc - ir.cc - inference_api.cc) -if (WITH_DISTRIBUTE) - list(APPEND PYBIND_SRCS communicator_py.cc) +if(WITH_PYTHON) + list(APPEND PYBIND_DEPS py_func_op) endif() +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc fleet_wrapper_py.cc nccl_wrapper_py.cc data_set_py.cc imperative.cc ir.cc inference_api.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 3e2c976076a..3f171b65ab8 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -66,9 +66,7 @@ void BindDataset(py::module* m) { .def("load_into_memory", &framework::Dataset::LoadIntoMemory) .def("release_memory", &framework::Dataset::ReleaseMemory) .def("local_shuffle", &framework::Dataset::LocalShuffle) - .def("global_shuffle", &framework::Dataset::GlobalShuffle) - .def("get_memory_data_size", &framework::Dataset::GetMemoryDataSize) - .def("get_shuffle_data_size", &framework::Dataset::GetShuffleDataSize); + .def("global_shuffle", &framework::Dataset::GlobalShuffle); } } // end namespace pybind diff --git a/paddle/fluid/pybind/executor_lite.cc b/paddle/fluid/pybind/executor_lite.cc new file mode 100644 index 00000000000..2ca4e1dce34 --- /dev/null +++ b/paddle/fluid/pybind/executor_lite.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
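temporary_allocator.h above reintroduces TemporaryAllocation as a thin wrapper: it reports the same ptr/size as the allocation it wraps and keeps the underlying allocation alive until the wrapper itself is freed, which is what lets Free() park temporaries in the size-keyed temp_mem_map_ for later reuse. A toy model of the wrapper (illustrative types, not the Paddle allocator classes):

#include <cstddef>
#include <memory>
#include <utility>

struct AllocationSketch {
  void* ptr = nullptr;
  std::size_t size = 0;
  virtual ~AllocationSketch() = default;
};

struct TemporaryAllocationSketch : AllocationSketch {
  explicit TemporaryAllocationSketch(
      std::unique_ptr<AllocationSketch>&& underlying)
      : underlying_(std::move(underlying)) {
    ptr = underlying_->ptr;   // expose the wrapped buffer directly
    size = underlying_->size;
  }
  // Released only when this wrapper is deleted, so the allocator can queue
  // the wrapper in a size-keyed multimap and hand the buffer out again.
  std::unique_ptr<AllocationSketch> underlying_;
};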
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/executor_lite.h" +#include +#include +#include +#include "paddle/fluid/lite/api/cxx_api.h" +#include "paddle/fluid/lite/api/paddle_use_passes.h" +#include "paddle/fluid/lite/core/hvy_tensor.h" +#include "paddle/fluid/lite/core/scope.h" +#include "pybind11/pybind11.h" + +namespace lt = paddle::lite; +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindTensor(pybind11::module* m) { + pybind11::class_(*m, "Tensor") + .def(pybind11::init<>()) + .def("raw_tensor", [](lt::TensorHvy& self) { return self.raw_tensor(); }) + .def("share_data_with", + [](lt::TensorHvy& self, const framework::Tensor& other) { + self.ShareDataWith(other); + }); +} + +void BindVariable(pybind11::module* m) { + pybind11::class_(*m, "Variable") + .def("get_mutable_tensor", + [](lt::Variable& self) { return self.GetMutable(); }) + .def("get_mutable_fetch_list", + [](lt::Variable& self) -> paddle::lite::FeedFetchList* { + return self.GetMutable(); + }, + py::return_value_policy::reference); +} + +void BindScope(pybind11::module* m) { + py::class_>(*m, "Scope") + .def(pybind11::init<>()) + .def("new_scope", + [](lt::Scope& self) -> lt::Scope* { return &self.NewScope(); }, + py::return_value_policy::reference) + .def("var", <::Scope::Var, pybind11::return_value_policy::reference) + .def("find_var", <::Scope::FindVar, + pybind11::return_value_policy::reference) + .def("find_local_var", <::Scope::FindLocalVar, + pybind11::return_value_policy::reference) + .def("parent", <::Scope::parent, + pybind11::return_value_policy::reference) + .def("local_var_names", <::Scope::LocalVarNames, + pybind11::return_value_policy::reference); +} + +void BindExecutorLite(pybind11::module* m) { + py::class_(*m, "Predictor") + .def(pybind11::init<>()) + .def("__init__", + [](lt::Predictor& self, + const std::shared_ptr& root_scope) { + new (&self) lt::Predictor(root_scope); + }) + .def("get_input", <::Predictor::GetInput, + pybind11::return_value_policy::reference) + .def("get_output", <::Predictor::GetOutput, + pybind11::return_value_policy::reference) + .def("run", [](lt::Predictor& self) { self.Run(); }) + .def("run", [](lt::Predictor& self, + const std::vector& tensors) { + self.Run(tensors); + }); +} + +void BindEnums(pybind11::module* m) { + py::enum_(*m, "TargetType", py::arithmetic(), + "TargetType enum") + .value("kUnk", lt::TargetType::kUnk) + .value("kHost", lt::TargetType::kHost) + .value("kX86", lt::TargetType::kX86) + .value("kCUDA", lt::TargetType::kCUDA) + .value("kARM", lt::TargetType::kARM) + .value("kAny", lt::TargetType::kAny) + .value("NUM", lt::TargetType::NUM); + + py::enum_(*m, "PrecisionType", py::arithmetic(), + "PrecisionType enum") + .value("kUnk", lt::PrecisionType::kUnk) + .value("kFloat", lt::PrecisionType::kFloat) + .value("kInt8", lt::PrecisionType::kInt8) + .value("kAny", lt::PrecisionType::kAny) + .value("NUM", lt::PrecisionType::NUM); + + py::enum_(*m, "DataLayoutType", py::arithmetic(), + "DataLayoutType enum") + .value("kUnk", lt::DataLayoutType::kUnk) + .value("kNCHW", lt::DataLayoutType::kNCHW) + .value("kAny", lt::DataLayoutType::kAny) + .value("NUM", 
lt::DataLayoutType::NUM); +} + +void BindPlace(pybind11::module* m) { + pybind11::class_>(*m, "Place") + .def(pybind11::init<>()) + .def("__init__", + [](lt::Place& self, lt::TargetType target, + lt::PrecisionType precision, lt::DataLayoutType layout, + int16_t device) { + new (&self) lt::Place(target, precision, layout, device); + }) + .def("is_valid", <::Place::is_valid, + pybind11::return_value_policy::reference); +} + +void BindCXXTrainer(pybind11::module* m) { + pybind11::class_>( + *m, "CXXTrainer") + .def( + "__init__", + [](lt::CXXTrainer& self, const std::shared_ptr& root_scope, + const lt::Place& preferred_place, + const std::vector& valid_places) { + new (&self) + lt::CXXTrainer(root_scope, preferred_place, valid_places); + }) + .def("build_main_program_executor", + [](lt::CXXTrainer& self, + framework::ProgramDesc& desc) -> lt::Predictor& { + return self.BuildMainProgramExecutor(desc); + }, + pybind11::return_value_policy::reference) + .def("run_startup_program", + [](lt::CXXTrainer& self, framework::ProgramDesc& desc) { + return self.RunStartupProgram(desc); + }); +} + +void BindLite(pybind11::module* m) { + BindTensor(m); + BindVariable(m); + BindScope(m); + BindExecutorLite(m); + BindEnums(m); + BindPlace(m); + BindCXXTrainer(m); +} + +} // namespace pybind +} // namespace paddle + +// USE_LITE_OP(mul); +USE_LITE_OP(elementwise_sub); +USE_LITE_OP(uniform_random); +USE_LITE_OP(feed); +USE_LITE_OP(fetch); +USE_LITE_OP(fill_constant); +USE_LITE_OP(mul); +USE_LITE_OP(mul_grad); +USE_LITE_OP(mean); +USE_LITE_OP(square); +USE_LITE_OP(sgd); + +USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); +USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); + +#ifdef LITE_WITH_X86 +USE_LITE_KERNEL(uniform_random, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(fill_constant, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(mul_grad, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(square, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(mean, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(sgd, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_sub, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_sub_grad, kX86, kFloat, kNCHW, def); +#endif diff --git a/paddle/fluid/pybind/executor_lite.h b/paddle/fluid/pybind/executor_lite.h new file mode 100644 index 00000000000..c53e92d31f9 --- /dev/null +++ b/paddle/fluid/pybind/executor_lite.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
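executor_lite.cc above hands raw Scope and Variable pointers back to Python and relies on py::return_value_policy::reference so Python never deletes objects the C++ side owns. A reduced sketch of that ownership choice (toy Scope type and module name, not the lite API):

#include <pybind11/pybind11.h>
#include <memory>
#include <vector>

namespace py = pybind11;

// Toy scope: children are owned by the parent, never by Python.
struct ToyScope {
  ToyScope* NewScope() {
    children_.emplace_back(new ToyScope);
    return children_.back().get();
  }
  std::vector<std::unique_ptr<ToyScope>> children_;
};

PYBIND11_MODULE(toy_lite, m) {
  py::class_<ToyScope>(m, "Scope")
      .def(py::init<>())
      // `reference` keeps pybind11 from taking ownership of the returned
      // pointer; its lifetime stays tied to the parent, as in BindScope.
      .def("new_scope", &ToyScope::NewScope,
           py::return_value_policy::reference);
}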
*/ +#pragma once + +#include + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace pybind { + +void BindLite(pybind11::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index d279ff3d9e4..2f6a7d2480a 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -47,17 +47,12 @@ void BindFleetWrapper(py::module* m) { .def("run_server", &framework::FleetWrapper::RunServer) .def("init_worker", &framework::FleetWrapper::InitWorker) .def("init_model", &framework::FleetWrapper::PushDenseParamSync) - .def("save_model", &framework::FleetWrapper::SaveModel) - .def("load_model", &framework::FleetWrapper::LoadModel) .def("stop_server", &framework::FleetWrapper::StopServer) .def("gather_servers", &framework::FleetWrapper::GatherServers) .def("gather_clients", &framework::FleetWrapper::GatherClients) .def("get_clients_info", &framework::FleetWrapper::GetClientsInfo) .def("create_client2client_connection", - &framework::FleetWrapper::CreateClient2ClientConnection) - .def("shrink_sparse_table", &framework::FleetWrapper::ShrinkSparseTable) - .def("shrink_dense_table", &framework::FleetWrapper::ShrinkDenseTable) - .def("client_flush", &framework::FleetWrapper::ClientFlush); + &framework::FleetWrapper::CreateClient2ClientConnection); } // end FleetWrapper } // end namespace pybind } // end namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 0d15b9a44d8..265707f1bcc 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -14,18 +14,12 @@ limitations under the License. */ #include "paddle/fluid/pybind/imperative.h" -#include #include #include #include #include -#include -#include -#include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/imperative/layer.h" -#include "paddle/fluid/imperative/profiler.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" @@ -34,318 +28,77 @@ limitations under the License. */ namespace paddle { namespace pybind { -namespace py = ::pybind11; - -class Layer : public imperative::Layer { - public: - using imperative::Layer::Layer; // Inherit constructors - - std::vector> Forward( - const std::vector> &inputs) - override { - PYBIND11_OVERLOAD(std::vector>, Layer, - Forward, - inputs); // NOLINT - } -}; - -class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase { - public: - using imperative::OpBase::OpBase; // Inherit constructors - - PyOpBase(const std::string &name) : OpBase(name) {} -}; - -// Function like obj.attr_name in Python. -static PyObject *GetPythonAttribute(PyObject *obj, const char *attr_name) { - // NOTE(zjl): PyObject_GetAttrString would return nullptr when attr_name - // is not inside obj, but it would also set the error flag of Python. - // If the error flag is set in C++, C++ code would not raise Exception, - // but Python would raise Exception once C++ call ends. - // To avoid unexpected Exception raised in Python, we check whether - // attribute exists before calling PyObject_GetAttrString. - // - // Caution: PyObject_GetAttrString would increase reference count of PyObject. - // Developer should call Py_DECREF manually after the attribute is not used. 
- if (PyObject_HasAttrString(obj, attr_name)) { - return PyObject_GetAttrString(obj, attr_name); - } else { - return nullptr; - } -} - -template -static T PyObjectCast(PyObject *obj) { - try { - return py::cast(py::handle(obj)); - } catch (py::cast_error &) { - PADDLE_THROW("Python object is not type of %s", typeid(T).name()); - } -} - -// NOTE(zjl): py::handle is a very light wrapper of PyObject *. -// Unlike py::object, py::handle does not change reference count of PyObject *. -static std::vector> -GetVarBaseListFromPyHandle(const py::handle &handle) { - PyObject *py_obj = handle.ptr(); // get underlying PyObject - // Python None is not nullptr in C++! - if (!py_obj || py_obj == Py_None) { - return {}; - } - - const char *kIVarField = "_ivar"; - PyObject *py_ivar = GetPythonAttribute(py_obj, kIVarField); - std::vector> result; - - if (py_ivar) { // Variable - result.emplace_back( - PyObjectCast>(py_ivar)); - Py_DECREF(py_ivar); - } else if (PyList_Check(py_obj)) { // List of Variable - size_t len = PyList_GET_SIZE(py_obj); - result.reserve(len); - for (size_t i = 0; i < len; ++i) { - PyObject *py_ivar = - PyObject_GetAttrString(PyList_GET_ITEM(py_obj, i), kIVarField); - PADDLE_ENFORCE_NOT_NULL(py_ivar); - result.emplace_back( - PyObjectCast>(py_ivar)); - Py_DECREF(py_ivar); - } - } else if (PyTuple_Check(py_obj)) { // Tuple of Variable - size_t len = PyTuple_GET_SIZE(py_obj); - result.reserve(len); - for (size_t i = 0; i < len; ++i) { - PyObject *py_ivar = - PyObject_GetAttrString(PyTuple_GET_ITEM(py_obj, i), kIVarField); - PADDLE_ENFORCE_NOT_NULL(py_ivar); - result.emplace_back( - PyObjectCast>(py_ivar)); - Py_DECREF(py_ivar); - } - } else { - PADDLE_THROW( - "unsupported type %s, must be Variable, List[Variable] or " - "tuple[Variable]", - py::str(handle)); - } - - PADDLE_ENFORCE(PyErr_Occurred() == nullptr, - py::str(py::handle(PyErr_Occurred()))); - - return result; -} - -using PyVarBaseMap = std::unordered_map; - -static imperative::VarBasePtrMap ConvertToVarBasePtrMap( - const PyVarBaseMap &map) { - imperative::VarBasePtrMap result; - for (auto &pair : map) { - auto var_vec = GetVarBaseListFromPyHandle(pair.second); - if (!var_vec.empty()) { - result.emplace(pair.first, std::move(var_vec)); - } - } - return result; -} - // Bind Methods -void BindImperative(pybind11::module *m_ptr) { - auto &m = *m_ptr; - - py::class_ backward_strategy( - m, "BackwardStrategy", R"DOC( - - BackwardStrategy is a descriptor of a how to run the backward process. Now it has: - - 1. :code:`sort_sum_gradient`, which will sum the gradient by the reverse order of trace. - - Examples: - - .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid - from paddle.fluid import FC - - x = np.ones([2, 2], np.float32) - with fluid.dygraph.guard(): - inputs2 = [] - for _ in range(10): - inputs2.append(fluid.dygraph.base.to_variable(x)) - ret2 = fluid.layers.sums(inputs2) - loss2 = fluid.layers.reduce_sum(ret2) - backward_strategy = fluid.dygraph.BackwardStrategy() - backward_strategy.sort_sum_gradient = True - loss2.backward(backward_strategy) - )DOC"); - backward_strategy.def(py::init()) - .def_property("sort_sum_gradient", - [](const imperative::detail::BackwardStrategy &self) { - return self.sorted_sum_gradient_; - }, - [](imperative::detail::BackwardStrategy &self, - bool sorted_sum_gradient) { - self.sorted_sum_gradient_ = sorted_sum_gradient; - }); - - m.def("start_imperative_gperf_profiler", - []() { imperative::StartProfile(); }); - - m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); }); - - m.def("_is_dygraph_debug_enabled", - []() { return imperative::IsDebugEnabled(); }); - m.def("_dygraph_debug_level", []() { return imperative::GetDebugLevel(); }); - - py::class_>( - m, "VarBase", R"DOC()DOC") - .def_static("_alive_vars", &imperative::VarBase::AliveVarNames) - .def( - py::init, const paddle::platform::CPUPlace, - bool, bool>()) - .def( - py::init, - const paddle::platform::CUDAPlace, bool, bool>()) - .def("_run_backward", - [](imperative::VarBase &self, - const imperative::detail::BackwardStrategy &bckst) { - self.RunBackward(bckst); - }) - .def("_grad_name", &imperative::VarBase::GradName) - .def("_grad_value", &imperative::VarBase::GradValue) - .def("_clear_gradient", &imperative::VarBase::ClearGradient) - .def("_grad_ivar", - [](const imperative::VarBase &self) { return self.grads_; }, - py::return_value_policy::reference) - .def("_copy_to", - [](const imperative::VarBase &self, const platform::CPUPlace &place, - bool blocking) { - return self.NewVarBase(place, blocking).release(); - }, - py::return_value_policy::take_ownership) - .def("_copy_to", - [](const imperative::VarBase &self, const platform::CUDAPlace &place, - bool blocking) { - return self.NewVarBase(place, blocking).release(); - }, - py::return_value_policy::take_ownership) - .def("value", - [](const imperative::VarBase &self) { return self.var_.get(); }, - py::return_value_policy::reference) - .def_property("name", &imperative::VarBase::Name, - &imperative::VarBase::SetName) - .def_property_readonly("shape", &imperative::VarBase::Shape) - .def_property_readonly("dtype", &imperative::VarBase::DataType) - .def_property("persistable", &imperative::VarBase::IsPersistable, - &imperative::VarBase::SetPersistable) - .def_property("stop_gradient", &imperative::VarBase::IsStopGradient, - &imperative::VarBase::SetStopGradient); - - py::class_(m, "OpBase", R"DOC()DOC") - .def(py::init()) - .def("register_backward_hooks", - [](imperative::OpBase &self, const py::object &callable) { - self.RegisterBackwardHooks(callable); - }) - .def_property("_trace_id", - [](const imperative::OpBase &self) { - py::gil_scoped_release release; - return self.trace_id_; - }, - [](imperative::OpBase &self, int trace_id) { - py::gil_scoped_release release; - self.trace_id_ = trace_id; - }, - py::return_value_policy::reference) - .def_property_readonly("type", &imperative::OpBase::Type); - - py::class_ layer(m, "Layer"); - layer.def(py::init<>()) - .def("forward", - [](imperative::Layer &self, - const std::vector> &inputs) { - return self.Forward(inputs); - }); - - // NOTE(zjl): Tracer use PyVarBaseMap 
as its parameter but not VarBasePtrMap. - // We call Python C-API to convert PyVarBaseMap to VarBasePtrMap, instead - // making conversion in Python code. This speed up Tracer.trace() about 6% - // in ptb model and make time cost in Python to be nearly zero. - py::class_(m, "Tracer", "") +void BindImperative(pybind11::module* m) { + pybind11::class_(*m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { + [](imperative::Tracer& self, framework::BlockDesc* root_block) { new (&self) imperative::Tracer(root_block); }) .def("trace", - [](imperative::Tracer &self, imperative::OpBase *op, - const PyVarBaseMap &inputs, const PyVarBaseMap &outputs, + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + imperative::VarBasePtrMap* outputs, framework::AttributeMap attrs_map, const platform::CPUPlace expected_place, const bool stop_gradient = false) { - auto ins = ConvertToVarBasePtrMap(inputs); - auto outs = ConvertToVarBasePtrMap(outputs); - { - py::gil_scoped_release release; - self.Trace(op, std::move(ins), &outs, attrs_map, expected_place, - stop_gradient); - } + pybind11::gil_scoped_release release; + return self.Trace(op, inputs, outputs, attrs_map, expected_place, + stop_gradient); + }) + .def("trace", + [](imperative::Tracer& self, imperative::OpBase* op, + const imperative::VarBasePtrMap& inputs, + imperative::VarBasePtrMap* outputs, + framework::AttributeMap attrs_map, + const platform::CUDAPlace expected_place, + const bool stop_gradient = false) { + pybind11::gil_scoped_release release; + return self.Trace(op, inputs, outputs, attrs_map, expected_place, + stop_gradient); }) - .def("trace", [](imperative::Tracer &self, imperative::OpBase *op, - const PyVarBaseMap &inputs, const PyVarBaseMap &outputs, - framework::AttributeMap attrs_map, - const platform::CUDAPlace expected_place, - const bool stop_gradient = false) { - auto ins = ConvertToVarBasePtrMap(inputs); - auto outs = ConvertToVarBasePtrMap(outputs); - { - py::gil_scoped_release release; - self.Trace(op, std::move(ins), &outs, attrs_map, expected_place, - stop_gradient); - } - }); + .def("py_trace", &imperative::Tracer::PyTrace, + pybind11::return_value_policy::take_ownership); // define parallel context - py::class_ parallel_strategy( - m, "ParallelStrategy", ""); - parallel_strategy.def(py::init()) + pybind11::class_ parallel_strategy( + *m, "ParallelStrategy", ""); + parallel_strategy.def(pybind11::init()) .def_property( "nranks", - [](const imperative::ParallelStrategy &self) { return self.nranks_; }, - [](imperative::ParallelStrategy &self, int nranks) { + [](const imperative::ParallelStrategy& self) { return self.nranks_; }, + [](imperative::ParallelStrategy& self, int nranks) { self.nranks_ = nranks; }) .def_property("local_rank", - [](const imperative::ParallelStrategy &self) { + [](const imperative::ParallelStrategy& self) { return self.local_rank_; }, - [](imperative::ParallelStrategy &self, int local_rank) { + [](imperative::ParallelStrategy& self, int local_rank) { self.local_rank_ = local_rank; }) .def_property( "trainer_endpoints", - [](const imperative::ParallelStrategy &self) { + [](const imperative::ParallelStrategy& self) { return self.trainer_endpoints_; }, - [](imperative::ParallelStrategy &self, std::vector eps) { + [](imperative::ParallelStrategy& self, std::vector eps) { self.trainer_endpoints_ = eps; }) .def_property("current_endpoint", - [](const imperative::ParallelStrategy &self) { + [](const 
imperative::ParallelStrategy& self) { return self.current_endpoint_; }, - [](imperative::ParallelStrategy &self, - const std::string &ep) { self.current_endpoint_ = ep; }); + [](imperative::ParallelStrategy& self, + const std::string& ep) { self.current_endpoint_ = ep; }); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - py::class_ nccl_ctx(m, - "NCCLParallelContext"); + pybind11::class_ nccl_ctx( + *m, "NCCLParallelContext"); nccl_ctx - .def(py::init()) - .def("init", [](imperative::NCCLParallelContext &self) { self.Init(); }); + .def(pybind11::init()) + .def("init", [](imperative::NCCLParallelContext& self) { self.Init(); }); #endif } diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index cfe185bbfbb..f9d4a7c990e 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -24,6 +24,29 @@ limitations under the License. */ namespace paddle { namespace pybind { +class Layer : public imperative::Layer { + public: + using imperative::Layer::Layer; // Inherit constructors + + std::vector Forward( + const std::vector& inputs) override { + PYBIND11_OVERLOAD(std::vector, Layer, Forward, + inputs); // NOLINT + } +}; + +class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase { + public: + using imperative::OpBase::OpBase; // Inherit constructors + + PyOpBase(const std::string& name) : OpBase(name) {} +}; + +class PyVarBase : public imperative::VarBase { + public: + using imperative::VarBase::VarBase; // Inherit constructors +}; + void BindImperative(pybind11::module* m); } // namespace pybind diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 27f0e30d021..b650225c64a 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -17,9 +17,7 @@ #include #include #include -#include #include -#include #include #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -47,10 +45,6 @@ static void BindNativePredictor(py::module *m); static void BindAnalysisConfig(py::module *m); static void BindAnalysisPredictor(py::module *m); -#ifdef PADDLE_WITH_MKLDNN -static void BindMkldnnQuantizerConfig(py::module *m); -#endif - void BindInferenceApi(py::module *m) { BindPaddleDType(m); BindPaddleBuf(m); @@ -61,9 +55,7 @@ void BindInferenceApi(py::module *m) { BindNativePredictor(m); BindAnalysisConfig(m); BindAnalysisPredictor(m); -#ifdef PADDLE_WITH_MKLDNN - BindMkldnnQuantizerConfig(m); -#endif + m->def("create_paddle_predictor", &paddle::CreatePaddlePredictor); m->def("create_paddle_predictor", @@ -237,7 +229,7 @@ void BindAnalysisConfig(py::module *m) { py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, py::arg("min_subgraph_size") = 3, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, - py::arg("use_static") = true, py::arg("use_calib_mode") = false) + py::arg("use_static") = true) .def("enable_anakin_engine", &AnalysisConfig::EnableAnakinEngine, py::arg("max_batch_size") = 1, py::arg("max_input_shape") = @@ -257,11 +249,6 @@ void BindAnalysisConfig(py::module *m) { .def("cpu_math_library_num_threads", &AnalysisConfig::cpu_math_library_num_threads) .def("to_native_config", &AnalysisConfig::ToNativeConfig) - .def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer) -#ifdef PADDLE_WITH_MKLDNN - .def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config, - py::return_value_policy::reference) -#endif .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp) 
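The four `ParallelStrategy` properties bound here mirror the launcher's environment variables. A hedged sketch of how Python-side code might populate them, assuming the class is exported as `fluid.core.ParallelStrategy` (pybind11's STL casters accept a plain list for `trainer_endpoints`):

```python
import os
from paddle.fluid import core

strategy = core.ParallelStrategy()
strategy.nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
strategy.local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
strategy.trainer_endpoints = os.getenv(
    "PADDLE_TRAINER_ENDPOINTS", "127.0.0.1:6170").split(",")
strategy.current_endpoint = os.getenv(
    "PADDLE_CURRENT_ENDPOINT", "127.0.0.1:6170")
# On a CUDA build, core.NCCLParallelContext(strategy, place).init()
# would then set up the communicator, per the binding above.
```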
.def("set_model_buffer", &AnalysisConfig::SetModelBuffer) .def("model_from_memory", &AnalysisConfig::model_from_memory) @@ -269,28 +256,6 @@ void BindAnalysisConfig(py::module *m) { py::return_value_policy::reference); } -#ifdef PADDLE_WITH_MKLDNN -void BindMkldnnQuantizerConfig(py::module *m) { - py::class_ quantizer_config(*m, - "MkldnnQuantizerConfig"); - quantizer_config.def(py::init()) - .def(py::init<>()) - .def("set_quant_data", - [](MkldnnQuantizerConfig &self, - const std::vector &data) { - auto warmup_data = - std::make_shared>(data); - self.SetWarmupData(warmup_data); - return; - }) - .def("set_quant_batch_size", &MkldnnQuantizerConfig::SetWarmupBatchSize) - .def( - "set_enabled_op_types", - (void (MkldnnQuantizerConfig::*)(std::unordered_set &)) & - MkldnnQuantizerConfig::SetEnabledOpTypes); -} -#endif - void BindAnalysisPredictor(py::module *m) { py::class_(*m, "AnalysisPredictor") .def(py::init()) @@ -307,9 +272,7 @@ void BindAnalysisPredictor(py::module *m) { .def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun) .def("clone", &AnalysisPredictor::Clone) .def("scope", &AnalysisPredictor::scope, - py::return_value_policy::reference) - .def("SaveOptimModel", &AnalysisPredictor::SaveOptimModel, - py::arg("dir")); + py::return_value_policy::reference); } } // namespace pybind diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index abc10765e4a..798e488f5b0 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -24,7 +24,6 @@ #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_desc.h" #include "pybind11/stl.h" @@ -38,7 +37,6 @@ using paddle::framework::ir::TopologySortOperations; using paddle::framework::ir::BuildOperationAdjList; using paddle::framework::OpDesc; using paddle::framework::ProgramDesc; -using paddle::framework::Scope; using paddle::framework::VarDesc; using pybind11::return_value_policy; @@ -59,15 +57,12 @@ void BindGraph(py::module *m) { .def(py::init()) .def("clone", &Graph::Clone) .def("has", &Graph::Has) - .def("get_bool", &Graph::Get) .def("get_int", &Graph::Get) .def("get_float", &Graph::Get) .def("get_double", &Graph::Get) .def("get_string", &Graph::Get) .def("get_marked_nodes", &Graph::Get>, return_value_policy::reference) - .def("set", [](Graph &self, const std::string &attr_name, - bool attr) { return self.Set(attr_name, new bool(attr)); }) .def("set", [](Graph &self, const std::string &attr_name, int attr) { return self.Set(attr_name, new int(attr)); }) .def("set", @@ -95,10 +90,6 @@ void BindGraph(py::module *m) { return self.Set(attr_name, new std::unordered_set(attr)); }) - .def("set_not_owned", - [](Graph &self, const std::string &attr_name, Scope &attr) { - self.SetNotOwned(attr_name, &attr); - }) .def("erase", &Graph::Erase) .def("nodes", &Graph::Nodes, return_value_policy::reference) .def("create_var_node", diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f6096fb8ca4..fa8cee26b19 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include // NOLINT // for call_once #include #include -#include #include #include @@ -39,12 +38,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope_pool.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/profiler.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" -#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/init.h" @@ -54,11 +54,11 @@ limitations under the License. */ #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/data_set_py.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/executor_lite.h" #include "paddle/fluid/pybind/fleet_wrapper_py.h" #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/inference_api.h" #include "paddle/fluid/pybind/ir.h" - #ifndef _WIN32 #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif @@ -77,10 +77,6 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #endif -#ifdef PADDLE_WITH_DISTRIBUTE -#include "paddle/fluid/pybind/communicator_py.h" -#endif - #include "pybind11/stl.h" DEFINE_bool(reader_queue_speed_test_mode, false, @@ -146,12 +142,7 @@ static inline int PlaceIndex(const PlaceType &p) { return static_cast(paddle::platform::Place(p).which()); } -#ifdef PADDLE_WITH_AVX -PYBIND11_MODULE(core_avx, m) { -#else -PYBIND11_MODULE(core_noavx, m) { -#endif - +PYBIND11_MODULE(core, m) { // Not used, just make sure cpu_info.cc is linked. paddle::platform::CpuTotalPhysicalMemory(); @@ -165,8 +156,6 @@ PYBIND11_MODULE(core_noavx, m) { BindException(&m); - m.def("set_num_threads", &platform::SetNumThreads); - m.def( "_append_python_callable_object_and_return_id", [](py::object py_obj) -> size_t { @@ -196,6 +185,121 @@ PYBIND11_MODULE(core_noavx, m) { m.def("print_mem_usage", []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); }); + m.def("start_imperative_gperf_profiler", + []() { imperative::StartProfile(); }); + + m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); }); + + py::class_(m, "VarBase", R"DOC()DOC") + .def( + py::init, const paddle::platform::CPUPlace, + bool, bool>()) + .def( + py::init, + const paddle::platform::CUDAPlace, bool, bool>()) + .def("_run_backward", + [](imperative::VarBase &self) { self.RunBackward(); }) + .def("_grad_name", &imperative::VarBase::GradName) + .def("_grad_value", &imperative::VarBase::GradValue) + .def("_clear_gradient", &imperative::VarBase::ClearGradient) + .def("_grad_ivar", + [](const imperative::VarBase &self) { return self.grads_; }, + py::return_value_policy::reference) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CPUPlace &place, + bool blocking) { + std::unique_ptr new_var = + self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) + .def("_copy_to", + [](const imperative::VarBase &self, const platform::CUDAPlace &place, + bool blocking) { + std::unique_ptr new_var = + self.NewVarBase(place, blocking); + return new_var.release(); + }, + py::return_value_policy::take_ownership) + .def("value", [](const imperative::VarBase &self) { return self.var_; }, + py::return_value_policy::reference) + .def_property("name", 
&imperative::VarBase::Name, + &imperative::VarBase::SetName) + .def_property_readonly("shape", &imperative::VarBase::Shape) + .def_property_readonly("dtype", &imperative::VarBase::DataType) + .def_property("persistable", &imperative::VarBase::IsPersistable, + &imperative::VarBase::SetPersistable) + .def_property("stop_gradient", &imperative::VarBase::IsStopGradient, + &imperative::VarBase::SetStopGradient); + + py::class_(m, "OpBase", R"DOC()DOC") + .def(py::init()) + .def("register_backward_hooks", + [](imperative::OpBase &self, const py::object &callable, + bool front = false) { + self.RegisterBackwardHooks(callable, front); + }, + py::arg("callable"), py::arg("front") = false) + .def_property("_trace_id", + [](const imperative::OpBase &self) { + pybind11::gil_scoped_release release; + return self.trace_id_; + }, + [](imperative::OpBase &self, int trace_id) { + pybind11::gil_scoped_release release; + self.trace_id_ = trace_id; + }, + py::return_value_policy::reference) + .def_property( + "forward_id", + [](const imperative::OpBase &self) { return self.forward_id_; }, + [](imperative::OpBase &self, int forward_id) { + self.forward_id_ = forward_id; + }, + py::return_value_policy::reference) + .def_property_readonly("type", &imperative::OpBase::Type) + .def_property( + "backward_id", + [](const imperative::OpBase &self) { return self.backward_id_; }, + [](imperative::OpBase &self, int backward_id) { + self.backward_id_ = backward_id; + }, + py::return_value_policy::reference); + + py::class_ layer(m, "Layer"); + layer.def(py::init<>()) + .def("forward", [](imperative::Layer &self, + const std::vector &inputs) { + return self.Forward(inputs); + }); + + py::class_(m, "PyLayer") + .def(py::init<>()) + .def_static( + "apply", + [](int func_id, const std::vector &inputs) + -> std::vector { + auto ret_vars = imperative::PyLayer::Apply(func_id, inputs); + std::vector outputs; + outputs.reserve(ret_vars.size()); + for (size_t i = 0U; i != ret_vars.size(); ++i) { + framework::Variable *v = ret_vars[i]; + // TODO(minqiyang): use unique_name generator to set a name + outputs.emplace_back( + new imperative::VarBase("", v, nullptr, true)); + } + + return outputs; + }, + py::return_value_policy::take_ownership) + .def_static("register_func", + [](int func_id, const py::object &callable) { + imperative::PyLayer::RegisterFunc(func_id, callable); + }) + .def_static("num_funcs", &imperative::PyLayer::NumFuncs); + BindImperative(&m); py::class_(m, "Tensor", py::buffer_protocol()) @@ -236,7 +340,6 @@ PYBIND11_MODULE(core_noavx, m) { [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { self.mutable_data(place); }) - .def("_clear", &Tensor::clear) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) @@ -264,18 +367,14 @@ PYBIND11_MODULE(core_noavx, m) { .def("set", PyCUDAPinnedTensorSetFromArray) #endif .def("shape", [](Tensor &self) { return vectorize(self.dims()); }) + .def("memory_size", [](Tensor &self) { return self.memory_size(); }) .def("_set_float_element", TensorSetElement) .def("_get_float_element", TensorGetElement) .def("_set_double_element", TensorSetElement) .def("_get_double_element", TensorGetElement) .def("_place", [](Tensor &self) { return self.place(); }) .def("_dtype", [](Tensor &self) { return self.type(); }) - .def("__getitem__", PySliceTensor, py::return_value_policy::reference) - .def("__str__", [](const Tensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); - }); + .def("__getitem__", 
PySliceTensor, py::return_value_policy::reference); py::class_(m, "LoDTensor", R"DOC( LoDTensor is a Tensor with optional LoD information. @@ -286,8 +385,8 @@ PYBIND11_MODULE(core_noavx, m) { LoD is short for Level of Details and is usually used for varied sequence length. You can skip the following comment if you don't need optional LoD. - For example, a LoDTensor X can look like the example below. It contains - 2 sequences. The first has length 2 and the second has length 3, as + For example, a LoDTensor X can look like the example below. It contains + 2 sequences. The first has length 2 and the second has length 3, as described by x.lod. The first tensor dimension 5=2+3 is calculated from LoD if it's available. @@ -295,7 +394,7 @@ PYBIND11_MODULE(core_noavx, m) { columns, hence [5, 2]. x.lod = [[2, 3]] - + x.data = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]] x.shape = [5, 2] @@ -494,12 +593,7 @@ PYBIND11_MODULE(core_noavx, m) { Returns: out (Tensor): new Tensor(NOT LoDTensor). - )DOC") - .def("__str__", [](const LoDTensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); - }); + )DOC"); py::class_(m, "SelectedRows") .def("__init__", @@ -921,38 +1015,10 @@ All parameter, weight, gradient are variables in Paddle. [](const OperatorBase &op) { return op.OutputVars(false); }) .def("support_gpu", &OperatorBase::SupportGPU); - py::class_(m, "ExecutorPrepareContext") - .def(py::init()); - py::class_(m, "Executor") .def(py::init()) .def("close", &Executor::Close) - .def("run_from_dataset", &Executor::RunFromDataset, - py::call_guard()) - .def("run_prepared_ctx", - [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, - std::map *feed_targets, - std::map *fetch_targets, - bool create_local_scope = true, bool create_vars = true, - const std::string &feed_holder_name = "feed", - const std::string &fetch_holder_name = "fetch") { - pybind11::gil_scoped_release release; - self.RunPreparedContext(ctx, scope, feed_targets, fetch_targets, - create_local_scope, create_vars, - feed_holder_name, fetch_holder_name); - }) - .def("run_cached_prepared_ctx", - [](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, - bool create_local_scope = true, bool create_vars = true, - bool keep_kids = false) { - pybind11::gil_scoped_release release; - self.RunPreparedContext(ctx, scope, create_local_scope, - create_vars, keep_kids); - }) - .def("prepare_ctx_cache", &Executor::PrepareCtxCache, - py::call_guard()) - .def("create_variables", &Executor::CreateVariables, - py::call_guard()) + .def("run_from_dataset", &Executor::RunFromDataset) .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, int block_id, bool create_local_scope, bool create_vars, const std::vector &fetch_vars) { @@ -1005,7 +1071,7 @@ All parameter, weight, gradient are variables in Paddle. Examples: .. code-block:: python - + import paddle.fluid as fluid arr = fluid.LoDTensorArray() @@ -1127,23 +1193,15 @@ All parameter, weight, gradient are variables in Paddle. Examples: .. 
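The docstring's example tensor can be built directly from Python; a short sketch constructing exactly the `x` described above (length-based LoD `[[2, 3]]` over a `[5, 2]` tensor):

```python
import numpy as np
import paddle.fluid as fluid

t = fluid.LoDTensor()
t.set(np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]],
               dtype='float32'), fluid.CPUPlace())
t.set_recursive_sequence_lengths([[2, 3]])  # two sequences: 2 rows + 3 rows
print(t.recursive_sequence_lengths())       # [[2, 3]]
print(np.array(t).shape)                    # (5, 2)
```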
code-block:: python - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_loss = fluid.layers.mean(cost) - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_loss) - exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = 4 - train_exe = fluid.ParallelExecutor(use_cuda=False, - loss_name=avg_loss.name, + train_exe = fluid.ParallelExecutor(use_cuda=True, + loss_name=loss.name, exec_strategy=exec_strategy) + train_loss, = train_exe.run([loss.name], feed=feed_dict) + )DOC"); exec_strategy.def(py::init()) @@ -1179,8 +1237,7 @@ All parameter, weight, gradient are variables in Paddle. }, R"DOC(The type is BOOL, allow_op_delay represents whether to delay the communication operators to run, it may make the execution faster. - Note that this option is invalid now, and it will be removed in - next version. Default False.)DOC") + Note that in some models, allow_op_delay may cause the program to hang. Default False.)DOC") .def_property( "num_iteration_per_drop_scope", [](const ExecutionStrategy &self) { @@ -1192,8 +1249,7 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is INT, num_iteration_per_drop_scope indicates how many iterations to clean up the temp variables which is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. - Default 1. + because the temp variable's shape may be the same between two iterations. Default 100. NOTES: 1. If you fetch data when calling the 'run', the ParallelExecutor @@ -1235,9 +1291,14 @@ All parameter, weight, gradient are variables in Paddle. Examples: .. code-block:: python - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + build_strategy = fluid.BuildStrategy() + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + + train_exe = fluid.ParallelExecutor(use_cuda=True, + loss_name=loss.name, + build_strategy=build_strategy) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) )DOC"); py::enum_(build_strategy, "ReduceStrategy") @@ -1259,19 +1320,11 @@ All parameter, weight, gradient are variables in Paddle. self.reduce_ = strategy; }, R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor, - 'AllReduce' and 'Reduce'. If you want that all the parameters' - optimization are done on all devices independently, you should choose 'AllReduce'; - if you choose 'Reduce', all the parameters' optimization will be evenly distributed - to different devices, and then broadcast the optimized parameter to other devices. - In some models, `Reduce` is faster. Default 'AllReduce'. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - )DOC") + 'AllReduce' and 'Reduce'. If you want all the parameters to be optimized + on every device independently, you should choose 'AllReduce'; + if you choose 'Reduce', the parameters' optimization will be evenly distributed + across the devices, and each optimized parameter will then be broadcast to the other devices. + In some models, `Reduce` is faster. Default 'AllReduce'.
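The trimmed `ExecutionStrategy` docstring example above now references `loss` and `feed_dict` without defining them; for completeness, a self-contained variant (CPU build assumed, feed/fetch loop omitted):

```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(
    fluid.layers.square_error_cost(input=y_predict, label=y))
fluid.optimizer.SGD(learning_rate=0.001).minimize(loss)

place = fluid.CPUPlace()
fluid.Executor(place).run(fluid.default_startup_program())

exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 4
exec_strategy.num_iteration_per_drop_scope = 100
train_exe = fluid.ParallelExecutor(use_cuda=False,
                                   loss_name=loss.name,
                                   exec_strategy=exec_strategy)
```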
)DOC") .def_property( "gradient_scale_strategy", [](const BuildStrategy &self) { return self.gradient_scale_; }, @@ -1281,18 +1334,10 @@ All parameter, weight, gradient are variables in Paddle. self.gradient_scale_ = strategy; }, R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in - ParallelExecutor, 'CoeffNumDevice', 'One' and 'Customized'. By default, - ParallelExecutor sets the :math:`loss@grad` according to the number of devices. - If you want to customize :math:`loss@grad`, you can choose 'Customized'. - Default 'CoeffNumDevice'. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.gradient_scale_strategy = True - )DOC") + ParallelExecutor, 'CoeffNumDevice', 'One' and 'Customized'. By default, + ParallelExecutor sets the :math:`loss@grad` according to the number of devices. + If you want to customize :math:`loss@grad`, you can choose 'Customized'. + Default 'CoeffNumDevice'.)DOC") .def_property( "debug_graphviz_path", [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, @@ -1301,16 +1346,8 @@ All parameter, weight, gradient are variables in Paddle. self.debug_graphviz_path_ = path; }, R"DOC(The type is STR, debug_graphviz_path indicate the path that - writing the SSA Graph to file in the form of graphviz. - It is useful for debugging. Default "" - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.debug_graphviz_path = "" - )DOC") + writing the SSA Graph to file in the form of graphviz, you. + It is useful for debugging. Default "")DOC") .def_property( "enable_sequential_execution", [](const BuildStrategy &self) { @@ -1320,15 +1357,7 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.enable_sequential_execution_ = b; }, - R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.enable_sequential_execution = True - )DOC") + R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False.)DOC") .def_property( "remove_unnecessary_lock", [](const BuildStrategy &self) { @@ -1338,22 +1367,11 @@ All parameter, weight, gradient are variables in Paddle. PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); self.remove_unnecessary_lock_ = b; }, - R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default True. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.remove_unnecessary_lock = True - )DOC") + R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default True.)DOC") .def_property( "num_trainers", [](const BuildStrategy &self) { return self.num_trainers_; }, [](BuildStrategy &self, int num_trainers) { -#ifdef WIN32 - PADDLE_THROW("Windows has NO support to distribute mode."); -#endif self.num_trainers_ = num_trainers; }) .def_property( @@ -1368,34 +1386,6 @@ All parameter, weight, gradient are variables in Paddle. 
[](BuildStrategy &self, int trainer_id) { self.trainer_id_ = trainer_id; }) - .def_property( - "nccl_comm_num", - [](const BuildStrategy &self) { return self.nccl_comm_num_; }, - [](BuildStrategy &self, int nccl_comm_num) { - self.nccl_comm_num_ = nccl_comm_num; - }) - .def_property("use_hierarchical_allreduce_", - [](const BuildStrategy &self) { - return self.use_hierarchical_allreduce_; - }, - [](BuildStrategy &self, bool use) { - self.use_hierarchical_allreduce_ = use; - }) - .def_property("hierarchical_allreduce_inter_nranks_", - [](const BuildStrategy &self) { - return self.hierarchical_allreduce_inter_nranks_; - }, - [](BuildStrategy &self, int nranks) { - self.hierarchical_allreduce_inter_nranks_ = nranks; - }) - .def_property("hierarchical_allreduce_exter_nranks_", - [](const BuildStrategy &self) { - return self.hierarchical_allreduce_exter_nranks_; - }, - [](BuildStrategy &self, int nranks) { - self.hierarchical_allreduce_exter_nranks_ = nranks; - }) - .def_property( "fuse_elewise_add_act_ops", [](const BuildStrategy &self) { @@ -1406,16 +1396,8 @@ All parameter, weight, gradient are variables in Paddle. self.fuse_elewise_add_act_ops_ = b; }, R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether - to fuse elementwise_add_op and activation_op, - it may make the execution faster. Default False - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.fuse_elewise_add_act_ops = True - )DOC") + to fuse elementwise_add_op and activation_op, + it may make the execution faster. Default False)DOC") .def_property( "fuse_relu_depthwise_conv", [](const BuildStrategy &self) { @@ -1426,18 +1408,10 @@ All parameter, weight, gradient are variables in Paddle. self.fuse_relu_depthwise_conv_ = b; }, R"DOC(The type is BOOL, fuse_relu_depthwise_conv indicate whether - to fuse relu and depthwise_conv2d, - it will save GPU memory and may make the execution faster. - This options is only available in GPU devices. - Default False. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.fuse_relu_depthwise_conv = True - )DOC") + to fuse relu and depthwise_conv2d, + it will save GPU memory and may make the execution faster. + This options is only available in GPU devices. + Default False.)DOC") .def_property( "fuse_broadcast_ops", [](const BuildStrategy &self) { return self.fuse_broadcast_ops_; }, @@ -1474,40 +1448,24 @@ All parameter, weight, gradient are variables in Paddle. Current implementation doesn't support FP16 training and CPU. And only synchronous on one machine, not all machines. - Default False - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - build_strategy = fluid.BuildStrategy() - build_strategy.sync_batch_norm = True - )DOC") + Default False)DOC") .def_property( "memory_optimize", [](const BuildStrategy &self) { return self.memory_optimize_; }, [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }, - R"DOC(The type is BOOL, memory opitimize aims to save total memory + R"DOC(The type is BOOL, memory opitimize aims to save total memory consumption, set to True to enable it. - - Memory Optimize is our experimental feature, some variables + + Memory Optimize is our experimental feature, some variables may be reused/removed by optimize strategy. If you need to fetch some variable values when using this feature, please set the persistable property of the variables to True. 
- + Default False)DOC") .def_property( "is_distribution", [](const BuildStrategy &self) { return self.is_distribution_; }, - [](BuildStrategy &self, bool b) { -#ifdef WIN32 - if (b) { - PADDLE_THROW("Windows has NO support to distribute mode."); - } -#else - self.is_distribution_ = b; -#endif - }) + [](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) .def_property("async_mode", [](const BuildStrategy &self) { return self.async_mode_; }, [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) @@ -1519,26 +1477,14 @@ All parameter, weight, gradient are variables in Paddle. "fuse_all_reduce_ops", [](const BuildStrategy &self) { return self.fuse_all_reduce_ops_; }, [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) - .def_property("enable_backward_optimizer_op_deps", - [](const BuildStrategy &self) { - return self.enable_backward_optimizer_op_deps_; - }, - [](BuildStrategy &self, bool b) { - self.enable_backward_optimizer_op_deps_ = b; - }) .def_property( "cache_runtime_context", [](const BuildStrategy &self) { return self.cache_runtime_context_; }, [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) .def_property( - "mkldnn_enabled_op_types", - [](const BuildStrategy &self) { - return self.mkldnn_enabled_op_types_; - }, - [](BuildStrategy &self, - const std::unordered_set &mkldnn_enabled_op_types) { - self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types; - }) + "cache_expected_kernel", + [](const BuildStrategy &self) { return self.cache_expected_kernel_; }, + [](BuildStrategy &self, bool b) { self.cache_expected_kernel_ = b; }) .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); @@ -1584,9 +1530,9 @@ All parameter, weight, gradient are variables in Paddle. BindNode(&m); BindInferenceApi(&m); BindDataset(&m); -#ifdef PADDLE_WITH_DISTRIBUTE - BindCommunicator(&m); -#endif + + py::module lite = m.def_submodule("lite", "submodule lite"); + BindLite(&lite); } } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 4c304e8626b..af7d30552ed 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -31,7 +31,6 @@ class MultiDeviceFeedReader { public: using ResultDictList = std::vector>; - using ResultList = std::vector>; MultiDeviceFeedReader( const std::shared_ptr &queue, @@ -82,21 +81,6 @@ class MultiDeviceFeedReader { return result; } - ResultList ReadNextList() { - bool success = WaitFutures(); - if (!success) { - return {}; - } - - ResultList result; - result.reserve(ret_.size()); - for (size_t i = 0; i < ret_.size(); ++i) { - result.emplace_back(std::move(ret_[i])); - } - ReadAsync(); - return result; - } - void Reset() { Shutdown(); Start(); @@ -158,8 +142,6 @@ void BindReader(py::module *module) { py::class_(m, "MultiDeviceFeedReader", "") .def("read_next", &MultiDeviceFeedReader::ReadNext, py::call_guard()) - .def("read_next_list", &MultiDeviceFeedReader::ReadNextList, - py::call_guard()) .def("reset", &MultiDeviceFeedReader::Reset, py::call_guard()); diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 66b768665b6..16bb3771f2e 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -105,12 +105,14 @@ void Printf(const char* fmt, const Args&... 
args) { Fprintf(std::cout, fmt, args...); } -inline std::string HumanReadableSize(double f_size) { +template <typename T> +std::string HumanReadableSize(T size) { size_t i = 0; + double f_size = static_cast<double>(size); double orig = f_size; const std::vector<std::string> units( {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}); - while (f_size >= 1024) { + while (f_size > 1024) { f_size /= 1024; i++; } diff --git a/paddle/scripts/README.md b/paddle/scripts/README.md index 39db5a601d3..1db262f06d9 100644 --- a/paddle/scripts/README.md +++ b/paddle/scripts/README.md @@ -108,14 +108,6 @@ RUN pip install /paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl && rm -f /*.wh Then build the image by running `docker build -t [REPO]/paddle:[TAG] .` under the directory containing your own `Dockerfile`. -We also release a script and Dockerfile for building PaddlePaddle docker images -across different cuda versions. To build these docker images, run: - -```bash -bash ./build_docker_images.sh -docker build -t [REPO]/paddle:tag -f [generated_docker_file] . -``` - - NOTE: note that you can choose different base images for your environment, you can find all the versions [here](https://hub.docker.com/r/nvidia/cuda/). ### Use Docker Images diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e5e1ef6c25e..f1630e0b509 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -52,7 +52,10 @@ function init() { fi } -function cmake_base() { +function cmake_gen() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build + # build script will not fail if *.deb does not exist rm *.deb 2>/dev/null || true # delete previous built whl packages @@ -224,7 +227,6 @@ EOF -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_NGRAPH=${WITH_NGRAPH:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \ - -DNOAVX_CORE_FILE=${NOAVX_CORE_FILE:-""} \ -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ -DCUDA_ARCH_BIN=${CUDA_ARCH_BIN} \ @@ -247,12 +249,6 @@ EOF } -function cmake_gen() { - mkdir -p ${PADDLE_ROOT}/build - cd ${PADDLE_ROOT}/build - cmake_base $1 -} - function abort(){ echo "Your change doesn't follow PaddlePaddle's code style." 1>&2 echo "Please use pre-commit to check what is wrong." 1>&2 @@ -291,17 +287,6 @@ function check_style() { # Build #================================================= -function build_base() { - parallel_number=`nproc` - if [[ "$1" != "" ]]; then - parallel_number=$1 - fi - make clean - make -j ${parallel_number} - make install -j `nproc` -} - - function build() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build @@ -310,7 +295,13 @@ function build() { Building in /paddle/build ...
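The `HumanReadableSize` change above also moves the loop condition from `>=` to `>`, so an input of exactly 1024 now stays in the current unit. A Python analogue for illustration only (formatting is approximate; the bounds check on `i` is added here for safety and is not in the C++ helper):

```python
def human_readable_size(size):
    units = ["B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
    f_size, i = float(size), 0
    while f_size > 1024 and i < len(units) - 1:  # '>' keeps 1024 as "1024B"
        f_size /= 1024
        i += 1
    return "%g%s" % (f_size, units[i])

print(human_readable_size(1024))     # "1024B" under '>', "1kB" under '>='
print(human_readable_size(10 ** 9))  # "953.674MB"
```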
============================================ EOF - build_base $1 + parallel_number=`nproc` + if [[ "$1" != "" ]]; then + parallel_number=$1 + fi + make clean + make -j ${parallel_number} + make install -j `nproc` } function build_mac() { @@ -343,25 +334,6 @@ EOF fi } - -function combine_avx_noavx_build() { - mkdir -p ${PADDLE_ROOT}/build.noavx - cd ${PADDLE_ROOT}/build.noavx - WITH_AVX=OFF - cmake_base ${PYTHON_ABI:-""} - build_base - - # build combined one - mkdir -p ${PADDLE_ROOT}/build - cd ${PADDLE_ROOT}/build - NOAVX_CORE_FILE=`find ${PADDLE_ROOT}/build.noavx/python/paddle/fluid/ -name "core_noavx.*"` - WITH_AVX=ON - - cmake_base ${PYTHON_ABI:-""} - build_base -} - - function run_brpc_test() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build @@ -418,19 +390,6 @@ EOF ctest --output-on-failure -j $2 # make install should also be test when unittest make install -j 8 - - set +ex - if [ "$1" == "cp27-cp27m" ]; then - pip uninstall -y paddlepaddle - elif [ "$1" == "cp35-cp35m" ]; then - pip3.5 uninstall -y paddlepaddle - elif [ "$1" == "cp36-cp36m" ]; then - pip3.6 uninstall -y paddlepaddle - elif [ "$1" == "cp37-cp37m" ]; then - pip3.7 uninstall -y paddlepaddle - fi - set -ex - if [ "$1" == "cp27-cp27m" ]; then set -e pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl @@ -444,6 +403,16 @@ EOF fi paddle version + + if [ "$1" == "cp27-cp27m" ]; then + pip uninstall -y paddlepaddle + elif [ "$1" == "cp35-cp35m" ]; then + pip3.5 uninstall -y paddlepaddle + elif [ "$1" == "cp36-cp36m" ]; then + pip3.6 uninstall -y paddlepaddle + elif [ "$1" == "cp37-cp37m" ]; then + pip3.7 uninstall -y paddlepaddle + fi fi } @@ -496,7 +465,6 @@ function assert_api_spec_approvals() { "paddle/fluid/framework/ir/node.h" "paddle/fluid/framework/ir/graph.h" "paddle/fluid/framework/framework.proto" - "python/requirements.txt" "python/paddle/fluid/compiler.py" "python/paddle/fluid/__init__.py" "paddle/fluid/operators/distributed/send_recv.proto.in") @@ -505,34 +473,34 @@ function assert_api_spec_approvals() { echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. - # approval_user_list: XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,xsrobin 50069408,qingqing01 7845005,junjun315 3124479. - approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` + # approval_user_list: velconia 1979255,XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,typhoonzero 13348433,shanyi15 35982308. 
if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then - APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 50069408 46782768 30176695 6836917 7845005` + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 35982308 46782768 30176695` if [ "${APPROVALS}" == "TRUE" ];then - APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 50069408` + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308` fi elif [ "${API_FILE}" == "CMakeLists.txt" ];then - APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 46782768 30176695` + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 46782768 30176695` elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then - APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 50069408` - elif [ "${API_FILE}" == "python/requirements.txt" ];then - APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 3124479 6836917` + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308` else - APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 21351065 3048612 46782768 30176695 12538138 6836917 32832641` + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641` fi echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then - echo "You must have one RD (chengduoZH or XiaoguangHu01 or qingqing01 or luotao1) and one PM (xsrobin) approval for the api change! ${API_FILE} for the management reason of API interface and API document." + echo "You must have one RD (chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE} for the management reason of API interface and API document." elif [ "${API_FILE}" == "CMakeLists.txt" ];then echo "You must have one RD (luotao1 or chengduoZH or XiaoguangHu01) approval for the cmakelist change! ${API_FILE} for the management reason of the Compilation parameter." - elif [ "${API_FILE}" == "python/requirements.txt" ];then - echo "You must have one RD (junjun315 or luotao1) approval for the python/requirements.txt change! ${API_FILE} for the management reason of the Compilation parameter." elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then - echo "You must have xsrobin approval for the python/paddle/fluid/__init__.py change! ${API_FILE} for the management reason of the environment variables." + echo "You must have shanyi15 approval for the python/paddle/fluid/__init__.py change! ${API_FILE} for the management reason of the environment variables." 
else - echo "You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao) approval for the api change! ${API_FILE} for the management reason of the underlying code for fluid." + echo "You must have one RD (velconia,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE} for the management reason of the underlying code for fluid." fi exit 1 fi @@ -542,10 +510,10 @@ function assert_api_spec_approvals() { HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true` if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ - python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 21351065 3048612 46782768 30176695 12538138 6836917 32832641` + python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then - echo "You must have one RD (XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao) approval for the api change! ${API_FILE} for the avoidance of the bad C++ code habits." + echo "You must have one RD (velconia,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE} for the avoidance of the bad C++ code habits." exit 1 fi fi @@ -666,7 +634,9 @@ function card_test() { set +m } -function parallel_test_base() { +function parallel_test() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build if [ ${WITH_TESTING:-ON} == "ON" ] ; then cat < ${FLUID_CORE} DEPENDS paddle_pybind) -add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE_DEPS}) +add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE}) IF(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py index 969ad3c922f..5728a37fc33 100644 --- a/python/paddle/dataset/flowers.py +++ b/python/paddle/dataset/flowers.py @@ -117,28 +117,26 @@ def reader_creator(data_file, def reader(): while True: - with open(file_list, 'r') as f_list: - for file in f_list: - file = file.strip() - batch = None - with open(file, 'rb') as f: - if six.PY2: - batch = pickle.load(f) - else: - batch = pickle.load(f, encoding='bytes') - - if six.PY3: - batch = cpt.to_text(batch) - data_batch = batch['data'] - labels_batch = batch['label'] - for sample, label in six.moves.zip(data_batch, - labels_batch): - yield sample, int(label) - 1 + for file in open(file_list): + file = file.strip() + batch = None + with open(file, 'rb') as f: + if six.PY2: + batch = pickle.load(f) + else: + batch = pickle.load(f, encoding='bytes') + if six.PY3: + batch = cpt.to_text(batch) + data = batch['data'] + labels = batch['label'] + for sample, label in six.moves.zip(data, batch['label']): + yield sample, int(label) - 1 if not cycle: break if use_xmap: - return xmap_readers(mapper, reader, min(4, cpu_count()), buffered_size) + cpu_num = int(os.environ.get('CPU_NUM', cpu_count())) + return xmap_readers(mapper, reader, cpu_num, buffered_size) else: return map_readers(mapper, reader) diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 8dae48fae18..847ca187206 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ 
-78,10 +78,7 @@ def reader_creator(image_filename, label_filename, buffer_size): buffer_size, rows * cols)).astype('float32') offset_img += struct.calcsize(fmt_images) - images = images / 255.0 - images = images * 2.0 - images = images - 1.0 - + images = images / 255.0 * 2.0 - 1.0 for i in range(buffer_size): yield images[i, :], int(labels[i]) @@ -93,7 +90,7 @@ def train(): MNIST training set creator. It returns a reader creator, each sample in the reader is image pixels in - [-1, 1] and label in [0, 9]. + [-1, 1] and label in [0, 9]. :return: Training reader creator :rtype: callable @@ -110,7 +107,7 @@ def test(): MNIST test set creator. It returns a reader creator, each sample in the reader is image pixels in - [-1, 1] and label in [0, 9]. + [-1, 1] and label in [0, 9]. :return: Test reader creator. :rtype: callable diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py index 770efe03a80..1052d24c57b 100644 --- a/python/paddle/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -48,7 +48,8 @@ __all__ = [ "get_dict", ] -DATA_URL = ("http://paddlemodels.bj.bcebos.com/wmt/wmt16.tar.gz") +DATA_URL = ("http://cloud.dlnel.org/filepub/" + "?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed") DATA_MD5 = "0c38be43600334966403524a40dcd81e" TOTAL_EN_WORDS = 11250 diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index 06369ea6b70..d8153fa0026 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,58 +11,87 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" -paddle.distributed.launch is a module that spawns multiple distributed -process on each trainning node for gpu trainning. - -Usage: - In both of single node training or multiple node training, this module -launch a process on each of the given gpu card. - - 1. for single node trainning with all visible gpu cards: - python -m paddle.distributed.launch \ - your_training_py (arg1 arg2 and all others) - - 2. for single node trainning with [0,4) cards - python -m paddle.distributed.launch --selected_gpus="0,1,2,3" \ - your_training_py (arg1 arg2 and all others) - - 3.
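The fused normalization above is algebraically identical to the three-statement version it replaces, and it still maps raw pixels into [-1, 1] (which is why the reader docstrings are kept at [-1, 1] here). A quick check:

```python
import numpy as np

pixels = np.array([0.0, 127.5, 255.0], dtype='float32')
scaled = pixels / 255.0 * 2.0 - 1.0   # same arithmetic as the fused line
print(scaled)                          # [-1.  0.  1.]
```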
for mulitple node training such as two node:192.168.0.16, 192.168.0.17 - on 192.168.0.16: - python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \ - --node_ip=192.168.0.16 \ - your_training_py (arg1 arg2 and all others) - - on 192.168.0.17: - python -m paddle.distributed.launch --cluster_node_ips="192.168.0.16,192.168.0.17" \ - --node_ip=192.168.0.17 \ - your_training_py (arg1 arg2 and all others) -""" from __future__ import print_function -import sys -from sys import version + import subprocess import os -import six -import copy -from argparse import ArgumentParser, REMAINDER -import paddle.fluid as fluid +import sys +import time +import argparse + +default_envs = { + "PADDLE_TRAINER_ENDPOINTS": + "127.0.0.1:6170,127.0.0.1:6171,127.0.0.1:6172,127.0.0.1:6173,127.0.0.1:6174,127.0.0.1:6175,127.0.0.1:6176,127.0.0.1:6177", + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "PATH": os.getenv("PATH"), + "LD_PRELOAD": os.getenv("LD_PRELOAD", ""), + "PADDLE_TRAINERS_NUM": "8", + "NCCL_DEBUG": "INFO", + "GLOG_v": "0", + "NCCL_SOCKET_IFNAME": "eth0", + "NCCL_IB_GID_INDEX": "3", + "NCCL_IB_RETRY_CNT": "0", + "PYTHONPATH": os.getenv("PYTHONPATH", ""), +} + +GPUS = 8 + + +def start_procs(gpus, entrypoint, entrypoint_args, log_dir): + procs = [] + log_fns = [] + os.system("mkdir -p %s" % log_dir) + # ======== update parent envs ======= + for k, v in os.environ.items(): + if k.startswith("FLAGS_") or k.startswith("NCCL_") or \ + k.startswith("GLOG_"): + default_envs[k] = v + + # ======== for dist training ======= + node_trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + current_ip = os.getenv("POD_IP", "127.0.0.1") + trainer_ips = os.getenv("PADDLE_TRAINERS", current_ip).split(",") + num_nodes = len(trainer_ips) + all_nodes_devices_endpoints = "" + for n in trainer_ips: + for i in range(gpus): + if all_nodes_devices_endpoints: + all_nodes_devices_endpoints += "," + all_nodes_devices_endpoints += "%s:617%d" % (n, i) + nranks = num_nodes * gpus + # ======== for dist training ======= + + for i in range(gpus): + curr_env = {} + curr_env.update(default_envs) + curr_env.update({ + "FLAGS_selected_gpus": "%d" % i, + "PADDLE_TRAINER_ID": "%d" % (node_trainer_id * gpus + i), + "PADDLE_CURRENT_ENDPOINT": "%s:617%d" % (current_ip, i), + # nranks + "PADDLE_TRAINERS_NUM": "%d" % nranks, + "PADDLE_TRAINER_ENDPOINTS": all_nodes_devices_endpoints + }) + + print("starting process ", i, entrypoint, entrypoint_args, curr_env) + fn = open("%s/workerlog.%d" % (log_dir, i), "w") + log_fns.append(fn) + cmd = [sys.executable, "-u", entrypoint] + entrypoint_args + procs.append(subprocess.Popen(cmd, stdout=fn, stderr=fn, env=curr_env)) + for i in range(gpus): + try: + procs[i].communicate() + procs[i].terminate() + log_fns[i].close() + except: + pass -def _print_arguments(args): - print("----------- Configuration Arguments -----------") - for arg, value in sorted(six.iteritems(vars(args))): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") +def parse_args(): -def _parse_args(): - """ - Helper function parsing the command line options - @retval ArgumentParser - """ - parser = ArgumentParser( + parser = argparse.ArgumentParser( description='''start paddle training using multi-process mode. 
NOTE: your train program ***must*** run as distributed nccl2 mode, see: http://www.paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2- @@ -75,148 +104,33 @@ PADDLE_TRAINERS_NUM PADDLE_TRAINER_ENDPOINTS POD_IP (current node ip address, not needed for local training) ''') - - # Optional arguments for the launch helper parser.add_argument( - "--cluster_node_ips", - type=str, - default="127.0.0.1", - help="Paddle cluster nodes ips, such as 192.168.0.16,192.168.0.17..") - - parser.add_argument( - "--node_ip", - type=str, - default="127.0.0.1", - help="The current node ip. ") - - parser.add_argument( - "--started_port", + '--gpus', type=int, - default=6170, - help="The trainer's started port on a single node") - - parser.add_argument( - "--print_config", - type=bool, - default=True, - help="Print the config or not") - + default=8, + help='number of GPUs to use; one worker process is started per GPU') parser.add_argument( - "--selected_gpus", + '--log_dir', type=str, - default=None, - help="It's for gpu trainning and the trainning process will run on the selected_gpus," - "each process is bound to a single GPU. And if it's not setted, this module will use all the gpu cards for training." - ) - + default="mylog", + help='directory to put logs per process.') parser.add_argument( - "--log_dir", + 'entrypoint_script', type=str, - help="The path for each process's log.If it's not setted, the log will printed to default pipe." - ) - - # positional - parser.add_argument( - "training_script", - type=str, - help="The full path to the single GPU training " - "program/script to be launched in parallel, " - "followed by all the arguments for the " - "training script") - - # rest from the training program - parser.add_argument('training_script_args', nargs=REMAINDER) + help="The entrypoint script to be launched in parallel," + "followed by all the arguments for each process," + "e.g. train.py --lr 0.1") + parser.add_argument('entrypoint_args', nargs=argparse.REMAINDER) return parser.parse_args() -def start_procs(args): - """ - """ - procs = [] - log_fns = [] - - default_env = os.environ.copy() - - current_node_ip = args.node_ip - node_ips = [x.strip() for x in args.cluster_node_ips.split(',')] - node_id = node_ips.index(current_node_ip) - num_nodes = len(node_ips) - - if args.selected_gpus is None: - gpus_num = fluid.core.get_cuda_device_count() - selected_gpus = [str(x) for x in range(0, gpus_num)] - else: - selected_gpus = [x.strip() for x in args.selected_gpus.split(',')] - selected_gpus_num = len(selected_gpus) - - trainers_endpoints = "" - for ip in node_ips: - for i in range(selected_gpus_num): - if trainers_endpoints != "": - trainers_endpoints += "," - trainers_endpoints += "%s:617%d" % (ip, i) - - nranks = num_nodes * selected_gpus_num - - if args.print_config: - print("trainers_endpoints:", trainers_endpoints, ", node_id:", node_id, - ", current_node_ip:", current_node_ip, ", num_nodes:", num_nodes, - ", node_ips:", node_ips, ", nranks:", nranks) - - current_env = copy.copy(default_env) - # paddle broadcast ncclUniqueId use socket, and - # proxy maybe make trainers unreachable, so delete them. - # if we set them to "", grpc will log error message "bad uri" - # so just delete them.
- current_env.pop("http_proxy", None) - current_env.pop("https_proxy", None) - - procs = [] - cmds = [] - for i in range(0, selected_gpus_num): - current_env.update({ - "FLAGS_selected_gpus": "%s" % selected_gpus[i], - "PADDLE_TRAINER_ID": "%d" % (node_id * selected_gpus_num + i), - "PADDLE_CURRENT_ENDPOINT": - "%s:%d" % (current_node_ip, args.started_port + i), - "PADDLE_TRAINERS_NUM": "%d" % nranks, - "PADDLE_TRAINER_ENDPOINTS": trainers_endpoints - }) - - cmd = [sys.executable, "-u", args.training_script - ] + args.training_script_args - - cmds.append(cmd) - - if args.log_dir is not None: - os.system("mkdir -p {}".format(args.log_dir)) - fn = open("%s/workerlog.%d" % (args.log_dir, i), "w") - log_fns.append(fn) - - proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn) - else: - proc = subprocess.Popen(cmd, env=current_env) - - procs.append(proc) - - for i in range(0, len(procs)): - proc = procs[i] - - proc.wait() - if len(log_fns) > 0: - log_fns[i].close() - - if proc.returncode != 0: - raise subprocess.CalledProcessError( - returncode=procs[i].returncode, cmd=cmds[i]) - +def main(): + args = parse_args() -def launch(): - args = _parse_args() - if args.print_config: - _print_arguments(args) - start_procs(args) + # launch multiple training process + start_procs(args.gpus, args.entrypoint_script, args.entrypoint_args, + args.log_dir) if __name__ == "__main__": - launch() + main() diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 1a3a1dd5096..cf22c109bca 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -39,7 +39,6 @@ from . import contrib from . import nets from . import optimizer from . import backward -from .backward import gradients from . import regularizer from . import average from . import metrics @@ -55,7 +54,6 @@ from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from . import clip -from . import dygraph_grad_clip from . import profiler from . import unique_name from . import recordio_writer @@ -67,13 +65,14 @@ from paddle.fluid.layers.math_op_patch import monkey_patch_variable from . 
import install_check from .dygraph.nn import * from .dygraph.layers import * +from .cxx_trainer import * Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + \ trainer_desc.__all__ + inferencer.__all__ + transpiler.__all__ + \ parallel_executor.__all__ + lod_tensor.__all__ + \ - data_feed_desc.__all__ + compiler.__all__ + backward.__all__ + [ + data_feed_desc.__all__ + compiler.__all__ + [ 'io', 'initializer', 'layers', @@ -95,7 +94,6 @@ __all__ = framework.__all__ + executor.__all__ + \ 'WeightNormParamAttr', 'DataFeeder', 'clip', - 'dygraph_grad_clip', 'profiler', 'unique_name', 'recordio_writer', @@ -142,8 +140,8 @@ def __bootstrap__(): 'allocator_strategy', 'reader_queue_speed_test_mode', 'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism', 'enable_parallel_graph', 'fuse_parameter_groups_size', - 'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size', - 'tracer_profile_fname', 'dygraph_debug' + 'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize', + 'fuse_parameter_memory_size', 'tracer_profile_fname' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') @@ -170,7 +168,7 @@ def __bootstrap__(): # env for communicator read_env_flags.append('communicator_independent_recv_thread') read_env_flags.append('communicator_send_queue_size') - read_env_flags.append('communicator_min_send_grad_num_before_recv') + read_env_flags.append('communicator_max_send_grad_num_before_recv') read_env_flags.append('communicator_thread_pool_size') read_env_flags.append('communicator_max_merge_var_num') read_env_flags.append('communicator_fake_rpc') @@ -185,8 +183,8 @@ def __bootstrap__(): 'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb', 'reallocate_gpu_memory_in_mb', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', 'selected_gpus', 'sync_nccl_allreduce', - 'limit_of_tmp_allocation', + 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', + 'sync_nccl_allreduce', 'limit_of_tmp_allocation', 'times_excess_than_required_tmp_allocation', 'enable_inplace_whitelist', 'cudnn_batchnorm_spatial_persistent' ] diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 9de001849b9..c57b35d02ad 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -22,7 +22,7 @@ import six from .. import compat as cpt from . 
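Everything in `read_env_flags` above is consumed from `FLAGS_`-prefixed environment variables when `__bootstrap__` runs at import time, so such flags must be set before the first `import paddle.fluid`. A sketch using two flag names taken from the lists above:

```python
import os

# Must happen before paddle.fluid is imported anywhere in the process.
os.environ["FLAGS_fraction_of_gpu_memory_to_use"] = "0.5"
os.environ["FLAGS_tracer_profile_fname"] = "/tmp/tracer.prof"

import paddle.fluid as fluid  # __bootstrap__ now picks up the FLAGS_* values
```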
import unique_name -__all__ = ['append_backward', 'gradients'] +__all__ = ['append_backward'] def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): @@ -71,6 +71,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): op_desc.set_block_attr(name, val.desc) else: op_desc._set_attr(name, val) + op_desc.check_attrs() return op_desc @@ -142,7 +143,6 @@ def _addup_repetitive_outputs_(op_descs): pending_sum_ops = [] var_rename_count = collections.defaultdict(int) renamed_vars = collections.defaultdict(list) - renamed_var_start_idx = collections.defaultdict(list) for idx, op_desc in enumerate(op_descs): for var_name in op_desc.input_arg_names(): if len(renamed_vars[var_name]) > 1: @@ -160,7 +160,6 @@ def _addup_repetitive_outputs_(op_descs): if len(renamed_vars[var_name]) == 0: # it's the first time we get the variable renamed_vars[var_name] = [var_name] - renamed_var_start_idx[var_name] = idx else: if len(renamed_vars[var_name]) == 1: new_name = var_name + "@RENAME@" + \ @@ -168,12 +167,7 @@ def _addup_repetitive_outputs_(op_descs): var_rename_count[var_name] += 1 # rename original var_name renamed_vars[var_name][0] = new_name - # before change: _rename_arg_(op_descs, var_name, - # new_name, 0, idx) - # rename arg from idx of the first appearance - # in backward, not always from 0 - _rename_arg_(op_descs, var_name, new_name, - renamed_var_start_idx[var_name], idx) + _rename_arg_(op_descs, var_name, new_name, 0, idx) _rename_arg_(pending_sum_ops, var_name, new_name) for p in op_desc.output_names()[:param_idx]: @@ -239,8 +233,15 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): for arg in op_desc.input_arg_names(): if core.grad_var_suffix() in arg and arg in no_grad_set: x_in = _strip_grad_suffix_(arg) - to_insert.append((_create_op_desc_( - "fill_zeros_like", {"X": [x_in]}, {"Out": [arg]}, {}), idx)) + x_in_var_desc = op_desc.block().find_var_recursive( + cpt.to_bytes(x_in)) + assert x_in_var_desc is not None, "Variable {} not found".format( + x_in) + dtype = x_in_var_desc.dtype() + + to_insert.append( + (_create_op_desc_("fill_zeros_like2", {"X": [x_in]}, + {"Out": [arg]}, {"dtype": dtype}), idx)) list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)]) @@ -261,8 +262,7 @@ def _append_backward_ops_(block, target_block, no_grad_dict, grad_to_var, - callbacks=None, - input_grad_names_set=None): + callbacks=None): """ Create all grad ops, and insert them into given block @@ -294,13 +294,8 @@ def _append_backward_ops_(block, sub_block = program.block(op._block_attr_id("sub_block")) grad_sub_block = program._create_block() grad_sub_block._set_forward_block_idx(sub_block.idx) - # see follwing comments for why set None here. - pre_input_grad_names_set = copy.copy(input_grad_names_set) - input_grad_names_set = None _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block, - no_grad_dict, grad_to_var, callbacks, - input_grad_names_set) - input_grad_names_set = pre_input_grad_names_set + no_grad_dict, grad_to_var, callbacks) program._rollback() grad_sub_block_list.append(grad_sub_block.desc) @@ -309,33 +304,8 @@ def _append_backward_ops_(block, grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list) - # If input_grad_names_set is not None, extend grad_op_descs only when - # any input grad in outputs of previous grad ops. - # But this strategy is not suited for while op for some control flow, - # for example, for while op, the grads maybe generated in next loop. 
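# [editor's sketch] A minimal standalone illustration of the filter that the
# removed block below implemented: keep a grad op only if it has no @GRAD
# inputs at all (non-standard grad ops such as increment) or if one of its
# @GRAD inputs was produced by an already-kept grad op, seeding the set from
# the loss gradient. The helper itself is hypothetical, not a Paddle API; the
# op objects are assumed to expose the input_arg_names()/output_arg_names()
# methods used elsewhere in this file.
def _filter_grad_ops(grad_op_descs, known_grad_names, grad_suffix="@GRAD"):
    kept = []
    for op_desc in grad_op_descs:
        grad_inputs = [
            name for name in op_desc.input_arg_names() if grad_suffix in name
        ]
        if not grad_inputs or any(n in known_grad_names for n in grad_inputs):
            kept.append(op_desc)
            # outputs of a kept grad op become usable grad inputs downstream
            known_grad_names.update(op_desc.output_arg_names())
    return kept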
- if input_grad_names_set is not None: - is_append_grad = False - for op_desc in grad_op_desc: - input_grad_names = [ - name for name in op_desc.input_arg_names() - if name.find(core.grad_var_suffix()) != -1 - ] - # some code of gradient ops, like increment, are not very - # standard, there is no @GRAD in these ops' inputs. - if len(input_grad_names) == 0: - is_append_grad = True - break - - if _some_in_set_(input_grad_names, input_grad_names_set): - grad_op_descs.append(op_desc) - is_append_grad = True - for name in op_desc.output_arg_names(): - input_grad_names_set.add(name) - if is_append_grad: - grad_to_var.update(op_grad_to_var) - else: - grad_op_descs.extend(grad_op_desc) - grad_to_var.update(op_grad_to_var) + grad_op_descs.extend(grad_op_desc) + grad_to_var.update(op_grad_to_var) grad_op_descs = _addup_repetitive_outputs_(grad_op_descs) @@ -519,8 +489,6 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, isinstance(callbacks, list) program = loss.block.program - program._appending_grad_times += 1 - if no_grad_set is None: no_grad_set = set() no_grad_set = copy.copy(no_grad_set) @@ -551,23 +519,10 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0])) op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set) - no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) - input_grad_names_set = None - # For double backward, input_grad_names is used for filter - # some non-used gradients op. - if program._appending_grad_times > 1: - input_grad_names_set = set([_append_grad_suffix_(loss.name)]) - - _append_backward_ops_( - root_block, - op_path, - root_block, - no_grad_dict, - grad_to_var, - callbacks, - input_grad_names_set=input_grad_names_set) + _append_backward_ops_(root_block, op_path, root_block, no_grad_dict, + grad_to_var, callbacks) # Because calc_gradient may be called multiple times, # we need rename the internal gradient variables so that they have @@ -671,20 +626,17 @@ def _find_op_path_(block, outputs, inputs, no_grad_set): def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): """ - Backpropagate the gradients of targets to inputs. + Backpropagate the gradients of targets to inputs. Args: targets(Variable|list[Variable]): The target variables inputs(Variable|list[Variable]): The input variables - target_gradients (Variable|list[Variable]|None): The gradient variables - of targets which has the same shape with targets, If None, ones will - be created for them. no_grad_set(set[string]): The names of variables that have no gradients in Block 0. All variables with `stop_gradient=True` from all blocks will be automatically added.
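# [editor's note] A hedged usage sketch for calc_gradient under the fluid 1.x
# API, adapted from the fluid.gradients example removed further below; the
# network and shapes are illustrative only.
#
#     import paddle.fluid as fluid
#     x = fluid.layers.data(name='x', shape=[2, 8, 8], dtype='float32')
#     x.stop_gradient = False
#     y = fluid.layers.relu(fluid.layers.conv2d(x, 4, 1, bias_attr=False))
#     dx = fluid.backward.calc_gradient(y, x)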
Return: - (list[Variable]): A list of gradients for inputs + (list[Variable]): list of gradients for inputs If an input does not affect targets, the corresponding gradient variable will be None """ @@ -694,8 +646,6 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): block = targets[0].block prog = block.program - # increase appending gradients times - prog._appending_grad_times += 1 block_idx = block.idx if not target_gradients: @@ -713,8 +663,6 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): fwd_op_num = block.desc.op_size() - input_grad_names_set = set() - target_grad_map = {} for i, grad in enumerate(target_gradients): target = targets[i] @@ -730,7 +678,6 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): 'output_dim_idx': 0 }) block.desc.append_op().copy_from(op_desc) - input_grad_names_set.add(grad_name) else: if target.block.idx != block_idx or target.block.program != prog: raise ValueError("all targets must be in the same block") @@ -739,12 +686,6 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): "The shapes of target and grad are different: %s %s" % ( target.name, grad.name)) target_grad_map[_append_grad_suffix_(target.name)] = grad.name - input_grad_names_set.add(grad.name) - - # For double backward, input_grad_names is used for filter - # some non-used gradients op. - if prog._appending_grad_times == 1: - input_grad_names_set = None for input in inputs: if input.block.program != prog: @@ -755,13 +696,7 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) grad_to_var = dict() grad_info_map = dict() - _append_backward_ops_( - block, - op_path, - block, - no_grad_dict, - grad_to_var, - input_grad_names_set=input_grad_names_set) + _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var) # Because calc_gradient may be called multiple times, # we need rename the internal gradient variables so that they have @@ -785,40 +720,3 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): return grad_vars[0] else: return grad_vars - - -def gradients(targets, inputs, target_gradients=None, no_grad_set=None): - """ - Backpropagate the gradients of targets to inputs. - - Args: - targets (Variable|list[Variable]): The target variables. - inputs (Variable|list[Variable]): The input variables. - target_gradients (Variable|list[Variable]|None): The gradient variables - of targets which has the same shape with targets, If None, ones will - be created for them. - no_grad_set (set[string]): The names of variables that have no gradients - in Block 0. All variables with `stop_gradient=True` from all blocks - will be automatically added. - - Return: - (list[Variable]): A list of gradients for inputs - If an input does not affect targets, the corresponding gradient variable - will be None. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - - x = fluid.layers.data(name='x', shape=[2,8,8], dtype='float32') - x.stop_gradient=False - y = fluid.layers.conv2d(x, 4, 1, bias_attr=False) - y = fluid.layers.relu(y) - y = fluid.layers.conv2d(y, 4, 1, bias_attr=False) - y = fluid.layers.relu(y) - z = fluid.gradients([y], x) - print(z) - """ - outs = calc_gradient(targets, inputs, target_gradients, no_grad_set) - return _as_list(outs) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 1c51ef296c6..0f7dd531b3e 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -21,7 +21,6 @@ import functools from . import layers from . import framework from . import core -from .dygraph import not_support __all__ = [ 'ErrorClipByValue', @@ -56,23 +55,7 @@ class ErrorClipByValue(BaseErrorClipAttr): Examples: .. code-block:: python - import paddle.fluid as fluid - BATCH_SIZE = 128 - CLIP_MAX = 2e-6 - CLIP_MIN = -1e-6 - prog = fluid.framework.Program() - with fluid.program_guard(main_program=prog): - image = fluid.layers.data(name='x', shape=[784], dtype='float32') - hidden1 = fluid.layers.fc(input=image, size=128, act='relu') - hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') - predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') - label = fluid.layers.data(name='y', shape=[1], dtype='int64') - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) - prog_clip = prog.clone() - prog_clip.block(0).var(hidden1.name)._set_error_clip( - fluid.clip.ErrorClipByValue( - max=CLIP_MAX, min=CLIP_MIN) + var = fluid.framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...) """ def __init__(self, max, min=None): @@ -151,14 +134,12 @@ class GradientClipByValue(BaseGradientClipAttr): Examples: .. code-block:: python - import paddle.fluid as fluid w_param_attrs = fluid.ParamAttr(name=None, initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0), learning_rate=1.0, regularizer=fluid.regularizer.L1Decay(1.0), trainable=True, - gradient_clip=fluid.clip.GradientClipByValue(-1.0, 1.0)) - x = fluid.layers.data(name='x', shape=[10], dtype='float32') + clip=fluid.clip.GradientClipByValue(-1.0, 1.0)) y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) """ @@ -204,14 +185,12 @@ class GradientClipByNorm(BaseGradientClipAttr): Examples: .. code-block:: python - import paddle.fluid as fluid - w_param_attrs = fluid.ParamAttr(name=None, + w_param_attrs = fluid.ParamAttr(name=None, initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0), learning_rate=1.0, regularizer=fluid.regularizer.L1Decay(1.0), trainable=True, - gradient_clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)) - x = fluid.layers.data(name='x', shape=[10], dtype='float32') + clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)) y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) """ @@ -260,20 +239,6 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): Examples: ..
code-block:: python - import paddle.fluid as fluid - prog = fluid.framework.Program() - startup_program = fluid.framework.Program() - with fluid.program_guard( - main_program=prog, startup_program=startup_program): - image = fluid.layers.data(name='x', shape=[784], dtype='float32') - label = fluid.layers.data(name='y', shape=[1], dtype='int64') - hidden1 = fluid.layers.fc(input=image, size=128, act='relu') - hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') - predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) - prog_clip = prog.clone() - avg_cost_clip = prog_clip.block(0).var(avg_cost.name) p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) with fluid.program_guard(main_program=prog_clip): @@ -336,7 +301,6 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): return param, new_grad -@not_support def set_gradient_clip(clip, param_list=None, program=None): """ To specify parameters that require gradient clip. diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 87a6ce0881f..f01a6dd9da2 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -98,7 +98,6 @@ class CompiledProgram(object): def __init__(self, program_or_graph): if isinstance(program_or_graph, core.Graph): self._graph = program_or_graph - # don't not create a new program here. self._program = None elif isinstance(program_or_graph, framework.Program): self._graph = core.Graph(program_or_graph.desc) @@ -107,6 +106,7 @@ class CompiledProgram(object): raise ValueError("Wrong program_to_graph type: %s" % type(program_or_graph)) + self._program_desc = self._graph.origin_program_desc() self._scope = None self._place = None self._executor = None @@ -299,7 +299,6 @@ class CompiledProgram(object): # TODO(wuyi): trainer endpoings should be passed in through # build_strategy, not program.xxx. - # TODO(gongwb): let user to set them once. if self._program and self._build_strategy.num_trainers > 1 and \ self._program._trainers_endpoints: tps = self._program._trainers_endpoints @@ -308,12 +307,6 @@ class CompiledProgram(object): tps), "num_trainers == len(end_points)" self._build_strategy.trainers_endpoints = tps - if self._program: - self._build_strategy.nccl_comm_num = self._program._nccl_comm_num - self._build_strategy.use_hierarchical_allreduce_ = self._program._use_hierarchical_allreduce - self._build_strategy.hierarchical_allreduce_inter_nranks_ = self._program._hierarchical_allreduce_inter_nranks - self._build_strategy.hierarchical_allreduce_exter_nranks_ = self._program._hierarchical_allreduce_exter_nranks - if self._build_strategy.sync_batch_norm: self._build_strategy.enable_sequential_execution = True diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index 72437c0138f..f808f30bba4 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -36,8 +36,6 @@ from . import model_stat from .model_stat import * from . import mixed_precision from .mixed_precision import * -from . 
import layers -from .layers import * __all__ = [] __all__ += decoder.__all__ @@ -50,4 +48,3 @@ __all__ += slim.__all__ __all__ += utils.__all__ __all__ += extend_optimizer.__all__ __all__ += ['mixed_precision'] -__all__ += layers.__all__ diff --git a/python/paddle/fluid/contrib/int8_inference/README.md b/python/paddle/fluid/contrib/int8_inference/README.md index 7dc7c8d2a37..3228610f968 100644 --- a/python/paddle/fluid/contrib/int8_inference/README.md +++ b/python/paddle/fluid/contrib/int8_inference/README.md @@ -6,7 +6,7 @@ PaddlePaddle supports offline INT8 calibration to accelerate the inference speed You need to install at least PaddlePaddle-1.3 python package `pip install paddlepaddle==1.3`. ## 1. How to generate INT8 model -You can refer to the unit test in [test_calibration_resnet50.py](../tests/test_calibration_resnet50.py). Basically, there are three steps: +You can refer to the unit test in [test_calibration.py](../tests/test_calibration.py). Basically, there are three steps: * Construct calibration object. ```python @@ -68,19 +68,18 @@ Notes: * The INT8 theoretical speedup is 4X on Intel® Xeon® Cascadelake Server (please refer to `The theoretical peak compute gains are 4x int8 OPS over fp32 OPS.` in [Reference](https://software.intel.com/en-us/articles/lower-numerical-precision-deep-learning-inference-and-training "Reference")). Therefore, op-level gain is 4X and topology-level is smaller. ## 4. How to reproduce the results -* Small dataset for ResNet-50 (Single core) +* Small dataset (Single core) ```bash -FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration_resnet50.py +FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/tests/test_calibration.py ``` ->Note: Change `test_calibration_resnet50.py` to `test_calibration_mobilenetv1.py` for MobileNet-V1. Same for the following commands. -* Full dataset for ResNet-50 (Single core) +* Full dataset (Single core) ```bash -FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration_resnet50.py +FLAGS_use_mkldnn=true DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py ``` -* Full dataset for ResNet-50 (Multi-core) +* Full dataset (Multi-core) ```bash -FLAGS_use_mkldnn=true OMP_NUM_THREADS=20 DATASET=full python python/paddle/fluid/contrib/tests/test_calibration_resnet50.py +FLAGS_use_mkldnn=true OMP_NUM_THREADS=20 DATASET=full python python/paddle/fluid/contrib/tests/test_calibration.py ``` > Notes: This is an example command with 20 cores by using set `OMP_NUM_THREADS` value. diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index e07f6ce8ab7..f17b63434de 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -18,7 +18,6 @@ from ... import layers from ... import unique_name from . import fp16_utils from .fp16_utils import create_master_params_grads, master_param_to_train_param -from .fp16_utils import update_loss_scaling __all__ = ["decorate"] @@ -36,51 +35,15 @@ class OptimizerWithMixedPrecison(object): optimizer (Optimizer): A common Optimizer object. init_loss_scaling (float): The initial loss scaling factor. use_dynamic_loss_scaling (bool): Whether to use dynamic loss scaling. - incr_every_n_steps(int): Increases loss scaling every n consecutive - steps with finite gradients. - decr_every_n_nan_or_inf(int): Decreases loss scaling every n - accumulated steps with nan or - inf gradients. 
- incr_ratio(float): The multiplier to use when increasing the loss - scaling. - decr_ratio(float): The less-than-one-multiplier to use when decreasing - the loss scaling. - """ - def __init__(self, optimizer, init_loss_scaling, use_dynamic_loss_scaling, - incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, - decr_ratio): + def __init__(self, optimizer, init_loss_scaling, use_dynamic_loss_scaling): self._optimizer = optimizer self._param_grads = None self._train_program = default_main_program() self._startup_prog = default_startup_program() - self._loss_scaling = layers.create_global_var( - name=unique_name.generate("loss_scaling"), - shape=[1], - value=init_loss_scaling, - dtype='float32', - persistable=True) + self._loss_scaling = init_loss_scaling self._use_dynamic_loss_scaling = use_dynamic_loss_scaling - if self._use_dynamic_loss_scaling: - self._incr_every_n_steps = layers.fill_constant( - shape=[1], dtype='int32', value=incr_every_n_steps) - self._decr_every_n_nan_or_inf = layers.fill_constant( - shape=[1], dtype='int32', value=decr_every_n_nan_or_inf) - self._incr_ratio = incr_ratio - self._decr_ratio = decr_ratio - self._num_good_steps = layers.create_global_var( - name=unique_name.generate("num_good_steps"), - shape=[1], - value=0, - dtype='int32', - persistable=True) - self._num_bad_steps = layers.create_global_var( - name=unique_name.generate("num_bad_steps"), - shape=[1], - value=0, - dtype='int32', - persistable=True) # Ensure the data type of learning rate vars is float32 (same as the # master parameter dtype) @@ -141,33 +104,9 @@ class OptimizerWithMixedPrecison(object): Returns: A list of optimize operators. """ - - if self._use_dynamic_loss_scaling: - - grads = [layers.reduce_sum(g) for [_, g] in master_params_grads] - all_grads = layers.concat(grads) - all_grads_sum = layers.reduce_sum(all_grads) - is_overall_finite = layers.isfinite(all_grads_sum) - - update_loss_scaling(is_overall_finite, self._loss_scaling, - self._num_good_steps, self._num_bad_steps, - self._incr_every_n_steps, - self._decr_every_n_nan_or_inf, self._incr_ratio, - self._decr_ratio) - - # apply_gradient append all ops in global block, thus we shouldn't - # apply gradient in the switch branch. - with layers.Switch() as switch: - with switch.case(is_overall_finite): - pass - with switch.default(): - for _, g in master_params_grads: - layers.assign(layers.zeros_like(g), g) - optimize_ops = self._optimizer.apply_gradients(master_params_grads) master_param_to_train_param(master_params_grads, self._param_grads, self._train_program) - return optimize_ops def minimize(self, loss): @@ -187,28 +126,13 @@ class OptimizerWithMixedPrecison(object): return scaled_loss, optimize_ops, master_params_grads -def decorate(optimizer, - init_loss_scaling=1.0, - incr_every_n_steps=1000, - decr_every_n_nan_or_inf=2, - incr_ratio=2.0, - decr_ratio=0.8, - use_dynamic_loss_scaling=False): +def decorate(optimizer, init_loss_scaling=1.0, use_dynamic_loss_scaling=False): """ Decorate the given optimizer to adapt to the mixed-precision training. Args: optimizer(Optimizer): A common Optimizer. init_loss_scaling(float): The initial loss scaling factor. - incr_every_n_steps(int): Increases loss scaling every n consecutive - steps with finite gradients. - decr_every_n_nan_or_inf(int): Decreases loss scaling every n - accumulated steps with nan or - inf gradients. - incr_ratio(float): The multiplier to use when increasing the loss - scaling. - decr_ratio(float): The less-than-one-multiplier to use when decreasing - the loss scaling. 
use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. Returns: @@ -227,8 +151,7 @@ def decorate(optimizer, scaled_loss, _, _ = mp_optimizer.minimize(loss) """ - mp_optimizer = OptimizerWithMixedPrecison( - optimizer, init_loss_scaling, use_dynamic_loss_scaling, - incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio) + mp_optimizer = OptimizerWithMixedPrecison(optimizer, init_loss_scaling, + use_dynamic_loss_scaling) return mp_optimizer diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 3445cdbcbb4..5e7fdcedead 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -91,11 +91,15 @@ def create_master_params_grads(params_grads, main_prog, startup_prog, append_cast_op(startup_p, startup_master_param, startup_prog) # cast fp16 gradients to fp32 before apply gradients if g.name.find("batch_norm") > -1: - scaled_g = g / loss_scaling + if loss_scaling > 1: + scaled_g = g / float(loss_scaling) + else: + scaled_g = g master_params_grads.append([p, scaled_g]) continue master_grad = layers.cast(x=g, dtype="float32") - master_grad = master_grad / loss_scaling + if loss_scaling > 1: + master_grad = master_grad / float(loss_scaling) master_params_grads.append([master_param, master_grad]) return master_params_grads @@ -119,77 +123,3 @@ def master_param_to_train_param(master_params_grads, params_grads, main_prog): with main_prog._optimized_guard([m_p_g[0], m_p_g[1]]): # fp32 -> fp16 append_cast_op(m_p_g[0], train_p, main_prog) - - -def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps, - num_bad_steps, incr_every_n_steps, - decr_every_n_nan_or_inf, incr_ratio, decr_ratio): - """ - Update loss scaling according to overall gradients. If all gradients is - finite after incr_every_n_steps, loss scaling will increase by incr_ratio. - Otherwisw, loss scaling will decrease by decr_ratio after - decr_every_n_nan_or_inf steps and each step some gradients are infinite. - - Args: - is_overall_finite (Variable): A boolean variable indicates whether - all gradients are finite. - prev_loss_scaling (Variable): Previous loss scaling. - num_good_steps (Variable): A variable accumulates good steps in which - all gradients are finite. - num_bad_steps (Variable): A variable accumulates bad steps in which - some gradients are infinite. - incr_every_n_steps (Variable): A variable represents increasing loss - scaling every n consecutive steps with - finite gradients. - decr_every_n_nan_or_inf (Variable): A variable represents decreasing - loss scaling every n accumulated - steps with nan or inf gradients. - incr_ratio(float): The multiplier to use when increasing the loss - scaling. - decr_ratio(float): The less-than-one-multiplier to use when decreasing - loss scaling. 
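# [editor's sketch] The dynamic loss-scaling policy documented above, restated
# in plain Python over scalars rather than fluid layers and Switch blocks; a
# reference for the semantics, not the removed implementation itself.
import math

def _update_loss_scaling_py(is_overall_finite, scaling, good_steps, bad_steps,
                            incr_every_n_steps, decr_every_n_nan_or_inf,
                            incr_ratio=2.0, decr_ratio=0.8):
    if is_overall_finite:
        if good_steps + 1 > incr_every_n_steps:
            new_scaling = scaling * incr_ratio
            if math.isfinite(new_scaling):  # only adopt a finite scale
                scaling = new_scaling
            good_steps, bad_steps = 0, 0
        else:
            good_steps, bad_steps = good_steps + 1, 0
    else:
        if bad_steps + 1 > decr_every_n_nan_or_inf:
            scaling = max(1.0, scaling * decr_ratio)  # never drops below 1.0
            good_steps, bad_steps = 0, 0
        else:
            good_steps, bad_steps = 0, bad_steps + 1
    return scaling, good_steps, bad_steps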
- """ - zero_steps = layers.fill_constant(shape=[1], dtype='int32', value=0) - with layers.Switch() as switch: - with switch.case(is_overall_finite): - should_incr_loss_scaling = layers.less_than(incr_every_n_steps, - num_good_steps + 1) - with layers.Switch() as switch1: - with switch1.case(should_incr_loss_scaling): - new_loss_scaling = prev_loss_scaling * incr_ratio - loss_scaling_is_finite = layers.isfinite(new_loss_scaling) - with layers.Switch() as switch2: - with switch2.case(loss_scaling_is_finite): - layers.assign(new_loss_scaling, prev_loss_scaling) - with switch2.default(): - pass - layers.assign(zero_steps, num_good_steps) - layers.assign(zero_steps, num_bad_steps) - - with switch1.default(): - layers.increment(num_good_steps) - layers.assign(zero_steps, num_bad_steps) - - with switch.default(): - should_decr_loss_scaling = layers.less_than(decr_every_n_nan_or_inf, - num_bad_steps + 1) - with layers.Switch() as switch3: - with switch3.case(should_decr_loss_scaling): - new_loss_scaling = prev_loss_scaling * decr_ratio - static_loss_scaling = \ - layers.fill_constant(shape=[1], - dtype='float32', - value=1.0) - less_than_one = layers.less_than(new_loss_scaling, - static_loss_scaling) - with layers.Switch() as switch4: - with switch4.case(less_than_one): - layers.assign(static_loss_scaling, - prev_loss_scaling) - with switch4.default(): - layers.assign(new_loss_scaling, prev_loss_scaling) - layers.assign(zero_steps, num_good_steps) - layers.assign(zero_steps, num_bad_steps) - with switch3.default(): - layers.assign(zero_steps, num_good_steps) - layers.increment(num_bad_steps) diff --git a/python/paddle/fluid/contrib/reader/README.md b/python/paddle/fluid/contrib/reader/README.md index 07c5430916a..9e4b7d1ce3d 100644 --- a/python/paddle/fluid/contrib/reader/README.md +++ b/python/paddle/fluid/contrib/reader/README.md @@ -13,13 +13,3 @@ and two types of data format: * label dense_fea,dense_fea sparse_fea,sparse_fea - the svm data format is : * label slot1:fea_sign slot2:fea_sign slot1:fea_sign - -## Distributed reader - -The distributed reader is mainly used by multi-process tasks, it splits the origin batch samples to N sub-batch samples, and the N is equal to the number of processes. The usage is similar to `paddle.batch`. - -Cons: - - It can be operated conveniently so that different processes can read different data. - -Pros: - - Because each process reads the original batch data and then divides the data, the performance may be poor. diff --git a/python/paddle/fluid/contrib/reader/__init__.py b/python/paddle/fluid/contrib/reader/__init__.py index e96acc5682a..4cf85ffc166 100644 --- a/python/paddle/fluid/contrib/reader/__init__.py +++ b/python/paddle/fluid/contrib/reader/__init__.py @@ -15,8 +15,5 @@ from __future__ import print_function from . import ctr_reader -from .distributed_reader import * -__all__ = [] -__all__ += distributed_reader.__all__ -__all__ += ctr_reader.__all__ +__all__ = ctr_reader.__all__ diff --git a/python/paddle/fluid/contrib/slim/core/compressor.py b/python/paddle/fluid/contrib/slim/core/compressor.py index 2627f7f004b..b97508018ac 100644 --- a/python/paddle/fluid/contrib/slim/core/compressor.py +++ b/python/paddle/fluid/contrib/slim/core/compressor.py @@ -12,14 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ....core import CPUPlace, EOFException +from ....core import CPUPlace from .... import compiler -from ....framework import Variable from .... import io from .... 
import profiler from .... import scope_guard from ....data_feeder import DataFeeder -from ....log_helper import get_logger from ..graph import * from .config import ConfigFactory import numpy as np @@ -30,12 +28,12 @@ import logging import sys import pickle import functools -import traceback __all__ = ['Context', 'Compressor'] -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') +logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') +_logger = logging.getLogger(__name__) +_logger.setLevel(logging.INFO) def cached_reader(reader, sampled_rate, cache_path, cached_id): @@ -85,8 +83,7 @@ class Context(object): eval_reader=None, teacher_graphs=None, train_optimizer=None, - distiller_optimizer=None, - search_space=None): + distiller_optimizer=None): """ Args: place: The device place where the compression job running. @@ -122,9 +119,6 @@ class Context(object): self.cache_path = './eval_cache' self.eval_results = {} - self.skip_training = False - self.search_space = search_space - def to_file(self, file_name): """ Save the context into file. @@ -187,30 +181,14 @@ class Context(object): if sampled_rate: reader = cached_reader(reader, sampled_rate, self.cache_path, cached_id) - - if isinstance(reader, Variable): - reader.start() - try: - while True: - result = executor.run(eval_graph, self.scope) - result = [np.mean(r) for r in result] - results.append(result) - if batch_id % 20 == 0: - _logger.info("batch-{}; {}={}".format( - batch_id, eval_graph.out_nodes.keys(), result)) - batch_id += 1 - except EOFException: - reader.reset() - else: - for data in reader(): - result = executor.run(eval_graph, self.scope, data=data) - result = [np.mean(r) for r in result] - results.append(result) - if batch_id % 20 == 0: - _logger.info("batch-{}; {}={}".format( - batch_id, eval_graph.out_nodes.keys(), result)) - batch_id += 1 - + for data in reader(): + result = executor.run(eval_graph, self.scope, data=data) + result = [np.mean(r) for r in result] + results.append(result) + if batch_id % 20 == 0: + _logger.info("batch-{}; {}={}".format( + batch_id, eval_graph.out_nodes.keys(), result)) + batch_id += 1 result = np.mean(np.array(results), axis=0) _logger.info("Final eval result: {}={}".format( eval_graph.out_nodes.keys(), result)) @@ -243,10 +221,9 @@ class Compressor(object): eval_feed_list=None, eval_fetch_list=None, teacher_programs=[], - checkpoint_path=None, + checkpoint_path='./checkpoints', train_optimizer=None, - distiller_optimizer=None, - search_space=None): + distiller_optimizer=None): """ Args: place(fluid.Place): The device place where the compression job running. @@ -274,14 +251,12 @@ class Compressor(object): this optimizer is used to minimize the combined loss of student-net and teacher-net while train_optimizer is used to minimize loss of student-net in fine-tune stage. - search_space(slim.nas.SearchSpace): The instance that define the searching space. It must inherite - slim.nas.SearchSpace class and overwrite the abstract methods. 
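# [editor's note] The feed-list shape the assertions below expect; `image` and
# `gt` stand for variables from the user's own graph and are hypothetical here.
#
#     train_feed_list = [('image', image.name), ('label', gt.name)]
#     eval_feed_list = [('image', image.name), ('label', gt.name)]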
""" - assert train_feed_list is None or isinstance( + assert isinstance( train_feed_list, list ), "train_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]" - assert eval_feed_list is None or isinstance( + assert isinstance( eval_feed_list, list ), "eval_feed_list should be a list of tuple, such as [('image', image.name), ('label', gt.name)]" self.strategies = [] @@ -306,8 +281,6 @@ class Compressor(object): self.distiller_optimizer = distiller_optimizer self.init_model = None - self.search_space = search_space - def _add_strategy(self, strategy): """ Add a strategy to current compress pass. @@ -333,9 +306,6 @@ class Compressor(object): if 'init_model' in factory.compressor: self.init_model = factory.compressor['init_model'] - if 'eval_epoch' in factory.compressor: - self.eval_epoch = factory.compressor['eval_epoch'] - def _init_model(self, context): """ Load model that has been compressed. @@ -432,8 +402,7 @@ class Compressor(object): """ Train one epoch. """ - if context.skip_training: - return + executor = SlimGraphExecutor(self.place) if context.optimize_graph.compiled_graph is None: @@ -441,44 +410,21 @@ class Compressor(object): context.optimize_graph.program).with_data_parallel( loss_name=context.optimize_graph.out_nodes['loss']) - if isinstance(context.train_reader, Variable): - context.train_reader.start() - try: - while True: - - for strategy in self.strategies: - strategy.on_batch_begin(context) - results = executor.run(context.optimize_graph, - context.scope) - results = [float(np.mean(result)) for result in results] - if context.batch_id % 20 == 0: - _logger.info("epoch:{}; batch_id:{}; {} = {}".format( - context.epoch_id, context.batch_id, - context.optimize_graph.out_nodes.keys( - ), [round(r, 3) for r in results])) - for strategy in self.strategies: - strategy.on_batch_end(context) - context.batch_id += 1 - - except EOFException: - context.train_reader.reset() - - else: - for data in context.train_reader(): - for strategy in self.strategies: - strategy.on_batch_begin(context) - results = executor.run(context.optimize_graph, - context.scope, - data=data) - results = [float(np.mean(result)) for result in results] - if context.batch_id % 20 == 0: - _logger.info("epoch:{}; batch_id:{}; {} = {}".format( - context.epoch_id, context.batch_id, - context.optimize_graph.out_nodes.keys( - ), [round(r, 3) for r in results])) - for strategy in self.strategies: - strategy.on_batch_end(context) - context.batch_id += 1 + for data in context.train_reader(): + for strategy in self.strategies: + strategy.on_batch_begin(context) + results = executor.run(context.optimize_graph, + context.scope, + data=data) + results = [float(np.mean(result)) for result in results] + if context.batch_id % 20 == 0: + _logger.info("epoch:{}; batch_id:{}; {} = {}".format( + context.epoch_id, context.batch_id, + context.optimize_graph.out_nodes.keys( + ), [round(r, 3) for r in results])) + for strategy in self.strategies: + strategy.on_batch_end(context) + context.batch_id += 1 context.batch_id = 0 def _eval(self, context): @@ -504,8 +450,7 @@ class Compressor(object): eval_reader=self.eval_reader, teacher_graphs=self.teacher_graphs, train_optimizer=self.train_optimizer, - distiller_optimizer=self.distiller_optimizer, - search_space=self.search_space) + distiller_optimizer=self.distiller_optimizer) self.context = context if self.teacher_graphs: context.put('teachers', self.teacher_graphs) @@ -522,25 +467,18 @@ class Compressor(object): for strategy in self.strategies: 
strategy.on_compression_begin(context) - if 'MKLDNNPostTrainingQuantStrategy' in [ - i.__class__.__name__ for i in self.strategies - ]: - return None start = context.epoch_id + self._eval(context) for epoch in range(start, self.epoch): context.epoch_id = epoch - try: - for strategy in self.strategies: - strategy.on_epoch_begin(context) - self._train_one_epoch(context) - if self.eval_epoch and epoch % self.eval_epoch == 0: - self._eval(context) - self._save_checkpoint(context) - for strategy in self.strategies: - strategy.on_epoch_end(context) - except Exception: - _logger.error(traceback.print_exc()) - continue + for strategy in self.strategies: + strategy.on_epoch_begin(context) + self._train_one_epoch(context) + for strategy in self.strategies: + strategy.on_epoch_end(context) + if self.eval_epoch and epoch % self.eval_epoch == 0: + self._eval(context) + self._save_checkpoint(context) for strategy in self.strategies: strategy.on_compression_end(context) return context.eval_graph diff --git a/python/paddle/fluid/contrib/slim/core/config.py b/python/paddle/fluid/contrib/slim/core/config.py index 9b08a0324a5..9bb395aee95 100644 --- a/python/paddle/fluid/contrib/slim/core/config.py +++ b/python/paddle/fluid/contrib/slim/core/config.py @@ -20,15 +20,11 @@ from ..prune import * from ..quantization import * from .strategy import * from ..distillation import * -from ..searcher import * -from ..nas import * __all__ = ['ConfigFactory'] """This factory is used to create instances by loading and parsing configure file with yaml format. """ -PLUGINS = ['pruners', 'quantizers', 'quantizers', 'strategies', 'controllers'] - class ConfigFactory(object): def __init__(self, config): @@ -84,7 +80,7 @@ class ConfigFactory(object): assert self.version == int(key_values['version']) # parse pruners - if key in PLUGINS: + if key == 'distillers' or key == 'pruners' or key == 'quantizers' or key == 'strategies': instances = key_values[key] for name in instances: self._new_instance(name, instances[name]) @@ -95,12 +91,8 @@ class ConfigFactory(object): if 'init_model' in key_values[key]: self.compressor['init_model'] = key_values[key][ 'init_model'] - if 'checkpoint_path' in key_values[key]: - self.compressor['checkpoint_path'] = key_values[key][ - 'checkpoint_path'] - if 'eval_epoch' in key_values[key]: - self.compressor['eval_epoch'] = key_values[key][ - 'eval_epoch'] + self.compressor['checkpoint_path'] = key_values[key][ + 'checkpoint_path'] if 'strategies' in key_values[key]: for name in key_values[key]['strategies']: strategy = self.instance(name) diff --git a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py index 42389079f8d..d8e08c3ebef 100644 --- a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py +++ b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py @@ -14,14 +14,14 @@ from ..core.strategy import Strategy from ....framework import Program, Variable, program_guard -from ....log_helper import get_logger from .... 
import Executor import logging __all__ = ['DistillationStrategy'] -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') +logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') +_logger = logging.getLogger(__name__) +_logger.setLevel(logging.INFO) class DistillationStrategy(Strategy): diff --git a/python/paddle/fluid/contrib/slim/graph/executor.py b/python/paddle/fluid/contrib/slim/graph/executor.py index 041ccbb3a31..70438a90eb7 100644 --- a/python/paddle/fluid/contrib/slim/graph/executor.py +++ b/python/paddle/fluid/contrib/slim/graph/executor.py @@ -41,7 +41,6 @@ class SlimGraphExecutor(object): results(list): A list of result with the same order indicated by graph.out_nodes. """ assert isinstance(graph, GraphWrapper) - feed = None if data is not None: feeder = DataFeeder( feed_list=graph.in_nodes.values(), diff --git a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py index b01c98aab9d..e7f5f0d6a21 100644 --- a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py +++ b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py @@ -209,7 +209,6 @@ class GraphWrapper(object): if var.persistable: self.persistables[var.name] = var self.compiled_graph = None - in_nodes = [] if in_nodes is None else in_nodes self.in_nodes = OrderedDict(in_nodes) self.out_nodes = OrderedDict(out_nodes) self._attrs = OrderedDict() @@ -242,7 +241,7 @@ class GraphWrapper(object): """ return var._var.persistable - def compile(self, for_parallel=True, for_test=False, mem_opt=False): + def compile(self, for_parallel=True, for_test=False): """ Compile the program in this wrapper to framework.CompiledProgram for next running. This function must be called if the program is modified. @@ -258,9 +257,8 @@ class GraphWrapper(object): if for_parallel: # disable memory optimize for stable training build_strategy = compiler.BuildStrategy() - build_strategy.enable_inplace = mem_opt - build_strategy.memory_optimize = mem_opt - # build_strategy.async_mode = False + build_strategy.enable_inplace = False + build_strategy.memory_optimize = False self.compiled_graph = compiler.CompiledProgram( target).with_data_parallel( loss_name=loss, build_strategy=build_strategy) @@ -477,12 +475,8 @@ class GraphWrapper(object): for var in self.program.list_vars(): if var.persistable and var.name not in self.persistables: self.persistables[var.name] = var - persistables = [] - for var in self.persistables: - if 'reader' not in var and 'double_buffer' not in var: - persistables.append(self.persistables[var]) - io.save_vars(exe.exe, path, vars=persistables) + io.save_vars(exe.exe, path, vars=self.persistables.values()) def load_persistables(self, path, exe): """ @@ -495,11 +489,8 @@ class GraphWrapper(object): def if_exist(var): return os.path.exists(os.path.join(path, var.name)) - persistables = [] - for var in self.persistables: - if 'reader' not in var and 'double_buffer' not in var: - persistables.append(self.persistables[var]) - io.load_vars(exe.exe, path, vars=persistables, predicate=if_exist) + io.load_vars( + exe.exe, path, vars=self.persistables.values(), predicate=if_exist) def update_param_shape(self, scope): """ diff --git a/python/paddle/fluid/contrib/slim/prune/__init__.py b/python/paddle/fluid/contrib/slim/prune/__init__.py index ae487a21e34..764a45bb130 100644 --- a/python/paddle/fluid/contrib/slim/prune/__init__.py +++ b/python/paddle/fluid/contrib/slim/prune/__init__.py @@ -16,9 +16,6 @@ from . 
import pruner from .pruner import * from . import prune_strategy from .prune_strategy import * -from . import auto_prune_strategy -from .auto_prune_strategy import * __all__ = pruner.__all__ __all__ += prune_strategy.__all__ -__all__ += auto_prune_strategy.__all__ diff --git a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py index 6f430bc9e2f..7a25c3a61e0 100644 --- a/python/paddle/fluid/contrib/slim/prune/prune_strategy.py +++ b/python/paddle/fluid/contrib/slim/prune/prune_strategy.py @@ -15,7 +15,6 @@ from ..core.strategy import Strategy from ..graph import VarWrapper, OpWrapper, GraphWrapper from ....framework import Program, program_guard, Parameter -from ....log_helper import get_logger from .... import layers import prettytable as pt import numpy as np @@ -27,10 +26,11 @@ import pickle import logging import sys -__all__ = ['SensitivePruneStrategy', 'UniformPruneStrategy', 'PruneStrategy'] +__all__ = ['SensitivePruneStrategy', 'UniformPruneStrategy'] -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') +logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') +_logger = logging.getLogger(__name__) +_logger.setLevel(logging.INFO) class PruneStrategy(Strategy): @@ -61,6 +61,8 @@ class PruneStrategy(Strategy): self.metric_name = metric_name self.pruned_params = pruned_params self.pruned_list = [] + self.backup = {} + self.param_shape_backup = {} def _eval_graph(self, context, sampled_rate=None, cached_id=0): """ @@ -80,9 +82,7 @@ class PruneStrategy(Strategy): ratio, place, lazy=False, - only_graph=False, - param_shape_backup=None, - param_backup=None): + only_graph=False): """ Pruning filters by given ratio. Args: @@ -103,16 +103,16 @@ class PruneStrategy(Strategy): for param in params: assert isinstance(param, VarWrapper) param_t = scope.find_var(param.name()).get_tensor() - if param_backup is not None and (param.name() not in param_backup): - param_backup[param.name()] = copy.deepcopy(np.array(param_t)) + if lazy: + self.backup[param.name()] = copy.deepcopy(np.array(param_t)) pruned_param = self.pruner.prune_tensor( np.array(param_t), pruned_idx, pruned_axis=0, lazy=lazy) if not only_graph: param_t.set(pruned_param, place) ori_shape = param.shape() - if param_shape_backup is not None and ( - param.name() not in param_shape_backup): - param_shape_backup[param.name()] = copy.deepcopy(param.shape()) + if param.name() not in self.param_shape_backup: + self.param_shape_backup[param.name()] = copy.deepcopy( + param.shape()) new_shape = list(param.shape()) new_shape[0] = pruned_param.shape[0] param.set_shape(new_shape) @@ -120,8 +120,7 @@ class PruneStrategy(Strategy): '|----------------------------------------+----+------------------------------+------------------------------|' ) _logger.debug('|{:^40}|{:^4}|{:^30}|{:^30}|'.format( - str(param.name()), - str(ratio), str(ori_shape), str(param.shape()))) + str(param.name()), str(0), str(ori_shape), str(param.shape()))) self.pruned_list[0].append(param.name()) return pruned_idx @@ -132,9 +131,7 @@ class PruneStrategy(Strategy): pruned_axis, place, lazy=False, - only_graph=False, - param_shape_backup=None, - param_backup=None): + only_graph=False): """ Pruning parameters in given axis. 
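# [editor's sketch] What pruning along an axis amounts to for a dense tensor,
# in plain NumPy; this mirrors the intent of pruner.prune_tensor as used in
# this file (including the lazy mode that zeroes instead of removing), not its
# actual implementation.
import numpy as np

def _prune_axis(param, pruned_idx, pruned_axis, lazy=False):
    if lazy:
        out = param.copy()
        index = [slice(None)] * param.ndim
        index[pruned_axis] = pruned_idx
        out[tuple(index)] = 0.0  # keep the shape, zero the pruned slices
        return out
    return np.delete(param, pruned_idx, axis=pruned_axis)  # drop the slices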
Args: @@ -153,17 +150,16 @@ class PruneStrategy(Strategy): for param in params: assert isinstance(param, VarWrapper) param_t = scope.find_var(param.name()).get_tensor() - if param_backup is not None and (param.name() not in param_backup): - param_backup[param.name()] = copy.deepcopy(np.array(param_t)) + if lazy: + self.backup[param.name()] = copy.deepcopy(np.array(param_t)) pruned_param = self.pruner.prune_tensor( np.array(param_t), pruned_idx, pruned_axis, lazy=lazy) if not only_graph: param_t.set(pruned_param, place) ori_shape = param.shape() - - if param_shape_backup is not None and ( - param.name() not in param_shape_backup): - param_shape_backup[param.name()] = copy.deepcopy(param.shape()) + if param.name() not in self.param_shape_backup: + self.param_shape_backup[param.name()] = copy.deepcopy( + param.shape()) new_shape = list(param.shape()) new_shape[pruned_axis] = pruned_param.shape[pruned_axis] param.set_shape(new_shape) @@ -255,9 +251,7 @@ class PruneStrategy(Strategy): ratio=None, pruned_idxs=None, lazy=False, - only_graph=False, - param_backup=None, - param_shape_backup=None): + only_graph=False): """ Pruning all the parameters affected by the pruning of given parameter. Args: @@ -290,9 +284,7 @@ class PruneStrategy(Strategy): pruned_axis=0, place=place, lazy=lazy, - only_graph=only_graph, - param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) else: pruned_idxs = self._prune_filters_by_ratio( @@ -300,9 +292,7 @@ class PruneStrategy(Strategy): ratio, place, lazy=lazy, - only_graph=only_graph, - param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) corrected_idxs = pruned_idxs[:] for idx, op in enumerate(related_ops): @@ -317,9 +307,7 @@ class PruneStrategy(Strategy): pruned_axis=1, place=place, lazy=lazy, - only_graph=only_graph, - param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) if op.type() == "depthwise_conv2d": for in_var in op.all_inputs(): if graph.is_parameter(in_var): @@ -331,9 +319,7 @@ class PruneStrategy(Strategy): pruned_axis=0, place=place, lazy=lazy, - only_graph=only_graph, - param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) elif op.type() == "elementwise_add": # pruning bias for in_var in op.all_inputs(): @@ -346,9 +332,7 @@ class PruneStrategy(Strategy): pruned_axis=0, place=place, lazy=lazy, - only_graph=only_graph, - param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) elif op.type() == "mul": # pruning fc layer fc_input = None fc_param = None @@ -370,9 +354,7 @@ class PruneStrategy(Strategy): pruned_axis=0, place=place, lazy=lazy, - only_graph=only_graph, - param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) elif op.type() == "concat": concat_inputs = op.all_inputs() @@ -396,36 +378,28 @@ class PruneStrategy(Strategy): pruned_axis=0, place=place, lazy=lazy, - only_graph=only_graph, - param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) self._prune_parameter_by_idx( scope, [variance] + self._get_accumulator(graph, variance), corrected_idxs, pruned_axis=0, place=place, lazy=lazy, - only_graph=only_graph, - param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) self._prune_parameter_by_idx( scope, [alpha] + self._get_accumulator(graph, alpha), corrected_idxs, pruned_axis=0, place=place, lazy=lazy, - only_graph=only_graph, - 
param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) self._prune_parameter_by_idx( scope, [beta] + self._get_accumulator(graph, beta), corrected_idxs, pruned_axis=0, place=place, lazy=lazy, - only_graph=only_graph, - param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) def _prune_parameters(self, graph, @@ -434,9 +408,7 @@ class PruneStrategy(Strategy): ratios, place, lazy=False, - only_graph=False, - param_backup=None, - param_shape_backup=None): + only_graph=False): """ Pruning the given parameters. Args: @@ -472,9 +444,7 @@ class PruneStrategy(Strategy): place, ratio=ratio, lazy=lazy, - only_graph=only_graph, - param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) ops = param.outputs() for op in ops: if op.type() == 'conv2d': @@ -488,9 +458,7 @@ class PruneStrategy(Strategy): place, ratio=ratio, lazy=lazy, - only_graph=only_graph, - param_backup=param_backup, - param_shape_backup=param_shape_backup) + only_graph=only_graph) _logger.debug( '|----------------------------------------+----+------------------------------+------------------------------|' ) @@ -607,24 +575,23 @@ class UniformPruneStrategy(PruneStrategy): _logger.debug( '-----------Try pruning ratio: {:.2f}-----------'.format(ratio)) ratios = [ratio] * len(pruned_params) - param_shape_backup = {} self._prune_parameters( context.eval_graph, context.scope, pruned_params, ratios, context.place, - only_graph=True, - param_shape_backup=param_shape_backup) + only_graph=True) pruned_flops = 1 - (float(context.eval_graph.flops()) / flops) pruned_size = 1 - (float(context.eval_graph.numel_params()) / model_size) _logger.debug('Pruned flops: {:.2f}'.format(pruned_flops)) _logger.debug('Pruned model size: {:.2f}'.format(pruned_size)) - for param in param_shape_backup.keys(): - context.eval_graph.var(param).set_shape(param_shape_backup[ + for param in self.param_shape_backup.keys(): + context.eval_graph.var(param).set_shape(self.param_shape_backup[ param]) + self.param_shape_backup = {} if abs(pruned_flops - self.target_ratio) < 1e-2: break @@ -705,6 +672,8 @@ class SensitivePruneStrategy(PruneStrategy): self.pruned_list = [] self.sensitivities = sensitivities self.sensitivities_file = sensitivities_file + self.backup = {} + self.param_shape_backup = {} self.num_steps = num_steps self.eval_rate = eval_rate self.pruning_step = 1 - pow((1 - target_ratio), 1.0 / self.num_steps) @@ -759,6 +728,8 @@ class SensitivePruneStrategy(PruneStrategy): Computing the sensitivities of all parameters. 
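# [editor's sketch] The sensitivity loop in outline: prune one parameter at a
# time at increasing ratios and record the relative metric drop, restoring the
# weights after each trial. `prune` and `evaluate` are hypothetical callables
# standing in for this strategy's _prune_parameters/_eval_graph machinery, and
# the loss formula is an assumption for illustration.
def _compute_sensitivities_py(param_names, prune, evaluate, delta_rate):
    baseline = evaluate()
    sensitivities = {}
    for name in param_names:
        sensitivities[name], ratio = {}, delta_rate
        while ratio < 1.0:
            restore = prune(name, ratio, lazy=True)  # returns an undo callable
            pruned_metric = evaluate()
            sensitivities[name][ratio] = (baseline - pruned_metric) / baseline
            restore()
            ratio += delta_rate
    return sensitivities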
""" _logger.info("calling _compute_sensitivities.") + self.param_shape_backup = {} + self.backup = {} cached_id = np.random.randint(1000) if self.start_epoch == context.epoch_id: sensitivities_file = self.sensitivities_file @@ -790,15 +761,12 @@ class SensitivePruneStrategy(PruneStrategy): if metric is None: metric = self._eval_graph(context, self.eval_rate, cached_id) - - param_backup = {} # prune parameter by ratio self._prune_parameters( context.eval_graph, context.scope, [param], [ratio], context.place, - lazy=True, - param_backup=param_backup) + lazy=True) self.pruned_list[0] # get accuracy after pruning and update self.sensitivities pruned_metric = self._eval_graph(context, self.eval_rate, @@ -819,11 +787,12 @@ class SensitivePruneStrategy(PruneStrategy): self._save_sensitivities(sensitivities, sensitivities_file) # restore pruned parameters - for param_name in param_backup.keys(): + for param_name in self.backup.keys(): param_t = context.scope.find_var(param_name).get_tensor() - param_t.set(self.param_backup[param_name], context.place) + param_t.set(self.backup[param_name], context.place) # pruned_metric = self._eval_graph(context) + self.backup = {} ratio += self.delta_rate return sensitivities @@ -834,6 +803,8 @@ class SensitivePruneStrategy(PruneStrategy): """ _logger.info('_get_best_ratios for pruning ratie: {}'.format( target_ratio)) + self.param_shape_backup = {} + self.backup = {} def func(params, x): a, b, c, d = params @@ -883,24 +854,23 @@ class SensitivePruneStrategy(PruneStrategy): _logger.info('Pruned ratios={}'.format( [round(ratio, 3) for ratio in ratios])) # step 2.2: Pruning by current ratios - param_shape_backup = {} self._prune_parameters( context.eval_graph, context.scope, sensitivities.keys(), ratios, context.place, - only_graph=True, - param_shape_backup=param_shape_backup) + only_graph=True) pruned_flops = 1 - (float(context.eval_graph.flops()) / flops) pruned_size = 1 - (float(context.eval_graph.numel_params()) / model_size) _logger.info('Pruned flops: {:.4f}'.format(pruned_flops)) _logger.info('Pruned model size: {:.4f}'.format(pruned_size)) - for param in param_shape_backup.keys(): - context.eval_graph.var(param).set_shape(param_shape_backup[ + for param in self.param_shape_backup.keys(): + context.eval_graph.var(param).set_shape(self.param_shape_backup[ param]) + self.param_shape_backup = {} # step 2.3: Check whether current ratios is enough if abs(pruned_flops - target_ratio) < 0.015: @@ -932,6 +902,9 @@ class SensitivePruneStrategy(PruneStrategy): self._prune_parameters(context.optimize_graph, context.scope, params, ratios, context.place) + self.param_shape_backup = {} + self.backup = {} + model_size = context.eval_graph.numel_params() flops = context.eval_graph.flops() _logger.debug('################################') diff --git a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py index 659265895a5..1c51aa15373 100644 --- a/python/paddle/fluid/contrib/slim/quantization/__init__.py +++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py @@ -18,11 +18,5 @@ from . import quantization_pass from .quantization_pass import * from . import quantization_strategy from .quantization_strategy import * -from . import mkldnn_post_training_strategy -from .mkldnn_post_training_strategy import * -from . 
import quantization_mkldnn_pass -from .quantization_mkldnn_pass import * __all__ = quantization_pass.__all__ + quantization_strategy.__all__ -__all__ += mkldnn_post_training_strategy.__all__ -__all__ += quantization_mkldnn_pass.__all__ diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 1ea2f080c64..0d989903a9a 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -22,8 +22,7 @@ from .... import unique_name __all__ = [ 'QuantizationTransformPass', 'QuantizationFreezePass', 'ConvertToInt8Pass', - 'TransformForMobilePass', 'ScaleForTrainingPass', 'ScaleForInferencePass', - 'AddQuantDequantPass' + 'TransformForMobilePass', 'ScaleForTrainingPass', 'ScaleForInferencePass' ] @@ -995,8 +994,6 @@ class ScaleForTrainingPass(object): Args: graph(IrGraph): the target graph. """ - assert isinstance(graph, - IrGraph), 'graph must be the instance of IrGraph.' self._is_test = graph.is_test() ops = graph.all_op_nodes() for op_node in ops: @@ -1102,8 +1099,6 @@ class ScaleForInferencePass(object): Args: graph(IrGraph): the target graph. """ - assert isinstance(graph, - IrGraph), 'graph must be the instance of IrGraph.' ops = graph.all_op_nodes() for op_node in ops: name = op_node.name() @@ -1122,137 +1117,3 @@ class ScaleForInferencePass(object): Return the scale name for the var named `var_name`. """ return "%s@scale" % (var_name) - - -class AddQuantDequantPass(object): - def __init__(self, scope=None, place=None, moving_rate=0.9, quant_bits=8): - """ - This pass is used to add quant_dequant op for some ops, such as the - `elementwise_add` op. - """ - self._scope = scope - self._place = place - self._moving_rate = moving_rate - self._quant_bits = quant_bits - self._is_test = None - self._target_ops = ["elementwise_add", "pool2d"] - - def apply(self, graph): - """ - Add quant_dequant before some ops, such as the `elementwise_add` op. This - is required by TensorRT. - Args: - graph(IrGraph): the target graph. - """ - assert isinstance(graph, - IrGraph), 'graph must be the instance of IrGraph.' - self._is_test = graph.is_test() - ops = graph.all_op_nodes() - for op_node in ops: - name = op_node.name() - if name in self._target_ops: - in_nodes_all_not_persistable = True - for input_name in op_node.input_arg_names(): - in_node = graph._find_node_by_name(op_node.inputs, - input_name) - in_nodes_all_not_persistable = ( - in_nodes_all_not_persistable and - not in_node.persistable()) - if not in_nodes_all_not_persistable: - continue - input_names = op_node.input_arg_names() - for input_name in input_names: - in_node = graph._find_node_by_name(op_node.inputs, - input_name) - quant_var_node, scale_var_node = self._inser_quant_dequant_moving_average_abs_max_op( - graph, in_node, self._quant_bits) - graph.update_input_link(in_node, quant_var_node, op_node) - graph.resolve_hazard() - return graph - - def _inser_quant_dequant_moving_average_abs_max_op(self, graph, var_node, - quant_bits): - """Insert fake_quantize_dequantize_moving_average_abs_max op. 
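# [editor's sketch] My reading of the running statistic behind the
# fake_quantize_dequantize_moving_average_abs_max op wired up below, in scalar
# form: `state` decays toward an effective sample count and `accum` toward a
# decayed sum of per-batch abs-max values. Inferred from the state/accum
# initialization below, not a verified formula.
def _moving_average_abs_max(batch_abs_max, state, accum, moving_rate=0.9):
    state = moving_rate * state + 1.0
    accum = moving_rate * accum + batch_abs_max
    scale = accum / state  # moving-average estimate of the tensor's abs-max
    return scale, state, accum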
- """ - quant_var_node = graph.create_var_node( - name="{}.quant_dequant".format(var_node.name()), - var_type=var_node.type(), - shape=var_node.shape(), - var_dtype=var_node.dtype()) - scale_in_node = graph.create_persistable_node( - name="{}.quant_dequant.scale".format(var_node.name()), - var_type=core.VarDesc.VarType.LOD_TENSOR, - shape=[1], - var_dtype=var_node.dtype()) - data_type = 'float64' if var_node.dtype( - ) == core.VarDesc.VarType.FP64 else 'float32' - _init_var_node( - scale_in_node, - np.array( - [0.001], dtype=data_type), - self._scope, - self._place) - - scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) - ins = {'X': var_node, 'InScale': scale_in_node} - outs = {'Out': quant_var_node, 'OutScale': scale_out_node} - if not self._is_test: - state_in_node = graph.create_persistable_node( - name=unique_name.generate('quant_dequant.state'), - var_type=core.VarDesc.VarType.LOD_TENSOR, - var_dtype=var_node.dtype(), - shape=[1]) - data_type = 'float64' if var_node.dtype( - ) == core.VarDesc.VarType.FP64 else 'float32' - _init_var_node( - state_in_node, - np.ones( - [1], dtype=data_type), - self._scope, - self._place) - accum_in_node = graph.create_persistable_node( - name=unique_name.generate('quant_dequant.accum'), - var_type=core.VarDesc.VarType.LOD_TENSOR, - var_dtype=var_node.dtype(), - shape=[1]) - _init_var_node( - accum_in_node, - np.ones( - [1], dtype=data_type), - self._scope, - self._place) - state_out_node = graph.create_var_node_from_desc(state_in_node.var( - )) - accum_out_node = graph.create_var_node_from_desc(accum_in_node.var( - )) - - ins['InState'] = state_in_node - ins['InAccum'] = accum_in_node - outs['OutState'] = state_out_node - outs['OutAccum'] = accum_out_node - - attrs = { - 'bit_length': quant_bits, - 'moving_rate': self._moving_rate, - 'is_test': self._is_test, - 'op_role': core.op_proto_and_checker_maker.OpRole.Forward - } - - quant_op_node = graph.create_op_node( - op_type='fake_quantize_dequantize_moving_average_abs_max', - attrs=attrs, - inputs=ins, - outputs=outs) - - graph.link_to(var_node, quant_op_node) - graph.link_to(scale_in_node, quant_op_node) - graph.link_to(quant_op_node, quant_var_node) - graph.link_to(quant_op_node, scale_out_node) - - if not self._is_test: - graph.link_to(state_in_node, quant_op_node) - graph.link_to(accum_in_node, quant_op_node) - graph.link_to(quant_op_node, state_out_node) - graph.link_to(quant_op_node, accum_out_node) - - return quant_var_node, scale_out_node diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py index c3d977f708f..12c1ce98992 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py @@ -21,14 +21,14 @@ from .... 
import core from ....compiler import CompiledProgram from ....compiler import BuildStrategy from ....framework import IrGraph, Variable, Program -from ....log_helper import get_logger from ..core.strategy import Strategy from .quantization_pass import * __all__ = ['QuantizationStrategy'] -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') +logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') +_logger = logging.getLogger(__name__) +_logger.setLevel(logging.INFO) class QuantizationStrategy(Strategy): diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index c59df49f626..848f063f677 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -1,147 +1,11 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -function(inference_analysis_python_api_int8_test target model_dir data_dir filename) - py_test(${target} SRCS ${filename} - ENVS CPU_NUM_THREADS=${CPU_NUM_THREADS_ON_CI} - ARGS --infer_model ${model_dir}/model - --infer_data ${data_dir}/data.bin - --int8_model_save_path int8_models/${target} - --warmup_batch_size 100 - --batch_size 50) -endfunction() - -function(inference_qat_int8_test target model_dir data_dir test_script use_mkldnn) - py_test(${target} SRCS ${test_script} - ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI} - OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI} - FLAGS_use_mkldnn=${use_mkldnn} - ARGS --qat_model ${model_dir}/model - --infer_data ${data_dir}/data.bin - --batch_size 25 - --batch_num 2 - --acc_diff_threshold 0.1) -endfunction() - # NOTE: TODOOOOOOOOOOO # temporarily disable test_distillation_strategy since it always failed on a specified machine with 4 GPUs # Need to figure out the root cause and then add it back list(REMOVE_ITEM TEST_OPS test_distillation_strategy) -if(WIN32) - list(REMOVE_ITEM TEST_OPS test_light_nas) -endif() - -# int8 image classification python api test -if(LINUX AND WITH_MKLDNN) - set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2") - set(MKLDNN_INT8_TEST_FILE "test_mkldnn_int8_quantization_strategy.py") - - # googlenet int8 - set(INT8_GOOGLENET_MODEL_DIR "${INT8_DATA_DIR}/googlenet") - inference_analysis_python_api_int8_test(test_slim_int8_googlenet ${INT8_GOOGLENET_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE}) - - # mobilenet int8 - set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv1") - inference_analysis_python_api_int8_test(test_slim_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE}) - - # temporarily adding WITH_SLIM_MKLDNN_FULL_TEST FLAG for QA testing the following UTs locally, - # since the following UTs cost too much time on CI test. 
- if (WITH_SLIM_MKLDNN_FULL_TEST) - # resnet50 int8 - set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50") - inference_analysis_python_api_int8_test(test_slim_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE}) - - # mobilenetv2 int8 - set(INT8_MOBILENETV2_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv2") - inference_analysis_python_api_int8_test(test_slim_int8_mobilenetv2 ${INT8_MOBILENETV2_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE}) - - # resnet101 int8 - set(INT8_RESNET101_MODEL_DIR "${INT8_DATA_DIR}/resnet101") - inference_analysis_python_api_int8_test(test_slim_int8_resnet101 ${INT8_RESNET101_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE}) - - # vgg16 int8 - set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16") - inference_analysis_python_api_int8_test(test_slim_int8_vgg16 ${INT8_VGG16_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE}) - - # vgg19 int8 - set(INT8_VGG19_MODEL_DIR "${INT8_DATA_DIR}/vgg19") - inference_analysis_python_api_int8_test(test_slim_int8_vgg19 ${INT8_VGG19_MODEL_DIR} ${INT8_DATA_DIR} ${MKLDNN_INT8_TEST_FILE}) - endif() -endif() - -# Since test_mkldnn_int8_quantization_strategy only supports testing on Linux -# with MKL-DNN, we remove it here for not repeating test, or not testing on other systems. -list(REMOVE_ITEM TEST_OPS test_mkldnn_int8_quantization_strategy) - -# QAT FP32 & INT8 comparison python api tests -if(LINUX AND WITH_MKLDNN) - set(DATASET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2") - set(QAT_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2") - set(QAT_MODELS_BASE_URL "${INFERENCE_URL}/int8/QAT_models") - set(MKLDNN_QAT_TEST_FILE "qat_int8_comparison.py") - set(MKLDNN_QAT_TEST_FILE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_QAT_TEST_FILE}") - - # ImageNet small dataset - # May be already downloaded for INT8v2 unit tests - if (NOT EXISTS ${DATASET_DIR}) - inference_download_and_uncompress(${DATASET_DIR} "${INFERENCE_URL}/int8" "imagenet_val_100_tail.tar.gz") - endif() - - # QAT ResNet50 - set(QAT_RESNET50_MODEL_DIR "${QAT_DATA_DIR}/ResNet50_QAT") - if (NOT EXISTS ${QAT_RESNET50_MODEL_DIR}) - inference_download_and_uncompress(${QAT_RESNET50_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "ResNet50_qat_model.tar.gz" ) - endif() - inference_qat_int8_test(test_qat_int8_resnet50_mkldnn ${QAT_RESNET50_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true) - - # QAT ResNet101 - set(QAT_RESNET101_MODEL_DIR "${QAT_DATA_DIR}/ResNet101_QAT") - if (NOT EXISTS ${QAT_RESNET101_MODEL_DIR}) - inference_download_and_uncompress(${QAT_RESNET101_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "ResNet101_qat_model.tar.gz" ) - endif() - inference_qat_int8_test(test_qat_int8_resnet101_mkldnn ${QAT_RESNET101_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true) - - # QAT GoogleNet - set(QAT_GOOGLENET_MODEL_DIR "${QAT_DATA_DIR}/GoogleNet_QAT") - if (NOT EXISTS ${QAT_GOOGLENET_MODEL_DIR}) - inference_download_and_uncompress(${QAT_GOOGLENET_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "GoogleNet_qat_model.tar.gz" ) - endif() - inference_qat_int8_test(test_qat_int8_googlenet_mkldnn ${QAT_GOOGLENET_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true) - - # QAT MobileNetV1 - set(QAT_MOBILENETV1_MODEL_DIR "${QAT_DATA_DIR}/MobileNetV1_QAT") - if (NOT EXISTS ${QAT_MOBILENETV1_MODEL_DIR}) - inference_download_and_uncompress(${QAT_MOBILENETV1_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "MobileNetV1_qat_model.tar.gz" ) - endif() - inference_qat_int8_test(test_qat_int8_mobilenetv1_mkldnn ${QAT_MOBILENETV1_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true) 
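These removed rules register accuracy harnesses rather than plain unit tests: each one runs FP32 inference, then INT8 inference, and fails when the top-1 accuracy drop exceeds the --acc_diff_threshold passed above (0.1). A minimal sketch of that acceptance check, assuming the fp32_acc1/int8_acc1 names that the calibration tests later in this patch also use:

    def check_quantization_accuracy(fp32_acc1, int8_acc1, acc_diff_threshold=0.1):
        # Mirrors the delta_value comparison in the calibration tests below:
        # a drop larger than the threshold means quantization hurt accuracy.
        delta_value = fp32_acc1 - int8_acc1
        if delta_value > acc_diff_threshold:
            raise AssertionError("INT8 top-1 drop %.4f exceeds threshold %.4f"
                                 % (delta_value, acc_diff_threshold))

    check_quantization_accuracy(fp32_acc1=0.707, int8_acc1=0.701)  # drop 0.006: passes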
- - # QAT MobileNetV2 - set(QAT_MOBILENETV2_MODEL_DIR "${QAT_DATA_DIR}/MobileNetV2_QAT") - if (NOT EXISTS ${QAT_MOBILENETV2_MODEL_DIR}) - inference_download_and_uncompress(${QAT_MOBILENETV2_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "MobileNetV2_qat_model.tar.gz" ) - endif() - inference_qat_int8_test(test_qat_int8_mobilenetv2_mkldnn ${QAT_MOBILENETV2_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true) - - # QAT VGG16 - set(QAT_VGG16_MODEL_DIR "${QAT_DATA_DIR}/VGG16_QAT") - if (NOT EXISTS ${QAT_VGG16_MODEL_DIR}) - inference_download_and_uncompress(${QAT_VGG16_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "VGG16_qat_model.tar.gz" ) - endif() - inference_qat_int8_test(test_qat_int8_vgg16_mkldnn ${QAT_VGG16_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true) - - # QAT VGG19 - set(QAT_VGG19_MODEL_DIR "${QAT_DATA_DIR}/VGG19_QAT") - if (NOT EXISTS ${QAT_VGG19_MODEL_DIR}) - inference_download_and_uncompress(${QAT_VGG19_MODEL_DIR} "${QAT_MODELS_BASE_URL}" "VGG19_qat_model.tar.gz" ) - endif() - inference_qat_int8_test(test_qat_int8_vgg19_mkldnn ${QAT_VGG19_MODEL_DIR} ${DATASET_DIR} ${MKLDNN_QAT_TEST_FILE_PATH} true) -endif() - -# Since the test for QAT FP32 & INT8 comparison supports only testing on Linux -# with MKL-DNN, we remove it here to not test it on other systems. -list(REMOVE_ITEM TEST_OPS qat_int8_comparison.py) - foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py index 69080cf50ec..0ab8052d7ab 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py +++ b/python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py @@ -19,8 +19,6 @@ import six import numpy as np from paddle.fluid.contrib.slim.graph import GraphWrapper from paddle.fluid import core -import os -os.environ['CPU_NUM'] = str(4) def residual_block(num): diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py index 0739c9c1f7b..1ed41da0f84 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py @@ -24,7 +24,6 @@ from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass from paddle.fluid.contrib.slim.quantization import ScaleForTrainingPass from paddle.fluid.contrib.slim.quantization import ScaleForInferencePass -from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass from paddle.fluid import core os.environ["CUDA_VISIBLE_DEVICES"] = "0" @@ -99,7 +98,6 @@ class TestQuantizationScalePass(unittest.TestCase): scope = fluid.Scope() with fluid.scope_guard(scope): exe.run(startup) - transform_pass = QuantizationTransformPass( scope=scope, place=place, @@ -107,14 +105,8 @@ class TestQuantizationScalePass(unittest.TestCase): weight_quantize_type=weight_quant_type) transform_pass.apply(main_graph) transform_pass.apply(test_graph) - - add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place) - add_quant_dequant_pass.apply(main_graph) - add_quant_dequant_pass.apply(test_graph) - scale_training_pass = ScaleForTrainingPass(scope=scope, place=place) scale_training_pass.apply(main_graph) - dev_name = '_gpu' if use_cuda else '_cpu' if not for_ci: marked_nodes = set() diff --git a/python/paddle/fluid/contrib/tests/test_calibration_mobilenetv1.py 
b/python/paddle/fluid/contrib/tests/test_calibration_mobilenetv1.py index 214d6c7557f..4eb397e55b7 100644 --- a/python/paddle/fluid/contrib/tests/test_calibration_mobilenetv1.py +++ b/python/paddle/fluid/contrib/tests/test_calibration_mobilenetv1.py @@ -30,16 +30,16 @@ class TestCalibrationForMobilenetv1(TestCalibration): def test_calibration(self): self.download_model() - print("Start FP32 inference for {0} on {1} images ...".format( - self.model, self.infer_iterations * self.batch_size)) + print("Start FP32 inference for {0} on {1} images ...").format( + self.model, self.infer_iterations * self.batch_size) (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program(self.model_cache_folder + "/model") - print("Start INT8 calibration for {0} on {1} images ...".format( - self.model, self.sample_iterations * self.batch_size)) + print("Start INT8 calibration for {0} on {1} images ...").format( + self.model, self.sample_iterations * self.batch_size) self.run_program( self.model_cache_folder + "/model", True, algo=self.algo) - print("Start INT8 inference for {0} on {1} images ...".format( - self.model, self.infer_iterations * self.batch_size)) + print("Start INT8 inference for {0} on {1} images ...").format( + self.model, self.infer_iterations * self.batch_size) (int8_throughput, int8_latency, int8_acc1) = self.run_program(self.int8_model) delta_value = fp32_acc1 - int8_acc1 diff --git a/python/paddle/fluid/contrib/tests/test_calibration_resnet50.py b/python/paddle/fluid/contrib/tests/test_calibration_resnet50.py index a5286e5b0a6..0bbaa21a711 100644 --- a/python/paddle/fluid/contrib/tests/test_calibration_resnet50.py +++ b/python/paddle/fluid/contrib/tests/test_calibration_resnet50.py @@ -193,7 +193,7 @@ class TestCalibration(unittest.TestCase): file_name = data_urls[0].split('/')[-1] zip_path = os.path.join(self.cache_folder, file_name) - print('Data is downloaded at {0}'.format(zip_path)) + print('Data is downloaded at {0}').format(zip_path) self.cache_unzipping(data_cache_folder, zip_path) return data_cache_folder @@ -297,16 +297,16 @@ class TestCalibrationForResnet50(TestCalibration): def test_calibration(self): self.download_model() - print("Start FP32 inference for {0} on {1} images ...".format( - self.model, self.infer_iterations * self.batch_size)) + print("Start FP32 inference for {0} on {1} images ...").format( + self.model, self.infer_iterations * self.batch_size) (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program(self.model_cache_folder + "/model") - print("Start INT8 calibration for {0} on {1} images ...".format( - self.model, self.sample_iterations * self.batch_size)) + print("Start INT8 calibration for {0} on {1} images ...").format( + self.model, self.sample_iterations * self.batch_size) self.run_program( self.model_cache_folder + "/model", True, algo=self.algo) - print("Start INT8 inference for {0} on {1} images ...".format( - self.model, self.infer_iterations * self.batch_size)) + print("Start INT8 inference for {0} on {1} images ...").format( + self.model, self.infer_iterations * self.batch_size) (int8_throughput, int8_latency, int8_acc1) = self.run_program(self.int8_model) delta_value = fp32_acc1 - int8_acc1 diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py index a2e700803dc..b7a14fa59b4 100644 --- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py +++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py @@ -132,12 +132,10 @@ 
def train(net_type, use_cuda, save_dirname, is_local): # Test program test_program = train_program.clone(for_test=True) - optimizer = fluid.optimizer.Lamb(learning_rate=0.001) + optimizer = fluid.optimizer.Adam(learning_rate=0.001) mp_optimizer = fluid.contrib.mixed_precision.decorate( - optimizer=optimizer, - init_loss_scaling=8.0, - use_dynamic_loss_scaling=True) + optimizer=optimizer, init_loss_scaling=8.0) scaled_loss, _, _ = mp_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py index 1bfc966de88..35ddf97ff23 100644 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -24,12 +24,12 @@ import copy import errno import logging -from paddle.fluid.log_helper import get_logger __all__ = ["HDFSClient", "multi_download", "multi_upload"] -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') +_logger = logging.getLogger("hdfs_utils") +_logger.setLevel(logging.INFO) class HDFSClient(object): diff --git a/python/paddle/fluid/contrib/utils/lookup_table_utils.py b/python/paddle/fluid/contrib/utils/lookup_table_utils.py index b15ee94f635..a127f5b11b7 100644 --- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py +++ b/python/paddle/fluid/contrib/utils/lookup_table_utils.py @@ -22,17 +22,15 @@ import paddle from paddle.fluid import core from paddle.fluid import io from paddle.fluid import Program -from paddle.fluid.log_helper import get_logger __all__ = [ "load_persistables_for_increment", "load_persistables_for_inference", "convert_dist_to_sparse_program" ] -_logger = get_logger( - 'lookup_table_utils', - logging.INFO, - fmt='%(asctime)s-%(levelname)s: %(message)s') +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') +_logger = logging.getLogger("lookup_table_utils") +_logger.setLevel(logging.INFO) model_filename = "__model__" lookup_table_dir = "__lookup_table__" diff --git a/python/paddle/fluid/cxx_trainer.py b/python/paddle/fluid/cxx_trainer.py new file mode 100644 index 00000000000..d25e4422496 --- /dev/null +++ b/python/paddle/fluid/cxx_trainer.py @@ -0,0 +1,163 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from . import core +from . import framework +from . import executor +from . import compiler +import sys + +from .framework import default_main_program, Variable + +__all__ = ['add_feed_fetch_op'] + + +def _has_feed_operators(block, feed_targets, feed_holder_name): + """ Check whether the block already has feed operators. + + Return false if the block does not have any feed operators. + If some feed operators have been prepended to the block, check that + the info contained in these feed operators matches the feed_targets + and feed_holder_name. Raise exception when any mismatch is found. 
+ Return true when the block has feed operators with matching info. + + Args: + block: a block instance (typically global block of a program) + feed_targets: a dictionary of {feed_target_name: feed_target_data} + feed_holder_name: the name of the variable that holds the data of + all feed targets. The type of this feed_holder variable is + FEED_MINIBATCH, which is essentially vector. + + Returns: + A boolean value that indicates whether a block has feed operators + that match the info contained in feed_targets and feed_holder_name. + """ + + feed_count = 0 + for op in block.ops: + if op.desc.type() == 'feed': + feed_count += 1 + assert op.desc.input('X')[0] == feed_holder_name + feed_target_name = op.desc.output('Out')[0] + if feed_target_name not in feed_targets: + raise Exception("'feed_targets' does not have {} variable". + format(feed_target_name)) + else: + break + if feed_count > 0 and feed_count != len(feed_targets): + raise Exception( + "Feed operators in program desc do not match 'feed_targets'") + return feed_count > 0 + + +def _has_fetch_operators(block, fetch_targets, fetch_holder_name): + """ Check whether the block already has fetch operators. + + Return false if the block does not have any fetch operators. + If some fetch operators have been appended to the block, check that + the info contained in these fetch operators matches the fetch_targets + and fetch_holder_name. Raise exception when any mismatch is found. + Return true when the block has fetch operators with matching info. + + Args: + block: a block instance (typically global block of a program) + fetch_targets: a dictionary of {fetch_target_name: fetch_target_data} + fetch_holder_name: the name of the variable that holds the data of + all fetch targets. The type of this fetch_holder variable is + FETCH_LIST, which is essentially vector. + + Return: + A boolean value that indicates whether a block has fetch operators + that match the info contained in fetch_targets and fetch_holder_name. + """ + + fetch_count = 0 + for op in block.ops: + if op.desc.type() == 'fetch': + fetch_count += 1 + assert op.desc.output('Out')[0] == fetch_holder_name + fetch_target_name = op.desc.input('X')[0] + if fetch_target_name not in [ + var.desc.name() for var in fetch_targets + ]: + raise Exception("'fetch_targets' does not have {} variable". 
+ format(fetch_target_name)) + idx = op.desc.attr('col') + assert fetch_target_name == fetch_targets[idx].desc.name() + if fetch_count > 0 and fetch_count != len(fetch_targets): + raise Exception( + "Fetch operators in program desc do not match 'fetch_targets'") + return fetch_count > 0 + + +def _add_feed_fetch_ops(program, + feed, + fetch_list, + feed_var_name='feed', + fetch_var_name='fetch'): + tmp_program = program.clone() + + global_block = tmp_program.global_block() + + if feed_var_name in global_block.vars: + feed_var = global_block.var(feed_var_name) + else: + feed_var = global_block.create_var( + name=feed_var_name, + type=core.VarDesc.VarType.FEED_MINIBATCH, + persistable=True) + + if fetch_var_name in global_block.vars: + fetch_var = global_block.var(fetch_var_name) + else: + fetch_var = global_block.create_var( + name=fetch_var_name, + type=core.VarDesc.VarType.FETCH_LIST, + persistable=True) + + # prepend feed operators + if not _has_feed_operators(global_block, feed, feed_var_name): + for i, name in enumerate(feed): + out = global_block.var(name) + global_block._prepend_op( + type='feed', + inputs={'X': [feed_var]}, + outputs={'Out': [out]}, + attrs={'col': i}) + + # append fetch_operators + if not _has_fetch_operators(global_block, fetch_list, fetch_var_name): + for i, var in enumerate(fetch_list): + assert isinstance(var, Variable) or isinstance( + var, six.string_types), ("Wrong type for fetch_list[%s]: %s" % + (i, type(var))) + global_block.append_op( + type='fetch', + inputs={'X': [var]}, + outputs={'Out': [fetch_var]}, + attrs={'col': i}) + + return tmp_program + + +def add_feed_fetch_op(program, feed, fetch_list, scope, place): + + if program is None: + program = default_main_program() + + program = _add_feed_fetch_ops( + program=program, feed=feed, fetch_list=fetch_list) + + return program diff --git a/python/paddle/fluid/data_feed_desc.py b/python/paddle/fluid/data_feed_desc.py index 5ed38f9999f..80745aac830 100644 --- a/python/paddle/fluid/data_feed_desc.py +++ b/python/paddle/fluid/data_feed_desc.py @@ -24,32 +24,28 @@ class DataFeedDesc(object): currently only used for AsyncExecutor (See comments for class AsyncExecutor for a brief introduction) - DataFeedDesc shall be initialized from a valid protobuf message from disk. + DataFeedDesc shall be initialized from a valid protobuf message from disk: + >>> data_feed = fluid.DataFeedDesc('data.proto') See :code:`paddle/fluid/framework/data_feed.proto` for message definition. A typical message might look like: - .. 
code-block:: python - - f = open("data.proto", "w") - print >> f, 'name: "MultiSlotDataFeed"' - print >> f, 'batch_size: 2' - print >> f, 'multi_slot_desc {' - print >> f, ' slots {' - print >> f, ' name: "words"' - print >> f, ' type: "uint64"' - print >> f, ' is_dense: false' - print >> f, ' is_used: true' - print >> f, ' }' - print >> f, ' slots {' - print >> f, ' name: "label"' - print >> f, ' type: "uint64"' - print >> f, ' is_dense: false' - print >> f, ' is_used: true' - print >> f, ' }' - print >> f, '}' - f.close() - data_feed = fluid.DataFeedDesc('data.proto') + >>> name: "MultiSlotDataFeed" + >>> batch_size: 2 + >>> multi_slot_desc { + >>> slots { + >>> name: "words" + >>> type: "uint64" + >>> is_dense: false + >>> is_used: true + >>> } + >>> slots { + >>> name: "label" + >>> type: "uint64" + >>> is_dense: false + >>> is_used: true + >>> } + >>> } However, users usually shouldn't care about the message format; instead, they are encouragd to use :code:`Data Generator` as a tool to generate a @@ -58,23 +54,16 @@ class DataFeedDesc(object): DataFeedDesc can also be changed during runtime. Once you got familiar with what each field mean, you can modify it to better suit your need. E.g.: - - .. code-block:: python - - data_feed = fluid.DataFeedDesc('data.proto') - data_feed.set_batch_size(128) - data_feed.set_dense_slots('wd') # The slot named 'wd' will be dense - data_feed.set_use_slots('wd') # The slot named 'wd' will be used + >>> data_feed.set_batch_size(128) + >>> data_feed.set_dense_slots('wd') # The slot named 'wd' will be dense + >>> data_feed.set_use_slots('wd') # The slot named 'wd' will be used Finally, the content can be dumped out for debugging purpose: - - .. code-block:: python - - print(data_feed.desc()) + >>> print(data_feed.desc()) Args: proto_file(string): Disk file containing a data feed description. - + """ def __init__(self, proto_file): @@ -93,28 +82,8 @@ class DataFeedDesc(object): Set batch size. Will be effective during training Example: - .. code-block:: python - - f = open("data.proto", "w") - print >> f, 'name: "MultiSlotDataFeed"' - print >> f, 'batch_size: 2' - print >> f, 'multi_slot_desc {' - print >> f, ' slots {' - print >> f, ' name: "words"' - print >> f, ' type: "uint64"' - print >> f, ' is_dense: false' - print >> f, ' is_used: true' - print >> f, ' }' - print >> f, ' slots {' - print >> f, ' name: "label"' - print >> f, ' type: "uint64"' - print >> f, ' is_dense: false' - print >> f, ' is_used: true' - print >> f, ' }' - print >> f, '}' - f.close() - data_feed = fluid.DataFeedDesc('data.proto') - data_feed.set_batch_size(128) + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> data_feed.set_batch_size(128) Args: batch_size: batch size @@ -129,28 +98,8 @@ class DataFeedDesc(object): sparse slot will be fed into a LoDTensor Example: - .. 
code-block:: python - - f = open("data.proto", "w") - print >> f, 'name: "MultiSlotDataFeed"' - print >> f, 'batch_size: 2' - print >> f, 'multi_slot_desc {' - print >> f, ' slots {' - print >> f, ' name: "words"' - print >> f, ' type: "uint64"' - print >> f, ' is_dense: false' - print >> f, ' is_used: true' - print >> f, ' }' - print >> f, ' slots {' - print >> f, ' name: "label"' - print >> f, ' type: "uint64"' - print >> f, ' is_dense: false' - print >> f, ' is_used: true' - print >> f, ' }' - print >> f, '}' - f.close() - data_feed = fluid.DataFeedDesc('data.proto') - data_feed.set_dense_slots(['words']) + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> data_feed.set_dense_slots(['words']) Args: dense_slots_name: a list of slot names which will be set dense @@ -160,7 +109,7 @@ class DataFeedDesc(object): """ if self.proto_desc.name != "MultiSlotDataFeed": raise ValueError( - "Only MultiSlotDataFeed needs set_dense_slots, please check your datafeed.proto" + "Only MultiSlotDataFeed need set_dense_slots, pls check your datafeed.proto" ) for name in dense_slots_name: self.proto_desc.multi_slot_desc.slots[self.__name_to_index[ @@ -173,28 +122,8 @@ class DataFeedDesc(object): ones will be used for a specific model. Example: - .. code-block:: python - - f = open("data.proto", "w") - print >> f, 'name: "MultiSlotDataFeed"' - print >> f, 'batch_size: 2' - print >> f, 'multi_slot_desc {' - print >> f, ' slots {' - print >> f, ' name: "words"' - print >> f, ' type: "uint64"' - print >> f, ' is_dense: false' - print >> f, ' is_used: true' - print >> f, ' }' - print >> f, ' slots {' - print >> f, ' name: "label"' - print >> f, ' type: "uint64"' - print >> f, ' is_dense: false' - print >> f, ' is_used: true' - print >> f, ' }' - print >> f, '}' - f.close() - data_feed = fluid.DataFeedDesc('data.proto') - data_feed.set_use_slots(['words']) + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> data_feed.set_use_slots(['words']) Args: use_slots_name: a list of slot names which will be used in training @@ -204,7 +133,7 @@ class DataFeedDesc(object): """ if self.proto_desc.name != "MultiSlotDataFeed": raise ValueError( - "Only MultiSlotDataFeed needs set_use_slots, please check your datafeed.proto" + "Only MultiSlotDataFeed need set_use_slots, pls check your datafeed.proto" ) for name in use_slots_name: self.proto_desc.multi_slot_desc.slots[self.__name_to_index[ @@ -215,28 +144,8 @@ class DataFeedDesc(object): Returns a protobuf message for this DataFeedDesc Example: - .. 
code-block:: python - - f = open("data.proto", "w") - print >> f, 'name: "MultiSlotDataFeed"' - print >> f, 'batch_size: 2' - print >> f, 'multi_slot_desc {' - print >> f, ' slots {' - print >> f, ' name: "words"' - print >> f, ' type: "uint64"' - print >> f, ' is_dense: false' - print >> f, ' is_used: true' - print >> f, ' }' - print >> f, ' slots {' - print >> f, ' name: "label"' - print >> f, ' type: "uint64"' - print >> f, ' is_dense: false' - print >> f, ' is_used: true' - print >> f, ' }' - print >> f, '}' - f.close() - data_feed = fluid.DataFeedDesc('data.proto') - print(data_feed.desc()) + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> print(data_feed.desc()) Returns: A string message diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 1090c781422..00c4e5691a2 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -21,8 +21,8 @@ import six from six.moves import zip, range, xrange import multiprocessing -from .framework import Variable, default_main_program, _current_expected_place -from .framework import _cpu_num, _cuda_ids +from .framework import Variable, default_main_program + __all__ = ['DataFeeder'] @@ -149,7 +149,6 @@ class DataFeeder(object): .. code-block:: python - import paddle.fluid as fluid place = fluid.CPUPlace() img = fluid.layers.data(name='image', shape=[1, 28, 28]) label = fluid.layers.data(name='label', shape=[1], dtype='int64') @@ -162,16 +161,10 @@ class DataFeeder(object): .. code-block:: python - import paddle - import paddle.fluid as fluid - place=fluid.CUDAPlace(0) - data = fluid.layers.data(name='data', shape=[3, 224, 224], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) reader = feeder.decorate_reader( - paddle.batch(paddle.dataset.flowers.train(), batch_size=16), multi_devices=False) + paddle.batch(flowers.train(), batch_size=16)) Args: feed_list(list): The Variables or Variables'name that will @@ -187,36 +180,17 @@ class DataFeeder(object): ValueError: If some Variable is not in this Program. Examples: - .. code-block:: python + .. code-block:: python - - import numpy as np - import paddle - import paddle.fluid as fluid - + # ... place = fluid.CPUPlace() - - def reader(): - yield [np.random.random([4]).astype('float32'), np.random.random([3]).astype('float32')], - - main_program = fluid.Program() - startup_program = fluid.Program() - - with fluid.program_guard(main_program, startup_program): - data_1 = fluid.layers.data(name='data_1', shape=[1, 2, 2]) - data_2 = fluid.layers.data(name='data_2', shape=[1, 1, 3]) - out = fluid.layers.fc(input=[data_1, data_2], size=2) - # ... - - feeder = fluid.DataFeeder([data_1, data_2], place) - - exe = fluid.Executor(place) - exe.run(startup_program) + feed_list = [ + main_program.global_block().var(var_name) for var_name in feed_vars_name + ] # feed_vars_name is a list of variables' name. + feeder = fluid.DataFeeder(feed_list, place) for data in reader(): outs = exe.run(program=main_program, - feed=feeder.feed(data), - fetch_list=[out]) - + feed=feeder.feed(data)) """ def __init__(self, feed_list, place, program=None): @@ -248,23 +222,6 @@ class DataFeeder(object): Returns: dict: the result of conversion. - - Examples: - .. 
code-block:: python - - import numpy.random as random - import paddle.fluid as fluid - - def reader(limit=5): - for i in range(limit): - yield random.random([784]).astype('float32'), random.random([1]).astype('int64'), random.random([256]).astype('float32') - - data_1 = fluid.layers.data(name='data_1', shape=[1, 28, 28]) - data_2 = fluid.layers.data(name='data_2', shape=[1], dtype='int64') - data_3 = fluid.layers.data(name='data_3', shape=[16, 16], dtype='float32') - feeder = fluid.DataFeeder(['data_1','data_2', 'data_3'], fluid.CPUPlace()) - - result = feeder.feed(reader()) """ converter = [] for lod_level, shape, dtype in six.moves.zip( @@ -303,32 +260,6 @@ class DataFeeder(object): Notes: The number of devices and number of mini-batches must be same. - - Examples: - .. code-block:: python - - import numpy.random as random - import paddle.fluid as fluid - - def reader(limit=10): - for i in range(limit): - yield [random.random([784]).astype('float32'), random.randint(10)], - - x = fluid.layers.data(name='x', shape=[1, 28, 28]) - y = fluid.layers.data(name='y', shape=[1], dtype='int64') - - feeder = fluid.DataFeeder(['x','y'], fluid.CPUPlace()) - place_num = 2 - places = [fluid.CPUPlace() for x in range(place_num)] - data = [] - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - program = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(places=places) - for item in reader(): - data.append(item) - if place_num == len(data): - exe.run(program=program, feed=list(feeder.feed_parallel(data, place_num)), fetch_list=[]) - data = [] """ if isinstance(self.place, core.CUDAPlace): places = [ @@ -359,9 +290,11 @@ class DataFeeder(object): if num_places is not None: return int(num_places) elif isinstance(self.place, core.CUDAPlace): - return len(_cuda_ids()) + return core.get_cuda_device_count() else: - return _cpu_num() + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + return cpu_num def decorate_reader(self, reader, @@ -386,29 +319,6 @@ class DataFeeder(object): Raises: ValueError: If drop_last is False and the data batch cannot fit for devices. - - Examples: - .. 
code-block:: python - - import numpy.random as random - import paddle - import paddle.fluid as fluid - - def reader(limit=5): - for i in range(limit): - yield (random.random([784]).astype('float32'), random.random([1]).astype('int64')), - - place=fluid.CUDAPlace(0) - data = fluid.layers.data(name='data', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) - reader = feeder.decorate_reader(reader, multi_devices=False) - - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in reader(): - exe.run(feed=data) """ def __reader_creator__(): @@ -430,63 +340,3 @@ class DataFeeder(object): "not implemented") return __reader_creator__ - - -class NumpyToLoDTensorConverter(object): - def __init__(self, place): - self.place = place - self.data = [] - self._reset() - - def _reset(self): - self.data = [] - - def feed(self, data): - self.data.append(data) - - def done(self): - arr = numpy.array(self.data) - t = core.LoDTensor() - t.set(arr, self.place) - self._reset() - return t - - -class ListTensorProvider(object): - def __init__(self, generator, places): - self.generator = generator - self.converters = [] - self.places = [] - if places: - if not isinstance(places, (list, tuple)): - places = [places] - assert len( - places) == 1, "dygraph mode CAN NOT specify multiple places." - for place in places: - if isinstance(place, (core.CUDAPlace, core.CPUPlace)): - self.places.append(place) - else: - raise ValueError( - "Please specify a valid place values such as core.CPUPlace or core.CUDAPlace" - ) - if len(self.places) == 0: - self.places.append(_current_expected_place()) - - def _readData(self, iterable, places): - for place, each_sample in six.moves.zip(places, iterable): - for item in each_sample: - if len(self.converters) < len(item): - for i in item: - self.converters.append(NumpyToLoDTensorConverter(place)) - for each_converter, each_slot in six.moves.zip(self.converters, - item): - each_converter.feed(each_slot) - yield [c.done() for c in self.converters] - - def __call__(self): - item = [] - for batch in self.generator(): - item.append(batch) - if len(item) == len(self.places): - yield list(self._readData(item, self.places)) - item = [] diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index b3d58a589bd..c97e0bc6e88 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -21,36 +21,27 @@ __all__ = ['DatasetFactory', 'InMemoryDataset', 'QueueDataset'] class DatasetFactory(object): """ DatasetFactory is a factory which create dataset by its name, - you can create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset", + you can create "QueueDataset" or "InMemoryDataset", the default is "QueueDataset". Example: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - + dataset = paddle.fluid.DatasetFactory.create_dataset("InMemoryDataset") """ def __init__(self): - """ Init. """ + """ + Init + """ pass def create_dataset(self, datafeed_class="QueueDataset"): """ - Create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset", + Create "QueueDataset" or "InMemoryDataset", the default is "QueueDataset". - Args: - datafeed_class(str): datafeed class name, QueueDataset or InMemoryDataset. - Default is QueueDataset. - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() """ try: dataset = globals()[datafeed_class]() @@ -61,10 +52,14 @@ class DatasetFactory(object): class DatasetBase(object): - """ Base dataset class. """ + """ + Base dataset class + """ def __init__(self): - """ Init. """ + """ + Init + """ # define class name here # to decide whether we need create in memory instance self.proto_desc = data_feed_pb2.DataFeedDesc() @@ -77,15 +72,11 @@ class DatasetBase(object): Set pipe command of current dataset A pipe command is a UNIX pipeline command that can be used only - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_pipe_command("python my_script.py") + Example: + >>> dataset.set_pipe_command("python my_script.py") Args: - pipe_command(str): pipe command + pipe_command: pipe command """ self.proto_desc.pipe_command = pipe_command @@ -94,15 +85,11 @@ class DatasetBase(object): """ Set batch size. Will be effective during training - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_batch_size(128) + Example: + >>> dataset.set_batch_size(128) Args: - batch_size(int): batch size + batch_size: batch size """ self.proto_desc.batch_size = batch_size @@ -111,15 +98,11 @@ class DatasetBase(object): """ Set thread num, it is the num of readers. - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_thread(12) + Example: + >>> dataset.set_thread(12) Args: - thread_num(int): thread num + thread_num: thread num """ self.dataset.set_thread_num(thread_num) self.thread_num = thread_num @@ -128,15 +111,11 @@ class DatasetBase(object): """ Set file list in current worker. - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_filelist(['a.txt', 'b.txt']) + Example: + >>> dataset.set_filelist(['a.txt', 'b.txt']) Args: - filelist(list): file list + filelist: file list """ self.dataset.set_filelist(filelist) @@ -144,15 +123,11 @@ class DatasetBase(object): """ Set Variables which you will use. - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_use_var([data, label]) + Example: + >>> dataset.set_use_var([data, label]) Args: - var_list(list): variable list + var_list: variable list """ multi_slot = self.proto_desc.multi_slot_desc for var in var_list: @@ -175,16 +150,12 @@ class DatasetBase(object): """ Set hdfs config: fs name ad ugi - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_hdfs_config("my_fs_name", "my_fs_ugi") + Example: + >>> dataset.set_hdfs_config("my_fs_name", "my_fs_ugi") Args: - fs_name(str): fs name - fs_ugi(str): fs ugi + fs_name: fs name + fs_ugi: fs ugi """ self.dataset.set_hdfs_config(fs_name, fs_ugi) @@ -199,12 +170,8 @@ class DatasetBase(object): """ Returns a protobuf message for this DataFeedDesc - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - print(dataset.desc()) + Example: + >>> print(dataset.desc()) Returns: A string message @@ -219,11 +186,13 @@ class InMemoryDataset(DatasetBase): This class should be created by DatasetFactory Example: - dataset = paddle.fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.fluid.DatasetFactory.create_dataset("InMemoryDataset") """ def __init__(self): - """ Init. """ + """ + Init + """ super(InMemoryDataset, self).__init__() self.proto_desc.name = "MultiSlotInMemoryDataFeed" @@ -231,14 +200,12 @@ class InMemoryDataset(DatasetBase): """ Load data into memory - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() + Example: + >>> import paddle.fluid as fluid + >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") + >>> filelist = ["a.txt", "b.txt"] + >>> dataset.set_filelist(filelist) + >>> dataset.load_into_memory() """ self._prepare_to_run() self.dataset.load_into_memory() @@ -247,15 +214,13 @@ class InMemoryDataset(DatasetBase): """ Local shuffle - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - dataset.local_shuffle() + Example: + >>> import paddle.fluid as fluid + >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") + >>> filelist = ["a.txt", "b.txt"] + >>> dataset.set_filelist(filelist) + >>> dataset.load_into_memory() + >>> dataset.local_shuffle() """ self.dataset.local_shuffle() @@ -267,141 +232,58 @@ class InMemoryDataset(DatasetBase): If you run in distributed mode, you should pass fleet instead of None. Examples: - .. code-block:: python - - import paddle.fluid as fluid - from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - dataset.global_shuffle(fleet) + >>> import paddle.fluid as fluid + >>> from paddle.fluid.incubate.fleet.pslib import fleet + >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") + >>> filelist = ["a.txt", "b.txt"] + >>> dataset.set_filelist(filelist) + >>> dataset.load_into_memory() + >>> dataset.global_shuffle(fleet) Args: - fleet(Fleet): fleet singleton. Default None. - + fleet: fleet singleton. Default None. """ trainer_num = 1 fleet_send_batch_size = 80000 if fleet is not None: - fleet._role_maker._barrier_worker() + fleet.fleet_instance.role_maker_._barrier_worker() trainer_num = fleet.worker_num() self.dataset.register_client2client_msg_handler() self.dataset.set_trainer_num(trainer_num) self.dataset.set_fleet_send_batch_size(fleet_send_batch_size) if fleet is not None: - fleet._role_maker._barrier_worker() + fleet.fleet_instance.role_maker_._barrier_worker() self.dataset.global_shuffle() if fleet is not None: - fleet._role_maker._barrier_worker() + fleet.fleet_instance.role_maker_._barrier_worker() def release_memory(self): """ Release InMemoryDataset memory data, when data will not be used again. - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - dataset.global_shuffle(fleet) - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - exe.train_from_dataset(fluid.default_main_program(), dataset) - dataset.release_memory() - + Example: + >>> import paddle.fluid as fluid + >>> import paddle.fluid.incubate.fleet.parameter_server as fleet + >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") + >>> filelist = ["a.txt", "b.txt"] + >>> dataset.set_filelist(filelist) + >>> dataset.load_into_memory() + >>> dataset.global_shuffle(fleet) + >>> exe = fluid.Executor(fluid.CPUPlace()) + >>> exe.run(fluid.default_startup_program()) + >>> exe.train_from_dataset(fluid.default_main_program(), dataset) + >>> dataset.release_memory() """ self.dataset.release_memory() - def get_memory_data_size(self, fleet=None): - """ - Get memory data size, user can call this function to know the num - of ins in all workers after load into memory. - - Note: - This function may cause bad performance, because it has barrier - - Args: - fleet(Fleet): Fleet Object. - - Returns: - The size of memory data. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - print dataset.get_memory_data_size(fleet) - - """ - import numpy as np - local_data_size = self.dataset.get_memory_data_size() - local_data_size = np.array([local_data_size]) - if fleet is not None: - global_data_size = local_data_size * 0 - fleet._role_maker._node_type_comm.Allreduce(local_data_size, - global_data_size) - return global_data_size[0] - return local_data_size[0] - - def get_shuffle_data_size(self, fleet=None): - """ - Get shuffle data size, user can call this function to know the num - of ins in all workers after local/global shuffle. - - Note: - This function may cause bad performance to local shuffle, - because it has barrier. It does not affect global shuffle. - - Args: - fleet(Fleet): Fleet Object. - - Returns: - The size of shuffle data. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - dataset.global_shuffle(fleet) - print dataset.get_shuffle_data_size(fleet) - - """ - import numpy as np - local_data_size = self.dataset.get_shuffle_data_size() - local_data_size = np.array([local_data_size]) - if fleet is not None: - global_data_size = local_data_size * 0 - fleet._role_maker._node_type_comm.Allreduce(local_data_size, - global_data_size) - return global_data_size[0] - return local_data_size[0] - class QueueDataset(DatasetBase): """ QueueDataset, it will process data streamly. - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") - + Example: + import paddle.fluid as fluid + dataset = fluid.DatasetFactory.create_dataset("QueueDataset") """ def __init__(self): @@ -414,18 +296,10 @@ class QueueDataset(DatasetBase): def local_shuffle(self): """ - Local shuffle data. + Local shuffle Local shuffle is not supported in QueueDataset NotImplementedError will be raised - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") - dataset.local_shuffle() - """ raise NotImplementedError( "QueueDataset does not support local shuffle, " @@ -433,53 +307,9 @@ class QueueDataset(DatasetBase): def global_shuffle(self, fleet=None): """ - Global shuffle data. - Global shuffle is not supported in QueueDataset NotImplementedError will be raised - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") - dataset.global_shuffle(fleet) - """ raise NotImplementedError( "QueueDataset does not support global shuffle, " "please use InMemoryDataset for global_shuffle") - - -class FileInstantDataset(DatasetBase): - """ - FileInstantDataset, it will process data streamly. - Example: - import paddle.fluid as fluid - dataset = fluid.DatasetFactory.create_dataset("FileInstantDataset") - """ - - def __init__(self): - """ - Init - """ - super(FileInstantDataset, self).__init__() - self.proto_desc.name = "MultiSlotFileInstantDataFeed" - - def local_shuffle(self): - """ - Local shuffle - FileInstantDataset does not support local shuffle - """ - raise NotImplementedError( - "FileInstantDataset does not support local shuffle, " - "please use InMemoryDataset for local_shuffle") - - def global_shuffle(self, fleet=None): - """ - Global shuffle - """ - raise NotImplementedError( - "FileInstantDataset does not support global shuffle, " - "please use InMemoryDataset for global_shuffle") diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index 80989d5804d..0998f779acf 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
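The device_worker.py hunk just below reverts DownpourSGD's sparse-table sizing from the use_cvm-aware branch back to the unconditional form. A small sketch of both variants of that bookkeeping; the guess that the two reserved slots carry show/click statistics is an assumption of this note (the surrounding code only names a "click" label variable):

    def sparse_table_dims(accessor_fea_dim, use_cvm=False):
        if use_cvm:
            emb_dim = accessor_fea_dim      # model consumes the accessor's full width
            fea_dim = emb_dim
        else:
            emb_dim = accessor_fea_dim - 2  # two slots reserved out of the accessor
            fea_dim = emb_dim + 2           # width (presumably show/click counters)
        return emb_dim, fea_dim

    assert sparse_table_dims(11) == (9, 11)
    assert sparse_table_dims(11, use_cvm=True) == (11, 11)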
-__all__ = ['DeviceWorker', 'Hogwild', 'DownpourSGD', 'Section'] +__all__ = ['DeviceWorker', 'Hogwild', 'DownpourSGD'] class DeviceWorker(object): @@ -155,16 +155,10 @@ class DownpourSGD(DeviceWorker): self._fleet_desc.trainer_param.sparse_table[0].slot_value) sparse_table.sparse_grad_name.extend( self._fleet_desc.trainer_param.sparse_table[0].slot_gradient) - if opt_info["use_cvm"]: - sparse_table.emb_dim = \ - self._fleet_desc.server_param.downpour_server_param.downpour_table_param[ - 0].accessor.fea_dim - sparse_table.fea_dim = sparse_table.emb_dim - else: - sparse_table.emb_dim = \ - self._fleet_desc.server_param.downpour_server_param.downpour_table_param[ - 0].accessor.fea_dim - 2 - sparse_table.fea_dim = sparse_table.emb_dim + 2 + sparse_table.emb_dim = \ + self._fleet_desc.server_param.downpour_server_param.downpour_table_param[ + 0].accessor.fea_dim - 2 + sparse_table.fea_dim = sparse_table.emb_dim + 2 # TODO(guru4elephant): hard code here, need to improve sparse_table.label_var_name = "click" @@ -181,58 +175,6 @@ class DownpourSGD(DeviceWorker): downpour.push_sparse = False -class Section(DeviceWorker): - """ - SectionWorker - """ - - def __init__(self): - """ - Init. - """ - super(Section, self).__init__() - - def _gen_worker_desc(self, trainer_desc): - """ - Generator worker desc, which device worker is SectionWorker. - Args: - trainer_desc(TrainerDesc): a TrainerDesc object - """ - from google.protobuf import text_format - from . import core - trainer_desc.device_worker_name = "SectionWorker" - pipeline_opt = self._program._pipeline_opt - section_param = trainer_desc.section_param - section_param.queue_size = pipeline_opt["queue_size"] - section_param.sync_steps = pipeline_opt["sync_steps"] - section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"] - for e in pipeline_opt["param_need_sync"]: - section_param.param_need_sync.append(e) - for i, program in enumerate(pipeline_opt["section_program_list"]): - cfg = section_param.section_config.add() - cfg.program_desc.ParseFromString(program["program"]._get_desc() - .serialize_to_string()) - # TODO: why does not work - #cfg.program_desc.CopyFrom(program.program._get_desc()) - place = pipeline_opt["place_list"][i] - if isinstance(place, core.CPUPlace): - cfg.place = cfg.CPUPlace - elif isinstance(place, core.CUDAPlace): - cfg.place = cfg.CUDAPlace - elif isinstance(place, core.CUDAPinnedPlace): - cfg.place = cfg.CUDAPinnedPlace - else: - raise NotImplementedError( - "SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now." - ) - - cfg.concurrency = pipeline_opt["concurrency_list"][i] - for var in program["input_set"]: - cfg.section_in_var_names.append(var) - for var in program["output_set"]: - cfg.section_out_var_names.append(var) - - class DeviceWorkerFactory(object): def _create_device_worker(self, worker_type): classname = worker_type.capitalize() diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index 7ab1dfdf767..9bb72ede304 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -38,9 +38,6 @@ from .checkpoint import * from . import learning_rate_scheduler from .learning_rate_scheduler import * -from . 
import backward_strategy -from .backward_strategy import * - __all__ = [] __all__ += layers.__all__ __all__ += base.__all__ @@ -50,4 +47,3 @@ __all__ += profiler.__all__ __all__ += parallel.__all__ __all__ += checkpoint.__all__ __all__ += learning_rate_scheduler.__all__ -__all__ += backward_strategy.__all__ diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 133eb6a19c2..bf484b35c7b 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -11,116 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ..wrapped_decorator import signature_safe_contextmanager, wrap_decorator -import contextlib +from ..wrapped_decorator import signature_safe_contextmanager import numpy as np -import os from paddle.fluid import core from paddle.fluid import framework from .tracer import Tracer -import logging -__all__ = [ - 'enabled', - 'no_grad', - 'not_support', - 'guard', - 'to_variable', -] +__all__ = ['enabled', 'guard', 'to_variable'] def enabled(): return framework.in_dygraph_mode() -@contextlib.contextmanager -def _switch_tracer_mode_guard_(is_train=True): - tracer = framework._dygraph_tracer() - if tracer: - mode = tracer._train_mode - tracer._train_mode = is_train - yield - tracer._train_mode = mode - else: - yield - - -def _dygraph_not_support_(func): - def __impl__(*args, **kwargs): - assert not framework.in_dygraph_mode( - ), "We don't support %s in Dygraph mode" % func.__name__ - return func(*args, **kwargs) - - return __impl__ - - -def _no_grad_(func): - """ - This Decorator will avoid the func being decorated creating backward network in dygraph mode - - Args: - func: the func don't need grad - - Examples: - - .. code-block:: python - - import numpy as np - import paddle.fluid as fluid - - @fluid.dygraph.no_grad - def test_layer(): - with fluid.dygraph.guard(): - inp = np.ones([3, 32, 32], dtype='float32') - t = fluid.dygraph.base.to_variable(inp) - fc1 = fluid.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1) - fc2 = fluid.FC('fc2', size=4) - ret = fc1(t) - dy_ret = fc2(ret) - - test_layer() - - """ - - def __impl__(*args, **kwargs): - with _switch_tracer_mode_guard_(is_train=False): - return func(*args, **kwargs) - - return __impl__ - - -no_grad = wrap_decorator(_no_grad_) -not_support = wrap_decorator(_dygraph_not_support_) - - @signature_safe_contextmanager def guard(place=None): - """ - This context will create a dygraph context for dygraph to run - - Args: - place(fluid.CPUPlace|fluid.CUDAPlace|None): Place to run - - return: - None - - Examples: - - .. code-block:: python - - import numpy as np - import paddle.fluid as fluid - - with fluid.dygraph.guard(): - inp = np.ones([3, 32, 32], dtype='float32') - t = fluid.dygraph.base.to_variable(inp) - fc1 = fluid.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1) - fc2 = fluid.FC('fc2', size=4) - ret = fc1(t) - dy_ret = fc2(ret) - - """ train = framework.Program() startup = framework.Program() tracer = Tracer(train.current_block().desc) @@ -138,45 +44,7 @@ def guard(place=None): yield -def _print_debug_msg(): - if not core._is_dygraph_debug_enabled(): - logging.warn( - 'Debug mode is not enabled. 
Please set FLAGS_dygraph_debug=1 to enable debug'
-        )
-        return
-
-    unique_name_size = len(framework.unique_name.generator.ids)
-    tracer_var_size = len(framework._dygraph_tracer()._vars)
-    alive_cpp_var_size = len(core.VarBase._alive_vars())
-    logging.warn(
-        'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}'
-        .format(unique_name_size, tracer_var_size, alive_cpp_var_size))
-
-
 def to_variable(value, block=None, name=None):
-    """
-    This function will create a variable from ndarray
-
-    Args:
-        value(ndarray): the numpy value need to be convert
-        block(fluid.Block|None): which block this variable will be in
-        name(str|None): Name of Varaible
-
-    return:
-        Variable: The variable created from given numpy
-
-    Examples:
-
-    .. code-block:: python
-
-        import numpy as np
-        import paddle.fluid as fluid
-
-        with fluid.dygraph.guard():
-            x = np.ones([2, 2], np.float32)
-            y = fluid.dygraph.to_variable(x)
-
-    """
     if isinstance(value, np.ndarray):
         assert enabled(), "to_variable could only be called in dygraph mode"
@@ -195,6 +63,3 @@ def to_variable(value, block=None, name=None):
         return py_var
     elif isinstance(value, framework.Variable):
         return value
-    else:
-        raise TypeError(
-            "to_variable only accepts 'ndarray' and 'Variable' as value's input")
diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
index 52849405558..f96b53e8c0b 100644
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,18 +16,16 @@ from __future__ import print_function
 
 import os
 import collections
+from .. import core
 from ..framework import Variable, default_main_program
-import pickle
-from . import learning_rate_scheduler
-import warnings
 
 __all__ = ['save_persistables', 'load_persistables']
 
 
-def save_persistables(model_dict, dirname='save_dir', optimizers=None):
+def save_persistables(vardict, dirname, filename=None):
     """
     This function filters out all variables in layer.parameters from the
-    give `layer`, and optimizer's learning rate decay and then trys to load these variables from the folder
+    given `layer` and then tries to save these variables to the folder
     `dirname` or the file `filename`.
 
     Use the `dirname` to specify the folder where persistable variables were
@@ -36,11 +34,13 @@ def save_persistables(model_dict, dirname='save_dir', optimizers=None):
     the file name.
 
     Args:
-        model_dict(dict of Parameters): The parameters will
+        vardict(dict of Parameters): The parameters will
                                     be saved. If it is None, nothing will be deal.
        dirname(str): The directory path.
-        optimizers(fluid.Optimizer|list(fluid.Optimizer)|None): The optimizers to be saved
+        filename(str|None): The file which saved all variables. If variables were
+                            saved in different files, set it to None.
+ Default: None Returns: @@ -52,7 +52,7 @@ def save_persistables(model_dict, dirname='save_dir', optimizers=None): num_layers=num_layers, num_steps=num_steps, init_scale=init_scale) - sgd = fluid.optimizer.SGD(learning_rate=0.01) + x_data = np.arange(12).reshape(4, 3).astype('int64') y_data = np.arange(1, 13).reshape(4, 3).astype('int64') x_data = x_data.reshape((-1, num_steps, 1)) @@ -67,17 +67,15 @@ def save_persistables(model_dict, dirname='save_dir', optimizers=None): init_cell = to_variable(init_cell_data) dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, init_cell) - dy_loss.backward() - sgd.minimize(dy_loss) - ptb_model.clear_gradient() param_path = "./my_paddle_model" - fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path, sgd) + fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path, + layer=ptb_model) """ - if isinstance(model_dict, collections.OrderedDict): - _save_var_to_file(model_dict, optimizers, dirname, None) + if isinstance(vardict, collections.OrderedDict): + _save_var_to_file(vardict, dirname, filename) -def load_persistables(dirname='save_dir'): +def load_persistables(dirname): """ This function trys to load persistable variables from the folder `dirname` or the file `filename`. @@ -88,26 +86,24 @@ def load_persistables(dirname='save_dir'): the file name. Args: - dirname(str): The directory path. default is save_dir + dirname(str): The directory path. Returns: dict: The parameter-dict resumed from file - optimizer dict: The optimizer Examples: .. code-block:: python my_layer = layer(fluid.Layer) param_path = "./my_paddle_model" - sgd = SGDOptimizer(learning_rate=1e-3) - param_dict, optimizer_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path) + + param_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path) param_1 = param_dict['PtbModel_0.w_1'] - sgd.load(optimizer_dict) """ return _load_var_from_file(dirname) -def _save_var_to_file(stat_dict, optimizers, file_dir, file_name): +def _save_var_to_file(stat_dict, file_dir, file_name): save_block = default_main_program().global_block() save_var_map = {} for var_key, each_var in stat_dict.items(): @@ -122,38 +118,6 @@ def _save_var_to_file(stat_dict, optimizers, file_dir, file_name): os.path.normpath(each_var.name)) }) - if optimizers is not None: - if isinstance(optimizers, (list, tuple)): - optimizers = optimizers - else: - optimizers = [optimizers] - if os.path.exists( - os.path.join(file_dir, os.path.normpath("optimizers"))): - pass - else: - os.mkdir(os.path.join(file_dir, os.path.normpath("optimizers"))) - for optimizer in optimizers: - if isinstance(optimizer._learning_rate, - learning_rate_scheduler.LearningRateDecay): - try: - f = open( - os.path.join(file_dir, "optimizers", - os.path.normpath(str(optimizer._name))), - "wb") - pickle.dump(optimizer._learning_rate, f, 2) - f.close() - except (): - raise IOError("Can't load %s", - os.path.join( - file_dir, "optimizers", - os.path.normpath(str(optimizer._name)))) - else: - warnings.warn( - "Optimizer not saved, Only optimizer with 'LearningRateDecay' under DyGraph mode need to be saved" - ) - else: - pass - if file_name is not None: save_var_list = [] for name in sorted(save_var_map.keys()): @@ -174,8 +138,6 @@ def _load_var_from_file(file_dir): var_name_list = [] if os.path.exists(base_path): for dirpath, dirnames, filenames in os.walk(base_path): - if "optimizers" in dirpath: - continue pt = dirpath.replace(base_path, "", 1) if pt.startswith("/") or pt.startswith("\\"): 
pt = pt[1:] @@ -190,7 +152,6 @@ def _load_var_from_file(file_dir): load_block = default_main_program().global_block() load_var_map = {} - load_optimizer_map = {} file_var_list = walk_filename(file_dir) for var_name in file_var_list: new_var = Variable(block=load_block, name=var_name) @@ -204,25 +165,8 @@ def _load_var_from_file(file_dir): }) load_var_map[new_var.name] = new_var - opt_path = os.path.join(file_dir, "optimizers") - for _, _, optimizers in os.walk(opt_path): - for optimizer in optimizers: - try: - f = open(os.path.join(opt_path, optimizer), "rb") - load_optimizer_map[optimizer] = pickle.load(f) - f.close() - except IOError: - raise IOError("Can't load %s", - os.path.join( - file_dir, "optimizers", - os.path.normpath(str(optimizer._name)))) - if len(load_optimizer_map) == 0: - print( - "No optimizer loaded. If you didn't save optimizer, please ignore this. The program can still work with new optimizer. " - ) - pass - - return load_var_map, load_optimizer_map + + return load_var_map def _clone_var_in_block_(block, var): diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index eca8d060b0f..7ddf94146c7 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -18,14 +18,13 @@ import sys import numpy as np import collections import six -from . import parallel_helper from .. import unique_name from paddle.fluid import core from .layer_object_helper import LayerObjectHelper from paddle.fluid import framework from ..param_attr import ParamAttr -__all__ = ['Layer'] +__all__ = ['Layer', 'PyLayer'] class Layer(core.Layer): @@ -147,17 +146,14 @@ class Layer(core.Layer): def clear_gradients(self): for p in self.parameters(): - if p.trainable: - p.clear_gradient() + p.clear_gradient() - def _build_once(self, *args): + def build_once(self, *args): pass def __call__(self, *inputs): if not self._built: - self._build_once(*inputs) - if parallel_helper._is_data_parallel_mode(): - parallel_helper._broadcast_parameters(self._parameters.values()) + self.build_once(*inputs) outputs = self.forward(*inputs) self._built = True @@ -197,14 +193,10 @@ class Layer(core.Layer): the parameter passed in. 
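# [Editor's note] The layers.py hunk above renames Layer._build_once to
# build_once and has __call__ run it once before the first forward(), so
# parameter shapes can be derived from the first input. A minimal sketch of a
# user layer relying on that hook; the class name MyLinear and its sizes are
# illustrative, not part of this patch:
import paddle.fluid as fluid

class MyLinear(fluid.dygraph.Layer):
    def __init__(self, name_scope, size):
        super(MyLinear, self).__init__(name_scope)
        self._size = size

    def build_once(self, input):
        # called exactly once, when the first input's shape is known
        self._w = self.create_parameter(
            attr=None, shape=[input.shape[1], self._size], dtype='float32')

    def forward(self, input):
        return fluid.layers.matmul(input, self._w)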
""" assert isinstance(parameter, framework.Parameter) - - if parameter.name in self._loaddict_holder: - var = parameter._ivar.value() - tensor = var.get_tensor() - tensor.set(self._loaddict_holder[parameter.name].numpy(), - framework._current_expected_place()) - self._parameters[name] = parameter + if parameter.name in self._loaddict_holder: + self._parameters[name] = self._loaddict_holder[parameter.name] + parameter = self._loaddict_holder[parameter.name] return parameter def __getattr__(self, name): @@ -220,11 +212,9 @@ class Layer(core.Layer): raise ValueError( "super(YourLayer, self).__init__() should be called first") if value.name in self._loaddict_holder: - var = value._ivar.value() - tensor = var.get_tensor() - tensor.set(self._loaddict_holder[value.name].numpy(), - framework._current_expected_place()) - params[name] = value + params[name] = self._loaddict_holder[value.name] + else: + params[name] = value elif isinstance(value, core.Layer): layers = self.__dict__.get('_sub_layers', None) if layers is None: @@ -242,19 +232,20 @@ class Layer(core.Layer): else: object.__delattr__(self, name) - def state_dict(self, destination=None, include_sublayers=True): + def state_dict(self, destination=None, prefix='', include_sublayers=True): if destination is None: destination = collections.OrderedDict() for name, data in self._parameters.items(): if data is not None: - destination[data.name] = data + destination[prefix + name] = data if include_sublayers: for layer_name, layer_item in self._sub_layers.items(): if layer_item is not None: destination_temp = destination.copy() destination_temp.update( - layer_item.state_dict(destination_temp, + layer_item.state_dict(destination_temp, prefix + + layer_name + ".", include_sublayers)) destination = destination_temp return destination @@ -272,3 +263,76 @@ class Layer(core.Layer): for layer_name, layer_item in self._sub_layers.items(): if layer_item is not None: layer_item.load_dict(stat_dict) + + +class PyLayer(core.PyLayer): + """Layers composed of user-defined python codes.""" + + def __init__(self): + super(PyLayer, self).__init__() + + def train(self): + framework._dygraph_tracer().train_mode() + + def eval(self): + framework._dygraph_tracer().eval_mode() + + @classmethod + def _do_forward(cls, inputs): + return cls._to_tuple(cls.forward(inputs)) + + @classmethod + def _do_backward(cls, inputs): + return cls._to_tuple(cls.backward(inputs)) + + @staticmethod + def _to_tuple(inputs): + if not isinstance(inputs, list) and not isinstance(inputs, tuple): + inputs = [inputs] + ret = [] + for inp in inputs: + if isinstance(inp, core.LoDTensor): + ret.append(inp) + else: + tensor = core.LoDTensor() + tensor.set(inp, core.CPUPlace()) + ret.append(tensor) + return tuple(ret) + + @staticmethod + def forward(*inputs): + raise NotImplementedError + + @staticmethod + def backward(*douts): + raise NotImplementedError + + @classmethod + def __call__(cls, *inputs): + tracer = framework._dygraph_tracer() + block = framework.default_main_program().current_block() + ivar_inputs = [x._ivar for x in inputs] + + if not hasattr(cls, 'forward_id'): + cls.forward_id = core.PyLayer.num_funcs() + 1 + PyLayer.register_func(cls.forward_id, cls._do_forward) + cls.backward_id = core.PyLayer.num_funcs() + 1 + PyLayer.register_func(cls.backward_id, cls._do_backward) + + iop = core.OpBase(cls.__class__.__name__ + str(cls.forward_id)) + iop.forward_id = cls.forward_id + iop.backward_id = cls.backward_id + block.ops.append(iop) + ivars = tracer.py_trace(iop, ivar_inputs, False) + 
ret = [] + for ivar in ivars: + tensor = ivar.value().get_tensor() + py_var = framework.Variable( + block, + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=tensor.shape(), + dtype=tensor._dtype(), + ivar=ivar) + ret.append(py_var) + return ret diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 500ab63b0e0..3209fa76d95 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -27,10 +27,6 @@ __all__ = [ class LearningRateDecay(object): """ Base class of learning rate decay - - Define the common interface of an LearningRateDecay. - User should not use this class directly, - but need to use one of it's implementation. """ def __init__(self, begin=0, step=1, dtype='float32'): @@ -46,21 +42,13 @@ class LearningRateDecay(object): return lr def create_lr_var(self, lr): - """ - convert lr from float to variable - - Args: - lr: learning rate - Returns: - learning rate variable - """ from .. import layers lr = layers.create_global_var( name=unique_name.generate("learning_rate"), shape=[1], value=float(lr), dtype=self.dtype, - persistable=False) + persistable=True) return lr def step(self): @@ -68,40 +56,6 @@ class LearningRateDecay(object): class PiecewiseDecay(LearningRateDecay): - """ - piecewise decay scheduler - - The algorithm can be described as the code below. - - .. code-block:: text - - boundaries = [10000, 20000] - values = [1.0, 0.5, 0.1] - if step < 10000: - learning_rate = 1.0 - elif 10000 <= step < 20000: - learning_rate = 0.5 - else: - learning_rate = 0.1 - Args: - boundaries: A list of steps numbers. - values: A list of learning rate values that will be picked during - different step boundaries. - begin: The begin step to initilize the self.step_num - step: The step_size using when calculate the new step_num (Defalult is 1) - dtype: The dtype used to create the learning rate variable - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - boundaries = [10000, 20000] - values = [1.0, 0.5, 0.1] - with fluid.dygraph.guard(): - optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0) ) - """ - def __init__(self, boundaries, values, begin, step=1, dtype='float32'): super(PiecewiseDecay, self).__init__(begin, step, dtype) self.boundaries = boundaries @@ -109,51 +63,16 @@ class PiecewiseDecay(LearningRateDecay): self.vars = [] for value in values: - self.vars.append(value) + self.vars.append(self.create_lr_var(value)) def step(self): for i in range(len(self.boundaries)): if self.step_num < self.boundaries[i]: return self.vars[i] - return self.create_lr_var(self.vars[len(self.values) - 1]) + return self.vars[len(self.values) - 1] class NaturalExpDecay(LearningRateDecay): - """ - Applies natural exponential decay to the initial learning rate. - - .. code-block:: python - - if not staircase: - decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps)) - else: - decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps)) - - Args: - learning_rate: A scalar float32 value or a Variable. This - will be the initial learning rate during training - decay_steps: A Python `int32` number. - decay_rate: A Python `float` number. - staircase: Boolean. If set true, decay the learning rate every decay_steps. 
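# [Editor's note] The PiecewiseDecay.step() in the hunk above is a simple
# interval lookup over the boundary list. A plain-Python sketch of that logic
# (the boundary/value numbers are illustrative only):
def piecewise_lr(step_num, boundaries, values):
    # len(values) must be len(boundaries) + 1
    for i, boundary in enumerate(boundaries):
        if step_num < boundary:
            return values[i]
    return values[-1]

assert piecewise_lr(5000, [10000, 20000], [1.0, 0.5, 0.1]) == 1.0
assert piecewise_lr(15000, [10000, 20000], [1.0, 0.5, 0.1]) == 0.5
assert piecewise_lr(25000, [10000, 20000], [1.0, 0.5, 0.1]) == 0.1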
- begin: A Python 'int32' number, the begin step (Default is 0) - step: A Python 'int32' number, the step size (Default is 1) - dtype: A Python 'str', the dtype used to create learning rate variable (Default is 'float32') - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - base_lr = 0.1 - with fluid.dygraph.guard(): - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.NaturalExpDecay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) - - """ - def __init__(self, learning_rate, decay_steps, @@ -180,45 +99,6 @@ class NaturalExpDecay(LearningRateDecay): class ExponentialDecay(LearningRateDecay): - """ - Applies exponential decay to the learning rate. - - When training a model, it is often recommended to lower the learning rate as the - training progresses. By using this function, the learning rate will be decayed by - 'decay_rate' every 'decay_steps' steps. - - .. code-block:: python - - if staircase == True: - decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps) - else: - decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps) - - Args: - learning_rate(Variable|float): The initial learning rate. - decay_steps(int): See the decay computation above. - decay_rate(float): The decay rate. See the decay computation above. - staircase(Boolean): If True, decay the learning rate at discrete intervals. - Default: False - begin(int): The begin step (default is 0) - step(int): The step size (default is 1) - dtype(str): The dtype used to create learning rate (default is 'float32') - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - base_lr = 0.1 - with fluid.dygraph.guard(): - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.ExponentialDecay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) - - """ - def __init__(self, learning_rate, decay_steps, @@ -245,43 +125,6 @@ class ExponentialDecay(LearningRateDecay): class InverseTimeDecay(LearningRateDecay): - """ - Applies inverse time decay to the initial learning rate. - - When training a model, it is often recommended to lower the learning rate as the - training progresses. By using this function, an inverse decay function will be - applied to the initial learning rate. - - >>> if staircase == True: - >>> decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step)) - >>> else: - >>> decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step) - - Args: - learning_rate(Variable|float): The initial learning rate. - decay_steps(int): See the decay computation above. - decay_rate(float): The decay rate. See the decay computation above. - staircase(Boolean): If True, decay the learning rate at discrete intervals. - Default: False - begin(int): The begin step (default is 0) - step(int): The step size (default is 1) - dtype(str): The dtype used to create learning rate (default is 'float32') - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - base_lr = 0.1 - with fluid.dygraph.guard(): - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.dygraph.InverseTimeDecay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) - - """ - def __init__(self, learning_rate, decay_steps, @@ -308,43 +151,6 @@ class InverseTimeDecay(LearningRateDecay): class PolynomialDecay(LearningRateDecay): - """ - Applies polynomial decay to the initial learning rate. - - .. 
code-block:: text - - if cycle: - decay_steps = decay_steps * ceil(global_step / decay_steps) - else: - global_step = min(global_step, decay_steps) - decayed_learning_rate = (learning_rate - end_learning_rate) * - (1 - global_step / decay_steps) ^ power + end_learning_rate - - Args: - learning_rate(Variable|float32): A scalar float32 value or a Variable. This - will be the initial learning rate during training. - decay_steps(int32): A Python `int32` number. - end_learning_rate(float): A Python `float` number. - power(float): A Python `float` number. - cycle(bool): If set true, decay the learning rate every decay_steps. - begin(int): The begin step (default is 0) - step(int): The step size (default is 1) - dtype(str): The dtype used to create learning rate (default is 'float32') - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - start_lr = 0.01 - total_step = 5000 - end_lr = 0 - with fluid.dygraph.guard(): - optimizer = fluid.optimizer.SGD( - learning_rate = fluid.dygraph.PolynomialDecay( - start_lr, total_step, end_lr, power=1.0) ) - - """ - def __init__(self, learning_rate, decay_steps, @@ -383,35 +189,6 @@ class PolynomialDecay(LearningRateDecay): class CosineDecay(LearningRateDecay): - """ - Applies cosine decay to the learning rate. - - when training a model, it is often recommended to lower the learning rate as the - training progresses. By using this function, the learning rate will be decayed by - following cosine decay strategy. - - .. math:: - - decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1) - - Args: - learning_rate(Variable|float): The initial learning rate. - step_each_epoch(int): the number of steps in an epoch. - epochs(int): the number of epochs. - begin(int): The begin step (default is 0). - step(int): The step size (default is 1). - dtype(str): The dtype used to create learning rate (default is 'float32'). - - Examples: - .. code-block:: python - - base_lr = 0.1 - with fluid.dygraph.guard(): - optimizer = fluid.optimizer.SGD( - learning_rate = fluid.dygraph.CosineDecay( - base_lr, 10000, 120) ) - """ - def __init__(self, learning_rate, step_each_epoch, @@ -434,45 +211,6 @@ class CosineDecay(LearningRateDecay): class NoamDecay(LearningRateDecay): - """ - Noam decay method. The numpy implementation of noam decay as follows. - - .. code-block:: python - - import numpy as np - # set hyper parameters - d_model = 2 - current_steps = 20 - warmup_steps = 200 - # compute - lr_value = np.power(d_model, -0.5) * np.min([ - np.power(current_steps, -0.5), - np.power(warmup_steps, -1.5) * current_steps]) - - Please reference `attention is all you need - `_. - - Args: - d_model(Variable): The dimensionality of input and output of model. - - warmup_steps(Variable): A super parameter. - begin(int): The begin step (default is 0) - step(int): The step size (default is 1) - dtype(str): The dtype used to create learning rate (default is 'float32') - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - warmup_steps = 100 - learning_rate = 0.01 - with fluid.dygraph.guard(): - optimizer = fluid.optimizer.SGD( - learning_rate = fluid.dygraph.NoamDecay( - 1/(warmup_steps *(learning_rate ** 2)), - warmup_steps) ) - """ - def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'): super(NoamDecay, self).__init__(begin, step, dtype) self.d_model = d_model diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 3fa74d78f5f..d6360fedd47 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -27,7 +27,8 @@ import numpy as np __all__ = [ 'Conv2D', 'Conv3D', 'Pool2D', 'FC', 'BatchNorm', 'Embedding', 'GRUUnit', 'LayerNorm', 'NCE', 'PRelu', 'BilinearTensorProduct', 'Conv2DTranspose', - 'Conv3DTranspose', 'GroupNorm', 'SpectralNorm', 'TreeConv' + 'Conv3DTranspose', 'SequenceConv', 'RowConv', 'GroupNorm', 'SpectralNorm', + 'TreeConv' ] @@ -83,7 +84,7 @@ class Conv2D(layers.Layer): W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 Args: - name_scope(str) : The name for this class. + input (Variable): The input image with [N, C, H, W] format. num_filters(int): The number of filter. It is as same as the output image channel. filter_size (int|tuple|None): The filter size. If filter_size is a tuple, @@ -117,6 +118,12 @@ class Conv2D(layers.Layer): library is installed. Default: True act (str): Activation type, if it is set to None, activation is not appended. Default: None + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. Default: None + + Returns: + Variable: The tensor variable storing the convolution and \ + non-linearity activation result. Raises: ValueError: If the shapes of input, filter_size, stride, padding and @@ -124,37 +131,25 @@ class Conv2D(layers.Layer): Examples: .. code-block:: python - - with fluid.dygraph.guard(): - conv2d = Conv2D( "conv2d", 2, 3) - data = to_variable( data ) - conv = conv2d( data ) - from paddle.fluid.dygraph.base import to_variable - import paddle.fluid as fluid - from paddle.fluid.dygraph import Conv2D - import numpy as np - - data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32') - with fluid.dygraph.guard(): - conv2d = Conv2D( "conv2d", 2, 3) - data = to_variable( data ) - conv = conv2d( data ) + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") """ def __init__(self, name_scope, + num_channels, num_filters, filter_size, stride=1, padding=0, dilation=1, groups=None, - param_attr=None, - bias_attr=None, use_cudnn=True, act=None, - dtype='float32'): + param_attr=None, + bias_attr=None, + dtype=core.VarDesc.VarType.FP32): assert param_attr is not False, "param_attr should not be False here." 
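# [Editor's note] With this change Conv2D takes num_channels in __init__ and
# builds its weight eagerly instead of inferring the channel count from the
# first input. The filter-shape arithmetic restored in the hunk just below,
# as a standalone sketch (the concrete numbers are illustrative):
def conv2d_filter_shape(num_filters, num_channels, filter_size, groups=None):
    if groups is None:
        num_filter_channels = num_channels
    else:
        if num_channels % groups != 0:
            raise ValueError("num_channels must be divisible by groups.")
        num_filter_channels = num_channels // groups
    return [num_filters, num_filter_channels] + list(filter_size)

assert conv2d_filter_shape(64, 32, (3, 3)) == [64, 32, 3, 3]
assert conv2d_filter_shape(64, 32, (3, 3), groups=4) == [64, 8, 3, 3]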
super(Conv2D, self).__init__(name_scope, dtype) self._groups = groups @@ -165,11 +160,7 @@ class Conv2D(layers.Layer): if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") self._use_cudnn = use_cudnn - self._filter_size = filter_size - self._num_filters = num_filters - self._param_attr = param_attr - self._bias_attr = bias_attr - self._dtype = dtype + self._num_channels = num_channels # if (self._num_channels == self._groups and # num_filters % self._num_channels == 0 and not self._use_cudnn): # self._l_type = 'depthwise_conv2d' @@ -178,26 +169,22 @@ class Conv2D(layers.Layer): # kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275 self._l_type = 'conv2d' - def _build_once(self, input): - self._num_channels = input.shape[1] - if self._groups is None: - num_filter_channels = self._num_channels + if groups is None: + num_filter_channels = num_channels else: - if self._num_channels % self._groups != 0: + if num_channels % groups != 0: raise ValueError("num_channels must be divisible by groups.") - num_filter_channels = self._num_channels // self._groups - filter_size = utils.convert_to_list(self._filter_size, 2, 'filter_size') - filter_shape = [self._num_filters, int(num_filter_channels) - ] + filter_size + num_filter_channels = num_channels // groups + filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') + filter_shape = [num_filters, int(num_filter_channels)] + filter_size def _get_default_param_initializer(): - filter_elem_num = filter_size[0] * filter_size[ - 1] * self._num_channels + filter_elem_num = filter_size[0] * filter_size[1] * num_channels std = (2.0 / filter_elem_num)**0.5 return Normal(0.0, std, 0) self._filter_param = self.create_parameter( - attr=self._param_attr, + attr=param_attr, shape=filter_shape, dtype=self._dtype, default_initializer=_get_default_param_initializer()) @@ -217,8 +204,8 @@ class Conv2D(layers.Layer): type=core.VarDesc.VarType.RAW) self._bias_param = self.create_parameter( - attr=self._bias_attr, - shape=[self._num_filters], + attr=bias_attr, + shape=[num_filters], dtype=self._dtype, is_bias=True) @@ -242,17 +229,15 @@ class Conv2D(layers.Layer): 'use_mkldnn': False, }) - if self._bias_param is not None: - pre_act = self._helper.create_variable_for_type_inference( - dtype=self._dtype) - self._helper.append_op( - type='elementwise_add', - inputs={'X': [pre_bias], - 'Y': [self._bias_param]}, - outputs={'Out': [pre_act]}, - attrs={'axis': 1}) - else: - pre_act = pre_bias + pre_act = self._helper.create_variable_for_type_inference( + dtype=self._dtype) + + self._helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], + 'Y': [self._bias_param]}, + outputs={'Out': [pre_act]}, + attrs={'axis': 1}) # Currently, we don't support inplace in dygraph mode return self._helper.append_activation(pre_act, act=self._act) @@ -354,16 +339,8 @@ class Conv3D(layers.Layer): Examples: .. 
code-block:: python - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - data = numpy.random.random((5, 3, 12, 32, 32)).astype('float32') - - conv3d = fluid.dygraph.nn.Conv3D( - 'Conv3D', num_filters=2, filter_size=3, act="relu") - ret = conv3d(fluid.dygraph.base.to_variable(data)) - + data = fluid.layers.data(name='data', shape=[3, 12, 32, 32], dtype='float32') + conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu") """ def __init__(self, @@ -393,7 +370,7 @@ class Conv3D(layers.Layer): self._param_attr = param_attr self._bias_attr = bias_attr - def _build_once(self, input): + def build_once(self, input): num_channels = input.shape[1] self._dtype = self._helper.input_dtype(input) @@ -562,19 +539,12 @@ class Conv3DTranspose(layers.Layer): Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - data = numpy.random.random((5, 3, 12, 32, 32)).astype('float32') - - conv3dTranspose = fluid.dygraph.nn.Conv3DTranspose( - 'Conv3DTranspose', - num_filters=12, - filter_size=12, - use_cudnn=False) - ret = conv3dTranspose(fluid.dygraph.base.to_variable(data)) - + conv3d_transpose = nn.Conv3DTranspose( + 'Conv3DTranspose', + num_filters=12, + filter_size=12, + use_cudnn=False) + transpose_res = conv3d_transpose(base.to_variable(input_array)) """ def __init__(self, @@ -607,7 +577,7 @@ class Conv3DTranspose(layers.Layer): self._bias_attr = bias_attr self._act = act - def _build_once(self, input): + def build_once(self, input): self._dtype = self._helper.input_dtype(input) self._input_channel = input.shape[1] @@ -681,12 +651,14 @@ class Conv3DTranspose(layers.Layer): class Pool2D(layers.Layer): - # TODO, should delete this class """ ${comment} Args: - name_scope(str) : The name of this class. + input (Variable): The input tensor of pooling operator. The format of + input tensor is NCHW, where N is batch size, C is + the number of channels, H is the height of the + feature, and W is the width of the feature. pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be a square of an int. @@ -840,7 +812,8 @@ class FC(layers.Layer): out.shape = (1, 2) Args: - name(str): The name of this class. + input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of + the input tensor(s) is at least 2. size(int): The number of output units in this layer. num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than two dimensions. If this happens, the multidimensional tensor will first be flattened @@ -858,35 +831,37 @@ class FC(layers.Layer): If it is set to None, the bias is initialized zero. Default: None. act (str, default None): Activation to be applied to the output of this layer. is_test(bool): A flag indicating whether execution is in test phase. - dtype(str): Dtype used for weight + name (str, default None): The name of this layer. + + Returns: + Variable: The transformation result. Raises: ValueError: If rank of the input tensor is less than 2. Examples: .. 
code-block:: python - - from paddle.fluid.dygraph.base import to_variable - import paddle.fluid as fluid - from paddle.fluid.dygraph import FC - import numpy as np - data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32') - with fluid.dygraph.guard(): - fc = FC( "fc", 64, num_flatten_dims=2) - data = to_variable( data ) - conv = fc( data ) + # when input is single tensor + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.FC("fc", size=1000, act="tanh") + fc_res = fc(data) + + # when input are multiple tensors + data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32") + data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32") + fc = fluid.FC("fc", size=1000, act="tanh") + fc_res = fc([data_1, data_2]) """ def __init__(self, name_scope, size, - num_flatten_dims=1, param_attr=None, bias_attr=None, - act=None, - is_test=False, - dtype="float32"): + num_flatten_dims=1, + dtype=core.VarDesc.VarType.FP32, + act=None): super(FC, self).__init__(name_scope, dtype) self._size = size @@ -906,7 +881,7 @@ class FC(layers.Layer): assert isinstance(value, Parameter) self.__w[i] = value - def _build_once(self, input): + def build_once(self, input): i = 0 for inp, param in self._helper.iter_inputs_and_params(input, self._param_attr): @@ -1047,8 +1022,6 @@ class BatchNorm(layers.Layer): or is_test to true, and the behavior is equivalent. In train mode, when setting use_global_stats True, the global mean and variance are also used during train period. - trainable_statistics(bool, Default False): Whether to calculate mean and var in eval mode. In eval mode, when - setting trainable_statistics True, mean and variance will be calculated by current batch statistics. Returns: Variable: A tensor variable which is the result after applying batch normalization on the input. @@ -1071,24 +1044,23 @@ class BatchNorm(layers.Layer): epsilon=1e-05, param_attr=None, bias_attr=None, - dtype='float32', + dtype=core.VarDesc.VarType.FP32, data_layout='NCHW', in_place=False, moving_mean_name=None, moving_variance_name=None, do_model_average_for_mean_and_var=False, fuse_with_relu=False, - use_global_stats=False, - trainable_statistics=False): + use_global_stats=False): super(BatchNorm, self).__init__(name_scope, dtype) self._param_attr = param_attr - self._bias_attr = bias_attr + self._param_attr = bias_attr self._act = act assert bias_attr is not False, "bias_attr should not be False in batch_norm." 
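# [Editor's note] The BatchNorm hunk just below keeps the layer's scale/bias
# parameters in float32 even when the layer is constructed with a float16
# dtype -- the usual mixed-precision practice, since affine parameters and
# batch statistics are more stable in fp32. A sketch of the promotion rule as
# reverted here (the helper name bn_param_dtype is illustrative):
from paddle.fluid import core

def bn_param_dtype(dtype):
    # fp16 inputs still get fp32 parameters
    if dtype == core.VarDesc.VarType.FP16:
        return core.VarDesc.VarType.FP32
    return dtype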
- if dtype == "float16": - self._dtype = "float32" + if dtype == core.VarDesc.VarType.FP16: + self._dtype = core.VarDesc.VarType.FP32 else: self._dtype = dtype @@ -1104,7 +1076,7 @@ class BatchNorm(layers.Layer): self._scale.stop_gradient = True self._bias = self.create_parameter( - attr=self._bias_attr, + attr=self._param_attr, shape=param_shape, dtype=self._dtype, is_bias=True) @@ -1137,9 +1109,8 @@ class BatchNorm(layers.Layer): self._is_test = is_test self._fuse_with_relu = fuse_with_relu self._use_global_stats = use_global_stats - self._trainable_statistics = trainable_statistics - def _build_once(self, input): + def build_once(self, input): pass def forward(self, input): @@ -1178,8 +1149,7 @@ class BatchNorm(layers.Layer): "is_test": self._is_test, "use_mkldnn": False, "fuse_with_relu": self._fuse_with_relu, - "use_global_stats": self._use_global_stats, - "trainable_statistics": self._trainable_statistics + "use_global_stats": self._use_global_stats }) # Currently, we don't support inplace in dygraph mode @@ -1193,15 +1163,22 @@ class Embedding(layers.Layer): This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in a lookup table. The result of this lookup is the embedding of each ID in the :attr:`input`. - All the input variables are passed in as local variables to the LayerHelper constructor + + All the input variables are passed in as local variables to the LayerHelper + constructor. Args: name_scope: See base class. - size(tuple|list): The shape of the look up table parameter. It should have two elements which indicate the size of the dictionary of embeddings and the size of each embedding vector respectively. - + size(tuple|list): The shape of the look up table parameter. It should + have two elements which indicate the size of the dictionary of + embeddings and the size of each embedding vector respectively. is_sparse(bool): The flag indicating whether to use sparse update. is_distributed(bool): Whether to run lookup table from remote parameter server. - padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup. Otherwise the given :attr:`padding_idx` indicates padding the output with zeros whenever lookup encounters it in :attr:`input`. If :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is :math:`size[0] + dim`. + padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup. + Otherwise the given :attr:`padding_idx` indicates padding the output + with zeros whenever lookup encounters it in :attr:`input`. If + :math:`padding_idx < 0`, the :attr:`padding_idx` to use in lookup is + :math:`size[0] + dim`. param_attr(ParamAttr): Parameters for this layer dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc @@ -1210,18 +1187,12 @@ class Embedding(layers.Layer): supplied inputs. Examples: - .. code-block:: python - inp_word = np.array([[[1]]]).astype('int64') - dict_size = 20 - with fluid.dygraph.guard(): - emb = fluid.Embedding( - name_scope='embedding', - size=[dict_size, 32], - param_attr='emb.w', - is_sparse=False) - static_rlt3 = emb2(base.to_variable(inp_word)) + dict_size = len(dataset.ids) + input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') + embedding = fluid.Embedding(size=[dict_size, 16]) + fc = embedding(input) """ def __init__(self, @@ -1271,13 +1242,7 @@ class Embedding(layers.Layer): class LayerNorm(layers.Layer): """ - Assume feature vectors exist on dimensions - `begin_norm_axis ... 
rank(input)` and calculate the moment statistics along these dimensions for each feature - vector `a` with size `H`, then normalize each feature vector using the corresponding - statistics. After that, apply learnable gain and bias on the normalized - tensor to scale and shift if `scale` and `shift` are set. - - Refer to `Layer Normalization `_ + ${comment} The formula is as follows: @@ -1299,7 +1264,7 @@ class LayerNorm(layers.Layer): * :math:`b`: the trainable bias parameter. Args: - name_scope (str): See base class. + input(Variable): The input tensor variable. scale(bool): Whether to learn the adaptive gain :math:`g` after normalization. Default True. shift(bool): Whether to learn the adaptive bias :math:`b` after @@ -1322,21 +1287,13 @@ class LayerNorm(layers.Layer): act(str): Activation to be applied to the output of layer normalizaiton. Default None. Returns: - Result after normalization + ${y_comment} Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - x = numpy.random.random((3, 32, 32)).astype('float32') - layerNorm = fluid.dygraph.nn.LayerNorm( - 'LayerNorm', begin_norm_axis=1) - ret = layerNorm(fluid.dygraph.base.to_variable(x)) - + >>> data = fluid.layers.data(name='data', shape=[3, 32, 32], + >>> dtype='float32') + >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) """ def __init__(self, @@ -1357,7 +1314,7 @@ class LayerNorm(layers.Layer): self._bias_attr = bias_attr self._act = act - def _build_once(self, input): + def build_once(self, input): self._dtype = self._helper.input_dtype(input) input_shape = input.shape param_shape = [ @@ -1481,7 +1438,6 @@ class GRUUnit(layers.Layer): Default: 'tanh' gate_activation (string): The activation type for gates (actGate). Default: 'sigmoid' - dtype(string): The dtype of the layers Returns: tuple: The hidden value, reset-hidden value and gate values. @@ -1503,8 +1459,8 @@ class GRUUnit(layers.Layer): sigmoid=1, tanh=2, relu=3, ) - self.activation = activation_dict[activation] - self.gate_activation = activation_dict[gate_activation] + activation = activation_dict[activation] + gate_activation = activation_dict[gate_activation] self._dtype = dtype size = size // 3 @@ -1536,8 +1492,8 @@ class GRUUnit(layers.Layer): 'Hidden': updated_hidden, }, attrs={ - 'activation': self.activation, - 'gate_activation': self.gate_activation, + 'activation': 2, # tanh + 'gate_activation': 1, # sigmoid }) return updated_hidden, reset_hidden_pre, gate @@ -1545,15 +1501,12 @@ class GRUUnit(layers.Layer): class NCE(layers.Layer): """ - Compute and return the noise-contrastive estimation training loss. See - `Noise-contrastive estimation: A new estimation principle for unnormalized - statistical models - `_. - By default this operator uses a uniform distribution for sampling. + ${comment} Args: - name_scope (str): See base class. - num_total_classes (int): Total number of classes in all samples + input (Variable): input variable. + label (Variable): label. + num_total_classes (int):${num_total_classes_comment} sample_weight (Variable|None): A Variable of shape [batch_size, 1] storing a weight for each sample. The default weight for each sample is 1.0. @@ -1566,7 +1519,7 @@ class NCE(layers.Layer): If it is set to None or one attribute of ParamAttr, nce will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - num_neg_samples (int): The number of negative classes. The default value is 10. 
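# [Editor's note] The GRUUnit hunk above drops the stored activation fields
# and hard-codes the attribute values instead. The integer encoding used by
# the gru_unit op, per the activation_dict in the surrounding code:
GRU_ACTIVATIONS = dict(identity=0, sigmoid=1, tanh=2, relu=3)

assert GRU_ACTIVATIONS['tanh'] == 2      # the hard-coded 'activation' attr
assert GRU_ACTIVATIONS['sigmoid'] == 1   # the hard-coded 'gate_activation' attr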
+ num_neg_samples (int): ${num_neg_samples_comment} name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. Default: None. sampler (str): The sampler used to sample class from negtive classes. @@ -1585,45 +1538,37 @@ class NCE(layers.Layer): Examples: .. code-block:: python - import numpy as np - import paddle.fluid as fluid - window_size = 5 - dict_size = 20 - label_word = int(window_size // 2) + 1 - inp_word = np.array([[[1]], [[2]], [[3]], [[4]], [[5]]]).astype('int64') - nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') - - with fluid.dygraph.guard(): - words = [] - for i in range(window_size): - words.append(fluid.dygraph.base.to_variable(inp_word[i])) - - emb = fluid.Embedding( - 'embedding', - size=[dict_size, 32], - param_attr='emb.w', - is_sparse=False) - - embs3 = [] - for i in range(window_size): - if i == label_word: - continue - - emb_rlt = emb(words[i]) - embs3.append(emb_rlt) - - embs3 = fluid.layers.concat(input=embs3, axis=1) - nce = fluid.NCE('nce', - num_total_classes=dict_size, - num_neg_samples=2, - sampler="custom_dist", - custom_dist=nid_freq_arr.tolist(), - seed=1, - param_attr='nce.w', - bias_attr='nce.b') - - nce_loss3 = nce(embs3, words[label_word]) + words = [] + for i in xrange(window_size): + words.append(layers.data( + name='word_{0}'.format(i), shape=[1], dtype='int64')) + + dict_size = 10000 + label_word = int(window_size / 2) + 1 + + embs = [] + for i in xrange(window_size): + if i == label_word: + continue + + emb = layers.embedding(input=words[i], size=[dict_size, 32], + param_attr='emb.w', is_sparse=True) + embs.append(emb) + + embs = layers.concat(input=embs, axis=1) + loss = layers.nce(input=embs, label=words[label_word], + num_total_classes=dict_size, param_attr='nce.w', + bias_attr='nce.b') + + #or use custom distribution + dist = fluid.layers.assign(input=np.array([0.05,0.5,0.1,0.3,0.05]).astype("float32")) + loss = layers.nce(input=embs, label=words[label_word], + num_total_classes=5, param_attr='nce.w', + bias_attr='nce.b', + num_neg_samples=3, + sampler="custom_dist", + custom_dist=dist) """ @@ -1731,7 +1676,7 @@ class NCE(layers.Layer): 'remote_prefetch': remote_prefetch } - def _build_once(self, input, label, sample_weight=None): + def build_once(self, input, label, sample_weight=None): assert isinstance(input, Variable) assert isinstance(label, Variable) @@ -1786,13 +1731,13 @@ class PRelu(layers.Layer): y = \max(0, x) + \\alpha * \min(0, x) Args: - name_scope (str): See base class. + x (Variable): The input tensor. + param_attr(ParamAttr|None): The parameter attribute for the learnable + weight (alpha). mode (string): The mode for weight sharing. It supports all, channel and element. all: all elements share same weight channel:elements in a channel share same weight element:each element has a weight - param_attr(ParamAttr|None): The parameter attribute for the learnable - weight (alpha). name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -1803,14 +1748,9 @@ class PRelu(layers.Layer): .. 
code-block:: python - inp_np = np.ones([5, 200, 100, 100]).astype('float32') - with fluid.dygraph.guard(): + x = fluid.layers.data(name="x", shape=[10,10], dtype="float32") mode = 'channel' - prelu = fluid.PRelu( - 'prelu', - mode=mode, - param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(1.0))) - dy_rlt = prelu(fluid.dygraph.base.to_variable(inp_np)) + output = fluid.layers.prelu(x,mode) """ def __init__(self, name_scope, mode, param_attr=None): @@ -1822,7 +1762,7 @@ class PRelu(layers.Layer): raise ValueError('mode should be one of all, channel, element.') self._alpha_shape = [1] - def _build_once(self, input): + def build_once(self, input): if self._mode == 'channel': self._alpha_shape = [1, input.shape[1], 1, 1] elif self._mode == 'element': @@ -1865,7 +1805,8 @@ class BilinearTensorProduct(layers.Layer): - :math:`y^\mathrm{T}`: the transpose of :math:`y_{2}`. Args: - name_scope (str): See base class. + x (Variable): 2-D input tensor with shape [batch_size, M] + y (Variable): 2-D input tensor with shape [batch_size, N] size (int): The dimension of this layer. act (str, default None): Activation to be applied to the output of this layer. name (str, default None): The name of this layer. @@ -1881,16 +1822,7 @@ class BilinearTensorProduct(layers.Layer): Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - layer1 = numpy.random.random((5, 5)).astype('float32') - layer2 = numpy.random.random((5, 4)).astype('float32') - bilinearTensorProduct = fluid.dygraph.nn.BilinearTensorProduct( - 'BilinearTensorProduct', size=1000) - ret = bilinearTensorProduct(fluid.dygraph.base.to_variable(layer1), - fluid.dygraph.base.to_variable(layer2)) + tensor = bilinear_tensor_product(x=layer1, y=layer2, size=1000) """ def __init__(self, @@ -1908,7 +1840,7 @@ class BilinearTensorProduct(layers.Layer): self._name = name self._inputs = dict() - def _build_once(self, x, y): + def build_once(self, x, y): self._dtype = self._helper.input_dtype(x) param_shape = [self._size, x.shape[1], y.shape[1]] @@ -2000,7 +1932,7 @@ class Conv2DTranspose(layers.Layer): W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] ) Args: - name_scope (str): See base class. + input(Variable): The input image with [N, C, H, W] format. num_filters(int): The number of the filter. It is as same as the output image channel. output_size(int|tuple|None): The output image size. If output size is a @@ -2053,15 +1985,8 @@ class Conv2DTranspose(layers.Layer): Examples: .. 
code-block:: python - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - data = numpy.random.random((3, 32, 32)).astype('float32') - conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose( - 'Conv2DTranspose', num_filters=2, filter_size=3) - ret = conv2DTranspose(fluid.dygraph.base.to_variable(data)) - + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) """ def __init__(self, @@ -2091,7 +2016,7 @@ class Conv2DTranspose(layers.Layer): self._output_size = output_size self._op_type = 'conv2d_transpose' - def _build_once(self, input): + def build_once(self, input): input_channel = input.shape[1] if (input_channel == self._groups and self._num_filters == input_channel and not self._use_cudnn): @@ -2126,7 +2051,7 @@ class Conv2DTranspose(layers.Layer): self._filter_size = [filter_size_h, filter_size_w] else: self._filter_size = utils.convert_to_list( - self._filter_size, 2, 'conv2d_transpose.filter_size') + self._output_size, 2, 'conv2d_transpose.filter_size') if self._output_size is None: self._output_size = [] @@ -2173,7 +2098,7 @@ class SequenceConv(layers.Layer): in the input parameters to the function. Args: - name_scope (str): See base class. + input (Variable): ${x_comment} num_filters (int): number of filters. filter_size (int): the filter size (H and W). filter_stride (int): stride of the filter. @@ -2215,7 +2140,7 @@ class SequenceConv(layers.Layer): self._bias_attr = bias_attr self._param_attr = param_attr - def _build_once(self, input): + def build_once(self, input): self._dtype = self._helper.input_dtype(input) filter_shape = [self._filter_size * input.shape[1], self._num_filters] self._filter_param = self.create_parameter( @@ -2240,49 +2165,6 @@ class SequenceConv(layers.Layer): class RowConv(layers.Layer): - """ - ***Row-convolution operator*** - - The row convolution is called lookahead convolution. This operator was introduced in the following paper for DeepSpeech2: - http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf - - The main motivation is that a bidirectional RNN, useful in DeepSpeech like speech models, learns representation for a sequence by performing a - forward and a backward pass through the entire sequence. However, unlike - unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online - and low-latency setting. The lookahead convolution incorporates information - from future subsequences in a computationally efficient manner to improve - unidirectional recurrent neural networks. The row convolution operator is - different from the 1D sequence convolution, and is computed as follows: - - Given an input sequence X of length t and input dimension D, and a filter (W) of size context * D. - - More details about row_conv please refer to the design document https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645 . - - Args: - name_scope (str): See base class. - future_context_size (int): Future context size. Please note, the shape - of convolution kernel is [future_context_size + 1, D]. - param_attr (ParamAttr): Attributes of parameters, including - name, initializer etc. - act (str): Non-linear activation to be applied to output variable. - - Returns: - the output(Out) is a LodTensor, which supports variable time-length input sequences. The underlying tensor in this LodTensor is a matrix with shape T x N, i.e., the same shape as X. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - x = numpy.random.random((16)).astype('float32') - rowConv = fluid.dygraph.nn.RowConv( - 'RowConv', future_context_size=2) - ret = rowConv(fluid.dygraph.base.to_variable(x)) - - """ - def __init__(self, name_scope, future_context_size, @@ -2295,7 +2177,7 @@ class RowConv(layers.Layer): self._param_attr = param_attr self._future_context_size = future_context_size - def _build_once(self, input): + def build_once(self, input): self._dtype = self._helper.input_dtype(input) filter_shape = [self._future_context_size + 1, input.shape[1]] self._filter_param = self.create_parameter( @@ -2338,16 +2220,6 @@ class GroupNorm(layers.Layer): Returns: Variable: A tensor variable which is the result after applying group normalization on the input. - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - x = numpy.random.random((8, 32, 32)).astype('float32') - groupNorm = fluid.dygraph.nn.GroupNorm('GroupNorm', groups=4) - ret = groupNorm(fluid.dygraph.base.to_variable(x)) """ @@ -2368,7 +2240,7 @@ class GroupNorm(layers.Layer): if data_layout != 'NCHW': raise ValueError("unsupported data layout:" + data_layout) - def _build_once(self, input): + def build_once(self, input): self._dtype = self._helper.input_dtype(input) param_shape = [input.shape[1]] if self._bias_attr: @@ -2415,70 +2287,13 @@ class GroupNorm(layers.Layer): class SpectralNorm(layers.Layer): - """ - **Spectral Normalization Layer** - - This layer calculates the spectral normalization value of weight parameters of - fc, conv1d, conv2d, conv3d layers which should be 2-D, 3-D, 4-D, 5-D - Parameters. Calculations are showed as follows. - - Step 1: - Generate vector U in shape of [H], and V in shape of [W]. - While H is the :attr:`dim` th dimension of the input weights, - and W is the product result of remaining dimensions. - - Step 2: - :attr:`power_iters` shoule be a positive interger, do following - calculations with U and V for :attr:`power_iters` rounds. - - .. math:: - - \mathbf{v} := \\frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2} - - \mathbf{u} := \\frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2} - - Step 3: - Calculate :math:`\sigma(\mathbf{W})` and normalize weight values. - - .. math:: - - \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v} - - \mathbf{W} = \\frac{\mathbf{W}}{\sigma(\mathbf{W})} - - - Refer to `Spectral Normalization `_ . - - Args: - name_scope (str): See base class. - dim(int): The index of dimension which should be permuted to the first before reshaping Input(Weight) to matrix, it should be set as 0 if Input(Weight) is the weight of fc layer, and should be set as 1 if Input(Weight) is the weight of conv layer, default 0 - power_iters(int): number of power iterations to calculate spectral norm, default 1 - eps(float): epsilon for numerical stability in calculating norms - name (str): The name of this layer. It is optional. - - Returns: - Variable: A tensor variable of weight parameters after spectral normalization. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - x = numpy.random.random((2, 8, 32, 32)).astype('float32') - spectralNorm = fluid.dygraph.nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2) - ret = spectralNorm(fluid.dygraph.base.to_variable(x)) - - """ - def __init__(self, name_scope, dim=0, power_iters=1, eps=1e-12, name=None): super(SpectralNorm, self).__init__(name_scope) self._power_iters = power_iters self._eps = eps self._dim = dim - def _build_once(self, weight): + def build_once(self, weight): self._dtype = self._helper.input_dtype(weight) input_shape = weight.shape h = input_shape[self._dim] @@ -2515,44 +2330,6 @@ class SpectralNorm(layers.Layer): class TreeConv(layers.Layer): - """ - ***Tree-Based Convolution Operator*** - - Tree-Based Convolution is a kind of convolution based on tree structure. - Tree-Based Convolution is a part of Tree-Based Convolution Neural Network(TBCNN), - which is used to classify tree structures, such as Abstract Syntax Tree. - Tree-Based Convolution proposed a kind of data structure called continuous binary tree, - which regards multiway tree as binary tree. - The paper of Tree-Based Convolution Operator is here: https://arxiv.org/abs/1409.5718v1 - - - Args: - name_scope (str): See base class. - output_size(int): output feature width - num_filters(int): number of filters, Default 1 - max_depth(int): max depth of filters, Default 2 - act(str): activation function, Default tanh - param_attr(ParamAttr): the parameter attribute for the filters, Default None - bias_attr(ParamAttr): the parameter attribute for the bias of this layer, Default None - name(str): a name of this layer(optional). If set None, the layer will be named automatically, Default None - - Returns: - out(Variable): (Tensor) The feature vector of subtrees. The shape of the output tensor is [max_tree_node_size, output_size, num_filters]. The output tensor could be a new feature vector for next tree convolution layers - - Examples: - .. code-block:: python - import paddle.fluid as fluid - import numpy - - with fluid.dygraph.guard(): - nodes_vector = numpy.random.random((1, 10, 5)).astype('float32') - edge_set = numpy.random.random((1, 9, 2)).astype('int32') - treeConv = fluid.dygraph.nn.TreeConv( - 'TreeConv', output_size=6, num_filters=1, max_depth=2) - ret = treeConv(fluid.dygraph.base.to_variable(nodes_vector), fluid.dygraph.base.to_variable(edge_set)) - - """ - def __init__(self, name_scope, output_size, @@ -2571,7 +2348,7 @@ class TreeConv(layers.Layer): self._bias_attr = bias_attr self._param_attr = param_attr - def _build_once(self, nodes_vector, edge_set): + def build_once(self, nodes_vector, edge_set): assert isinstance(nodes_vector, Variable) assert isinstance(edge_set, Variable) self._dtype = self._helper.input_dtype(nodes_vector) @@ -2591,7 +2368,6 @@ class TreeConv(layers.Layer): is_bias=False) def forward(self, nodes_vector, edge_set): - if self._name: out = self.create_variable( name=self._name, dtype=self._dtype, persistable=False) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 37716cea14c..44c20166b89 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -13,42 +13,35 @@ # limitations under the License. import os import six -import numpy as np from .. import core from . import layers -from . import parallel_helper from .. import framework + from ..layers import collective -from . 
import to_variable
 
 __all__ = ["prepare_context"]
 
 ParallelStrategy = core.ParallelStrategy
 
+__parallel_ctx__clz__ = None
+
 
-def prepare_context(strategy=None):
-    if strategy is None:
-        strategy = ParallelStrategy()
-        strategy.nranks = Env().nranks
-        strategy.local_rank = Env().local_rank
-        strategy.trainer_endpoints = Env().trainer_endpoints
-        strategy.current_endpoint = Env().current_endpoint
-    if strategy.nranks < 2:
-        return
-    assert framework.in_dygraph_mode() is True,\
-        "dygraph.parallel.prepare_context should be used with dygrahp mode."
+def prepare_context(parallel_strategy):
+    global __parallel_ctx__clz__
+    assert __parallel_ctx__clz__ is None, "ParallelContext can only be initialized once."
+    assert framework.in_dygraph_mode(
+    ) is True, "dygraph.parallel.prepare_context should be used with dygraph mode."
     place = framework._current_expected_place()
-    assert place is not None, \
-        "dygraph.parallel.prepare_context should be used in fluid.dygraph.guard(place) guard."
+    assert place is not None, "dygraph.parallel.prepare_context should be used in fluid.dygraph.guard(place) guard."
+
     if isinstance(place, core.CUDAPlace):
-        parallel_helper._set_parallel_ctx(
-            core.NCCLParallelContext(strategy, place))
+        __parallel_ctx__clz__ = core.NCCLParallelContext(parallel_strategy,
+                                                         place)
     else:
         # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
         assert ("Only support CUDAPlace for now.")
-    parallel_helper._init_parallel_ctx()
-    return strategy
+    __parallel_ctx__clz__.init()
 
 
 class Env(object):
@@ -82,108 +75,31 @@ class Env(object):
 
 
 class DataParallel(layers.Layer):
-    """
-    Runs the module with data parallelism.
-
-    Currently, DataParallel only supports to run the dynamic graph
-    with multi-process. The usage is:
-    `python -m paddle.distributed.launch --gpus 2 dynamic_graph_test.py`.
-    And the content of `dynamic_graph_test.py` is the code of examples.
-
-    Examples:
-        .. code-block:: python
-
-           import numpy as np
-           import paddle.fluid as fluid
-           import paddle.fluid.dygraph as dygraph
-           from paddle.fluid.optimizer import AdamOptimizer
-           from paddle.fluid.dygraph.nn import FC
-           from paddle.fluid.dygraph.base import to_variable
-
-           place = fluid.CUDAPlace(0)
-           with fluid.dygraph.guard(place=place):
-
-               # prepare the data parallel context
-               strategy=dygraph.parallel.prepare_context()
-
-               fc_layer = FC("FC", 10, act="softmax")
-               adam = fluid.optimizer.AdamOptimizer()
-
-               # make the module become the data parallelism module
-               fc_layer = dygraph.parallel.DataParallel(fc_layer, strategy)
-
-               x_data = np.random.random(size=[10, 1]).astype(np.float32)
-               data = to_variable(x_data)
-
-               hidden = fc_layer(data)
-               avg_loss = fluid.layers.mean(hidden)
-
-               # scale the loss according to the number of trainers.
-               avg_loss = fc_layer.scale_loss(avg_loss)
-
-               avg_loss.backward()
-
-               # collect the gradients of trainers.
-               fc_layer.apply_collective_grads()
-
-               adam.minimize(avg_loss)
-               fc_layer.clear_gradients()
-
-    Args:
-        layers(Layer): The module that should be executed by data parallel.
-        strategy(ParallelStrategy): The strategy of data parallelism.
-
-    Returns:
-        Layer: The data paralleled module.
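# [Editor's note] The DataParallel docstring removed above scales the loss by
# the trainer count before backward(). The arithmetic behind that convention,
# as a standalone sketch: allreduce sums gradients across trainers, so
# dividing each loss (hence each gradient) by nranks yields the mean gradient.
# The trainer count and gradient values below are illustrative only.
import numpy as np

nranks = 4
grads = [np.array([2.0]) for _ in range(nranks)]
assert np.allclose(sum(grads), nranks * 2.0)             # raw allreduce: a sum
assert np.allclose(sum(g / nranks for g in grads), 2.0)  # scaled: the mean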
- """ - - def __init__(self, layers, strategy): + def __init__(self, layers): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") - self._layers = layers - self._strategy = strategy + + def build_once(self, *inputs, **kwargs): + #TODO(Yancey1989): broadcast all the paramters + pass def forward(self, *inputs, **kwargs): - return self._layers(*inputs, **kwargs) - - def scale_loss(self, loss): - """ - Scale the loss. In data parallel mode, the loss should be scale with - the number of trainers. If not in data parallel mode, return the loss - directly. - - Args: - loss(Layer): The loss of the current Model. - - Returns: - Layer: the scaled loss. - """ - if not self._is_data_parallel_mode(): - return loss - - loss_scale = to_variable( - np.array([self._strategy.nranks]).astype("float32")) - loss_scale.stop_gradient = True - loss = loss / loss_scale - return loss - - def apply_collective_grads(self): - """ - AllReduce the Parameters' gradient. - """ - if not self._is_data_parallel_mode(): - return - - for param in self._layers.parameters(): - # NOTE(zcd): The grad_ivar maybe no generated. - if param.trainable and param._ivar._grad_ivar(): - g_var = framework.Variable( - block=self._helper.main_program.current_block(), - name=param._ivar._grad_name(), - stop_gradient=True, - ivar=param._ivar._grad_ivar()) - collective._allreduce(g_var, g_var, sync_mode=True) - - def _is_data_parallel_mode(self): - return self._strategy.nranks > 1 + def _collective_hook(iop): + op = framework._dygraph_tracer()._ops[iop._trace_id] + for k, v in six.iteritems(op.inputs): + for ivar in v: + g = ivar._grad_ivar() + if g: + g_var = framework.Variable( + block=self._helper.main_program.current_block(), + name=ivar._grad_name(), + stop_gradient=True, + ivar=g) + collective._allreduce(g_var, g_var, sync_mode=True) + + outs = self._layers(*inputs, **kwargs) + for _, op in six.iteritems(framework._dygraph_tracer()._ops): + # hook collective ops + op.iop.register_backward_hooks(_collective_hook, front=True) + return outs diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py index aea95f2f530..9d2cbb4f03f 100644 --- a/python/paddle/fluid/dygraph/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -24,7 +24,9 @@ __all__ = ['Tracer'] def release_op(op): - del framework._dygraph_tracer()._ops[op._trace_id] + del framework._dygraph_tracer()._ops[op._trace_id].inputs + del framework._dygraph_tracer()._ops[op._trace_id].outputs + del framework._dygraph_tracer()._ops[op._trace_id].backward_refs class Tracer(core.Tracer): @@ -47,23 +49,74 @@ class Tracer(core.Tracer): return list((item for name, item in six.iteritems(self._vars) if isinstance(item, framework.Parameter))) - def _clear_ops(self): - self._ops = defaultdict() - self._trace_id = 0 - def trace_op(self, op, inputs, outputs, stop_gradient=False): + # TODO(minqiyang): remove this line after we take apart all + # backward grads and forward variables + if self._train_mode: + op.inputs = inputs + inps = defaultdict(list) + for k, vars in six.iteritems(inputs): + if isinstance(vars, framework.Variable): + inps[k].append(vars._ivar) + elif isinstance(vars, list) or isinstance(vars, tuple): + for var in vars: + inps[k].append(var._ivar) + + op.outputs = outputs + outs = defaultdict(list) + for k, vars in six.iteritems(outputs): + if isinstance(vars, framework.Variable): + outs[k].append(vars._ivar) + elif isinstance(vars, list) or isinstance(vars, tuple): + for var in vars: + outs[k].append(var._ivar) + else: + inps = 
defaultdict(list) + for k, vars in six.iteritems(inputs): + if isinstance(vars, framework.Variable): + op.previous_ops.append(vars.op) + inps[k].append(vars._ivar) + elif isinstance(vars, list) or isinstance(vars, tuple): + for var in vars: + op.previous_ops.append(var.op) + inps[k].append(var._ivar) + + op.outputs = outputs + outs = defaultdict(list) + for k, vars in six.iteritems(outputs): + if isinstance(vars, framework.Variable): + vars.op = op + outs[k].append(vars._ivar) + elif isinstance(vars, list) or isinstance(vars, tuple): + for var in vars: + var.op = op + outs[k].append(var._ivar) + # record op's trace id op.iop._trace_id = self._trace_id - self.trace(op.iop, inputs, outputs, op.attrs, - framework._current_expected_place(), stop_gradient) + backward_refs = self.trace(op.iop, inps, outs, op.attrs, + framework._current_expected_place(), + stop_gradient) if not stop_gradient and self._train_mode: self._trace_id += 1 self._ops[op.iop._trace_id] = op # register backward hooks and variables if needed - op.iop.register_backward_hooks(release_op) + if len(backward_refs) > 0: + op.iop.register_backward_hooks(release_op) + + # TODO(minqiyang): remove all inputs and outputs after separate + # var and grad + op.backward_refs = defaultdict(list) + for k, v in six.iteritems(inputs): + if k in backward_refs: + op.backward_refs[k] = inputs[k] + + for k, v in six.iteritems(outputs): + if k in backward_refs: + op.backward_refs[k] = outputs[k] def train_mode(self): self._train_mode = True diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py index bde828a6691..c84dd4bc475 100644 --- a/python/paddle/fluid/evaluator.py +++ b/python/paddle/fluid/evaluator.py @@ -22,7 +22,6 @@ from .framework import Program, Variable, program_guard from . 
import unique_name from .layer_helper import LayerHelper from .initializer import Constant -from .layers import detection __all__ = [ 'ChunkEvaluator', @@ -375,7 +374,7 @@ class DetectionMAP(Evaluator): label = layers.concat([gt_label, gt_box], axis=1) # calculate mean average precision (mAP) of current mini-batch - map = detection.detection_map( + map = layers.detection_map( input, label, class_num, @@ -397,7 +396,7 @@ class DetectionMAP(Evaluator): self.has_state = var # calculate accumulative mAP - accum_map = detection.detection_map( + accum_map = layers.detection_map( input, label, class_num, diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index bf9754ce2bf..063b65e8eef 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -247,10 +247,6 @@ def _to_name_str(var): raise TypeError(str(var) + " should be Variable or str") -def _get_strong_program_cache_key(program, feed, fetch_list): - return str(id(program)) + _get_program_cache_key(feed, fetch_list) - - def _get_program_cache_key(feed, fetch_list): feed_var_names = list(feed.keys()) fetch_var_names = list(map(_to_name_str, fetch_list)) @@ -360,38 +356,17 @@ class Executor(object): def __init__(self, place): self.place = place self.program_caches = dict() - self.ctx_caches = dict() - self.scope_caches = dict() - self.var_caches = dict() p = core.Place() p.set_place(self.place) self._default_executor = core.Executor(p) self._closed = False - def _get_var_cache(self, program_cache_key): - return self.var_caches.get(program_cache_key, None) - - def _get_scope_cache(self, program_cache_key): - return self.scope_caches.get(program_cache_key, None) - - def _get_ctx_cache(self, program_cache_key): - return self.ctx_caches.get(program_cache_key, None) - def _get_program_cache(self, program_cache_key): return self.program_caches.get(program_cache_key, None) def _add_program_cache(self, program_cache_key, program): self.program_caches[program_cache_key] = program - def _add_ctx_cache(self, ctx_cache_key, ctx): - self.ctx_caches[ctx_cache_key] = ctx - - def _add_scope_cache(self, scope_cache_key, scope): - self.scope_caches[scope_cache_key] = scope - - def _add_var_cache(self, var_cache_key, var): - self.var_caches[var_cache_key] = var - def _add_feed_fetch_ops(self, program, feed, fetch_list, feed_var_name, fetch_var_name): tmp_program = program.clone() @@ -670,7 +645,6 @@ class Executor(object): # performance. # TODO(panyx0718): executor should be able to run graph. assert program._program, "CompiledProgram is compiled from graph, can only run with_data_parallel." - # use_program_cache is not valid with CompiledProgram return self._run( program._program, self._default_executor, @@ -680,7 +654,7 @@ class Executor(object): fetch_var_name=fetch_var_name, scope=scope, return_numpy=return_numpy, - use_program_cache=False) + use_program_cache=use_program_cache) def _run(self, program, exe, feed, fetch_list, feed_var_name, fetch_var_name, scope, return_numpy, use_program_cache): @@ -703,12 +677,9 @@ class Executor(object): "Executor requires Program as its Parameter. 
But you passed in %s" % (type(program))) + cache_key = _get_program_cache_key(feed, fetch_list) if use_program_cache: - cache_key = _get_strong_program_cache_key(program, feed, fetch_list) cached_program = self._get_program_cache(cache_key) - cached_ctx = self._get_ctx_cache(cache_key) - cached_scope = self._get_scope_cache(cache_key) - cached_var = self._get_var_cache(cache_key) if cached_program is None: cached_program = self._add_feed_fetch_ops( program=program, @@ -717,25 +688,9 @@ class Executor(object): feed_var_name=feed_var_name, fetch_var_name=fetch_var_name) self._add_program_cache(cache_key, cached_program) - fetch_list_str = list(map(_to_name_str, fetch_list)) - cached_ctx = self._default_executor.prepare_ctx_cache( - cached_program.desc, 0, fetch_list_str, False) - cached_var = self._default_executor.create_variables( - cached_program.desc, scope, 0) - # currently, we cache program, vars, sub_scope here - # we suppose that in a life cycle of training, a user - # will not create many programs. So, here the basic - # rule of caching is to cache all unseen (program, var, scope) - # when a user use use_program_cache. - cached_scope = scope.new_scope() - self._add_ctx_cache(cache_key, cached_ctx) - self._add_var_cache(cache_key, cached_var) - self._add_scope_cache(cache_key, cached_scope) program = cached_program - ctx = cached_ctx - scope = cached_scope - var = cached_var else: + self.program_caches.pop(cache_key, None) program = self._add_feed_fetch_ops( program=program, feed=feed, @@ -744,10 +699,7 @@ class Executor(object): fetch_var_name=fetch_var_name) self._feed_data(program, feed, feed_var_name, scope) - if not use_program_cache: - exe.run(program.desc, scope, 0, True, True, fetch_var_name) - else: - exe.run_cached_prepared_ctx(ctx, scope, False, False, False) + exe.run(program.desc, scope, 0, True, True, fetch_var_name) outs = self._fetch_data(fetch_list, fetch_var_name, scope) if return_numpy: outs = as_numpy(outs) @@ -781,23 +733,12 @@ class Executor(object): assert len(fetch_list) == len(fetch_info) compiled = isinstance(program, compiler.CompiledProgram) if not compiled: - # TODO: Need a better way to distinguish and specify different execution mode - if program._pipeline_opt: - trainer = TrainerFactory()._create_trainer( - program._pipeline_opt) - else: - trainer = TrainerFactory()._create_trainer(program._fleet_opt) + trainer = TrainerFactory()._create_trainer(program._fleet_opt) trainer._set_program(program) else: - if program._pipeline_opt: - trainer = TrainerFactory()._create_trainer( - program.program._pipeline_opt) - else: - trainer = TrainerFactory()._create_trainer( - program.program._fleet_opt) + trainer = TrainerFactory()._create_trainer( + program.program._fleet_opt) trainer._set_program(program.program) - - # The following thread_num-determined logic will be deprecated if thread <= 0: if dataset.thread_num <= 0: raise RuntimeError( @@ -807,26 +748,6 @@ class Executor(object): trainer._set_thread(dataset.thread_num) else: trainer._set_thread(thread) - - # Adjust the reader size for small file num - if program._pipeline_opt: - dataset.set_thread(thread * - program._pipeline_opt["concurrency_list"][0]) - file_size = len(dataset.dataset.get_filelist()) - if file_size < thread: - thread = file_size - print( - "Pipeline: setting the pipeline num to %d is enough because there are only %d files" - % (file_size, file_size)) - if file_size < thread * program._pipeline_opt["concurrency_list"][ - 0]: - print( - "Pipeline: setting the 1st element in concurrency_list to 
%d is enough because there are only %d files" - % (file_size / thread, file_size)) - program._pipeline_opt["concurrency_list"][ - 0] = file_size / thread - dataset.set_thread( - program._pipeline_opt["concurrency_list"][0] * thread) trainer._set_debug(debug) trainer._set_fetch_var_and_info(fetch_list, fetch_info, print_period) return scope, trainer @@ -901,7 +822,8 @@ class Executor(object): trainer._set_infer(True) trainer._gen_trainer_desc() dataset._prepare_to_run() - self._dump_debug_info(program=program, trainer=trainer) + if debug: + self._dump_debug_info(program=program, trainer=trainer) self._default_executor.run_from_dataset(program.desc, scope, dataset.dataset, trainer._desc()) @@ -980,7 +902,8 @@ class Executor(object): print_period=print_period) trainer._gen_trainer_desc() dataset._prepare_to_run() - self._dump_debug_info(program=program, trainer=trainer) + if debug: + self._dump_debug_info(program=program, trainer=trainer) self._default_executor.run_from_dataset(program.desc, scope, dataset.dataset, trainer._desc()) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 7e89c4a36ec..806499ca2e8 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -27,11 +27,35 @@ import six import numpy as np import subprocess import multiprocessing -import sys + from .. import compat as cpt from .proto import framework_pb2 - -from . import core +try: + if os.name == 'nt': + import sys + third_lib_path = os.path.abspath(os.path.dirname( + __file__)) + os.sep + '..' + os.sep + 'libs' + os.environ['path'] += ';' + third_lib_path + sys.path.append(third_lib_path) + + from . import core +except ImportError as e: + if os.name == 'nt': + executable_path = os.path.abspath(os.path.dirname(sys.executable)) + raise ImportError( + """NOTE: You may need to run \"set PATH=%s;%%PATH%%\" + if you encounters \"DLL load failed\" errors. If you have python + installed in other directory, replace \"%s\" with your own + directory. The original error is: \n %s""" % + (executable_path, executable_path, cpt.get_exception_message(e))) + else: + raise ImportError( + """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" + if you encounters \"libmkldnn.so not found\" errors. If you have python + installed in other directory, replace \"/usr/local/lib\" with your own + directory. The original error is: \n""" + cpt.get_exception_message(e)) +except Exception as e: + raise e from . import unique_name __all__ = [ @@ -82,24 +106,7 @@ def _current_expected_place(): def _cpu_num(): - if "CPU_NUM" not in os.environ.keys(): - sys.stderr.write( - 'The CPU_NUM is not specified, you should set CPU_NUM in ' - 'the environment variable list, i.e export CPU_NUM=1. CPU_NUM ' - 'indicates that how many CPUPlace are used in the current task.\n' - '!!! 
The default number of CPUPlaces is 1.\n\n') - os.environ['CPU_NUM'] = str(1) - cpu_num = os.environ.get('CPU_NUM') - return int(cpu_num) - - -def _cuda_ids(): - gpus_env = os.getenv("FLAGS_selected_gpus") - if gpus_env: - device_ids = [int(s) for s in gpus_env.split(",")] - else: - device_ids = six.moves.range(core.get_cuda_device_count()) - return device_ids + return int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) def cuda_places(device_ids=None): @@ -133,7 +140,11 @@ def cuda_places(device_ids=None): assert core.is_compiled_with_cuda(), \ "Not compiled with CUDA" if device_ids is None: - device_ids = _cuda_ids() + gpus_env = os.getenv("FLAGS_selected_gpus") + if gpus_env: + device_ids = [int(s) for s in gpus_env.split(",")] + else: + device_ids = six.moves.range(core.get_cuda_device_count()) elif not isinstance(device_ids, (list, tuple)): device_ids = [device_ids] return [core.CUDAPlace(dev_id) for dev_id in device_ids] @@ -518,14 +529,8 @@ class Variable(object): new_ivar = self._ivar._copy_to(core.CPUPlace(), True) return np.array(new_ivar.value().get_tensor()) - def backward(self, backward_strategy=None): - from .dygraph import BackwardStrategy - if backward_strategy is None: - backward_strategy = BackwardStrategy() - backward_strategy.sort_sum_gradient = False - - self._ivar._run_backward(backward_strategy) - _dygraph_tracer()._clear_ops() + def backward(self): + self._ivar._run_backward() def gradient(self): new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True) @@ -553,9 +558,8 @@ class Variable(object): """ if in_dygraph_mode(): # TODO(panyx0718): add more dygraph debug info. - return 'name %s, dtype: %s shape: %s %s' % ( - self.name, self.dtype, self.shape, - str(self._ivar.value().get_tensor())) + return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype, + self.shape) assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -643,8 +647,6 @@ class Variable(object): @property def lod_level(self): # TODO(minqiyang): Support lod_level in dygraph mode - if in_dygraph_mode(): - raise Exception("Dygraph model DO NOT supprt lod") return self.desc.lod_level() @property @@ -756,8 +758,10 @@ class Variable(object): def _cloneVar(self, copy=False): if not copy: return self.block.create_var( - name=unique_name.generate_with_ignorable_key(self.name), - dtype=self.dtype) + name=unique_name.generate(".".join(self.name)), + dtype=self.dtype, + persistable=self.persistable, + stop_gradient=self.stop_gradient, ) else: return self @@ -988,12 +992,12 @@ class Operator(object): if op_maker.kOpRoleAttrName() not in op_attrs: op_attrs[op_maker.kOpRoleAttrName( - )] = self.block.program._op_role + )] = self.block.program.op_role role_var_name = op_maker.kOpRoleVarAttrName() if len(self.block.program. 
- _op_role_var) != 0 and role_var_name not in op_attrs: - op_attrs[role_var_name] = self.block.program._op_role_var + op_role_var) != 0 and role_var_name not in op_attrs: + op_attrs[role_var_name] = self.block.program.op_role_var if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] @@ -1002,7 +1006,7 @@ class Operator(object): return if type is None: raise ValueError( - "`type` to initialized an Operator can not be None.") + "`type` to initilized an Operator can not be None.") else: callstack_var_name = op_maker.kOpCreationCallstackAttrName() op_attrs[callstack_var_name] = list( @@ -1025,6 +1029,7 @@ class Operator(object): found = find_name(inputs, in_proto.name) assert found or in_proto.dispensable, "Input {} not found".format( in_proto.name) + if found: in_args = inputs[in_proto.name] if not isinstance(in_args, list): @@ -1034,17 +1039,13 @@ class Operator(object): "Input %s expects only one input, but %d are given." % (in_proto.name, len(in_args))) in_arg_names = [] - for index, arg in enumerate(in_args): + for arg in in_args: if isinstance(arg, six.string_types): in_arg_names.append(arg) elif isinstance(arg, six.binary_type): in_arg_names.append(arg.decode()) - elif isinstance(arg, Variable): - in_arg_names.append(cpt.to_text(arg.name)) else: - raise ValueError( - "not suprt args type , should be[ string_type, binary_type, Varibale]" - ) + in_arg_names.append(cpt.to_text(arg.name)) self.desc.set_input(in_proto.name, in_arg_names) else: self.desc.set_input(in_proto.name, []) @@ -1379,9 +1380,7 @@ class Block(object): Examples: .. code-block:: python - import paddle.fluid as fluid - - cur_program = fluid.Program() + cur_program = Program() cur_block = cur_program.current_block() var = cur_block.create_var(name="X", shape=[-1, 23, 48], @@ -1663,22 +1662,13 @@ class Block(object): Operator: the append Operator. """ if in_dygraph_mode(): - attrs = kwargs.get("attrs", {}) - if _dygraph_tracer_._train_mode == False: - # eval mode - if ('trainable_statistics' not in attrs - ) or not attrs['trainable_statistics']: - attrs['is_test'] = True - else: - attrs['is_test'] = False - op = Operator( block=self, desc=None, type=kwargs.get("type", None), inputs=None, outputs=None, - attrs=attrs) + attrs=kwargs.get("attrs", {})) # record ops in tracer rather than blocks # @@ -2716,19 +2706,12 @@ class Program(object): A empty program. Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - - main_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(main_program=main_program, startup_program=startup_program): - x = fluid.layers.data(name="x", shape=[-1, 784], dtype='float32') - y = fluid.layers.data(name="y", shape=[-1, 1], dtype='int32') - z = fluid.layers.fc(name="fc", input=x, size=10, act="relu") - - print("main program is: {}".format(main_program)) - print("start up program is: {}".format(startup_program)) + >>> main_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> with fluid.program_guard(main_program=main_program, startup_program=startup_program): + >>> fluid.layers.data(name="x", shape=[-1, 784], dtype='float32') + >>> fluid.layers.data(name="y", shape=[-1, 1], dtype='int32') + >>> fluid.layers.fc(name="fc", shape=[10], dtype='float32', act="relu") """ @@ -2738,7 +2721,7 @@ class Program(object): self.current_block_idx = 0 self._seed = 0 self._current_role = core.op_proto_and_checker_maker.OpRole.Forward - self.__op_role_var = [] + self._op_role_var = [] # for distribute training # _is_distributed = True if under distributed training @@ -2758,10 +2741,6 @@ class Program(object): # use Deep gradient comrepssion or not self._enable_dgc = False - self._nccl_comm_num = 1 - self._use_hierarchical_allreduce = False - self._hierarchical_allreduce_inter_nranks = 0 - self._hierarchical_allreduce_exter_nranks = 0 # @deprecated(the python memory optimize transpiler is deprecated) # whether the program is optimized by memory_optimize_transpiler @@ -2772,12 +2751,6 @@ class Program(object): self._fleet_opt = None self._program_config = None - # assigned if this program has been parsed by a pipeline optimizer - self._pipeline_opt = None - - # appending gradients times - self._appending_grad_times = 0 - @property def _is_mem_optimized(self): # if the program is optimized, operator input/outputs @@ -2789,7 +2762,7 @@ class Program(object): self.__is_mem_optimized = target @property - def _op_role(self): + def op_role(self): """ The operator role. In a enum {Forward, Backward, Optimize}. @@ -2798,27 +2771,31 @@ class Program(object): For example, the forward operator should be executed on every device. The backward operator should be executed on every device and the - parameter gradient of backward (use :code:`_op_role_var` to get this + parameter gradient of backward (use :code:`op_role_var` to get this variable) operator should be merged to one device. The optimization operators should be executed on only one device and broadcast the optimization result, i.e., the new parameter, to every other device. """ return self._current_role - @_op_role.setter - def _op_role(self, role): + @op_role.setter + def op_role(self, role): self._current_role = role @property - def _op_role_var(self): + def op_role_var(self): """ - The auxiliary variables for :code:`_op_role` property. + The auxiliary variables for :code:`op_role` property. - See Also: :code:`Program._op_role`'s documentation for details. + See Also: :code:`Program.op_role`'s documentation for details. Notes: This is a very low-level API. Users should not use it directly. 
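The role guards defined just below (`_backward_role_guard`, `_optimized_guard`) all share one save/patch/restore shape over the two attributes this hunk renames. A self-contained sketch of that shape, using a toy `Prog` class rather than the real `Program`:

.. code-block:: python

    from contextlib import contextmanager

    class Prog(object):
        def __init__(self):
            self.op_role = "Forward"
            self.op_role_var = []

    @contextmanager
    def optimized_guard(prog, param_and_grads):
        # Save both pieces of role state, switch to Optimize, restore on exit.
        tmp_role, tmp_var = prog.op_role, prog.op_role_var
        prog.op_role = "Optimize"
        prog.op_role_var = list(param_and_grads)
        try:
            yield
        finally:
            prog.op_role = tmp_role
            prog.op_role_var = tmp_var

    p = Prog()
    with optimized_guard(p, ["w", "w@GRAD"]):
        assert p.op_role == "Optimize"
    assert p.op_role == "Forward" and p.op_role_var == []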
""" - return self.__op_role_var + return self._op_role_var + + @op_role_var.setter + def set_op_role_var(self, var_name): + self._op_role_var = [var_name] @contextlib.contextmanager def _backward_role_guard(self): @@ -2847,16 +2824,16 @@ class Program(object): >>> p = p - 0.001 * g """ tmp_role = self._current_role - tmp_var = self.__op_role_var + tmp_var = self._op_role_var OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.Optimize - self.__op_role_var = [ + self._op_role_var = [ var.name if isinstance(var, Variable) else var for var in param_and_grads ] yield - self.__op_role_var = tmp_var + self._op_role_var = tmp_var self._current_role = tmp_role @signature_safe_contextmanager @@ -2881,16 +2858,16 @@ class Program(object): """ tmp_role = self._current_role - tmp_var = self.__op_role_var + tmp_var = self._op_role_var OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.LRSched if is_with_opt: self._current_role = int(OpRole.LRSched) | int(OpRole.Optimize) # TODO(typhoonzero): how to set target learning rate var - self.__op_role_var = [] + self._op_role_var = [] yield - self.__op_role_var = tmp_var + self._op_role_var = tmp_var self._current_role = tmp_role def __str__(self): @@ -2924,15 +2901,6 @@ class Program(object): ValueError: If any of required fields is not set and throw_on_error is True. - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - prog = fluid.default_main_program() - prog_string = prog.to_string(throw_on_error=True, with_details=False) - print(prog_string) - """ assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -2972,20 +2940,13 @@ class Program(object): * Set for_test to False when we want to clone the program for training. * Set for_test to True when we want to clone the program for testing. - We will not do any prune on program here, So if you just want an - forward program for testing, please use :code:`clone` before using - :code:`Opimizer.minimize` - - Notes: - 1. :code:`Program.clone()` method DOES NOT clone :code:`py_reader`. - 2. This API DOES NOT prune any operator. Use - :code:`clone(for_test=True)` before backward and optimization please. E.g. - .. code-block:: python + Notes: This API DOES NOT prune any operator. Use + :code:`clone(for_test=True)` before backward and optimization please. e.g. - test_program = fluid.default_main_program().clone(for_test=True) - optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) - optimizer.minimize() + >>> test_program = fluid.default_main_program().clone(for_test=True) + >>> optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + >>> optimizer.minimize() Args: for_test(bool): True if change the :code:`is_test` attribute of @@ -2996,107 +2957,55 @@ class Program(object): Examples: - Notes: The Program Descs' order maybe different after :code:`clone` and - this will not affect your training or testing progress. In the following - example we give you an simple method :code:`print_prog(program)` to - print Program Descs inorder to make sure you have same print result - after :code:`clone`: - - .. 
code-block:: python - - import paddle.fluid as fluid - import six - - - def print_prog(prog): - for name, value in sorted(six.iteritems(prog.block(0).vars)): - print(value) - for op in prog.block(0).ops: - print("op type is {}".format(op.type)) - print("op inputs are {}".format(op.input_arg_names)) - print("op outputs are {}".format(op.output_arg_names)) - for key, value in sorted(six.iteritems(op.all_attrs())): - if key not in ['op_callstack', 'op_role_var']: - print(" [ attrs: {}: {} ]".format(key, value)) - - - 1. To clone a test program, the sample code is: - .. code-block:: python - - import paddle.fluid as fluid - import six - - def print_prog(prog): - for name, value in sorted(six.iteritems(prog.block(0).vars)): - print(value) - for op in prog.block(0).ops: - print("op type is {}".format(op.type)) - print("op inputs are {}".format(op.input_arg_names)) - print("op outputs are {}".format(op.output_arg_names)) - for key, value in sorted(six.iteritems(op.all_attrs())): - if key not in ['op_callstack', 'op_role_var']: - print(" [ attrs: {}: {} ]".format(key, value)) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - with fluid.unique_name.guard(): - img = fluid.layers.data(name='image', shape=[784]) - hidden = fluid.layers.fc(input=img, size=200, act='relu') - hidden = fluid.layers.dropout(hidden, dropout_prob=0.5) - loss = fluid.layers.cross_entropy( - input=fluid.layers.fc(hidden, size=10, act='softmax'), - label=fluid.layers.data(name='label', shape=[1], dtype='int64')) - avg_loss = fluid.layers.mean(loss) - test_program = train_program.clone(for_test=False) - print_prog(test_program) - with fluid.program_guard(train_program, startup_program): - with fluid.unique_name.guard(): - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(avg_loss) - - - 2. The clone method can be avoid if you create program for training and program for testing individually. - .. code-block:: python - - import paddle.fluid as fluid - import six - - def print_prog(prog): - for name, value in sorted(six.iteritems(prog.block(0).vars)): - print(value) - for op in prog.block(0).ops: - print("op type is {}".format(op.type)) - print("op inputs are {}".format(op.input_arg_names)) - print("op outputs are {}".format(op.output_arg_names)) - for key, value in sorted(six.iteritems(op.all_attrs())): - if key not in ['op_callstack', 'op_role_var']: - print(" [ attrs: {}: {} ]".format(key, value)) - def network(is_test): - img = fluid.layers.data(name='image', shape=[784]) - hidden = fluid.layers.fc(input=img, size=200, act='relu') - hidden = fluid.layers.dropout(hidden, dropout_prob=0.5) - loss = fluid.layers.cross_entropy( - input=fluid.layers.fc(hidden, size=10, act='softmax'), - label=fluid.layers.data(name='label', shape=[1], dtype='int64')) - avg_loss = fluid.layers.mean(loss) - return avg_loss - - - train_program_2 = fluid.Program() - startup_program_2 = fluid.Program() - test_program_2 = fluid.Program() - with fluid.program_guard(train_program_2, startup_program_2): - with fluid.unique_name.guard(): - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(avg_loss) - # the test startup program is not used. - with fluid.program_guard(test_program_2, fluid.Program()): - with fluid.unique_name.guard(): - loss = network(is_test=True) - print(test_program_2) - - The two code snippets above will generate and print same programs. + 1. 
To clone a test program, the sample code is: + + >>> import paddle.fluid as fluid + >>> train_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> with fluid.program_guard(train_program, startup_program): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> hidden = fluid.layers.fc(input=img, size=200, act='relu') + >>> hidden = fluid.layers.dropout(hidden, dropout_prob=0.5) + >>> loss = fluid.layers.cross_entropy( + >>> input=fluid.layers.fc(hidden, size=10, act='softmax'), + >>> label=fluid.layers.data(name='label', shape=[1], dtype='int64')) + >>> + >>> test_program = train_program.clone(for_test=True) + >>> + >>> sgd = fluid.optimizer.SGD(learning_rate=1e-3) + >>> with fluid.program_guard(train_program, startup_program): + >>> sgd.minimize(loss) + + 2. The :code:`clone` method can be avoid if you create program for + training and program for testing individually. + + >>> import paddle.fluid as fluid + >>> + >>> def network(is_test): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> hidden = fluid.layers.fc(input=img, size=200, act='relu') + >>> hidden = fluid.layers.dropout(hidden, dropout_prob=0.5, is_test=is_test) + >>> loss = fluid.layers.cross_entropy( + >>> input=fluid.layers.fc(hidden, size=10, act='softmax'), + >>> label=fluid.layers.data(name='label', shape=[1], dtype='int64')) + >>> return loss + >>> + >>> train_program = fluid.Program() + >>> startup_program = fluid.Program() + >>> test_program = fluid.Program() + >>> + >>> with fluid.program_guard(train_program, startup_program): + >>> with fluid.unique_name.guard(): + >>> loss = network(is_test=False) + >>> sgd = fluid.optimizer.SGD(learning_rate=1e-3) + >>> sgd.minimize(loss) + >>> + >>> # the test startup program is not used. + >>> with fluid.program_guard(test_program, fluid.Program()): + >>> with fluid.unique_name.guard(): + >>> loss = network(is_test=True) + + The two code snippets above will generate same programs. """ if for_test: p = self._inference_optimize(prune_read_op=False) @@ -3110,8 +3019,7 @@ class Program(object): ] p._current_role = self._current_role - p.__op_role_var = self.__op_role_var - p._appending_grad_times = self._appending_grad_times + p._op_role_var = self._op_role_var p._sync_with_cpp() @@ -3267,17 +3175,6 @@ class Program(object): the random seed from random device. Notes: It must be set before the operators have been added. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - prog = fluid.default_main_program() - random_seed = prog.random_seed - print(random_seed) - prog.random_seed = 1 - print(prog.random_seed) """ return self._seed @@ -3285,15 +3182,6 @@ class Program(object): def num_blocks(self): """ The number of blocks in this program. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - prog = fluid.default_main_program() - num_blocks = prog.num_blocks - print(num_blocks) """ return self.desc.num_blocks() @@ -3309,15 +3197,6 @@ class Program(object): def global_block(self): """ Get the first block of this program. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - prog = fluid.default_main_program() - gb_block = prog.global_block() - print(gb_block) """ return self.blocks[0] @@ -3329,15 +3208,6 @@ class Program(object): Returns: Block: The :code:`index` block - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - - prog = fluid.default_main_program() - block_0 = prog.block(0) - print(block_0) """ return self.blocks[index] @@ -3345,15 +3215,6 @@ class Program(object): """ Get the current block. The :code:`current` block is the block to append operators. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - prog = fluid.default_main_program() - current_blk = prog.current_block() - print(current_blk) """ return self.blocks[self.current_block_idx] @@ -3472,17 +3333,6 @@ class Program(object): Returns: iterable: The generator will yield every variable in this program. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - prog = fluid.default_main_program() - img = fluid.layers.data(name='img', shape=[1,28,28], dtype='float32') - label = fluid.layers.data(name='label', shape=[128,1], dtype='int64') - for var in prog.list_vars(): - print(var) """ for each_block in self.blocks: for each_var in list(each_block.vars.values()): @@ -3551,15 +3401,6 @@ class Parameter(Variable): Returns(str): The debug string. - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - prog = fluid.default_main_program() - rlt = fluid.layers.data("fake_data", shape=[1,1], dtype='float32') - debug_str = prog.to_string(throw_on_error=True, with_details=False) - print(debug_str) """ assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) @@ -3596,21 +3437,6 @@ def default_startup_program(): Returns: Program: startup program - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - main_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(main_program=main_program, startup_program=startup_program): - x = fluid.layers.data(name="x", shape=[-1, 784], dtype='float32') - y = fluid.layers.data(name="y", shape=[-1, 1], dtype='int32') - z = fluid.layers.fc(name="fc", input=x, size=10, act="relu") - - print("main program is: {}".format(fluid.default_main_program())) - print("start up program is: {}".format(fluid.default_startup_program())) """ return _startup_program_ @@ -3629,35 +3455,6 @@ def default_main_program(): Returns: Program: main program - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - # Sample Network: - data = fluid.layers.data(name='image', shape=[3, 224, 224], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - - conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None) - bn1 = fluid.layers.batch_norm(conv1, act='relu') - pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2) - conv2 = fluid.layers.conv2d(pool1, 16, 5, 1, act=None) - bn2 = fluid.layers.batch_norm(conv2, act='relu') - pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2) - - fc1 = fluid.layers.fc(pool2, size=50, act='relu') - fc2 = fluid.layers.fc(fc1, size=102, act='softmax') - - loss = fluid.layers.cross_entropy(input=fc2, label=label) - loss = fluid.layers.mean(loss) - opt = fluid.optimizer.Momentum( - learning_rate=0.1, - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - opt.minimize(loss) - - print(fluid.default_main_program()) """ return _main_program_ @@ -3696,8 +3493,8 @@ def switch_startup_program(program): @signature_safe_contextmanager def program_guard(main_program, startup_program=None): """ - Change the global main program and startup program with `"with"` statement. 
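`program_guard`, whose docstring is being reworded in this hunk, is the same swap-and-restore idiom applied to module-level state. A stand-alone model of it, with a toy module global rather than fluid's actual program registry:

.. code-block:: python

    from contextlib import contextmanager

    _main_program = "default_main"

    @contextmanager
    def program_guard(new_main):
        # Swap the module-level "current" program, restore it afterwards.
        global _main_program
        old, _main_program = _main_program, new_main
        try:
            yield
        finally:
            _main_program = old

    with program_guard("my_program"):
        assert _main_program == "my_program"
    assert _main_program == "default_main"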
- Layer functions in the Python `"with"` block will append operators and + Change the global main program and startup program with `with` statement. + Layer functions in the Python `with` block will append operators and variables to the new main programs. Examples: @@ -3725,9 +3522,9 @@ def program_guard(main_program, startup_program=None): data = fluid.layers.data(name='image', shape=[784, 784], dtype='float32') Args: - main_program(Program): New main program inside `"with"` statement. - startup_program(Program): New startup program inside `"with"` statement. - None means not changing startup program. + main_program(Program): New main program inside `with` statement. + startup_program(Program): New startup program inside `with` statement. + None means do not change startup program. """ if not isinstance(main_program, Program): raise TypeError("main_program should be Program") diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py index acabec3e82a..f2f72b0f505 100644 --- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py +++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py @@ -16,21 +16,22 @@ from __future__ import print_function import abc -import paddle.fluid as fluid -from paddle.fluid.executor import Executor +from enum import Enum + from paddle.fluid.optimizer import SGD +from paddle.fluid.executor import Executor -from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker -from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase -from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedRoleMaker +from role_maker import RoleMakerBase +from role_maker import MPISymetricRoleMaker +from role_maker import UserDefinedRoleMaker -class Mode: +class Mode(Enum): """ There are various mode for fleet, each of them is designed for different model. """ - TRANSPILER = 1 - PSLIB = 2 + TRANSPILER = 1, + PSLIB = 2, COLLECTIVE = 3 @@ -47,6 +48,7 @@ class Fleet(object): __metaclass__ = abc.ABCMeta def __init__(self, mode): + assert isinstance(mode, Mode) self._is_initialized = False self._mode = mode self._optimizer = None @@ -77,9 +79,9 @@ class Fleet(object): Get current total worker number. Returns: - int: worker numbers + int: worker number """ - return self._role_maker.worker_num() + return len(self._role_maker.get_trainer_endpoints()) def is_worker(self): """ @@ -171,25 +173,36 @@ class Fleet(object): end += length return files[start:end] - def init(self, role_maker=None): + def init(self, executor, role_maker=None): """ should be called only once in user's python scripts, init() will initialize RoleMaker which is used for identifying current node's role, e.g. worker, server, etc. Args: + executor(Executor): The executor to run fleet. role_maker(RoleMakerBase): subclass of RoleMakerBase. 
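As the method body that follows shows, the reworked `init` is mostly early type validation plus one-time role generation. A reduced stand-alone model of that contract, with stand-in classes rather than the real fleet types:

.. code-block:: python

    class Executor(object):
        pass

    class RoleMakerBase(object):
        def generate_role(self):
            self._role_is_generated = True

    class MPISymetricRoleMaker(RoleMakerBase):
        pass

    class UserDefinedRoleMaker(RoleMakerBase):
        pass

    def fleet_init(executor, role_maker=None):
        # Reject wrong types early; only the MPI role maker discovers
        # its own role via generate_role().
        if not isinstance(executor, Executor):
            raise ValueError("executor must be an instance of Executor")
        if isinstance(role_maker, MPISymetricRoleMaker):
            role_maker.generate_role()
        elif not isinstance(role_maker, UserDefinedRoleMaker):
            raise ValueError("unsupported role_maker")
        return role_maker

    fleet_init(Executor(), MPISymetricRoleMaker())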
Returns: None """ - self._executor = Executor(fluid.CPUPlace()) + if not isinstance(executor, Executor): + raise ValueError("executor must be an instance of Executor") if role_maker and not isinstance(role_maker, RoleMakerBase): raise ValueError("role_maker must be an instance of RoleMakerBase") - self._role_maker = role_maker - self._role_maker.generate_role() + if isinstance(role_maker, MPISymetricRoleMaker): + self._role_maker = role_maker + self._role_maker.generate_role() + + elif isinstance(role_maker, UserDefinedRoleMaker): + self._role_maker = role_maker + + else: + raise ValueError( + "role_maker must be an instance of UserDefinedRoleMaker/MPISymetricRoleMaker" + ) self._is_initialized = True @@ -202,20 +215,23 @@ class Fleet(object): pass @abc.abstractmethod - def run_server(self): + def run_server(self, ): pass @abc.abstractmethod def stop_worker(self): pass + @abc.abstractmethod + def stop(self): + pass + @abc.abstractmethod def distributed_optimizer(self, optimizer, strategy=None): pass @abc.abstractmethod def save_inference_model(self, - executor, dirname, feeded_var_names, target_vars, @@ -224,7 +240,7 @@ class Fleet(object): pass @abc.abstractmethod - def save_persistables(self, executor, dirname, main_program=None): + def save_persistables(self, dirname, main_program=None): pass diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index ae6768f8f56..5371252213b 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -13,15 +13,15 @@ # limitations under the License. from __future__ import print_function +from enum import Enum __all__ = [ - 'Role', 'RoleMakerBase', 'MPISymetricRoleMaker', 'UserDefinedRoleMaker', - 'UserDefinedCollectiveRoleMaker', 'PaddleCloudRoleMaker' + 'Role', 'RoleMakerBase', 'MPISymetricRoleMaker', 'UserDefinedRoleMaker' ] -class Role: - WORKER = 1 +class Role(Enum): + WORKER = 1, SERVER = 2 @@ -61,15 +61,6 @@ class RoleMakerBase(object): """ raise NotImplementedError("Please implement this method in child class") - def worker_num(self): - """ - Get current total worker number. - - Returns: - int: worker number - """ - raise NotImplementedError("Please implement this method in child class") - def worker_index(self): """ Get current worker id. 
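One detail of the `Enum` rewrite in these role-maker hunks is easy to miss: with `class Role(Enum)`, the trailing comma in `WORKER = 1,` makes the member's value the tuple `(1,)` rather than the int `1` (the same applies to `Mode.TRANSPILER` and `Mode.PSLIB` above), so equality tests against plain ints stop matching. A quick demonstration:

.. code-block:: python

    from enum import Enum

    class Role(Enum):
        WORKER = 1,   # trailing comma: the value is the tuple (1,)
        SERVER = 2

    assert Role.WORKER.value == (1,)
    assert Role.SERVER.value == 2
    assert Role.WORKER.value != 1  # comparisons against plain ints fail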
@@ -206,9 +197,6 @@ class MPISymetricRoleMaker(MPIRoleMaker): return self.is_worker() and 0 == self.worker_index() return False - def worker_num(self): - return self._worker_num() - def is_worker(self): """ return whether current process is worker assigned by role maker @@ -281,8 +269,8 @@ class MPISymetricRoleMaker(MPIRoleMaker): """ if not self._role_is_generated: # TODO(guru4elephant): only allow to be called once - self._worker_endpoints = self._get_ips()[1::2] - self._server_endpoints = self._get_ips()[::2] + self._worker_endpoints = self._get_ips() + self._server_endpoints = self._get_ips() if 0 == self._get_rank() % self._proc_per_node % 2: self._node_type = 0 @@ -292,50 +280,6 @@ class MPISymetricRoleMaker(MPIRoleMaker): self._role_is_generated = True -class PaddleCloudRoleMaker(RoleMakerBase): - def __init__(self): - super(PaddleCloudRoleMaker, self).__init__() - - def generate_role(self): - if not self._role_is_generated: - self.port = os.getenv("PADDLE_PORT", "6174") - self.pserver_ips = os.getenv("PADDLE_PSERVERS", "") - eplist = [] - for ip in pserver_ips.split(","): - eplist.append(':'.join([ip, port])) - self.endpoints = ",".join(eplist) - self.trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1")) - self.current_endpoint = os.getenv("POD_IP", - "localhost") + ":" + port - self.role = os.getenv("TRAINING_ROLE", "TRAINER") - self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) - self.eplist = eplist - self.endpoints = self.endpoints.split(",") - if self.role.upper() == "PSERVER": - self.current_id = self.endpoints.index(self.current_endpoint) - else: - self.current_id = self.trainer_id - self._role_is_generated = True - - def is_wokrer(self): - return self._role == Role.WORKER - - def is_server(self): - return self._role == Role.SERVER - - def is_first_worker(self): - return self._role == Role.WORKER and self._current_id == 0 - - def worker_index(self): - return self._current_id - - def server_index(self): - return self._current_id - - def worker_num(self): - return self._worker_num - - class UserDefinedRoleMaker(RoleMakerBase): def __init__(self, current_id=0, @@ -349,32 +293,10 @@ class UserDefinedRoleMaker(RoleMakerBase): """ super(UserDefinedRoleMaker, self).__init__() - if not isinstance(current_id, int): - raise TypeError("current_id must be as int") - else: - if current_id < 0: - raise ValueError("current_id must be gather or equal 0") - self._current_id = current_id - - if role != Role.WORKER and role != Role.SERVER: - raise TypeError("role must be as Role") - else: - self._role = role - - if not isinstance(worker_num, int): - raise TypeError("worker_num must be as int") - else: - if worker_num < 0: - raise ValueError("worker_num must be gather or equal 0") - self._worker_num = worker_num - - if not isinstance(server_endpoints, list): - raise TypeError("server_endpoints must be as string list") - else: - self._server_endpoints = server_endpoints - - def generate_role(self): - self._role_is_generated = True + self._current_id = current_id + self._role = role + self._worker_num = worker_num + self._server_endpoints = server_endpoints def is_worker(self): return self._role == Role.WORKER @@ -390,43 +312,3 @@ class UserDefinedRoleMaker(RoleMakerBase): def server_index(self): return self._current_id - - def worker_num(self): - return self._worker_num - - -class UserDefinedCollectiveRoleMaker(RoleMakerBase): - def __init__(self, current_id=0, worker_endpoints=None): - """ - UserDefinedCollectiveRoleMaker is designed for worker assignment - under manual for collective 
mode. - """ - super(UserDefinedCollectiveRoleMaker, self).__init__() - - if not isinstance(current_id, int): - raise TypeError("current_id must be as int") - else: - if current_id < 0: - raise ValueError("current_id must be greater or equal 0") - self._current_id = current_id - - if not isinstance(worker_endpoints, list): - raise TypeError("worker_endpoints must be as string list") - else: - self._worker_endpoints = worker_endpoints - self._worker_num = len(self._worker_endpoints) - - def generate_role(self): - self._role_is_generated = True - - def is_worker(self): - return True - - def is_first_worker(self): - return self._current_id == 0 - - def worker_index(self): - return self._current_id - - def worker_num(self): - return self._worker_num diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py index 100474244c5..e381a0d8c71 100644 --- a/python/paddle/fluid/incubate/fleet/collective/__init__.py +++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py @@ -17,9 +17,9 @@ import paddle.fluid as fluid import paddle.fluid.io as io import paddle.fluid.transpiler.distribute_transpiler as dist_transpiler -from paddle.fluid.incubate.fleet.base.fleet_base import Fleet -from paddle.fluid.incubate.fleet.base.fleet_base import Mode -from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer +from ..base.fleet_base import Fleet +from ..base.fleet_base import Mode +from ..base.fleet_base import DistributedOptimizer class Collective(Fleet): @@ -47,12 +47,17 @@ class Collective(Fleet): logging.warn( "You should not call 'stop_worker' method for collective mode.") + def stop(self): + """ + stop(): will be called after a user finishes his/her training task. + """ + logging.warn("You should not call 'stop' method for collective mode.") + def distributed_optimizer(self, optimizer, strategy=None): self._optimizer = CollectiveOptimizer(optimizer, strategy) return self._optimizer def save_inference_model(self, - executor, dirname, feeded_var_names=None, target_vars=None, @@ -62,7 +67,7 @@ class Collective(Fleet): self._executor, main_program, None, None, export_for_deployment) - def save_persistables(self, executor, dirname, main_program=None): + def save_persistables(self, dirname, main_program=None): io.save_persistables(self._executor, dirname, main_program, None) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 2111831b9fa..ec066187c23 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and import sys -from optimizer_factory import * +from .optimizer_factory import * from google.protobuf import text_format import paddle.fluid as fluid from paddle.fluid.framework import Program -from paddle.fluid.incubate.fleet.base.fleet_base import Fleet -from paddle.fluid.incubate.fleet.base.fleet_base import Mode -from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer -from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker +from ...base.fleet_base import Fleet +from ...base.fleet_base import Mode +from ...base.role_maker import MPISymetricRoleMaker +from ...base.fleet_base import DistributedOptimizer class PSLib(Fleet): @@ -33,8 +33,8 @@ class PSLib(Fleet): self._main_programs = 
[] self._scopes = [] - def init(self, role_maker=None): - super(PSLib, self).init(MPISymetricRoleMaker()) + def init(self, executor, role_maker=None): + super(PSLib, self).init(executor, MPISymetricRoleMaker()) self._fleet_ptr = fluid.core.Fleet() def init_worker(self): @@ -106,33 +106,14 @@ class PSLib(Fleet): raise NameError( "You should run DistributedOptimizer.minimize() first") - def init_server(self, model_dir=None, **kwargs): - """ - init_server() will be called by user. It will load model from model_dir. - - Args: - model_dir(str): load model path, can be local or hdfs/afs path. - kwargs: user-defined attributes, currently support following: - model(int): load model mode. - 0 is for load whole model, - 1 is for load delta model (load diff), - default is 0. - - Example: - >>> fleet.init_server("/you/path/to/model", mode = 0) - - """ - mode = kwargs.get("mode", 0) - self._role_maker._barrier_worker() - if self._role_maker.is_first_worker(): - self._fleet_ptr.load_model(model_dir, mode) - self._role_maker._barrier_worker() + def init_server(self, model_dir=None): + pass def run_server(self): """ init_pserver(): will be called by user. When a user knows current process is_worker(), he/she should call init_pserver() to initialize global information about parameter server - """ + """ if self._opt_info: if "fleet_desc" in self._opt_info: self._dist_desc_str = text_format.MessageToString( @@ -169,12 +150,23 @@ class PSLib(Fleet): self._role_maker._barrier_all() self._role_maker._finalize() - def distributed_optimizer(self, optimizer, strategy={}): + def stop(self): + """ + stop(): will be called after a user finishes his/her training task. Fleet instance will be + destroyed when stop() is called. + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.stop_server() + self._role_maker._barrier_worker() + self._role_maker._barrier_all() + self._role_maker._finalize() + + def distributed_optimizer(self, optimizer, strategy=None): self._optimizer = DownpourOptimizer(optimizer, strategy) return self._optimizer def save_inference_model(self, - executor, dirname, feeded_var_names=None, target_vars=None, @@ -185,81 +177,8 @@ class PSLib(Fleet): """ self._fleet_ptr.save_model(dirname) - def save_persistables(self, executor, dirname, main_program=None, **kwargs): - """ - save presistable parameters, - when using fleet, it will save sparse and dense feature - - Args: - dirname(str): save path. It can be hdfs/afs path or local path - main_program(Program): fluid program, default None - kwargs: use define property, current support following - mode(int): 0 means save all pserver model, - 1 means save delta pserver model (save diff), - 2 means save xbox base, - 3 means save batch model. 
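The save path being removed here follows a common collective idiom: flush, hit a barrier, let exactly one rank touch the filesystem, then barrier again. A stripped-down model of that control flow, using a toy role maker rather than the real fleet internals:

.. code-block:: python

    class ToyRoleMaker(object):
        def __init__(self, rank):
            self._rank = rank

        def is_first_worker(self):
            return self._rank == 0

        def _barrier_worker(self):
            pass  # stand-in for the real MPI barrier

    def save_persistables(role_maker, dirname, mode=0):
        # All ranks meet at a barrier, exactly one rank writes the model,
        # then all ranks synchronize again before training continues.
        role_maker._barrier_worker()
        if role_maker.is_first_worker():
            print("saving model to %s with mode=%d" % (dirname, mode))
        role_maker._barrier_worker()

    save_persistables(ToyRoleMaker(rank=0), "/tmp/model", mode=0)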
- - Example: - >>> fleet.save_persistables(dirname="/you/path/to/model", mode = 0) - - """ - mode = kwargs.get("mode", 0) - self._fleet_ptr.client_flush() - self._role_maker._barrier_worker() - if self._role_maker.is_first_worker(): - self._fleet_ptr.save_model(dirname, mode) - self._role_maker._barrier_worker() - - def shrink_sparse_table(self): - """ - shrink cvm of all sparse embedding in pserver, the decay rate - is defined as "show_click_decay_rate" in fleet_desc.prototxt - - Example: - >>> fleet.shrink_sparse_table() - - """ - self._role_maker._barrier_worker() - if self._role_maker.is_first_worker(): - for i in self._opt_info["fleet_desc"].trainer_param.sparse_table: - self._fleet_ptr.shrink_sparse_table(i.table_id) - self._role_maker._barrier_worker() - - def shrink_dense_table(self, decay, scope=None, table_id=None): - """ - shrink all dense params in pserver by multiplying by decay - - Args: - decay(float): the decay rate, usually range in (0, 1) - scope(Scope): Scope object, default is fluid.global_scope() - table_id(int): table id of shrinking dense table. None means shrink all, - you should specify it when using multiple scopes, - default is None. - - Example: - >>> fleet.shrink_dense_table(0.98, myscope1, 1) - >>> fleet.shrink_dense_table(0.98, myscope1, 2) - >>> fleet.shrink_dense_table(0.98, myscope2, 3) - - """ - if scope is None: - scope = fluid.global_scope() - self._role_maker._barrier_worker() - if self._role_maker.is_first_worker(): - for i in self._opt_info["fleet_desc"].trainer_param.dense_table: - if table_id is not None and table_id != i.table_id: - continue - var_list = [var for var in i.dense_variable_name] - skip = False - for var in var_list: - if scope.find_var(var) is None: - skip = True - break - if skip: - continue - self._fleet_ptr.shrink_dense_table(i.table_id, scope, var_list, - decay) - self._role_maker._barrier_worker() + def save_persistables(self, dirname, main_program=None): + self._fleet_ptr.save_model(dirname) def _set_opt_info(self, opt_info): """ @@ -354,8 +273,7 @@ class DownpourOptimizer(DistributedOptimizer): losses, startup_programs, parameter_list, - no_grad_set, - self._strategy) + no_grad_set) fleet._set_opt_info(opt_info) diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py index ed6ca5db49d..7a1925a95fd 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py @@ -62,19 +62,10 @@ class DownpourServer(Server): Returns: return None """ - for table in self._server.downpour_server_param.downpour_table_param: - if table.table_id == table_id: - if table.type == pslib.PS_SPARSE_TABLE: - return - else: - raise ValueError("expect table %s type=%s, but actual type=%s" \ - %(table_id, pslib.PS_SPARSE_TABLE, table.type)) table = self._server.downpour_server_param.downpour_table_param.add() table.table_id = table_id table.table_class = "DownpourSparseTable" table.type = pslib.PS_SPARSE_TABLE - table.compress_in_save = True - table.shard_num = 1000 table.accessor.accessor_class = "DownpourFeatureValueAccessor" table.accessor.sparse_sgd_param.learning_rate = learning_rate table.accessor.sparse_sgd_param.initial_g2sum = 3 @@ -103,24 +94,10 @@ class DownpourServer(Server): Returns: return None """ - fea_dim = 0 - for param in filter(lambda x: x.name.find("embedding") == -1, - param_var): - fea_dim += reduce(lambda x, y: x * y, param.shape, 1) - - for table in 
self._server.downpour_server_param.downpour_table_param: - if table.table_id == table_id: - if table.type == pslib.PS_DENSE_TABLE: - table.accessor.fea_dim = fea_dim - return - else: - raise ValueError("expect table %s type=%s, but actual type=%s" \ - %(table_id, pslib.PS_DENSE_TABLE, table.type)) table = self._server.downpour_server_param.downpour_table_param.add() table.table_id = table_id table.table_class = "DownpourDenseTable" table.type = pslib.PS_DENSE_TABLE - table.compress_in_save = True table.accessor.accessor_class = "DownpourDenseValueAccessor" table.accessor.dense_sgd_param.name = "adam" table.accessor.dense_sgd_param.adam.learning_rate = learning_rate @@ -129,6 +106,10 @@ class DownpourServer(Server): table.accessor.dense_sgd_param.adam.ada_epsilon = 1e-8 table.accessor.dense_sgd_param.adam.mom_decay_rate = 0.99 table.accessor.dense_sgd_param.naive.learning_rate = 0.0002 + fea_dim = 0 + for param in filter(lambda x: x.name.find("embedding") == -1, + param_var): + fea_dim += reduce(lambda x, y: x * y, param.shape, 1) table.accessor.fea_dim = fea_dim def add_data_norm_table(self, table_id, learning_rate, param_var, grad_var): @@ -142,27 +123,17 @@ class DownpourServer(Server): Returns: return None """ - fea_dim = 0 - for param in filter(lambda x: x.name.find("embedding") == -1, - param_var): - fea_dim += reduce(lambda x, y: x * y, param.shape, 1) - - for table in self._server.downpour_server_param.downpour_table_param: - if table.table_id == table_id: - if table.type == pslib.PS_DENSE_TABLE: - table.accessor.fea_dim = fea_dim - return - else: - raise ValueError("expect table %s type=%s, but actual type=%s" \ - %(table_id, pslib.PS_DENSE_TABLE, table.type)) table = self._server.downpour_server_param.downpour_table_param.add() table.table_id = table_id table.table_class = "DownpourDenseTable" table.type = pslib.PS_DENSE_TABLE - table.compress_in_save = True table.accessor.accessor_class = "DownpourDenseValueAccessor" table.accessor.dense_sgd_param.name = "summary" table.accessor.dense_sgd_param.summary.summary_decay_rate = 0.999999 + fea_dim = 0 + for param in filter(lambda x: x.name.find("embedding") == -1, + param_var): + fea_dim += reduce(lambda x, y: x * y, param.shape, 1) table.accessor.fea_dim = fea_dim def get_desc(self): @@ -198,9 +169,6 @@ class DownpourWorker(Worker): Returns: return None """ - for table in self._worker.sparse_table: - if table.table_id == table_id: - return table = self._worker.sparse_table.add() table.table_id = table_id table.slot_key.extend([var.name for var in slot_key_vars]) @@ -219,9 +187,6 @@ class DownpourWorker(Worker): Returns: return None """ - for table in self._worker.dense_table: - if table.table_id == table_id: - return table = self._worker.dense_table.add() table.table_id = table_id table.dense_variable_name.extend( diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index 895fb6889cc..31f964a0e34 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -52,8 +52,7 @@ class DistributedAdam(DistributedOptimizerImplBase): losses, startup_program=None, parameter_list=None, - no_grad_set=None, - strategy={}): + no_grad_set=None): """ DownpounSGD is a distributed optimizer so that user can call minimize to generate backward @@ -64,7 +63,6 @@ class DistributedAdam(DistributedOptimizerImplBase): 
parameter_list(str list): parameter names defined by users no_grad_set(set): a set of variables that is defined by users so that these variables do not need gradient computation - strategy(dict): user-defined properties Returns: [optimize_ops, grads_and_weights] """ @@ -78,15 +76,6 @@ class DistributedAdam(DistributedOptimizerImplBase): ps_param = pslib.PSParameter() server = DownpourServer() worker = DownpourWorker(self._window) - # if user specify a fleet_desc.prototxt file, then load the file - # instead of creating default fleet_desc.prototxt. - # user can specify server_param or trainer_param or fs_client_param. - if strategy.get("fleet_desc_file") is not None: - fleet_desc_file = strategy["fleet_desc_file"] - with open(fleet_desc_file) as f: - text_format.Merge(f.read(), ps_param) - server.get_desc().CopyFrom(ps_param.server_param) - worker.get_desc().CopyFrom(ps_param.trainer_param) sparse_table_index = 0 server.add_sparse_table(sparse_table_index, self._learning_rate, prefetch_slots, prefetch_slots_emb) @@ -151,8 +140,7 @@ class DistributedAdam(DistributedOptimizerImplBase): # Todo(guru4elephant): figure out how to support more sparse parameters # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] - if len(ps_param.trainer_param.skip_op) == 0: - ps_param.trainer_param.skip_op.extend(worker_skipped_ops) + ps_param.trainer_param.skip_op.extend(worker_skipped_ops) opt_info = {} opt_info["program_configs"] = program_configs @@ -161,7 +149,6 @@ class DistributedAdam(DistributedOptimizerImplBase): opt_info["optimizer"] = "DownpourSGD" opt_info["fleet_desc"] = ps_param opt_info["worker_skipped_ops"] = worker_skipped_ops - opt_info["use_cvm"] = strategy.get("use_cvm", False) for loss in losses: loss.block.program._fleet_opt = opt_info diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py index 378f606d648..5c9b2def076 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py @@ -32,7 +32,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( package='paddle', syntax='proto2', serialized_pb=_b( - '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xfd\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\x12-\n\x0eprogram_config\x18\x06 
\x03(\x0b\x32\x15.paddle.ProgramConfig\"\x99\x01\n\rProgramConfig\x12\x12\n\nprogram_id\x18\x01 \x02(\t\x12\x1c\n\x14push_sparse_table_id\x18\x02 \x03(\x05\x12\x1b\n\x13push_dense_table_id\x18\x03 \x03(\x05\x12\x1c\n\x14pull_sparse_table_id\x18\x04 \x03(\x05\x12\x1b\n\x13pull_dense_table_id\x18\x05 \x03(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xc4\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x17\n\tshard_num\x18\x03 \x01(\x04:\x04\x31\x30\x30\x30\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 
\x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01' + '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xfd\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\x12-\n\x0eprogram_config\x18\x06 \x03(\x0b\x32\x15.paddle.ProgramConfig\"\x99\x01\n\rProgramConfig\x12\x12\n\nprogram_id\x18\x01 \x02(\t\x12\x1c\n\x14push_sparse_table_id\x18\x02 \x03(\x05\x12\x1b\n\x13push_dense_table_id\x18\x03 
\x03(\x05\x12\x1c\n\x14pull_sparse_table_id\x18\x04 \x03(\x05\x12\x1b\n\x13pull_dense_table_id\x18\x05 \x03(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 
\x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01' )) _sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -49,8 +49,8 @@ _TABLETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3494, - serialized_end=3546, ) + serialized_start=3489, + serialized_end=3541, ) _sym_db.RegisterEnumDescriptor(_TABLETYPE) TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE) @@ -134,8 +134,8 @@ _PSCMDID = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3549, - serialized_end=3866, ) + serialized_start=3544, + serialized_end=3861, ) _sym_db.RegisterEnumDescriptor(_PSCMDID) PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID) @@ -168,8 +168,8 @@ _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3462, - serialized_end=3492, ) + serialized_start=3457, + serialized_end=3487, ) _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) _PSPARAMETER = _descriptor.Descriptor( @@ -994,15 +994,15 @@ _TABLEPARAMETER = _descriptor.Descriptor( extension_scope=None, options=None), _descriptor.FieldDescriptor( - name='shard_num', - full_name='paddle.TableParameter.shard_num', + name='shared_num', + full_name='paddle.TableParameter.shared_num', index=2, number=3, type=4, cpp_type=4, label=1, - has_default_value=True, - default_value=1000, + has_default_value=False, + default_value=0, message_type=None, enum_type=None, containing_type=None, @@ -1067,7 +1067,7 @@ _TABLEPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[], serialized_start=1573, - serialized_end=1769, ) + serialized_end=1764, ) 
_TABLEACCESSORPARAMETER = _descriptor.Descriptor( name='TableAccessorParameter', @@ -1213,8 +1213,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=1772, - serialized_end=2141, ) + serialized_start=1767, + serialized_end=2136, ) _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( name='DownpourTableAccessorParameter', @@ -1344,8 +1344,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2144, - serialized_end=2350, ) + serialized_start=2139, + serialized_end=2345, ) _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( name='TableAccessorSaveParameter', @@ -1411,8 +1411,8 @@ _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2352, - serialized_end=2435, ) + serialized_start=2347, + serialized_end=2430, ) _PSREQUESTMESSAGE = _descriptor.Descriptor( name='PsRequestMessage', @@ -1510,8 +1510,8 @@ _PSREQUESTMESSAGE = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2437, - serialized_end=2538, ) + serialized_start=2432, + serialized_end=2533, ) _SPARSESGDRULEPARAMETER = _descriptor.Descriptor( name='SparseSGDRuleParameter', @@ -1593,8 +1593,8 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2540, - serialized_end=2659, ) + serialized_start=2535, + serialized_end=2654, ) _DENSESGDRULEPARAMETER = _descriptor.Descriptor( name='DenseSGDRuleParameter', @@ -1692,8 +1692,8 @@ _DENSESGDRULEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2662, - serialized_end=2887, ) + serialized_start=2657, + serialized_end=2882, ) _ADAMSGDPARAMETER = _descriptor.Descriptor( name='AdamSGDParameter', @@ -1791,8 +1791,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2890, - serialized_end=3024, ) + serialized_start=2885, + serialized_end=3019, ) _NAIVESGDPARAMETER = _descriptor.Descriptor( name='NaiveSGDParameter', @@ -1842,8 +1842,8 @@ _NAIVESGDPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=3026, - serialized_end=3092, ) + serialized_start=3021, + serialized_end=3087, ) _SUMMARYSGDPARAMETER = _descriptor.Descriptor( name='SummarySGDParameter', @@ -1877,8 +1877,8 @@ _SUMMARYSGDPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=3094, - serialized_end=3153, ) + serialized_start=3089, + serialized_end=3148, ) _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( name='MovingAverageRuleParameter', @@ -1912,8 +1912,8 @@ _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=3155, - serialized_end=3201, ) + serialized_start=3150, + serialized_end=3196, ) _PSRESPONSEMESSAGE = _descriptor.Descriptor( name='PsResponseMessage', @@ -1979,8 +1979,8 @@ _PSRESPONSEMESSAGE = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=3203, - serialized_end=3276, ) + serialized_start=3198, + serialized_end=3271, ) _FSCLIENTPARAMETER = _descriptor.Descriptor( name='FsClientParameter', @@ -2110,8 +2110,8 @@ _FSCLIENTPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=3279, - serialized_end=3492, ) + serialized_start=3274, 
+ serialized_end=3487, ) _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER _PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 0deafaff1bf..58819efea04 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -42,9 +42,8 @@ def force_init_on_cpu(): .. code-block:: python - if fluid.initializer.force_init_on_cpu(): - step = fluid.layers.create_global_var( - shape=[2,3], value=1.0, dtype='float32') + if force_init_on_cpu(): + create_op('force_cpu': force_init_on_cpu()) """ return _force_init_on_cpu_ @@ -58,9 +57,8 @@ def init_on_cpu(): Examples: .. code-block:: python - with fluid.initializer.init_on_cpu(): - step = fluid.layers.create_global_var( - shape=[2,3], value=1.0, dtype='float32') + with init_on_cpu(): + step = layers.create_global_var() """ global _force_init_on_cpu_ @@ -133,10 +131,8 @@ class ConstantInitializer(Initializer): Examples: .. code-block:: python - x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") - fc = fluid.layers.fc(input=x, size=10, - param_attr=fluid.initializer.Constant(value=2.0)) - + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.Constant(value=2.0)) """ def __init__(self, value=0.0, force_cpu=False): @@ -212,7 +208,7 @@ class UniformInitializer(Initializer): import paddle.fluid as fluid x = fluid.layers.data(name='x', shape=[1], dtype='float32') fc = fluid.layers.fc(input=x, size=10, - param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5)) + param_attr=fluid.initializer.Uniform(low=-0.5, high=0.5)) """ def __init__(self, low=-1.0, high=1.0, seed=0): @@ -292,10 +288,8 @@ class NormalInitializer(Initializer): Examples: .. code-block:: python - x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") - fc = fluid.layers.fc(input=x, size=10, - param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0)) - + fc = fluid.layers.fc(input=x, size=10, + param_attr=fluid.initializer.Normal(loc=0.0, scale=2.0)) """ def __init__(self, loc=0.0, scale=1.0, seed=0): @@ -607,11 +601,10 @@ class MSRAInitializer(Initializer): Examples: .. code-block:: python - - x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") - fc = fluid.layers.fc(input=x, size=10, - param_attr=fluid.initializer.MSRA(uniform=False)) + fc = fluid.layers.fc( + input=queries, size=10, + param_attr=fluid.initializer.MSRA(uniform=False)) """ def __init__(self, uniform=True, fan_in=None, seed=0): @@ -710,24 +703,19 @@ class BilinearInitializer(Initializer): .. 
code-block:: python - factor = 2 - C = 2 - w_attr = fluid.initializer.ParamAttr( - learning_rate=0., - regularizer=fluid.regularizer.L2Decay(0.), - initializer=fluid.initializer.Bilinear()) - x = fluid.layers.data(name="data", shape=[3, 32, 32], - dtype="float32") - conv_up = fluid.layers.conv2d_transpose( - input=x, - num_filters=C, - output_size=None, - filter_size=2 * factor - factor % 2, - padding=int(math.ceil((factor - 1) / 2.)), - stride=factor, - groups=C, - param_attr=w_attr, - bias_attr=False) + factor = 2 + w_attr = ParamAttr(learning_rate=0., regularizer=L2Decay(0.), + initializer=Bilinear()) + conv_up = fluid.layers.conv2d_transpose( + input, + num_filters=C, + output_size=None, + filter_size=2 * factor - factor % 2, + padding=ceil((factor - 1) / 2.), + stride=factor, + groups=C, + param_attr=w_attr, + bias_attr=False) Here, `num_filters=C` and `groups=C` mean this is a channel-wise transposed convolution. The filter shape will be (C, 1, K, K) where K is `filter_size`, @@ -836,7 +824,6 @@ class NumpyArrayInitializer(Initializer): Examples: .. code-block:: python - x = fluid.layers.data(name="x", shape=[5], dtype='float32') fc = fluid.layers.fc(input=x, size=10, param_attr=fluid.initializer.NumpyArrayInitializer(numpy.array([1,2]))) """ diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index dd1725b45ac..3cdd05533f7 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -31,7 +31,7 @@ class SimpleLayer(Layer): super(SimpleLayer, self).__init__(name_scope) self._fc1 = nn.FC(self.full_name(), 3, - param_attr=ParamAttr(initializer=Constant(value=0.1))) + ParamAttr(initializer=Constant(value=0.1))) def forward(self, inputs): x = self._fc1(inputs) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index b5dd383a0eb..b573093c302 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -25,7 +25,6 @@ from paddle.fluid import layers from paddle.fluid.executor import Executor from paddle.fluid.evaluator import Evaluator from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, program_guard -from paddle.fluid.log_helper import get_logger from . import reader from .reader import * from . import core @@ -36,8 +35,9 @@ __all__ = [ 'load_persistables', 'save_inference_model', 'load_inference_model' ] + reader.__all__ -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') +logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') +_logger = logging.getLogger(__name__) +_logger.setLevel(logging.INFO) def is_parameter(var): @@ -144,37 +144,27 @@ def save_vars(executor, Examples: ..
code-block:: python - import paddle.fluid as fluid - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False) - w = fluid.layers.create_parameter(shape=[784, 200], dtype='float32', name='fc_w') - b = fluid.layers.create_parameter(shape=[200], dtype='float32', name='fc_b') - hidden_w = fluid.layers.matmul(x=data, y=w) - hidden_b = fluid.layers.elementwise_add(hidden_w, b) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - + exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" + # The first usage: using `main_program` to specify variables def name_has_fc(var): res = "fc" in var.name return res - fluid.io.save_vars(executor=exe, dirname=param_path, main_program=main_prog, + + prog = fluid.default_main_program() + fluid.io.save_vars(executor=exe, dirname=path, main_program=prog, vars=None, predicate = name_has_fc) # All variables in `main_program` whose name includes "fc" will be saved. # And variables are going to be saved separately. # The second usage: using `vars` to specify variables - var_list = [w, b] - path = "./my_paddle_vars" + var_list = [var_a, var_b, var_c] fluid.io.save_vars(executor=exe, dirname=path, vars=var_list, filename="vars_file") # var_a, var_b and var_c will be saved. And they are going to be - # saved in the same file named 'var_file' in the path "./my_paddle_vars". + # saved in the same file named 'vars_file' in the path "./my_paddle_model". """ save_dirname = os.path.normpath(dirname) if vars is None: @@ -242,9 +232,7 @@ def save_params(executor, dirname, main_program=None, filename=None): NOTICE: Some variables are not Parameter while they are necessary for training. So you can NOT save and continue your training just by `save_params()` and `load_params()`. Please use `save_persistables()` - and `load_persistables()` instead. If you want to save your model for - the inference, please use the `save_inference_model` API. You can refer - to :ref:`api_guide_model_save_reader_en` for more details. + and `load_persistables()` instead. Args: executor(Executor): The executor to run for saving parameters. @@ -558,40 +546,27 @@ def load_vars(executor, Examples: .. code-block:: python - import paddle.fluid as fluid - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False) - w = fluid.layers.create_parameter(shape=[784, 200], dtype='float32', name='fc_w') - b = fluid.layers.create_parameter(shape=[200], dtype='float32', name='fc_b') - hidden_w = fluid.layers.matmul(x=data, y=w) - hidden_b = fluid.layers.elementwise_add(hidden_w, b) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) - + exe = fluid.Executor(fluid.CPUPlace()) param_path = "./my_paddle_model" + # The first usage: using `main_program` to specify variables def name_has_fc(var): res = "fc" in var.name return res - fluid.io.save_vars(executor=exe, dirname=param_path, main_program=main_prog, - vars=None, predicate=name_has_fc) - fluid.io.load_vars(executor=exe, dirname=param_path, main_program=main_prog, + + prog = fluid.default_main_program() + fluid.io.load_vars(executor=exe, dirname=path, main_program=prog, vars=None, predicate=name_has_fc) # All variables in `main_program` whose name includes "fc" will be loaded.
# And all the variables are supposed to have been saved in different files. + # The second usage: using `vars` to specify variables - path = "./my_paddle_vars" - var_list = [w, b] - fluid.io.save_vars(executor=exe, dirname=path, vars=var_list, - filename="vars_file") + var_list = [var_a, var_b, var_c] fluid.io.load_vars(executor=exe, dirname=path, vars=var_list, filename="vars_file") - # w and b will be loaded. And they are supposed to haven - been saved in the same file named 'var_file' in the path "./my_paddle_vars". + # var_a, var_b and var_c will be loaded. And they are supposed to have + been saved in the same file named 'vars_file' in the path "./my_paddle_model". """ load_dirname = os.path.normpath(dirname) if vars is None: @@ -660,9 +635,6 @@ def load_params(executor, dirname, main_program=None, filename=None): training. So you can NOT save and continue your training just by `save_params()` and `load_params()`. Please use `save_persistables()` and `load_persistables()` instead. - If you want to load the pre-trained model structure and parameters - for the inference, please use the `load_inference_model` API. You can - refer to :ref:`api_guide_model_save_reader_en` for more details. Args: executor(Executor): The executor to run for loading parameters. @@ -907,15 +879,10 @@ def save_inference_model(dirname, main_program=None, model_filename=None, params_filename=None, - export_for_deployment=True, - program_only=False): + export_for_deployment=True): """ Prune the given `main_program` to build a new program especially for inference, and then save it and all related parameters to given `dirname` by the `executor`. - If you just want to save parameters of your trained model, please use the - `save_params` API. You can refer to :ref:`api_guide_model_save_reader_en` for - more details. - Args: dirname(str): The directory path to save the inference model. @@ -939,7 +906,6 @@ def save_inference_model(dirname, more information will be stored for flexible optimization and re-training. Currently, only True is supported. - program_only(bool): If True, It will save inference program only, and do not save params of Program. Returns: target_var_name_list(list): The fetch variables' name list @@ -1073,12 +1039,6 @@ def save_inference_model(dirname, with open(model_basename + ".main_program", "wb") as f: f.write(main_program.desc.serialize_to_string()) - if program_only: - warnings.warn( - "save_inference_model specified the param `program_only` to True, It will not save params of Program." - ) - return target_var_name_list - main_program._copy_dist_param_info_from(origin_program) if params_filename is not None: @@ -1094,10 +1054,7 @@ def load_inference_model(dirname, params_filename=None, pserver_endpoints=None): """ - Load inference model from a directory. By this API, you can get the model - structure(inference program) and model parameters. If you just want to load - parameters of the pre-trained model, please use the `load_params` API. - You can refer to :ref:`api_guide_model_save_reader_en` for more details. + Load inference model from a directory Args: dirname(str): The directory path @@ -1131,43 +1088,25 @@ def load_inference_model(dirname, Examples: ..
code-block:: python - import paddle.fluid as fluid - import numpy as np - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False) - w = fluid.layers.create_parameter(shape=[784, 200], dtype='float32') - b = fluid.layers.create_parameter(shape=[200], dtype='float32') - hidden_w = fluid.layers.matmul(x=data, y=w) - hidden_b = fluid.layers.elementwise_add(hidden_w, b) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_prog) + exe = fluid.Executor(fluid.CPUPlace()) path = "./infer_model" - fluid.io.save_inference_model(dirname=path, feeded_var_names=['img'], - target_vars=[hidden_b], executor=exe, main_program=main_prog) - tensor_img = np.array(np.random.random((1, 64, 784)), dtype=np.float32) - [inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model(dirname=path, executor=exe)) + endpoints = ["127.0.0.1:2023","127.0.0.1:2024"] + [inference_program, feed_target_names, fetch_targets] = \ + fluid.io.load_inference_model(dirname=path, executor=exe) results = exe.run(inference_program, feed={feed_target_names[0]: tensor_img}, fetch_list=fetch_targets) - # endpoints is your pserver endpoints list, the above is just an example - endpoints = ["127.0.0.1:2023","127.0.0.1:2024"] # if we need lookup table, we will use: - [dist_inference_program, dist_feed_target_names, dist_fetch_targets] = ( - fluid.io.load_inference_model(dirname=path, - executor=exe, - pserver_endpoints=endpoints)) + fluid.io.load_inference_model(dirname=path, executor=exe, pserver_endpoints=endpoints) # In this example, the inference program was saved in the # "./infer_model/__model__" and parameters were saved in - # separate files in "./infer_model". + # separate files in "./infer_model". # After getting inference program, feed target names and # fetch targets, we can use an Executor to run the inference # program to get the inference result.
+ """ load_dirname = os.path.normpath(dirname) if not os.path.isdir(load_dirname): diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index cbfd4f45f90..9eed00b1618 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -85,19 +85,19 @@ class LayerHelperBase(object): block=self.startup_program.global_block()): if out is None: out = block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( + name=unique_name.generate(".".join( [self.name, 'weight_norm_norm'])), dtype=dtype, persistable=False) abs_out = block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( + name=unique_name.generate(".".join( [self.name, 'weight_norm_abs'])), dtype=dtype, persistable=False) block.append_op( type='abs', inputs={'X': x}, outputs={'Out': abs_out}) pow_out = block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( + name=unique_name.generate(".".join( [self.name, 'weight_norm_pow'])), dtype=dtype, persistable=False) @@ -107,7 +107,7 @@ class LayerHelperBase(object): outputs={'Out': pow_out}, attrs={'factor': float(p)}) sum_out = block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( + name=unique_name.generate(".".join( [self.name, 'weight_norm_sum'])), dtype=dtype, persistable=False) @@ -133,7 +133,7 @@ class LayerHelperBase(object): block=self.startup_program.global_block()): if out is None: out = block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( + name=unique_name.generate(".".join( [self.name, 'weight_norm_reshape'])), dtype=dtype, persistable=False) @@ -150,7 +150,7 @@ class LayerHelperBase(object): block=self.startup_program.global_block()): if out is None: out = block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( + name=unique_name.generate(".".join( [self.name, 'weight_norm_transpose'])), dtype=dtype, persistable=False) @@ -168,7 +168,7 @@ class LayerHelperBase(object): """Computes the norm over all dimensions except dim""" if out is None: out = block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( + name=unique_name.generate(".".join( [self.name, 'weight_norm_norm'])), dtype=dtype, persistable=False) @@ -327,8 +327,7 @@ class LayerHelperBase(object): infer_var_type. 
""" return self.main_program.current_block().create_var( - name=unique_name.generate_with_ignorable_key(".".join( - [self.name, 'tmp'])), + name=unique_name.generate(".".join([self.name, 'tmp'])), dtype=dtype, type=core.VarDesc.VarType.LOD_TENSOR, persistable=False, diff --git a/python/paddle/fluid/layers/collective.py b/python/paddle/fluid/layers/collective.py index 6beddac7aac..97c290f5a99 100644 --- a/python/paddle/fluid/layers/collective.py +++ b/python/paddle/fluid/layers/collective.py @@ -33,8 +33,7 @@ def _allreduce(x, out=None, reduce_type="sum", sync_mode=False): if out is None: out = helper.create_variable( - name=unique_name.generate_with_ignorable_key(".".join( - [x.name, 'tmp'])), + name=unique_name.generate(".".join([x.name, 'tmp'])), shape=x.shape, dtype=x.dtype, type=x.type, @@ -47,14 +46,3 @@ def _allreduce(x, out=None, reduce_type="sum", sync_mode=False): attrs={"reduce_type": red_typ_int, "sync_mode": sync_mode}) return out - - -def _broadcast(x, root, sync_mode=False): - helper = LayerHelper("broadcast", **locals()) - helper.append_op( - type='broadcast', - inputs={'X': [x]}, - outputs={'Out': [x]}, - attrs={"sync_mode": sync_mode, - "root": root}) - return x diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index d073c15b023..2df63d723e6 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -165,31 +165,21 @@ def Print(input, print the gradients of input tensor. Returns: - Variable: Output tensor. + Variable: Output tensor, same data with input tensor. - NOTES: - The input and output are two different variables, and in the - following process, you should use the output variable but not the input, - otherwise, the print layer doesn't have backward. Examples: + .. code-block:: python - - import paddle.fluid as fluid - - input = fluid.layers.data(name="input", shape=[4, 32, 32], dtype="float32") - input = fluid.layers.Print(input, message = "The content of input layer:") - # value = some_layer(...) - # Print(value, summarize=10, - # message="The content of some_layer: ") + value = some_layer(...) + Print(value, summarize=10, + message="The content of some_layer: ") ''' - helper = LayerHelper('print' + "_" + input.name, **locals()) - output = helper.create_variable_for_type_inference(input.dtype) + helper = LayerHelper('print', **locals()) helper.append_op( type='print', inputs={'In': input}, - outputs={'Out': output}, attrs={ 'first_n': first_n, 'summarize': summarize, @@ -200,7 +190,7 @@ def Print(input, 'print_tensor_lod': print_tensor_lod, 'print_phase': print_phase.upper() }) - return output + return input class BlockGuard(object): @@ -286,29 +276,27 @@ class StaticRNN(object): the same. And the meaning of each axis of input and output are the same.** Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.layers as layers - - vocab_size, hidden_size=10000, 200 - x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64') - x_emb = layers.embedding( - input=x, - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False) - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) - - rnn = fluid.layers.StaticRNN() - with rnn.step(): - word = rnn.step_input(x_emb) - prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) - hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') - rnn.update_memory(prev, hidden) # set prev to hidden - rnn.step_output(hidden) - - result = rnn() + >>> import paddle.fluid as fluid + >>> import paddle.fluid.layers as layers + >>> + >>> vocab_size, hidden_size=10000, 200 + >>> x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64') + >>> x_emb = layers.embedding( + >>> input=x, + >>> size=[vocab_size, hidden_size], + >>> dtype='float32', + >>> is_sparse=False) + >>> x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + >>> + >>> rnn = fluid.layers.StaticRNN() + >>> with rnn.step(): + >>> word = rnn.step_input(x_emb) + >>> prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) + >>> hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') + >>> rnn.update_memory(prev, hidden) # set prev to hidden + >>> rnn.step_output(hidden) + >>> + >>> result = rnn() The StaticRNN will unfold sequence into time steps. Users need to define how to process each time step during the :code:`with` step. @@ -373,27 +361,6 @@ class StaticRNN(object): Returns: The memory variable. - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.layers as layers - - vocab_size, hidden_size=10000, 200 - x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64') - x_emb = layers.embedding( - input=x, - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False) - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) - - rnn = fluid.layers.StaticRNN() - with rnn.step(): - word = rnn.step_input(x_emb) - prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) - hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') - rnn.update_memory(prev, hidden) """ self._assert_in_rnn_block_('memory') if init is None: @@ -401,7 +368,7 @@ class StaticRNN(object): raise ValueError( "if init is None, memory at least need shape and batch_ref") parent_block = self._parent_block() - var_name = unique_name.generate_with_ignorable_key("@".join( + var_name = unique_name.generate("@".join( [self.helper.name, "memory_boot"])) boot_var = parent_block.create_var( name=var_name, @@ -424,8 +391,7 @@ class StaticRNN(object): return self.memory(init=boot_var) else: pre_mem = self.helper.create_variable( - name=unique_name.generate_with_ignorable_key("@".join( - [self.helper.name, "mem"])), + name=unique_name.generate("@".join([self.helper.name, "mem"])), dtype=init.dtype, shape=init.shape) self.memories[pre_mem.name] = StaticRNNMemoryLink( @@ -635,20 +601,18 @@ class While(object): Examples: .. 
code-block:: python - - import paddle.fluid as fluid - - i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) - d0 = fluid.layers.data("d0", shape=[10], dtype='float32') - data_array = fluid.layers.array_write(x=d0, i=i) - array_len = fluid.layers.fill_constant(shape=[1],dtype='int64', value=3) - - cond = fluid.layers.less_than(x=i, y=array_len) - while_op = fluid.layers.While(cond=cond) + + d0 = layers.data("d0", shape=[10], dtype='float32') + data_array = layers.array_write(x=d0, i=i) + array_len = layers.fill_constant(shape=[1],dtype='int64', value=3) + + cond = layers.less_than(x=i, y=array_len) + while_op = layers.While(cond=cond) with while_op.block(): - d = fluid.layers.array_read(array=data_array, i=i) - i = fluid.layers.increment(x=i, value=1, in_place=True) - fluid.layers.less_than(x=i, y=array_len, cond=cond) + d = layers.array_read(array=data_array, i=i) + i = layers.increment(x=i, in_place=True) + layers.array_write(result, i=i, array=d) + layers.less_than(x=i, y=array_len, cond=cond) """ BEFORE_WHILE_BLOCK = 0 @@ -888,7 +852,6 @@ def increment(x, value=1.0, in_place=True): Examples: .. code-block:: python - import paddle.fluid as fluid data = fluid.layers.data(name='data', shape=[1], dtype='float32', append_batch_size=False) data = fluid.layers.increment(x=data, value=3.0, in_place=True) @@ -929,10 +892,9 @@ def array_write(x, i, array=None): Examples: .. code-block:: python - import paddle.fluid as fluid tmp = fluid.layers.zeros(shape=[10], dtype='int32') i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) - arr = fluid.layers.array_write(tmp, i=i) + arr = layers.array_write(tmp, i=i) """ helper = LayerHelper('array_write', **locals()) if array is None: @@ -979,6 +941,9 @@ def less_than(x, y, force_cpu=None, cond=None): """ ${comment} + >>> import paddle.fluid as fluid + >>> less = fluid.layers.less_than(x=label, y=limit) + Args: x(${x_type}): ${x_comment}. y(${y_type}): ${y_comment}. @@ -987,13 +952,6 @@ def less_than(x, y, force_cpu=None, cond=None): Returns: ${out_comment}. - - Examples: - .. code-block:: python - - label = fluid.layers.data(name='y', shape=[1], dtype='int64') - limit = fluid.layers.fill_constant(shape=[1], dtype='int64', value=5) - cond = fluid.layers.less_than(x=label, y=limit) """ helper = LayerHelper("less_than", **locals()) if cond is None: @@ -1138,9 +1096,6 @@ def equal(x, y, cond=None): Examples: .. code-block:: python - import paddle.fluid as fluid - label = fluid.layers.data(name="label", shape=[3,10,32,32], dtype="float32") - limit = fluid.layers.data(name="limit", shape=[3,10,32,32], dtype="float32") less = fluid.layers.equal(x=label, y=limit) """ helper = LayerHelper("equal", **locals()) @@ -1211,7 +1166,6 @@ def array_read(array, i): Examples: .. code-block:: python - import paddle.fluid as fluid array = fluid.layers.create_array(dtype='float32') i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) item = fluid.layers.array_read(array, i) @@ -1286,7 +1240,6 @@ def array_length(array): Examples: .. code-block:: python - import paddle.fluid as fluid tmp = fluid.layers.zeros(shape=[10], dtype='int32') i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) arr = fluid.layers.array_write(tmp, i=i) @@ -1424,30 +1377,23 @@ class Switch(object): Examples: .. 
code-block:: python - - import paddle.fluid as fluid - lr = fluid.layers.create_global_var( + lr = fluid.layers.tensor.create_global_var( shape=[1], value=0.0, dtype='float32', persistable=True, name="learning_rate") - zero_var = fluid.layers.fill_constant( - shape=[1], dtype='float32', value=0.0) - one_var = fluid.layers.fill_constant( + one_var = tensor.fill_constant( shape=[1], dtype='float32', value=1.0) - two_var = fluid.layers.fill_constant( - shape=[1], dtype='float32', value=2.0) - - global_step = fluid.layers.autoincreased_step_counter( - counter_name='@LR_DECAY_COUNTER@', begin=0, step=1) + two_var = tensor.fill_constant( + shape=[1], dtype='float32', value=2.0) with fluid.layers.control_flow.Switch() as switch: with switch.case(global_step == zero_var): - fluid.layers.assign(input=one_var, output=lr) + fluid.layers.tensor.assign(input=one_var, output=lr) with switch.default(): - fluid.layers.assign(input=two_var, output=lr) + fluid.layers.tensor.assign(input=two_var, output=lr) """ @@ -1457,6 +1403,8 @@ class Switch(object): self.pre_not_conditions = [] def case(self, condition): + """create a new block for this condition + """ if not self.inside_scope: raise ValueError("case should be called inside with") @@ -1478,6 +1426,9 @@ class Switch(object): return ConditionalBlockGuard(cond_block) def default(self): + """ + create a default case for this switch + """ pre_cond_num = len(self.pre_not_conditions) if pre_cond_num == 0: raise ValueError("there should be at least one condition") @@ -1546,12 +1497,8 @@ class IfElse(object): Examples: .. code-block:: python - import paddle.fluid as fluid - - image = fluid.layers.data(name="X", shape=[2, 5, 5], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') limit = fluid.layers.fill_constant_batch_size_like( - input=label, dtype='int64', shape=[1], value=5.0) + input=label, dtype='int64', shape=[1], value=5.0) cond = fluid.layers.less_than(x=label, y=limit) ie = fluid.layers.IfElse(cond) with ie.true_block(): @@ -1589,13 +1536,11 @@ class IfElse(object): if id(x) not in self.input_table: parent_block = self._parent_block() out_true = parent_block.create_var( - name=unique_name.generate_with_ignorable_key('ifelse_input' + - self.helper.name), + name=unique_name.generate('ifelse_input' + self.helper.name), dtype=x.dtype) out_false = parent_block.create_var( - name=unique_name.generate_with_ignorable_key('ifelse_input' + - self.helper.name), + name=unique_name.generate('ifelse_input' + self.helper.name), dtype=x.dtype) parent_block.append_op( type='split_lod_tensor', @@ -1637,7 +1582,7 @@ class IfElse(object): raise TypeError("Each output should be a variable") # create outside tensor outside_out = parent_block.create_var( - name=unique_name.generate_with_ignorable_key("_".join( + name=unique_name.generate("_".join( [self.helper.name, 'output'])), dtype=each_out.dtype) out_table.append(outside_out) @@ -1677,7 +1622,23 @@ class DynamicRNN(object): sample sequence can be different. This API automatically process them in batch. - The input lod must be set. Please reference to `lod_tensor`. + The input lod must be set. 
Please refer to `lod_tensor` + + >>> import paddle.fluid as fluid + >>> data = fluid.layers.data(name='sentence', dtype='int64', lod_level=1) + >>> embedding = fluid.layers.embedding(input=data, size=[65535, 32], + >>> is_sparse=True) + >>> + >>> drnn = fluid.layers.DynamicRNN() + >>> with drnn.block(): + >>> word = drnn.step_input(embedding) + >>> prev = drnn.memory(shape=[200]) + >>> hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu') + >>> drnn.update_memory(prev, hidden) # set prev to hidden + >>> drnn.output(hidden) + >>> + >>> # last is the last time step of rnn. It is the encoding result. + >>> last = fluid.layers.sequence_last_step(drnn()) The dynamic RNN will unfold sequence into timesteps. Users need to define how to process each time step during the :code:`with` block. The memory is used to cache data between these time steps. The dynamic RNN can mark multiple variables as its output. Use `drnn()` to get the output sequence. - + NOTES: Currently, setting is_sparse to True for any layer within DynamicRNN is not supported. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - sentence = fluid.layers.data(name='sentence', shape=[1], dtype='int64', lod_level=1) - embedding = fluid.layers.embedding(input=sentence, size=[65536, 32], is_sparse=True) - - drnn = fluid.layers.DynamicRNN() - with drnn.block(): - word = drnn.step_input(embedding) - prev = drnn.memory(shape=[200]) - hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu') - drnn.update_memory(prev, hidden) # set prev to hidden - drnn.output(hidden) - - # Get the last time step of rnn. It is the encoding result. - rnn_output = drnn() - last = fluid.layers.sequence_last_step(rnn_output) """ BEFORE_RNN = 0 IN_RNN = 1 @@ -1737,8 +1678,8 @@ class DynamicRNN(object): Mark a sequence as a dynamic RNN input. Args: - x (Variable): The input sequence which should have lod information. - level (int): The level of lod used to split steps. Default: 0. + x(Variable): The input sequence. + level(int): The level of lod used to split steps. Default: 0. Returns: The current timestep in the input sequence. @@ -1789,37 +1730,13 @@ def static_input(self, x): """ Mark a variable as a RNN input. The input will not be scattered into - time steps. It is optional. + time steps. Args: - x (Variable): The input variable. + x(Variable): The input variable. Returns: The input variable that can be accessed in RNN. - - Examples: - ..
code-block:: python - - import paddle.fluid as fluid - - sentence = fluid.layers.data(name='sentence', dtype='float32', shape=[32], lod_level=1) - encoder_proj = fluid.layers.data(name='encoder_proj', dtype='float32', shape=[32], lod_level=1) - decoder_boot = fluid.layers.data(name='boot', dtype='float32', shape=[10], lod_level=1) - - drnn = fluid.layers.DynamicRNN() - with drnn.block(): - current_word = drnn.step_input(sentence) - encoder_word = drnn.static_input(encoder_proj) - hidden_mem = drnn.memory(init=decoder_boot, need_reorder=True) - fc_1 = fluid.layers.fc(input=encoder_word, size=30, bias_attr=False) - fc_2 = fluid.layers.fc(input=current_word, size=30, bias_attr=False) - decoder_inputs = fc_1 + fc_2 - h, _, _ = fluid.layers.gru_unit(input=decoder_inputs, hidden=hidden_mem, size=30) - drnn.update_memory(hidden_mem, h) - out = fluid.layers.fc(input=h, size=10, bias_attr=True, act='softmax') - drnn.output(out) - - rnn_output = drnn() """ self._assert_in_rnn_block_("static_input") if not isinstance(x, Variable): @@ -1896,51 +1813,54 @@ class DynamicRNN(object): the input variable. It should be set to true when the initialized memory depends on the input sample. - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - sentence = fluid.layers.data(name='sentence', shape=[32], dtype='float32', lod_level=1) - boot_memory = fluid.layers.data(name='boot', shape=[10], dtype='float32', lod_level=1) - - drnn = fluid.layers.DynamicRNN() - with drnn.block(): - word = drnn.step_input(sentence) - memory = drnn.memory(init=boot_memory, need_reorder=True) - hidden = fluid.layers.fc(input=[word, memory], size=10, act='tanh') - drnn.update_memory(ex_mem=memory, new_mem=hidden) - drnn.output(hidden) - - rnn_output = drnn() + For example, + + >>> import paddle.fluid as fluid + >>> sentence = fluid.layers.data( + >>> name='sentence', dtype='float32', shape=[32]) + >>> boot_memory = fluid.layers.data( + >>> name='boot', dtype='float32', shape=[10]) + >>> + >>> drnn = fluid.layers.DynamicRNN() + >>> with drnn.block(): + >>> word = drnn.step_input(sentence) + >>> memory = drnn.memory(init=boot_memory, need_reorder=True) + >>> hidden = fluid.layers.fc( + >>> input=[word, memory], size=10, act='tanh') + >>> drnn.update_memory(ex_mem=memory, new_mem=hidden) + >>> drnn.output(hidden) + >>> rnn_output = drnn() Otherwise, if :code:`shape`, :code:`value`, :code:`dtype` are set, the :code:`memory` will be initialized by this :code:`value`. - Examples: - .. code-block:: python + For example, - import paddle.fluid as fluid - - sentence = fluid.layers.data(name='sentence', dtype='float32', shape=[32], lod_level=1) - - drnn = fluid.layers.DynamicRNN() - with drnn.block(): - word = drnn.step_input(sentence) - memory = drnn.memory(shape=[10], dtype='float32', value=0) - hidden = fluid.layers.fc(input=[word, memory], size=10, act='tanh') - drnn.update_memory(ex_mem=memory, new_mem=hidden) - drnn.output(hidden) - - rnn_output = drnn() + >>> import paddle.fluid as fluid + >>> sentence = fluid.layers.data( + >>> name='sentence', dtype='float32', shape=[32]) + >>> + >>> drnn = fluid.layers.DynamicRNN() + >>> with drnn.block(): + >>> word = drnn.step_input(sentence) + >>> memory = drnn.memory(shape=[10], dtype='float32', value=0) + >>> hidden = fluid.layers.fc( + >>> input=[word, memory], size=10, act='tanh') + >>> drnn.update_memory(ex_mem=memory, new_mem=hidden) + >>> drnn.output(hidden) + >>> rnn_output = drnn() Args: init(Variable|None): The initialized variable. - shape(list|tuple): The memory shape. 
The shape does not contain batch_size. + + shape(list|tuple): The memory shape. NOTE the shape does not contain batch_size. + + value(float): the initialized value. + + need_reorder(bool): True if the initialized memory depends on the input sample. + + dtype(str|numpy.dtype): The data type of the initialized memory. Returns: The memory variable. @@ -2055,7 +1975,7 @@ class DynamicRNN(object): parent_block = self._parent_block_() for each in outputs: outside_array = parent_block.create_var( - name=unique_name.generate_with_ignorable_key("_".join( + name=unique_name.generate("_".join( [self.helper.name, "output_array", each.name])), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, dtype=each.dtype) @@ -2092,31 +2012,8 @@ class DynamicRNN(object): method)) -@templatedoc() +@autodoc() def reorder_lod_tensor_by_rank(x, rank_table): - """ - ${comment} - - Args: - - x(${x_type}): ${x_comment} - rank_table(${rank_table_type}): ${rank_table_type} - - Returns: - out(${out_type}): ${out_comment} - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - data_desc = (['input', [9], 0], ['ref', [5], 1]) - data = fluid.layers.data(name=data_desc[0][0], shape=data_desc[0][1]) - rank_data = fluid.layers.data(name=data_desc[1][0], shape=data_desc[1][1]) - table = fluid.layers.control_flow.lod_rank_table(rank_data) - new_data = fluid.layers.reorder_lod_tensor_by_rank( - x=data, rank_table=table) - - """ helper = LayerHelper('reorder_lod_tensor_by_rank', **locals()) helper.is_instance('x', Variable) helper.is_instance('rank_table', Variable) @@ -2149,12 +2046,9 @@ def is_empty(x, cond=None): Examples: .. code-block:: python - import paddle.fluid as fluid - input = fluid.layers.data(name="input", shape=[4, 32, 32], dtype="float32") res = fluid.layers.is_empty(x=input) # or: - # fluid.layers.is_empty(x=input, cond=res) - + fluid.layers.is_empty(x=input, cond=res) """ helper = LayerHelper("is_empty", **locals()) if cond is None: diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 36877269faa..dd50fc91248 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -38,9 +38,8 @@ __all__ = [ 'target_assign', 'detection_output', 'ssd_loss', + 'detection_map', 'rpn_target_assign', - 'retinanet_target_assign', - 'sigmoid_focal_loss', 'anchor_generator', 'roi_perspective_transform', 'generate_proposal_labels', @@ -53,171 +52,11 @@ __all__ = [ 'yolo_box', 'box_clip', 'multiclass_nms', - 'retinanet_detection_output', 'distribute_fpn_proposals', 'box_decoder_and_assign', - 'collect_fpn_proposals', ] -def retinanet_target_assign(bbox_pred, - cls_logits, - anchor_box, - anchor_var, - gt_boxes, - gt_labels, - is_crowd, - im_info, - num_classes=1, - positive_overlap=0.5, - negative_overlap=0.4): - """ - **Target Assign Layer for Retinanet .** - - This layer can be, for given the Intersection-over-Union (IoU) overlap - between anchors and ground truth boxes, to assign classification and - regression targets to each anchor, these target labels are used for training - retinanet. Every anchor is assigned with a length :attr:`num_classes` - one-hot vector of classification targets, and a 4-vector of box regression - targets. The assignment rules are as followed: - - 1. Anchors are assigned to ground-truth boxes when: (i) it has the highest - IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher - than positive_overlap(0.5) with any ground-truth box. - - 2.
Anchors are assigned to background when its IoU ratio is lower than - negative_overlap (0.4) for all ground-truth boxes. - - When an anchor is assigned with a ground-truth box which is the i-th category, - the i-th entry in its C vector of targets is set to 1 and all other entries - are set to 0. When an anchor is assigned with background, all entries are set - to 0. Anchors that are not assigned do not contribute to the training - objective. The regression targets are the encoded ground-truth boxes - associated with the assigned anchors. - - Args: - bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the - predicted locations of M bounding bboxes. N is the batch size, - and each bounding box has four coordinate values and the layout - is [xmin, ymin, xmax, ymax]. - cls_logits(Variable): A 3-D Tensor with shape [N, M, C] represents the - predicted confidence predictions. N is the batch size, C is the - number of classes (excluding background), M is number of bounding boxes. - anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes, - each box is represented as [xmin, ymin, xmax, ymax], - [xmin, ymin] is the left top coordinate of the anchor box, - if the input is image feature map, they are close to the origin - of the coordinate system. [xmax, ymax] is the right bottom - coordinate of the anchor box. - anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded - variances of anchors. - gt_boxes(Variable): The ground-truth bounding boxes (bboxes) are a 2D - LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth - bboxes of mini-batch input. - gt_labels(variable): The ground-truth labels are a 2D LoDTensor with - shape [Ng, 1], Ng is the total number of ground-truth labels of - mini-batch input. - is_crowd(Variable): A 1-D LoDTensor which indicates ground-truth is crowd. - im_info(Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size, - 3 is the height, width and scale. - num_classes(int32): The number of classes. - positive_overlap(float): Minimum overlap required between an anchor - and ground-truth box for the (anchor, gt box) pair to be a positive - example. - negative_overlap(float): Maximum overlap allowed between an anchor - and ground-truth box for the (anchor, gt box) pair to be a negative - examples. - - Returns: - tuple: - A tuple(predicted_scores, predicted_location, target_label, - target_bbox, bbox_inside_weight, fg_num) is returned. The - predicted_scores and predicted_location are the predicted result - of the retinanet.The target_label and target_bbox are the ground - truth, respectively. The predicted_location is a 2D Tensor with - shape [F, 4], and the shape of target_bbox is same as the shape of - the predicted_location, F is the number of the foreground - anchors. The predicted_scores is a 2D Tensor with shape - [F + B, C], and the shape of target_label is [F + B, 1], B is the - number of the background anchors, the F and B is depends on the - input of this operator. Bbox_inside_weight represents whether the - predicted location is fake foreground or not and the shape is [F, 4]. - Fg_num is the foreground number (including fake foreground) which - is needed by focal loss. - - Examples: - .. 
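code-block:: python

    # Editor's sketch (not part of the original patch): the two assignment
    # rules above in plain numpy, assuming `iou` is a precomputed
    # [num_anchors, num_gt] IoU matrix; all names here are illustrative.
    import numpy as np

    iou = np.random.rand(6, 2)            # hypothetical IoU matrix
    pos = iou.max(axis=1) > 0.5           # rule (ii): IoU above positive_overlap
    pos[iou.argmax(axis=0)] = True        # rule (i): best anchor for each gt box
    neg = ~pos & (iou.max(axis=1) < 0.4)  # background: below negative_overlap
    # anchors that are neither positive nor negative are ignored in training

..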
code-block:: python - - import paddle.fluid as fluid - bbox_pred = layers.data(name='bbox_pred', shape=[1, 100, 4], - append_batch_size=False, dtype='float32') - cls_logits = layers.data(name='cls_logits', shape=[1, 100, 10], - append_batch_size=False, dtype='float32') - anchor_box = layers.data(name='anchor_box', shape=[100, 4], - append_batch_size=False, dtype='float32') - anchor_var = layers.data(name='anchor_var', shape=[100, 4], - append_batch_size=False, dtype='float32') - gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], - append_batch_size=False, dtype='float32') - gt_labels = layers.data(name='gt_labels', shape=[10, 1], - append_batch_size=False, dtype='float32') - is_crowd = fluid.layers.data(name='is_crowd', shape=[1], - append_batch_size=False, dtype='float32') - im_info = fluid.layers.data(name='im_infoss', shape=[1, 3], - append_batch_size=False, dtype='float32') - loc_pred, score_pred, loc_target, score_target, bbox_inside_weight, fg_num = - fluid.layers.retinanet_target_assign(bbox_pred, cls_logits, anchor_box, - anchor_var, gt_boxes, gt_labels, is_crowd, im_info, 10) - - """ - - helper = LayerHelper('retinanet_target_assign', **locals()) - # Assign target label to anchors - loc_index = helper.create_variable_for_type_inference(dtype='int32') - score_index = helper.create_variable_for_type_inference(dtype='int32') - target_label = helper.create_variable_for_type_inference(dtype='int32') - target_bbox = helper.create_variable_for_type_inference( - dtype=anchor_box.dtype) - bbox_inside_weight = helper.create_variable_for_type_inference( - dtype=anchor_box.dtype) - fg_num = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="retinanet_target_assign", - inputs={ - 'Anchor': anchor_box, - 'GtBoxes': gt_boxes, - 'GtLabels': gt_labels, - 'IsCrowd': is_crowd, - 'ImInfo': im_info - }, - outputs={ - 'LocationIndex': loc_index, - 'ScoreIndex': score_index, - 'TargetLabel': target_label, - 'TargetBBox': target_bbox, - 'BBoxInsideWeight': bbox_inside_weight, - 'ForegroundNumber': fg_num - }, - attrs={ - 'positive_overlap': positive_overlap, - 'negative_overlap': negative_overlap - }) - - loc_index.stop_gradient = True - score_index.stop_gradient = True - target_label.stop_gradient = True - target_bbox.stop_gradient = True - bbox_inside_weight.stop_gradient = True - fg_num.stop_gradient = True - - cls_logits = nn.reshape(x=cls_logits, shape=(-1, num_classes)) - bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4)) - predicted_cls_logits = nn.gather(cls_logits, score_index) - predicted_bbox_pred = nn.gather(bbox_pred, loc_index) - - return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox, bbox_inside_weight, fg_num - - def rpn_target_assign(bbox_pred, cls_logits, anchor_box, @@ -302,24 +141,19 @@ def rpn_target_assign(bbox_pred, Examples: .. 
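code-block:: python

    # Editor's sketch (not part of the original patch): what the final
    # reshape-and-gather step of the operator above does, in numpy; the
    # arrays are illustrative stand-ins for cls_logits and score_index.
    import numpy as np

    cls_logits = np.random.rand(8, 1).astype('float32')  # [num_anchors, C]
    score_index = np.array([0, 3, 5], dtype='int32')     # sampled anchors
    predicted_scores = cls_logits[score_index]           # == fluid.layers.gather

..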
code-block:: python - import paddle.fluid as fluid - bbox_pred = fluid.layers.data(name='bbox_pred', shape=[100, 4], - append_batch_size=False, dtype='float32') - cls_logits = fluid.layers.data(name='cls_logits', shape=[100, 1], - append_batch_size=False, dtype='float32') - anchor_box = fluid.layers.data(name='anchor_box', shape=[20, 4], - append_batch_size=False, dtype='float32') - anchor_var = fluid.layers.data(name='anchor_var', shape=[20, 4], - append_batch_size=False, dtype='float32') - gt_boxes = fluid.layers.data(name='gt_boxes', shape=[10, 4], - append_batch_size=False, dtype='float32') - is_crowd = fluid.layers.data(name='is_crowd', shape=[1], - append_batch_size=False, dtype='float32') - im_info = fluid.layers.data(name='im_infoss', shape=[1, 3], - append_batch_size=False, dtype='float32') - loc_pred, score_pred, loc_target, score_target, bbox_inside_weight= - fluid.layers.rpn_target_assign(bbox_pred, cls_logits, - anchor_box, anchor_var, gt_boxes, is_crowd, im_info) + bbox_pred = layers.data(name='bbox_pred', shape=[100, 4], + append_batch_size=False, dtype='float32') + cls_logits = layers.data(name='cls_logits', shape=[100, 1], + append_batch_size=False, dtype='float32') + anchor_box = layers.data(name='anchor_box', shape=[20, 4], + append_batch_size=False, dtype='float32') + gt_boxes = layers.data(name='gt_boxes', shape=[10, 4], + append_batch_size=False, dtype='float32') + loc_pred, score_pred, loc_target, score_target, bbox_inside_weight = + fluid.layers.rpn_target_assign(bbox_pred=bbox_pred, + cls_logits=cls_logits, + anchor_box=anchor_box, + gt_boxes=gt_boxes) """ @@ -370,74 +204,6 @@ def rpn_target_assign(bbox_pred, return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox, bbox_inside_weight -def sigmoid_focal_loss(x, label, fg_num, gamma=2, alpha=0.25): - """ - **Sigmoid Focal Loss Operator.** - - Focal loss is used to address the foreground-background class imbalance existed - on the training phase of one-stage detectors. This operator computes the sigmoid - value for each element in the input tensor, after which focal loss is measured. - - The focal loss is given as followed: - - .. math:: - loss_j = (-label_j * alpha * {(1 - \\sigma(x_j))}^{gamma} * \\log(\\sigma(x_j)) - - (1 - labels_j) * (1 - alpha) * {(\sigma(x_j)}^{ gamma} * \\log(1 - \\sigma(x_j))) - / fg\_num, j = 1,...,K - - We know that - - .. math:: - \\sigma(x_j) = \\frac{1}{1 + \\exp(-x_j)} - - Args: - x(Variable): A 2-D tensor with shape [N, D], where N is the batch size and D is the number - of classes (excluding background). This input is a tensor of logits computed by the - previous operator. - label(Variable): A 2-D tensor with shape [N, 1], which is the probabilistic labels. - fg_num(Variable): A 1-D tensor with shape [1], which is the number of foreground. - - gamma(float): Hyper-parameter to balance the easy and hard examples. Default value is - set to 2.0. - alpha(float): Hyper-parameter to balance the positive and negative example. Default value - is set to 0.25. - - Returns: - out(Variable): A 2-D tensor with shape [N, D], which is the focal loss. - - Examples: - .. 
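code-block:: python

    # Editor's sketch (not part of the original patch): the focal-loss
    # formula above in plain numpy, assuming one-hot labels and a scalar
    # foreground count fg_num.
    import numpy as np

    def sigmoid_focal_loss_np(x, label, fg_num, gamma=2.0, alpha=0.25):
        s = 1.0 / (1.0 + np.exp(-x))  # sigma(x)
        loss = (-label * alpha * (1 - s) ** gamma * np.log(s)
                - (1 - label) * (1 - alpha) * s ** gamma * np.log(1 - s))
        return loss / fg_num

    x = np.random.randn(4, 3)
    label = np.eye(3)[[0, 2, 1, 0]]  # one-hot targets
    print(sigmoid_focal_loss_np(x, label, fg_num=4.0))

..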
code-block:: python - - import paddle.fluid as fluid - - input = fluid.layers.data( - name='data', shape=[10,80], append_batch_size=False, dtype='float32') - label = fluid.layers.data( - name='label', shape=[10,1], append_batch_size=False, dtype='int32') - fg_num = fluid.layers.data( - name='fg_num', shape=[1], append_batch_size=False, dtype='int32') - loss = fluid.layers.sigmoid_focal_loss(x=input, - label=label, - fg_num=fg_num, - gamma=2., - alpha=0.25) - """ - - helper = LayerHelper("sigmoid_focal_loss", **locals()) - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - helper.append_op( - type="sigmoid_focal_loss", - inputs={"X": x, - "Label": label, - "FgNum": fg_num}, - attrs={"gamma": gamma, - 'alpha': alpha}, - outputs={"Out": out}) - return out - - def detection_output(loc, scores, prior_box, @@ -509,15 +275,13 @@ def detection_output(loc, Examples: .. code-block:: python - import paddle.fluid as fluid - - pb = fluid.layers.data(name='prior_box', shape=[10, 4], + pb = layers.data(name='prior_box', shape=[10, 4], append_batch_size=False, dtype='float32') - pbv = fluid.layers.data(name='prior_box_var', shape=[10, 4], + pbv = layers.data(name='prior_box_var', shape=[10, 4], append_batch_size=False, dtype='float32') - loc = fluid.layers.data(name='target_box', shape=[2, 21, 4], + loc = layers.data(name='target_box', shape=[2, 21, 4], append_batch_size=False, dtype='float32') - scores = fluid.layers.data(name='scores', shape=[2, 21, 10], + scores = layers.data(name='scores', shape=[2, 21, 10], append_batch_size=False, dtype='float32') nmsed_outs = fluid.layers.detection_output(scores=scores, loc=loc, @@ -563,15 +327,6 @@ def iou_similarity(x, y, name=None): Returns: out(${out_type}): ${out_comment} - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - x = fluid.layers.data(name='x', shape=[4], dtype='float32') - y = fluid.layers.data(name='y', shape=[4], dtype='float32') - iou = fluid.layers.iou_similarity(x=x, y=y) """ helper = LayerHelper("iou_similarity", **locals()) if name is None: @@ -736,14 +491,6 @@ def polygon_box_transform(input, name=None): Returns: output(${output_type}): ${output_comment} - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - input = fluid.layers.data(name='input', shape=[4, 10, 5, 5], - append_batch_size=False, dtype='float32') - out = fluid.layers.polygon_box_transform(input) """ helper = LayerHelper("polygon_box_transform", **locals()) if name is None: @@ -1001,7 +748,6 @@ def detection_map(detect_res, Examples: .. code-block:: python - from fluid.layers import detection detect_res = fluid.layers.data( name='detect_res', shape=[10, 6], @@ -1013,7 +759,7 @@ def detection_map(detect_res, append_batch_size=False, dtype='float32') - map_out = detection.detection_map(detect_res, label, 21) + map_out = fluid.layers.detection_map(detect_res, label, 21) """ helper = LayerHelper("detection_map", **locals()) @@ -1159,7 +905,7 @@ def target_assign(input, this operator assigns classification/regression targets by performing the following steps: - 1. Assigning all outputs based on `match_indices`: + 1. Assigning all outpts based on `match_indices`: .. code-block:: text @@ -1206,22 +952,11 @@ def target_assign(input, .. 
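code-block:: python

    # Editor's sketch (not part of the original patch): a simplified numpy
    # view of the gather-or-fill semantics described above; a match index
    # of -1 means "unmatched" and receives mismatch_value with weight 0.
    import numpy as np

    x = np.arange(12, dtype='float32').reshape(4, 3)  # 4 candidates, dim 3
    match_indices = np.array([[2, -1], [0, 3]])       # [batch, num_prior]
    out = np.where(match_indices[..., None] >= 0,
                   x[match_indices], 0.0)             # mismatch_value = 0
    out_weight = (match_indices >= 0).astype('float32')

..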
code-block:: python - import paddle.fluid as fluid - x = fluid.layers.data( - name='x', - shape=[4, 20, 4], - dtype='float', - lod_level=1, - append_batch_size=False) - matched_id = fluid.layers.data( - name='indices', - shape=[8, 20], - dtype='int32', - append_batch_size=False) - trg, trg_weight = fluid.layers.target_assign( - x, - matched_id, - mismatch_value=0) + matched_indices, matched_dist = fluid.layers.bipartite_match(iou) + gt = layers.data( + name='gt', shape=[1, 1], dtype='int32', lod_level=1) + trg, trg_weight = layers.target_assign( + gt, matched_indices, mismatch_value=0) """ helper = LayerHelper('target_assign', **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -1813,16 +1548,6 @@ def multi_box_head(inputs, Examples: .. code-block:: python - import paddle.fluid as fluid - - images = fluid.layers.data(name='data', shape=[3, 300, 300], dtype='float32') - conv1 = fluid.layers.data(name='conv1', shape=[512, 19, 19], dtype='float32') - conv2 = fluid.layers.data(name='conv2', shape=[1024, 10, 10], dtype='float32') - conv3 = fluid.layers.data(name='conv3', shape=[512, 5, 5], dtype='float32') - conv4 = fluid.layers.data(name='conv4', shape=[256, 3, 3], dtype='float32') - conv5 = fluid.layers.data(name='conv5', shape=[256, 2, 2], dtype='float32') - conv6 = fluid.layers.data(name='conv6', shape=[128, 1, 1], dtype='float32') - mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head( inputs=[conv1, conv2, conv3, conv4, conv5, conv6], image=images, @@ -2106,7 +1831,6 @@ def roi_perspective_transform(input, .. code-block:: python import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[256, 28, 28], dtype='float32') rois = fluid.layers.data(name='rois', shape=[8], lod_level=1, dtype='float32') out = fluid.layers.roi_perspective_transform(x, rois, 7, 7, 1.0) @@ -2145,13 +1869,9 @@ def generate_proposal_labels(rpn_rois, bg_thresh_lo=0.0, bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], class_nums=None, - use_random=True, - is_cls_agnostic=False, - is_cascade_rcnn=False): + use_random=True): """ - ** Generate Proposal Labels of Faster-RCNN ** - This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth, to sample foreground boxes and background boxes, and compute loss target. @@ -2182,28 +1902,6 @@ def generate_proposal_labels(rpn_rois, bbox_reg_weights(list|tuple): Box regression weights. class_nums(int): Class number. use_random(bool): Use random sampling to choose foreground and background boxes. - is_cls_agnostic(bool): bbox regression use class agnostic simply which only represent fg and bg boxes. - is_cascade_rcnn(bool): it will filter some bbox crossing the image's boundary when setting True. - - Examples: - .. 
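code-block:: python

    # Editor's sketch (not part of the original patch): since this patch
    # drops the example above, a minimal call is kept here for reference;
    # the shapes and dtypes are assumptions read off the Args section.
    import paddle.fluid as fluid

    rpn_rois = fluid.layers.data(name='rpn_rois', shape=[4],
                                 dtype='float32', lod_level=1)
    gt_classes = fluid.layers.data(name='gt_classes', shape=[1],
                                   dtype='int32', lod_level=1)
    is_crowd = fluid.layers.data(name='is_crowd', shape=[1],
                                 dtype='int32', lod_level=1)
    gt_boxes = fluid.layers.data(name='gt_boxes', shape=[4],
                                 dtype='float32', lod_level=1)
    im_info = fluid.layers.data(name='im_info', shape=[3], dtype='float32')
    outs = fluid.layers.generate_proposal_labels(
        rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, class_nums=10)

..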
code-block:: python - - import paddle.fluid as fluid - rpn_rois = fluid.layers.data(name='rpn_rois', shape=[2, 4], - append_batch_size=False, dtype='float32') - gt_classes = fluid.layers.data(name='gt_classes', shape=[8, 1], - append_batch_size=False, dtype='float32') - is_crowd = fluid.layers.data(name='is_crowd', shape=[8, 1], - append_batch_size=False, dtype='float32') - gt_boxes = fluid.layers.data(name='gt_boxes', shape=[8, 4], - append_batch_size=False, dtype='float32') - im_info = fluid.layers.data(name='im_info', shape=[10, 3], - append_batch_size=False, dtype='float32') - rois, labels_int32, bbox_targets, bbox_inside_weights, - bbox_outside_weights = fluid.layers.generate_proposal_labels( - rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, - class_nums=10) - """ helper = LayerHelper('generate_proposal_labels', **locals()) @@ -2242,9 +1940,7 @@ def generate_proposal_labels(rpn_rois, 'bg_thresh_lo': bg_thresh_lo, 'bbox_reg_weights': bbox_reg_weights, 'class_nums': class_nums, - 'use_random': use_random, - 'is_cls_agnostic': is_cls_agnostic, - 'is_cascade_rcnn': is_cascade_rcnn + 'use_random': use_random }) rois.stop_gradient = True @@ -2336,8 +2032,6 @@ def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, Examples: .. code-block:: python - import paddle.fluid as fluid - im_info = fluid.layers.data(name="im_info", shape=[3], dtype="float32") gt_classes = fluid.layers.data(name="gt_classes", shape=[1], @@ -2346,19 +2040,15 @@ def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois, dtype="float32", lod_level=1) gt_masks = fluid.layers.data(name="gt_masks", shape=[2], dtype="float32", lod_level=3) - # rois, roi_labels can be the output of + # rois, labels_int32 can be the output of # fluid.layers.generate_proposal_labels. - rois = fluid.layers.data(name="rois", shape=[4], - dtype="float32", lod_level=1) - roi_labels = fluid.layers.data(name="roi_labels", shape=[1], - dtype="int32", lod_level=1) mask_rois, mask_index, mask_int32 = fluid.layers.generate_mask_labels( im_info=im_info, gt_classes=gt_classes, is_crowd=is_crowd, gt_segms=gt_masks, rois=rois, - labels_int32=roi_labels, + labels_int32=labels_int32, num_classes=81, resolution=14) """ @@ -2452,24 +2142,6 @@ def generate_proposals(scores, width < min_size. 0.1 by default. eta(float): Apply in adaptive NMS, if adaptive threshold > 0.5, adaptive_threshold = adaptive_threshold * eta in each iteration. - - Examples: - .. 
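code-block:: python

    # Editor's sketch (not part of the original patch): the delta-to-box
    # transform generate_proposals applies before clipping and NMS, shown
    # for a single anchor with the variance terms left out for brevity.
    import numpy as np

    anchor = np.array([10., 10., 50., 90.])  # [xmin, ymin, xmax, ymax]
    dx, dy, dw, dh = 0.1, -0.2, 0.3, 0.0     # one row of bbox_deltas
    aw, ah = anchor[2] - anchor[0], anchor[3] - anchor[1]
    acx, acy = anchor[0] + 0.5 * aw, anchor[1] + 0.5 * ah
    cx, cy = dx * aw + acx, dy * ah + acy    # shift the centre
    w, h = np.exp(dw) * aw, np.exp(dh) * ah  # rescale width and height
    proposal = [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h]

..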
code-block:: python - - import paddle.fluid as fluid - scores = fluid.layers.data(name='scores', shape=[2, 4, 5, 5], - append_batch_size=False, dtype='float32') - bbox_deltas = fluid.layers.data(name='bbox_deltas', shape=[2, 16, 5, 5], - append_batch_size=False, dtype='float32') - im_info = fluid.layers.data(name='im_info', shape=[2, 3], - append_batch_size=False, dtype='float32') - anchors = fluid.layers.data(name='anchors', shape=[5, 5, 4, 4], - append_batch_size=False, dtype='float32') - variances = fluid.layers.data(name='variances', shape=[5, 5, 10, 4], - append_batch_size=False, dtype='float32') - rois, roi_probs = fluid.layers.generate_proposals(scores, bbox_deltas, - im_info, anchors, variances) - """ helper = LayerHelper('generate_proposals', **locals()) @@ -2549,113 +2221,6 @@ def box_clip(input, im_info, name=None): return output -def retinanet_detection_output(bboxes, - scores, - anchors, - im_info, - score_threshold=0.05, - nms_top_k=1000, - keep_top_k=100, - nms_threshold=0.3, - nms_eta=1.): - """ - **Detection Output Layer for Retinanet.** - - This operation is to get the detection results by performing following - steps: - - 1. Decode top-scoring bounding box predictions per FPN level according - to the anchor boxes. - 2. Merge top predictions from all levels and apply multi-class non - maximum suppression (NMS) on them to get the final detections. - - Args: - bboxes(List): A list of tensors from multiple FPN levels. Each - element is a 3-D Tensor with shape [N, Mi, 4] representing the - predicted locations of Mi bounding boxes. N is the batch size, - Mi is the number of bounding boxes from i-th FPN level and each - bounding box has four coordinate values and the layout is - [xmin, ymin, xmax, ymax]. - scores(List): A list of tensors from multiple FPN levels. Each - element is a 3-D Tensor with shape [N, Mi, C] representing the - predicted confidence predictions. N is the batch size, C is the - class number (excluding background), Mi is the number of bounding - boxes from i-th FPN level. For each bounding box, there are total - C scores. - anchors(List): A 2-D Tensor with shape [Mi, 4] represents the locations - of Mi anchor boxes from all FPN level. Each bounding box has four - coordinate values and the layout is [xmin, ymin, xmax, ymax]. - im_info(Variable): A 2-D LoDTensor with shape [N, 3] represents the - image information. N is the batch size, each image information - includes height, width and scale. - score_threshold(float): Threshold to filter out bounding boxes - with a confidence score. - nms_top_k(int): Maximum number of detections per FPN layer to be - kept according to the confidences before NMS. - keep_top_k(int): Number of total bounding boxes to be kept per image after - NMS step. -1 means keeping all bounding boxes after NMS step. - nms_threshold(float): The threshold to be used in NMS. - nms_eta(float): The parameter for adaptive NMS. - - Returns: - Variable: - The detection output is a LoDTensor with shape [No, 6]. - Each row has six values: [label, confidence, xmin, ymin, xmax, ymax]. - `No` is the total number of detections in this mini-batch. For each - instance, the offsets in first dimension are called LoD, the offset - number is N + 1, N is the batch size. The i-th image has - `LoD[i + 1] - LoD[i]` detected results, if it is 0, the i-th image - has no detected results. If all images have no detected results, - LoD will be set to 0, and the output tensor is empty (None). - - Examples: - .. 
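code-block:: python

    # Editor's sketch (not part of the original patch): step 2 above relies
    # on greedy non-maximum suppression; a reference numpy version for
    # single-class [xmin, ymin, xmax, ymax] boxes.
    import numpy as np

    def nms(boxes, scores, thresh):
        order = scores.argsort()[::-1]  # process boxes by descending score
        keep = []
        while order.size > 0:
            i = order[0]
            keep.append(i)
            rest = order[1:]
            xx1 = np.maximum(boxes[i, 0], boxes[rest, 0])
            yy1 = np.maximum(boxes[i, 1], boxes[rest, 1])
            xx2 = np.minimum(boxes[i, 2], boxes[rest, 2])
            yy2 = np.minimum(boxes[i, 3], boxes[rest, 3])
            inter = np.maximum(0., xx2 - xx1) * np.maximum(0., yy2 - yy1)
            area = lambda b: (b[..., 2] - b[..., 0]) * (b[..., 3] - b[..., 1])
            iou = inter / (area(boxes[i]) + area(boxes[rest]) - inter)
            order = rest[iou <= thresh]  # drop boxes overlapping the winner
        return keep

..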
code-block:: python - - import paddle.fluid as fluid - - bboxes = layers.data(name='bboxes', shape=[1, 21, 4], - append_batch_size=False, dtype='float32') - scores = layers.data(name='scores', shape=[1, 21, 10], - append_batch_size=False, dtype='float32') - anchors = layers.data(name='anchors', shape=[21, 4], - append_batch_size=False, dtype='float32') - im_info = layers.data(name="im_info", shape=[1, 3], - append_batch_size=False, dtype='float32') - nmsed_outs = fluid.layers.retinanet_detection_output( - bboxes=[bboxes, bboxes], - scores=[scores, scores], - anchors=[anchors, anchors], - im_info=im_info, - score_threshold=0.05, - nms_top_k=1000, - keep_top_k=100, - nms_threshold=0.3, - nms_eta=1.) - """ - - helper = LayerHelper('retinanet_detection_output', **locals()) - output = helper.create_variable_for_type_inference( - dtype=helper.input_dtype('scores')) - helper.append_op( - type="retinanet_detection_output", - inputs={ - 'BBoxes': bboxes, - 'Scores': scores, - 'Anchors': anchors, - 'ImInfo': im_info - }, - attrs={ - 'score_threshold': score_threshold, - 'nms_top_k': nms_top_k, - 'nms_threshold': nms_threshold, - 'keep_top_k': keep_top_k, - 'nms_eta': 1., - }, - outputs={'Out': output}) - output.stop_gradient = True - return output - - def multiclass_nms(bboxes, scores, score_threshold, @@ -2908,68 +2473,3 @@ def box_decoder_and_assign(prior_box, "OutputAssignBox": output_assign_box }) return decoded_box, output_assign_box - - -def collect_fpn_proposals(multi_rois, - multi_scores, - min_level, - max_level, - post_nms_top_n, - name=None): - """ - Concat multi-level RoIs (Region of Interest) and select N RoIs - with respect to multi_scores. This operation performs the following steps: - - 1. Choose num_level RoIs and scores as input: num_level = max_level - min_level - 2. Concat multi-level RoIs and scores - 3. Sort scores and select post_nms_top_n scores - 4. Gather RoIs by selected indices from scores - 5. Re-sort RoIs by corresponding batch_id - - Args: - multi_ros(list): List of RoIs to collect - multi_scores(list): List of scores - min_level(int): The lowest level of FPN layer to collect - max_level(int): The highest level of FPN layer to collect - post_nms_top_n(int): The number of selected RoIs - name(str|None): A name for this layer(optional) - - Returns: - Variable: Output variable of selected RoIs. - - Examples: - .. 
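code-block:: python

    # Editor's sketch (not part of the original patch): steps 2-4 above for
    # a single image in numpy; step 5 (re-sort by batch id) only matters
    # when several images are batched together.
    import numpy as np

    rois = [np.random.rand(5, 4), np.random.rand(7, 4)]  # two FPN levels
    scores = [np.random.rand(5), np.random.rand(7)]
    all_rois = np.concatenate(rois)                      # step 2: concat
    all_scores = np.concatenate(scores)
    top = all_scores.argsort()[::-1][:6]                 # step 3: top-N scores
    fpn_rois = all_rois[top]                             # step 4: gather RoIs

..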
code-block:: python - - multi_rois = [] - multi_scores = [] - for i in range(4): - multi_rois.append(fluid.layers.data( - name='roi_'+str(i), shape=[4], dtype='float32', lod_level=1)) - for i in range(4): - multi_scores.append(fluid.layers.data( - name='score_'+str(i), shape=[1], dtype='float32', lod_level=1)) - - fpn_rois = fluid.layers.collect_fpn_proposals( - multi_rois=multi_rois, - multi_scores=multi_scores, - min_level=2, - max_level=5, - post_nms_top_n=2000) - """ - - helper = LayerHelper('collect_fpn_proposals', **locals()) - dtype = helper.input_dtype('multi_rois') - num_lvl = max_level - min_level + 1 - input_rois = multi_rois[:num_lvl] - input_scores = multi_scores[:num_lvl] - output_rois = helper.create_variable_for_type_inference(dtype) - output_rois.stop_gradient = True - helper.append_op( - type='collect_fpn_proposals', - inputs={ - 'MultiLevelRois': input_rois, - 'MultiLevelScores': input_scores - }, - outputs={'FpnRois': output_rois}, - attrs={'post_nms_topN': post_nms_top_n}) - return output_rois diff --git a/python/paddle/fluid/layers/device.py b/python/paddle/fluid/layers/device.py index 78226a52017..43ebd160de3 100644 --- a/python/paddle/fluid/layers/device.py +++ b/python/paddle/fluid/layers/device.py @@ -30,7 +30,7 @@ __all__ = [] def get_places(device_count=None, device_type=None): helper = LayerHelper('get_places', **locals()) out_places = helper.create_variable( - name=unique_name.generate_with_ignorable_key(helper.name + ".out")) + name=unique_name.generate(helper.name + ".out")) attrs = dict() if device_count is not None: attrs['device_count'] = int(device_count) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 79ad36e4a34..a2538fa0f9d 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -54,11 +54,6 @@ def data(name, All the input variables of this function are passed in as local variables to the LayerHelper constructor. - Notice that paddle would only use :code:`shape` to infer the shapes of - following variables in the network during compile-time. During run-time, - paddle would not check whether the shape of the feeded data matches the - :code:`shape` settings in this function. - Args: name(str): The name/alias of the function shape(list): Tuple declaring the shape. If :code:`append_batch_size` is @@ -67,12 +62,9 @@ def data(name, should be considered as the shape of the batched data. append_batch_size(bool): 1. If true, it prepends -1 to the shape. - For example if shape=[1], the resulting shape is [-1, 1]. This will - be useful to set different batch size at run time. - 2. If shape contains -1, such as shape=[1, -1]. - append_batch_size will be enforced to be be False (ineffective) - because PaddlePaddle cannot set more than 1 unknown number on the - shape. + For example if shape=[1], the resulting shape is [-1, 1]. + 2. If shape contains -1, such as shape=[1, -1], + append_batch_size will be enforced to be be False (ineffective). dtype(np.dtype|VarType|str): The type of data : float32, float16, int etc type(VarType): The output type. By default it is LOD_TENSOR. lod_level(int): The LoD Level. 0 means the input data is not a sequence. @@ -660,11 +652,11 @@ def py_reader(capacity, This layer returns a Reader Variable. The Reader provides :code:`decorate_paddle_reader()` and :code:`decorate_tensor_provider()` to set a Python generator as the data - source. More details :ref:`user_guide_use_py_reader_en` . 
When - :code:`Executor::Run()` is invoked in C++ side, the data from the generator - would be read automatically. Unlike :code:`DataFeeder.feed()`, the data - reading process and :code:`Executor::Run()` process can run in parallel - using :code:`py_reader`. The :code:`start()` method of the Reader should be + source in Python side. When :code:`Executor::Run()` is invoked in C++ + side, the data from the generator would be read automatically. Unlike + :code:`DataFeeder.feed()`, the data reading process and + :code:`Executor::Run()` process can run in parallel using + :code:`py_reader`. The :code:`start()` method of the Reader should be called when each pass begins, while the :code:`reset()` method should be called when the pass ends and :code:`fluid.core.EOFException` raises. Note that :code:`Program.clone()` method cannot clone :code:`py_reader`. @@ -902,7 +894,6 @@ def open_files(filenames, Examples: .. code-block:: python - import paddle.fluid as fluid reader = fluid.layers.io.open_files(filenames=['./data1.recordio', './data2.recordio'], shapes=[(3,224,224), (1,)], @@ -1000,19 +991,6 @@ def shuffle(reader, buffer_size): Returns: callable: the new reader whose output is shuffled. - - Examples: - .. code-block:: python - - raw_reader = fluid.layers.io.open_files(filenames=['./data1.recordio', - './data2.recordio'], - shapes=[(3,224,224), (1,)], - lod_levels=[0, 0], - dtypes=['float32', 'int64'], - thread_num=2, - buffer_size=2) - batch_reader = fluid.layers.batch(reader=raw_reader, batch_size=5) - shuffle_reader = fluid.layers.shuffle(reader=batch_reader, buffer_size=5000) """ return __create_unshared_decorated_reader__( 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) @@ -1076,8 +1054,7 @@ def double_buffer(reader, place=None, name=None): Examples: - >>> import paddle.fluid as fluid - >>> reader = fluid.layers.open_files(filenames=['mnist.recordio'], + >>> reader = fluid.layers.open_files(filenames=['somefile'], >>> shapes=[[-1, 784], [-1, 1]], >>> dtypes=['float32', 'int64']) >>> reader = fluid.layers.double_buffer(reader) @@ -1112,16 +1089,15 @@ def read_file(reader): Examples: .. code-block:: python - - import paddle.fluid as fluid + data_file = fluid.layers.open_files( filenames=['mnist.recordio'], shapes=[(-1, 748), (-1, 1)], lod_levels=[0, 0], dtypes=["float32", "int64"]) - data_file = fluid.layers.double_buffer( + data_file = fluid.layers.double_buffer( fluid.layers.batch(data_file, batch_size=64)) - input, label = fluid.layers.read_file(data_file) + input, label = fluid.layers.read_file(data_file) """ helper = LayerHelper('read_file') out = [ diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 278830c8e27..a9fdb10ae01 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -124,14 +124,14 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): Examples: .. 
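code-block:: python

    # Editor's sketch (not part of the original patch): the schedule that
    # exponential_decay computes, in plain Python; with staircase=True the
    # exponent is floored so the rate drops in discrete steps.
    import math

    def exp_decay(lr, global_step, decay_steps, decay_rate, staircase=False):
        exponent = global_step / float(decay_steps)
        if staircase:
            exponent = math.floor(exponent)
        return lr * decay_rate ** exponent

    print(exp_decay(0.1, 10000, 10000, 0.5, staircase=True))  # 0.05

..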
code-block:: python - import paddle.fluid as fluid base_lr = 0.1 sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) + learning_rate=fluid.layers.exponential_decay( + learning_rate=base_lr, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) """ with default_main_program()._lr_schedule_guard(): @@ -167,19 +167,6 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): Returns: The decayed learning rate - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - base_lr = 0.1 - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.natural_exp_decay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) - """ with default_main_program()._lr_schedule_guard(): if imperative_base.enabled(): @@ -223,14 +210,14 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): Examples: .. code-block:: python - import paddle.fluid as fluid base_lr = 0.1 sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.natural_exp_decay( - learning_rate=base_lr, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) + learning_rate=fluid.layers.inverse_time_decay( + learning_rate=base_lr, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) """ with default_main_program()._lr_schedule_guard(): if imperative_base.enabled(): diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 90689c0f377..734383655cf 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -42,9 +42,7 @@ def monkey_patch_variable(): 'shape': shape, 'value': value, 'force_cpu': force_init_on_cpu() - }, - stop_gradient=True) - var.stop_gradient = True + }) return var def create_scalar(block, value, dtype): @@ -70,10 +68,7 @@ def monkey_patch_variable(): 'value': value, 'input_dim_idx': batch_dim, 'output_dim_idx': batch_dim - }, - stop_gradient=True) - - var.stop_gradient = True + }) return var def astype(self, dtype): diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index 8fd50d28c39..b2d2c93ead8 100644 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -50,11 +50,10 @@ def accuracy(input, label, k=1, correct=None, total=None): Examples: .. code-block:: python - import paddle.fluid as fluid data = fluid.layers.data(name="data", shape=[-1, 32, 32], dtype="float32") - label = fluid.layers.data(name="label", shape=[-1,1], dtype="int32") + label = fluid.layers.data(name="data", shape=[-1,1], dtype="int32") predict = fluid.layers.fc(input=data, size=10) - accuracy_out = fluid.layers.accuracy(input=predict, label=label, k=5) + acc = fluid.layers.accuracy(input=predict, label=label, k=5) """ helper = LayerHelper("accuracy", **locals()) @@ -120,11 +119,9 @@ def auc(input, Examples: .. 
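code-block:: python

    # Editor's sketch (not part of the original patch): a numpy cross-check
    # of the AUC statistic via positive-rank sums (ties are not handled),
    # handy for sanity-testing the layer's output on small batches.
    import numpy as np

    def auc_np(scores, labels):
        ranks = np.empty(len(scores))
        ranks[np.argsort(scores)] = np.arange(1, len(scores) + 1)
        n_pos = labels.sum()
        n_neg = len(labels) - n_pos
        return (ranks[labels == 1].sum()
                - n_pos * (n_pos + 1) / 2.) / (n_pos * n_neg)

    print(auc_np(np.array([.1, .4, .35, .8]), np.array([0, 0, 1, 1])))  # 0.75

..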
code-block:: python - import paddle.fluid as fluid - data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") - label = fluid.layers.data(name="label", shape=[1], dtype="int32") - predict = fluid.layers.fc(input=data, size=2) - auc_out = fluid.layers.auc(input=predict, label=label) + # network is a binary classification model and label the ground truth + prediction = network(image, is_infer=True) + auc_out=fluid.layers.auc(input=prediction, label=label) """ helper = LayerHelper("auc", **locals()) auc_out = helper.create_variable_for_type_inference(dtype="float64") diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 1c5fa4aa3ff..d179f56c6ca 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -28,7 +28,7 @@ from ..framework import Variable, OpProtoHolder, in_dygraph_mode from ..dygraph import base from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ -from .tensor import concat, assign, fill_constant +from .tensor import concat, assign from . import utils from .. import unique_name from functools import reduce @@ -201,10 +201,6 @@ __all__ = [ 'fsp_matrix', 'continuous_value_model', 'where', - 'sign', - 'deformable_conv', - 'unfold', - 'deformable_roi_pooling', ] kIgnoreIndex = -100 @@ -388,9 +384,9 @@ def embedding(input, Examples: .. code-block:: python - import paddle.fluid as fluid - data = fluid.layers.data(name='sequence', shape=[1], dtype='int64', lod_level=1) - emb = fluid.layers.embedding(input=data, size=[128, 64]) + dict_size = len(dataset.ids) + data = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') + fc = fluid.layers.embedding(input=data, size=[dict_size, 16]) """ helper = LayerHelper('embedding', **locals()) @@ -486,18 +482,10 @@ def dynamic_lstm(input, Examples: .. code-block:: python - - emb_dim = 256 - vocab_size = 10000 - hidden_dim = 512 - - data = fluid.layers.data(name='x', shape=[1], - dtype='int32', lod_level=1) - emb = fluid.layers.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True) - forward_proj = fluid.layers.fc(input=emb, size=hidden_dim * 4, + hidden_dim = 512 + forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, bias_attr=False) - forward, _ = fluid.layers.dynamic_lstm( input=forward_proj, size=hidden_dim * 4, use_peepholes=False) """ @@ -636,23 +624,20 @@ def lstm(input, Examples: .. code-block:: python - - emb_dim = 256 - vocab_size = 10000 - data = fluid.layers.data(name='x', shape=[-1, 100, 1], - dtype='int32') - emb = fluid.layers.embedding(input=data, size=[vocab_size, emb_dim], is_sparse=True) + + input = embedding batch_size = 20 max_len = 100 dropout_prob = 0.2 input_size = 100 hidden_size = 150 num_layers = 1 - init_h = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 ) - init_c = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0 ) - rnn_out, last_h, last_c = layers.lstm( emb, init_h, init_c, \ - max_len, hidden_size, num_layers, \ - dropout_prob=dropout_prob) + init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) + init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) + + rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \ + max_len, dropout_prob, input_size, hidden_size, \ + num_layers) """ helper = LayerHelper('cudnn_lstm', **locals()) @@ -1055,8 +1040,6 @@ def dynamic_gru(input, .. 
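code-block:: python

    # Editor's sketch (not part of the original patch): a textbook GRU step
    # as a reading aid only; Paddle's kernel fuses the three projections and
    # its gate ordering and activation defaults may differ from this.
    import numpy as np

    def gru_cell(x, h_prev, wu, wr, wc):
        sigmoid = lambda v: 1. / (1. + np.exp(-v))
        xh = np.concatenate([x, h_prev])
        u = sigmoid(xh @ wu)                               # update gate
        r = sigmoid(xh @ wr)                               # reset gate
        c = np.tanh(np.concatenate([x, r * h_prev]) @ wc)  # candidate state
        return u * h_prev + (1. - u) * c

    x, h = np.random.rand(4), np.random.rand(3)
    w = lambda: np.random.rand(7, 3)
    print(gru_cell(x, h, w(), w(), w()))

..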
code-block:: python - - import paddle.fluid as fluid - - dict_dim, emb_dim = 128, 64 data = fluid.layers.data(name='sequence', shape=[1], dtype='int32', lod_level=1) @@ -1194,17 +1177,10 @@ def gru_unit(input, .. code-block:: python - import paddle.fluid as fluid - - dict_dim, emb_dim = 128, 64 - data = fluid.layers.data(name='step_data', shape=[1], dtype='int32') - emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) - hidden_dim = 512 - x = fluid.layers.fc(input=emb, size=hidden_dim * 3) - pre_hidden = fluid.layers.data( - name='pre_hidden', shape=[hidden_dim], dtype='float32') - hidden = fluid.layers.gru_unit( - input=x, hidden=pre_hidden, size=hidden_dim * 3) + # assuming we have x_t_data and prev_hidden of size=10 + x_t = fluid.layers.fc(input=x_t_data, size=30) + hidden_val, r_h_val, gate_val = fluid.layers.gru_unit(input=x_t, + hidden = prev_hidden) """ activation_dict = dict( @@ -1268,19 +1244,6 @@ def linear_chain_crf(input, label, param_attr=None): output(${transition_exps_type}): ${transition_exps_comment} \n output(${log_likelihood_type}): ${log_likelihood_comment} - Examples: - .. code-block:: python - - import paddle.fluid as fluid - emission = fluid.layers.data(name='emission', shape=[1000], dtype='float32') - target = fluid.layers.data(name='target', shape=[1], dtype='int32') - crf_cost = fluid.layers.linear_chain_crf( - input=emission, - label=target, - param_attr=fluid.ParamAttr( - name='crfw', - learning_rate=0.2)) - """ helper = LayerHelper('linear_chain_crf', **locals()) size = input.shape[1] @@ -1567,16 +1530,14 @@ def cross_entropy2(input, label, ignore_index=kIgnoreIndex): def bpr_loss(input, label, name=None): """ - **Bayesian Personalized Ranking Loss Operator** + Bayesian Personalized Ranking Loss Operator. This operator belongs to pairwise ranking loss. Label is the desired item. The loss at a given point in one session is defined as: - - .. math:: - Y[i] = 1/(N[i] - 1) * \sum_j{\log(\sigma(X[i, Label[i]]-X[i, j]))} + $Y[i] = -\frac{1}{N_{i}-1} * \sum_{0\le j<N_{i},~ j\neq Label[i]}\log(\sigma(X[i, Label[i]]-X[i, j]))$ + + Learn more details by reading paper <session-based recommendations with recurrent + neural networks>(https://arxiv.org/abs/1511.06939) Args: input (Variable|list): a 2-D tensor with shape [N x D], where N is the @@ -1592,15 +1553,9 @@ def bpr_loss(input, label, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - - neg_size = 10 - label = fluid.layers.data( - name="label", shape=[1], dtype="int64") - predict = fluid.layers.data( - name="predict", shape=[neg_size + 1], dtype="float32") cost = fluid.layers.bpr_loss(input=predict, label=label) """ + helper = LayerHelper('bpr_loss', **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( @@ -1747,21 +1702,10 @@ def chunk_eval(input, Examples: ..
code-block:: python - import paddle.fluid as fluid - - dict_size = 10000 - label_dict_len = 7 - sequence = fluid.layers.data( - name='id', shape=[1], lod_level=1, dtype='int64') - embedding = fluid.layers.embedding( - input=sequence, size=[dict_size, 512]) - hidden = fluid.layers.fc(input=embedding, size=512) - label = fluid.layers.data( - name='label', shape=[1], lod_level=1, dtype='int32') crf = fluid.layers.linear_chain_crf( - input=hidden, label=label, param_attr=fluid.ParamAttr(name="crfw")) + input=hidden, label=label, param_attr=ParamAttr(name="crfw")) crf_decode = fluid.layers.crf_decoding( - input=hidden, param_attr=fluid.ParamAttr(name="crfw")) + input=hidden, param_attr=ParamAttr(name="crfw")) fluid.layers.chunk_eval( input=crf_decode, label=label, @@ -1837,13 +1781,6 @@ def sequence_conv(input, Returns: Variable: output of sequence_conv - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[10,10], append_batch_size=False, dtype='float32') - x_conved = fluid.layers.sequence_conv(x,2) """ assert not in_dygraph_mode(), ( @@ -1964,8 +1901,6 @@ def softmax(input, use_cudnn=False, name=None, axis=-1): .. code-block:: python - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[2], dtype='float32') fc = fluid.layers.fc(input=x, size=10) # perform softmax in the second dimension softmax = fluid.layers.softmax(input=fc, axis=1) @@ -2237,7 +2172,7 @@ def conv3d(input, Args: input (Variable): The input image with [N, C, D, H, W] format. - num_filters(int): The number of filter. It is as same as the output + num_filters(int): The number of filter. It is as same as the output image channel. filter_size (int|tuple|None): The filter size. If filter_size is a tuple, it must contain three integers, (filter_size_D, filter_size_H, filter_size_W). @@ -2348,7 +2283,7 @@ def conv3d(input, return helper.append_activation(pre_act) -def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): +def sequence_pool(input, pool_type, is_test=False): """ This function add the operator for sequence pooling. It pools features of all time-steps of each instance, and is applied @@ -2363,32 +2298,29 @@ def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): .. code-block:: text - x is a 1-level LoDTensor and **pad_value** = 0.0: - x.lod = [[2, 3, 2, 0]] + x is a 1-level LoDTensor: + x.lod = [[2, 3, 2]] x.data = [1, 3, 2, 4, 6, 5, 1] x.dims = [7, 1] then output is a Tensor: - out.dim = [4, 1] + out.dim = [3, 1] with condition len(x.lod[-1]) == out.dims[0] for different pool_type: - average: out.data = [2, 4, 3, 0.0], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 - sum : out.data = [4, 12, 6, 0.0], where 4=1+3, 12=2+4+6, 6=5+1 - sqrt : out.data = [2.82, 6.93, 4.24, 0.0], where 2.82=(1+3)/sqrt(2), + average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2 + sum : out.data = [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1 + sqrt : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2), 6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2) - max : out.data = [3, 6, 5, 0.0], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1) - last : out.data = [3, 6, 1, 0.0], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1) - first : out.data = [1, 2, 5, 0.0], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1) - - and all above 0.0 = **pad_value**. 
+ max : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1) + last : out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1) + first : out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1) Args: - input (variable): The input variable which is a LoDTensor. + input(variable): The input variable which is a LoDTensor. pool_type (string): The pooling type of sequence_pool. It supports average, sum, sqrt and max. - is_test (bool): Used to distinguish training from scoring mode. Default False. - pad_value (float): Used to pad the pooling result for empty input sequence. + is_test(bool, Default False): Used distinguish training from scoring mode. Returns: The sequence pooling variable which is a Tensor. @@ -2397,8 +2329,6 @@ def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): .. code-block:: python - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[7, 1], dtype='float32', lod_level=1) avg_x = fluid.layers.sequence_pool(input=x, pool_type='average') @@ -2420,11 +2350,8 @@ def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): inputs={"X": input}, outputs={"Out": pool_out, "MaxIndex": max_index}, - attrs={ - "pooltype": pool_type.upper(), - "is_test": is_test, - "pad_value": pad_value - }) + attrs={"pooltype": pool_type.upper(), + "is_test": is_test}) # when pool_type is max, variable max_index is initialized, # so we stop the gradient explicitly here @@ -2450,10 +2377,7 @@ def sequence_concat(input, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[10], dtype='float32') - y = fluid.layers.data(name='y', shape=[10], dtype='float32') - out = fluid.layers.sequence_concat(input=[x, y]) + out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3]) """ assert not in_dygraph_mode(), ( "sequence layer is not supported in dygraph mode yet.") @@ -2982,12 +2906,9 @@ def adaptive_pool3d(input, # output[:, :, i, j, k] = # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) # - - import paddle.fluid as fluid - data = fluid.layers.data( - name='data', shape=[3, 32, 32, 32], dtype='float32') - pool_out = fluid.layers.adaptive_pool3d( + name='data', shape=[3, 32, 32], dtype='float32') + pool_out, mask = fluid.layers.adaptive_pool3d( input=data, pool_size=[3, 3, 3], pool_type='avg') @@ -3098,24 +3019,18 @@ def batch_norm(input, numerical stability. Default is 1e-5. param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. - If the Initializer of the param_attr is not set, the parameter is initialized - with Xavier. Default: None. + will create ParamAttr as param_attr. If the Initializer of the param_attr + is not set, the parameter is initialized with Xavier. Default: None. bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm - will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. - If the Initializer of the bias_attr is not set, the bias is initialized zero. - Default: None. + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. data_layout(string, default NCHW): NCHW|NHWC in_place(bool, Default False): Make the input and output of batch norm reuse memory. 
name(string, Default None): A name for this layer(optional). If set None, the layer will be named automatically. - moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. If it - is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm - will save global mean with the string. + moving_mean_name(string, Default None): The name of moving_mean which store the global Mean. moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance. - If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm - will save global variance with the string. do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not. fuse_with_relu (bool): if True, this OP performs relu after batch norm. use_global_stats(bool, Default False): Whether to use global mean and @@ -3276,11 +3191,9 @@ def data_norm(input, Examples: .. code-block:: python - - import paddle.fluid as fluid - hidden1 = fluid.layers.data(name="hidden1", shape=[200]) - hidden2 = fluid.layers.data_norm(name="hidden2", input=hidden1) + data = fluid.layers.data(input=x, size=200, param_attr='fc1.w') + hidden2 = fluid.layers.data_norm(input=hidden1) """ helper = LayerHelper('data_norm', **locals()) dtype = helper.input_dtype() @@ -3586,13 +3499,10 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): Variable: A tensor variable of weight parameters after spectral normalization. Examples: - .. code-block:: python - import paddle.fluid as fluid - - weight = fluid.layers.data(name='weight', shape=[2, 8, 32, 32], - append_batch_size=False, dtype='float32') - x = fluid.layers.spectral_norm(weight=weight, dim=1, power_iters=2) + >>> weight = fluid.layers.data(name='weight', shape=[8, 32, 32], + >>> dtype='float32') + >>> x = fluid.layers.spectral_norm(weight=data, dim=1, power_iters=2) """ helper = LayerHelper('spectral_norm', **locals()) dtype = weight.dtype @@ -4054,8 +3964,7 @@ def sequence_expand(x, y, ref_level=-1, name=None): Examples: .. code-block:: python - - import paddle.fluid.layers as layers + x = fluid.layers.data(name='x', shape=[10], dtype='float32') y = fluid.layers.data(name='y', shape=[10, 20], dtype='float32', lod_level=1) @@ -4123,7 +4032,6 @@ def sequence_expand_as(x, y, name=None): Examples: .. code-block:: python - import paddle.fluid.layers as layers x = fluid.layers.data(name='x', shape=[10], dtype='float32') y = fluid.layers.data(name='y', shape=[10, 20], @@ -4341,25 +4249,16 @@ def beam_search(pre_ids, Examples: .. code-block:: python - import paddle.fluid as fluid - # Suppose `probs` contains predicted results from the computation # cell and `pre_ids` and `pre_scores` is the output of beam_search # at previous step. 
- beam_size = 4 - end_id = 1 - pre_ids = fluid.layers.data( - name='pre_id', shape=[1], lod_level=2, dtype='int64') - pre_scores = fluid.layers.data( - name='pre_scores', shape=[1], lod_level=2, dtype='float32') - probs = fluid.layers.data( - name='probs', shape=[10000], dtype='float32') - topk_scores, topk_indices = fluid.layers.topk(probs, k=beam_size) - accu_scores = fluid.layers.elementwise_add( - x=fluid.layers.log(x=topk_scores), - y=fluid.layers.reshape(pre_scores, shape=[-1]), + topk_scores, topk_indices = layers.topk(probs, k=beam_size) + accu_scores = layers.elementwise_add( + x=layers.log(x=topk_scores)), + y=layers.reshape( + pre_scores, shape=[-1]), axis=0) - selected_ids, selected_scores = fluid.layers.beam_search( + selected_ids, selected_scores = layers.beam_search( pre_ids=pre_ids, pre_scores=pre_scores, ids=topk_indices, @@ -4433,13 +4332,9 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - # Suppose `ids` and `scores` are LodTensorArray variables reserving # the selected ids and scores of all steps - ids = fluid.layers.create_array(dtype='int64') - scores = fluid.layers.create_array(dtype='float32') - finished_ids, finished_scores = fluid.layers.beam_search_decode( + finished_ids, finished_scores = layers.beam_search_decode( ids, scores, beam_size=5, end_id=0) """ helper = LayerHelper('beam_search_decode', **locals()) @@ -4499,7 +4394,7 @@ def lstm_unit(x_t, i_t = \sigma(L_{i_t}) - This layer has two outputs including :math:`h_t` and :math:`c_t`. + This layer has two outputs including :math:`h_t` and :math:`o_t`. Args: x_t (Variable): The input value of current step, a 2-D tensor with shape @@ -4537,19 +4432,12 @@ def lstm_unit(x_t, .. code-block:: python - import paddle.fluid as fluid - - dict_dim, emb_dim, hidden_dim = 128, 64, 512 - data = fluid.layers.data(name='step_data', shape=[1], dtype='int32') - x = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) - pre_hidden = fluid.layers.data( - name='pre_hidden', shape=[hidden_dim], dtype='float32') - pre_cell = fluid.layers.data( - name='pre_cell', shape=[hidden_dim], dtype='float32') - hidden = fluid.layers.lstm_unit( - x_t=x, - hidden_t_prev=pre_hidden, - cell_t_prev=pre_cell) + x_t = fluid.layers.fc(input=x_t_data, size=10) + prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=30) + prev_cell = fluid.layers.fc(input=prev_cell_data, size=30) + hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t, + hidden_t_prev=prev_hidden, + cell_t_prev=prev_cell) """ helper = LayerHelper('lstm_unit', **locals()) @@ -4618,24 +4506,21 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] # Each example is followed by the corresponding output tensor. - x = fluid.layers.data(name='x', shape=[4, 2], dtype='float32') fluid.layers.reduce_sum(x) # [3.5] fluid.layers.reduce_sum(x, dim=0) # [0.3, 0.5, 1.1, 1.6] fluid.layers.reduce_sum(x, dim=-1) # [1.9, 1.6] fluid.layers.reduce_sum(x, dim=1, keep_dim=True) # [[1.9], [1.6]] - # y is a Tensor variable with shape [2, 2, 2] and elements as below: + # x is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1, 2], [3, 4]], # [[5, 6], [7, 8]]] # Each example is followed by the corresponding output tensor. 
- y = fluid.layers.data(name='y', shape=[2, 2, 2], dtype='float32') - fluid.layers.reduce_sum(y, dim=[1, 2]) # [10, 26] - fluid.layers.reduce_sum(y, dim=[0, 1]) # [16, 20] + fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26] + fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20] """ helper = LayerHelper('reduce_sum', **locals()) @@ -4678,24 +4563,22 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] # Each example is followed by the correspending output tensor. - x = fluid.layers.data(name='x', shape=[4, 2], dtype='float32') fluid.layers.reduce_mean(x) # [0.4375] fluid.layers.reduce_mean(x, dim=0) # [0.15, 0.25, 0.55, 0.8] fluid.layers.reduce_mean(x, dim=-1) # [0.475, 0.4] - fluid.layers.reduce_mean(x, dim=1, keep_dim=True) # [[0.475], [0.4]] + fluid.layers.reduce_mean( + x, dim=1, keep_dim=True) # [[0.475], [0.4]] - # y is a Tensor variable with shape [2, 2, 2] and elements as below: + # x is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] # Each example is followed by the correspending output tensor. - y = fluid.layers.data(name='y', shape=[2, 2, 2], dtype='float32') - fluid.layers.reduce_mean(y, dim=[1, 2]) # [2.5, 6.5] - fluid.layers.reduce_mean(y, dim=[0, 1]) # [4.0, 5.0] + fluid.layers.reduce_mean(x, dim=[1, 2]) # [2.5, 6.5] + fluid.layers.reduce_mean(x, dim=[0, 1]) # [4.0, 5.0] """ helper = LayerHelper('reduce_mean', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) @@ -4736,24 +4619,21 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] # Each example is followed by the correspending output tensor. - x = fluid.layers.data(name='x', shape=[4, 2], dtype='float32') fluid.layers.reduce_max(x) # [0.9] fluid.layers.reduce_max(x, dim=0) # [0.2, 0.3, 0.6, 0.9] fluid.layers.reduce_max(x, dim=-1) # [0.9, 0.7] fluid.layers.reduce_max(x, dim=1, keep_dim=True) # [[0.9], [0.7]] - # y is a Tensor variable with shape [2, 2, 2] and elements as below: + # x is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] # Each example is followed by the correspending output tensor. - y = fluid.layers.data(name='y', shape=[2, 2, 2], dtype='float32') - fluid.layers.reduce_max(y, dim=[1, 2]) # [4.0, 8.0] - fluid.layers.reduce_max(y, dim=[0, 1]) # [7.0, 8.0] + fluid.layers.reduce_max(x, dim=[1, 2]) # [4.0, 8.0] + fluid.layers.reduce_max(x, dim=[0, 1]) # [7.0, 8.0] """ helper = LayerHelper('reduce_max', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) @@ -4794,24 +4674,21 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] # Each example is followed by the correspending output tensor. 
- x = fluid.layers.data(name='x', shape=[4, 2], dtype='float32') fluid.layers.reduce_min(x) # [0.1] fluid.layers.reduce_min(x, dim=0) # [0.1, 0.2, 0.5, 0.7] fluid.layers.reduce_min(x, dim=-1) # [0.2, 0.1] fluid.layers.reduce_min(x, dim=1, keep_dim=True) # [[0.2], [0.1]] - # y is a Tensor variable with shape [2, 2, 2] and elements as below: + # x is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] # Each example is followed by the correspending output tensor. - y = fluid.layers.data(name='y', shape=[2, 2, 2], dtype='float32') - fluid.layers.reduce_min(y, dim=[1, 2]) # [1.0, 5.0] - fluid.layers.reduce_min(y, dim=[0, 1]) # [1.0, 2.0] + fluid.layers.reduce_min(x, dim=[1, 2]) # [1.0, 5.0] + fluid.layers.reduce_min(x, dim=[0, 1]) # [1.0, 2.0] """ helper = LayerHelper('reduce_min', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) @@ -4852,25 +4729,22 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] # Each example is followed by the correspending output tensor. - x = fluid.layers.data(name='x', shape=[4, 2], dtype='float32') fluid.layers.reduce_prod(x) # [0.0002268] fluid.layers.reduce_prod(x, dim=0) # [0.02, 0.06, 0.3, 0.63] fluid.layers.reduce_prod(x, dim=-1) # [0.027, 0.0084] fluid.layers.reduce_prod(x, dim=1, keep_dim=True) # [[0.027], [0.0084]] - # y is a Tensor variable with shape [2, 2, 2] and elements as below: + # x is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1.0, 2.0], [3.0, 4.0]], # [[5.0, 6.0], [7.0, 8.0]]] # Each example is followed by the correspending output tensor. - y = fluid.layers.data(name='y', shape=[2, 2, 2], dtype='float32') - fluid.layers.reduce_prod(y, dim=[1, 2]) # [24.0, 1680.0] - fluid.layers.reduce_prod(y, dim=[0, 1]) # [105.0, 384.0] + fluid.layers.reduce_prod(x, dim=[1, 2]) # [24.0, 1680.0] + fluid.layers.reduce_prod(x, dim=[0, 1]) # [105.0, 384.0] """ helper = LayerHelper('reduce_prod', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) @@ -5011,21 +4885,16 @@ def split(input, num_or_sections, dim=-1, name=None): Examples: .. 
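code-block:: python

    # Editor's sketch (not part of the original patch): the two forms of
    # num_or_sections mirrored in numpy; note that np.split takes cut
    # points where fluid.layers.split takes section sizes.
    import numpy as np

    x = np.random.rand(3, 9, 5)
    a, b, c = np.split(x, 3, axis=1)       # three equal [3, 3, 5] pieces
    d, e, f = np.split(x, [2, 5], axis=1)  # sizes [2, 3, 4] along dim 1

..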
code-block:: python - import paddle.fluid as fluid - - # input is a variable which shape is [-1, 3, 9, 5] - input = fluid.layers.data( - name="input", shape=[3, 9, 5], dtype="float32") - - x0, x1, x2 = fluid.layers.split(x, num_or_sections=3, dim=2) - # x0.shape [-1, 3, 3, 5] - # x1.shape [-1, 3, 3, 5] - # x2.shape [-1, 3, 3, 5] - - x0, x1, x2 = fluid.layers.split(input, num_or_sections=3, dim=2) - # x0.shape [-1, 3, 2, 5] - # x1.shape [-1, 3, 3, 5] - # x2.shape [-1, 3, 4, 5] + # x is a Tensor variable with shape [3, 9, 5]: + x0, x1, x2 = fluid.layers.split(x, num_or_sections=3, dim=1) + x0.shape # [3, 3, 5] + x1.shape # [3, 3, 5] + x2.shape # [3, 3, 5] + x0, x1, x2 = fluid.layers.split( + x, num_or_sections=[2, 3, 4], dim=1) + x0.shape # [3, 2, 5] + x1.shape # [3, 3, 5] + x2.shape # [3, 4, 5] """ helper = LayerHelper('split', **locals()) input_shape = input.shape @@ -5155,29 +5024,25 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): # Examples to clarify shapes of the inputs and output # x: [B, ..., M, K], y: [B, ..., K, N] - # fluid.layers.matmul(x, y) # out: [B, ..., M, N] + fluid.layers.matmul(x, y) # out: [B, ..., M, N] # x: [B, M, K], y: [B, K, N] - # fluid.layers.matmul(x, y) # out: [B, M, N] + fluid.layers.matmul(x, y) # out: [B, M, N] # x: [B, M, K], y: [K, N] - # fluid.layers.matmul(x, y) # out: [B, M, N] + fluid.layers.matmul(x, y) # out: [B, M, N] # x: [M, K], y: [K, N] - # fluid.layers.matmul(x, y) # out: [M, N] + fluid.layers.matmul(x, y) # out: [M, N] # x: [B, M, K], y: [K] - # fluid.layers.matmul(x, y) # out: [B, M] + fluid.layers.matmul(x, y) # out: [B, M] # x: [K], y: [K] - # fluid.layers.matmul(x, y) # out: [1] + fluid.layers.matmul(x, y) # out: [1] # x: [M], y: [N] - # fluid.layers.matmul(x, y, True, True) # out: [M, N] - - x = fluid.layers.data(name='x', shape=[2, 3], dtype='float32') - y = fluid.layers.data(name='y', shape=[3, 2], dtype='float32') - out = fluid.layers.matmul(x, y, True, True) + fluid.layers.matmul(x, y, True, True) # out: [M, N] """ def __check_input(x, y): @@ -5277,8 +5142,6 @@ def topk(input, k, name=None): Examples: .. code-block:: python - import paddle.fluid.layers as layers - input = layers.data(name="input", shape=[13, 11], dtype='float32') top5_values, top5_indices = layers.topk(input, k=5) """ helper = LayerHelper("top_k", **locals()) @@ -5303,7 +5166,7 @@ def topk(input, k, name=None): def edit_distance(input, label, normalized=True, ignored_tokens=None): """ - Edit distance operator computes the edit distances between a batch of + EditDistance operator computes the edit distances between a batch of hypothesis strings and their references. Edit distance, also called Levenshtein distance, measures how dissimilar two strings are by counting the minimum number of operations to transform one string into another. @@ -5339,28 +5202,9 @@ def edit_distance(input, label, normalized=True, ignored_tokens=None): Examples: ..
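code-block:: python

            # A minimal runnable sketch (an editorial addition): the op
            # consumes integer token ids, so int64 inputs are assumed here,
            # and it returns the distances plus a sequence-number tensor, as
            # in the fuller example this hunk deletes.
            import paddle.fluid as fluid
            x = fluid.layers.data(name='x', shape=[1], dtype='int64')
            y = fluid.layers.data(name='y', shape=[1], dtype='int64')
            cost, _ = fluid.layers.edit_distance(input=x, label=y)

    Examples:
        ..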
code-block:: python - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[1], dtype='int64') - y = fluid.layers.data(name='y', shape=[1], dtype='int64') - cost, _ = fluid.layers.edit_distance(input=x, label=y) - - cpu = fluid.core.CPUPlace() - exe = fluid.Executor(cpu) - exe.run(fluid.default_startup_program()) - - import numpy - x_ = numpy.random.randint(5, size=(2, 1)).astype('int64') - y_ = numpy.random.randint(5, size=(2, 1)).astype('int64') - - print(x_) - print(y_) - - x = fluid.create_lod_tensor(x_, [[2]], cpu) - y = fluid.create_lod_tensor(y_, [[2]], cpu) - - outs = exe.run(feed={'x':x, 'y':y}, fetch_list=[cost.name]) - - print(outs) + x = fluid.layers.data(name='x', shape=[1], dtype='float32') + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.edit_distance(input=x, label=y) """ helper = LayerHelper("edit_distance", **locals()) @@ -5515,11 +5359,8 @@ def warpctc(input, label, blank=0, norm_by_times=False, use_cudnn=False): .. code-block:: python - import paddle.fluid as fluid - label = fluid.layers.data(name='label', shape=[11, 8], - dtype='float32', lod_level=1) - predict = fluid.layers.data(name='predict', shape=[11, 1], - dtype='float32') + label = fluid.layers.data(shape=[11, 8], dtype='float32', lod_level=1) + predict = fluid.layers.data(shape=[11, 1], dtype='float32') cost = fluid.layers.warpctc(input=predict, label=label) """ @@ -5585,9 +5426,8 @@ def sequence_reshape(input, new_dim): Examples: .. code-block:: python - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[2, 6], append_batch_size=False, dtype='float32', lod_level=1) - x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=4) + x = fluid.layers.data(shape=[5, 20], dtype='float32', lod_level=1) + x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10) """ assert not in_dygraph_mode(), ( "sequence layer is not supported in dygraph mode yet.") @@ -5889,7 +5729,6 @@ def hsigmoid(input, .. code-block:: python - import paddle.fluid as fluid x = fluid.layers.data(name='x', shape=[2], dtype='float32') y = fluid.layers.data(name='y', shape=[1], dtype='int64') out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6) @@ -5995,16 +5834,15 @@ def transpose(x, perm, name=None): # use append_batch_size=False to avoid prepending extra # batch size in shape - import paddle.fluid as fluid x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32', append_batch_size=False) - x_transposed = fluid.layers.transpose(x, perm=[1, 0, 2]) + x_transposed = layers.transpose(x, perm=[1, 0, 2]) """ if len(perm) != len(x.shape): raise ValueError( "Input(perm) is the permutation of dimensions of Input(input). " - "Its length should be equal to Input(input)'s rank.") + "Its length should be equal to Input(input)'s rank.") for idx, dim in enumerate(perm): if dim >= len(x.shape): raise ValueError( @@ -6134,12 +5972,8 @@ def im2sequence(input, ..
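code-block:: python

            # A self-contained sketch (an editorial addition) built from the
            # input definition that the hunk below deletes:
            import paddle.fluid as fluid
            data = fluid.layers.data(name='data', shape=[3, 32, 32],
                                     dtype='float32')
            output = fluid.layers.im2sequence(
                input=data, stride=[1, 1], filter_size=[2, 2])

        ..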
code-block:: python - import paddle.fluid as fluid - data = fluid.layers.data(name='data', shape=[3, 32, 32], - dtype='float32') output = fluid.layers.im2sequence( - input=data, stride=[1, 1], filter_size=[2, 2]) - + input=layer, stride=[1, 1], filter_size=[2, 2]) """ assert not in_dygraph_mode(), ( @@ -6155,7 +5989,7 @@ def im2sequence(input, padding.append(padding[0]) padding.append(padding[1]) inputs = {"X": input} - attrs = {"kernels": filter_size, "strides": stride, "paddings": padding} + attrs = {"kernels": filter_size, "strides": stride, "padding": padding} if input_image_size: if isinstance(out_stride, int): out_stride = [out_stride, out_stride] @@ -6462,13 +6296,11 @@ def sampled_softmax_with_cross_entropy(logits, Examples: .. code-block:: python - import paddle.fluid as fluid - - input = fluid.layers.data(name='data', shape=[256], dtype='float32') + logits = fluid.layers.data(name='data', shape=[256], dtype='float32') label = fluid.layers.data(name='label', shape=[5], dtype='int64') - fc = fluid.layers.fc(input=input, size=100) + fc = fluid.layers.fc(input=data, size=100) out = fluid.layers.sampled_softmax_with_cross_entropy( - logits=fc, label=label, num_samples=25) + logits=fc, label=label, num_samples=25) """ helper = LayerHelper('sample_logits', **locals()) samples = helper.create_variable_for_type_inference(dtype='int64') @@ -6599,25 +6431,11 @@ def one_hot(input, depth): one_hot_label = fluid.layers.one_hot(input=label, depth=10) """ helper = LayerHelper("one_hot", **locals()) - one_hot_out = helper.create_variable_for_type_inference(dtype='float32') - - if in_dygraph_mode(): - inputs = {'X': input} - attrs = {'depth': depth} - else: - if not isinstance(depth, Variable): - # user attribute - inputs = {'X': input} - attrs = {'depth': depth} - else: - depth.stop_gradient = True - inputs = {'X': input, 'depth_tensor': depth} - attrs = {} helper.append_op( type="one_hot", - inputs=inputs, - attrs=attrs, + inputs={'X': input}, + attrs={'depth': depth}, outputs={'Out': one_hot_out}, stop_gradient=True) return one_hot_out @@ -6739,7 +6557,6 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): if not (isinstance(shape, list) or isinstance(shape, tuple)): raise ValueError("Input shape must be a python list or tuple.") - inputs = {"X": x} if isinstance(actual_shape, Variable): inputs["Shape"] = actual_shape @@ -6748,12 +6565,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): # Validate the shape unk_dim_idx = -1 - contain_var = False for dim_idx, dim_size in enumerate(shape): - if isinstance(dim_size, Variable): - contain_var = True - continue - if dim_size == -1: assert unk_dim_idx == -1, ( "Only one dimension in shape can be unknown.") @@ -6767,35 +6579,13 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): "except one unknown dimension.") helper = LayerHelper("reshape2", **locals()) - if in_dygraph_mode(): - inputs = {'X': x} - attrs = {'shape': shape} - else: - if contain_var: - new_shape_tensor = [] - for dim in shape: - if isinstance(dim, Variable): - dim.stop_gradient = True - new_shape_tensor.append(dim) - else: - assert (isinstance(dim, int)) - temp_out = helper.create_variable_for_type_inference( - 'int32') - fill_constant( - [1], 'int32', dim, force_cpu=True, out=temp_out) - new_shape_tensor.append(temp_out) - inputs['ShapeTensor'] = new_shape_tensor - attrs = {} - - else: - attrs = {'shape': shape} out = x if inplace else helper.create_variable_for_type_inference( dtype=x.dtype) 
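    # NOTE: besides "Out", the reshape2 op emits an "XShape" output recording
    # the input's original dimensions; the backward pass reads it to restore
    # gradient shapes without keeping the input tensor alive.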
x_shape = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type="reshape2", inputs=inputs, - attrs=attrs, + attrs={"shape": shape}, outputs={"Out": out, "XShape": x_shape}) @@ -6842,9 +6632,8 @@ def squeeze(input, axes, name=None): Examples: .. code-block:: python - import paddle.fluid.layers as layers x = layers.data(name='x', shape=[5, 1, 10]) - y = layers.squeeze(input=x, axes=[1]) + y = layers.squeeze(input=x, axes=[1]) """ assert not in_dygraph_mode(), ( "squeeze layer is not supported in dygraph mode yet.") @@ -6885,9 +6674,8 @@ def unsqueeze(input, axes, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[5, 10]) - y = fluid.layers.unsqueeze(input=x, axes=[1]) + x = layers.data(name='x', shape=[5, 10]) + y = layers.unsqueeze(input=x, axes=[1]) """ helper = LayerHelper("unsqueeze", **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -6976,9 +6764,9 @@ def lod_reset(x, y=None, target_lod=None): Examples: .. code-block:: python - x = fluid.layers.data(name='x', shape=[10]) - y = fluid.layers.data(name='y', shape=[10, 20], lod_level=2) - out = fluid.layers.lod_reset(x=x, y=y) + x = layers.data(name='x', shape=[10]) + y = layers.data(name='y', shape=[10, 20], lod_level=2) + out = layers.lod_reset(x=x, y=y) """ helper = LayerHelper("lod_reset", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -7256,8 +7044,6 @@ def label_smooth(label, Examples: .. code-block:: python - - import paddle.fluid.layers as layers label = layers.data(name="label", shape=[1], dtype="float32") one_hot_label = layers.one_hot(input=label, depth=10) @@ -7296,19 +7082,7 @@ def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): Examples: .. code-block:: python - import paddle.fluid as fluid - - x = fluid.layers.data( - name='x', shape=[8, 112, 112], dtype='float32') - rois = fluid.layers.data( - name='roi', shape=[4], lod_level=1, dtype='float32') - pool_out = fluid.layers.roi_pool( - input=x, - rois=rois, - pooled_height=7, - pooled_width=7, - spatial_scale=1.0) - + pool_out = fluid.layers.roi_pool(input=x, rois=rois, pooled_height=7, pooled_width=7, spatial_scale=1.0) """ helper = LayerHelper('roi_pool', **locals()) dtype = helper.input_dtype() @@ -7882,7 +7656,7 @@ def image_resize_short(input, out_short_len, resample='BILINEAR'): return image_resize(input=input, out_shape=out_shape, resample=resample) -def gather(input, index, overwrite=True): +def gather(input, index): """ **Gather Layer** @@ -7913,12 +7687,6 @@ def gather(input, index, overwrite=True): Args: input (Variable): The source input with rank>=1. index (Variable): The index input with rank=1. - overwrite (bool): The mode that updating the grad when has same index. - If True, use the overwrite mode to update the grad of the same index, - if False, use the accumulate mode to update the grad of the same index. - Default value is True. - - Returns: output (Variable): The output is a tensor with the same rank as input. @@ -7938,12 +7706,11 @@ def gather(input, index, overwrite=True): type="gather", inputs={"X": input, "Index": index}, - outputs={"Out": out}, - attrs={'overwrite': overwrite}) + outputs={"Out": out}) return out -def scatter(input, index, updates, name=None, overwrite=True): +def scatter(input, index, updates, name=None): """ **Scatter Layer** @@ -7961,10 +7728,6 @@ def scatter(input, index, updates, name=None): int32 or int64 as it is used as indexes.
updates (Variable): The updated value of scatter op. name (str|None): The output variable name. Default None. - overwrite (bool): The mode that updating the output when has same index. - If True, use the overwrite mode to update the output of the same index, - if False, use the accumulate mode to update the output of the same index. - Default value is True.You can set overwrite=False to implement scatter_add. Returns: output (Variable): The output is a tensor with the same shape as input. @@ -7973,13 +7736,8 @@ def scatter(input, index, updates, name=None, overwrite=True): .. code-block:: python - import paddle.fluid as fluid - - input = fluid.layers.data(name='data', shape=[3, 5, 9], dtype='float32', append_batch_size=False) - index = fluid.layers.data(name='index', shape=[3], dtype='int64', append_batch_size=False) - updates = fluid.layers.data(name='update', shape=[3, 5, 9], dtype='float32', append_batch_size=False) - output = fluid.layers.scatter(input, index, updates) + """ helper = LayerHelper('scatter', **locals()) dtype = helper.input_dtype() @@ -7989,7 +7747,6 @@ def scatter(input, index, updates, name=None, overwrite=True): inputs={"X": input, "Ids": index, "Updates": updates}, - attrs={'overwrite': overwrite}, outputs={"Out": out}) return out @@ -8043,12 +7800,7 @@ def sequence_scatter(input, index, updates, name=None): Examples: .. code-block:: python - - import paddle.fluid.layers as layers - input = layers.data( name="x", shape=[3, 6], append_batch_size=False, dtype='float32' ) - index = layers.data( name='index', shape=[1], dtype='int32') - updates = layers.data( name='updates', shape=[1], dtype='float32') output = fluid.layers.sequence_scatter(input, index, updates) """ @@ -8128,7 +7880,6 @@ def log(x, name=None): .. code-block:: python - x = fluid.layers.data(name="x", shape=[3, 4], dtype="float32") output = fluid.layers.log(x) """ helper = LayerHelper('log', **locals()) @@ -8195,12 +7946,8 @@ def selu(x, scale=None, alpha=None, name=None): Examples: .. code-block:: python - - import paddle.fluid as fluid - - input = fluid.layers.data( - name="input", shape=[3, 9, 5], dtype="float32") - output = fluid.layers.selu(input) + + output = fluid.layers.selu(x) """ helper = LayerHelper('selu', **locals()) dtype = helper.input_dtype(input_param_name='x') @@ -8250,11 +7997,7 @@ def mean_iou(input, label, num_classes): .. code-block:: python - import paddle.fluid as fluid - predict = fluid.layers.data(name='predict', shape=[3, 32, 32]) - label = fluid.layers.data(name='label', shape=[1]) - iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, - num_classes=5) + iou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes) """ helper = LayerHelper('mean_iou', **locals()) dtype = helper.input_dtype() @@ -8539,9 +8282,9 @@ def rank_loss(label, left, right, name=None): .. code-block:: python - label = fluid.layers.data(name="label", shape=[-1, 1], dtype="float32") - left = fluid.layers.data(name="left", shape=[-1, 1], dtype="float32") - right = fluid.layers.data(name="right", shape=[-1, 1], dtype="float32") + label = fluid.layers.data(name="label", shape=[4, 1], dtype="float32") + left = fluid.layers.data(name="left", shape=[4, 1], dtype="float32") + right = fluid.layers.data(name="right", shape=[4, 1], dtype="float32") out = fluid.layers.rank_loss(label, left, right) """ @@ -8688,11 +8431,8 @@ def pad2d(input, Examples: .. 
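code-block:: python

            # A self-contained sketch (an editorial addition). paddings is
            # [top, bottom, left, right], so this reflect-pads the 32x32
            # feature map by 1/2 rows and 3/4 columns:
            import paddle.fluid as fluid
            data = fluid.layers.data(name='data', shape=[3, 32, 32],
                                     dtype='float32')
            result = fluid.layers.pad2d(input=data, paddings=[1, 2, 3, 4],
                                        mode='reflect')

    Examples:
        ..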
code-block:: python - import paddle.fluid as fluid - data = fluid.layers.data(name='data', shape=[3, 32, 32], - dtype='float32') - result = fluid.layers.pad2d(input=data, paddings=[1, 2, 3, 4], - mode='reflect') + data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') + result = fluid.layers.pad2d(input=data, padding=[1,2,3,4], mode='reflect') """ helper = LayerHelper('pad2d', **locals()) @@ -8790,7 +8530,6 @@ def pow(x, factor=1.0, name=None): .. code-block:: python - import paddle.fluid as fluid x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") y = fluid.layers.pow(x, factor=2.0) """ @@ -8854,7 +8593,6 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): .. code-block:: python - import paddle.fluid as fluid x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") y = fluid.layers.hard_sigmoid(x, slope=0.3, offset=0.8) """ @@ -8906,19 +8644,14 @@ def prelu(x, mode, param_attr=None, name=None): .. math:: y = \max(0, x) + \\alpha * \min(0, x) - There are three modes for the activation: - - .. code-block:: text - - all: All elements share same alpha. - channel: Elements in same channel share same alpha. - element: All elements do not share alpha. Each element has its own alpha. - Args: x (Variable): The input tensor. - mode (string): The mode for weight sharing. param_attr(ParamAttr|None): The parameter attribute for the learnable - weight (alpha), it can be create by ParamAttr. + weight (alpha). + mode (string): The mode for weight sharing. It supports all, channel + and element. all: all elements share same weight + channel:elements in a channel share same weight + element:each element has a weight name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. @@ -8929,13 +8662,9 @@ def prelu(x, mode, param_attr=None, name=None): .. code-block:: python - import paddle.fluid as fluid - from paddle.fluid.param_attr import ParamAttr - x = fluid.layers.data(name="x", shape=[5,10,10], dtype="float32") + x = fluid.layers.data(name="x", shape=[10,10], dtype="float32") mode = 'channel' - output = fluid.layers.prelu( - x,mode,param_attr=ParamAttr(name='alpha')) - + output = fluid.layers.prelu(x,mode) """ helper = LayerHelper('prelu', **locals()) if mode not in ['all', 'channel', 'element']: @@ -9036,11 +8765,9 @@ def soft_relu(x, threshold=40.0, name=None): Examples: - .. code-block:: python - - import paddle.fluid as fluid - - x = fluid.layers.data(name="x", shape=[3,16,16], dtype="float32") + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") y = fluid.layers.soft_relu(x, threshold=20.0) """ helper = LayerHelper('soft_relu', **locals()) @@ -9167,7 +8894,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): Examples: .. code-block:: python - x = fluid.layers.data(shape[-1, 1], dtype='int32', lod_level=1) + x = fluid.layers.data(shape[30, 1], dtype='int32', lod_level=1) out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0) """ assert not in_dygraph_mode(), ( @@ -9210,14 +8937,6 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): Returns: Variable: The output sequence mask. - Examples: - .. 
code-block:: python - - import paddle.fluid.layers as layers - - x = fluid.layers.data(name='x', shape=[10], dtype='float32', lod_level=1) - mask = layers.sequence_mask(x=x) - """ assert not in_dygraph_mode(), ( "sequence layer is not supported in dygraph mode yet.") @@ -9301,14 +9020,6 @@ def stack(x, axis=0): Returns: Variable: The stacked variable. - Examples: - .. code-block:: python - - import paddle.fluid.layers as layers - x1 = layers.data(name='x1', shape=[1, 2], dtype='int32') - x2 = layers.data(name='x2', shape=[1, 2], dtype='int32') - data = layers.stack([x1,x2]) - """ helper = LayerHelper('stack', **locals()) @@ -9344,12 +9055,6 @@ def unstack(x, axis=0, num=None): Returns: list(Variable): The unstacked variables. - Examples: - .. code-block:: python - - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[5, 10], dtype='float32') - y = fluid.layers.unstack(x, axis=1) """ helper = LayerHelper('unstack', **locals()) @@ -9414,39 +9119,11 @@ def expand(x, expand_times, name=None): helper = LayerHelper('expand', input=x, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - # check expand_times have tensor - - if in_dygraph_mode(): - inputs = {'X': x} - attrs = {'expand_times': expand_times} - else: - - def contain_tensor(expand_times): - for ele in expand_times: - if isinstance(ele, Variable): - return True - return False - - if contain_tensor(expand_times): - new_expand_times = [] - for ele in expand_times: - if isinstance(ele, Variable): - ele.stop_gradient = True - new_expand_times.append(ele) - else: - assert (isinstance(ele, int)) - temp_out = helper.create_variable_for_type_inference(dtype) - fill_constant( - [1], 'int32', ele, force_cpu=True, out=temp_out) - new_expand_times.append(temp_out) - inputs = {'X': x, 'expand_times_tensor': new_expand_times} - attrs = {} - else: - inputs = {'X': x} - attrs = {'expand_times': expand_times} - helper.append_op( - type='expand', inputs=inputs, outputs={'Out': out}, attrs=attrs) + type='expand', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'expand_times': expand_times}) return out @@ -9480,8 +9157,6 @@ def uniform_random_batch_size_like(input, Examples: .. code-block:: python - import paddle.fluid.layers as layers - input = layers.data(name="input", shape=[13, 11], dtype='float32') out = layers.uniform_random_batch_size_like(input, [-1, 11]) """ @@ -9524,7 +9199,6 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'): Examples: .. code-block:: python - import paddle.fluid.layers as layers out = layers.gaussian_random(shape=[20, 30]) """ @@ -9654,10 +9328,8 @@ def sum(x): Examples: .. code-block:: python - import paddle.fluid.layers as layers - input0 = layers.data(name="input0", shape=[13, 11], dtype='float32') - input1 = layers.data(name="input1", shape=[13, 11], dtype='float32') - out = layers.sum([input0,input1]) + input = layers.data(name="input", shape=[13, 11], dtype='float32') + out = layers.sum(input) """ helper = LayerHelper('sum', **locals()) @@ -9675,39 +9347,8 @@ def sum(x): @templatedoc() def slice(input, axes, starts, ends): """ - Slice Operator. - - Produces a slice of the input tensor along multiple axes. Similar to numpy: - https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html - Slice uses `axes`, `starts` and `ends` attributes to specify the start and - end dimension for each axis in the list of axes, it uses this information - to slice the input data tensor. 
If a negative value is passed for any of - the start or end indices, it represents number of elements before the end - of that dimension. If the value passed to start or end is larger than - the n (the number of elements in this dimension), it represents n. - For slicing to the end of a dimension with unknown size, it is recommended - to pass in INT_MAX. The size of axes must be equal to starts\' and ends\'. - Following examples will explain how slice works: - - .. code-block:: text + ${comment} - Case1: - Given: - data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] - axes = [0, 1] - starts = [1, 0] - ends = [2, 3] - Then: - result = [ [5, 6, 7], ] - - Case2: - Given: - data = [ [1, 2, 3, 4], [5, 6, 7, 8], ] - axes = [0, 1] - starts = [0, 1] - ends = [-1, 1000] - Then: - result = [ [2, 3, 4], ] Args: input (Variable): ${input_comment}. axes (List): ${axes_comment} @@ -9720,16 +9361,14 @@ def slice(input, axes, starts, ends): Examples: .. code-block:: python - import paddle.fluid as fluid - starts = [1, 0, 2] ends = [3, 3, 4] axes = [0, 1, 2] - input = fluid.layers.data( + input = layers.data( name="input", shape=[3, 4, 5, 6], dtype='float32') - out = fluid.layers.slice(input, axes=axes, starts=starts, ends=ends) + out = layers.slice(input, axes=axes, starts=starts, ends=ends) """ helper = LayerHelper('slice', **locals()) @@ -9761,11 +9400,9 @@ def shape(input): Examples: .. code-block:: python - import paddle.fluid as fluid - - input = fluid.layers.data( + input = layers.data( name="input", shape=[3, 100, 100], dtype="float32") - out = fluid.layers.shape(input) + out = layers.shape(input) """ helper = LayerHelper('shape', **locals()) @@ -9846,14 +9483,6 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): Returns: out(${out_type}): ${out_comment} - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - x = fluid.layers.data(name="X", shape=[1, 2, 5, 5], dtype='float32') - y = fluid.layers.scale(x, scale = 2.0, bias = 1.0) """ helper = LayerHelper('scale', **locals()) @@ -9929,43 +9558,6 @@ for func in [ "act (basestring|None): Activation applied to the output.", "name (basestring|None): Name of the output." ]) - func.__doc__ = func.__doc__ + """ - -Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - # example 1: shape(x) = (2, 3, 4, 5), shape(y) = (2, 3, 4, 5) - x0 = fluid.layers.data(name="x0", shape=[2, 3, 4, 5], dtype='float32') - y0 = fluid.layers.data(name="y0", shape=[2, 3, 4, 5], dtype='float32') - z0 = fluid.layers.%s(x0, y0) - - # example 2: shape(X) = (2, 3, 4, 5), shape(Y) = (5) - x1 = fluid.layers.data(name="x1", shape=[2, 3, 4, 5], dtype='float32') - y1 = fluid.layers.data(name="y1", shape=[5], dtype='float32') - z1 = fluid.layers.%s(x1, y1) - - # example 3: shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2 - x2 = fluid.layers.data(name="x2", shape=[2, 3, 4, 5], dtype='float32') - y2 = fluid.layers.data(name="y2", shape=[4, 5], dtype='float32') - z2 = fluid.layers.%s(x2, y2, axis=2) - - # example 4: shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - x3 = fluid.layers.data(name="x3", shape=[2, 3, 4, 5], dtype='float32') - y3 = fluid.layers.data(name="y3", shape=[3, 4], dtype='float32') - z3 = fluid.layers.%s(x3, y3, axis=1) - - # example 5: shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 - x4 = fluid.layers.data(name="x4", shape=[2, 3, 4, 5], dtype='float32') - y4 = fluid.layers.data(name="y4", shape=[2], dtype='float32') - z4 = fluid.layers.%s(x4, y4, axis=0) - - # example 6: shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 - x5 = fluid.layers.data(name="x5", shape=[2, 3, 4, 5], dtype='float32') - y5 = fluid.layers.data(name="y5", shape=[2], dtype='float32') - z5 = fluid.layers.%s(x5, y5, axis=0) - """ % (func.__name__, func.__name__, func.__name__, func.__name__, - func.__name__, func.__name__) def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): @@ -10126,8 +9718,7 @@ def clip(x, min, max, name=None): helper = LayerHelper("clip", **locals()) if name is None: - name = unique_name.generate_with_ignorable_key(".".join( - [helper.name, 'tmp'])) + name = unique_name.generate(".".join([helper.name, 'tmp'])) out = helper.create_variable( type=x.type, name=name, dtype=x.dtype, persistable=False) @@ -10166,8 +9757,7 @@ def clip_by_norm(x, max_norm, name=None): helper = LayerHelper("clip_by_norm", **locals()) if name is None: - name = unique_name.generate_with_ignorable_key(".".join( - [helper.name, 'tmp'])) + name = unique_name.generate(".".join([helper.name, 'tmp'])) out = helper.create_variable( type=x.type, name=name, dtype=x.dtype, persistable=False) @@ -10192,13 +9782,6 @@ def mean(x, name=None): Returns: out(${out_type}): ${out_comment} - - Examples: - .. code-block:: python - - input = fluid.layers.data( - name='data', shape=[2, 3], dtype='float32') - mean = fluid.layers.mean(input) """ helper = LayerHelper("mean", **locals()) @@ -10226,15 +9809,6 @@ def merge_selected_rows(x, name=None): Returns: out(${out_type}): ${out_comment} - - Examples: - .. code-block:: python - - b = fluid.default_main_program().global_block() - var = b.create_var( - name="X", dtype="float32", persistable=True, - type=fluid.core.VarDesc.VarType.SELECTED_ROWS) - y = fluid.layers.merge_selected_rows(var) """ helper = LayerHelper("merge_selected_rows", **locals()) @@ -10261,18 +9835,6 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): Returns: out(${out_type}): ${out_comment} - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - dataX = fluid.layers.data(name="dataX", append_batch_size = False, shape=[2, 5], dtype="float32") - dataY = fluid.layers.data(name="dataY", append_batch_size = False, shape=[5, 3], dtype="float32") - output = fluid.layers.mul(dataX, dataY, - x_num_col_dims = 1, - y_num_col_dims = 1) - - """ helper = LayerHelper("mul", **locals()) @@ -10419,9 +9981,6 @@ def space_to_depth(x, blocksize, name=None): Examples: .. code-block:: python - - import paddle.fluid as fluid - import numpy as np data = fluid.layers.data( name='data', shape=[1, 4, 2, 2], dtype='float32', append_batch_size=False) @@ -10433,7 +9992,6 @@ def space_to_depth(x, blocksize, name=None): out_main = exe.run(fluid.default_main_program(), feed={'data': data_np}, fetch_list=[space_to_depthed]) - """ helper = LayerHelper("space_to_depth", **locals()) @@ -10467,13 +10025,6 @@ def sequence_reverse(x, name=None): Returns: out(${y_type}): ${y_comment} - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - x = fluid.layers.data(name='x', shape=[2, 6], dtype='float32') - x_reversed = fluid.layers.sequence_reverse(x) """ assert not in_dygraph_mode(), ( "sequence layer is not supported in dygraph mode yet.") @@ -10520,20 +10071,6 @@ def affine_channel(x, Returns: out (Variable): A tensor of the same shape and data layout with x. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - data = fluid.layers.data(name='data', shape=[3, 32, 32], - dtype='float32') - input_scale = fluid.layers.create_parameter(shape=[3], - dtype="float32") - input_bias = fluid.layers.create_parameter(shape=[3], - dtype="float32") - out = fluid.layers.affine_channel(data,scale=input_scale, - bias=input_bias) - """ helper = LayerHelper("affine_channel", **locals()) @@ -10682,8 +10219,8 @@ def hash(input, hash_size, num_hash=1, name=None): # shape [2, 2] input.data = [ - [[1, 2], - [3, 4]], + [[1], [2]], + [[3], [4]], ] input.lod = [[0, 2]] @@ -10700,8 +10237,8 @@ def hash(input, hash_size, num_hash=1, name=None): # shape [2, 4] output.data = [ - [[9662, 9217, 1129, 8487], - [8310, 1327, 1654, 4567]], + [[9662], [9217], [1129], [8487]], + [[8310], [1327], [1654], [4567]], ] output.lod = [[0, 2]] @@ -10720,24 +10257,8 @@ def hash(input, hash_size, num_hash=1, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers - import numpy as np - - titles = fluid.layers.data(name='titles', shape=[1], dtype='int32', lod_level=1) - hash_r = fluid.layers.hash(name='hash_x', input=titles, num_hash=1, hash_size=1000) - - place = fluid.core.CPUPlace() - exece = fluid.Executor(place) - exece.run(fluid.default_startup_program()) - - # Init Tensor - tensor = fluid.core.LoDTensor() - tensor.set(np.random.randint(0, 10, (3, 1)).astype("int32"), place) - # Set LoD - tensor.set_recursive_sequence_lengths([[1, 1, 1]]) - - out = exece.run(feed={'titles': tensor}, fetch_list=[hash_r], return_numpy=False) + x = fluid.layers.data(name="x", shape=[1], dtype='int32', lod_level=1) + out = fluid.layers.hash(input=x, num_hash=4, hash_size=1000) """ helper = LayerHelper('hash', **locals()) out = helper.create_variable_for_type_inference( @@ -10815,11 +10336,9 @@ def grid_sampler(x, grid, name=None): .. 
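code-block:: python

            # A runnable sketch (an editorial addition) following the keyword
            # signature used by the deleted lines: affine_grid takes theta=
            # (a [2, 3] transform per sample) and out_shape=.
            import paddle.fluid as fluid
            x = fluid.layers.data(name='x', shape=[10, 32, 32], dtype='float32')
            theta = fluid.layers.data(name='theta', shape=[2, 3], dtype='float32')
            grid = fluid.layers.affine_grid(theta=theta, out_shape=[3, 10, 32, 32])
            out = fluid.layers.grid_sampler(x=x, grid=grid)

        ..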
code-block:: python - import paddle.fluid as fluid - - x = fluid.layers.data(name='x', shape=[10, 32, 32], dtype='float32') - theta = fluid.layers.data(name='theta', shape=[2, 3], dtype='float32') - grid = fluid.layers.affine_grid(theta=theta, out_shape=[3, 10, 32, 32]) + x = fluid.layers.data(name='x', shape=[3, 10, 32, 32], dtype='float32') + theta = fluid.layers.data(name='theta', shape=[3, 2, 3], dtype='float32') + grid = fluid.layers.affine_grid(input=theta, size=[3, 10, 32, 32]}) out = fluid.layers.grid_sampler(x=x, grid=grid) """ @@ -10913,16 +10432,8 @@ def teacher_student_sigmoid_loss(input, Examples: .. code-block:: python - - import paddle.fluid as fluid - batch_size = 64 - label = fluid.layers.data( - name="label", shape=[batch_size, 1], dtype="int64", append_batch_size=False) - similarity = fluid.layers.data( - name="similarity", shape=[batch_size, 1], dtype="float32", append_batch_size=False) cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label) - """ helper = LayerHelper('teacher_student_sigmoid_loss', **locals()) out = helper.create_variable(dtype=input.dtype) @@ -10966,15 +10477,7 @@ def add_position_encoding(input, alpha, beta, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - - tensor = fluid.layers.data( - name='tensor', - shape=[32, 64, 512], - dtype='float32', - append_batch_size=False) - position_tensor = fluid.layers.add_position_encoding( - input=tensor, alpha=1.0, beta=1.0) + position_tensor = fluid.layers.add_position_encoding(input=tensor) """ helper = LayerHelper('add_position_encoding', **locals()) @@ -11076,14 +10579,6 @@ def get_tensor_from_selected_rows(x, name=None): Returns: out(${out_type}): ${out_comment} - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - b = fluid.default_main_program().global_block() - input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS) - out = fluid.layers.get_tensor_from_selected_rows(input) """ helper = LayerHelper('get_tensor_from_selected_rows', **locals()) @@ -11521,15 +11016,8 @@ def huber_loss(input, label, delta): Examples: .. code-block:: python - import paddle.fluid as fluid - - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - predict = fluid.layers.fc(input=x, size=1) - label = fluid.layers.data( - name='label', shape=[1], dtype='float32') - loss = fluid.layers.huber_loss( - input=predict, label=label, delta=1.0) - + predictions = fluid.layers.softmax(x) + loss = fluid.layers.huber_loss(input=predictions, label=label, 1.0) """ helper = LayerHelper('huber_loss', **locals()) residual = helper.create_variable_for_type_inference( @@ -11793,12 +11281,8 @@ def fsp_matrix(x, y): .. code-block:: python - import paddle.fluid as fluid - data = fluid.layers.data(name='data', shape=[3, 32, 32]) - feature_map_0 = fluid.layers.conv2d(data, num_filters=2, - filter_size=3) - feature_map_1 = fluid.layers.conv2d(feature_map_0, num_filters=2, - filter_size=1) + feature_map_0 = fluid.layers.conv2d(x) + feature_map_1 = fluid.layers.conv2d(feature_map_0) loss = fluid.layers.fsp_matrix(feature_map_0, feature_map_1) """ @@ -11827,7 +11311,7 @@ def continuous_value_model(input, cvm, use_cvm=True): cvm (Variable): a 2-D Tensor with shape [N x 2], where N is the batch size, 2 is show and click. use_cvm (bool): use cvm or not. 
if use cvm, the output dim is the same as input if don't use cvm, the output dim is input dim - 2(remove show and click) - (cvm op is a customized op, which input is a sequence has embed_with_cvm default, so we need an op named cvm to decided whever use it or not.) + (cvm op is a customized op, which input is a sequence has embedd_with_cvm default, so we need an op named cvm to decided whever use it or not.) Returns: @@ -11893,430 +11377,3 @@ def where(condition): helper.append_op( type='where', inputs={'Condition': condition}, outputs={'Out': [out]}) return out - - -def sign(x): - """ - **sign** - - This function returns sign of every element in `x`: 1 for positive, -1 for negative and 0 for zero. - - Args: - x(Variable|numpy.ndarray): The input tensor. - - Returns: - Variable: The output sign tensor with identical shape and dtype to `x`. - - Examples: - .. code-block:: python - - # [1, 0, -1] - data = fluid.layers.sign(np.array([3, 0, -2])) - """ - - helper = LayerHelper("sign", **locals()) - - if not isinstance(x, Variable): - x = assign(x) - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - helper.append_op(type='sign', inputs={'X': [x]}, outputs={'Out': [out]}) - - return out - - -def deformable_conv(input, - offset, - mask, - num_filters, - filter_size, - stride=1, - padding=0, - dilation=1, - groups=None, - deformable_groups=None, - im2col_step=None, - param_attr=None, - bias_attr=None, - name=None): - """ - **Deformable Convolution Layer** - - Compute 2-D deformable convolution on 4-D input. - Given input image x, output feature map y, the deformable convolution operation can be expressed as follow: - - .. math:: - - y(p) = \sum_{k=1}^{K}{w_k * x(p + p_k + \Delta p_k) * \Delta m_k} - - Where :math:`\Delta p_k` and :math:`\Delta m_k` are the learnable offset and modulation scalar for the k-th location, respectively. - Refer to `Deformable ConvNets v2: More Deformable, Better Results - `_ . - - Example: - - Input: - - Input shape: :math:`(N, C_{in}, H_{in}, W_{in})` - - Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)` - - Offset shape: :math:`(N, 2 * deformable\_groups * H_f * H_w, H_{in}, W_{in})` - - Mask shape: :math:`(N, deformable\_groups * H_f * H_w, H_{in}, W_{in})` - - - Output: - - Output shape: :math:`(N, C_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 - - Args: - input (Variable): The input image with [N, C, H, W] format. - offset (Variable): The input coord offset of deformable convolution layer. - Mask (Variable): The input mask of deformable covolution layer. - num_filters(int): The number of filter. It is as same as the output - image channel. - filter_size (int|tuple|None): The filter size. If filter_size is a tuple, - it must contain two integers, (filter_size_H, filter_size_W). - Otherwise, the filter will be a square. - stride (int|tuple): The stride size. If stride is a tuple, it must - contain two integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. Default: stride = 1. - padding (int|tuple): The padding size. If padding is a tuple, it must - contain two integers, (padding_H, padding_W). Otherwise, the - padding_H = padding_W = padding. Default: padding = 0. - dilation (int|tuple): The dilation size. If dilation is a tuple, it must - contain two integers, (dilation_H, dilation_W). 
Otherwise, the - dilation_H = dilation_W = dilation. Default: dilation = 1. - groups (int): The groups number of the deformable conv layer. According to - grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, - the first half of the filters is only connected to the first half - of the input channels, while the second half of the filters is only - connected to the second half of the input channels. Default: groups=1. - deformable_groups (int): The number of deformable group partitions. - Default: deformable_groups = 1. - im2col_step (int): Maximum number of images per im2col computation; - The total batch size should be divisable by this value or smaller - than this value; if you face out of memory problem, you can try - to use a smaller value here. - Default: im2col_step = 64. - param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights - of deformable conv. If it is set to None or one attribute of ParamAttr, - deformable conv will create ParamAttr as param_attr. - If the Initializer of the param_attr is not set, the parameter is - initialized with :math:`Normal(0.0, std)`, and the - :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. - bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of - deformable conv layer. If it is set to False, no bias will be added - to the output units. If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. Default: None. - name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. Default: None - Returns: - Variable: The tensor variable storing the deformable convolution \ - result. - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. - Examples: - .. code-block:: python - - data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') - offset = fluid.layers.data(name='offset', shape=[18, 32, 32], dtype='float32') - mask = fluid.layers.data(name='mask', shape=[9, 32, 32], dtype='float32') - out = fluid.layers.deformable_conv(input=data, offset=offset, mask=mask, - num_filters=2, filter_size=3, padding=1) - """ - - num_channels = input.shape[1] - assert param_attr is not False, "param_attr should not be False here." 
- - helper = LayerHelper('deformable_conv', **locals()) - dtype = helper.input_dtype() - - if not isinstance(input, Variable): - raise TypeError("Input of deformable_conv must be Variable") - if not isinstance(offset, Variable): - raise TypeError("Input Offset of deformable_conv must be Variable") - if not isinstance(mask, Variable): - raise TypeError("Input Mask of deformable_conv must be Variable") - - if groups is None: - num_filter_channels = num_channels - else: - if num_channels % groups != 0: - raise ValueError("num_channels must be divisible by groups.") - num_filter_channels = num_channels // groups - - filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') - stride = utils.convert_to_list(stride, 2, 'stride') - padding = utils.convert_to_list(padding, 2, 'padding') - dilation = utils.convert_to_list(dilation, 2, 'dilation') - - input_shape = input.shape - filter_shape = [num_filters, int(num_filter_channels)] + filter_size - - def _get_default_param_initializer(): - filter_elem_num = filter_size[0] * filter_size[1] * num_channels - std = (2.0 / filter_elem_num)**0.5 - return Normal(0.0, std, 0) - - filter_param = helper.create_parameter( - attr=helper.param_attr, - shape=filter_shape, - dtype=dtype, - default_initializer=_get_default_param_initializer()) - - pre_bias = helper.create_variable_for_type_inference(dtype) - - helper.append_op( - type='deformable_conv', - inputs={ - 'Input': input, - 'Filter': filter_param, - 'Offset': offset, - 'Mask': mask, - }, - outputs={"Output": pre_bias}, - attrs={ - 'strides': stride, - 'paddings': padding, - 'dilations': dilation, - 'groups': groups, - 'deformable_groups': deformable_groups, - 'im2col_step': im2col_step, - }) - - output = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) - return output - - -def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): - """ - - This function returns a col buffer of sliding local blocks of input x, also known - as im2col for batched 2D image tensors. For each block under the convolution filter, - all elements will be rearranged as a column. While the convolution filter slides over - the input feature map, a series of such columns will be formed. - - For each input :math:`X` with shape [N, C, H, W], the output shape [N, Cout, Lout] - can be calculated as follows. - - .. math:: - - dkernel[0] &= dilations[0] \\times (kernel\_sizes[0] - 1) + 1 - - dkernel[1] &= dilations[1] \\times (kernel\_sizes[1] - 1) + 1 - - hout &= \\frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1 - - wout &= \\frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1 - - Cout &= C \\times kernel\_sizes[0] \\times kernel\_sizes[1] - - Lout &= hout \\times wout - - - Args: - x(Variable): The input tensor of format [N, C, H, W]. - kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] - or an integer k treated as [k, k]. - strides(int|list): The strides, should be [stride_h, stride_w] - or an integer stride treated as [stride, stride]. - For default, strides will be [1, 1]. - paddings(int|list): The paddings of each dimension, should be - [padding_top, padding_left, padding_bottom, padding_right] - or [padding_h, padding_w] or an integer padding. - If [padding_h, padding_w] was given, it will be expanded to - [padding_h, padding_w, padding_h, padding_w]. If an integer - padding was given, [padding, padding, padding, padding] will - be used.
For default, paddings will be [0, 0, 0, 0] - dilations(int|list): the dilations of convolution kernel, should be - [dilation_h, dilation_w], or an integer dilation treated as - [dilation, dilation]. For default, it will be [1, 1]. - - - Returns: - Variable: The tensor variable corresponding to the sliding local blocks. The output shape is [N, Cout, Lout] as described above. Cout is the total number of values within each block, and Lout is the total number of such blocks. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - x = fluid.layers.data(name = 'data', shape = [3, 224, 224], dtype = 'float32') - y = fluid.layers.unfold(x, [3, 3], 1, 1, 1) - """ - - helper = LayerHelper("unfold", **locals()) - - assert len(x.shape) == 4, \ - "input should be the format of [N, C, H, W]" - - if isinstance(kernel_sizes, int): - kernel_sizes = [kernel_sizes, kernel_sizes] - else: - assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \ - "kernel_sizes should either be an integer or a list of two integers" - - if isinstance(strides, int): - strides = [strides, strides] - else: - assert isinstance(strides, list) and (len(strides) == 2), \ - "strides should either be an integer or a list of two integers" - - if isinstance(dilations, int): - dilations = [dilations, dilations] - else: - assert isinstance(dilations, list) and (len(dilations) == 2), \ - "dilations should either be an integer or a list of two integers" - - if isinstance(paddings, int): - paddings = [paddings] * 4 - elif isinstance(paddings, list): - if len(paddings) == 2: - paddings = paddings * 2 - elif len(paddings) == 4: - pass - else: - raise ValueError( - "paddings should either be an integer or a list of 2 or 4 integers" - ) - else: - raise ValueError( - "Unexpected type of paddings, it should be either an integer or a list " - "of 2 or 4 integers") - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op( - type="unfold", - inputs={"X": x}, - outputs={"Y": out}, - attrs={ - "kernel_sizes": kernel_sizes, - "strides": strides, - "paddings": paddings, - "dilations": dilations - }) - return out - - -def deformable_roi_pooling(input, - rois, - trans, - no_trans=False, - spatial_scale=1.0, - group_size=[1, 1], - pooled_height=1, - pooled_width=1, - part_size=None, - sample_per_part=1, - trans_std=0.1, - position_sensitive=False, - name=None): - """ - Deformable PSROI Pooling Layer - - Args: - input (Variable): The input of Deformable PSROIPooling. The shape of input tensor is - [N,C,H,W]. Where N is batch size, C is number of input channels, H - is height of the feature, and W is the width of the feature. - rois (Variable): ROIs (Regions of Interest) to pool over. It should be - a 2-D LoDTensor of shape (num_rois, 4), the lod level - is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is - the top left coordinates, and (x2, y2) is the bottom - right coordinates. - trans (Variable): Offset of features on ROIs while pooling. The format is NCHW, where - N is number of ROIs, C is number of channels, which indicate the offset distance - in the x and y directions, H is pooled height, and W is pooled width. - no_trans (bool): Whether to add offset to get new value or not while roi pooling, which - value is True or False. Default: False. - spatial_scale (float): Ratio of input feature map height (or width) to raw image height (or width). - Equals the reciprocal of total stride in convolutional layers, Default: 1.0.
- group_size (list|tuple): The number of groups which input channels are divided.(eg.number of input channels - is k1*k2*(C+1), which k1 and k2 are group width and height and C+1 is number of output - chanels. eg.(4, 6), which 4 is height of group and 6 is width of group. Default: [1, 1]. - pooled_height (integer): The pooled output height. Default: 1. - pooled_width (integer): The pooled output width. Default: 1. - part_size (list|tuple): The height and width of offset, eg.(4, 6), which height is 4 and width is 6, Default: - if None, default value is [pooled_height, pooled_width]. - sample_per_part (integer): The number of samples in each bin. Default: 1. - trans_std (float): Coefficient of offset. Default: 0.1. - position_sensitive (bool): Whether to choose deformable psroi pooling mode or not. Default: False. - name (str): Name of layer. Default: None. - Returns: - Variable: The tensor variable storing the deformable psroi pooling \ - result. - - - Examples: - .. code-block:: python - - input = fluid.layers.data(name="input", - shape=[2, 192, 64, 64], - dtype='float32', - append_batch_size=False) - rois = fluid.layers.data(name="rois", - shape=[4], - dtype='float32', - lod_level=1) - trans = fluid.layers.data(name="trans", - shape=[2, 384, 64, 64], - dtype='float32', - append_batch_size=False) - x = fluid.layers.nn.deformable_roi_pooling(input=input, - rois=rois, - trans=trans, - no_trans=False, - spatial_scale=1.0, - group_size=(1, 1), - pooled_height=8, - pooled_width=8, - part_size=(8, 8), - sample_per_part=4, - trans_std=0.1, - position_sensitive=False) - """ - - input_channels = input.shape[1] - if position_sensitive == False: - output_channels = input_channels - else: - output_channels = input_channels / pooled_height / pooled_width - - if part_size is None: - part_height = pooled_height - part_width = pooled_width - part_size = [part_height, part_width] - part_size = utils.convert_to_list(part_size, 2, 'part_size') - group_size = utils.convert_to_list(group_size, 2, 'group_size') - helper = LayerHelper('deformable_psroi_pooling', **locals()) - dtype = helper.input_dtype() - output = helper.create_variable_for_type_inference(dtype) - top_count = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="deformable_psroi_pooling", - inputs={"Input": input, - "ROIs": rois, - "Trans": trans}, - outputs={"Output": output, - "TopCount": top_count}, - attrs={ - "no_trans": no_trans, - "spatial_scale": spatial_scale, - "output_dim": output_channels, - "group_size": group_size, - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "part_size": part_size, - "sample_per_part": sample_per_part, - "trans_std": trans_std - }) - return output diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 6c944da560d..636e83996f0 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -83,13 +83,12 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0): Examples: .. 
code-block:: python - import paddle.fluid as fluid result = fluid.layers.uniform_random(shape=[32, 784]) """ + locals_var = locals() if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - locals_var = locals().copy() kwargs = dict() for name, val in locals_var.items(): if val is not None: @@ -103,7 +102,7 @@ _hard_shrink_ = generate_layer_fn('hard_shrink') def hard_shrink(x, threshold=None): - locals_var = locals().copy() + locals_var = locals() kwargs = dict() for name, val in locals_var.items(): if val is not None: @@ -124,7 +123,7 @@ _cum_sum_ = generate_layer_fn('cumsum') def cumsum(x, axis=None, exclusive=None, reverse=None): - locals_var = locals().copy() + locals_var = locals() kwargs = dict() for name, val in locals_var.items(): if val is not None: @@ -145,7 +144,7 @@ _thresholded_relu_ = generate_layer_fn('thresholded_relu') def thresholded_relu(x, threshold=None): - locals_var = locals().copy() + locals_var = locals() kwargs = dict() for name, val in locals_var.items(): if val is not None: diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 92d5e819e5d..9a0afcd4516 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -83,10 +83,9 @@ def create_parameter(shape, the created parameter. Examples: - .. code-block:: python - - import paddle.fluid.layers as layers - W = layers.create_parameter(shape=[784, 200], dtype='float32') + >>> W = fluid.layers.create_parameter(shape=[784, 200], dtype='float32') + >>> data = fluid.layers.data(name="img", shape=[64, 784], append_batch_size=False) + >>> hidden = fluid.layers.matmul(x=data, y=W) """ helper = LayerHelper("create_parameter", **locals()) if attr is None: @@ -123,9 +122,8 @@ def create_global_var(shape, Examples: .. code-block:: python - import paddle.fluid.layers as layers - var = layers.create_global_var(shape=[2,3], value=1.0, dtype='float32', - persistable=True, force_cpu=True, name='new_var') + var = fluid.create_global_var(shape=[2,3], value=1.0, dtype='float32', + persistable=True, force_cpu=True, name='new_var') """ helper = LayerHelper("global_var", **locals()) var = helper.create_global_variable( @@ -246,9 +244,7 @@ def tensor_array_to_tensor(input, axis=1, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - tensor_array = fluid.layers.create_parameter(shape=[784, 200], dtype='float32') - output, output_index = fluid.layers.tensor_array_to_tensor(input=tensor_array) + output, output_index = fluid.layers.tensor_array_to_tensor(input=tensor_array) """ helper = LayerHelper('tensor_array_to_tensor', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) @@ -279,23 +275,14 @@ def sums(input, out=None): Examples: .. 
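code-block:: python

            # A minimal sketch (an editorial addition), condensed from the
            # fill_constant-based example that the hunk below deletes:
            import paddle.fluid as fluid
            a0 = fluid.layers.fill_constant(shape=[1], dtype='int64', value=1)
            a1 = fluid.layers.fill_constant(shape=[1], dtype='int64', value=2)
            total = fluid.layers.sums(input=[a0, a1])  # evaluates to 3

    Examples:
        ..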
code-block:: python - import paddle.fluid as fluid - - # sum of several tensors - a0 = fluid.layers.fill_constant(shape=[1], dtype='int64', value=1) - a1 = fluid.layers.fill_constant(shape=[1], dtype='int64', value=2) - a2 = fluid.layers.fill_constant(shape=[1], dtype='int64', value=3) - sums = fluid.layers.sums(input=[a0, a1, a2]) - - # sum of a tensor array - array = fluid.layers.create_array('int64') - i = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True) - fluid.layers.array_write(a0, array=array, i=i) - i = fluid.layers.increment(x=i) - fluid.layers.array_write(a1, array=array, i=i) - i = fluid.layers.increment(x=i) - fluid.layers.array_write(a2, array=array, i=i) - sums = fluid.layers.sums(input=array) + tmp = fluid.layers.zeros(shape=[10], dtype='int32') + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10) + a0 = layers.array_read(array=tmp, i=i) + i = layers.increment(x=i) + a1 = layers.array_read(array=tmp, i=i) + mean_a0 = layers.mean(a0) + mean_a1 = layers.mean(a1) + a_sum = layers.sums(input=[mean_a0, mean_a1]) """ helper = LayerHelper('sum', **locals()) if out is None: @@ -325,8 +312,6 @@ def assign(input, output=None): Examples: .. code-block:: python - import paddle.fluid as fluid - data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32") out = fluid.layers.create_tensor(dtype='float32') hidden = fluid.layers.fc(input=data, size=10) fluid.layers.assign(hidden, out) @@ -387,7 +372,6 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): Examples: .. code-block:: python - import paddle.fluid as fluid data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64') """ @@ -441,9 +425,7 @@ def fill_constant_batch_size_like(input, .. code-block:: python - import paddle.fluid as fluid - like = fluid.layers.data(name='like', shape=[1], dtype='float32') - data = fluid.lgyers.fill_constant_batch_size_like( + data = fluid.layers.fill_constant_batch_size_like( input=like, shape=[1], value=0, dtype='int64') """ @@ -601,7 +583,6 @@ def ones(shape, dtype, force_cpu=False): Examples: .. code-block:: python - import paddle.fluid as fluid data = fluid.layers.ones(shape=[1], dtype='int64') """ assert isinstance(shape, list) or isinstance( @@ -631,7 +612,6 @@ def zeros(shape, dtype, force_cpu=False): Examples: .. code-block:: python - import paddle.fluid as fluid data = fluid.layers.zeros(shape=[1], dtype='int64') """ return fill_constant(value=0.0, **locals()) @@ -655,11 +635,9 @@ def reverse(x, axis): Examples: .. code-block:: python - import paddle.fluid as fluid - data = fluid.layers.data(name="data", shape=[4, 8], dtype="float32") - out = fluid.layers.reverse(x=data, axis=0) + out = fluid.layers.reverse(x=in, axis=0) # or: - out = fluid.layers.reverse(x=data, axis=[0,1]) + out = fluid.layers.reverse(x=in, axis=[0,1]) """ if isinstance(axis, int): axis = [axis] @@ -754,14 +732,6 @@ def has_inf(x): Returns: Variable: The tensor variable storing the output, only a bool value. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - data = fluid.layers.data(name="input", shape=[4, 32, 32], dtype="float32") - res = fluid.layers.has_inf(data) - """ helper = LayerHelper("isinf", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -778,14 +748,6 @@ def has_nan(x): Returns: Variable: The tensor variable storing the output, only a bool value. - - Examples: - .. 
@@ -778,14 +748,6 @@ def has_nan(x):
    Returns:
       Variable: The tensor variable storing the output, only a bool value.
-
-    Examples:
-        .. code-block:: python
-
-          import paddle.fluid as fluid
-          data = fluid.layers.data(name="input", shape=[4, 32, 32], dtype="float32")
-          res = fluid.layers.has_nan(data)
-
    """
    helper = LayerHelper("isnan", **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
@@ -803,15 +765,6 @@ def isfinite(x):
    Returns:
        Variable: The tensor variable storing the output, contains a bool value.
-
-    Examples:
-
-        .. code-block:: python
-
-            var = fluid.layers.data(name="data",
-                                    shape=(4, 6),
-                                    dtype="float32")
-            out = fluid.layers.isfinite(v)
    """
    helper = LayerHelper("isfinite", **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
index 946c6ff6565..c7c82f28e7c 100644
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
@@ -27,7 +27,6 @@ from .initializer import Constant
 from . import unique_name
 from .framework import Program, Variable, program_guard
 from . import layers
-from .layers import detection

 __all__ = [
     'MetricBase',
@@ -154,25 +153,20 @@ class CompositeMetric(MetricBase):
    Examples:
        .. code-block:: python

-            import numpy as np
-            preds = [[0.1], [0.7], [0.8], [0.9], [0.2],
-                     [0.2], [0.3], [0.5], [0.8], [0.6]]
-            labels = [[0], [1], [1], [1], [1],
-                      [0], [0], [0], [0], [0]]
-            preds = np.array(preds)
-            labels = np.array(labels)
-
-            comp = fluid.metrics.CompositeMetric()
-            precision = fluid.metrics.Precision()
-            recall = fluid.metrics.Recall()
-            comp.add_metric(precision)
-            comp.add_metric(recall)
-
+            labels = fluid.layers.data(name="label", shape=[1], dtype="int32")
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32")
+            pred = fluid.layers.fc(input=data, size=1000, act="tanh")
+            comp = fluid.metrics.CompositeMetric()
+            acc = fluid.metrics.Precision()
+            recall = fluid.metrics.Recall()
+            comp.add_metric(acc)
+            comp.add_metric(recall)
+            for pass_id in range(PASSES):
+                comp.reset()
+                for data in train_reader():
+                    loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
                    comp.update(preds=preds, labels=labels)
-            numpy_precision, numpy_recall = comp.eval()
-
-            print("expect precision: %.2f, got %.2f" % ( 3. / 5, numpy_precision ) )
-            print("expect recall: %.2f, got %.2f" % (3. / 4, numpy_recall ) )
+                numpy_acc, numpy_recall = comp.eval()
    """

    def __init__(self, name=None):
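
All metrics below share the `MetricBase` contract: `reset()` clears the accumulated state, `update()` folds in one mini-batch of fetched numpy results, and `eval()` returns the score accumulated so far. A toy metric illustrating that contract (the class and names are illustrative, not part of the API):

.. code-block:: python

    import numpy as np

    class MeanMetric(object):
        """A running mean following the reset/update/eval contract."""

        def __init__(self):
            self.reset()

        def reset(self):
            self.total = 0.0
            self.count = 0

        def update(self, values):
            values = np.asarray(values, dtype="float64")
            self.total += values.sum()
            self.count += values.size

        def eval(self):
            return self.total / self.count

    m = MeanMetric()
    m.update([1.0, 2.0])
    m.update([3.0])
    print(m.eval())  # 2.0
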
@@ -221,30 +215,20 @@ class Precision(MetricBase):
    relevant instances among the retrieved instances.
    https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers

-    This class mangages the precision score for binary classification task.
+    Note that precision differs from accuracy in binary classification:
+        accuracy  = true positives / total instances
+        precision = true positives / all predicted positive instances

    Examples:
        .. code-block:: python

-            import numpy as np
-            metric = fluid.metrics.Precision()
-
-            # generate the preds and labels
-
-            preds = [[0.1], [0.7], [0.8], [0.9], [0.2],
-                     [0.2], [0.3], [0.5], [0.8], [0.6]]
-
-            labels = [[0], [1], [1], [1], [1],
-                      [0], [0], [0], [0], [0]]
-
-            preds = np.array(preds)
-            labels = np.array(labels)
-
-            metric.update(preds=preds, labels=labels)
-            numpy_precision = metric.eval()
-
-            print("expct precision: %.2f and got %.2f" % ( 3.0 / 5.0, numpy_precision))
+            metric = fluid.metrics.Precision()
+            for pass_id in range(PASSES):
+                metric.reset()
+                for data in train_reader():
+                    loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
+                    metric.update(preds=preds, labels=labels)
+                numpy_precision = metric.eval()
    """

    def __init__(self, name=None):
@@ -263,7 +247,7 @@ class Precision(MetricBase):
        for i in range(sample_num):
            pred = preds[i]
            label = labels[i]
            if pred == 1:
                if pred == label:
                    self.tp += 1
                else:
@@ -282,30 +266,16 @@ class Recall(MetricBase):

    https://en.wikipedia.org/wiki/Precision_and_recall

-    This class mangages the recall score for binary classification task.
-
    Examples:
        .. code-block:: python

-            import numpy as np
-            metric = fluid.metrics.Recall()
-
-            # generate the preds and labels
-
-            preds = [[0.1], [0.7], [0.8], [0.9], [0.2],
-                     [0.2], [0.3], [0.5], [0.8], [0.6]]
-
-            labels = [[0], [1], [1], [1], [1],
-                      [0], [0], [0], [0], [0]]
-
-            preds = np.array(preds)
-            labels = np.array(labels)
-
-            metric.update(preds=preds, labels=labels)
-            numpy_precision = metric.eval()
-
-            print("expct precision: %.2f and got %.2f" % ( 3.0 / 4.0, numpy_precision))
+            metric = fluid.metrics.Recall()
+            for pass_id in range(PASSES):
+                metric.reset()
+                for data in train_reader():
+                    loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
+                    metric.update(preds=preds, labels=labels)
+                numpy_recall = metric.eval()
    """

    def __init__(self, name=None):
@@ -318,16 +288,15 @@ class Recall(MetricBase):
            raise ValueError("The 'preds' must be a numpy ndarray.")
        if not _is_numpy_(labels):
            raise ValueError("The 'labels' must be a numpy ndarray.")
        sample_num = labels.shape[0]
        preds = np.rint(preds).astype("int32")
        for i in range(sample_num):
            pred = preds[i]
            label = labels[i]
            if label == 1:
                if pred == label:
                    self.tp += 1
                else:
                    self.fn += 1

    def eval(self):
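
The distinction drawn above is easy to check by hand: precision conditions on the prediction being positive, recall on the label being positive, and accuracy on agreement. A numpy sanity check (standalone, not the class implementation):

.. code-block:: python

    import numpy as np

    preds = np.rint([0.1, 0.7, 0.8, 0.9, 0.2, 0.2, 0.3, 0.5, 0.8, 0.6])
    labels = np.array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0])

    tp = float(np.sum((preds == 1) & (labels == 1)))
    precision = tp / np.sum(preds == 1)  # tp / (tp + fp) -> 3/5
    recall = tp / np.sum(labels == 1)    # tp / (tp + fn) -> 3/4
    accuracy = np.mean(preds == labels)  # agreement over all instances
    print(precision, recall, accuracy)
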
@@ -337,7 +306,8 @@ class Accuracy(MetricBase):
    """
-    Calculate the mean accuracy over multiple batches.
+    Accumulate the accuracy from minibatches and compute the average accuracy
+    for every pass.
    https://en.wikipedia.org/wiki/Accuracy_and_precision

    Args:
@@ -346,28 +316,18 @@
    Examples:
        .. code-block:: python

-            #suppose we have batch_size = 128
-            batch_size=128
-            accuracy_manager = fluid.metrics.Accuracy()
-
-            #suppose the accuracy is 0.9 for the 1st batch
-            batch1_acc = 0.9
-            accuracy_manager.update(value = batch1_acc, weight = batch_size)
-            print("expect accuracy: %.2f, get accuracy: %.2f" % (batch1_acc, accuracy_manager.eval()))
-
-            #suppose the accuracy is 0.8 for the 2nd batch
-            batch2_acc = 0.8
-
-            accuracy_manager.update(value = batch2_acc, weight = batch_size)
-            #the joint acc for batch1 and batch2 is (batch1_acc * batch_size + batch2_acc * batch_size) / batch_size / 2
-            print("expect accuracy: %.2f, get accuracy: %.2f" % ((batch1_acc * batch_size + batch2_acc * batch_size) / batch_size / 2, accuracy_manager.eval()))
-
-            #reset the accuracy_manager
-            accuracy_manager.reset()
-            #suppose the accuracy is 0.8 for the 3rd batch
-            batch3_acc = 0.8
-            accuracy_manager.update(value = batch3_acc, weight = batch_size)
-            print("expect accuracy: %.2f, get accuracy: %.2f" % (batch3_acc, accuracy_manager.eval()))
+            labels = fluid.layers.data(name="label", shape=[1], dtype="int32")
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32")
+            pred = fluid.layers.fc(input=data, size=1000, act="tanh")
+            minibatch_accuracy = fluid.layers.accuracy(pred, labels)
+            accuracy_evaluator = fluid.metrics.Accuracy()
+            for pass_id in range(PASSES):
+                accuracy_evaluator.reset()
+                for data in train_reader():
+                    batch_size = len(data)
+                    loss, acc = exe.run(fetch_list=[cost, minibatch_accuracy])
+                    accuracy_evaluator.update(value=acc, weight=batch_size)
+                numpy_acc = accuracy_evaluator.eval()
    """

    def __init__(self, name=None):
@@ -388,15 +348,10 @@ class Accuracy(MetricBase):
                "The 'value' must be a number(int, float) or a numpy ndarray.")
        if not _is_number_(weight):
            raise ValueError("The 'weight' must be a number(int, float).")
-        if _is_number_(weight) and weight < 0:
-            raise ValueError("The 'weight' can not be negative")
        self.value += value * weight
        self.weight += weight

    def eval(self):
-        """
-        Return the mean accuracy (float or numpy.array) for all accumulated batches.
-        """
        if self.weight == 0:
            raise ValueError("There is no data in Accuracy Metrics. \
                Please check layers.accuracy output has added to Accuracy.")
@@ -416,29 +371,17 @@ class ChunkEvaluator(MetricBase):

    Examples:
        .. code-block:: python

-            # init the chunck-level evaluation manager
+            labels = fluid.layers.data(name="label", shape=[1], dtype="int32")
+            data = fluid.layers.data(name="data", shape=[32, 32], dtype="int32")
+            pred = fluid.layers.fc(input=data, size=1000, act="tanh")
+            precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = fluid.layers.chunk_eval(
+                input=pred,
+                label=labels)
            metric = fluid.metrics.ChunkEvaluator()
-
-            # suppose the model predict 10 chuncks, while 8 ones are correct and the ground truth has 9 chuncks.
-            num_infer_chunks = 10
-            num_label_chunks = 9
-            num_correct_chunks = 8
-
-            metric.update(num_infer_chunks, num_label_chunks, num_correct_chunks)
-            numpy_precision, numpy_recall, numpy_f1 = metric.eval()
-
-            print("precision: %.2f, recall: %.2f, f1: %.2f" % (numpy_precision, numpy_recall, numpy_f1))
-
-            # the next batch, predicting 3 prefectly correct chuncks.
- num_infer_chunks = 3 - num_label_chunks = 3 - num_correct_chunks = 3 - - metric.update(num_infer_chunks, num_label_chunks, num_correct_chunks) - numpy_precision, numpy_recall, numpy_f1 = metric.eval() - - print("precision: %.2f, recall: %.2f, f1: %.2f" % (numpy_precision, numpy_recall, numpy_f1)) - + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(num_infer_chunks, num_label_chunks, num_correct_chunks) + numpy_precision, numpy_recall, numpy_f1 = metric.eval() """ def __init__(self, name=None): @@ -487,17 +430,12 @@ class ChunkEvaluator(MetricBase): class EditDistance(MetricBase): """ Edit distance is a way of quantifying how dissimilar two strings - (e.g., words) are to each another by counting the minimum number - of edit operations (add, remove or replace) required to transform - one string into the other. + (e.g., words) are to one another by counting the minimum number + of operations required to transform one string into the other. Refer to https://en.wikipedia.org/wiki/Edit_distance - This EditDistance class takes two inputs by using update function: - 1. distances: a (batch_size, 1) numpy.array, each element represents the - edit distance between two sequences. - 2. seq_num: a int|float value, standing for the number of sequence pairs. - - and returns the overall edit distance of multiple sequence-pairs. + Accumulate edit distance sum and sequence number from mini-batches and + compute the average edit_distance and instance error of all batches. Args: name: the metrics name @@ -505,37 +443,19 @@ class EditDistance(MetricBase): Examples: .. code-block:: python - import numpy as np - - # suppose that batch_size is 128 - batch_size = 128 - - # init the edit distance manager - distance_evaluator = fluid.metrics.EditDistance("EditDistance") - - # generate the edit distance across 128 sequence pairs, the max distance is 10 here - edit_distances_batch0 = np.random.randint(low = 0, high = 10, size = (batch_size, 1)) - seq_num_batch0 = batch_size - - distance_evaluator.update(edit_distances_batch0, seq_num_batch0) - avg_distance, wrong_instance_ratio = distance_evaluator.eval() - print("the average edit distance for batch0 is %.2f and the wrong instance ratio is %.2f " % (avg_distance, wrong_instance_ratio)) + distances, seq_num = fluid.layers.edit_distance(input, label) + distance_evaluator = fluid.metrics.EditDistance() + for epoch in PASS_NUM: + distance_evaluator.reset() + for data in batches: + loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics)) + distance_evaluator.update(distances, seq_num) + distance, instance_error = distance_evaluator.eval() - edit_distances_batch1 = np.random.randint(low = 0, high = 10, size = (batch_size, 1)) - seq_num_batch1 = batch_size + In the above example: - distance_evaluator.update(edit_distances_batch1, seq_num_batch1) - avg_distance, wrong_instance_ratio = distance_evaluator.eval() - print("the average edit distance for batch0 and batch1 is %.2f and the wrong instance ratio is %.2f " % (avg_distance, wrong_instance_ratio)) - - distance_evaluator.reset() - - edit_distances_batch2 = np.random.randint(low = 0, high = 10, size = (batch_size, 1)) - seq_num_batch2 = batch_size - - distance_evaluator.update(edit_distances_batch2, seq_num_batch2) - avg_distance, wrong_instance_ratio = distance_evaluator.eval() - print("the average edit distance for batch2 is %.2f and the wrong instance ratio is %.2f " % (avg_distance, wrong_instance_ratio)) + - 'distance' is the average of the edit 
distance in a pass. + - 'instance_error' is the instance error rate in a pass. """ @@ -546,15 +466,6 @@ class EditDistance(MetricBase): self.instance_error = 0 def update(self, distances, seq_num): - """ - Update the overall edit distance - - Args: - distances: a (batch_size, 1) numpy.array, each element represents the - edit distance between two sequences. - seq_num: a int|float value, standing for the number of sequence pairs. - - """ if not _is_numpy_(distances): raise ValueError("The 'distances' must be a numpy ndarray.") if not _is_number_(seq_num): @@ -566,11 +477,6 @@ class EditDistance(MetricBase): self.total_distance += total_distance def eval(self): - """ - Return two floats: - avg_distance: the average distance for all sequence pairs updated using the update function. - avg_instance_error: the ratio of sequence pairs whose edit distance is not zero. - """ if self.seq_num == 0: raise ValueError( "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance." @@ -582,9 +488,9 @@ class EditDistance(MetricBase): class Auc(MetricBase): """ - The auc metric is for binary classification. + Auc metric adapts to the binary classification. Refer to https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve - Please notice that the auc metric is implemented with python, which may be a little bit slow. + Need to note that auc metric compute the value via Python natively. If you concern the speed, please use the fluid.layers.auc instead. The `auc` function creates four local variables, `true_positives`, @@ -605,26 +511,12 @@ class Auc(MetricBase): Examples: .. code-block:: python - import numpy as np - # init the auc metric - auc_metric = fluid.metrics.Auc("ROC") - - # suppose that batch_size is 128 - batch_num = 100 - batch_size = 128 - - for batch_id in range(batch_num): - - class0_preds = np.random.random(size = (batch_size, 1)) - class1_preds = 1 - class0_preds - - preds = np.concatenate((class0_preds, class1_preds), axis=1) - - labels = np.random.randint(2, size = (batch_size, 1)) - auc_metric.update(preds = preds, labels = labels) - - # shall be some score closing to 0.5 as the preds are randomly assigned - print("auc for iteration %d is %.2f" % (batch_id, auc_metric.eval())) + pred = fluid.layers.fc(input=data, size=1000, act="tanh") + metric = fluid.metrics.Auc() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds, labels) + numpy_auc = metric.eval() """ def __init__(self, name, curve='ROC', num_thresholds=4095): @@ -637,15 +529,6 @@ class Auc(MetricBase): self._stat_neg = [0] * _num_pred_buckets def update(self, preds, labels): - """ - Update the auc curve with the given predictions and labels - - Args: - preds: an numpy array in the shape of (batch_size, 2), preds[i][j] denotes the probability - of classifying the instance i into the class j. - labels: an numpy array in the shape of (batch_size, 1), labels[i] is either o or 1, representing - the label of the instance i. - """ if not _is_numpy_(labels): raise ValueError("The 'labels' must be a numpy ndarray.") if not _is_numpy_(preds): @@ -665,9 +548,6 @@ class Auc(MetricBase): return abs(x1 - x2) * (y1 + y2) / 2.0 def eval(self): - """ - Return the area (a float score) under auc curve - """ tot_pos = 0.0 tot_neg = 0.0 auc = 0.0 @@ -729,38 +609,20 @@ class DetectionMAP(object): Examples: .. 
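
The bucketed bookkeeping described above (per-score-bucket positive and negative counts summed into trapezoids) can be reproduced in a few lines of numpy; the `stat_pos`/`stat_neg` histograms and their values here are made up for illustration:

.. code-block:: python

    import numpy as np

    def trapezoid_area(x1, x2, y1, y2):
        return abs(x1 - x2) * (y1 + y2) / 2.0

    stat_pos = np.array([1, 3, 6, 10], dtype="float64")  # positives per score bucket
    stat_neg = np.array([8, 5, 4, 1], dtype="float64")   # negatives per score bucket

    tot_pos = tot_neg = auc = 0.0
    # Sweep thresholds from the highest-score bucket down; x is the false
    # positive count, y the true positive count.
    for pos, neg in zip(stat_pos[::-1], stat_neg[::-1]):
        prev_pos, prev_neg = tot_pos, tot_neg
        tot_pos += pos
        tot_neg += neg
        auc += trapezoid_area(prev_neg, tot_neg, prev_pos, tot_pos)

    print(auc / (tot_pos * tot_neg))  # normalized area in [0, 1]
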
code-block:: python - import paddle.fluid.layers as layers - - batch_size = -1 # can be any size - image_boxs_num = 10 - bounding_bboxes_num = 21 - - pb = layers.data(name='prior_box', shape=[image_boxs_num, 4], - append_batch_size=False, dtype='float32') - - pbv = layers.data(name='prior_box_var', shape=[image_boxs_num, 4], - append_batch_size=False, dtype='float32') - - loc = layers.data(name='target_box', shape=[batch_size, bounding_bboxes_num, 4], - append_batch_size=False, dtype='float32') - - scores = layers.data(name='scores', shape=[batch_size, bounding_bboxes_num, image_boxs_num], - append_batch_size=False, dtype='float32') - - nmsed_outs = fluid.layers.detection_output(scores=scores, - loc=loc, prior_box=pb, prior_box_var=pbv) - - gt_box = fluid.layers.data(name="gt_box", shape=[batch_size, 4], dtype="float32") - gt_label = fluid.layers.data(name="gt_label", shape=[batch_size, 1], dtype="float32") - difficult = fluid.layers.data(name="difficult", shape=[batch_size, 1], dtype="float32") - - exe = fluid.Executor(fluid.CUDAPlace(0)) - map_evaluator = fluid.metrics.DetectionMAP(nmsed_outs, gt_label, gt_box, difficult, class_num = 3) - + exe = fluid.Executor(place) + map_evaluator = fluid.Evaluator.DetectionMAP(input, + gt_label, gt_box, gt_difficult) cur_map, accum_map = map_evaluator.get_map_var() + fetch = [cost, cur_map, accum_map] + for epoch in PASS_NUM: + map_evaluator.reset(exe) + for data in batches: + loss, cur_map_v, accum_map_v = exe.run(fetch_list=fetch) + + In the above example: - # see detailed examples at - https://github.com/PaddlePaddle/models/blob/43cdafbb97e52e6d93cc5bbdc6e7486f27665fc8/PaddleCV/object_detection + - 'cur_map_v' is the mAP of current mini-batch. + - 'accum_map_v' is the accumulative mAP of one pass. """ @@ -785,7 +647,7 @@ class DetectionMAP(object): label = layers.concat([gt_label, gt_box], axis=1) # calculate mean average precision (mAP) of current mini-batch - map = detection.detection_map( + map = layers.detection_map( input, label, class_num, @@ -810,7 +672,7 @@ class DetectionMAP(object): self.has_state = var # calculate accumulative mAP - accum_map = detection.detection_map( + accum_map = layers.detection_map( input, label, class_num, diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py index f991310384f..8485d7d32fe 100644 --- a/python/paddle/fluid/net_drawer.py +++ b/python/paddle/fluid/net_drawer.py @@ -21,9 +21,9 @@ from collections import defaultdict import paddle.fluid.core as core import paddle.fluid.proto.framework_pb2 as framework_pb2 -from paddle.fluid.log_helper import get_logger -logger = get_logger(__name__, logging.INFO) +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) try: from .graphviz import Graph diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 20fbd079f76..5e511ed2eb9 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -390,8 +390,6 @@ def scaled_dot_product_attention(queries, Examples: .. 
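
`scaled_dot_product_attention` boils down to softmax(Q * K^T / sqrt(d_key)) * V per head. A single-head numpy sketch (shapes illustrative); note the operand order: the scaled queries come first with the keys transposed, so swapping the matmul operands would transpose the attention matrix and break the softmax over keys:

.. code-block:: python

    import numpy as np

    def scaled_dot_product(q, k, v):
        scaled_q = q * (k.shape[-1] ** -0.5)
        product = np.dot(scaled_q, k.T)              # [q_len, k_len]
        e = np.exp(product - product.max(axis=-1, keepdims=True))
        weights = e / e.sum(axis=-1, keepdims=True)  # softmax over the keys
        return np.dot(weights, v)                    # [q_len, d_value]

    q = np.random.rand(5, 9)  # 5 query positions, key width 9
    k = np.random.rand(6, 9)  # 6 key positions
    v = np.random.rand(6, 4)  # 6 value positions, value width 4
    print(scaled_dot_product(q, k, v).shape)  # (5, 4)
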
        .. code-block:: python

-            import paddle.fluid as fluid
-
            queries = fluid.layers.data(name="queries",
                                        shape=[3, 5, 9],
                                        dtype="float32",
@@ -518,7 +516,7 @@ def scaled_dot_product_attention(queries,
    key_dim_per_head = keys.shape[-1] // num_heads
    scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)

    weights = layers.reshape(
        x=layers.reshape(
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 006cd291439..69bbef77f85 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,16 +14,16 @@
 from __future__ import print_function

-import numpy as np
 from collections import defaultdict
+from .wrapped_decorator import signature_safe_contextmanager

-from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
 from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program
+from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table

 from . import framework
 from . import layers
 from . import unique_name
-from .backward import append_backward, _some_in_set_, _append_grad_suffix_
+from .backward import append_backward
 from .clip import append_gradient_clip_ops, error_clip_callback
 from .framework import program_guard
 from .initializer import Constant
@@ -35,15 +35,14 @@
 from .dygraph.learning_rate_scheduler import LearningRateDecay
 from paddle.fluid import core
 from paddle.fluid.layers import tensor
 from functools import reduce
-from .wrapped_decorator import signature_safe_contextmanager
+import copy

 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
     'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
     'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'LarsMomentum',
-    'LarsMomentumOptimizer', 'DGCMomentumOptimizer', 'LambOptimizer',
-    'ExponentialMovingAverage', 'PipelineOptimizer'
+    'LarsMomentumOptimizer', 'DGCMomentumOptimizer'
 ]


@@ -55,7 +54,6 @@ class Optimizer(object):
    but need to use one of its implementations.
""" - @imperative_base.no_grad def __init__(self, learning_rate, regularization=None, name=None): if framework.in_dygraph_mode(): if not isinstance(learning_rate, float) and \ @@ -63,18 +61,14 @@ class Optimizer(object): raise TypeError( "learning rate should be float or LearningRateDecay, got %s here" % type(learning_rate)) - if name is not None: - self._name = unique_name.generate(name) - else: - self._name = unique_name.generate(self.__class__.__name__) else: if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, framework.Variable): raise TypeError( "learning rate should be float or Variable, got %s here" % type(learning_rate)) - self._name = name + self._name = name self.regularization = regularization self._learning_rate = learning_rate # the learning rate type should be inferenced from loss @@ -93,90 +87,6 @@ class Optimizer(object): self.helper = None self._opti_name_list = [] - def load(self, stat_dict): - """ - load optimizer with learning rate decay in dygraph mode - :return: None - - Args: - stat_dict: the dict load by load_persistable method - - Examples: - - .. code-block:: python - - from __future__ import print_function - import numpy as np - import paddle - import paddle.fluid as fluid - from paddle.fluid.optimizer import SGDOptimizer - from paddle.fluid.dygraph.nn import FC - from paddle.fluid.dygraph.base import to_variable - - class MLP(fluid.Layer): - def __init__(self, name_scope): - super(MLP, self).__init__(name_scope) - - self._fc1 = FC(self.full_name(), 10) - self._fc2 = FC(self.full_name(), 10) - - def forward(self, inputs): - y = self._fc1(inputs) - y = self._fc2(y) - return y - - with fluid.dygraph.guard(): - mlp = MLP('mlp') - optimizer2 = SGDOptimizer( - learning_rate=fluid.layers.natural_exp_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) - - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=128, drop_last=True) - - for batch_id, data in enumerate(train_reader()): - dy_x_data = np.array( - [x[0].reshape(1, 28, 28) for x in data]).astype('float32') - - y_data = np.array([x[1] for x in data]).astype('int64').reshape( - 128, 1) - - img = to_variable(dy_x_data) - label = to_variable(y_data) - label._stop_gradient = True - cost = mlp(img) - avg_loss = fluid.layers.reduce_mean(cost) - avg_loss.backward() - optimizer.minimize(avg_loss) - mlp.clear_gradients() - fluid.dygraph.save_persistables( - mlp.state_dict(), [optimizer, optimizer2], "save_dir_2") - if batch_id == 2: - break - - with fluid.dygraph.guard(): - mlp_load = MLP('mlp') - optimizer_load2 = SGDOptimizer( - learning_rate=fluid.layers.natural_exp_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True)) - parameters, optimizers = fluid.dygraph.load_persistables( - "save_dir_2") - mlp_load.load_dict(parameters) - optimizer_load2.load(optimizers) - self.assertTrue(optimizer2._learning_rate.__dict__ == optimizer_load2._learning_rate.__dict__) - - """ - if framework.in_dygraph_mode(): - self._learning_rate = stat_dict[self._name] - else: - raise TypeError("load can only be used under DyGraph mode") - def get_opti_var_name_list(self): return self._opti_name_list @@ -552,8 +462,6 @@ class Optimizer(object): if framework.in_dygraph_mode(): with program_guard(framework.default_main_program(), framework.default_startup_program()): - params_grads = append_regularization_ops(params_grads, - self.regularization) optimize_ops = self._create_optimization_pass(params_grads) else: program = loss.block.program @@ 
-561,13 +469,11 @@ class Optimizer(object): optimize_ops = self.apply_gradients(params_grads) return optimize_ops - @imperative_base.no_grad def minimize(self, loss, startup_program=None, parameter_list=None, - no_grad_set=None, - grad_clip=None): + no_grad_set=None): """ Add operations to minimize `loss` by updating `parameter_list`. @@ -580,7 +486,6 @@ class Optimizer(object): in `parameter_list`. parameter_list (list): list of Variables to update. no_grad_set (set|None): set of Variables should be ignored. - grad_clip (GradClipBase|None) : Gradient clip strategy Returns: tuple: (optimize_ops, params_grads) which are, list of operators appended; @@ -591,17 +496,9 @@ class Optimizer(object): startup_program=startup_program, parameter_list=parameter_list, no_grad_set=no_grad_set) - - if grad_clip is not None and framework.in_dygraph_mode(): - # TODO(hongyu): FIX later, this is only for dygraph, should be work for static mode - params_grads = grad_clip(params_grads) - optimize_ops = self.apply_optimize( loss, startup_program=startup_program, params_grads=params_grads) - if framework.in_dygraph_mode(): - framework._dygraph_tracer()._clear_ops() - return optimize_ops, params_grads @@ -623,31 +520,8 @@ class SGDOptimizer(Optimizer): Examples: .. code-block:: python - import paddle - import paddle.fluid as fluid - import numpy as np - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.2) + sgd_optimizer.minimize(cost) """ def __init__(self, learning_rate, regularization=None, name=None): @@ -708,31 +582,8 @@ class MomentumOptimizer(Optimizer): Examples: .. 
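
Both update rules at play here can be written out in a few lines of numpy (an illustrative sketch; `lr` is the learning rate, `mu` the momentum coefficient):

.. code-block:: python

    import numpy as np

    w = np.ones(3)
    grad = np.array([0.5, -0.2, 0.1])
    lr, mu = 0.2, 0.9

    # SGD: param = param - lr * grad
    w_sgd = w - lr * grad

    # Momentum: velocity = mu * velocity + grad
    #           param    = param - lr * velocity
    velocity = np.zeros_like(w)
    velocity = mu * velocity + grad
    w_momentum = w - lr * velocity

    print(w_sgd, w_momentum)  # identical on the first step from zero velocity
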
code-block:: python - import paddle - import paddle.fluid as fluid - import numpy as np - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - moment_optimizer = fluid.optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9) - moment_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - + optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1) + optimizer.minimize(cost) """ _velocity_acc_str = "velocity" @@ -985,8 +836,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): helper = LayerHelper("dgc_clip_by_norm_op", **args) if name is None: - name = unique_name.generate_with_ignorable_key(".".join( - [helper.name, 'tmp'])) + name = unique_name.generate(".".join([helper.name, 'tmp'])) out = helper.create_variable( type=x.type, name=name, dtype=x.dtype, persistable=False) @@ -1156,22 +1006,8 @@ class AdagradOptimizer(Optimizer): Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np - - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - inp = fluid.layers.data( - name="inp", shape=[2, 2], append_batch_size=False) - out = fluid.layers.fc(inp, size=3) - out = fluid.layers.reduce_sum(out) optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) - optimizer.minimize(out) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - exe.run( - feed={"inp": np_inp}, - fetch_list=[out.name]) + optimizer.minimize(cost) """ _moment_acc_str = "moment" @@ -1270,29 +1106,8 @@ class AdamOptimizer(Optimizer): Examples: .. code-block:: python - import paddle - import paddle.fluid as fluid - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - adam_optimizer = fluid.optimizer.AdamOptimizer(0.01) - adam_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + optimizer = fluid.optimizer.Adam(learning_rate=0.2) + optimizer.minimize(cost) """ _moment1_acc_str = "moment1" @@ -1436,33 +1251,6 @@ class AdamaxOptimizer(Optimizer): However, it is added here for numerical stability to prevent the division by 0 error. - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy - - # First create the Executor. 
- place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = fluid.layers.data(name='X', shape=[1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - adam = fluid.optimizer.Adamax(learning_rate=0.2) - adam.minimize(loss) - - # Run the startup program once and only once. - exe.run(startup_program) - - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - Args: learning_rate (float|Variable): the learning rate used to update parameters. \ Can be a float value or a Variable with one float value as data element. @@ -1473,6 +1261,12 @@ class AdamaxOptimizer(Optimizer): fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. + Examples: + .. code-block:: python + + optimizer = fluid.optimizer.Adamax(learning_rate=0.2) + optimizer.minimize(cost) + Notes: Currently, AdamaxOptimizer doesn't support sparse parameter optimization. """ @@ -1595,13 +1389,6 @@ class DecayedAdagradOptimizer(Optimizer): Examples: .. code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers - from paddle.fluid.optimizer import DecayedAdagrad - - x = layers.data( name='x', shape=[-1, 10], dtype='float32' ) - trans = layers.fc( x, 100 ) - cost = layers.reduce_mean( trans ) optimizer = fluid.optimizer.DecayedAdagrad(learning_rate=0.2) optimizer.minimize(cost) @@ -1823,31 +1610,8 @@ class RMSPropOptimizer(Optimizer): Examples: .. code-block:: python - import paddle - import paddle.fluid as fluid - import numpy as np - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - rms_optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) - rms_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - + optimizer = fluid.optimizer.RMSProp(0.0001) + _, params_grads = optimizer.minimize(cost) """ _momentum_acc_str = "momentum" @@ -1982,30 +1746,8 @@ class FtrlOptimizer(Optimizer): Examples: .. 
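
For the RMSProp optimizer above, the usual formulation keeps a decayed mean of squared gradients and divides the step by its square root (a sketch only; `rho` and `epsilon` are the conventional names, not necessarily the operator's attribute names):

.. code-block:: python

    import numpy as np

    w = np.ones(3)
    grad = np.array([0.5, -0.2, 0.1])
    lr, rho, eps = 0.1, 0.95, 1e-6

    # mean_square = rho * mean_square + (1 - rho) * grad**2
    # param      -= lr * grad / sqrt(mean_square + epsilon)
    mean_square = np.zeros_like(w)
    mean_square = rho * mean_square + (1.0 - rho) * grad ** 2
    w -= lr * grad / np.sqrt(mean_square + eps)
    print(w)
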
code-block:: python - import paddle - import paddle.fluid as fluid - import numpy as np - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - ftrl_optimizer = fluid.optimizer.Ftrl(learning_rate=0.1) - ftrl_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) + optimizer = fluid.optimizer.Ftrl(0.0001) + _, params_grads = optimizer.minimize(cost) Notes: Currently, FtrlOptimizer doesn't support sparse parameter optimization. @@ -2071,133 +1813,6 @@ class FtrlOptimizer(Optimizer): return ftrl_op -class LambOptimizer(AdamOptimizer): - """ - LAMB (Layer-wise Adaptive Moments optimizer for Batching training) Optimizer. - - LAMB Optimizer is designed to scale up the batch size of training without losing - accuracy, which supports adaptive element-wise updating and accurate layer-wise - correction. For more information, please refer to `Reducing BERT Pre-Training - Time from 3 Days to 76 Minutes `_ . - - The updating of parameters follows: - - .. math:: - - m_t^l & = \\beta_1 m_{t - 1}^l + (1 - \\beta_1)g_t^l - - v_t^l & = \\beta_2 v_{t - 1}^l + (1 - \\beta_2)g_t^l \odot g_t^l - - \\widehat{m}_t^l & = m_t^l/(1 - \\beta_1^t) - - \\widehat{v}_t^l & = v_t^l/(1 - \\beta_2^t) - - r_1 & = \\left \| w_{t-1}^l \\right \|_2 - - r_2 & = \\left \| \\frac{\\widehat{m}_t^l}{\\sqrt{\\widehat{v}_t^l+\\epsilon}} + \\lambda w_{t-1}^l \\right \|_2 - - r & = r_1 / r_2 - - \\eta^l & = r \\times \\eta - - w_t^l & = w_{t-1}^l -\\eta ^l \\times (\\frac{\\widehat{m}_t^l}{\\sqrt{\\widehat{v}_t^l+\\epsilon}} + \\lambda w_{t-1}^l) - - - where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the - learning rate, :math:`\\lambda` the LAMB weight decay rate. - - Args: - learning_rate (float|Variable): the learning rate used to update parameters. \ - Can be a float value or a Variable with one \ - float value as data element. - lamb_weight_decay (float): The LAMB weight decay rate. - beta1 (float): The exponential decay rate for the 1st moment estimates. - beta2 (float): The exponential decay rate for the 2nd moment estimates. - epsilon (float): A small float value for numerical stability. - regularization: A Regularizer, such as - fluid.regularizer.L1DecayRegularizer. - name (str|None): An optional name prefix. - - Examples: - .. 
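
The LAMB equations quoted above transcribe almost line-for-line into numpy. The sketch below does one step for one weight tensor; the guard against a zero trust ratio is an addition for the sketch, since the docstring does not specify that case:

.. code-block:: python

    import numpy as np

    def lamb_step(w, g, m, v, t, lr=0.001, beta1=0.9, beta2=0.999,
                  eps=1e-6, lamb_weight_decay=0.01):
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g * g
        m_hat = m / (1 - beta1 ** t)                  # bias-corrected moments
        v_hat = v / (1 - beta2 ** t)
        update = m_hat / np.sqrt(v_hat + eps) + lamb_weight_decay * w
        r1, r2 = np.linalg.norm(w), np.linalg.norm(update)
        r = r1 / r2 if r2 > 0 else 1.0                # layer-wise trust ratio
        return w - r * lr * update, m, v

    w = np.ones(4)
    g = np.array([0.1, -0.3, 0.2, 0.05])
    w, m, v = lamb_step(w, g, np.zeros(4), np.zeros(4), t=1)
    print(w)
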
code-block:: python - - import paddle.fluid as fluid - - data = fluid.layers.data(name='x', shape=[5], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - cost = fluid.layers.mean(hidden) - - optimizer = fluid.optimizer.Lamb(learning_rate=0.002) - optimizer.minimize(cost) - """ - _moment1_acc_str = "moment1" - _moment2_acc_str = "moment2" - _beta1_pow_acc_str = "beta1_pow_acc" - _beta2_pow_acc_str = "beta2_pow_acc" - - def __init__(self, - learning_rate=0.001, - lamb_weight_decay=0.01, - beta1=0.9, - beta2=0.999, - epsilon=1e-6, - regularization=None, - name=None): - assert learning_rate is not None - assert lamb_weight_decay is not None - assert beta1 is not None - assert beta2 is not None - assert epsilon is not None - super(LambOptimizer, self).__init__( - learning_rate=learning_rate, - regularization=regularization, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - name=name) - self.type = "lamb" - self._weight_decay = lamb_weight_decay - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - moment1 = self._get_accumulator(self._moment1_acc_str, - param_and_grad[0]) - moment2 = self._get_accumulator(self._moment2_acc_str, - param_and_grad[0]) - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param_and_grad[0]) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param_and_grad[0]) - - # create the lamb optimize op - lamb_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": self._create_param_lr(param_and_grad), - "Moment1": moment1, - "Moment2": moment2, - "Beta1Pow": beta1_pow_acc, - "Beta2Pow": beta2_pow_acc - }, - outputs={ - "ParamOut": param_and_grad[0], - "Moment1Out": moment1, - "Moment2Out": moment2 - }, - attrs={ - "beta1": self._beta1, - "beta2": self._beta2, - "epsilon": self._epsilon, - "weight_decay": self._weight_decay - }, - stop_gradient=True) - - return lamb_op - - # We short the class name, since users will use the optimizer with the package # name. The sample code: # @@ -2216,14 +1831,13 @@ Adadelta = AdadeltaOptimizer RMSProp = RMSPropOptimizer Ftrl = FtrlOptimizer LarsMomentum = LarsMomentumOptimizer -Lamb = LambOptimizer class ModelAverage(Optimizer): - """Accumulate the average of parameters within sliding window. The average + """Accumulate the average of parameters whtin sliding window. The average result will be saved in temporary variables which can be applied to parameter variables of current model by calling 'apply()' method. And the - 'restore()' method is used to restore the parameter values of current model. + 'restore()' method is used to restored the parameter values of current model. The size of average window is determined by average_window_rate, min_average_window, max_average_window and current update times. @@ -2235,45 +1849,22 @@ class ModelAverage(Optimizer): regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer. name: A optional name prefix. - Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy - - # First create the Executor. - place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) + optimizer = fluid.optimizer.Momentum() + optimizer.minimize(cost) + model_average = fluid.optimizer.ModelAverage(0.15, + min_average_window=10000, + max_average_window=20000) + for pass_id in range(args.pass_num): + for data in train_reader(): + exe.run(fluid.default_main_program()...) 
- train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - # build net - data = fluid.layers.data(name='X', shape=[1], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) - optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1) - optimizer.minimize(loss) - - # build ModelAverage optimizer - model_average = fluid.optimizer.ModelAverage(0.15, - min_average_window=10000, - max_average_window=20000) - - exe.run(startup_program) - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - # apply ModelAverage with model_average.apply(exe): - x = numpy.random.random(size=(10, 1)).astype('float32') - exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) + for data in test_reader(): + exe.run(inference_program...) """ def __init__(self, @@ -2293,8 +1884,7 @@ class ModelAverage(Optimizer): ).all_parameters(): if param.do_model_average != False: grad = param.block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( - [param.name, 'tmp'])), + name=unique_name.generate(".".join([param.name, 'tmp'])), dtype=param.dtype, persistable=False, stop_gradient=True) @@ -2388,10 +1978,6 @@ class ModelAverage(Optimizer): @signature_safe_contextmanager def apply(self, executor, need_restore=True): """Apply average values to parameters of current model. - - Args: - executor(fluid.Executor): current executor. - need_restore(bool): If you finally need to do restore, set it to True. Default is True. """ executor.run(self.apply_program) try: @@ -2402,435 +1988,5 @@ class ModelAverage(Optimizer): def restore(self, executor): """Restore parameter values of current model. - - Args: - executor(fluid.Executor): current executor. - """ - executor.run(self.restore_program) - - -class ExponentialMovingAverage(object): - """ - Compute the moving average of parameters with exponential decay. - Given a parameter :math:`\\theta`, its exponential moving average (EMA) - will be - - .. math:: - - \\text{EMA}_0 & = 0 - - \\text{EMA}_t & = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t - - The average results calculated by **update()** method will be saved in - temporary variables which are created and maintained by the object, and can - be applied to parameters of current model by calling **apply()** method. And - the **restore()** method is used to restore the parameters. - - **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be - zero biased, which can be corrected by divided by a factor - :math:`(1 - \\text{decay}^t)` , i.e., the actual EMAs applied to parameters - when calling **apply()** method would be - - .. math:: - - \\widehat{\\text{EMA}}_t = \\frac{\\text{EMA}_t}{1 - \\text{decay}^t} - - **Decay rate scheduling**. A large decay rate very close to 1 would result - in that the averages move very slowly. And a better strategy is to set a - relative smaller decay rate in the very beginning. The argument **thres_steps** - allows users to pass a Variable to schedule the decay rate, in this case, - the actual decay rate becomes - - .. math:: - - \\min(\\text{decay}, \\frac{1 + \\text{thres_steps}}{10 + \\text{thres_steps}}) - - Usually **thres_steps** can be the global training steps. - - - Args: - decay (float): The exponential decay rate, usually close to 1, such as - 0.999, 0.9999, ... . 
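
The bias correction above is easy to verify numerically: for a parameter held constant, the corrected EMA equals the parameter at every step, while the raw EMA starts heavily biased toward zero. The `thres_steps`-style decay schedule is shown alongside (plain Python, illustrative only):

.. code-block:: python

    decay = 0.9
    ema = 0.0
    for t in range(1, 4):
        theta = 1.0                                     # parameter held constant
        ema = decay * ema + (1 - decay) * theta         # raw, zero-biased EMA
        corrected = ema / (1 - decay ** t)              # bias-corrected EMA
        scheduled = min(decay, (1.0 + t) / (10.0 + t))  # decay-rate schedule
        print(t, round(ema, 4), corrected, round(scheduled, 4))
    # corrected prints 1.0 at every step
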
- thres_steps (Variable|None): If not `None`, schedule the decay rate. - name (str|None): An optional name prefix. - - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - - data = fluid.layers.data(name='x', shape=[5], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10) - cost = fluid.layers.mean(hidden) - - optimizer = fluid.optimizer.Adam(learning_rate=0.001) - optimizer.minimize(cost) - - global_steps = fluid.layers.learning_rate_scheduler._decay_step_counter() - ema = fluid.optimizer.ExponentialMovingAverage(0.999, thres_steps=global_steps) - ema.update() - - # pseudo code - for pass_id in range(args.pass_num): - for data in train_reader(): - exe.run(fluid.default_main_program()...) - - # usage 1 - with ema.apply(exe): - for data in test_reader(): - exe.run(inference_program...) - - # usage 2 - with ema.apply(exe, need_restore=False): - for data in test_reader(): - exe.run(inference_program...) - ... - ema.restore(exe) - """ - - def __init__(self, decay=0.999, thres_steps=None, name=None): - self._decay = decay - self._thres_steps = thres_steps - self._name = name if name is not None else '' - self._decay_var = self._get_ema_decay() - - self._params_tmps = [] - for param in default_main_program().global_block().all_parameters(): - if param.do_model_average != False: - tmp = param.block.create_var( - name=unique_name.generate(".".join( - [self._name + param.name, 'ema_tmp'])), - dtype=param.dtype, - persistable=False, - stop_gradient=True) - self._params_tmps.append((param, tmp)) - - self._ema_vars = {} - for param, tmp in self._params_tmps: - with param.block.program._optimized_guard( - [param, tmp]), name_scope('moving_average'): - self._ema_vars[param.name] = self._create_ema_vars(param) - - self.apply_program = Program() - block = self.apply_program.global_block() - with program_guard(main_program=self.apply_program): - decay_pow = self._get_decay_pow(block) - for param, tmp in self._params_tmps: - param = block._clone_variable(param) - tmp = block._clone_variable(tmp) - ema = block._clone_variable(self._ema_vars[param.name]) - layers.assign(input=param, output=tmp) - # bias correction - ema = ema / (1.0 - decay_pow) - layers.assign(input=ema, output=param) - - self.restore_program = Program() - block = self.restore_program.global_block() - with program_guard(main_program=self.restore_program): - for param, tmp in self._params_tmps: - tmp = block._clone_variable(tmp) - param = block._clone_variable(param) - layers.assign(input=tmp, output=param) - - def _get_ema_decay(self): - with default_main_program()._lr_schedule_guard(): - decay_var = layers.tensor.create_global_var( - shape=[1], - value=self._decay, - dtype='float32', - persistable=True, - name="scheduled_ema_decay_rate") - - if self._thres_steps is not None: - decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0) - with layers.control_flow.Switch() as switch: - with switch.case(decay_t < self._decay): - layers.tensor.assign(decay_t, decay_var) - with switch.default(): - layers.tensor.assign( - np.array( - [self._decay], dtype=np.float32), - decay_var) - return decay_var - - def _get_decay_pow(self, block): - global_steps = layers.learning_rate_scheduler._decay_step_counter() - decay_var = block._clone_variable(self._decay_var) - decay_pow_acc = layers.elementwise_pow(decay_var, global_steps + 1) - return decay_pow_acc - - def _create_ema_vars(self, param): - param_ema = layers.create_global_var( - name=unique_name.generate(self._name + param.name + '_ema'), - shape=param.shape, - 
value=0.0, - dtype=param.dtype, - persistable=True) - - return param_ema - - def update(self): - """ - Update Exponential Moving Average. Should only call this method in - train program. - """ - for param, tmp in self._params_tmps: - with param.block.program._optimized_guard( - [param, tmp]), name_scope('moving_average'): - param_ema = self._ema_vars[param.name] - ema_t = param_ema * self._decay_var + param * (1 - - self._decay_var) - layers.assign(input=ema_t, output=param_ema) - - @signature_safe_contextmanager - def apply(self, executor, need_restore=True): - """ - Apply moving average to parameters for evaluation. - - Args: - executor (Executor): The Executor to execute applying. - need_restore (bool): Whether to restore parameters after applying. - """ - executor.run(self.apply_program) - try: - yield - finally: - if need_restore: - self.restore(executor) - - def restore(self, executor): - """Restore parameters. - - Args: - executor (Executor): The Executor to execute restoring. """ executor.run(self.restore_program) - - -class PipelineOptimizer(object): - def __init__(self, - optimizer, - cut_list=None, - place_list=None, - concurrency_list=None, - queue_size=30, - sync_steps=1, - start_cpu_core_id=0): - # TODO: check properties - self._optimizer = optimizer - self._cut_list = cut_list - self._place_list = place_list - self._concurrency_list = concurrency_list - self._queue_size = queue_size - self._sync_steps = sync_steps - self._start_cpu_core_id = start_cpu_core_id - - def create_vars(self, block, main_program): - used_var_set = set() - for op_idx in range(block.desc.op_size()): - op_desc = block.desc.op(op_idx) - vars = op_desc.input_arg_names() + op_desc.output_arg_names() - for var in vars: - if var in used_var_set: - continue - used_var_set.add(var) - source_var = main_program.block(0).var(str(var)) - block._clone_variable(source_var, False) - - def extract_section_opt_ops(self, ops, cut_point_name): - """ - Extract opt ops in the given section - """ - output_names = set(cut_point_name) - relevant_op_flags = [True] * len(ops) - for i, op in reversed(list(enumerate(ops))): - if _some_in_set_(op.desc.output_arg_names(), output_names): - for name in op.desc.input_arg_names(): - output_names.add(name) - else: - relevant_op_flags[i] = False - - op_path = [ops[i] for i in range(len(ops)) if relevant_op_flags[i]] - return op_path - - def find_input_output(self, ops, name, is_forward=True): - """ - Find the inputs or outputs of a section - """ - all_set = set() - part_set = set() - for op in ops: - if is_forward: - part_set.update(op.desc.output_arg_names()) - else: - part_set.update(op.desc.input_arg_names()) - all_set.update(op.desc.output_arg_names()) - all_set.update(op.desc.input_arg_names()) - return all_set - part_set - - def find_persistable_vars(self, ops, whole_parameters): - """ - find the persistable input vars in current section - """ - res = set() - for op in ops: - vars = op.desc.input_arg_names() - for var in vars: - if var in whole_parameters: - res.add(var) - return res - - def _is_opt_role_op(self, op): - op_maker = core.op_proto_and_checker_maker - optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize - if op_maker.kOpRoleAttrName() in op.attr_names and \ - int(op.all_attrs()[op_maker.kOpRoleAttrName()]) & int(optimize_role) != 0: - return True - return False - - def _is_lr_role_op(self, op): - op_maker = core.op_proto_and_checker_maker - optimize_role = core.op_proto_and_checker_maker.OpRole.LRSched - if op_maker.kOpRoleAttrName() in op.attr_names and \ 
- int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role): - return True - return False - - def extract_section_ops(self, ops, cut_point_name): - """ - Extract ops in the given section - """ - output_names = set(cut_point_name) - relevant_op_flags = [True] * len(ops) - for i, op in reversed(list(enumerate(ops))): - if not self._is_opt_role_op(op) and _some_in_set_( - op.desc.output_arg_names(), output_names): - for name in op.desc.input_arg_names(): - output_names.add(name) - elif op.desc.type() == "print" and op.desc.input_arg_names()[ - 0] in output_names: - continue - else: - relevant_op_flags[i] = False - - op_path = [ops[i] for i in range(len(ops)) if relevant_op_flags[i]] - return op_path - - def find_section_opt(self, ops, params): - res = self.extract_section_opt_ops(ops, params) - return res - - def split_program(self, main_program, cut_list): - programs = [] - block = main_program.block(0) - whole_parameters = [e.name for e in block.all_parameters()] - cut_var_names = [] - cut_len = len(cut_list) - sec_params = [] - for i, cut_vars in enumerate(cut_list[:-1]): - cut_var_names.append([cut_var.name for cut_var in cut_vars]) - for i, cut_vars in reversed(list(enumerate(cut_list[:-1]))): - cut_var_names.append( - [_append_grad_suffix_(cut_var.name) for cut_var in cut_vars]) - if i == 0: - cut_var_names[-1] += [var.name for var in cut_list[-1]] - ops = block.ops[:] - for i, cut_vars in enumerate(cut_var_names): - program = { - "program": Program(), - "input_set": set(), - "output_set": set() - } - cur_ops = self.extract_section_ops(ops, cut_vars) - if i == 0: - for op in ops: - if self._is_lr_role_op(op): - cur_ops.append(op) - #prevent inplace in/out - program["input_set"].update( - self.find_input_output( - cur_ops, [], is_forward=True)) - for e in cur_ops: - ops.remove(e) - - if i < cut_len: - sec_params.append( - self.find_persistable_vars(cur_ops, whole_parameters)) - if i >= cut_len - 1: - opt_ops = self.find_section_opt(ops, - sec_params[2 * cut_len - 2 - i]) - - for e in opt_ops: - ops.remove(e) - cur_ops += opt_ops - - op_descs = [op.desc for op in cur_ops] - for op_desc in op_descs: - ap_op = program["program"].block(0).desc.append_op() - ap_op.copy_from(op_desc) - program["input_set"].update( - self.find_input_output( - cur_ops, cut_vars, is_forward=True)) - program["input_set"].update(sec_params[min(i, 2 * cut_len - 2 - i)]) - program["output_set"].update( - self.find_input_output( - cur_ops, cut_vars, is_forward=False)) - programs.append(program) - program = { - "program": Program(), - "input_set": set(), - "output_set": set() - } - op_descs = [op.desc for op in ops] - for op_desc in op_descs: - ap_op = program["program"].block(0).desc.append_op() - ap_op.copy_from(op_desc) - program["input_set"].update( - [cut_var.name + "@GRAD" for cut_var in cut_list[0]]) - program["input_set"].update( - self.find_input_output( - ops, [], is_forward=True)) - program["input_set"].update(sec_params[0]) - programs.append(program) - inputs = set() - for program in reversed(list(programs)): - output_list = list(program["output_set"]) - for output in output_list: - if output not in inputs: - program["output_set"].remove(output) - inputs.update(program["input_set"]) - return programs - - def minimize(self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None): - self._optimizer.minimize(loss, startup_program, parameter_list, - no_grad_set) - program = loss.block.program - program_list = self.split_program(program, self._cut_list) - for p in program_list: - 
self.create_vars(p["program"].block(0), program) - whole_parameters = [e.name for e in program.block(0).all_parameters()] - param_need_sync = [] - for i, section_p in enumerate(program_list): - if not isinstance(self._place_list[i], core.CUDAPlace): - continue - section_var = [e for e in section_p["program"].block(0).vars] - for p in section_var: - if p in whole_parameters: - param_need_sync.append(p) - program._pipeline_opt = { - "trainer": "PipelineTrainer", - "device_worker": "Section", - "section_program_list": program_list, - "place_list": self._place_list, - "concurrency_list": self._concurrency_list, - "queue_size": self._queue_size, - "start_cpu_core_id": self._start_cpu_core_id, - "sync_steps": self._sync_steps, - "param_need_sync": param_need_sync - } diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index d4a1041a4bf..a2c6537effa 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -125,6 +125,12 @@ class ParallelExecutor(object): num_trainers=1, trainer_id=0, scope=None): + sys.stderr.write( + 'ParallelExecutor is deprecated. ' + 'Please use CompiledProgram and Executor. CompiledProgram ' + 'is a central place for optimization and Executor is the ' + 'unified executor. Example can be found in compiler.py.\n') + if build_strategy is None: build_strategy = BuildStrategy() @@ -324,7 +330,6 @@ class ParallelExecutor(object): loss = fluid.layers.mean(hidden) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) exe.run(startup_program) parallel_exe = fluid.ParallelExecutor(use_cuda=use_cuda, diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 1778f4b55e7..b7ce1c0e4f5 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -202,12 +202,11 @@ class WeightNormParamAttr(ParamAttr): Examples: .. code-block:: python - - import paddle.fluid as fluid + data = fluid.layers.data(name="data", shape=[3, 32, 32], dtype="float32") fc = fluid.layers.fc(input=data, size=1000, - param_attr=fluid.WeightNormParamAttr( + param_attr=WeightNormParamAttr( dim=None, name='weight_norm_param')) diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index f1aca6e5b3e..c2322ec7634 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -12,15 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import core, dygraph +from . import core import six -import warnings -import numpy as np import threading -import paddle -from .framework import Program, Variable, program_guard, default_main_program, default_startup_program, in_dygraph_mode +from .framework import Program, Variable, program_guard, default_main_program, default_startup_program from .executor import global_scope -from .data_feeder import DataFeeder, BatchedTensorProvider, ListTensorProvider +from .data_feeder import DataFeeder, BatchedTensorProvider from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer from .unique_name import UniqueNameGenerator @@ -51,13 +48,12 @@ class PyReader(object): Args: feed_list (list(Variable)|tuple(Variable)): feed variable list. - The variables should be created by :code:`fluid.layers.data()`. - it can be None under iterable mode. + The variables should be created by :code:`fluid.layers.data()`. capacity (int): capacity of the queue maintained in PyReader object. 
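
On `capacity`: PyReader keeps a bounded queue between the feeding thread and the trainer, so the feeder can run at most `capacity` batches ahead of consumption. A standard-library model of that behaviour (the names and the sentinel are illustrative, not the PyReader implementation):

.. code-block:: python

    import threading
    try:
        import queue            # Python 3
    except ImportError:
        import Queue as queue   # Python 2

    q = queue.Queue(maxsize=4)  # the 'capacity' of the buffer

    def feeder():
        for i in range(10):
            q.put("batch-%d" % i)  # blocks once 4 batches are unconsumed
        q.put(None)                # end-of-data sentinel

    threading.Thread(target=feeder).start()
    while True:
        batch = q.get()
        if batch is None:
            break
        # consume the batch here, e.g. executor.run(feed=...)
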
use_double_buffer (bool): whether to use double_buffer_reader to speed up data feeding. iterable (bool): whether the created reader object is iterable. - return_list (bool): whether the return value presented as list. + Returns: reader (Reader): the created reader object. @@ -128,7 +124,7 @@ class PyReader(object): return reader image = fluid.layers.data(name='image', shape=[784, 784], dtype='float32') - reader = fluid.io.PyReader(feed_list=[image], capacity=4, iterable=True, return_list=False) + reader = fluid.io.PyReader(feed_list=[image], capacity=4, iterable=True) user_defined_reader = reader_creator_random_image(784, 784) reader.decorate_sample_list_generator( @@ -142,79 +138,26 @@ class PyReader(object): for data in reader(): executor.run(feed=data) - - 3. If return_list=True, the return values would be presented as list instead of dict`. - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - EPOCH_NUM = 3 - ITER_NUM = 5 - BATCH_SIZE = 10 - - def reader_creator_random_image(height, width): - def reader(): - for i in range(ITER_NUM): - yield np.random.uniform(low=0, high=255, size=[height, width]), - return reader - - image = fluid.layers.data(name='image', shape=[784, 784], dtype='float32') - reader = fluid.io.PyReader(feed_list=[image], capacity=4, iterable=True, return_list=True) - - user_defined_reader = reader_creator_random_image(784, 784) - reader.decorate_sample_list_generator( - paddle.batch(user_defined_reader, batch_size=BATCH_SIZE), - fluid.core.CPUPlace()) - # definition of network is omitted - executor = fluid.Executor(fluid.core.CPUPlace()) - executor.run(fluid.default_main_program()) - - for _ in range(EPOCH_NUM): - for data in reader(): - executor.run(feed={"image": data[0]}) """ unique_name_generator = UniqueNameGenerator() def __init__(self, - feed_list=None, - capacity=None, + feed_list, + capacity, use_double_buffer=True, - iterable=True, - return_list=False): + iterable=False): self._tensor_reader = None self._thread = None - self._feed_list = feed_list - if not capacity: - raise ValueError("Please give value to capacity.") - # force to use iterable mode under dygraph mode - if in_dygraph_mode(): - if not iterable: - warnings.warn( - "Please NOTE: dygraph can support iterable mode only.") - self._iterable = True - if not return_list: - warnings.warn( - "Please NOTE: dygraph can support return as list only.") - self._return_list = True - else: - self._iterable = iterable - self._return_list = return_list - if not self._feed_list: - raise Exception("Feed list must be given under static mode.") + self._iterable = iterable self._use_double_buffer = use_double_buffer self._capacity = capacity + self._feed_list = feed_list if not self._iterable: self._init_non_iterable() def _init_iterable(self, places): - if in_dygraph_mode(): - self._var_names = [] - else: - self._var_names = [v.name for v in self._feed_list] + self._var_names = [v.name for v in self._feed_list] self._places = _convert_places(places) self._queue = core.init_lod_tensor_blocking_queue(core.Variable(), self._capacity) @@ -297,7 +240,6 @@ class PyReader(object): def __init__(self, reader): self._reader = reader._reader self._reset = reader._reset - self._return_list = reader._return_list def __iter__(self): return self @@ -306,28 +248,12 @@ class PyReader(object): return self.next() def next(self): - if not in_dygraph_mode(): - if self._return_list: - ret = self._reader.read_next_list() - ret = ret[0] if ret is not None and len( - ret) > 0 else None - else: - 
ret = self._reader.read_next() - if ret: - return ret - else: - self._reset() - raise StopIteration + ret = self._reader.read_next() + if ret: + return ret else: - ret = self._reader.read_next_list() - if ret and ret[0]: - return [ - dygraph.base.to_variable(np.array(v)) - for v in ret[0] - ] - else: - self._reset() - raise StopIteration + self._reset() + raise StopIteration self._start() return Iterator(self) @@ -367,9 +293,8 @@ class PyReader(object): break ''' - if not in_dygraph_mode(): - assert not self._iterable, "start() cannot be called when PyReader is iterable" - self._start() + assert not self._iterable, "start() cannot be called when PyReader is iterable" + self._start() def reset(self): ''' @@ -402,9 +327,8 @@ class PyReader(object): break ''' - if not in_dygraph_mode(): - assert not self._iterable, "reset() cannot be called when PyReader is iterable" - self._reset() + assert not self._iterable, "reset() cannot be called when PyReader is iterable" + self._reset() def _start(self): def __thread_main__(): @@ -491,35 +415,27 @@ class PyReader(object): ''' assert batch_size > 0, "batch_size must be larger than 0" - if not in_dygraph_mode(): - has_lod = False - for f in self._feed_list: - if f.lod_level != 0: - has_lod = True - break - - if has_lod: - self.decorate_sample_list_generator( - paddle.batch( - sample_generator, - batch_size=batch_size, - drop_last=drop_last), - places=places) - else: - reader = BatchedTensorProvider( - feed_list=self._feed_list, - place=core.CPUPlace(), - batch_size=batch_size, - generator=sample_generator, - drop_last=drop_last) - self.decorate_batch_generator(reader, places=places) - else: + has_lod = False + for f in self._feed_list: + if f.lod_level != 0: + has_lod = True + break + + if has_lod: self.decorate_sample_list_generator( paddle.batch( sample_generator, batch_size=batch_size, drop_last=drop_last), places=places) + else: + reader = BatchedTensorProvider( + feed_list=self._feed_list, + place=core.CPUPlace(), + batch_size=batch_size, + generator=sample_generator, + drop_last=drop_last) + self.decorate_batch_generator(reader, places=places) def decorate_sample_list_generator(self, reader, places=None): ''' @@ -572,22 +488,14 @@ class PyReader(object): ''' assert self._tensor_reader is None, \ "Cannot reset the data source of PyReader" - if not in_dygraph_mode(): - with program_guard(Program(), Program()): - feeder = DataFeeder( - feed_list=self._feed_list, place=core.CPUPlace()) - paddle_reader = feeder.decorate_reader( - reader, multi_devices=False) - - def __tensor_reader_impl__(): - for slots in paddle_reader(): - yield [slots[var.name] for var in self._feed_list] - else: - provider = ListTensorProvider(reader, places) - - def __tensor_reader_impl__(): - for slots in provider(): - yield slots[0] + with program_guard(Program(), Program()): + feeder = DataFeeder( + feed_list=self._feed_list, place=core.CPUPlace()) + paddle_reader = feeder.decorate_reader(reader, multi_devices=False) + + def __tensor_reader_impl__(): + for slots in paddle_reader(): + yield [slots[var.name] for var in self._feed_list] self.decorate_batch_generator(__tensor_reader_impl__, places) diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 822029a372b..d8aace9fdfa 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -124,21 +124,11 @@ class L2DecayRegularizer(WeightDecayRegularizer): Examples: .. 
code-block:: python - import paddle.fluid as fluid - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data = fluid.layers.data(name='image', shape=[3, 28, 28], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - hidden = fluid.layers.fc(input=data, size=128, act='relu') - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) optimizer = fluid.optimizer.Adagrad( learning_rate=1e-4, regularization=fluid.regularizer.L2DecayRegularizer( regularization_coeff=0.1)) - optimizer.minimize(avg_loss) + optimizer.minimize(avg_cost) """ def __init__(self, regularization_coeff=0.0): @@ -162,11 +152,8 @@ class L2DecayRegularizer(WeightDecayRegularizer): assert isinstance(param, framework.Parameter) assert isinstance(block, framework.Block) - if framework.in_dygraph_mode(): - decay = block.create_var(dtype=param.dtype, shape=param.shape) - else: - decay = block.create_var( - dtype=param.dtype, shape=param.shape, lod_level=param.lod_level) + decay = block.create_var( + dtype=param.dtype, shape=param.shape, lod_level=param.lod_level) # Append Op to calculate decay block.append_op( @@ -196,21 +183,11 @@ class L1DecayRegularizer(WeightDecayRegularizer): Examples: .. code-block:: python - import paddle.fluid as fluid - main_prog = fluid.Program() - startup_prog = fluid.Program() - with fluid.program_guard(main_prog, startup_prog): - data = fluid.layers.data(name='image', shape=[3, 28, 28], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - hidden = fluid.layers.fc(input=data, size=128, act='relu') - prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) optimizer = fluid.optimizer.Adagrad( learning_rate=1e-4, regularization=fluid.regularizer.L1DecayRegularizer( regularization_coeff=0.1)) - optimizer.minimize(avg_loss) + optimizer.minimize(avg_cost) """ def __init__(self, regularization_coeff=0.0): @@ -234,11 +211,8 @@ class L1DecayRegularizer(WeightDecayRegularizer): assert isinstance(param, framework.Parameter) assert isinstance(block, framework.Block) - if framework.in_dygraph_mode(): - decay = block.create_var(dtype=param.dtype, shape=param.shape) - else: - decay = block.create_var( - dtype=param.dtype, shape=param.shape, lod_level=param.lod_level) + decay = block.create_var( + dtype=param.dtype, shape=param.shape, lod_level=param.lod_level) # Append sign op block.append_op( diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index 2d81fd43171..d24417bbacb 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -1,10 +1,6 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -if(NOT WITH_DISTRIBUTE) - list(REMOVE_ITEM TEST_OPS test_communicator) -endif(NOT WITH_DISTRIBUTE) - foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/fluid/tests/demo/pyreader.py b/python/paddle/fluid/tests/demo/pyreader.py index 6995346ffa6..bbcef4c3ff2 100644 --- a/python/paddle/fluid/tests/demo/pyreader.py +++ b/python/paddle/fluid/tests/demo/pyreader.py @@ -80,21 +80,19 @@ def main(): train_reader.start() try: while True: - print( - 'train_loss', - 
numpy.array(trainer.run(fetch_list=[loss.name]))) + print 'train_loss', numpy.array( + trainer.run(fetch_list=[loss.name])) except fluid.core.EOFException: - print('End of epoch', epoch_id) + print 'End of epoch', epoch_id train_reader.reset() test_reader.start() try: while True: - print( - 'test loss', - numpy.array(tester.run(fetch_list=[test_loss.name]))) + print 'test loss', numpy.array( + tester.run(fetch_list=[test_loss.name])) except fluid.core.EOFException: - print('End of testing') + print 'End of testing' test_reader.reset() diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index e72a430ff57..e1c4c2eca08 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -16,7 +16,6 @@ from __future__ import print_function import paddle.fluid as fluid import paddle.fluid.layers as layers -from paddle.fluid.layers import detection from paddle.fluid.framework import Program, program_guard import unittest @@ -350,7 +349,7 @@ class TestDetectionMAP(unittest.TestCase): append_batch_size=False, dtype='float32') - map_out = detection.detection_map(detect_res, label, 21) + map_out = layers.detection_map(detect_res, label, 21) self.assertIsNotNone(map_out) self.assertEqual(map_out.shape, (1, )) print(str(program)) @@ -523,32 +522,6 @@ class TestMulticlassNMS(unittest.TestCase): self.assertIsNotNone(output) -class TestCollectFpnPropsals(unittest.TestCase): - def test_collect_fpn_proposals(self): - program = Program() - with program_guard(program): - multi_bboxes = [] - multi_scores = [] - for i in range(4): - bboxes = layers.data( - name='rois' + str(i), - shape=[10, 4], - dtype='float32', - lod_level=1, - append_batch_size=False) - scores = layers.data( - name='scores' + str(i), - shape=[10, 1], - dtype='float32', - lod_level=1, - append_batch_size=False) - multi_bboxes.append(bboxes) - multi_scores.append(scores) - fpn_rois = layers.collect_fpn_proposals(multi_bboxes, multi_scores, - 2, 5, 10) - self.assertIsNotNone(fpn_rois) - - class TestDistributeFpnProposals(unittest.TestCase): def test_distribute_fpn_proposals(self): program = Program() diff --git a/python/paddle/fluid/tests/test_lod_tensor.py b/python/paddle/fluid/tests/test_lod_tensor.py index 9bd343c103f..722b5f07b04 100644 --- a/python/paddle/fluid/tests/test_lod_tensor.py +++ b/python/paddle/fluid/tests/test_lod_tensor.py @@ -15,7 +15,6 @@ from __future__ import print_function import paddle.fluid as fluid -import paddle.fluid.core as core from paddle.fluid.lod_tensor import create_lod_tensor, create_random_int_lodtensor import numpy as np import unittest @@ -97,23 +96,6 @@ class TestLoDTensor(unittest.TestCase): recursive_seq_lens) self.assertEqual(tensor.shape(), [10, 1]) - def test_print_lodtensor(self): - shape = [1] - recursive_seq_lens = [[2, 3, 5]] - dict_size = 100 - low = 0 - high = dict_size - 1 - tensor = create_random_int_lodtensor(recursive_seq_lens, shape, - fluid.CPUPlace(), low, high) - print(tensor) - self.assertTrue(isinstance(str(tensor), str)) - - if core.is_compiled_with_cuda(): - gtensor = create_random_int_lodtensor(recursive_seq_lens, shape, - fluid.CUDAPlace(0), low, high) - print(gtensor) - self.assertTrue(isinstance(str(gtensor), str)) - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 15569b339df..aa4fc5ceb90 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ 
b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -8,8 +8,6 @@ if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler) list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_nccl) - LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_lars) LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec) LIST(REMOVE_ITEM TEST_OPS test_dist_ctr) LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow) @@ -17,11 +15,8 @@ if(NOT WITH_DISTRIBUTE) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op) LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op) - LIST(REMOVE_ITEM TEST_OPS test_dist_fleet_ctr) endif(NOT WITH_DISTRIBUTE) -LIST(REMOVE_ITEM TEST_OPS test_launch) - if (NOT ${WITH_GPU}) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mnist) # TODO(Yancey1989): parallel dygraph support CPU device in future @@ -34,7 +29,6 @@ list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://gi list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 - list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorator_helper) # decorator_helper is a helper python file, not a test if(APPLE) @@ -67,35 +61,12 @@ function(py_test_modules TARGET_NAME) COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - if (py_test_modules_SERIAL) - set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + if (py_test_modules_SERIAL) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 350) endif() endfunction() - -function(bash_test_modules TARGET_NAME) - if(NOT WITH_TESTING) - return() - endif() - - set(options SERIAL) - set(oneValueArgs "") - set(multiValueArgs MODULES DEPS ENVS) - cmake_parse_arguments(bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - message(STATUS "CMAKE_CURRENT_BINARY_DIR:" ${CMAKE_CURRENT_BINARY_DIR}) - - add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${bash_test_modules_ENVS} - bash ${CMAKE_CURRENT_BINARY_DIR}/${bash_test_modules_MODULES} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - if (bash_test_modules_SERIAL) - set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) - endif() - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) -endfunction() - list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_dist_transpiler) @@ -109,13 +80,10 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_imperative_resnet) -list(REMOVE_ITEM TEST_OPS test_imperative_resnet_sorted_gradient) -list(REMOVE_ITEM TEST_OPS test_imperative_mnist_sorted_gradient) list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext) list(REMOVE_ITEM TEST_OPS test_imperative_mnist) list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) list(REMOVE_ITEM TEST_OPS 
test_layers) -list(REMOVE_ITEM TEST_OPS test_imperative_ocr_attention_model) # Some ops need to check results when gc is enabled # Currently, only ops that register NoNeedBufferVarsInference need to do this test @@ -156,52 +124,43 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) -py_test_modules(test_warpctc_op MODULES test_warpctc_op) -py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS}) -py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS}) +py_test_modules(test_warpctc_op MODULES test_warpctc_op SERIAL) +py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS} SERIAL) +py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS} SERIAL) py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS FLAGS_cudnn_deterministic=1 SERIAL) -set_tests_properties(test_imperative_resnet PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") -py_test_modules(test_imperative_resnet_sorted_gradient MODULES test_imperative_resnet_sorted_gradient ENVS - FLAGS_cudnn_deterministic=1 SERIAL) -set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS - FLAGS_cudnn_deterministic=1) -py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mnist_sorted_gradient ENVS - FLAGS_cudnn_deterministic=1) + FLAGS_cudnn_deterministic=1 SERIAL) py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS FLAGS_cudnn_deterministic=1 SERIAL) -set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") if(WITH_DISTRIBUTE) - py_test_modules(test_dist_train MODULES test_dist_train) + py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) if(WITH_DGC) py_test_modules(test_dgc_op MODULES test_dgc_op) endif() if(NOT APPLE) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) - set_tests_properties(test_dist_mnist_nccl PROPERTIES TIMEOUT 250) - set_tests_properties(test_dist_mnist_lars PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) - py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl) - bash_test_modules(test_launch MODULES test_launch.sh) + py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl SERIAL) # FIXME(typhoonzero): add these tests back # py_test_modules(test_dist_transformer MODULES test_dist_transformer) # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) + set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE) endif(NOT APPLE) # py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) endif() -py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf) -py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed) +py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) +py_test_modules(test_parallel_executor_fetch_feed MODULES 
test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450) set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 740) -py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer) +py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1) if(NOT WIN32) - py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer) + py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL) endif() if(CMAKE_BUILD_TYPE STREQUAL "Debug") @@ -223,5 +182,5 @@ if(WITH_DISTRIBUTE) endif() set_tests_properties(test_recordio_reader test_parallel_executor_test_while_train test_parallel_executor_mnist - test_parallel_executor_seresnext test_parallel_executor_crf test_sync_batch_norm_op + test_parallel_executor_seresnext test_parallel_executor_crf PROPERTIES LABELS "RUN_TYPE=DIST") diff --git a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py index c030afdd4ff..48a4768782c 100644 --- a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py +++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py @@ -16,9 +16,9 @@ import logging import paddle import tarfile -from paddle.fluid.log_helper import get_logger - -logger = get_logger("paddle", logging.INFO) +logging.basicConfig() +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) DATA_URL = "http://paddle-ctr-data.bj.bcebos.com/avazu_ctr_data.tgz" DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e" diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index 3775f62097d..ace053e030c 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -23,6 +23,7 @@ from itertools import product import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.executor import Executor +from paddle.fluid.backward import calc_gradient from paddle.fluid.backward import _append_grad_suffix_, _as_list @@ -182,7 +183,7 @@ def _compute_analytical_jacobian(program, x, y, place, scope): dy = program.global_block().create_var( name=dy_name, shape=y.shape, dtype=np_type, persistable=True) # append backward - dx = fluid.gradients(y, x, dy) + dx = calc_gradient(y, x, dy) # init dy tensor in scope value = np.zeros(y.shape, dtype=np_type) @@ -195,23 +196,17 @@ def _compute_analytical_jacobian(program, x, y, place, scope): x = _as_list(x) jacobian = make_jacobian(x, y_size, np_type) - # filter None in dx for DX/DY may be None in kernel - # only fetch not None dx in exe.run - filted = [(i, dxi) for i, dxi in enumerate(dx) if dxi is not None] - filted_idx, filted_dx = zip(*filted) - for i in six.moves.xrange(y_size): _set_item(dy_t, i, 1, np_type) - dx_res = exe.run(program, scope=scope, fetch_list=filted_dx) + dx_res = exe.run(program, scope=scope, fetch_list=dx) - for j in six.moves.xrange(len(filted_dx)): - dx_idx = filted_idx[j] + for j in six.moves.xrange(len(x)): if dx_res[j] is not None: - jacobian[dx_idx][:, i] = dx_res[j].flatten() + jacobian[j][:, i] = dx_res[j].flatten() else: - jacobian[dx_idx][:, i] = np.zeros( - dx[dx_idx].shape, dtype=np_type).flatten() + jacobian[j][:, i] = np.zeros( + dx[j].shape, dtype=np_type).flatten() 
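The Jacobian loop above fills one column at a time: it sets a single element
of dy to 1, runs the backward program, and stores the fetched dx. A minimal
driver for this checker, assuming a small smooth graph (the variable names
and the exact grad_check defaults here are illustrative):

.. code-block:: python

    import paddle.fluid as fluid
    import paddle.fluid.core as core
    from paddle.fluid.tests.unittests.gradient_checker import grad_check

    prog = fluid.Program()
    with fluid.program_guard(prog):
        x = fluid.layers.data(
            name='x', shape=[2, 3], dtype='float64', append_batch_size=False)
        x.persistable = True  # the checker reads/writes x through the scope
        y = fluid.layers.sigmoid(x)

    # compares the finite-difference Jacobian of y w.r.t. x against the
    # analytical one assembled column by column as in the loop above
    grad_check([x], y, place=core.CPUPlace(), program=prog)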
_set_item(dy_t, i, 0, np_type) @@ -308,7 +303,7 @@ def grad_check(x, _compute_analytical_jacobian(prog, clone_x, clone_y, place, scope)) for i, (x_idx, - y_idx) in enumerate(product(*[range(len(x)), range(len(y))])): + y_idx) in enumerate(product(* [range(len(x)), range(len(y))])): a = analytical[y_idx][x_idx] n = numerical[x_idx][y_idx] if not np.allclose(a, n, rtol, atol): @@ -381,7 +376,7 @@ def double_grad_check(x, ] # append first order grads - target_grads = fluid.gradients(y, x, y_grads) + target_grads = calc_gradient(y, x, y_grads) # y_grads are the input of first-order backward, # so, they are also the input of second-order backward. diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 6e4f0166121..28b670d7ab3 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -57,8 +57,6 @@ class TestConv2dMKLDNNOp(TestConv2dOp): self.fuse_bias = False self.bias_size = None self.fuse_relu = False - self.fuse_brelu = False - self.fuse_brelu_threshold = 6.0 self.fuse_residual_connection = False self.input_residual_size = None TestConv2dOp.setUp(self) @@ -86,38 +84,15 @@ class TestConv2dMKLDNNOp(TestConv2dOp): if self.fuse_relu: output = np.maximum(output, 0).astype(self.dsttype) - if self.fuse_brelu: - output = np.minimum( - np.maximum(output, 0), - self.fuse_brelu_threshold).astype(self.dsttype) output = output.astype(self.dtype) self.attrs['fuse_bias'] = self.fuse_bias self.attrs['fuse_relu'] = self.fuse_relu - self.attrs['fuse_brelu'] = self.fuse_brelu - self.attrs['fuse_brelu_threshold'] = self.fuse_brelu_threshold self.attrs['fuse_residual_connection'] = self.fuse_residual_connection self.outputs['Output'] = output -class TestWithbreluFusion(TestConv2dMKLDNNOp): - def init_test_case(self): - TestConv2dMKLDNNOp.init_test_case(self) - self.fuse_brelu = True - self.fuse_brelu_threshold = 6.0 - self.dsttype = np.float32 - - def test_check_grad(self): - pass - - def test_check_grad_no_filter(self): - pass - - def test_check_grad_no_input(self): - pass - - class TestWithFuse(TestConv2dMKLDNNOp): def init_test_case(self): TestConv2dMKLDNNOp.init_test_case(self) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py index 8f0a9898dce..84229a5cffb 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py @@ -20,30 +20,34 @@ from paddle.fluid.tests.unittests.op_test import OpTest def fully_connected_naive(input, weights, bias_data=None): + in_n, in_c, in_h, in_w = input.shape + w_h, w_c = weights.shape + + x_data = np.reshape(input, [in_n, in_c * in_h * in_w]) + # this transpose should be implemented at C code + w_data = np.transpose(np.reshape(weights, (w_c, in_c * in_h * in_w))) result = None if not bias_data: - result = np.dot(input, weights) + result = np.dot(x_data, w_data) else: - result = np.dot(input, weights) + bias_data + result = np.dot(x_data, w_data) + bias_data return result class MatrixGenerate: def __init__(self, mb, ic, oc, h, w): - self.input = np.random.random((mb, ic * h * w)).astype("float32") + self.input = np.random.random((mb, ic, h, w)).astype("float32") self.weights = np.random.random((ic * h * w, oc)).astype("float32") class TestFCMKLDNNOp(OpTest): - def create_data(self): - self.matrix = 
MatrixGenerate(1, 10, 15, 3, 3) - def setUp(self): self.op_type = "fc" self.use_mkldnn = True - self.create_data() + self.matrix = MatrixGenerate(1, 10, 15, 3, 3) + self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights} self.attrs = {'use_mkldnn': self.use_mkldnn, } @@ -56,16 +60,37 @@ class TestFCMKLDNNOp(OpTest): self.check_output() def test_check_grad_normal(self): - pass + self.check_grad(set(['Input', 'W']), 'Out', max_relative_error=0.9) def test_check_grad_no_weight(self): - pass + self.check_grad( + ['Input'], 'Out', max_relative_error=0.5, no_grad_set=set('W')) class TestFCMKLDNNOp1(TestFCMKLDNNOp): - def create_data(self): + def init_op_type(self): self.matrix = MatrixGenerate(2, 15, 48, 2, 2) +class TestFCMKLDNNOp2(TestFCMKLDNNOp): + def init_op_type(self): + self.matrix = MatrixGenerate(2, 32, 40, 1, 1) + + +class TestFCMKLDNNOp3(TestFCMKLDNNOp): + def init_op_type(self): + self.matrix = MatrixGenerate(2, 2, 4, 1, 1) + + +class TestFCMKLDNNOp4(TestFCMKLDNNOp): + def init_op_type(self): + self.matrix = MatrixGenerate(2, 32, 48, 2, 2) + + +class TestFCMKLDNNOp4(TestFCMKLDNNOp): + def init_op_type(self): + self.matrix = MatrixGenerate(2, 32, 1000, 6, 6) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py index a7f167cbd41..034d7792c13 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py @@ -18,7 +18,7 @@ import unittest import numpy as np import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest -from paddle.fluid.tests.unittests.test_activation_op import TestAbs, TestGelu, TestSigmoid, TestSquare, TestRelu, TestTanh +from paddle.fluid.tests.unittests.test_activation_op import TestSigmoid, TestRelu, TestTanh class TestNGRAPHReluDim4(TestRelu): diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py index 764d136ec8d..ff2e865b66a 100644 --- a/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py +++ b/python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py @@ -15,40 +15,7 @@ from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1, TestDepthwiseConv, TestDepthwiseConv2, TestDepthwiseConv3, TestDepthwiseConvWithDilation, TestDepthwiseConvWithDilation2 - - -class TestNGRAPHDepthwiseConv(TestDepthwiseConv): - def init_test_case(self): - super(TestNGRAPHDepthwiseConv, self).init_test_case() - self.use_cuda = False - - -class TestNGRAPHDepthwiseConv2(TestDepthwiseConv2): - def init_test_case(self): - super(TestNGRAPHDepthwiseConv2, self).init_test_case() - self.use_cuda = False - - -class TestNGRAPHDepthwiseConv3(TestDepthwiseConv3): - def init_test_case(self): - super(TestNGRAPHDepthwiseConv3, self).init_test_case() - self.use_cuda = False - - -class TestNGRAPHDepthwiseConvWithDilation(TestDepthwiseConvWithDilation): - def init_test_case(self): - super(TestNGRAPHDepthwiseConvWithDilation, self).init_test_case() - self.use_cuda = False - - -class TestNGRAPHDepthwiseConvWithDilation2(TestDepthwiseConvWithDilation2): - def init_test_case(self): - super(TestNGRAPHDepthwiseConvWithDilation2, self).init_test_case() - 
self.use_cuda = False - - -del TestDepthwiseConv, TestDepthwiseConv2, TestDepthwiseConv3, TestDepthwiseConvWithDilation, TestDepthwiseConvWithDilation2 +from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestWithPad, TestWithStride, TestWithGroup, TestWith1x1, TestWithInput1x1Filter1x1 if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py index 3890236013c..8b9e2997ec7 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py @@ -55,6 +55,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer): self._conv2d = Conv2D( self.full_name(), + num_channels=num_channels, num_filters=num_filters, filter_size=filter_size, stride=conv_stride, @@ -100,13 +101,11 @@ class MNIST(fluid.dygraph.Layer): loc=0.0, scale=scale)), act="softmax") - def forward(self, inputs, label): + def forward(self, inputs): x = self._simple_img_conv_pool_1(inputs) x = self._simple_img_conv_pool_2(x) - cost = self._fc(x) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - return avg_loss + x = self._fc(x) + return x class TestMnist(TestParallelDyGraphRunnerBase): @@ -114,7 +113,7 @@ class TestMnist(TestParallelDyGraphRunnerBase): model = MNIST("mnist") train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=2, drop_last=True) - opt = fluid.optimizer.SGD(learning_rate=1e-3) + opt = SGDOptimizer(learning_rate=1e-3) return model, train_reader, opt def run_one_loop(self, model, opt, data): @@ -127,8 +126,9 @@ class TestMnist(TestParallelDyGraphRunnerBase): label = to_variable(y_data) label.stop_gradient = True - avg_loss = model(img, label) - + cost = model(img) + loss = fluid.layers.cross_entropy(cost, label) + avg_loss = fluid.layers.mean(loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index abc463a0fb0..5e77ce9b811 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -105,23 +105,18 @@ def train(use_cuda, thread_num, cpu_num): img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network( use_py_reader=True) - print("build convolutional neural network done.") optimizer = fluid.optimizer.Adam(learning_rate=0.001) optimizer.minimize(avg_loss) - print("Adam optimizer minimize done.") train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=500), batch_size=BATCH_SIZE) - print("declared train reader done.") place = fluid.CPUPlace() exe = fluid.Executor(place) - print("going to run startup program") exe.run(fluid.default_startup_program()) - print("run startup program done.") os.environ['CPU_NUM'] = str(cpu_num) @@ -142,7 +137,6 @@ def train(use_cuda, thread_num, cpu_num): main_program=main_program, build_strategy=build_strategy, exec_strategy=exec_strategy) - print("declare parallel executor done.") py_reader.decorate_paddle_reader(train_reader) diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index b5d1115723e..42276a0647d 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -25,15 +25,9 @@ class 
TestConcatOp(OpTest): self.init_test_data() self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} self.attrs = {'axis': self.axis} - if self.axis < 0: - self.actual_axis = self.axis + len(self.x0.shape) - self.actual_axis = self.actual_axis if self.actual_axis > 0 else 0 - else: - self.actual_axis = self.axis - self.outputs = { 'Out': np.concatenate( - (self.x0, self.x1, self.x2), axis=self.actual_axis) + (self.x0, self.x1, self.x2), axis=self.axis) } def test_check_output(self): @@ -81,13 +75,5 @@ class TestConcatOp4(TestConcatOp): pass -class TestConcatOp5(TestConcatOp): - def init_test_data(self): - self.x0 = np.random.random((2, 1, 4, 5)).astype('float32') - self.x1 = np.random.random((2, 2, 4, 5)).astype('float32') - self.x2 = np.random.random((2, 3, 4, 5)).astype('float32') - self.axis = -3 - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 6daf9f8994d..6c7054e95ef 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -24,26 +24,17 @@ import six import argparse import pickle import numpy as np -import time + import paddle.fluid as fluid from paddle.fluid import compiler import paddle.fluid.dygraph as dygraph from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.parallel import DataParallel -RUN_STEP = 5 +RUN_STEP = 10 DEFAULT_BATCH_SIZE = 2 -def my_print(class_name, log_str): - localtime = time.asctime(time.localtime(time.time())) - print_str = localtime + "\t" + class_name + "\t" + log_str - if six.PY2: - sys.stderr.write(pickle.dumps(print_str)) - else: - sys.stderr.buffer.write(pickle.dumps(print_str)) - - class TestDistRunnerBase(object): def get_model(self, batch_size=DEFAULT_BATCH_SIZE, @@ -60,14 +51,11 @@ class TestDistRunnerBase(object): trainers, sync_mode, dc_asgd=False, - current_endpoint=None, - nccl_comm_num=1): + current_endpoint=None): # NOTE: import fluid until runtime, or else forking processes will cause error. 
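For orientation, the pserver and nccl2 branches below both funnel through
DistributeTranspiler; the nccl2 path configured in the following hunks boils
down to roughly this (the endpoints are placeholders):

.. code-block:: python

    import paddle.fluid as fluid

    config = fluid.DistributeTranspilerConfig()
    config.mode = "nccl2"

    t = fluid.DistributeTranspiler(config=config)
    # in nccl2 mode `trainers` is the comma-separated endpoint list,
    # not a trainer count as in pserver mode
    t.transpile(
        trainer_id=0,
        program=fluid.default_main_program(),
        startup_program=fluid.default_startup_program(),
        trainers="127.0.0.1:6170,127.0.0.1:6171",
        current_endpoint="127.0.0.1:6170")
    trainer_prog = fluid.default_main_program()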
config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd config.sync_mode = sync_mode - if nccl_comm_num > 1: - config.nccl_comm_num = nccl_comm_num # config.runtime_split_send_recv = True t = fluid.DistributeTranspiler(config=config) t.transpile( @@ -92,9 +80,7 @@ class TestDistRunnerBase(object): place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) - my_print(type(self).__name__, "run pserver startup program done.") exe.run(pserver_prog) - my_print(type(self).__name__, "run pserver main program done.") def run_trainer(self, args): self.lr = args.lr @@ -109,29 +95,17 @@ class TestDistRunnerBase(object): self.get_model(batch_size=args.batch_size) if args.mem_opt: - my_print(type(self).__name__, "begin to run memory optimize") fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) - my_print(type(self).__name__, "trainer run memory optimize done.") if args.update_method == "pserver": - my_print( - type(self).__name__, - "begin to run transpile on trainer with pserver mode") t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode, args.dc_asgd) trainer_prog = t.get_trainer_program() - my_print( - type(self).__name__, - "get trainer program done with pserver mode.") elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer": # transpile for nccl2 config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" - config.nccl_comm_num = args.nccl_comm_num - my_print( - type(self).__name__, - "begin to run transpile on trainer with nccl2 mode") nccl2_t = fluid.DistributeTranspiler(config=config) nccl2_t.transpile( args.trainer_id, @@ -139,9 +113,6 @@ class TestDistRunnerBase(object): startup_program=fluid.default_startup_program(), trainers=args.endpoints, current_endpoint=args.current_endpoint) - my_print( - type(self).__name__, - "get trainer program done. with nccl2 mode") trainer_prog = fluid.default_main_program() else: trainer_prog = fluid.default_main_program() @@ -154,7 +125,6 @@ class TestDistRunnerBase(object): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - my_print(type(self).__name__, "run worker startup program done.") exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = 1 @@ -165,9 +135,6 @@ class TestDistRunnerBase(object): build_stra.enable_inplace = False build_stra.memory_optimize = False - if args.enable_backward_deps: - build_stra.enable_backward_optimizer_op_deps = True - if args.use_reduce: build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce else: @@ -187,21 +154,10 @@ class TestDistRunnerBase(object): build_stra.num_trainers = 1 build_stra.trainer_id = 0 - my_print(type(self).__name__, "begin to compile with data parallel") binary = compiler.CompiledProgram(trainer_prog).with_data_parallel( loss_name=avg_cost.name, build_strategy=build_stra, exec_strategy=exec_strategy) - my_print(type(self).__name__, "program compiled with data parallel") - - if args.use_cuda and args.update_method == "nccl2": - # it just for test share_vars_from feature. 
- test_exe = fluid.ParallelExecutor( - use_cuda=True, - loss_name=avg_cost.name, - build_strategy=build_stra, - main_program=test_program, - share_vars_from=binary._executor) feed_var_list = [ var for var in trainer_prog.global_block().vars.values() @@ -222,7 +178,6 @@ class TestDistRunnerBase(object): else: return origin_batch - my_print(type(self).__name__, "begin to train on trainer") out_losses = [] for _ in six.moves.xrange(RUN_STEP): loss, = exe.run(binary, @@ -245,7 +200,6 @@ class TestParallelDyGraphRunnerBase(object): "train_one_loop should be implemented by the child classes.") def run_trainer(self, args): - seed = 90 device_id = int(os.getenv("FLAGS_selected_gpus", "0")) place = fluid.CUDAPlace(device_id) @@ -263,48 +217,39 @@ class TestParallelDyGraphRunnerBase(object): with fluid.dygraph.guard(place): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - np.random.seed(seed) - import random - random.seed = seed model, train_reader, opt = self.get_model() - nranks = len(args.endpoints.split(",")) if args.endpoints else 1 + nranks = len(args.endpoints.split(",")) if args.endpoints else 1 if args.update_method == "nccl2": + sys.stderr.write("") + model = dygraph.parallel.DataParallel(model) strategy = dygraph.parallel.ParallelStrategy() strategy.nranks = nranks strategy.local_rank = args.trainer_id strategy.trainer_endpoints = args.endpoints.split(",") strategy.current_endpoint = args.current_endpoint - my_print( - type(self).__name__, - "begin to prepare context in dygraph with nccl2") dygraph.parallel.prepare_context(strategy) - model = dygraph.parallel.DataParallel(model, strategy) - my_print(type(self).__name__, "model built in dygraph") out_losses = [] - my_print(type(self).__name__, "begin to run dygraph training") for step_id, data in enumerate(train_reader()): data = _get_data(data) if step_id == RUN_STEP: break loss = self.run_one_loop(model, opt, data) - if step_id % 10 == 0: - my_print( - type(self).__name__, - "loss at step %d: %f" % (step_id, loss)) - out_losses.append(loss.numpy()) - # FIXME(Yancey1989): scale the loss inplace - if args.update_method == "nccl2": - loss = model.scale_loss(loss) + # FIXME(Yancey1989): scale the loss inplace + loss.stop_gradient = True + loss_scale = to_variable(np.array([nranks]).astype("float32")) + loss = loss / loss_scale + out_losses.append(loss.numpy()) loss.backward() - if args.update_method == "nccl2": - model.apply_collective_grads() opt.minimize(loss) model.clear_gradients() - my_print(type(self).__name__, pickle.dumps(out_losses)) + if six.PY2: + print(pickle.dumps(out_losses)) + else: + sys.stdout.buffer.write(pickle.dumps(out_losses)) def runtime_main(test_class): @@ -319,9 +264,6 @@ def runtime_main(test_class): choices=["pserver", "nccl2", "local", "nccl2_reduce_layer"]) parser.add_argument('--trainer_id', type=int, required=False, default=0) parser.add_argument('--trainers', type=int, required=False, default=1) - parser.add_argument('--nccl_comm_num', type=int, required=False, default=1) - parser.add_argument( - '--enable_backward_deps', type=bool, required=False, default=1) parser.add_argument( '--current_endpoint', type=str, required=False, default="") parser.add_argument('--sync_mode', action='store_true') @@ -399,18 +341,14 @@ class TestDistBase(unittest.TestCase): self._lr = 0.001 self._use_dgc = False self._dygraph = False - self._nccl_comm_num = 1 self._setup_config() self._after_setup_config() - self._enable_backward_deps = False def _find_free_port(self): def 
__free_port(): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: s.bind(('', 0)) - my_print( - type(self).__name__, "socket name: %s" % s.getsockname()[1]) return s.getsockname()[1] while True: @@ -441,13 +379,11 @@ class TestDistBase(unittest.TestCase): ps0_pipe = open("/tmp/ps0_err.log", "wb") ps1_pipe = open("/tmp/ps1_err.log", "wb") - my_print(type(self).__name__, "going to start pserver process 0") ps0_proc = subprocess.Popen( ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe, env=required_envs) - my_print(type(self).__name__, "going to start pserver process 1") ps1_proc = subprocess.Popen( ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, @@ -553,13 +489,11 @@ class TestDistBase(unittest.TestCase): tr0_pipe = open("/tmp/tr0_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb") - my_print(type(self).__name__, "going to start trainer process 0") tr0_proc = subprocess.Popen( tr0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=tr0_pipe, env=env0) - my_print(type(self).__name__, "going to start trainer process 1") tr1_proc = subprocess.Popen( tr1_cmd.strip().split(" "), stdout=subprocess.PIPE, @@ -591,20 +525,16 @@ class TestDistBase(unittest.TestCase): ps1.terminate() # print server log - ''' - with open("/tmp/ps0_err.log", "rb") as fn: + with open("/tmp/ps0_err.log", "r") as fn: sys.stderr.write("ps0 stderr: %s\n" % fn.read()) - with open("/tmp/ps1_err.log", "rb") as fn: + with open("/tmp/ps1_err.log", "r") as fn: sys.stderr.write("ps1 stderr: %s\n" % fn.read()) - ''' # print log - ''' - with open("/tmp/tr0_err.log", "rb") as fn: + with open("/tmp/tr0_err.log", "r") as fn: sys.stderr.write('trainer 0 stderr: %s\n' % fn.read()) - with open("/tmp/tr1_err.log", "rb") as fn: + with open("/tmp/tr1_err.log", "r") as fn: sys.stderr.write('trainer 1 stderr: %s\n' % fn.read()) - ''' return pickle.loads(tr0_out), pickle.loads(tr1_out) @@ -656,19 +586,10 @@ class TestDistBase(unittest.TestCase): if self._use_dgc: tr0_cmd += " --use_dgc" tr1_cmd += " --use_dgc" - - if self._nccl_comm_num > 1: - tr0_cmd += " --nccl_comm_num {}".format(self._nccl_comm_num) - tr1_cmd += " --nccl_comm_num {}".format(self._nccl_comm_num) - if self._mp_mode: env0 = {"FLAGS_selected_gpus": "0"} env1 = {"FLAGS_selected_gpus": "1"} - if self._enable_backward_deps: - tr0_cmd += " --enable_backward_deps 1" - tr1_cmd += " --enable_backward_deps 1" - env0.update(envs) env1.update(envs) @@ -677,13 +598,11 @@ class TestDistBase(unittest.TestCase): tr0_pipe = open("/tmp/tr0_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb") - my_print(type(self).__name__, "going to start process 0 with nccl2") tr0_proc = subprocess.Popen( tr0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=tr0_pipe, env=env0) - my_print(type(self).__name__, "going to start process 1 with nccl2") tr1_proc = subprocess.Popen( tr1_cmd.strip().split(" "), stdout=subprocess.PIPE, @@ -714,7 +633,7 @@ class TestDistBase(unittest.TestCase): "PYTHONPATH": os.getenv("PYTHONPATH", ""), "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_fraction_of_gpu_memory_to_use": "0.15", - "FLAGS_rpc_deadline": "30000", # 5sec to fail fast + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "FLAGS_cudnn_deterministic": "1", "http_proxy": "", "NCCL_P2P_DISABLE": "1" @@ -744,6 +663,9 @@ class TestDistBase(unittest.TestCase): local_loss = local_losses[step_id] tr0_loss = tr0_losses[step_id] tr1_loss = tr1_losses[step_id] - dist_loss = (np.array([tr0_loss]) + np.array([tr1_loss])) / 2 + dist_loss = 
(np.array([tr0_loss]) + np.array([tr1_loss])) + if not self._dygraph: + # Parallel DyGraph already scaled the loss in training + dist_loss = dist_loss / 2 print("=======", local_loss, ":", dist_loss[0], "=======") self.assertAlmostEqual(local_loss, dist_loss[0], delta=delta) diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 89bbc69fa88..b9d2f6db394 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -26,6 +26,42 @@ class TestDistMnist2x2(TestDistBase): self.check_with_place("dist_mnist.py", delta=1e-5) +class TestDistMnistNCCL2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_mnist.py", delta=1e-5) + + +class TestDistMnistNCCL2DGC(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + self._use_dgc = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_mnist.py", delta=1e-5) + + +class TestDistMnist2x2Lars(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + + def test_se_resnext(self): + self.check_with_place("dist_mnist_lars.py", delta=1e-5) + + class TestDistMnist2x2WithMemopt(TestDistBase): def _setup_config(self): self._sync_mode = True diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py index f473c435e59..8b8fdcc887b 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py @@ -51,6 +51,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer): self._conv2d = Conv2D( self.full_name(), + num_channels=num_channels, num_filters=num_filters, filter_size=filter_size, stride=conv_stride, diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py index 847616034c6..690875662e6 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_op.py @@ -34,24 +34,6 @@ class TestExpandOpRank1(OpTest): self.check_grad(['X'], 'Out') -class TestExpandOpRank1_tensor_attr(OpTest): - def setUp(self): - self.op_type = "expand" - self.inputs = { - 'X': np.random.random(12).astype("float32"), - 'expand_times_tensor': [('x1', np.ones((1)).astype('int32') * 2)] - } - self.attrs = {} - output = np.tile(self.inputs['X'], 2) - self.outputs = {'Out': output} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out', no_grad_set=set('x1')) - - class TestExpandOpRank2_Corner(OpTest): def setUp(self): self.op_type = "expand" @@ -67,25 +49,6 @@ class TestExpandOpRank2_Corner(OpTest): self.check_grad(['X'], 'Out') -class TestExpandOpRank2_Corner_tensor_attr(OpTest): - def setUp(self): - self.op_type = "expand" - self.inputs = { - 'X': np.random.random((12, 14)).astype("float32"), - 'expand_times_tensor': [('x1', np.ones((1)).astype('int32')), - ('x2', np.ones((1)).astype('int32'))] - } - self.attrs = {} - output = np.tile(self.inputs['X'], (1, 1)) 
-        self.outputs = {'Out': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
 class TestExpandOpRank2(OpTest):
     def setUp(self):
         self.op_type = "expand"
@@ -101,25 +64,6 @@ class TestExpandOpRank2(OpTest):
         self.check_grad(['X'], 'Out')


-class TestExpandOpRank2_attr_tensor(OpTest):
-    def setUp(self):
-        self.op_type = "expand"
-        self.inputs = {
-            'X': np.random.random((12, 14)).astype("float32"),
-            'expand_times_tensor': [('x1', np.ones((1)).astype('int32') * 2),
-                                    ('x2', np.ones((1)).astype('int32') * 3)]
-        }
-        self.attrs = {}
-        output = np.tile(self.inputs['X'], (2, 3))
-        self.outputs = {'Out': output}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
-
 class TestExpandOpRank3_Corner(OpTest):
     def setUp(self):
         self.op_type = "expand"
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 8fe814dc50d..8d82438c15c 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -90,6 +90,46 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest):
         self.check_output()


+class TestFakeQuantizeMovingOp(OpTest):
+    def setUp(self):
+        self.op_type = "fake_quantize_moving_average_abs_max"
+        self.attrs = {
+            'bit_length': int(5),
+            'moving_rate': float(0.9),
+            'is_test': False
+        }
+        accum = np.zeros(1).astype("float32")
+        accum[0] = 1
+        state = np.zeros(1).astype("float32")
+        state[0] = 1
+        scale = np.zeros(1).astype("float32")
+        scale[0] = 0.001
+        self.inputs = {
+            'X': np.random.random((8, 16, 7, 7)).astype("float32"),
+            'InScale': scale,
+            'InAccum': accum,
+            'InState': state,
+        }
+
+        out_accum = np.zeros(1).astype("float32")
+        out_state = np.zeros(1).astype("float32")
+        out_scale = np.zeros(1).astype("float32")
+        out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max(
+            np.abs(self.inputs['X'])).astype("float32")
+        out_state[0] = self.attrs['moving_rate'] * state[0] + 1
+        out_scale = out_accum / out_state
+        self.outputs = {
+            'Out': np.round(self.inputs['X'] / out_scale * (
+                (1 << (self.attrs['bit_length'] - 1)) - 1)),
+            'OutAccum': out_accum,
+            'OutState': out_state,
+            'OutScale': out_scale,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class TestMovingAverageAbsMaxScaleOp(OpTest):
     def setUp(self):
         self.op_type = "moving_average_abs_max_scale"
@@ -153,62 +193,5 @@ class TestFakeQuantizeRangeAbsMaxOp2(OpTest):
         self.check_output(no_check_set=set(['OutScale', 'OutScales']))


-class TestMovingOpBase(OpTest):
-    def setUp(self):
-        self.init_type()
-        self.attrs = {
-            'bit_length': int(5),
-            'moving_rate': float(0.9),
-            'is_test': False
-        }
-        accum = np.zeros(1).astype("float32")
-        accum[0] = 1
-        state = np.zeros(1).astype("float32")
-        state[0] = 1
-        scale = np.zeros(1).astype("float32")
-        scale[0] = 0.001
-        self.inputs = {
-            'X': np.random.random((8, 16, 7, 7)).astype("float32"),
-            'InScale': scale,
-            'InAccum': accum,
-            'InState': state,
-        }
-
-        out_accum = np.zeros(1).astype("float32")
-        out_state = np.zeros(1).astype("float32")
-        out_scale = np.zeros(1).astype("float32")
-        out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max(
-            np.abs(self.inputs['X'])).astype("float32")
-        out_state[0] = self.attrs['moving_rate'] * state[0] + 1
-        out_scale = out_accum / out_state
-        out_data = self.calc_output(out_scale)
-        self.outputs = {
-            'Out': out_data,
-            'OutAccum': out_accum,
-            'OutState': out_state,
-            'OutScale': out_scale,
-        }
-
-    def init_type(self):
-        self.op_type = "fake_quantize_moving_average_abs_max"
-
-    def calc_output(self, out_scale):
-        return np.round(self.inputs['X'] / out_scale * (
-            (1 << (self.attrs['bit_length'] - 1)) - 1))
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestFakeQuantDequantMovingOp(TestMovingOpBase):
-    def init_type(self):
-        self.op_type = "fake_quantize_dequantize_moving_average_abs_max"
-
-    def calc_output(self, out_scale):
-        range_v = (1 << (self.attrs['bit_length'] - 1)) - 1
-        return np.round(self.inputs['X'] / out_scale *
-                        range_v) * out_scale / range_v
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py
index 119f64ce734..bd5785aa55a 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
@@ -23,11 +23,8 @@ class TestGatherOp(OpTest):
     def setUp(self):
         self.op_type = "gather"
         self.config()
-        xnp = np.random.random(self.x_shape).astype(self.x_type)
-        self.inputs = {
-            'X': xnp,
-            'Index': np.array(self.index).astype(self.index_type)
-        }
+        xnp = np.random.random(self.x_shape).astype("float32")
+        self.inputs = {'X': xnp, 'Index': np.array(self.index).astype("int32")}
         self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}

     def test_check_output(self):
@@ -37,73 +34,14 @@ class TestGatherOp(OpTest):
         self.check_grad(['X'], 'Out')

     def config(self):
-        """
-        For multi-dimension input
-        """
         self.x_shape = (10, 20)
-        self.x_type = "float32"
         self.index = [1, 3, 5]
-        self.index_type = "int32"


 class TestCase1(TestGatherOp):
     def config(self):
-        """
-        For one dimension input
-        """
         self.x_shape = (10)
-        self.x_type = "float32"
         self.index = [1, 3, 5]
-        self.index_type = "int32"
-
-
-class TestCase2(TestGatherOp):
-    def config(self):
-        """
-        For int64_t index type
-        """
-        self.x_shape = (10)
-        self.x_type = "float32"
-        self.index = [1, 3, 5]
-        self.index_type = "int64"
-
-
-class TestCase3(TestGatherOp):
-    def config(self):
-        """
-        For other input type
-        """
-        self.x_shape = (10, 20)
-        self.x_type = "double"
-        self.index = [1, 3, 5]
-        self.index_type = "int64"
-
-
-class TestCase4(TestGatherOp):
-    def config(self):
-        self.x_shape = (10, 20)
-        self.attrs = {'overwrite': False}
-        self.x_type = "double"
-        self.index = [1, 1]
-        self.index_type = "int32"
-
-
-class TestCase5(TestGatherOp):
-    def config(self):
-        self.x_shape = (10, 20)
-        self.attrs = {'overwrite': False}
-        self.x_type = "float"
-        self.index = [1, 1, 3]
-        self.index_type = "int32"
-
-
-class TestCase6(TestGatherOp):
-    def config(self):
-        self.x_shape = (10, 20)
-        self.attrs = {'overwrite': True}
-        self.x_type = "float"
-        self.index = [1, 3]
-        self.index_type = "int32"


 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
index 406c255970a..5f6328707fd 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py
@@ -22,10 +22,10 @@ import paddle.fluid as fluid
 from op_test import OpTest


-def generate_proposal_labels_in_python(
-        rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, batch_size_per_im,
-        fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights,
-        class_nums, is_cls_agnostic, is_cascade_rcnn):
+def generate_proposal_labels_in_python(rpn_rois, gt_classes, is_crowd, gt_boxes,
+                                       im_info, batch_size_per_im, fg_fraction,
+                                       fg_thresh, bg_thresh_hi, bg_thresh_lo,
+                                       bbox_reg_weights, class_nums):
     rois = []
     labels_int32 = []
     bbox_targets = []
@@ -36,12 +36,13 @@ def generate_proposal_labels_in_python(
         im_info), 'batch size of rpn_rois and ground_truth is not matched'

     for im_i in range(len(im_info)):
-        frcn_blobs = _sample_rois(rpn_rois[im_i], gt_classes[im_i],
-                                  is_crowd[im_i], gt_boxes[im_i], im_info[im_i],
-                                  batch_size_per_im, fg_fraction, fg_thresh,
-                                  bg_thresh_hi, bg_thresh_lo, bbox_reg_weights,
-                                  class_nums, is_cls_agnostic, is_cascade_rcnn)
+        frcn_blobs = _sample_rois(
+            rpn_rois[im_i], gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i],
+            im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh,
+            bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums)
+
         lod.append(frcn_blobs['rois'].shape[0])
+
         rois.append(frcn_blobs['rois'])
         labels_int32.append(frcn_blobs['labels_int32'])
         bbox_targets.append(frcn_blobs['bbox_targets'])
@@ -53,8 +54,7 @@


 def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
                  batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
-                 bg_thresh_lo, bbox_reg_weights, class_nums, is_cls_agnostic,
-                 is_cascade_rcnn):
+                 bg_thresh_lo, bbox_reg_weights, class_nums):
     rois_per_image = int(batch_size_per_im)
     fg_rois_per_im = int(np.round(fg_fraction * rois_per_image))
@@ -62,8 +62,7 @@ def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
     im_scale = im_info[2]
     inv_im_scale = 1. / im_scale
     rpn_rois = rpn_rois * inv_im_scale
-    if is_cascade_rcnn:
-        rpn_rois = rpn_rois[gt_boxes.shape[0]:, :]
+
     boxes = np.vstack([gt_boxes, rpn_rois])
     gt_overlaps = np.zeros((boxes.shape[0], class_nums))
     box_to_gt_ind_map = np.zeros((boxes.shape[0]), dtype=np.int32)
@@ -88,37 +87,26 @@ def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
     max_overlaps = gt_overlaps.max(axis=1)
     max_classes = gt_overlaps.argmax(axis=1)

-    # Cascade RCNN Decode Filter
-    if is_cascade_rcnn:
-        ws = boxes[:, 2] - boxes[:, 0] + 1
-        hs = boxes[:, 3] - boxes[:, 1] + 1
-        keep = np.where((ws > 0) & (hs > 0))[0]
-        boxes = boxes[keep]
-        fg_inds = np.where(max_overlaps >= fg_thresh)[0]
-        bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
-                                                            bg_thresh_lo))[0]
-        fg_rois_per_this_image = fg_inds.shape[0]
-        bg_rois_per_this_image = bg_inds.shape[0]
-    else:
-        # Foreground
-        fg_inds = np.where(max_overlaps >= fg_thresh)[0]
-        fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0])
-        # Sample foreground if there are too many
-        if fg_inds.shape[0] > fg_rois_per_this_image:
-            fg_inds = np.random.choice(
-                fg_inds, size=fg_rois_per_this_image, replace=False)
-        fg_inds = fg_inds[:fg_rois_per_this_image]
-        # Background
-        bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
-                                                            bg_thresh_lo))[0]
-        bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
-        bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
-                                            bg_inds.shape[0])
-        # Sample background if there are too many
-        if bg_inds.shape[0] > bg_rois_per_this_image:
-            bg_inds = np.random.choice(
-                bg_inds, size=bg_rois_per_this_image, replace=False)
-        bg_inds = bg_inds[:bg_rois_per_this_image]
+    # Foreground
+    fg_inds = np.where(max_overlaps >= fg_thresh)[0]
+    fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0])
+    # Sample foreground if there are too many
+    # if fg_inds.shape[0] > fg_rois_per_this_image:
+    #     fg_inds = np.random.choice(
+    #             fg_inds, size=fg_rois_per_this_image, replace=False)
+    fg_inds = fg_inds[:fg_rois_per_this_image]
+
+    # Background
+    bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
+                                                        bg_thresh_lo))[0]
+    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
+    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
+                                        bg_inds.shape[0])
+    # Sample background if there are too many
+    # if bg_inds.shape[0] > bg_rois_per_this_image:
+    #     bg_inds = np.random.choice(
+    #             bg_inds, size=bg_rois_per_this_image, replace=False)
+    bg_inds = bg_inds[:bg_rois_per_this_image]

     keep_inds = np.append(fg_inds, bg_inds)
     sampled_labels = max_classes[keep_inds]
@@ -126,12 +114,14 @@ def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
     sampled_boxes = boxes[keep_inds]
     sampled_gts = gt_boxes[box_to_gt_ind_map[keep_inds]]
     sampled_gts[fg_rois_per_this_image:, :] = gt_boxes[0]
+
     bbox_label_targets = _compute_targets(sampled_boxes, sampled_gts,
                                           sampled_labels, bbox_reg_weights)
-    bbox_targets, bbox_inside_weights = _expand_bbox_targets(
-        bbox_label_targets, class_nums, is_cls_agnostic)
+    bbox_targets, bbox_inside_weights = _expand_bbox_targets(bbox_label_targets,
+                                                             class_nums)
     bbox_outside_weights = np.array(
         bbox_inside_weights > 0, dtype=bbox_inside_weights.dtype)
+
     # Scale rois
     sampled_rois = sampled_boxes * im_scale
@@ -202,22 +192,19 @@ def _box_to_delta(ex_boxes, gt_boxes, weights):
     return targets


-def _expand_bbox_targets(bbox_targets_input, class_nums, is_cls_agnostic):
+def _expand_bbox_targets(bbox_targets_input, class_nums):
     class_labels = bbox_targets_input[:, 0]
     fg_inds = np.where(class_labels > 0)[0]
-    #if is_cls_agnostic:
-    #    class_labels = [1 if ll > 0 else 0 for ll in class_labels]
-    #    class_labels = np.array(class_labels, dtype=np.int32)
-    #    class_nums = 2
-    bbox_targets = np.zeros((class_labels.shape[0], 4 * class_nums
-                             if not is_cls_agnostic else 4 * 2))
+
+    bbox_targets = np.zeros((class_labels.shape[0], 4 * class_nums))
     bbox_inside_weights = np.zeros(bbox_targets.shape)
     for ind in fg_inds:
-        class_label = int(class_labels[ind]) if not is_cls_agnostic else 1
+        class_label = int(class_labels[ind])
         start_ind = class_label * 4
         end_ind = class_label * 4 + 4
         bbox_targets[ind, start_ind:end_ind] = bbox_targets_input[ind, 1:]
         bbox_inside_weights[ind, start_ind:end_ind] = (1.0, 1.0, 1.0, 1.0)
+
     return bbox_targets, bbox_inside_weights
@@ -241,9 +228,7 @@ class TestGenerateProposalLabelsOp(OpTest):
             'bg_thresh_lo': self.bg_thresh_lo,
             'bbox_reg_weights': self.bbox_reg_weights,
             'class_nums': self.class_nums,
-            'use_random': False,
-            'is_cls_agnostic': self.is_cls_agnostic,
-            'is_cascade_rcnn': self.is_cascade_rcnn
+            'use_random': False
         }
         self.outputs = {
             'Rois': (self.rois, [self.lod]),
@@ -267,15 +252,12 @@ class TestGenerateProposalLabelsOp(OpTest):
         self.bg_thresh_hi = 0.5
         self.bg_thresh_lo = 0.0
         self.bbox_reg_weights = [0.1, 0.1, 0.2, 0.2]
-        #self.class_nums = 81
-        self.is_cls_agnostic = False  #True
-        self.is_cascade_rcnn = True
-        self.class_nums = 2 if self.is_cls_agnostic else 81
+        self.class_nums = 81

     def init_test_input(self):
         np.random.seed(0)
         gt_nums = 6  # Keep same with batch_size_per_im for unittest
-        proposal_nums = 2000 if not self.is_cascade_rcnn else 512  #self.batch_size_per_im - gt_nums
+        proposal_nums = 2000  #self.batch_size_per_im - gt_nums
         images_shape = [[64, 64]]
         self.im_info = np.ones((len(images_shape), 3)).astype(np.float32)
         for i in range(len(images_shape)):
@@ -298,8 +280,7 @@ class TestGenerateProposalLabelsOp(OpTest):
             self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes,
             self.im_info, self.batch_size_per_im, self.fg_fraction,
             self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo,
-            self.bbox_reg_weights, self.class_nums,
-            self.is_cls_agnostic, self.is_cascade_rcnn
+            self.bbox_reg_weights, self.class_nums
         )
         self.rois = np.vstack(self.rois)
         self.labels_int32 = np.hstack(self.labels_int32)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
index afa21a375a4..8404a57eb85 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -34,6 +34,20 @@ class MyLayer(fluid.Layer):
         return [x]


+class MyPyLayer(fluid.PyLayer):
+    def __init__(self):
+        super(MyPyLayer, self).__init__()
+
+    @staticmethod
+    def forward(inputs):
+        return np.tanh(inputs[0])
+
+    @staticmethod
+    def backward(inputs):
+        inp, out, dout = inputs
+        return np.array(dout) * (1 - np.square(np.array(out)))
+
+
 class MLP(fluid.Layer):
     def __init__(self, name_scope):
         super(MLP, self).__init__(name_scope)
@@ -67,7 +81,7 @@ class SimpleRNNCell(fluid.Layer):
         self._dtype = core.VarDesc.VarType.FP32
         self.param_attr = param_attr

-    def _build_once(self, inputs, pre_hidden):
+    def build_once(self, inputs, pre_hidden):
         i2h_param_shape = [self.step_input_size, self.hidden_size]
         h2h_param_shape = [self.hidden_size, self.hidden_size]
         h2o_param_shape = [self.output_size, self.hidden_size]
@@ -187,21 +201,8 @@ class TestImperative(unittest.TestCase):
             ret = fluid.layers.sums(inputs)
             loss = fluid.layers.reduce_sum(ret)
             loss.backward()
-            with fluid.dygraph.guard():
-                inputs2 = []
-                for _ in range(10):
-                    inputs2.append(fluid.dygraph.base.to_variable(x))
-                ret2 = fluid.layers.sums(inputs2)
-                loss2 = fluid.layers.reduce_sum(ret2)
-                backward_strategy = fluid.dygraph.BackwardStrategy()
-                backward_strategy.sort_sum_gradient = True
-                loss2.backward(backward_strategy)
-
             self.assertTrue(np.allclose(ret.numpy(), x * 10))
             self.assertTrue(np.allclose(inputs[0].gradient(), x))
-            self.assertTrue(np.allclose(ret2.numpy(), x * 10))
-            a = inputs2[0].gradient()
-            self.assertTrue(np.allclose(inputs2[0].gradient(), x))

     def test_layer(self):
         with fluid.dygraph.guard():
@@ -210,6 +211,75 @@ class TestImperative(unittest.TestCase):
             l = fluid.Layer("l")
             self.assertRaises(NotImplementedError, l.forward, [])

+    def test_pylayer_func_id(self):
+
+        with fluid.dygraph.guard():
+
+            class PyLayer1(fluid.PyLayer):
+                def __init__(self):
+                    super(PyLayer1, self).__init__()
+
+                @staticmethod
+                def forward(input):
+                    return input
+
+                @staticmethod
+                def backward(input):
+                    return input
+
+            class PyLayer2(fluid.PyLayer):
+                def __init__(self):
+                    super(PyLayer2, self).__init__()
+
+                @staticmethod
+                def forward(input):
+                    return input
+
+                @staticmethod
+                def backward(input):
+                    return input
+
+            py_layer_1 = PyLayer1()
+            py_layer_2 = PyLayer2()
+            py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2])))
+            py_layer_2(fluid.dygraph.base.to_variable(np.ones([2, 2])))
+            id = py_layer_1.forward_id
+            self.assertGreater(id, 0)
+            self.assertEqual(py_layer_1.backward_id, id + 1)
+            self.assertEqual(py_layer_2.forward_id, id + 2)
+            self.assertEqual(py_layer_2.backward_id, id + 3)
+            py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2])))
+            self.assertEqual(py_layer_1.forward_id, id)
+
+    def test_pylayer(self):
+        np_inp = np.ones([2, 2], np.float32)
+        with fluid.dygraph.guard():
+            my_py_layer = MyPyLayer()
+            var_inp = fluid.dygraph.base.to_variable(np_inp)
+            outs = my_py_layer(var_inp)
+            dy_out = np.sum(outs[0].numpy())
+            outs[0].backward()
+            dy_grad = var_inp.gradient()
+
+        with new_program_scope():
+            inp = fluid.layers.data(
+                name="inp", shape=[2, 2], append_batch_size=False)
+            # TODO(panyx0718): Paddle doesn't diff against data `inp`.
+            x1 = inp * 1
+            # TODO(panyx0718): If reduce_sum is skipped, the result is wrong.
+            x = fluid.layers.reduce_sum(fluid.layers.tanh(x1))
+            param_grads = fluid.backward.append_backward(
+                x, parameter_list=[x1.name])[0]
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+
+            static_out, static_grad = exe.run(
+                feed={inp.name: np_inp},
+                fetch_list=[x.name, param_grads[1].name])
+
+        self.assertTrue(np.allclose(dy_out, static_out))
+        self.assertTrue(np.allclose(dy_grad, static_grad))
+
     def test_layer_in_out(self):
         np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32)
         with fluid.dygraph.guard():
@@ -221,17 +291,6 @@ class TestImperative(unittest.TestCase):
             x.backward()
             dy_grad = l._x_for_debug.gradient()

-        with fluid.dygraph.guard():
-            var_inp2 = fluid.dygraph.base.to_variable(np_inp)
-            l2 = MyLayer("my_layer")
-            x2 = l2(var_inp2)[0]
-            self.assertIsNotNone(x2)
-            dy_out2 = x2.numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            x2.backward(backward_strategy)
-            dy_grad2 = l2._x_for_debug.gradient()
-
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[3], append_batch_size=False)
@@ -248,8 +307,6 @@ class TestImperative(unittest.TestCase):

         self.assertTrue(np.allclose(dy_out, static_out))
         self.assertTrue(np.allclose(dy_grad, static_grad))
-        self.assertTrue(np.allclose(dy_out2, static_out))
-        self.assertTrue(np.allclose(dy_grad2, static_grad))

     def test_mlp(self):
         np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
@@ -261,16 +318,6 @@ class TestImperative(unittest.TestCase):
             out.backward()
             dy_grad = mlp._fc1._w.gradient()

-        with fluid.dygraph.guard():
-            var_inp2 = fluid.dygraph.base.to_variable(np_inp)
-            mlp2 = MLP("mlp")
-            out2 = mlp2(var_inp2)
-            dy_out2 = out2.numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            out2.backward(backward_strategy)
-            dy_grad2 = mlp2._fc1._w.gradient()
-
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[2, 2], append_batch_size=False)
@@ -288,8 +335,6 @@ class TestImperative(unittest.TestCase):

         self.assertTrue(np.allclose(dy_out, static_out))
         self.assertTrue(np.allclose(dy_grad, static_grad))
-        self.assertTrue(np.allclose(dy_out2, static_out))
-        self.assertTrue(np.allclose(dy_grad2, static_grad))

         params = mlp.parameters(True)
         self.assertEqual("mlp/MLP_0/FC_0.w_0", params[0].name)
@@ -368,19 +413,6 @@ class TestImperative(unittest.TestCase):
             dy_grad_h2h = simple_rnn._cell._h2h_w.gradient()
             dy_grad_i2h = simple_rnn._cell._i2h_w.gradient()

-        with fluid.dygraph.guard():
-            var_inp2 = fluid.dygraph.base.to_variable(np_inp)
-            var_inp2 = fluid.layers.reshape(var_inp2, shape=[1, 4, 3])
-            simple_rnn2 = SimpleRNN("simple_rnn")
-            outs2, pre_hiddens2 = simple_rnn2.forward(var_inp2)
-            dy_out2 = outs2[3].numpy()
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            outs2[3].backward(backward_strategy)
-            dy_grad_h2o2 = simple_rnn2._cell._h2o_w.gradient()
-            dy_grad_h2h2 = simple_rnn2._cell._h2h_w.gradient()
-            dy_grad_i2h2 = simple_rnn2._cell._i2h_w.gradient()
-
         with new_program_scope():
             inp = fluid.layers.data(
                 name="inp", shape=[1, 4, 3], append_batch_size=False)
@@ -395,15 +427,10 @@ class TestImperative(unittest.TestCase):
                     outs[3].name, param_grads[0][1].name,
                     param_grads[1][1].name, param_grads[2][1].name
                 ])
-
         self.assertTrue(np.allclose(dy_out, static_out))
         self.assertTrue(np.allclose(dy_grad_h2o, static_grad_h2o))
         self.assertTrue(np.allclose(dy_grad_h2h, static_grad_h2h))
         self.assertTrue(np.allclose(dy_grad_i2h, static_grad_i2h))
-        self.assertTrue(np.allclose(dy_out2, static_out))
-        self.assertTrue(np.allclose(dy_grad_h2o2, static_grad_h2o))
-        self.assertTrue(np.allclose(dy_grad_h2h2, static_grad_h2h))
-        self.assertTrue(np.allclose(dy_grad_i2h2, static_grad_i2h))


 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
index 25d490f6797..889e7c0fa6c 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
@@ -18,13 +18,14 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid import Conv2D, Pool2D, FC, core
+from paddle.fluid import Conv2D, Pool2D, FC
 from paddle.fluid.dygraph.base import to_variable


 class SimpleImgConvPool(fluid.Layer):
     def __init__(self,
                  name_scope,
+                 num_channels,
                  num_filters,
                  filter_size,
                  pool_size,
@@ -44,6 +45,7 @@ class SimpleImgConvPool(fluid.Layer):

         self._conv2d = Conv2D(
             self.full_name(),
+            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=conv_stride,
@@ -74,10 +76,10 @@ class MNIST(fluid.Layer):
         super(MNIST, self).__init__(name_scope)

         self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            self.full_name(), 20, 5, 2, 2, act="relu")
+            self.full_name(), 1, 20, 5, 2, 2, act="relu")

         self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(), 50, 5, 2, 2, act="relu")
+            self.full_name(), 20, 50, 5, 2, 2, act="relu")

         pool_2_shape = 50 * 4 * 4
         SIZE = 10
@@ -97,19 +99,9 @@ class MNIST(fluid.Layer):


 class TestDygraphCheckpoint(unittest.TestCase):
-    def reader_decorator(self, reader):
-        def _reader_imple():
-            for item in reader():
-                image = np.array(item[0]).reshape(1, 28, 28)
-                label = np.array(item[1]).astype('int64').reshape(1)
-                yield image, label
-
-        return _reader_imple
-
     def test_save_load_persistables(self):
         seed = 90
         epoch_num = 1
-        batch_size = 128

         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
@@ -117,21 +109,22 @@ class TestDygraphCheckpoint(unittest.TestCase):

             mnist = MNIST("mnist")
             sgd = SGDOptimizer(learning_rate=1e-3)
-
-            batch_py_reader = fluid.io.PyReader(capacity=1)
-            batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(paddle.dataset.mnist.train()),
-                    batch_size=batch_size,
-                    drop_last=True),
-                places=fluid.CPUPlace())
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

             dy_param_init_value = {}

+            step = 0
             for epoch in range(epoch_num):
-                for batch_id, data in enumerate(batch_py_reader()):
-                    img = data[0]
-                    label = data[1]
+                for batch_id, data in enumerate(train_reader()):
+                    dy_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(128, 1)
+
+                    img = to_variable(dy_x_data)
+                    label = to_variable(y_data)
                     label.stop_gradient = True

                     cost = mnist(img)
@@ -149,7 +142,7 @@ class TestDygraphCheckpoint(unittest.TestCase):
                     for param in mnist.parameters():
                         dy_param_init_value[param.name] = param.numpy()

-                    restore, _ = fluid.dygraph.load_persistables("save_dir")
+                    restore = fluid.dygraph.load_persistables("save_dir")
                     mnist.load_dict(restore)

                     self.assertEqual(len(dy_param_init_value), len(restore))
@@ -160,7 +153,9 @@ class TestDygraphCheckpoint(unittest.TestCase):
                         self.assertTrue(np.isfinite(value.numpy().all()))
                         self.assertFalse(np.isnan(value.numpy().any()))

-                    if batch_id > 10:
+                    step += 1
+
+                    if step > 10:
                         break
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
index daf8cc00d43..ca2cffa9c75 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -258,35 +258,7 @@ class TestDygraphDeepCF(unittest.TestCase):
                     dy_loss = loss.numpy()
                     sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss))

-            with fluid.dygraph.guard():
-                fluid.default_startup_program().random_seed = seed
-                fluid.default_main_program().random_seed = seed
-
-                deepcf2 = DeepCF('deepcf', num_users, num_items, matrix)
-                adam2 = fluid.optimizer.AdamOptimizer(0.01)
-                backward_strategy = fluid.dygraph.BackwardStrategy()
-                backward_strategy.sort_sum_gradient = True
-                for e in range(NUM_EPOCHES):
-                    sys.stderr.write('epoch %d\n' % e)
-                    for slice in range(0, BATCH_SIZE * NUM_BATCHES, BATCH_SIZE):
-                        if slice + BATCH_SIZE >= users_np.shape[0]:
-                            break
-                        prediction2 = deepcf2(
-                            to_variable(users_np[slice:slice + BATCH_SIZE]),
-                            to_variable(items_np[slice:slice + BATCH_SIZE]))
-                        loss2 = fluid.layers.reduce_sum(
-                            fluid.layers.log_loss(prediction2,
-                                                  to_variable(labels_np[
-                                                      slice:slice + BATCH_SIZE])))
-                        loss2.backward(backward_strategy)
-                        adam2.minimize(loss2)
-                        deepcf2.clear_gradients()
-                        dy_loss2 = loss2.numpy()
-                        sys.stderr.write('dynamic loss: %s %s\n' %
-                                         (slice, dy_loss2))
-
             self.assertEqual(static_loss, dy_loss)
-            self.assertEqual(static_loss, dy_loss2)


 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index 7e8cebab44e..5d773ec1c9d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -170,59 +170,11 @@ class TestDygraphGAN(unittest.TestCase):
             dy_g_loss = g_loss.numpy()
             dy_d_loss = d_loss.numpy()

-        dy_params2 = dict()
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            backward_strategy = fluid.dygraph.BackwardStrategy()
-            backward_strategy.sort_sum_gradient = True
-            discriminator2 = Discriminator("d")
-            generator2 = Generator("g")
-            sgd2 = SGDOptimizer(learning_rate=1e-3)
-
-            d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32)))
-            d_loss_real2 = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_real2, label=to_variable(np.ones([2, 1], np.float32))))
-
-            d_fake2 = discriminator2(
-                generator2(to_variable(np.ones([2, 2], np.float32))))
-            d_loss_fake2 = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_fake2, label=to_variable(np.zeros([2, 1], np.float32))))
-
-            d_loss2 = d_loss_real2 + d_loss_fake2
-            d_loss2.backward(backward_strategy)
-            sgd2.minimize(d_loss2)
-            discriminator2.clear_gradients()
-            generator2.clear_gradients()
-
-            d_fake2 = discriminator2(
-                generator2(to_variable(np.ones([2, 2], np.float32))))
-            g_loss2 = fluid.layers.reduce_mean(
-                fluid.layers.sigmoid_cross_entropy_with_logits(
-                    x=d_fake2, label=to_variable(np.ones([2, 1], np.float32))))
-            g_loss2.backward(backward_strategy)
-            sgd2.minimize(g_loss2)
-            for p in discriminator2.parameters():
-                dy_params2[p.name] = p.numpy()
-            for p in generator.parameters():
-                dy_params2[p.name] = p.numpy()
-
-            dy_g_loss2 = g_loss2.numpy()
-            dy_d_loss2 = d_loss2.numpy()
-
         self.assertEqual(dy_g_loss, static_g_loss)
         self.assertEqual(dy_d_loss, static_d_loss)
         for k, v in six.iteritems(dy_params):
             self.assertTrue(np.allclose(v, static_params[k]))
-        self.assertEqual(dy_g_loss2, static_g_loss)
-        self.assertEqual(dy_d_loss2, static_d_loss)
-        for k, v in six.iteritems(dy_params2):
-            self.assertTrue(np.allclose(v, static_params[k]))
-

 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
index 8531eda8697..234fcd60404 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
@@ -101,11 +101,11 @@ class TestDygraphGNN(unittest.TestCase):
             ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
             exe.run(startup)
             static_loss = exe.run(feed={
-                'features': np.ones(
+                'features': np.zeros(
                     [1, 100, 50], dtype=np.float32),
-                'adj': np.ones(
+                'adj': np.zeros(
                     [1, 100, 100], dtype=np.float32),
-                'labels': np.ones(
+                'labels': np.zeros(
                     [100, 1], dtype=np.int64)
             },
                                   fetch_list=[loss])[0]
@@ -117,10 +117,10 @@ class TestDygraphGNN(unittest.TestCase):
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed

-            features = np.ones([1, 100, 50], dtype=np.float32)
+            features = np.zeros([1, 100, 50], dtype=np.float32)
             # Use selected rows when it's supported.
-            adj = np.ones([1, 100, 100], dtype=np.float32)
-            labels = np.ones([100, 1], dtype=np.int64)
+            adj = np.zeros([1, 100, 100], dtype=np.float32)
+            labels = np.zeros([100, 1], dtype=np.int64)

             model = GCN('test_gcn', 50)
             logits = model(to_variable(features), to_variable(adj))
@@ -130,39 +130,11 @@ class TestDygraphGNN(unittest.TestCase):
             loss = fluid.layers.softmax_with_cross_entropy(logits,
                                                            to_variable(labels))
             loss = fluid.layers.reduce_sum(loss)
-            loss.backward()
             adam = AdamOptimizer(learning_rate=1e-3)
-            adam.minimize(loss)
-            model.clear_gradients()
-
-        with fluid.dygraph.guard():
-            fluid.default_startup_program().random_seed = seed
-            fluid.default_main_program().random_seed = seed
-
-            features2 = np.ones([1, 100, 50], dtype=np.float32)
-            # Use selected rows when it's supported.
-            adj2 = np.ones([1, 100, 100], dtype=np.float32)
-            labels2 = np.ones([100, 1], dtype=np.int64)
-
-            model2 = GCN('test_gcn', 50)
-            logits2 = model2(to_variable(features2), to_variable(adj2))
-            logits2 = fluid.layers.reshape(logits2, logits2.shape[1:])
-            # In other example, it's nll with log_softmax. However, paddle's
-            # log_loss only supports binary classification now.
-            loss2 = fluid.layers.softmax_with_cross_entropy(
-                logits2, to_variable(labels2))
-            loss2 = fluid.layers.reduce_sum(loss2)
-            loss2.backward()
-            adam2 = AdamOptimizer(learning_rate=1e-3)
-            adam2.minimize(loss2)
-            model2.clear_gradients()
-
-            self.assertEqual(static_loss, loss.numpy())
-            self.assertTrue(np.allclose(static_weight, model.gc.weight.numpy()))
-            self.assertEqual(static_loss, loss2.numpy())
-            self.assertTrue(np.allclose(static_weight, model2.gc.weight.numpy()))
-            sys.stderr.write('%s %s\n' % (static_loss, loss.numpy()))
+            self.assertEqual(static_loss, loss.numpy())
+            self.assertTrue(np.allclose(static_weight, model.gc.weight.numpy()))
+            sys.stderr.write('%s %s\n' % (static_loss, loss.numpy()))


 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
index c3a12addfc8..908237b8873 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
@@ -31,6 +31,7 @@ from test_imperative_base import new_program_scope
 class SimpleImgConvPool(fluid.dygraph.Layer):
     def __init__(self,
                  name_scope,
+                 num_channels,
                  num_filters,
                  filter_size,
                  pool_size,
@@ -50,6 +51,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):

         self._conv2d = Conv2D(
             self.full_name(),
+            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=conv_stride,
@@ -80,10 +82,10 @@ class MNIST(fluid.dygraph.Layer):
         super(MNIST, self).__init__(name_scope)

         self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            self.full_name(), 20, 5, 2, 2, act="relu")
+            self.full_name(), 1, 20, 5, 2, 2, act="relu")

         self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(), 50, 5, 2, 2, act="relu")
+            self.full_name(), 20, 50, 5, 2, 2, act="relu")

         pool_2_shape = 50 * 4 * 4
         SIZE = 10
@@ -103,45 +105,30 @@ class MNIST(fluid.dygraph.Layer):


 class TestImperativeMnist(unittest.TestCase):
-    def reader_decorator(self, reader):
-        def _reader_imple():
-            for item in reader():
-                image = np.array(item[0]).reshape(1, 28, 28)
-                label = np.array(item[1]).astype('int64').reshape(1)
-                yield image, label
-
-        return _reader_imple
-
     def test_mnist_float32(self):
         seed = 90
         epoch_num = 1
-        batch_size = 128
-        batch_num = 50
-
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed

             mnist = MNIST("mnist")
             sgd = SGDOptimizer(learning_rate=1e-3)
-
-            batch_py_reader = fluid.io.PyReader(capacity=1)
-            batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(paddle.dataset.mnist.train()),
-                    batch_size=batch_size,
-                    drop_last=True),
-                places=fluid.CPUPlace())
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

             mnist.train()
             dy_param_init_value = {}

             for epoch in range(epoch_num):
-                for batch_id, data in enumerate(batch_py_reader()):
-                    if batch_id >= batch_num:
-                        break
-                    img = data[0]
-                    dy_x_data = img.numpy()
-                    label = data[1]
+                for batch_id, data in enumerate(train_reader()):
+                    dy_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(128, 1)
+
+                    img = to_variable(dy_x_data)
+                    label = to_variable(y_data)
                     label.stop_gradient = True

                     cost = mnist(img)
@@ -172,9 +159,7 @@ class TestImperativeMnist(unittest.TestCase):
             mnist = MNIST("mnist")
             sgd = SGDOptimizer(learning_rate=1e-3)
             train_reader = paddle.batch(
-                paddle.dataset.mnist.train(),
-                batch_size=batch_size,
-                drop_last=True)
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

             img = fluid.layers.data(
                 name='pixel', shape=[1, 28, 28], dtype='float32')
@@ -198,14 +183,11 @@ class TestImperativeMnist(unittest.TestCase):

             for epoch in range(epoch_num):
                 for batch_id, data in enumerate(train_reader()):
-                    if batch_id >= batch_num:
-                        break
                     static_x_data = np.array(
                         [x[0].reshape(1, 28, 28)
                          for x in data]).astype('float32')
                     y_data = np.array(
-                        [x[1] for x in data]).astype('int64').reshape(
-                            [batch_size, 1])
+                        [x[1] for x in data]).astype('int64').reshape([128, 1])

                     fetch_list = [avg_loss.name]
                     fetch_list.extend(static_param_name_list)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
index a7c39f7ff2a..b9f93119e83 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -48,41 +48,29 @@ class TestImperativeOptimizerBase(unittest.TestCase):
     def get_optimizer(self):
         raise NotImplementedError()

-    def reader_decorator(self, reader):
-        def _reader_imple():
-            for item in reader():
-                image = np.array(item[0]).reshape(1, 28, 28)
-                label = np.array(item[1]).astype('int64').reshape(1)
-                yield image, label
-
-        return _reader_imple
-
     def _check_mlp(self):
         seed = 90
-        batch_size = 128
-
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed

             mlp = MLP('mlp')
             optimizer = self.get_optimizer()
-
-            batch_py_reader = fluid.io.PyReader(capacity=1)
-            batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(paddle.dataset.mnist.train()),
-                    batch_size=batch_size,
-                    drop_last=True),
-                places=fluid.CPUPlace())
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)

             dy_param_init_value = {}
-            for batch_id, data in enumerate(batch_py_reader()):
+            for batch_id, data in enumerate(train_reader()):
                 if batch_id >= self.batch_num:
                     break

-                img = data[0]
-                label = data[1]
+                dy_x_data = np.array(
+                    [x[0].reshape(1, 28, 28) for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    128, 1)
+
+                img = to_variable(dy_x_data)
+                label = to_variable(y_data)
                 label._stop_gradient = True

                 cost = mlp(img)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index 5f6c5b1cb6a..088d36be232 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -44,7 +44,7 @@ class SimpleLSTMRNN(fluid.Layer):
         self.cell_array = []
         self.hidden_array = []

-    def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
+    def build_once(self, input_embedding, init_hidden=None, init_cell=None):
         self.weight_1_arr = []
         self.weight_2_arr = []
         self.bias_arr = []
@@ -176,6 +176,9 @@ class PtbModel(fluid.Layer):
             default_initializer=fluid.initializer.UniformInitializer(
                 low=-self.init_scale, high=self.init_scale))

+    def build_once(self, input, label, init_hidden, init_cell):
+        pass
+
     def forward(self, input, label, init_hidden, init_cell):
         init_h = fluid.layers.reshape(
             init_hidden, shape=[self.num_layers, -1, self.hidden_size])
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index 9eab5abc06c..d9ef08b3c49 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -71,6 +71,7 @@ def optimizer_setting(params):
 class ConvBNLayer(fluid.Layer):
     def __init__(self,
                  name_scope,
+                 num_channels,
                  num_filters,
                  filter_size,
                  stride=1,
@@ -80,6 +81,7 @@ class ConvBNLayer(fluid.Layer):

         self._conv = Conv2D(
             self.full_name(),
+            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=stride,
@@ -98,22 +100,30 @@ class ConvBNLayer(fluid.Layer):


 class BottleneckBlock(fluid.Layer):
-    def __init__(self, name_scope, num_filters, stride, shortcut=True):
+    def __init__(self,
+                 name_scope,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 shortcut=True):
         super(BottleneckBlock, self).__init__(name_scope)

         self.conv0 = ConvBNLayer(
             self.full_name(),
+            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=1,
             act='relu')
         self.conv1 = ConvBNLayer(
             self.full_name(),
+            num_channels=num_filters,
             num_filters=num_filters,
             filter_size=3,
             stride=stride,
             act='relu')
         self.conv2 = ConvBNLayer(
             self.full_name(),
+            num_channels=num_filters,
             num_filters=num_filters * 4,
             filter_size=1,
             act=None)
@@ -121,12 +131,15 @@ class BottleneckBlock(fluid.Layer):
         if not shortcut:
             self.short = ConvBNLayer(
                 self.full_name(),
+                num_channels=num_channels,
                 num_filters=num_filters * 4,
                 filter_size=1,
                 stride=stride)

         self.shortcut = shortcut

+        self._num_channels_out = num_filters * 4
+
     def forward(self, inputs):
         y = self.conv0(inputs)
         conv1 = self.conv1(y)
@@ -162,6 +175,7 @@ class ResNet(fluid.Layer):

         self.conv = ConvBNLayer(
             self.full_name(),
+            num_channels=3,
             num_filters=64,
             filter_size=7,
             stride=2,
@@ -174,6 +188,7 @@ class ResNet(fluid.Layer):
             pool_type='max')

         self.bottleneck_block_list = []
+        num_channels = 64
         for block in range(len(depth)):
             shortcut = False
             for i in range(depth[block]):
@@ -181,9 +196,11 @@ class ResNet(fluid.Layer):
                     'bb_%d_%d' % (block, i),
                     BottleneckBlock(
                         self.full_name(),
+                        num_channels=num_channels,
                         num_filters=num_filters[block],
                         stride=2 if i == 0 and block != 0 else 1,
                         shortcut=shortcut))
+                num_channels = bottleneck_block._num_channels_out
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
@@ -210,21 +227,11 @@ class ResNet(fluid.Layer):


 class TestDygraphResnet(unittest.TestCase):
-    def reader_decorator(self, reader):
-        def _reader_imple():
-            for item in reader():
-                doc = np.array(item[0]).reshape(3, 224, 224)
-                label = np.array(item[1]).astype('int64').reshape(1)
-                yield doc, label
-
-        return _reader_imple
-
     def test_resnet_float32(self):
         seed = 90

         batch_size = train_parameters["batch_size"]
-        batch_num = 10
-
+        batch_num = 20
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
@@ -234,26 +241,25 @@ class TestDygraphResnet(unittest.TestCase):
             np.random.seed(seed)
             import random
             random.seed = seed
-
-            batch_py_reader = fluid.io.PyReader(capacity=1)
-            batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(
-                        paddle.dataset.flowers.train(use_xmap=False)),
-                    batch_size=batch_size,
-                    drop_last=True),
-                places=fluid.CPUPlace())
+            train_reader = paddle.batch(
+                paddle.dataset.flowers.train(use_xmap=False),
+                batch_size=batch_size)

             dy_param_init_value = {}
             for param in resnet.parameters():
                 dy_param_init_value[param.name] = param.numpy()

             for batch_id, data in enumerate(train_reader()):
                 if batch_id >= batch_num:
                     break

-                img = data[0]
-                label = data[1]
+                dy_x_data = np.array(
+                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
+                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
+                    batch_size, 1)
+
+                img = to_variable(dy_x_data)
+                label = to_variable(y_data)
                 label.stop_gradient = True

                 out = resnet(img)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
index f6585d1b30d..3f3f92cde57 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
@@ -64,6 +64,7 @@ def optimizer_setting(params):
 class ConvBNLayer(fluid.dygraph.Layer):
     def __init__(self,
                  name_scope,
+                 num_channels,
                  num_filters,
                  filter_size,
                  stride=1,
@@ -73,6 +74,7 @@ class ConvBNLayer(fluid.dygraph.Layer):

         self._conv = Conv2D(
             self.full_name(),
+            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=stride,
@@ -129,15 +131,20 @@ class BottleneckBlock(fluid.dygraph.Layer):
         super(BottleneckBlock, self).__init__(name_scope)

         self.conv0 = ConvBNLayer(
-            self.full_name(), num_filters=num_filters, filter_size=1)
+            self.full_name(),
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=1)
         self.conv1 = ConvBNLayer(
             self.full_name(),
+            num_channels=num_filters,
             num_filters=num_filters,
             filter_size=3,
             stride=stride,
             groups=cardinality)
         self.conv2 = ConvBNLayer(
             self.full_name(),
+            num_channels=num_filters,
             num_filters=num_filters * 4,
             filter_size=1,
             act='relu')
@@ -150,6 +157,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
         if not shortcut:
             self.short = ConvBNLayer(
                 self.full_name(),
+                num_channels=num_channels,
                 num_filters=num_filters * 4,
                 filter_size=1,
                 stride=stride)
@@ -192,6 +200,7 @@ class SeResNeXt(fluid.dygraph.Layer):
             num_filters = [128, 256, 512, 1024]
             self.conv0 = ConvBNLayer(
                 self.full_name(),
+                num_channels=3,
                 num_filters=64,
                 filter_size=7,
                 stride=2,
@@ -209,6 +218,7 @@ class SeResNeXt(fluid.dygraph.Layer):
             num_filters = [128, 256, 512, 1024]
             self.conv0 = ConvBNLayer(
                 self.full_name(),
+                num_channels=3,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
@@ -226,18 +236,21 @@ class SeResNeXt(fluid.dygraph.Layer):
             num_filters = [128, 256, 512, 1024]
             self.conv0 = ConvBNLayer(
                 self.full_name(),
+                num_channels=3,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
                 act='relu')
             self.conv1 = ConvBNLayer(
                 self.full_name(),
+                num_channels=64,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
                 act='relu')
             self.conv2 = ConvBNLayer(
                 self.full_name(),
+                num_channels=64,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
@@ -298,20 +311,11 @@ class SeResNeXt(fluid.dygraph.Layer):


 class TestImperativeResneXt(unittest.TestCase):
-    def reader_decorator(self, reader):
-        def _reader_imple():
-            for item in reader():
-                doc = np.array(item[0]).reshape(3, 224, 224)
-                label = np.array(item[1]).astype('int64').reshape(1)
-                yield doc, label
-
-        return _reader_imple
-
     def test_se_resnext_float32(self):
         seed = 90

         batch_size = train_parameters["batch_size"]
-        batch_num = 1
+        batch_num = 2
         epoch_num = 1
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
@@ -322,28 +326,29 @@ class TestImperativeResneXt(unittest.TestCase):
             np.random.seed(seed)
             import random
             random.seed = seed
-
-            batch_py_reader = fluid.io.PyReader(capacity=1)
-            batch_py_reader.decorate_sample_list_generator(
-                paddle.batch(
-                    self.reader_decorator(
-                        paddle.dataset.flowers.train(use_xmap=False)),
-                    batch_size=batch_size,
-                    drop_last=True),
-                places=fluid.CPUPlace())
+            train_reader = paddle.batch(
+                paddle.dataset.flowers.train(use_xmap=False),
+                batch_size=batch_size,
+                drop_last=True)

             dy_param_init_value = {}
             for param in se_resnext.parameters():
                 dy_param_init_value[param.name] = param.numpy()
             for epoch_id in range(epoch_num):
                 for batch_id, data in enumerate(train_reader()):

                     if batch_id >= batch_num and batch_num != -1:
                         break

-                    img = data[0]
-                    label = data[1]
-                    label.stop_gradient = True
+                    dy_x_data = np.array(
+                        [x[0].reshape(3, 224, 224)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(
+                            batch_size, 1)
+
+                    img = to_variable(dy_x_data)
+                    label = to_variable(y_data)
                     label.stop_gradient = True

                     out = se_resnext(img)
diff --git a/python/paddle/fluid/tests/unittests/test_infer_shape.py b/python/paddle/fluid/tests/unittests/test_infer_shape.py
index 553ebaec7f1..9d5e064e6ad 100644
--- a/python/paddle/fluid/tests/unittests/test_infer_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py
@@ -104,7 +104,6 @@ class TestInferShape(unittest.TestCase):
         sum_op_desc = block.append_op()
         sum_op_desc.set_type("expand")
         sum_op_desc.set_input("X", ["x"])
-        sum_op_desc.set_input('expand_times_tensor', [])
         sum_op_desc.set_output("Out", ["out"])
         sum_op_desc._set_attr('expand_times', expand_times)

diff --git a/python/paddle/fluid/tests/unittests/test_install_check.py b/python/paddle/fluid/tests/unittests/test_install_check.py
index 5cb199d4967..5802e2ed0a3 100644
--- a/python/paddle/fluid/tests/unittests/test_install_check.py
+++ b/python/paddle/fluid/tests/unittests/test_install_check.py
@@ -20,7 +20,3 @@ import paddle.fluid as fluid
 class TestInstallCheck(unittest.TestCase):
     def test_install_check(self):
         fluid.install_check.run_check()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
index fdc5d3679e7..fb6c43136ff 100644
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
@@ -72,9 +72,6 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):


 class TestLayerNormdOp(unittest.TestCase):
-    def setUp(self):
-        self.use_cudnn = True
-
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)

@@ -163,8 +160,7 @@ class TestLayerNormdOp(unittest.TestCase):
             self.__assert_close(bias_grad, out[5], "bias_grad")

         places = [core.CPUPlace()]
-        if core.is_compiled_with_cuda() and core.op_support_gpu(
-                "layer_norm") and self.use_cudnn:
+        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
             places.append(core.CUDAPlace(0))

         for place in places:
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 944b1bb12fe..2474125835f 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -190,7 +190,8 @@ class TestLayer(LayerTest):
         with self.static_graph():
             images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D('conv2d', num_filters=3, filter_size=[2, 2])
+            conv2d = nn.Conv2D(
+                'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2])
             ret = conv2d(images)
             static_ret2 = self.get_static_graph_result(
                 feed={'pixel': np.ones(
@@ -199,7 +200,8 @@

         with self.dynamic_graph():
             images = np.ones([2, 3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D('conv2d', num_filters=3, filter_size=[2, 2])
+            conv2d = nn.Conv2D(
+                'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2])
             dy_ret = conv2d(base.to_variable(images))

         self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
@@ -1265,12 +1267,6 @@ class TestBook(LayerTest):
             out = layers.scatter(input=x, index=idx, updates=updates)
             return (out)

-    def make_one_hot(self):
-        with fluid.framework._dygraph_place_guard(place=fluid.CPUPlace()):
-            label = self._get_data(name="label", shape=[1], dtype="int32")
-            one_hot_label = layers.one_hot(input=label, depth=10)
-            return (one_hot_label)
-
     def make_label_smooth(self):
         # TODO(minqiyang): support gpu ut
         self._force_to_use_cpu = True
@@ -1961,173 +1957,6 @@ class TestBook(LayerTest):
             self.assertIsNotNone(out)
             print(str(program))

-    def test_deformable_conv(self):
-        if core.is_compiled_with_cuda():
-            with program_guard(fluid.default_main_program(),
-                               fluid.default_startup_program()):
-                input = layers.data(
-                    name='input',
-                    append_batch_size=False,
-                    shape=[2, 3, 32, 32],
-                    dtype="float32")
-                offset = layers.data(
-                    name='offset',
-                    append_batch_size=False,
-                    shape=[2, 18, 32, 32],
-                    dtype="float32")
-                mask = layers.data(
-                    name='mask',
-                    append_batch_size=False,
-                    shape=[2, 9, 32, 32],
-                    dtype="float32")
-                out = layers.deformable_conv(
-                    input=input,
-                    offset=offset,
-                    mask=mask,
-                    num_filters=2,
-                    filter_size=3,
-                    padding=1)
-                return (out)
-
-    def test_unfold(self):
-        with self.static_graph():
-            x = layers.data(name='x', shape=[3, 20, 20], dtype='float32')
-            out = layers.unfold(x, [3, 3], 1, 1, 1)
-            return (out)
-
-    def test_deform_roi_pooling(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = layers.data(
-                name='input',
-                shape=[2, 3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            rois = layers.data(
-                name="rois", shape=[4], dtype='float32', lod_level=1)
-            trans = layers.data(
-                name="trans",
-                shape=[2, 3, 32, 32],
-                dtype='float32',
-                append_batch_size=False)
-            out = layers.deformable_roi_pooling(
-                input=input,
-                rois=rois,
-                trans=trans,
-                no_trans=False,
-                spatial_scale=1.0,
-                group_size=(1, 1),
-                pooled_height=8,
-                pooled_width=8,
-                part_size=(8, 8),
-                sample_per_part=4,
-                trans_std=0.1)
-            return (out)
-
-    def test_retinanet_target_assign(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            bbox_pred = layers.data(
-                name='bbox_pred',
-                shape=[1, 100, 4],
-                append_batch_size=False,
-                dtype='float32')
-            cls_logits = layers.data(
-                name='cls_logits',
-                shape=[1, 100, 10],
-                append_batch_size=False,
-                dtype='float32')
-            anchor_box = layers.data(
-                name='anchor_box',
-                shape=[100, 4],
-                append_batch_size=False,
-                dtype='float32')
-            anchor_var = layers.data(
-                name='anchor_var',
-                shape=[100, 4],
-                append_batch_size=False,
-                dtype='float32')
-            gt_boxes = layers.data(
-                name='gt_boxes',
-                shape=[10, 4],
-                append_batch_size=False,
-                dtype='float32')
-            gt_labels = layers.data(
-                name='gt_labels',
-                shape=[10, 1],
-                append_batch_size=False,
-                dtype='float32')
-            is_crowd = layers.data(
-                name='is_crowd',
-                shape=[1],
-                append_batch_size=False,
-                dtype='float32')
-            im_info = layers.data(
-                name='im_info',
-                shape=[1, 3],
-                append_batch_size=False,
-                dtype='float32')
-            return (layers.retinanet_target_assign(
-                bbox_pred, cls_logits, anchor_box, anchor_var, gt_boxes,
-                gt_labels, is_crowd, im_info, 10))
-
-    def test_sigmoid_focal_loss(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            input = layers.data(
-                name='data',
-                shape=[10, 80],
-                append_batch_size=False,
-                dtype='float32')
-            label = layers.data(
-                name='label',
-                shape=[10, 1],
-                append_batch_size=False,
-                dtype='int32')
-            fg_num = layers.data(
-                name='fg_num',
-                shape=[1],
-                append_batch_size=False,
-                dtype='int32')
-            out = fluid.layers.sigmoid_focal_loss(
-                x=input, label=label, fg_num=fg_num, gamma=2., alpha=0.25)
-            return (out)
-
-    def test_retinanet_detection_output(self):
-        with program_guard(fluid.default_main_program(),
-                           fluid.default_startup_program()):
-            bboxes = layers.data(
-                name='bboxes',
-                shape=[1, 21, 4],
-                append_batch_size=False,
-                dtype='float32')
-            scores = layers.data(
-                name='scores',
-                shape=[1, 21, 10],
-                append_batch_size=False,
-                dtype='float32')
-            anchors = layers.data(
-                name='anchors',
-                shape=[21, 4],
-                append_batch_size=False,
-                dtype='float32')
-            im_info = layers.data(
-                name="im_info",
-                shape=[1, 3],
-                append_batch_size=False,
-                dtype='float32')
-            nmsed_outs = layers.retinanet_detection_output(
-                bboxes=[bboxes, bboxes],
-                scores=[scores, scores],
-                anchors=[anchors, anchors],
-                im_info=im_info,
-                score_threshold=0.05,
-                nms_top_k=1000,
-                keep_top_k=100,
-                nms_threshold=0.3,
-                nms_eta=1.)
-            return (nmsed_outs)
-

 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py
index ae1e85c483e..df0d8e0345c 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py
@@ -43,41 +43,44 @@ class TestMulGradCheck(unittest.TestCase):
                 self.func(p)


-class TestConvDoubleGradCheck(unittest.TestCase):
+class TestReluDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
-        shape = [2, 4, 14, 16]
+        shape = [2, 8]
         eps = 0.005
         dtype = np.float64
+
         x = layers.data('x', shape, False, dtype)
-        y = layers.conv2d(x, 4, 1, bias_attr=False)
+        x.persistable = True
+        y = layers.relu(x)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+        x_arr[np.abs(x_arr) < 0.005] = 0.02

-        w = fluid.default_main_program().global_block().all_parameters()
-        w_arr = []
-        for p in w:
-            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
         gradient_checker.double_grad_check(
-            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)
+            [x], y, x_init=x_arr, place=place, eps=eps)

     def test_grad(self):
+        places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
-            places = [fluid.CUDAPlace(0)]
-            for p in places:
-                self.func(p)
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)


-class TestReduceMeanWithDimDoubleGradCheck(unittest.TestCase):
+class TestLeakyReluDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
-        shape = [7, 11]
-        eps = 0.05
+        shape = [3, 7]
+        eps = 0.005
+        alpha = 0.2
         dtype = np.float64

         x = layers.data('x', shape, False, dtype)
         x.persistable = True
-        y = layers.reduce_mean(x, dim=0)
+
+        y = layers.leaky_relu(x, alpha=alpha)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+        x_arr[np.abs(x_arr) < 0.005] = 0.02

         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
@@ -86,36 +89,30 @@ class TestReduceMeanWithDimDoubleGradCheck(unittest.TestCase):
         places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
             places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)


-class TestMulDoubleGradCheck(unittest.TestCase):
+class TestConvDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
-        # the shape of input variable shoule be clearly specified, not inlcude -1.
-        x_shape = [7, 11]
-        y_shape = [11, 9]
+        shape = [2, 4, 14, 16]
         eps = 0.005
         dtype = np.float64
+        x = layers.data('x', shape, False, dtype)
+        y = layers.conv2d(x, 4, 1, bias_attr=False)
+        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)

-        x = layers.data('x', x_shape, False, dtype)
-        x.persistable = True
-        y = layers.data('y', y_shape, False, dtype)
-        y.persistable = True
-        out = layers.mul(x, y)
-        x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype)
-        y_arr = np.random.uniform(-1, 1, y_shape).astype(dtype)
-
+        w = fluid.default_main_program().global_block().all_parameters()
+        w_arr = []
+        for p in w:
+            w_arr.append(np.random.uniform(-1, 1, p.shape).astype(dtype))
         gradient_checker.double_grad_check(
-            [x, y], out, x_init=[x_arr, y_arr], place=place, eps=eps)
+            [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps)

     def test_grad(self):
-        places = [fluid.CPUPlace()]
         if core.is_compiled_with_cuda():
-            places.append(fluid.CUDAPlace(0))
-        for p in places:
-            self.func(p)
+            places = [fluid.CUDAPlace(0)]
+            for p in places:
+                self.func(p)


 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
index f213a0c77f4..7afdae804a6 100644
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
@@ -28,34 +28,10 @@ class TestOneHotOp(OpTest):
     def setUp(self):
         self.op_type = 'one_hot'
         depth = 10
-        depth_np = np.array(10).astype('int32')
         dimension = 12
         x_lod = [[4, 1, 3, 3]]
         x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
-
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
-                              depth)).astype('float32')
-
-        for i in range(np.product(x.shape)):
-            out[i, x[i]] = 1.0
-
-        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
-        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_attr(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot'
-        depth = 10
-        dimension = 12
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
+        x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])

         out = np.zeros(shape=(np.product(x.shape[:-1]),
                               depth)).astype('float32')
@@ -64,7 +40,7 @@ class TestOneHotOp_attr(OpTest):
             out[i, x[i]] = 1.0

         self.inputs = {'X': (x, x_lod)}
-        self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth}
+        self.attrs = {'depth': depth, 'dtype': int(core.VarDesc.VarType.FP32)}
         self.outputs = {'Out': (out, x_lod)}

     def test_check_output(self):
@@ -72,37 +48,13 @@ class TestOneHotOp_attr(OpTest):


 class TestOneHotOp_default_dtype(OpTest):
-    def setUp(self):
-        self.op_type = 'one_hot'
-        depth = 10
-        depth_np = np.array(10).astype('int32')
-        dimension = 12
-        x_lod = [[4, 1, 3, 3]]
-        x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
-
-        out = np.zeros(shape=(np.product(x.shape[:-1]),
-                              depth)).astype('float32')
-
-        for i in range(np.product(x.shape)):
-            out[i, x[i]] = 1.0
-
-        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
-        self.attrs = {}
-        self.outputs = {'Out': (out, x_lod)}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-class TestOneHotOp_default_dtype_attr(OpTest):
     def setUp(self):
         self.op_type = 'one_hot'
         depth = 10
         dimension = 12
         x_lod = [[4, 1, 3, 3]]
         x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
-        x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1])
+        x = np.array(x).astype('int').reshape([sum(x_lod[0]), 1])

         out = np.zeros(shape=(np.product(x.shape[:-1]),
                               depth)).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
index aa9634a2d41..37b9a9188ab 100644
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -33,7 +33,7 @@ class TestOperator(unittest.TestCase):
         except ValueError as v_err:
             self.assertEqual(
                 cpt.get_exception_message(v_err),
-                "`type` to initialized an Operator can not be None.")
+                "`type` to initilized an Operator can not be None.")
         try:
             block.append_op(type="no_such_op")
             self.assertFail()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
index ecdca39a543..a08991986a7 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py
@@ -13,13 +13,10 @@
 # limitations under the License.

 from __future__ import print_function
-#import unittest
+import unittest
 from test_dist_base import TestDistBase
-import paddle.fluid as fluid

-#TODO(guru4elephant): should have dygraph test dist base
-# current TestDistBase has some incompatible code with dygraph
-'''
+
 class TestParallelDygraphMnist(TestDistBase):
     def _setup_config(self):
         self._sync_mode = False
@@ -27,11 +24,9 @@ class TestParallelDygraphMnist(TestDistBase):
         self._dygraph = True

     def test_mnist(self):
-        return
-        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("parallel_dygraph_mnist.py", delta=1e-5)
-'''
+        self.check_with_place(
+            "parallel_dygraph_mnist.py", delta=1e-5, check_error_log=True)
+

 if __name__ == "__main__":
-    #unittest.main()
-    pass
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
index 328b3a4813e..d0eca7d6dfb 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
@@ -17,8 +17,6 @@ from paddle.fluid import compiler
 import unittest
 import logging
 import six
-import os
-os.environ['CPU_NUM'] = str(4)


 class TestBase(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
index 0457e9cefdb..645b0188d5f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
@@ -45,8 +45,7 @@ class TestFetchAndFeed(unittest.TestCase):
     def parallel_exe(self,
                      use_cuda,
                      run_parallel_exe,
-                     use_faster_executor=False,
-                     num_threads=4,
+                     use_experimental_executor=False,
                      seed=1):
         main_program = fluid.Program()
         startup = fluid.Program()
@@ -73,8 +72,7 @@ class TestFetchAndFeed(unittest.TestCase):
             build_strategy.enable_inplace = False
             build_strategy.memory_optimize = False
             exec_strategy = fluid.ExecutionStrategy()
-            exec_strategy.use_experimental_executor = use_faster_executor
-            exec_strategy.num_threads = num_threads
+            exec_strategy.use_experimental_executor = use_experimental_executor
             train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
                 loss_name=loss.name,
                 build_strategy=build_strategy,
@@ -145,25 +143,24 @@ class TestFetchAndFeed(unittest.TestCase):
                 if batch_id == 2:
                     break

-    def check_executor(self, use_faster_executor=False, num_threads=4):
+    def test_fetch_with_threaded_executor(self):
+        if core.is_compiled_with_cuda():
+            self.parallel_exe(
+                use_cuda=True,
+                run_parallel_exe=self.run_parallel_exe_with_fetch)
+        self.parallel_exe(
+            use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_fetch)
+
+    def test_fetch_with_fast_threaded_executor(self):
         if core.is_compiled_with_cuda():
             self.parallel_exe(
                 use_cuda=True,
                 run_parallel_exe=self.run_parallel_exe_with_fetch,
-                use_faster_executor=use_faster_executor,
-                num_threads=num_threads)
+                use_experimental_executor=True)
         self.parallel_exe(
             use_cuda=False,
             run_parallel_exe=self.run_parallel_exe_with_fetch,
-            use_faster_executor=use_faster_executor,
-            num_threads=num_threads)
-
-    def test_fetch(self):
-        for use_faster_executor in {True, False}:
-            self.check_executor(
-                use_faster_executor=use_faster_executor, num_threads=4)
-            self.check_executor(
-                use_faster_executor=use_faster_executor, num_threads=1)
+            use_experimental_executor=True)

     def test_feed(self):
         if core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py
index 0fc11ef8d92..8097b5f7343 100644
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
@@ -17,13 +17,11 @@ from __future__ import print_function
 import unittest
 import paddle.fluid.core as core
 from paddle.fluid.executor import Executor
-import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.backward import append_backward
 from paddle.fluid.framework import switch_main_program
 from paddle.fluid.framework import Program
 import numpy as np
-from simple_nets import simple_fc_net, init_data


 class TestPrintOpCPU(unittest.TestCase):
@@ -58,27 +56,6 @@ class TestPrintOpCPU(unittest.TestCase):
                       fetch_list=[loss],
                       return_numpy=False)

-    def test_all_parameters(self):
-        x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
-        x.stop_gradient = False
-
-        for print_tensor_name in [True, False]:
-            for print_tensor_type in [True, False]:
-                for print_tensor_shape in [True, False]:
-                    for print_tensor_lod in [True, False]:
-                        layers.Print(
-                            input=x,
-                            print_tensor_name=print_tensor_name,
-                            print_tensor_type=print_tensor_type,
-                            print_tensor_shape=print_tensor_shape,
-                            print_tensor_lod=print_tensor_lod, )
-        loss = layers.mean(x)
-        append_backward(loss=loss)
-        exe = Executor(self.place)
-        outs = exe.run(feed={'x': self.x_tensor},
-                       fetch_list=[loss],
-                       return_numpy=False)
-

 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
@@ -91,35 +68,5 @@ class TestPrintOpGPU(TestPrintOpCPU):
         self.x_tensor.set_recursive_sequence_lengths([[1, 1]])


-class TestPrintOpBackward(unittest.TestCase):
-    def check_backward(self, use_cuda):
-        main = fluid.Program()
-        startup = fluid.Program()
-
-        with fluid.program_guard(main, startup):
-            loss = simple_fc_net()
-            loss = fluid.layers.Print(loss)
-            fluid.optimizer.Adam().minimize(loss)
-
-        print_ops = [op for op in main.blocks[0].ops if op.type == u'print']
-        assert len(print_ops) == 2, "The number of print op should be 2"
-
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(startup)
-
-        binary = fluid.compiler.CompiledProgram(main).with_data_parallel(
-            loss_name=loss.name)
-
-        img, label = init_data()
-        feed_dict = {"image": img, "label": label}
-        exe.run(binary, feed_dict)
-
-    def test_fw_bw(self):
-        if core.is_compiled_with_cuda():
-            self.check_backward(use_cuda=True)
-        self.check_backward(use_cuda=False)
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
index e4fb9b1970a..a3701f0808b 100644
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
@@ -22,7 +22,6 @@ import numpy as np
 import threading
 import multiprocessing
 import os
-os.environ['CPU_NUM'] = str(4)


 def as_tensor(np_array_or_tensor, place=None):
diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
index 0417da7228e..f5009556adc 100644
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
@@ -86,7 +86,3 @@ class TestRecordIO(unittest.TestCase):
     def test_double_buffer_reader(self):
         self.test_main(decorator_callback=lambda reader: fluid.layers.io.double_buffer(reader, place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index 3221985c442..7691221a551 100644
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -37,7 +37,6 @@ class TestReshapeOp(OpTest):
         self.infered_shape = (5, 10)

     def test_check_output(self):
-
         self.check_output(no_check_set=['XShape'])

     def test_check_grad(self):
@@ -83,51 +82,5 @@ class TestReshapeOpWithInputShape(OpTest):
         self.check_grad(["X"], "Out")


-class TestReshapeOp_attr_tensor(OpTest):
-    def setUp(self):
-        self.init_data()
-        self.op_type = "reshape2"
-
-        shape_tensor = []
-        for index, ele in enumerate(self.new_shape):
-            shape_tensor.append(("x" + str(index), np.ones(
-                (1)).astype('int32') * ele))
-
-        self.inputs = {
-            "X": np.random.random(self.ori_shape).astype("float32"),
-            'ShapeTensor': shape_tensor
-        }
-        self.attrs = {}
-        self.outputs = {
-            "Out": self.inputs["X"].reshape(self.infered_shape),
-            'XShape': np.random.random(self.ori_shape).astype("float32")
-        }
-
-    def init_data(self):
-        self.ori_shape = (2, 25)
-        self.new_shape = (5, 10)
-        self.infered_shape = (5, 10)
-
-    def test_check_output(self):
-        self.check_output(no_check_set=['XShape'])
-
-    def test_check_grad(self):
-        self.check_grad(["X"], "Out")
-
-
-class TestReshapeOpDimInfer1_attr_tensor(TestReshapeOp_attr_tensor):
-    def init_data(self):
-        self.ori_shape = (5, 10)
-        self.new_shape = (5, -1, 5)
-        self.infered_shape = (5, -1, 5)
-
-
-class TestReshapeOpDimInfer2_attr_tensor(TestReshapeOp_attr_tensor):
-    def init_data(self):
-        self.ori_shape = (2, 2, 6)
-        self.new_shape = (2, 0, 3, -1)
-        self.infered_shape = (2, 2, 3, -1)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
index 3dba961dc9d..1a2c9bb5f43 100644
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -167,105 +167,6 @@ def rpn_target_assign_in_python(all_anchors,
     return loc_indexes, score_indexes, tgt_bboxes, tgt_labels, bbox_inside_weights


-def retinanet_target_assign(anchor_by_gt_overlap, gt_labels, positive_overlap,
-                            negative_overlap):
-    anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
-    anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
-        anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
-
-    gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
-    gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
-        anchor_by_gt_overlap.shape[1])]
-    anchors_with_max_overlap = np.where(
-        anchor_by_gt_overlap == gt_to_anchor_max)[0]
-
-    labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1
-    labels[anchors_with_max_overlap] = 1
-    labels[anchor_to_gt_max >= positive_overlap] = 1
-
-    fg_inds = np.where(labels == 1)[0]
-    bbox_inside_weight = np.zeros((len(fg_inds), 4), dtype=np.float32)
-
-    bg_inds = np.where(anchor_to_gt_max < negative_overlap)[0]
-    enable_inds = bg_inds
-
-    fg_fake_inds = np.array([], np.int32)
-    fg_value = np.array([fg_inds[0]], np.int32)
-    fake_num = 0
-    for bg_id in enable_inds:
-        if bg_id in fg_inds:
-            fake_num += 1
-            fg_fake_inds = np.hstack([fg_fake_inds, fg_value])
-    labels[enable_inds] = 0
-
-    bbox_inside_weight[fake_num:, :] = 1
-    fg_inds = np.where(labels == 1)[0]
-    bg_inds = np.where(labels == 0)[0]
-    loc_index = np.hstack([fg_fake_inds, fg_inds])
-    score_index = np.hstack([fg_inds, bg_inds])
-    score_index_tmp = np.hstack([fg_inds])
-    labels = labels[score_index]
-
-    gt_inds = anchor_to_gt_argmax[loc_index]
-    label_inds = anchor_to_gt_argmax[score_index_tmp]
-    labels[0:len(fg_inds)] = np.squeeze(gt_labels[label_inds])
-    fg_num = len(fg_fake_inds) + len(fg_inds) + 1
-    assert not np.any(labels == -1), "Wrong labels with -1"
-    return loc_index, score_index, labels, gt_inds, bbox_inside_weight, fg_num
-
-
-def retinanet_target_assign_in_python(all_anchors, gt_boxes, gt_labels,
-                                      is_crowd, im_info, lod, positive_overlap,
-                                      negative_overlap):
-    anchor_num = all_anchors.shape[0]
-    batch_size = len(lod) - 1
-    for i in range(batch_size):
-        im_scale = im_info[i][2]
-
-        inds_inside = np.arange(all_anchors.shape[0])
-        inside_anchors = all_anchors
-        b, e = lod[i], lod[i + 1]
-        gt_boxes_slice = gt_boxes[b:e, :] * im_scale
-        gt_labels_slice = gt_labels[b:e, :]
-        is_crowd_slice = is_crowd[b:e]
-
-        not_crowd_inds = np.where(is_crowd_slice == 0)[0]
-        gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
-        gt_labels_slice = gt_labels_slice[not_crowd_inds]
-        iou = _bbox_overlaps(inside_anchors, gt_boxes_slice)
-
-        loc_inds, score_inds, labels, gt_inds, bbox_inside_weight, fg_num = \
-            retinanet_target_assign(iou, gt_labels_slice,
-                                    positive_overlap, negative_overlap)
-        # unmap to all anchor
-        loc_inds = inds_inside[loc_inds]
-        score_inds = inds_inside[score_inds]
-
-        sampled_gt = gt_boxes_slice[gt_inds]
-        sampled_anchor = all_anchors[loc_inds]
-        box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.])
-
-        if i == 0:
-            loc_indexes = loc_inds
-            score_indexes = score_inds
-            tgt_labels = labels
-            tgt_bboxes = box_deltas
-            bbox_inside_weights = bbox_inside_weight
-            fg_nums = [[fg_num]]
-        else:
-            loc_indexes = np.concatenate(
-                [loc_indexes, loc_inds + i * anchor_num])
-            score_indexes =
np.concatenate( - [score_indexes, score_inds + i * anchor_num]) - tgt_labels = np.concatenate([tgt_labels, labels]) - tgt_bboxes = np.vstack([tgt_bboxes, box_deltas]) - bbox_inside_weights = np.vstack([bbox_inside_weights, \ - bbox_inside_weight]) - fg_nums = np.concatenate([fg_nums, [[fg_num]]]) - - return loc_indexes, score_indexes, tgt_bboxes, tgt_labels, bbox_inside_weights, fg_nums - - class TestRpnTargetAssignOp(OpTest): def setUp(self): n, c, h, w = 2, 4, 14, 14 @@ -333,65 +234,5 @@ class TestRpnTargetAssignOp(OpTest): self.check_output() -class TestRetinanetTargetAssignOp(OpTest): - def setUp(self): - n, c, h, w = 2, 4, 14, 14 - all_anchors = get_anchor(n, c, h, w) - gt_num = 10 - all_anchors = all_anchors.reshape(-1, 4) - anchor_num = all_anchors.shape[0] - - images_shape = [[64, 64], [64, 64]] - groundtruth, lod = _generate_groundtruth(images_shape, 3, 4) - lod = [0, 4, 8] - - im_info = np.ones((len(images_shape), 3)).astype(np.float32) - for i in range(len(images_shape)): - im_info[i, 0] = images_shape[i][0] - im_info[i, 1] = images_shape[i][1] - im_info[i, 2] = 0.8 #scale - gt_boxes = np.vstack([v['boxes'] for v in groundtruth]) - is_crowd = np.hstack([v['is_crowd'] for v in groundtruth]) - gt_labels = np.vstack([ - v['gt_classes'].reshape(len(v['gt_classes']), 1) - for v in groundtruth - ]) - gt_labels = gt_labels.reshape(len(gt_labels), 1) - all_anchors = all_anchors.astype('float32') - gt_boxes = gt_boxes.astype('float32') - gt_labels = gt_labels.astype('int32') - - positive_overlap = 0.5 - negative_overlap = 0.4 - - loc_index, score_index, tgt_bbox, labels, bbox_inside_weights, fg_num = \ - retinanet_target_assign_in_python(all_anchors, gt_boxes, gt_labels, is_crowd, - im_info, lod, positive_overlap, negative_overlap) - labels = labels[:, np.newaxis] - self.op_type = "retinanet_target_assign" - self.inputs = { - 'Anchor': all_anchors, - 'GtBoxes': (gt_boxes, [[4, 4]]), - 'GtLabels': (gt_labels, [[4, 4]]), - 'IsCrowd': (is_crowd, [[4, 4]]), - 'ImInfo': (im_info, [[1, 1]]) - } - self.attrs = { - 'positive_overlap': positive_overlap, - 'negative_overlap': negative_overlap - } - self.outputs = { - 'LocationIndex': loc_index.astype('int32'), - 'ScoreIndex': score_index.astype('int32'), - 'TargetBBox': tgt_bbox.astype('float32'), - 'TargetLabel': labels.astype('int32'), - 'BBoxInsideWeight': bbox_inside_weights.astype('float32'), - 'ForegroundNumber': fg_num.astype('int32') - } - - def test_check_output(self): - self.check_output() - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_scatter_op.py b/python/paddle/fluid/tests/unittests/test_scatter_op.py index 9c60a118285..088996f9d7d 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py @@ -17,7 +17,6 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest -import paddle.fluid.core as core class TestScatterOp(OpTest): @@ -38,98 +37,5 @@ class TestScatterOp(OpTest): self.check_grad(['Updates'], 'Out', in_place=True) -class TestScatterOp0(OpTest): - def setUp(self): - self.op_type = "scatter" - ref_np = np.ones((3, 3)).astype("float32") - index_np = np.array([1, 2]).astype("int32") - updates_np = np.random.random((2, 3)).astype("float32") - output_np = np.copy(ref_np) - output_np[index_np] = updates_np - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} - self.attrs = {'overwrite': True} - self.outputs = {'Out': output_np} - - def 
test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Updates'], 'Out', in_place=True) - - -class TestScatterOp1(OpTest): - def setUp(self): - self.op_type = "scatter" - ref_np = np.ones((3, 3)).astype("float32") - zeros_np = np.zeros([2, 3]).astype('float32') - index_np = np.array([1, 1]).astype("int32") - updates_np = np.random.random((2, 3)).astype("float32") - output_np = np.copy(ref_np) - output_np[index_np] = zeros_np - for i in range(0, len(index_np)): - output_np[index_np[i]] += updates_np[i] - self.attrs = {'overwrite': False} - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} - self.outputs = {'Out': output_np} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Updates'], 'Out', in_place=True) - - -@unittest.skipIf(not core.is_compiled_with_cuda(), - "core is not compiled with CUDA") -class TestScatterOp2(OpTest): - def setUp(self): - self.op_type = "scatter" - ref_np = np.ones((3, 3)).astype("float32") - index_np = np.array([1, 2]).astype("int32") - updates_np = np.random.random((2, 3)).astype("float32") - output_np = np.copy(ref_np) - output_np[index_np] = updates_np - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} - self.outputs = {'Out': output_np} - - def test_check_output(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3) - - def test_check_grad(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['Updates'], 'Out', in_place=True) - - -@unittest.skipIf(not core.is_compiled_with_cuda(), - "core is not compiled with CUDA") -class TestScatterOp3(OpTest): - def setUp(self): - self.op_type = "scatter" - ref_np = np.ones((3, 3)).astype("float32") - zeros_np = np.zeros([2, 3]).astype('float32') - index_np = np.array([1, 1]).astype("int32") - updates_np = np.random.random((2, 3)).astype("float32") - output_np = np.copy(ref_np) - output_np[index_np] = zeros_np - for i in range(0, len(index_np)): - output_np[index_np[i]] += updates_np[i] - self.attrs = {'overwrite': False} - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} - self.outputs = {'Out': output_np} - - def test_check_output(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3) - - def test_check_grad(self): - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - self.check_grad_with_place(place, ['Updates'], 'Out', in_place=True) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py index aa801b1f5d8..176265428c8 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_pool.py +++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py @@ -20,42 +20,31 @@ from op_test import OpTest from test_reorder_lod_tensor import convert_to_offset -def compute_seqpool_sum(x, offset, out, pad_value=0.0): +def compute_seqpool_sum(x, offset, out): for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = pad_value - else: - sub_x = x[offset[0][i]:offset[0][i + 1], :] - out[i] = sub_x.sum(axis=0) + sub_x = x[offset[0][i]:offset[0][i + 1], :] + out[i] = sub_x.sum(axis=0) -def compute_seqpool_avg(x, offset, out, pad_value=0.0): +def compute_seqpool_avg(x, offset, out): for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = 
pad_value - else: - sub_x = x[offset[0][i]:offset[0][i + 1], :] - out[i] = sub_x.mean(axis=0) + sub_x = x[offset[0][i]:offset[0][i + 1], :] + out[i] = sub_x.mean(axis=0) -def compute_seqpool_sqrt(x, offset, out, pad_value=0.0): +def compute_seqpool_sqrt(x, offset, out): for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = pad_value - else: - sub_x = x[offset[0][i]:offset[0][i + 1], :] - seq_len = offset[0][i + 1] - offset[0][i] - out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len) + sub_x = x[offset[0][i]:offset[0][i + 1], :] + seq_len = offset[0][i + 1] - offset[0][i] + out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len) class TestSeqAvgPool(OpTest): - def set_lod(self): - return [[11]] - def set_data(self): self.op_type = 'sequence_pool' + # one level, batch size is 4 x = np.random.uniform(0.1, 1, [11, 23]).astype('float32') - lod = self.set_lod() + lod = [[11]] self.inputs = {'X': (x, lod)} offset = convert_to_offset(lod) out = np.zeros((len(lod[0]), 23)).astype('float32') @@ -63,8 +52,8 @@ class TestSeqAvgPool(OpTest): return x, offset, out def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.0, 'pooltype': "AVERAGE"} - compute_seqpool_avg(x, offset, out, self.attrs["pad_value"]) + self.attrs = {'pooltype': "AVERAGE"} + compute_seqpool_avg(x, offset, out) def setUp(self): x, offset, out = self.set_data() @@ -80,160 +69,95 @@ class TestSeqAvgPool(OpTest): self.check_grad(["X"], "Out") -class TestSeqAvgPoolLen0(TestSeqAvgPool): - def set_lod(self): - return [[0, 4, 0, 7, 0]] - - class TestSeqSumPool(TestSeqAvgPool): def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.1, 'pooltype': "SUM"} - compute_seqpool_sum(x, offset, out, self.attrs["pad_value"]) - - -class TestSeqSumPoolLen0(TestSeqSumPool): - def set_lod(self): - return [[0, 4, 0, 7, 0]] + self.attrs = {'pooltype': "SUM"} + compute_seqpool_sum(x, offset, out) class TestSeqMaxPool(TestSeqAvgPool): - def set_lod(self): - return [[13]] - def set_data(self): self.op_type = 'sequence_pool' x = np.random.uniform(0.1, 1, [13, 23]).astype('float32') - lod = self.set_lod() + lod = [[13]] offset = convert_to_offset(lod) for i in range(len(offset[0]) - 1): l = offset[0][i + 1] - offset[0][i] - if l > 0: - x[offset[0][i] + np.random.randint(l), :] += 2.0 + x[offset[0][i] + np.random.randint(l), :] += 2.0 self.inputs = {'X': (x, lod)} - out = np.zeros((len(lod[0]), 23)).astype('float32') + out = np.zeros((1, 23)).astype('float32') self.outputs = {'Out': out} return x, offset, out def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.5, 'pooltype': "MAX"} + self.attrs = {'pooltype': "MAX"} for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = self.attrs["pad_value"] - else: - sub_x = x[offset[0][i]:offset[0][i + 1], :] - out[i] = np.amax(sub_x, axis=0) - - -class TestSeqMaxPoolLen0(TestSeqMaxPool): - def set_lod(self): - return [[0, 1, 1, 5, 6, 0]] + sub_x = x[offset[0][i]:offset[0][i + 1], :] + out[i] = np.amax(sub_x, axis=0) class TestSeqSqrtPool(TestSeqAvgPool): def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.0, 'pooltype': "SQRT"} - compute_seqpool_sqrt(x, offset, out, self.attrs["pad_value"]) - - -class TestSeqSqrtPoolLen0(TestSeqSqrtPool): - def set_lod(self): - return [[0, 7, 0, 2, 2, 0]] + self.attrs = {'pooltype': "SQRT"} + compute_seqpool_sqrt(x, offset, out) class TestSeqLastPool(TestSeqAvgPool): def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.0, 'pooltype': "LAST"} + self.attrs = {'pooltype': "LAST"} for i 
in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = self.attrs["pad_value"] - else: - sub_x = x[offset[0][i]:offset[0][i + 1], :] - out[i] = sub_x[-1, :] - - -class TestSeqLastPoolLen0(TestSeqLastPool): - def set_lod(self): - return [[0, 3, 4, 0, 4, 0]] + sub_x = x[offset[0][i]:offset[0][i + 1], :] + out[i] = sub_x[-1, :] class TestSeqFirstPool(TestSeqAvgPool): def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.3, 'pooltype': "FIRST"} + self.attrs = {'pooltype': "FIRST"} for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = self.attrs["pad_value"] - else: - sub_x = x[offset[0][i]:offset[0][i + 1], :] - out[i] = sub_x[0, :] - - -class TestSeqFirstPoolLen0(TestSeqFirstPool): - def set_lod(self): - return [[0, 2, 0, 3, 6, 0]] + sub_x = x[offset[0][i]:offset[0][i + 1], :] + out[i] = sub_x[0, :] class TestSeqAvgPool2D(TestSeqAvgPool): - def set_lod(self): - return [[4, 1, 3, 5]] - def set_data(self): self.op_type = 'sequence_pool' + # one level, batch size is 4 x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32') - lod = self.set_lod() + lod = [[4, 1, 3, 5]] self.inputs = {'X': (x, lod)} offset = convert_to_offset(lod) - out = np.zeros((len(lod[0]), 3, 17)).astype('float32') + out = np.zeros((4, 3, 17)).astype('float32') self.outputs = {'Out': out} return x, offset, out def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.0, 'pooltype': "AVERAGE"} + self.attrs = {'pooltype': "AVERAGE"} for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = self.attrs["pad_value"] * np.ones((3, 17)) - else: - sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], - (-1, 3 * 17)) - out[i] = np.reshape(sub_x.mean(axis=0), (3, 17)) - - -class TestSeqAvgPool2DLen0(TestSeqAvgPool2D): - def set_lod(self): - return [[0, 5, 0, 8, 0]] + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) + out[i] = np.reshape(sub_x.mean(axis=0), (3, 17)) class TestSeqSumPool2D(TestSeqAvgPool2D): def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.2, 'pooltype': "SUM"} + self.attrs = {'pooltype': "SUM"} for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = self.attrs["pad_value"] * np.ones((3, 17)) - else: - sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], - (-1, 3 * 17)) - out[i] = np.reshape(sub_x.sum(axis=0), (3, 17)) - - -class TestSeqSumPool2DLen0(TestSeqSumPool2D): - def set_lod(self): - return [[0, 8, 0, 5, 0]] + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) + out[i] = np.reshape(sub_x.sum(axis=0), (3, 17)) class TestSeqSqrtPool2D(TestSeqAvgPool2D): def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.0, 'pooltype': "SQRT"} + self.attrs = {'pooltype': "SQRT"} for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = self.attrs["pad_value"] * np.ones((3, 17)) - else: - sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], - (-1, 3 * 17)) - seq_len = offset[0][i + 1] - offset[0][i] - out[i] = np.reshape( - sub_x.sum(axis=0) / np.sqrt(seq_len), (3, 17)) + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) + seq_len = offset[0][i + 1] - offset[0][i] + out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(seq_len), (3, 17)) def test_check_grad(self): # Remove MaxIndex after check_grad is refined. 
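For reference, a minimal NumPy sketch of the pad_value semantics that the removed *Len0 tests above exercised (the helper name seqpool_sum_ref and its return-value style are illustrative, not taken from the patch): a zero-length sequence produces pad_value instead of a reduction over zero rows, which is exactly the branch the hunks above delete.

    import numpy as np

    def seqpool_sum_ref(x, offset, pad_value=0.0):
        # offset is a cumulative LoD; e.g. lengths [4, 0, 7] give [0, 4, 4, 11]
        out = np.empty((len(offset) - 1, x.shape[1]), dtype=x.dtype)
        for i in range(len(offset) - 1):
            if offset[i] == offset[i + 1]:  # empty sequence -> filled with pad_value
                out[i] = pad_value
            else:
                out[i] = x[offset[i]:offset[i + 1], :].sum(axis=0)
        return out

    x = np.random.uniform(0.1, 1, [11, 23]).astype('float32')
    assert seqpool_sum_ref(x, [0, 4, 4, 11]).shape == (3, 23)

Without the empty-sequence branch, as in the reverted code below, an offset pair with equal bounds would sum over zero rows and silently emit zeros (or garbage for MAX pooling), which is why the pad_value tests existed.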
@@ -242,57 +166,36 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D): self.check_grad(["X"], "Out", max_relative_error=0.06) -class TestSeqSqrtPool2DLen0(TestSeqSqrtPool2D): - def set_lod(self): - return [[0, 8, 0, 5, 0]] - - class TestSeqMaxPool2D(TestSeqAvgPool2D): - def set_lod(self): - return [[4, 1, 3, 5]] - def set_data(self): self.op_type = 'sequence_pool' x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32') - self.lod = self.set_lod() - self.inputs = {'X': (x, self.lod)} - offset = convert_to_offset(self.lod) + lod = [[4, 1, 3, 5]] + self.inputs = {'X': (x, lod)} + offset = convert_to_offset(lod) for i in range(len(offset[0]) - 1): l = offset[0][i + 1] - offset[0][i] - if l == 0: - continue x[offset[0][i] + np.random.randint(l), :] += 1.0 - out = np.zeros((len(self.lod[0]), 3, 11)).astype('float32') + out = np.zeros((4, 3, 11)).astype('float32') self.outputs = {'Out': out} return x, offset, out def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.0, 'pooltype': "MAX"} + self.attrs = {'pooltype': "MAX"} for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = self.attrs["pad_value"] * np.ones((3, 11)) - continue sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], (-1, 3 * 11)) out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11)) -class TestSeqMaxPool2DLen0(TestSeqMaxPool2D): - def set_lod(self): - return [[0, 3, 0, 10, 0]] - - class TestSeqMaxPool2DInference(TestSeqMaxPool2D): def compute(self, x, offset, out): - self.attrs = {"pad_value": 1.0, 'pooltype': "MAX", 'is_test': True} + self.attrs = {'pooltype': "MAX", 'is_test': True} for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = self.attrs["pad_value"] * np.ones((3, 11)) - else: - sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], - (-1, 3 * 11)) - out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11)) + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 11)) + out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11)) def test_check_grad(self): """Grad computation does not apply to Sequence MAX @@ -300,43 +203,22 @@ class TestSeqMaxPool2DInference(TestSeqMaxPool2D): return -class TestSeqMaxPool2DInferenceLen0(TestSeqMaxPool2DInference): - def set_lod(self): - return [[0, 3, 0, 10, 0]] - - class TestSeqLastPool2D(TestSeqAvgPool2D): def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.0, 'pooltype': "LAST"} + self.attrs = {'pooltype': "LAST"} for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = self.attrs["pad_value"] * np.ones((3, 17)) - else: - sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], - (-1, 3 * 17)) - out[i] = np.reshape(sub_x[-1, :], (3, 17)) - - -class TestSeqLastPool2DLen0(TestSeqLastPool2D): - def set_lod(self): - return [[0, 3, 0, 1, 9, 0]] + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) + out[i] = np.reshape(sub_x[-1, :], (3, 17)) class TestSeqFirstPool2D(TestSeqAvgPool2D): def compute(self, x, offset, out): - self.attrs = {"pad_value": 0.0, 'pooltype': "FIRST"} + self.attrs = {'pooltype': "FIRST"} for i in range(len(offset[0]) - 1): - if offset[0][i] == offset[0][i + 1]: - out[i] = self.attrs["pad_value"] * np.ones((3, 17)) - else: - sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], - (-1, 3 * 17)) - out[i] = np.reshape(sub_x[0, :], (3, 17)) - - -class TestSeqFirstPool2DLen0(TestSeqFirstPool2D): - def set_lod(self): - return [[0, 3, 0, 3, 7, 0]] + sub_x = np.reshape(x[offset[0][i]:offset[0][i + 1], :], + (-1, 3 * 17)) + out[i] = 
np.reshape(sub_x[0, :], (3, 17)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index b8a2515e716..f6a658cb1b7 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -98,7 +98,6 @@ class TestSyncBatchNormOpTraining(unittest.TestCase): ##################################################################### # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU - assert core.get_cuda_device_count() > 1 main, startup, outs = self.build_program(place, layout, seed, True, only_forward) exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py index 4615511ed85..3c974ea460c 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor.py @@ -236,26 +236,6 @@ class TestTensor(unittest.TestCase): place = core.CUDAPlace(0) self.run_sliece_tensor(place) - def test_print_tensor(self): - scope = core.Scope() - var = scope.var("test_tensor") - place = core.CPUPlace() - tensor = var.get_tensor() - tensor._set_dims([10, 10]) - tensor._alloc_int(place) - tensor_array = numpy.array(tensor) - self.assertEqual((10, 10), tensor_array.shape) - tensor_array[0, 0] = 1 - tensor_array[2, 2] = 2 - tensor.set(tensor_array, place) - print(tensor) - self.assertTrue(isinstance(str(tensor), str)) - - if core.is_compiled_with_cuda(): - tensor.set(tensor_array, core.CUDAPlace(0)) - print(tensor) - self.assertTrue(isinstance(str(tensor), str)) - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index a6c43bb8373..35e4af2d098 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest -from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_, in_dygraph_mode +from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_ import paddle.fluid as fluid import paddle.fluid.core as core import numpy as np @@ -145,24 +145,6 @@ class TestVariable(unittest.TestCase): if core.is_compiled_with_cuda(): self._test_slice(core.CUDAPlace(0)) - def _tostring(self): - b = default_main_program().current_block() - w = b.create_var(dtype="float64", lod_level=0) - print(w) - self.assertTrue(isinstance(str(w), str)) - - if core.is_compiled_with_cuda(): - wc = b.create_var(dtype="int", lod_level=0) - print(wc) - self.assertTrue(isinstance(str(wc), str)) - - def test_tostring(self): - with fluid.dygraph.guard(): - self._tostring() - - with fluid.program_guard(default_main_program()): - self._tostring() - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_version.py b/python/paddle/fluid/tests/unittests/test_version.py index 42a0e5c802c..a3927ef11d3 100644 --- a/python/paddle/fluid/tests/unittests/test_version.py +++ b/python/paddle/fluid/tests/unittests/test_version.py @@ -30,18 +30,14 @@ class VersionTest(unittest.TestCase): self._commit_regex = "[0-9a-f]{5,49}" def test_check_output(self): - # check commit format - self.assertTrue(re.match(self._commit_regex, fluid_version.commit)) self.assertTrue(isinstance(fluid_version.istaged, 
bool)) # check version format if fluid_version.istaged: - self.assertEqual(fluid_version.major, 0) - self.assertEqual(fluid_version.minor, 0) - self.assertEqual(fluid_version.patch, "0") - self.assertEqual(fluid_version.rc, 0) - self.assertEqual(fluid_version.full_version, "0.0.0") + self.assertEqual(fluid_version.full_version, "latest") else: + # check commit format + self.assertTrue(re.match(self._commit_regex, fluid_version.commit)) self.assertTrue(re.match(self._major_regex, fluid_version.major)) self.assertTrue(re.match(self._minor_regex, fluid_version.minor)) self.assertTrue(re.match(self._patch_regex, fluid_version.patch)) diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py index 62e725a04a1..ec0592baa22 100644 --- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py +++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py @@ -241,20 +241,20 @@ class TestWarpCTCOpCase1(TestWarpCTCOp): self.use_cudnn = False -# TODO: fix this test failed cuda9/10 manylinux images -# class TestCudnnCTCOp(TestWarpCTCOp): -# def config(self): -# self.batch_size = 4 -# self.num_classes = 8 -# self.logits_lod = [[4, 1, 3, 3]] -# self.labels_lod = [[3, 1, 4, 4]] -# self.blank = 0 -# self.norm_by_times = False -# self.use_cudnn = True -# def test_check_grad(self): -# if sys.version_info < (3, 0): -# self.outputs['WarpCTCGrad'] = self.gradient -# self.check_grad(["Logits"], "Loss", max_relative_error=0.01) +class TestCudnnCTCOp(TestWarpCTCOp): + def config(self): + self.batch_size = 4 + self.num_classes = 8 + self.logits_lod = [[4, 1, 3, 3]] + self.labels_lod = [[3, 1, 4, 4]] + self.blank = 0 + self.norm_by_times = False + self.use_cudnn = True + + def test_check_grad(self): + self.outputs['WarpCTCGrad'] = self.gradient + self.check_grad(["Logits"], "Loss", max_relative_error=0.01) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py index 806d09895ad..c742ee002aa 100644 --- a/python/paddle/fluid/trainer_desc.py +++ b/python/paddle/fluid/trainer_desc.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__all__ = ['TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer'] +__all__ = ['TrainerDesc', 'MultiTrainer', 'DistMultiTrainer'] # can be initialized from train_desc, @@ -61,12 +61,9 @@ class TrainerDesc(object): def _set_program(self, program): self._program = program - def _set_use_cvm(self, use_cvm=False): - self.proto_desc.use_cvm = use_cvm - def _desc(self): from google.protobuf import text_format - return self.proto_desc.SerializeToString() + return text_format.MessageToString(self.proto_desc) class MultiTrainer(TrainerDesc): @@ -102,22 +99,3 @@ class DistMultiTrainer(TrainerDesc): self._device_worker._set_infer(self._infer) self._device_worker._set_program(self._program) self._device_worker._gen_worker_desc(self.proto_desc) - - -class PipelineTrainer(TrainerDesc): - def __init__(self): - super(PipelineTrainer, self).__init__() - pass - - def _set_program(self, program): - super(PipelineTrainer, self)._set_program(program) - self._program = program - - def _gen_trainer_desc(self): - super(PipelineTrainer, self)._gen_trainer_desc() - self.proto_desc.class_name = "PipelineTrainer" - if self._program == None: - raise RuntimeError("None Program") - self._device_worker._set_infer(self._infer) - self._device_worker._set_program(self._program) - self._device_worker._gen_worker_desc(self.proto_desc) diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index 67d240cccd6..871b663663e 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer -from .device_worker import Hogwild, DownpourSGD, Section +from .trainer_desc import MultiTrainer, DistMultiTrainer +from .device_worker import Hogwild, DownpourSGD __all__ = ["TrainerFactory"] @@ -35,9 +35,7 @@ class TrainerFactory(object): device_worker_class = opt_info["device_worker"] trainer = globals()[trainer_class]() device_worker = globals()[device_worker_class]() - if "fleet_desc" in opt_info: - device_worker._set_fleet_desc(opt_info["fleet_desc"]) - trainer._set_fleet_desc(opt_info["fleet_desc"]) - trainer._set_use_cvm(opt_info["use_cvm"]) + device_worker._set_fleet_desc(opt_info["fleet_desc"]) trainer._set_device_worker(device_worker) + trainer._set_fleet_desc(opt_info["fleet_desc"]) return trainer diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index feb32773821..60f74bb6264 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -146,11 +146,6 @@ class DistributeTranspilerConfig(object): We can use bandwidth effiently when data size is larger than 2MB.If you want to change it, please be sure you have read the slice_variable function. - Examples: - .. 
code-block:: python - - config = fluid.DistributeTranspilerConfig() - config.slice_var_up = True """ slice_var_up = True @@ -163,16 +158,7 @@ class DistributeTranspilerConfig(object): wait_port = True # split the send recv var in runtime runtime_split_send_recv = False - sync_mode = True - - nccl_comm_num = 1 - #The picture here illustrates the principle: - #https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396 - use_hierarchical_allreduce = False - #Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu cards' number in most cases. - hierarchical_allreduce_inter_nranks = 0 - #Nccl ranks bewteen nodes when use hierarchical allreduce, it's setted to nodes number. - hierarchical_allreduce_exter_nranks = 0 + sync_mode = None class DistributeTranspiler(object): @@ -195,23 +181,13 @@ class DistributeTranspiler(object): Examples: .. code-block:: python - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_loss = fluid.layers.mean(cost) - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_loss) - # for pserver mode pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" current_endpoint = "192.168.0.1:6174" trainer_id = 0 trainers = 4 - role = "PSERVER" + role = os.getenv("PADDLE_TRAINING_ROLE") t = fluid.DistributeTranspiler() t.transpile( trainer_id, pservers=pserver_endpoints, trainers=trainers) @@ -223,17 +199,14 @@ class DistributeTranspiler(object): trainer_program = t.get_trainer_program() # for nccl2 mode - trainer_num = 2 - trainer_id = 0 config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" - trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" t = fluid.DistributeTranspiler(config=config) - t.transpile(trainer_id=trainer_id, trainers=trainer_endpoints, current_endpoint="192.168.0.1:6174") + t.transpile(trainer_id, workers=workers, current_endpoint=curr_ep) exe = fluid.ParallelExecutor( - use_cuda=True, - loss_name=avg_loss.name, - num_trainers=trainer_num, + use_cuda, + loss_name=loss_var.name, + num_trainers=len(trainers.split(",)), trainer_id=trainer_id ) """ @@ -270,36 +243,14 @@ class DistributeTranspiler(object): nccl_id_var = startup_program.global_block().create_var( name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) - - for i in range(1, self.config.nccl_comm_num): - startup_program.global_block().create_var( - name="NCCLID_{}".format(i), - persistable=True, - type=core.VarDesc.VarType.RAW) - - if self.config.use_hierarchical_allreduce: - for i in range(0, self.config.nccl_comm_num): - startup_program.global_block().create_var( - name="Hierarchical_inter_NCCLID_{}".format(i), - persistable=True, - type=core.VarDesc.VarType.RAW) - startup_program.global_block().create_var( - name="Hierarchical_exter_NCCLID_{}".format(i), - persistable=True, - type=core.VarDesc.VarType.RAW) - startup_program.global_block().append_op( type="gen_nccl_id", inputs={}, outputs={"NCCLID": nccl_id_var}, attrs={ - "trainers": trainers.split(","), - "trainer_id": trainer_id, - "nccl_comm_num": self.config.nccl_comm_num, - "use_hierarchical_allreduce": - self.config.use_hierarchical_allreduce, - "hierarchical_allreduce_inter_nranks": - self.config.hierarchical_allreduce_inter_nranks + "endpoint": current_endpoint, + "endpoint_list": worker_endpoints, + 
"trainer_id": trainer_id }) return nccl_id_var else: @@ -338,7 +289,7 @@ class DistributeTranspiler(object): startup_program=None, current_endpoint="127.0.0.1:6174"): """ - Run the transpiler. Transpile the input program. + Run the transpiler. Args: trainer_id (int): id for current trainer worker, if you have @@ -358,17 +309,6 @@ class DistributeTranspiler(object): current_endpoint (str): need pass current endpoint when transpile as nccl2 distributed mode. In pserver mode this argument is not used. - - Examples: - .. code-block:: python - - transpiler = fluid.DistributeTranspiler() - t.transpile( - trainer_id=0, - pservers="127.0.0.1:7000,127.0.0.1:7001", - trainers=2, - sync_mode=False, - current_endpoint="127.0.0.1:7000") """ if program is None: program = default_main_program() @@ -381,12 +321,6 @@ class DistributeTranspiler(object): if self.config.mode == "nccl2": assert (isinstance(trainers, str)) self.origin_program._trainers_endpoints = trainers.split(",") - self.origin_program._nccl_comm_num = self.config.nccl_comm_num - self.origin_program._use_hierarchical_allreduce = self.config.use_hierarchical_allreduce - self.origin_program._hierarchical_allreduce_inter_nranks = \ - int(self.config.hierarchical_allreduce_inter_nranks) - self.origin_program._hierarchical_allreduce_exter_nranks = \ - int(self.config.hierarchical_allreduce_exter_nranks) self._transpile_nccl2( trainer_id, trainers, @@ -396,7 +330,7 @@ class DistributeTranspiler(object): return self.trainer_num = trainers - self.sync_mode = sync_mode + self.sync_mode = self.config.sync_mode if self.config.sync_mode else sync_mode self.trainer_id = trainer_id pserver_endpoints = pservers.split(",") self.pserver_endpoints = pserver_endpoints @@ -649,18 +583,6 @@ class DistributeTranspiler(object): Returns: Program: trainer side program. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - #this is an example, find available endpoints in your case - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - trainer_id = 0 - trainers = 4 - t = fluid.DistributeTranspiler() - t.transpile(trainer_id, trainers=trainers, pservers=pserver_endpoints) - trainer_program = t.get_trainer_program() """ # remove optimize ops and add a send op to main_program # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? @@ -786,20 +708,6 @@ class DistributeTranspiler(object): Returns: Program: the program for current parameter server to run. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - #this is an example, find available endpoints in your case - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) - pserver_program = t.get_pserver_program(current_endpoint) """ # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers. # NOTE: assume blocks of the same variable is not distributed @@ -1043,20 +951,6 @@ class DistributeTranspiler(object): Returns: tuple: (main_program, startup_program), of type "Program" - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - #this is an example, find available endpoints in your case - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) - pserver_program, pserver_startup_program = t.get_pserver_programs(current_endpoint) """ pserver_prog = self.get_pserver_program(endpoint) pserver_startup = self.get_startup_program( @@ -1082,21 +976,6 @@ class DistributeTranspiler(object): Returns: Program: parameter server side startup program. - - Examples: - .. code-block:: python - - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - - t = fluid.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) - pserver_program = t.get_pserver_program(current_endpoint) - pserver_startup_program = t.get_startup_program(current_endpoint, - pserver_program) """ s_prog = Program() orig_s_prog = self.startup_program diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py index 8917fb75128..8a527e72fb9 100644 --- a/python/paddle/fluid/transpiler/inference_transpiler.py +++ b/python/paddle/fluid/transpiler/inference_transpiler.py @@ -76,7 +76,6 @@ class InferenceTranspiler(object): self._fuse_conv_relu_mkldnn( program) # ResNet residual block merging self._fuse_bn_relu_mkldnn(program) - self._fuse_mul_add_mkldnn(program) self._is_test_pass(program) @@ -388,62 +387,6 @@ class InferenceTranspiler(object): # And a better solution will be considered later. program = program.clone() - def _fuse_mul_add_mkldnn(self, program): - ''' - Transpile the program by fusing Mul+Add layers to FC layer with the MKL-DNN inner product. - The MUL following a Elementwise_add layer can be replaced by the MKL-DNN FC. - The Elementwise add's bias input 'Y' has to be added into the - MKL-DNN-based FC input 'Bias'. - The operator transformation is: - - before: - - MUL->elementwise_add -> any_other_op - - after: - - FC -> any_other_op - The transpile stages are: - 1. insert a new MKL-DNN-based FC operator with `Bias` input - taken from the Elementwise add's input 'Y' (bias), - 2. fuse the parameters of MUL and Elemenwise add, - 3. remove the MUL, elementwise_add operators, - 4. make the input of the deleted Elementwise add operator to be the input of the - new FC operator, - 5. 
remove unused variables, - Args: - program (Program): program to transpile - ''' - - self.block = program.block(0) - self.input_map = {} # store the input names should be adjusted - i = 0 - while i < len(self.block.ops): - # find a elementwise add op - if self.block.ops[i].type == 'elementwise_add': - add_op = self.block.ops[i] - add_idx = i - mul_idx = -1 - # find the preceding mul op - for j in reversed(range(add_idx)): - if self.block.ops[j].type == 'mul': - mul_out_name = self.block.ops[j].output_arg_names[0] - if self.block.ops[j].output_arg_names[ - 0] in add_op.input_arg_names: - mul_op = self.block.ops[j] - mul_idx = j - break - if mul_idx < 0: - i += 1 - continue - # create and insert a new fc op - fc_op_new = self._insert_fc_op(add_idx + 1, mul_op, add_op) - # remove the old operators - self.block._remove_op(add_idx) - self.block._remove_op(mul_idx) - # restart scanning for elementwise add from the deleted mul's index - i = mul_idx - i += 1 - self._adjust_input() - self._remove_unused_var() - program = program.clone() - # ====================== private transpiler functions ===================== def _insert_bias_op(self, index, current_op, bn_op): ''' @@ -566,42 +509,6 @@ class InferenceTranspiler(object): outputs={"Output": out_var}, attrs=attrs) - def _insert_fc_op(self, index, mul_op, add_op): - ''' - Construct a new FC operator by copying the old Mul and adding the - 'Y' input taken from the Elementwise add's input 'Y'. - :param index: insert location of FC - :type index: Int - :param mul_op: MUL operator to be copied - :type mul_op: Operator - :param add_op: Elementwise add operator taken bias from - :type add_op: Operator - :return: fc_op_new - :type: Operator - ''' - - def get_op_outputs(op, names): - result = {} - for name in names: - result[name] = self.block.var(op.output(name)[0]) - return result - - fc_inputs = {} - fc_inputs['Input'] = self.block.var(mul_op.input('X')[0]) - fc_inputs['W'] = self.block.var(mul_op.input('Y')[0]) - fc_inputs['Bias'] = self.block.var(add_op.input('Y')[0]) - fc_outputs = get_op_outputs(add_op, ['Out']) - fc_attrs = {} - fc_attrs['use_mkldnn'] = True - - fc_op_new = self.block._insert_op( - index, - type='fc', - inputs=fc_inputs, - outputs=fc_outputs, - attrs=fc_attrs) - return fc_op_new - def _fuse_conv_eltwise(self, index, conv_op, eltwise_op): ''' fuse the conv op with elementwise_add diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index 64c8bc04860..c434423bae7 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -498,57 +498,17 @@ def memory_optimize(input_program, print_log=False, level=0, skip_grads=False): - """ - | Legacy memory optimization strategy, reduce total memory consumption by reuse variable memory between different operators. - | Simple sample to explain the algorithm: - - .. code-block:: python - - c = a + b # assume this is the last time a is used - d = b * c - - | since **a** will not be used anymore after **"c = a + b"**, and the size of **a** and **d** are the same, - we can use variable **a** to replace variable **d**, so actually we can optimize the above code to below: - - .. 
code-block:: python - - c = a + b - a = b * c - - - | Please notice that, in this legacy design, we are using variable **a** to replace **d** directly, which means - after you call this API, some variables may disappear, and some variables may hold unexpected values, like - the above case, actually **a** holds the value of **d** after execution. - - | So to protect important variables from being reused/removed in the optimization, we provide skip_opt_set - to allow you specify a variable whitelist. - The variables in the skip_opt_set will not be affected by memory_optimize API. - - Note: - | **This API is deprecated, please avoid to use it in your new code.** - | Does not support operators which will create sub-block like While, IfElse etc. - + """Optimize memory by reusing var memory. + + Note: it doesn't not support subblock nested in subblock. + Args: input_program(str): Input Program skip_opt_set(set): vars wil be skipped in memory optimze print_log(bool): whether to print debug log. - level(int): 0 or 1, 0 means we replace a with b only when a.size == b.size, 1 means we can replace a with b if a.size <= b.size + level(int): If level=0, reuse if the shape is completely equal, o Returns: None - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - main_prog = fluid.Program() - startup_prog = fluid.Program() - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - - exe.run(startup_prog) - fluid.memory_optimize(main_prog) - """ sys.stderr.write('memory_optimize is deprecated. ' 'Use CompiledProgram and Executor\n') @@ -605,18 +565,6 @@ def release_memory(input_program, skip_opt_set=None): skip_opt_set(set): vars wil be skipped in memory optimze Returns: None - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - # build network - # ... - - # deprecated API - fluid.release_memory(fluid.default_main_program()) - """ cfgs = _get_cfgs(input_program) input_program._is_mem_optimized = True diff --git a/python/paddle/fluid/transpiler/ps_dispatcher.py b/python/paddle/fluid/transpiler/ps_dispatcher.py index a04f6c2c794..6a6d14a69ba 100644 --- a/python/paddle/fluid/transpiler/ps_dispatcher.py +++ b/python/paddle/fluid/transpiler/ps_dispatcher.py @@ -50,16 +50,6 @@ class HashName(PSDispatcher): Args: pserver_endpoints (list): list of endpoint(ip:port). - - Examples: - .. code-block:: python - - pserver_endpoints = ["127.0.0.1:6007", "127.0.0.1:6008"] - vars = ["var1","var2","var3","var4","var5"] - - rr = RoundRobin(pserver_endpoints) - rr.dispatch(vars) - """ def __init__(self, pserver_endpoints): @@ -84,16 +74,6 @@ class RoundRobin(PSDispatcher): Args: pserver_endpoints (list): list of endpoint(ip:port). - - Examples: - .. code-block:: python - - pserver_endpoints = ["127.0.0.1:6007", "127.0.0.1:6008"] - vars = ["var1","var2","var3","var4","var5"] - - rr = RoundRobin(pserver_endpoints) - rr.dispatch(vars) - """ def __init__(self, pserver_endpoints): diff --git a/python/paddle/fluid/unique_name.py b/python/paddle/fluid/unique_name.py index 9e3cd063092..324257c13ff 100644 --- a/python/paddle/fluid/unique_name.py +++ b/python/paddle/fluid/unique_name.py @@ -55,75 +55,10 @@ generator = UniqueNameGenerator() def generate(key): - """ - Generate unique name with prefix key. - - Args: - key(str): The generated name prefix. All generated name will be - started with this prefix. - - Returns: - str: A unique string with the prefix key. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - name1 = fluid.unique_name.generate('fc') - name2 = fluid.unique_name.generate('fc') - # The result is fc_0, fc_1 - print name1, name2 - """ - return generator(key) - - -# FIXME(zjl): The previous naming rule in static graph would -# cause memory leak in dygraph mode. It is because the previous -# naming rule would use `conv_0.tmp` as the key, and in dygraph -# mode, `conv_i` increases as batch increases. Thus, keys would -# increase in a way like `conv_0.tmp`, `conv_1.tmp`, .... -# Not find a better way to fix this bug in dygraph mode. In TF, -# variable name is meaningless in eager execution mode, and in -# PyTorch, there is no variable name at all. Maybe we should -# discard variable name in dygraph mode. -# -# Another concern is that save/load interfaces. Usually, user -# would save model in static graph mode, and load it in dygraph -# mode. Therefore, we keep the variable name of Parameter currently. -# -# Please fix me if a better method is found. -def generate_with_ignorable_key(key): - from .framework import in_dygraph_mode - if in_dygraph_mode(): - key = "tmp" - return generator(key) def switch(new_generator=None): - """ - Switch the Global namespace to a new namespace. - - Args: - new_generator(None|UniqueNameGenerator): A new UniqueNameGenerator. - - Returns: - UniqueNameGenerator: The previous UniqueNameGenerator. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - name1 = fluid.unique_name.generate('fc') - name2 = fluid.unique_name.generate('fc') - # The result is fc_0, fc_1 - print name1, name2 - - fluid.unique_name.switch() - name2 = fluid.unique_name.generate('fc') - # The result is fc_0 - print name2 - """ global generator old = generator if new_generator is None: @@ -135,32 +70,6 @@ def switch(new_generator=None): @signature_safe_contextmanager def guard(new_generator=None): - """ - Change the global namespace with `with` statement. - - Args: - new_generator(None|str|bytes): New name of global namespace. - Note that str in Python2 was spilted into str and bytes in Python3, - so here are two types. Default is None. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - with fluid.unique_name.guard(): - name_1 = fluid.unique_name.generate('fc') - with fluid.unique_name.guard(): - name_2 = fluid.unique_name.generate('fc') - # The result is fc_0, fc_0 - print name_1, name_2 - - with fluid.unique_name.guard('A'): - name_1 = fluid.unique_name.generate('fc') - with fluid.unique_name.guard('B'): - name_2 = fluid.unique_name.generate('fc') - # The result is Afc_0, Bfc_0 - print name_1, name_2 - """ if isinstance(new_generator, six.string_types): new_generator = UniqueNameGenerator(new_generator) elif isinstance(new_generator, six.binary_type): diff --git a/python/requirements.txt b/python/requirements.txt index f971587bd7c..ce56462fac9 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,19 +1,15 @@ -requests>=2.20.0 +requests==2.9.2 numpy>=1.12 protobuf>=3.1.0 recordio>=0.1.0 -matplotlib<=2.2.4 ; python_version<"3.6" -scipy>=0.19.0, <=1.2.1 ; python_version<"3.5" -nltk>=3.2.2, <=3.4 ; python_version<"3.5" -matplotlib ; python_version>="3.6" -scipy ; python_version>="3.5" -nltk ; python_version>="3.5" +matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib rarfile +scipy>=0.19.0 Pillow +nltk>=3.2.2 graphviz six funcsigs pyyaml decorator prettytable -py-cpuinfo==5.0.0 diff --git a/python/setup.py.in b/python/setup.py.in index a392e230709..0ce98481f04 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -118,12 +118,9 @@ packages=['paddle', 'paddle.fluid.contrib.slim.prune', 'paddle.fluid.contrib.slim.quantization', 'paddle.fluid.contrib.slim.distillation', - 'paddle.fluid.contrib.slim.nas', - 'paddle.fluid.contrib.slim.searcher', 'paddle.fluid.contrib.utils', 'paddle.fluid.contrib.extend_optimizer', 'paddle.fluid.contrib.mixed_precision', - 'paddle.fluid.contrib.layers', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details', 'paddle.fluid.incubate', @@ -131,7 +128,7 @@ packages=['paddle', 'paddle.fluid.incubate.fleet', 'paddle.fluid.incubate.fleet.base', 'paddle.fluid.incubate.fleet.parameter_server', - 'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler', + 'paddle.fluid.incubate.fleet.parameter_server.distributed_transpiler', 'paddle.fluid.incubate.fleet.parameter_server.pslib', 'paddle.fluid.incubate.fleet.collective'] @@ -145,9 +142,7 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: paddle_bins = '' if not '${WIN32}': paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] -package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + (ext_name if os.name != 'nt' else '.pyd')]} -if '${HAS_NOAVX_CORE}' == 'ON': - package_data['paddle.fluid'] += ['core_noavx' + (ext_name if os.name != 'nt' else '.pyd')] +package_data={'paddle.fluid': ['core' + (ext_name if os.name != 'nt' else '.pyd')]} package_dir={ '': '${PADDLE_BINARY_DIR}/python', @@ -169,10 +164,6 @@ if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) package_data['paddle.libs']+=[('libmklml_intel' if os.name != 'nt' else 'mklml') + ext_name, ('libiomp5' if os.name != 'nt' else 'libiomp5md') + ext_name] - # mklml has dependency to vs runtime library - if os.name == 'nt': - shutil.copy('${MKLML_SHARED_LIB_DEPS}', libs_path) - package_data['paddle.libs'] += ['msvcr120.dll'] else: if os.name == 'nt': # copy the openblas.dll @@ -215,19 +206,19 @@ if os.path.isfile(libs_path+'/__init__.py'): os.remove(libs_path+'/__init__.py') package_dir['paddle.libs']=libs_path -# change rpath of 
${FLUID_CORE_NAME}.ext, add $ORIGIN/../libs/ to it. +# change rpath of core.ext, add $ORIGIN/../libs/ to it. # The reason is that libwarpctc.ext, libiomp5.ext etc are in paddle.libs, and -# ${FLUID_CORE_NAME}.ext is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. +# core.ext is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213 if '${CMAKE_BUILD_TYPE}' == 'Release': if os.name != 'nt': - # only change rpath in Release mode, since in Debug mode, ${FLUID_CORE_NAME}.xx is too large to be changed. + # only change rpath in Release mode, since in Debug mode, core.xx is too large to be changed. if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + ext_name + command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name else: - command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + ext_name + command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core" + ext_name if os.system(command) != 0: - raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command)) + raise Exception("patch core.%s failed, command: %s" % (ext_name, command)) ext_modules = [Extension('_foo', ['stub.cc'])] if os.name == 'nt': diff --git a/run.md b/run.md new file mode 100644 index 00000000000..c05af009130 --- /dev/null +++ b/run.md @@ -0,0 +1,4 @@ +1. mkdir build & cd build +2. ../paddle/fluid/lite/tools/build.sh cmake_x86 +3. make test_step_rnn_lite_x86 -j +4. ./paddle/fluid/lite/api/test_step_rnn_lite_x86 --model_dir= --warmup=10000 --repeats=10000 diff --git a/tools/document_preview.sh b/tools/document_preview.sh index 17d9a1d10a3..d0e9b3178a6 100755 --- a/tools/document_preview.sh +++ b/tools/document_preview.sh @@ -1,12 +1,10 @@ #!/bin/bash -PADDLE_ROOT=/home -mkdir ${PADDLE_ROOT} +PADDLE_ROOT=/paddle cd ${PADDLE_ROOT} -pip install /paddle/build/opt/paddle/share/wheels/*.whl git clone https://github.com/PaddlePaddle/FluidDoc git clone https://github.com/tianshuo78520a/PaddlePaddle.org.git -cd ${PADDLE_ROOT}/FluidDoc/doc/fluid/api -sh gen_doc.sh +sh ${PADDLE_ROOT}/FluidDoc/doc/fluid/api/gen_doc.sh +pip install ${PADDLE_ROOT}/build/opt/paddle/share/wheels/*.whl apt-get update && apt-get install -y python-dev build-essential cd ${PADDLE_ROOT}/PaddlePaddle.org/portal pip install -r requirements.txt diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index ebddbefaf9d..c37a9a92e65 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -38,12 +38,6 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt -RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install setuptools -U && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install setuptools -U && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install setuptools -U && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install setuptools -U && 
\ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install setuptools -U - RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \ diff --git a/tools/manylinux1/build_all.sh b/tools/manylinux1/build_all.sh index d9801417675..caf21722158 100755 --- a/tools/manylinux1/build_all.sh +++ b/tools/manylinux1/build_all.sh @@ -25,7 +25,7 @@ sed 's//NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -genco docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp . docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -sed 's//10.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \ +sed 's//10.0-devel-centos6/g' Dockerfile.x64 | \ sed 's//NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75"/g'> Dockerfile.tmp docker build -t ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7 -f Dockerfile.tmp . docker push ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7 diff --git a/tools/manylinux1/build_scripts/install_nccl2.sh b/tools/manylinux1/build_scripts/install_nccl2.sh index 62c7a21f300..43a99d8287b 100644 --- a/tools/manylinux1/build_scripts/install_nccl2.sh +++ b/tools/manylinux1/build_scripts/install_nccl2.sh @@ -1,15 +1,13 @@ #!/bin/bash VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //") -if [ "$VERSION" == "10.0" ]; then - DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" -elif [ "$VERSION" == "9.0" ]; then +if [ "$VERSION" == "9.0" ]; then DEB="nccl-repo-ubuntu1604-2.1.15-ga-cuda9.0_1-1_amd64.deb" + URL="http://nccl2-deb.gz.bcebos.com/nccl-repo-ubuntu1604-2.1.15-ga-cuda9.0_1-1_amd64.deb" else DEB="nccl-repo-ubuntu1604-2.1.15-ga-cuda8.0_1-1_amd64.deb" + URL="http://nccl2-deb.gz.bcebos.com/nccl-repo-ubuntu1604-2.1.15-ga-cuda8.0_1-1_amd64.deb" fi -URL="http://nccl2-deb.gz.bcebos.com/$DEB" - DIR="/nccl2" mkdir -p $DIR # we cached the nccl2 deb package in BOS, so we can download it with wget diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 0de2e4f81f0..6a262529b5c 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -28,7 +28,7 @@ import hashlib member_dict = collections.OrderedDict() -experimental_namespace = {"paddle.fluid.dygraph", "paddle.fluid.LoDTensorset"} +experimental_namespace = {"paddle.fluid.dygraph"} def md5(doc): @@ -38,8 +38,6 @@ def md5(doc): def visit_member(parent_name, member): - if parent_name + member.__name__ in experimental_namespace: - return cur_name = ".".join([parent_name, member.__name__]) if inspect.isclass(member): for name, value in inspect.getmembers(member):
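To close, a simplified sketch of what the tools/print_signatures.py hunk above is about; the recursive module walk of the real script is omitted, and md5_of_doc/record_member are illustrative names under that assumption. Each public member is fingerprinted by its argument list plus an md5 of its docstring, so both signature and documentation changes surface when the generated spec is diffed.

    import hashlib
    import inspect

    def md5_of_doc(doc):
        # hash the docstring so documentation edits show up in the spec diff
        return hashlib.md5((doc or '').encode('utf-8')).hexdigest()

    def record_member(member_dict, parent_name, member):
        cur_name = ".".join([parent_name, member.__name__])
        if inspect.isfunction(member):
            args = inspect.getfullargspec(member).args
            member_dict[cur_name] = "%s (%s, '%s')" % (
                cur_name, args, md5_of_doc(member.__doc__))

The removed experimental_namespace check in the hunk above performed an early return for members under namespaces such as paddle.fluid.dygraph, excluding them from this fingerprinting.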