Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into doc

27f7a726 · ceci3 · 3f5f5ed3 · c7f1f3ed · 27f7a726 · 27f7a726
130 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
        "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
 message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
+message(STATUS "AR tools: ${CMAKE_AR}")
 if(WIN32)
    set(CMAKE_SUPPRESS_REGENERATION ON)
    set(CMAKE_STATIC_LIBRARY_PREFIX lib)

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -12,7 +12,7 @@ paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], va
 paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2'))
 paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
-paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45'))
+paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d'))
 paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0'))
 paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2'))
 paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -68,7 +68,7 @@ paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'unifor
 paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee'))
 paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29'))
 paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1929058262994f212620599c63aea6bd'))
+paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '424e898365195e3ccbc2e7dc8b63605e'))
 paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a'))
 paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6'))
 paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d'))
@@ -330,7 +330,8 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes',
 paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1'))
 paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
 paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
-paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691'))
+paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8'))
+paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340'))
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
 paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
 paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d'))
@@ -367,7 +368,7 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init',
 paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7'))
 paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47'))
 paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa'))
-paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)), ('document', '14b39f1fcd5667ff556b1aad94357d1d'))
+paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d'))
 paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd'))
 paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884'))
@@ -392,9 +393,9 @@ paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold'
 paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d'))
 paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645'))
-paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '11fbf7e8dd2289805de291b453a33ee7'))
+paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67'))
-paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '5b5577bb3d24070da819674255d16196'))
+paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f'))
-paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4efbd93876832d4d35497cdbc7a1e6d8'))
+paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b'))
 paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e'))
 paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634'))
@@ -493,7 +494,7 @@ paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinne
 paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '0eed2f198dc73c08a41b61edbc755753'))
+paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'f8f3df23c5633c614db781a91b81fb62'))
 paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '459e316301279dfd82001b46f0b8ffca'))
 paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '543863d1f9d4853758adb613b8659e85'))
 paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -517,11 +518,11 @@ paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', de
 paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4'))
 paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d'))
 paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad'))
-paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '283bc0b8a0e26ae186b8b9bee4aec560'))
+paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796'))
 paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '5f80a7ed70052f01665e4c74acccfa69'))
+paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45'))
 paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0'))
 paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada'))
-paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', '44fe286ab6175a5464d3a961a68c266a'))
+paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', 'f45fcb7add066c8e042c6774fc7c3db2'))
-paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', '11b3704ea42cfd537953387a7e58dae8'))
+paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', 'b4a94ee0e2cefb495619275c2f8c61d2'))
--- a/paddle/fluid/framework/details/graph_test_base.h
+++ b/paddle/fluid/framework/details/graph_test_base.h
@@ -68,11 +68,11 @@ class SplitOpMaker : public OpProtoAndCheckerMaker {
 class DummyVarTypeInference : public VarTypeInference {
 public:
-  void operator()(const OpDesc& op_desc, BlockDesc* block) const override {
+  void operator()(framework::InferVarTypeContext* ctx) const override {
-    auto& inputs = op_desc.Input("X");
+    auto& inputs = ctx->Input("X");
-    auto type = block->Var(inputs.front())->GetType();
+    auto type = ctx->GetType(inputs.front());
-    auto out_var_name = op_desc.Output("Out").front();
+    auto out_var_name = ctx->Output("Out").front();
-    block->Var(out_var_name)->SetType(type);
+    ctx->SetType(out_var_name, type);
  }
 };

--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -16,6 +16,8 @@ limitations under the License. */
 #include <string>
 #include <tuple>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/grad_op_desc_maker.h"
 #include "paddle/fluid/framework/inplace_op_inference.h"
@@ -127,9 +129,9 @@ struct OpInfoFiller<T, kGradOpDescMaker> {
 template <typename T>
 struct OpInfoFiller<T, kVarTypeInference> {
  void operator()(const char* op_type, OpInfo* info) const {
-    info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) {
+    info->infer_var_type_ = [](InferVarTypeContext* context) {
      T inference;
-      inference(fwd_op, block);
+      inference(context);
    };
  }
 };

--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
@@ -14,7 +14,9 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/op_desc.h"
@@ -55,11 +57,11 @@ class GradOpDescMakerBase {
                   std::back_inserter(ret_val),
                   [this](const std::string& fwd_var_name) -> std::string {
                     auto g_name = GradVarName(fwd_var_name);
-                     if (no_grad_set_.count(g_name)) {
+                     if (no_grad_set_.empty() || !no_grad_set_.count(g_name)) {
-                       return kEmptyVarName;
-                     } else {
                       (*this->grad_to_var_)[g_name] = fwd_var_name;
                       return g_name;
+                     } else {
+                       return kEmptyVarName;
                     }
                   });
    if (!drop_empty_grad) {

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -46,6 +46,8 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass)
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(lock_free_optimize_pass base)
+pass_library(cpu_quantize_placement_pass base)
+pass_library(cpu_quantize_pass inference)
 pass_library(cpu_quantize_squash_pass inference)
 pass_library(fc_fuse_pass inference)
 pass_library(attention_lstm_fuse_pass inference)
@@ -68,6 +70,7 @@ pass_library(conv_affine_channel_fuse_pass inference)
 pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)
+pass_library(runtime_context_cache_pass base)
 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will
@@ -102,8 +105,12 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
 cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto)
 cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
-cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
+cc_test(test_cpu_quantize_placement_pass SRCS cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass)
+cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor)
 cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor)
+if(NOT WIN32)
+    cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass)
+endif()
 if (WITH_MKLDNN)
    cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
    cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)

--- a/paddle/fluid/framework/ir/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/string/pretty_log.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace {
+// Drops the edge a -> b: erases every occurrence of `b` from `a->outputs`
+// and every occurrence of `a` from `b->inputs` (erase-remove idiom).
+// Neither node is deleted; only the two adjacency lists are modified.
+void UnlinkNodes(ir::Node* a, ir::Node* b) {
+  a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b),
+                   a->outputs.end());
+  b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a),
+                  b->inputs.end());
+}
+}  // namespace
+// Multipliers applied to a "scale to one" factor to obtain the quantization
+// scale: U8_MAX is used for unsigned data, S8_MAX for signed data.
+enum { U8_MAX = 255, S8_MAX = 127 };
+// Eigen view over a buffer of doubles, treated as a dynamically sized column
+// array.
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<double, Eigen::Dynamic, 1>>;
+using string::PrettyLogDetail;
+// Inserts a "quantize" op between `input` and `op`.
+//
+//   g               graph in which the new var/op nodes are created
+//   op              consumer op whose input is being quantized
+//   input           variable node currently feeding `op`
+//   input_name      name of the input slot of `op` that gets rewired
+//   scale_to_one    factor scaling the variable to MAX=1.0
+//   is_unsigned     selects U8_MAX (true) or S8_MAX (false) as the multiplier;
+//                   its negation is stored as "is_negative_input"
+//   scale_attr_name if non-empty, attribute of `op` that receives the computed
+//                   scale
+//
+// Resulting chain: input -> quantize op -> new var -> op.
+void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
+                                    std::string input_name, double scale_to_one,
+                                    bool is_unsigned,
+                                    std::string scale_attr_name) const {
+  unsigned max = is_unsigned ? U8_MAX : S8_MAX;
+  // The double product is narrowed to float here; "Scale" is stored as float.
+  float scale = scale_to_one * max;
+  // Create quantize output variable
+  VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out"));
+  auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc);
+  // create a quantize op node
+  OpDesc q_desc;
+  q_desc.SetType("quantize");
+  q_desc.SetInput("Input", std::vector<std::string>({input->Name()}));
+  q_desc.SetOutput("Output",
+                   std::vector<std::string>({quantize_out_node->Name()}));
+  q_desc.SetAttr("Scale", scale);
+  q_desc.SetAttr("is_negative_input", !is_unsigned);
+  auto quantize_op = g->CreateOpNode(&q_desc);  // OpDesc will be copied.
+  // update op's input so that it reads the quantized variable by name
+  op->Op()->SetInput(input_name,
+                     std::vector<std::string>({quantize_out_node->Name()}));
+  // link quantize op: remove the direct input -> op edge, then chain the
+  // new nodes in between
+  UnlinkNodes(input, op);
+  IR_NODE_LINK_TO(input, quantize_op);
+  IR_NODE_LINK_TO(quantize_op, quantize_out_node);
+  IR_NODE_LINK_TO(quantize_out_node, op);
+  if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
+}
+// Inserts a "dequantize" op between `op` and `output`.
+//
+//   g               graph in which the new var/op nodes are created
+//   op              producer op whose output is being dequantized
+//   output          variable node currently written by `op`
+//   output_name     name of the output slot of `op` that gets rewired
+//   scale_to_one    factor scaling the variable to MAX=1.0
+//   is_unsigned     selects U8_MAX (true) or S8_MAX (false) as the multiplier
+//   scale_attr_name if non-empty, attribute of `op` that receives the computed
+//                   scale
+//
+// Resulting chain: op -> new var -> dequantize op -> output.
+void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
+                                       std::string output_name,
+                                       double scale_to_one, bool is_unsigned,
+                                       std::string scale_attr_name) const {
+  unsigned max = is_unsigned ? U8_MAX : S8_MAX;
+  // The double product is narrowed to float here; "Scale" is stored as float.
+  float scale = scale_to_one * max;
+  // Create dequantize input variable
+  VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in"));
+  auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc);
+  // create a dequantize op node for output.
+  OpDesc deq_desc;
+  deq_desc.SetType("dequantize");
+  deq_desc.SetInput("Input",
+                    std::vector<std::string>({dequantize_in_node->Name()}));
+  deq_desc.SetOutput("Output", std::vector<std::string>({output->Name()}));
+  deq_desc.SetAttr("Scale", scale);
+  auto dequantize_op = g->CreateOpNode(&deq_desc);  // OpDesc will be copied.
+  // update op's output so that it writes the intermediate variable by name
+  op->Op()->SetOutput(output_name,
+                      std::vector<std::string>({dequantize_in_node->Name()}));
+  // link dequantize op: remove the direct op -> output edge, then chain the
+  // new nodes in between
+  UnlinkNodes(op, output);
+  IR_NODE_LINK_TO(op, dequantize_in_node);
+  IR_NODE_LINK_TO(dequantize_in_node, dequantize_op);
+  IR_NODE_LINK_TO(dequantize_op, output);
+  if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale);
+}
+// Quantizes every op matched by the ConvResidual pattern whose
+// "use_quantizer" attribute exists and is true.
+//
+//   graph              graph to rewrite
+//   with_residual_data forwarded to the pattern; when true the "ResidualData"
+//                      input is quantized as well
+//
+// For each match the handler:
+//   * quantizes "Input" and stores the scale in "Scale_in";
+//   * stores the filter scales multiplied by S8_MAX in "Scale_weights";
+//   * optionally quantizes "ResidualData" ("Scale_in_eltwise");
+//   * dequantizes "Output" and stores the scale in "Scale_out".
+// The number of quantized ops is reported via AddStatis and PrettyLogDetail.
+void CPUQuantizePass::QuantizeConv(Graph* graph,
+                                   bool with_residual_data) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::ConvResidual conv_pattern{pattern, name_scope_};
+  conv_pattern(with_residual_data);
+  int quantize_conv_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize conv2d op";
+    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
+    auto* conv_op_desc = conv_op->Op();
+    // skip if should not be quantized
+    if (!conv_op_desc->HasAttr("use_quantizer") ||
+        !boost::get<bool>(conv_op_desc->GetAttr("use_quantizer")))
+      return;
+    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
+    // get scales calculated after warmup, they scale variables to MAX=1.0
+    auto scales = Get<VarQuantScale>("quant_var_scales");
+    auto input_scale = scales[conv_input->Name()].second.data<double>()[0];
+    bool is_input_unsigned = scales[conv_input->Name()].first;
+    QuantizeInput(g, conv_op, conv_input, "Input", input_scale,
+                  is_input_unsigned, "Scale_in");
+    auto filter_scale_tensor = scales[conv_filter->Name()].second;
+    // Compute the weight scales into a local vector instead of multiplying
+    // the tensor in place.
+    // NOTE(review): a copied LoDTensor is expected to share its buffer with
+    // the entry held by the "quant_var_scales" attribute, so an in-place
+    // multiply would rescale that entry again whenever the same filter is
+    // visited more than once - confirm against the LoDTensor copy semantics.
+    const double* filter_scale_begin = filter_scale_tensor.data<double>();
+    const double* filter_scale_end =
+        filter_scale_begin + filter_scale_tensor.numel();
+    std::vector<float> filter_scale;
+    filter_scale.reserve(filter_scale_tensor.numel());
+    for (const double* it = filter_scale_begin; it != filter_scale_end; ++it) {
+      // Multiply in double first, then narrow to float, matching the values
+      // the attribute received before.
+      filter_scale.push_back(
+          static_cast<float>(*it * static_cast<double>(S8_MAX)));
+    }
+    conv_op->Op()->SetAttr("Scale_weights", filter_scale);
+    if (with_residual_data) {
+      GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data,
+                                conv_pattern);
+      auto residual_scale =
+          scales[conv_residual_data->Name()].second.data<double>()[0];
+      bool is_residual_unsigned = scales[conv_residual_data->Name()].first;
+      QuantizeInput(g, conv_op, conv_residual_data, "ResidualData",
+                    residual_scale, is_residual_unsigned, "Scale_in_eltwise");
+    }
+    auto output_scale = scales[conv_output->Name()].second.data<double>()[0];
+    bool is_output_unsigned = scales[conv_output->Name()].first;
+    DequantizeOutput(g, conv_op, conv_output, "Output", output_scale,
+                     is_output_unsigned, "Scale_out");
+    ++quantize_conv_count;
+  };
+  gpd(graph, handler);
+  AddStatis(quantize_conv_count);
+  std::stringstream msg_ss;
+  msg_ss << "---    quantized " << quantize_conv_count << " conv2d ops";
+  if (with_residual_data) msg_ss << " with residual connection";
+  PrettyLogDetail(msg_ss.str().c_str());
+}
+// Quantizes every op matched by the Pool pattern whose "use_quantizer"
+// attribute exists and is true: a quantize op is inserted on input "X" and a
+// dequantize op on output "Out". No scale attribute is written on the pool op
+// itself (the scale_attr_name argument is left at its default).
+// The number of quantized ops is reported via AddStatis and PrettyLogDetail.
+void CPUQuantizePass::QuantizePool(Graph* graph) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::Pool pool_pattern{pattern, name_scope_};
+  pool_pattern();
+  int quantize_pool_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize pool2d op";
+    GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern);
+    auto* pool_op_desc = pool_op->Op();
+    // skip if should not be quantized
+    if (!pool_op_desc->HasAttr("use_quantizer") ||
+        !boost::get<bool>(pool_op_desc->GetAttr("use_quantizer")))
+      return;
+    GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern);
+    // get scales calculated after warmup, they scale variables to MAX=1.0
+    // NOTE(review): `auto` takes a copy of the whole map on every match, and
+    // operator[] default-constructs an entry for a missing variable name.
+    auto scales = Get<VarQuantScale>("quant_var_scales");
+    auto input_scale = scales[pool_input->Name()].second.data<double>()[0];
+    bool is_input_unsigned = scales[pool_input->Name()].first;
+    QuantizeInput(g, pool_op, pool_input, "X", input_scale, is_input_unsigned);
+    auto output_scale = scales[pool_output->Name()].second.data<double>()[0];
+    bool is_output_unsigned = scales[pool_output->Name()].first;
+    DequantizeOutput(g, pool_op, pool_output, "Out", output_scale,
+                     is_output_unsigned);
+    ++quantize_pool_count;
+  };
+  gpd(graph, handler);
+  AddStatis(quantize_pool_count);
+  PrettyLogDetail("---    quantized %d pool2d ops", quantize_pool_count);
+}
+// Pass entry point. Enforces that the graph and the parameter scope are
+// non-null, then quantizes, in this order: convolutions with residual data,
+// convolutions without residual data, and pooling ops. Returns the same graph
+// object after rewriting it in place.
+std::unique_ptr<ir::Graph> CPUQuantizePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Quantizing the graph.";
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init(name_scope_, graph.get());
+  PADDLE_ENFORCE(param_scope());
+  QuantizeConv(graph.get(), true /* with_residual_data */);
+  QuantizeConv(graph.get());
+  QuantizePool(graph.get());
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+// Registers the pass under the name "cpu_quantize_pass". The
+// "quant_var_scales" attribute (a VarQuantScale map read by QuantizeConv and
+// QuantizePool) is declared as required.
+REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass)
+    .RequirePassAttr("quant_var_scales");
--- a/paddle/fluid/framework/ir/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+ * Map variable name to tensor of scaling factors scaling it to MAX=1.0.
+ * bool denotes whether quantization of the variable should be done to unsigned
+ * type.
+ */
+using VarQuantScale =
+    std::unordered_map<std::string, std::pair<bool, LoDTensor>>;
+/*
+ * Quantize all supported operators.
+ */
+class CPUQuantizePass : public FusePassBase {
+ public:
+  virtual ~CPUQuantizePass() {}
+ protected:
+  // Pass entry point; takes ownership of the graph and returns it.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+  // Quantizes matching convolution ops; `with_residual_data` selects the
+  // pattern variant that also has a residual-data input.
+  void QuantizeConv(Graph* graph, bool with_residual_data = false) const;
+  // Quantizes matching pooling ops.
+  void QuantizePool(Graph* graph) const;
+  // Inserts a quantize op in front of input slot `input_name` of `op`.
+  // `scale_to_one` scales the variable to MAX=1.0; if `scale_attr_name` is
+  // non-empty the resulting scale is also stored in that attribute of `op`.
+  void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
+                     double scale_to_one, bool is_unsigned,
+                     std::string scale_attr_name = "") const;
+  // Inserts a dequantize op behind output slot `output_name` of `op`.
+  // Parameters mirror QuantizeInput.
+  void DequantizeOutput(Graph* g, Node* op, Node* output,
+                        std::string output_name, double scale_to_one,
+                        bool is_unsigned,
+                        std::string scale_attr_name = "") const;
+  // Name scope passed to FusePassBase::Init and to the pattern constructors.
+  const std::string name_scope_{"quantize"};
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/cpu_quantize_pass.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/platform/place.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+// Test helper: appends an operator of type `type` to block 0 of `prog` and
+// fills in the inputs, outputs and attributes used by the quantize pass test.
+//
+// Positional meaning of `inputs` per operator type:
+//   conv2d : Input, Filter, [Bias], [ResidualData]
+//   pool2d : X
+//   dropout: X
+//   fc     : Input, [W], [Bias]
+// Any other type only gets the common "use_mkldnn" and "name" attributes.
+// `use_quantizer` is stored only on conv2d and pool2d operators.
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn,
+           bool use_quantizer = false) {
+  auto* desc = prog->MutableBlock(0)->AppendOp();
+  desc->SetType(type);
+  desc->SetAttr("use_mkldnn", use_mkldnn);
+  desc->SetAttr("name", name);
+  if (type == "conv2d") {
+    // Bias and ResidualData are optional; an absent slot is set to an empty
+    // list so that it still exists on the operator.
+    const bool has_bias = inputs.size() > 2;
+    const bool has_residual = inputs.size() > 3;
+    desc->SetInput("Input", {inputs[0]});
+    desc->SetInput("Filter", {inputs[1]});
+    desc->SetInput("Bias", has_bias ? std::vector<std::string>{inputs[2]}
+                                    : std::vector<std::string>{});
+    desc->SetInput("ResidualData",
+                   has_residual ? std::vector<std::string>{inputs[3]}
+                                : std::vector<std::string>{});
+    desc->SetAttr("fuse_residual_connection", has_residual);
+    desc->SetOutput("Output", {outputs[0]});
+    desc->SetAttr("use_quantizer", use_quantizer);
+    // Neutral scales; the test later checks the values found after the pass.
+    desc->SetAttr("Scale_in", 1.0f);
+    desc->SetAttr("Scale_out", 1.0f);
+    desc->SetAttr("Scale_weights", std::vector<float>{1.0f});
+  } else if (type == "pool2d" || type == "dropout") {
+    desc->SetInput("X", {inputs[0]});
+    desc->SetOutput("Out", {outputs[0]});
+    // Only pool2d carries the use_quantizer attribute.
+    if (type == "pool2d") desc->SetAttr("use_quantizer", use_quantizer);
+  } else if (type == "fc") {
+    desc->SetInput("Input", {inputs[0]});
+    if (inputs.size() > 1) desc->SetInput("W", {inputs[1]});
+    if (inputs.size() > 2) desc->SetInput("Bias", {inputs[2]});
+    desc->SetOutput("Out", {outputs[0]});
+  }
+}
+static const std::initializer_list<std::string> variable_names{
+    "a", "w1", "c",  "d", "w2", "e",  "f", "g",
+    "h", "w3", "b1", "i", "j",  "w4", "b2"};
+// (a,w1)->Conv1->c and c->Pool1->d
+//
+// (d,w2)->Conv2->e and e->Pool2->f
+//
+// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j
+//
+// (c,w4,b2)->Conv4->i
+// Builds the test program: declares every name in `variable_names` in block 0
+// (names beginning with 'w' or 'b' are marked persistable) and appends the
+// operators
+//   Conv1(a,w1)->c, Pool1(c)->d, Conv2(d,w2)->e, Pool2(e)->f,
+//   Dropout1(d)->g, Fc1(g)->h, Conv3(h,w3,b1,i)->j, Conv4(c,w4,b2)->i.
+// `use_quantizer` is forwarded to the conv2d and pool2d operators only.
+ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
+  ProgramDesc prog;
+  auto* block = prog.MutableBlock(0);
+  for (const auto& var_name : variable_names) {
+    auto* var_desc = block->Var(var_name);
+    const bool is_param = var_name.compare(0, 1, "w") == 0 ||
+                          var_name.compare(0, 1, "b") == 0;
+    if (is_param) var_desc->SetPersistable(true);
+  }
+  // First conv/pool pair.
+  SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"},
+        use_mkldnn, use_quantizer);
+  SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"},
+        use_mkldnn, use_quantizer);
+  // Second conv/pool pair, fed by Pool1.
+  SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"},
+        use_mkldnn, use_quantizer);
+  SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"},
+        use_mkldnn, use_quantizer);
+  // Branch through operators that never receive use_quantizer.
+  SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn);
+  SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn);
+  // Conv3 takes bias b1 and residual data i, which Conv4 produces.
+  SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"},
+        use_mkldnn, use_quantizer);
+  SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"},
+        use_mkldnn, use_quantizer);
+  return prog;
+}
+// Gets (or creates) variable `var_name` in `scope` as a LoDTensor and requests
+// FP32 storage for it on `place`.
+void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
+                      const char* var_name) {
+  auto* holder = scope->Var(var_name)->GetMutable<LoDTensor>();
+  holder->mutable_data(place, proto::VarType::FP32,
+                       ::paddle::memory::Allocator::kDefault, 1);
+}
+// Runs cpu_quantize_pass on a graph built from `prog` and verifies the result.
+//
+//   conv_count / pool_count     expected number of conv2d / pool2d ops
+//   quant_count / dequant_count expected number of quantize / dequantize ops
+//   added_nodes_count           expected growth of the graph's node count
+//   scale                       value expected in Scale_in, Scale_out and
+//                               Scale_weights[0] of every conv2d op
+void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
+              int quant_count, int dequant_count, int added_nodes_count,
+              float scale) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  // Init scope, as it is used in pass
+  auto place = paddle::platform::CPUPlace();
+  NaiveExecutor exe{place};
+  Scope scope;
+  exe.CreateVariables(prog, 0, true, &scope);
+  // Every variable gets a one-element double tensor holding 2.0 as its scale
+  // and `false` as the bool of the VarQuantScale entry.
+  auto* scales = new VarQuantScale();
+  for (auto& v : variable_names) {
+    InitTensorHolder(&scope, place, v.c_str());
+    LoDTensor tensor;
+    tensor.Resize({1});
+    auto* ptr = tensor.mutable_data<double>(place);
+    ptr[0] = 2.0;
+    (*scales)[v] = std::make_pair(false, std::move(tensor));
+  }
+  graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
+  auto pass = PassRegistry::Instance().Get("cpu_quantize_pass");
+  // NOTE(review): `scales` is never deleted here; presumably Pass::Set takes
+  // ownership of the raw pointer -- confirm against Pass::Set.
+  pass->Set("quant_var_scales", scales);
+  // Node counts before and after the pass are compared at the end.
+  int original_nodes_num = graph->Nodes().size();
+  graph = pass->Apply(std::move(graph));
+  int current_nodes_num = graph->Nodes().size();
+  int quantize_nodes_count = 0;
+  int dequantize_nodes_count = 0;
+  int conv2d_nodes_count = 0;
+  int pool2d_nodes_count = 0;
+  // Tally operators by type; conv2d ops additionally have their scale
+  // attributes checked against `scale`.
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "conv2d") {
+        conv2d_nodes_count++;
+        auto op_name = boost::get<std::string>(op->GetAttr("name"));
+        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_in")), scale)
+            << "Scale_in for node '" + op_name + "'.";
+        EXPECT_EQ(boost::get<float>(op->GetAttr("Scale_out")), scale)
+            << "Scale_out for node '" + op_name + "'.";
+        EXPECT_EQ(
+            boost::get<std::vector<float>>(op->GetAttr("Scale_weights"))[0],
+            scale)
+            << "Scale_weights for node '" + op_name + "'.";
+      } else if (op->Type() == "pool2d") {
+        pool2d_nodes_count++;
+      } else if (op->Type() == "quantize") {
+        quantize_nodes_count++;
+      } else if (op->Type() == "dequantize") {
+        dequantize_nodes_count++;
+      }
+    }
+  }
+  EXPECT_EQ(conv2d_nodes_count, conv_count);
+  EXPECT_EQ(pool2d_nodes_count, pool_count);
+  EXPECT_EQ(quantize_nodes_count, quant_count);
+  EXPECT_EQ(dequantize_nodes_count, dequant_count);
+  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
+}
+// All conv2d/pool2d ops have use_quantizer=true: the pass is expected to wrap
+// them with quantize/dequantize ops and to set the conv scales to 2.0 * 127.
+TEST(CpuQuantizePass, quantize) {
+  bool use_mkldnn = true;
+  bool use_quantizer = true;
+  // Expected graph after the pass (QUANTn/DEQUANTn are inserted ops, INn/OUTn
+  // are inserted variables):
+  //
+  // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and
+  // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d
+  //
+  // (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and
+  // e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f
+  //
+  // d->Dropout1->g and g->Fc1->h and
+  // (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j
+  //
+  // (c->QUANT7->IN7,w4,b2)->Conv4->OUT6->DEQUANT6->i
+  // Inserted nodes: 7 QUANT + 7 IN + 6 OUT + 6 DEQUANT
+  int added_nodes = 7 + 7 + 6 + 6;
+  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes,
+           2.0f * 127);
+}
+// With use_quantizer=false on every op the graph must come back unchanged:
+// still 4 conv2d and 2 pool2d ops, no quantize/dequantize ops, no extra nodes,
+// and the conv scales left at their initial value of 1.0.
+TEST(CpuQuantizePass, do_not_quantize) {
+  const bool use_mkldnn = true;
+  const bool use_quantizer = false;
+  const int added_nodes = 0;
+  const ProgramDesc prog = BuildProgramDesc(use_mkldnn, use_quantizer);
+  MainTest(prog, /*conv_count=*/4, /*pool_count=*/2, /*quant_count=*/0,
+           /*dequant_count=*/0, added_nodes, /*scale=*/1.0f);
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+USE_PASS(cpu_quantize_pass);
--- a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+#include <string>
+#include <unordered_set>
+namespace paddle {
+namespace framework {
+namespace ir {
+// Marks operators for quantization by setting their "use_quantizer" attribute
+// to true.
+//
+// An operator node is marked when all of the following hold:
+//   * its node id is not listed in the "quantize_excluded_op_ids" pass attr;
+//   * the operator already has, or declares in its proto, a "use_quantizer"
+//     attribute;
+//   * "quantize_enabled_op_types" is empty (meaning every type is enabled) or
+//     contains the node's name.
+// Operators that do not qualify are left untouched. The graph is returned
+// to the caller after marking.
+std::unique_ptr<ir::Graph> CPUQuantizePlacementPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Marks operators which are to be quantized.";
+  const auto& excluded_ids_list =
+      Get<std::unordered_set<int>>("quantize_excluded_op_ids");
+  const auto& op_types_list =
+      Get<std::unordered_set<std::string>>("quantize_enabled_op_types");
+  for (const Node* n : graph->Nodes()) {
+    if (!n->IsOp()) continue;
+    // Both attributes are hash sets: use the O(1) membership lookup rather
+    // than a linear std::find over the container.
+    if (excluded_ids_list.count(n->id()) > 0) continue;
+    auto* op = n->Op();
+    if (!op->HasAttr("use_quantizer") && !op->HasProtoAttr("use_quantizer")) {
+      continue;
+    }
+    if (op_types_list.empty() || op_types_list.count(n->Name()) > 0) {
+      op->SetAttr("use_quantizer", true);
+    }
+  }
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(cpu_quantize_placement_pass,
+              paddle::framework::ir::CPUQuantizePlacementPass)
+    // std::unordered_set<std::string> of operator type names to be quantized
+    // ("conv2d" etc.); an empty set enables every operator type
+    .RequirePassAttr("quantize_enabled_op_types")
+    // std::unordered_set<int> of operator node ids that are to be excluded
+    // from quantization
+    .RequirePassAttr("quantize_excluded_op_ids");
--- a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+ * Specifies which operators should be quantized.
+ *
+ * Sets the "use_quantizer" attribute to true on operators that support it,
+ * filtered by two required pass attributes: "quantize_enabled_op_types"
+ * (operator types to mark; empty means all) and "quantize_excluded_op_ids"
+ * (node ids to skip).
+ */
+class CPUQuantizePlacementPass : public Pass {
+ protected:
+  // Marks the qualifying operators in place and returns the same graph.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
+++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h"
+#include <gtest/gtest.h>
+#include <boost/logic/tribool.hpp>
+namespace paddle {
+namespace framework {
+namespace ir {
+// Test helper: appends an operator of type `type` to block 0 of `prog`.
+//
+// `use_quantizer` is three-valued: true/false store the "use_quantizer"
+// attribute with that value, boost::indeterminate leaves the attribute unset.
+// Supported types are conv2d (inputs: Input, Filter, Bias), relu, concat
+// (two inputs) and pool2d; any other type fails the current test.
+// `name` is stored (as attribute "name") only for conv2d.
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs,
+           boost::tribool use_quantizer) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  if (!boost::indeterminate(use_quantizer))
+    op->SetAttr("use_quantizer", use_quantizer);
+  if (type == "conv2d") {
+    op->SetAttr("name", name);
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    op->SetInput("Bias", {inputs[2]});
+  } else if (type == "relu") {
+    op->SetInput("X", inputs);
+  } else if (type == "concat") {
+    op->SetAttr("axis", 1);
+    op->SetInput("X", {inputs[0], inputs[1]});
+  } else if (type == "pool2d") {
+    op->SetInput("X", {inputs[0]});
+  } else {
+    // gtest's FAIL() records a fatal failure and returns from this function,
+    // so the SetOutput call below is skipped for unknown types.
+    FAIL() << "Unexpected operator type.";
+  }
+  // The output slot is named "Out" for every operator type in this helper.
+  op->SetOutput("Out", {outputs[0]});
+}
+// operator                      use_quantizer
+// ---------------------------------------
+// (a,b)->concat->c              none
+// (c,weights,bias)->conv->f     false
+// f->relu->g                    none
+// g->pool->h                    false
+// (h,weights2,bias2)->conv->k   false
+// k->pool->l                    false
+// Builds the six-operator test program
+//   concat -> conv2d -> relu -> pool2d -> conv2d -> pool2d.
+// Both conv2d and both pool2d ops start with use_quantizer=false; concat and
+// relu get no use_quantizer attribute at all.
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+  for (auto& v :
+       std::vector<std::string>({"a", "b", "c", "weights", "bias", "f", "g",
+                                 "h", "weights2", "bias2", "k", "l"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::SELECTED_ROWS);
+    // NOTE(review): only "weights" and "bias" are marked persistable;
+    // "weights2" and "bias2" are not. Looks like an oversight, though the
+    // placement pass in this diff does not read persistability -- confirm.
+    if (v == "weights" || v == "bias") {
+      var->SetPersistable(true);
+    }
+  }
+  SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, boost::indeterminate);
+  SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, false);
+  SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, boost::indeterminate);
+  SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, false);
+  SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, false);
+  SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, false);
+  return prog;
+}
+// Applies cpu_quantize_placement_pass to the test program with the given
+// enabled operator types and excluded node ids, then checks how many operators
+// end up with use_quantizer == true.
+void MainTest(std::initializer_list<std::string> quantize_enabled_op_types,
+              std::initializer_list<int> quantize_excluded_op_ids,
+              unsigned expected_use_quantizer_true_count) {
+  auto prog = BuildProgramDesc();
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass");
+  // Both pass attributes are handed over as heap-allocated sets.
+  auto* enabled_types =
+      new std::unordered_set<std::string>(quantize_enabled_op_types);
+  auto* excluded_ids = new std::unordered_set<int>(quantize_excluded_op_ids);
+  pass->Set("quantize_enabled_op_types", enabled_types);
+  pass->Set("quantize_excluded_op_ids", excluded_ids);
+  graph = pass->Apply(std::move(graph));
+  // Count operators whose use_quantizer attribute exists and is true.
+  unsigned use_quantizer_true_count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (!node->IsOp()) continue;
+    auto* op_desc = node->Op();
+    if (!op_desc->HasAttr("use_quantizer")) continue;
+    if (boost::get<bool>(op_desc->GetAttr("use_quantizer"))) {
+      ++use_quantizer_true_count;
+    }
+  }
+  EXPECT_EQ(use_quantizer_true_count, expected_use_quantizer_true_count);
+}
+// Only pool2d is enabled: both pool2d ops are expected to be marked.
+TEST(QuantizerPlacementPass, enabled_pool) { MainTest({"pool2d"}, {}, 2); }
+// Only conv2d is enabled and node id 4 is excluded: one conv2d is expected to
+// be marked. Presumably id 4 is one of the two conv2d nodes -- TODO confirm
+// how the graph assigns node ids.
+TEST(QuantizerPlacementPass, enabled_conv_excluded_one) {
+  MainTest({"conv2d"}, {4}, 1);
+}
+// An empty type set enables every operator that has use_quantizer.
+TEST(QuantizerPlacementPass, excluded_none) {
+  // 2 conv + 2 pool
+  MainTest({}, {}, 4);
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+USE_PASS(cpu_quantize_placement_pass);
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -90,7 +90,8 @@ void GraphPatternDetector::operator()(Graph *graph,
  ValidateByNodeRole(&subgraphs);
  if (subgraphs.empty()) return;
-  PrettyLogEndl(Style::detail(), "---  detect %d subgraphs", subgraphs.size());
+  PrettyLogEndl(Style::detail(), "---  detected %d subgraphs",
+                subgraphs.size());
  int id = 0;
  for (auto &g : subgraphs) {
    VLOG(3) << "optimizing #" << id++ << " subgraph";
@@ -1074,9 +1075,53 @@ PDNode *patterns::Conv::operator()() {
                        ->AsOutput()
                        ->assert_is_op_output("conv2d", "Output");
-  conv_op->LinksFrom({input_var, filter_var});
+  conv_op->LinksFrom({input_var, filter_var}).LinksTo({output_var});
-  conv_op->LinksTo({output_var});
+  return output_var;
+}
+// Builds a pattern matching a conv2d op with its Input and Filter inputs and
+// its Output.
+//   with_residual_data == true : the op must also have a ResidualData input,
+//                                which is added to the pattern.
+//   with_residual_data == false: the op's "fuse_residual_connection" attribute
+//                                must be false.
+// Returns the pattern node of the convolution output.
+PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
+  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
+  if (!with_residual_data)
+    conv_op->assert_op_attr("fuse_residual_connection", false);
+  auto input_var = pattern->NewNode(conv_input_repr())
+                       ->AsInput()
+                       ->assert_is_op_input("conv2d", "Input");
+  auto filter_var = pattern->NewNode(conv_filter_repr())
+                        ->AsInput()
+                        ->assert_is_op_input("conv2d", "Filter");
+  auto output_var = pattern->NewNode(conv_output_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("conv2d", "Output");
+  // Inputs linked to the op; the residual data node is appended on demand.
+  std::vector<PDNode *> links_from{input_var, filter_var};
+  if (with_residual_data) {
+    auto res_conn_var = pattern->NewNode(conv_residual_data_repr())
+                            ->AsInput()
+                            ->assert_is_op_input("conv2d", "ResidualData");
+    links_from.push_back(res_conn_var);
+  }
+  conv_op->LinksFrom(links_from).LinksTo({output_var});
+  return output_var;
+}
+// Builds a pattern matching a pool2d op together with its "X" input and "Out"
+// output. Returns the pattern node of the pooling output.
+PDNode *patterns::Pool::operator()() {
+  auto pool_op = pattern->NewNode(pool_op_repr())->assert_is_op("pool2d");
+  auto input_var = pattern->NewNode(pool_input_repr())
+                       ->AsInput()
+                       ->assert_is_op_input("pool2d", "X");
+  auto output_var = pattern->NewNode(pool_output_repr())
+                        ->AsOutput()
+                        ->assert_is_op_output("pool2d", "Out");
+  pool_op->LinksFrom({input_var}).LinksTo({output_var});
  return output_var;
 }

--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -659,6 +659,35 @@ struct Conv : public PatternBase {
  PATTERN_DECL_NODE(conv_output);
 };
+// Convolution op with residual data
+// operator()(with_residual_data) builds the pattern:
+// conv_input and conv_filter are the op's Input and Filter,
+// conv_output is its Output, and conv_residual_data is the
+// ResidualData input, which is part of the pattern only when
+// with_residual_data is true.
+struct ConvResidual : public PatternBase {
+  ConvResidual(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "conv_residual") {}
+  PDNode* operator()(bool with_residual_data);
+  PATTERN_DECL_NODE(conv_op);
+  PATTERN_DECL_NODE(conv_input);
+  PATTERN_DECL_NODE(conv_filter);
+  PATTERN_DECL_NODE(conv_residual_data);
+  PATTERN_DECL_NODE(conv_output);
+};
+// Pool op
+// Forward pass for pooling (pool2d).
+// pool_input is the operator's input.
+// pool_output is the result of the operator; operator()() returns its node.
+struct Pool : public PatternBase {
+  Pool(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "pooling") {}
+  PDNode* operator()();
+  PATTERN_DECL_NODE(pool_op);
+  PATTERN_DECL_NODE(pool_input);
+  PATTERN_DECL_NODE(pool_output);
+};
 // ElementwiseAdd used in residual connections.
 // y_var is used and convolution output.
 // The operator is removed, when residual

--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -43,20 +43,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
 class SumOpVarTypeInference : public VarTypeInference {
 public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
+  void operator()(InferVarTypeContext *ctx) const override {
-    auto &inputs = op_desc.Input("X");
+    auto &inputs = ctx->Input("X");
    auto default_var_type = proto::VarType::SELECTED_ROWS;
    bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [block](const std::string &name) {
+        inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
-          return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR;
+          return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
        });
    if (any_input_is_lod_tensor) {
      default_var_type = proto::VarType::LOD_TENSOR;
    }
-    auto out_var_name = op_desc.Output("Out").front();
+    auto out_var_name = ctx->Output("Out").front();
-    block->Var(out_var_name)->SetType(default_var_type);
+    ctx->SetType(out_var_name, default_var_type);
  }
 };
@@ -71,7 +71,7 @@ class DummyOpMaker : public OpProtoAndCheckerMaker {
 class DummyOpVarTypeInference : public VarTypeInference {
 public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {}
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
 };
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/ir/runtime_context_cache_pass.h"
+#include <memory>
+#include "paddle/fluid/framework/operator.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+// Tags every operator node of the graph with the kEnableCacheRuntimeContext
+// attribute (set to true) and hands the graph back.
+std::unique_ptr<ir::Graph> RuntimeContextCachePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  VLOG(3) << "Applies Runtime Context Cache strategy.";
+  for (const Node* node : graph->Nodes()) {
+    // Variable nodes carry no operator description; skip them.
+    if (!node->IsOp()) continue;
+    auto* op_desc = node->Op();
+    op_desc->SetAttr(kEnableCacheRuntimeContext, true);
+  }
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(runtime_context_cache_pass,
+              paddle::framework::ir::RuntimeContextCachePass);
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.h
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+ * Sets the kEnableCacheRuntimeContext attribute to true on every operator
+ * node of the graph.
+ */
+class RuntimeContextCachePass : public Pass {
+ protected:
+  // Marks the operators in place and returns the same graph.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/var_type_inference.h"
 namespace paddle {
 namespace framework {
@@ -677,7 +678,8 @@ void OpDesc::InferVarType(BlockDesc *block) const {
  // var type inference. Hence, we don't do any "default" setting here.
  auto &info = OpInfoMap::Instance().Get(this->Type());
  if (info.infer_var_type_) {
-    info.infer_var_type_(*this, block);
+    InferVarTypeContext context(this, block);
+    info.infer_var_type_(&context);
  }
 }

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -874,9 +874,23 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
  return kernel_configs;
 }
+// Returns the RuntimeContext to use when running this operator in `scope`.
+//
+// The returned object is always owned by this operator (runtime_ctx_); callers
+// must not delete it, and it stays valid only until the next call on this
+// operator. Previously the non-cached branch returned a bare
+// `new RuntimeContext` that no caller released, leaking one context per run.
+//
+// A fresh context is built when caching is disabled (the operator has no
+// kEnableCacheRuntimeContext attribute), when nothing has been cached yet, or
+// when `scope` differs from the scope of the cached context. Otherwise the
+// cached context is reused.
+//
+// NOTE(review): runtime_ctx_ / pre_scope_ are mutable members modified without
+// synchronization, so concurrent runs of one operator instance would race on
+// them (as the cached branch already did) -- confirm instances are not shared
+// across threads.
+RuntimeContext* OperatorWithKernel::GetRuntimeContext(
+    const Scope& scope) const {
+  const Scope* cur_scope = &scope;
+  const bool cache_enabled = HasAttr(kEnableCacheRuntimeContext);
+  if (!cache_enabled || !runtime_ctx_ || pre_scope_ != cur_scope) {
+    // reset() destroys the previously held context, if any.
+    runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
+    pre_scope_ = cur_scope;
+  }
+  return runtime_ctx_.get();
+}
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                 const platform::Place& place) const {
-  RuntimeContext ctx(Inputs(), Outputs(), scope);
+  auto runtime_ctx = GetRuntimeContext(scope);
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.Get(place);
@@ -891,7 +905,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  OpKernelMap& kernels = kernels_iter->second;
  auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
+      ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr));
  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
  auto kernel_iter = kernels.find(expected_kernel_key);
@@ -915,8 +929,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  // do data transformScope &transfer_scope;
  std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
+  auto* transfer_scope = PrepareData(scope, expected_kernel_key,
-      PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx);
+                                     &transfered_inplace_vars, runtime_ctx);
  // exec scope is the scope that kernel actually executed on.
  const Scope& exec_scope =
@@ -927,13 +941,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  }
  if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
-    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
+    RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
    this->InferShape(&infer_shape_ctx);
  }
  // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
  // not Scope. Imperative mode only pass inputs and get outputs.
-  kernel_iter->second(
+  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,
-      ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs));
+                                       *runtime_ctx, kernel_configs));
  if (!transfered_inplace_vars.empty()) {
    // there is inplace variable has been transfered.

--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -62,6 +62,14 @@ constexpr char kZeroVarSuffix[] = "@ZERO";
 /// Variables with this suffix are the new Gradient.
 constexpr char kNewGradSuffix[] = "@NEWGRAD@";
+/// RuntimeContext relates the input/output names of an Operator to the
+/// corresponding variables in a scope.
+/// If an Op has the attribute kEnableCacheRuntimeContext, then, because the
+/// input/output names of the Op do not change during execution, its
+/// RuntimeContext is created only on the first run within a given scope and
+/// reused on later runs in that scope, which saves the time of rebuilding it.
+constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";
 /// If an Op has this attribute, all its kernels should calculate output
 /// variable's shape in the corresponding Compute() function. And
 /// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
@@ -456,6 +464,7 @@ class OperatorWithKernel : public OperatorBase {
  // same.
  proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
  void RunImpl(const Scope& scope, const platform::Place& place) const final;
+  RuntimeContext* GetRuntimeContext(const Scope& scope) const;
  /**
   * Transfer data from scope to a transfered scope. If there is no data need to
@@ -474,6 +483,8 @@ class OperatorWithKernel : public OperatorBase {
 protected:
  mutable OpKernelConfigsMap kernel_configs_map_;
+  mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
+  mutable const Scope* pre_scope_ = nullptr;
 };
 extern bool OpSupportGPU(const std::string& op_type);

--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -44,6 +44,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
              << dst_place;
      return;
    }
+#ifdef PADDLE_WITH_MKLDNN
+    if (src.layout() == DataLayout::kMKLDNN) {
+      dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc());
+    }
+#endif
    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
  }

--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -27,6 +27,7 @@ namespace framework {
 class OperatorBase;
 class OpDesc;
 class InferShapeContext;
+class InferVarTypeContext;
 class BlockDesc;
 class Variable;
@@ -53,7 +54,7 @@ using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDesc>>(
    const std::vector<BlockDesc*>& grad_block)>;
 using InferVarTypeFN =
-    std::function<void(const OpDesc& /*op_desc*/, BlockDesc* /*block*/)>;
+    std::function<void(framework::InferVarTypeContext* /*context*/)>;
 using InferShapeFN = std::function<void(InferShapeContext*)>;

--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
@@ -14,6 +14,8 @@ limitations under the License. */
 #pragma once
 #include <string>
+#include <unordered_map>
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"
@@ -21,26 +23,123 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
+class OpDesc;
+class BlockDesc;
+// default infer var type context
+//
+// Gives var-type inference functions access to an operator description
+// (attributes, input/output argument names) and to the variables of a block
+// (type, data type(s), shape, LoD level) through one virtual interface.
+// Operator queries enforce that `op_` is non-null and variable queries that
+// `block_` is non-null; neither pointer is released by this class.
+class InferVarTypeContext {
+ public:
+  InferVarTypeContext(const OpDesc* op, BlockDesc* block)
+      : op_(op), block_(block) {}
+  virtual ~InferVarTypeContext() {}
+  // Returns attribute `name` of the operator.
+  virtual Attribute GetAttr(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->GetAttr(name);
+  }
+  // True if FindVarRecursive locates a variable called `name` from the block.
+  virtual bool HasVar(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindVarRecursive(name) != nullptr;
+  }
+  // True if the operator has an input slot called `name`.
+  virtual bool HasInput(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Inputs().count(name) > 0;
+  }
+  // True if the operator has an output slot called `name`.
+  virtual bool HasOutput(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Outputs().count(name) > 0;
+  }
+  // Variable names bound to the operator's input slot `name`.
+  virtual const std::vector<std::string>& Input(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Input(name);
+  }
+  // Variable names bound to the operator's output slot `name`.
+  virtual const std::vector<std::string>& Output(
+      const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(op_);
+    return op_->Output(name);
+  }
+  // The accessors below take a *variable* name and go through
+  // FindRecursiveOrCreateVar. NOTE(review): judging by that name, even the
+  // const getters may create a missing variable in the block -- confirm
+  // against BlockDesc.
+  virtual proto::VarType::Type GetType(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetType();
+  }
+  virtual void SetType(const std::string& name, proto::VarType::Type type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetType(type);
+  }
+  virtual proto::VarType::Type GetDataType(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetDataType();
+  }
+  virtual void SetDataType(const std::string& name, proto::VarType::Type type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetDataType(type);
+  }
+  virtual std::vector<proto::VarType::Type> GetDataTypes(
+      const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetDataTypes();
+  }
+  virtual void SetDataTypes(
+      const std::string& name,
+      const std::vector<proto::VarType::Type>& multiple_data_type) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetDataTypes(multiple_data_type);
+  }
+  virtual std::vector<int64_t> GetShape(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetShape();
+  }
+  virtual void SetShape(const std::string& name,
+                        const std::vector<int64_t>& dims) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetShape(dims);
+  }
+  virtual int32_t GetLoDLevel(const std::string& name) const {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    return block_->FindRecursiveOrCreateVar(name).GetLoDLevel();
+  }
+  virtual void SetLoDLevel(const std::string& name, int32_t lod_level) {
+    PADDLE_ENFORCE_NOT_NULL(block_);
+    block_->FindRecursiveOrCreateVar(name).SetLoDLevel(lod_level);
+  }
+ protected:
+  // Operator whose variable types are being inferred.
+  const OpDesc* op_;
+  // Block holding the operator's variables.
+  BlockDesc* block_;
+};
 class VarTypeInference {
 public:
  virtual ~VarTypeInference() {}
-  virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
+  virtual void operator()(InferVarTypeContext* context) const = 0;  // NOLINT
 };
 class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const final {  // NOLINT
-                  framework::BlockDesc* block) const final {
    auto in_out_var_names = this->GetInputOutputWithSameType();
    for (auto& i_o_n : in_out_var_names) {
-      auto& x_name = op_desc.Input(i_o_n.first).at(0);
+      auto& x_name = ctx->Input(i_o_n.first).at(0);
-      auto& out_name = op_desc.Output(i_o_n.second).at(0);
+      auto& out_name = ctx->Output(i_o_n.second).at(0);
-      auto& x = block->FindRecursiveOrCreateVar(x_name);
+      ctx->SetType(out_name, ctx->GetType(x_name));
-      auto& out = block->FindRecursiveOrCreateVar(out_name);
+      ctx->SetDataType(out_name, ctx->GetDataType(x_name));
-      out.SetType(x.GetType());
-      out.SetDataType(x.GetDataType());
    }
  }

--- a/paddle/fluid/framework/var_type_inference_test.cc
+++ b/paddle/fluid/framework/var_type_inference_test.cc
@@ -44,20 +44,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
 class SumOpVarTypeInference : public VarTypeInference {
 public:
-  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-    auto &inputs = op_desc.Input("X");
+    auto &inputs = ctx->Input("X");
    auto default_var_type = proto::VarType::SELECTED_ROWS;
    bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [block](const std::string &name) {
+        inputs.begin(), inputs.end(), [&ctx](const std::string &name) {
-          return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR;
+          return ctx->GetType(name) == proto::VarType::LOD_TENSOR;
        });
    if (any_input_is_lod_tensor) {
      default_var_type = proto::VarType::LOD_TENSOR;
    }
-    auto out_var_name = op_desc.Output("Out").front();
+    auto out_var_name = ctx->Output("Out").front();
-    block->Var(out_var_name)->SetType(default_var_type);
+    ctx->SetType(out_var_name, default_var_type);
  }
 };
 }  // namespace framework

--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -2,4 +2,5 @@ if(WITH_PYTHON)
 cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
 cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
+cc_library(imperative_profiler SRCS profiler.cc)
 endif()
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -214,13 +214,11 @@ framework::LoDTensor& VarBase::GradValue() {
 }
 std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
-  if (grad_op_descs_.empty() && backward_id_ <= 0) {
+  PADDLE_ENFORCE(!grad_op_descs_.empty() || backward_id_ > 0,
-    VLOG(3) << "op with no grad: " << Type();
+                 "%s has no backward implementation", Type());
-    return {};
-  }
  VLOG(3) << "apply op grad: " << Type();
-  std::vector<framework::VariableValueMap> tmp_grad_outputs;
+  std::vector<VarBasePtrMap> tmp_grad_outputs;
  if (backward_id_ > 0) {
    VLOG(3) << "py_layer_grad";
    tmp_grad_outputs.resize(1);
@@ -239,30 +237,66 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
      VLOG(3) << "apply grad op " << grad_op_desc->Type();
      // Allocate tmp grad output variable
-      for (auto it : grad_output_variable_map) {
+      for (const auto& it : grad_output_variable_map) {
        auto& outputs = tmp_grad_outputs[k][it.first];
        outputs.reserve(it.second.size());
        for (size_t i = 0; i < it.second.size(); ++i) {
+          VarBase* origin_grad_var_base = it.second[i];
          // Allocate a new variable
-          Variable* tmp_var = new framework::Variable();
+          VarBase* tmp_grad_var_base = new VarBase(
-          tmp_var->GetMutable<framework::LoDTensor>();
+              string::Sprintf("%s@IGrad", origin_grad_var_base->Name()),
-          outputs.emplace_back(tmp_var);
+              origin_grad_var_base->DataType(), origin_grad_var_base->Dims(),
+              place_, true, false);
+          outputs.emplace_back(tmp_grad_var_base);
        }
      }
-      // Run grad op
-      framework::RuntimeContext ctx(grad_input_vars_[k], tmp_grad_outputs[k]);
      // No need to do compile time infer shape here.
      // grad_op_desc_->InferShape(*block_);
      // grad_op_desc->InferVarType(block_);
      std::unique_ptr<framework::OperatorBase> opbase =
          framework::OpRegistry::CreateOp(*grad_op_desc);
+      auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type());
+      if (info.infer_var_type_) {
+        RuntimeInferVarTypeContext infer_var_type_ctx(
+            &grad_input_vars_[k], &tmp_grad_outputs[k], &attrs_);
+        info.infer_var_type_(&infer_var_type_ctx);
+      }
      framework::OperatorWithKernel* op_kernel =
          dynamic_cast<framework::OperatorWithKernel*>(opbase.get());
      PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel");
+      // Run grad op
+      framework::VariableValueMap grad_invars_map;
+      framework::VariableValueMap grad_outvars_map;
+      for (const auto& it : grad_input_vars_[k]) {
+        auto& grad_invars = grad_invars_map[it.first];
+        grad_invars.reserve(it.second.size());
+        for (const VarBase* grad_inp : it.second) {
+          PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr",
+                                  grad_op_desc->Type(), grad_inp->Name());
+          grad_invars.emplace_back(grad_inp->var_);
+        }
+      }
+      for (const auto& it : tmp_grad_outputs[k]) {
+        auto& grad_outvars = grad_outvars_map[it.first];
+        grad_outvars.reserve(it.second.size());
+        for (VarBase* grad_out : it.second) {
+          PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr",
+                                  grad_op_desc->Type(), grad_out->Name());
+          grad_outvars.emplace_back(grad_out->var_);
+        }
+      }
+      framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map);
      framework::Scope scope;
      PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_);
      p.op.RuntimeInferShape(scope, place_, ctx);
@@ -273,14 +307,14 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
  // Add tmp grad outputs to original grad vars
  for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
-    for (auto it : grad_output_vars_[k]) {
+    for (const auto& it : grad_output_vars_[k]) {
      auto& outputs = tmp_grad_outputs[k][it.first];
-      auto& origin_outputs = it.second;
+      const auto& origin_outputs = it.second;
      PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
      for (size_t i = 0; i < outputs.size(); ++i) {
-        framework::Variable* grad = outputs[i];
+        framework::Variable* grad = outputs[i]->var_;
-        framework::Variable* orig_grad = origin_outputs[i];
+        framework::Variable* orig_grad = origin_outputs[i]->var_;
        AddTo(grad, orig_grad, place_);
        delete grad;
      }
@@ -328,28 +362,35 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) {
 int PyLayer::NumFuncs() { return py_funcs_.size(); }
-std::vector<Variable*> PyLayer::Apply(int func_id,
+std::vector<framework::Variable*> PyLayer::Apply(
-                                      const std::vector<VarBase*>& inputs) {
+    int func_id, const std::vector<VarBase*>& inputs) {
-  std::vector<framework::Variable*> invars;
-  for (const VarBase* in : inputs) {
-    invars.push_back(in->var_);
-  }
  PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
-  return CallPythonFunc(py_funcs_[func_id], invars);
+  return CallPythonFunc(py_funcs_[func_id], inputs);
 }
-std::vector<Variable*> PyLayer::ApplyGrad(
+std::vector<VarBase*> PyLayer::ApplyGrad(int func_id,
-    int func_id, const std::vector<framework::Variable*>& inputs) {
+                                         const std::vector<VarBase*>& inputs) {
  PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end());
-  return CallPythonFunc(py_funcs_[func_id], inputs);
+  auto rets = CallPythonFunc(py_funcs_[func_id], inputs);
+  std::vector<VarBase*> outs;
+  outs.reserve(rets.size());
+  for (size_t i = 0U; i != rets.size(); ++i) {
+    outs.emplace_back(new VarBase(
+        string::Sprintf("%s_out_%d", framework::GradVarName(PyLayer::kFwdOut),
+                        i),
+        rets[i], nullptr, true));
+  }
+  return outs;
 }
 std::vector<framework::Variable*> PyLayer::CallPythonFunc(
-    const py::object& callable, const std::vector<framework::Variable*>& ins) {
+    const py::object& callable, const std::vector<VarBase*>& ins) {
  py::gil_scoped_acquire guard;
  py::tuple in_args(ins.size());
  for (size_t i = 0; i < ins.size(); ++i) {
-    const framework::LoDTensor& t = ins[i]->Get<framework::LoDTensor>();
+    const framework::LoDTensor& t = ins[i]->var_->Get<framework::LoDTensor>();
    in_args[i] = t.IsInitialized() ? py::cast(t) : py::cast(nullptr);
  }
  VLOG(3) << "pyfunc in " << py::len(in_args);
@@ -359,6 +400,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc(
  auto ret_tuple = py::cast<py::tuple>(ret);
  size_t ret_num = py::len(ret_tuple);
  std::vector<framework::Variable*> outs;
+  outs.reserve(ret_num);
  VLOG(3) << "pyfunc out " << ret_num;
  for (size_t i = 0; i < ret_num; ++i) {
    try {
@@ -369,7 +411,7 @@ std::vector<framework::Variable*> PyLayer::CallPythonFunc(
      auto* tensor = var->GetMutable<framework::LoDTensor>();
      tensor->ShareDataWith(*py_out_tensor);
      tensor->set_lod(py_out_tensor->lod());
-      outs.push_back(var);
+      outs.emplace_back(var);
    } catch (py::cast_error&) {
      PADDLE_THROW("The %d-th output must be LoDTensor", i);
    }

--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -18,14 +18,16 @@
 #include "paddle/fluid/framework/python_headers.h"
 // clang-format on
-#include <map>     // NOLINT
+#include <map>            // NOLINT
-#include <string>  // NOLINT
+#include <string>         // NOLINT
-#include <vector>  // NOLINT
+#include <vector>         // NOLINT
-#include <memory>  // NOLINT
+#include <memory>         // NOLINT
+#include <unordered_map>  // NOLINT
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/operators/math/math_function.h"
@@ -135,13 +137,13 @@ class VarBase {
                persistable) {}
 private:
+  // TODO(minqiyang): need support SelectedRows
  VarBase(const std::string& name, framework::proto::VarType::Type dtype,
          const framework::DDim& shape, const platform::Place& place,
          framework::Variable* var, VarBase* grad, bool stop_gradient,
          bool persistable)
      : name_(name),
-        dtype_(dtype),
+        type_(framework::proto::VarType::LOD_TENSOR),
-        place_(place),
        var_(var),
        grads_(grad),
        stop_gradient_(stop_gradient),
@@ -151,10 +153,12 @@ class VarBase {
        pre_op_out_idx_(-1) {
    if (!var_) {
      var_ = new framework::Variable();
-      auto tensor = var_->GetMutable<framework::LoDTensor>();
-      tensor->Resize(shape);
-      tensor->mutable_data(place_, dtype_);
    }
+    auto tensor = var_->GetMutable<framework::LoDTensor>();
+    tensor->Resize(shape);
+    tensor->mutable_data(place, dtype);
+    VLOG(10) << "create varbase: " << name_ << " type: " << dtype
+             << " place: " << place;
  }
 public:
@@ -184,7 +188,23 @@ class VarBase {
    }
  }
-  inline framework::proto::VarType::Type DType() const { return dtype_; }
+  inline framework::DDim Dims() const {
+    return var_->Get<framework::LoDTensor>().dims();
+  }
+  // data type, e.g. FP32
+  inline void SetDataType(framework::proto::VarType::Type type) {
+    auto tensor = var_->GetMutable<framework::LoDTensor>();
+    tensor->mutable_data(tensor->place(), type);
+  }
+  // Returns the element data type of the LoDTensor held by var_.
+  inline framework::proto::VarType::Type DataType() const {
+    // Bind by const reference: a plain `auto` would copy the tensor object
+    // just to read its type.
+    const auto& tensor = var_->Get<framework::LoDTensor>();
+    return tensor.type();
+  }
+  // tensor type, e.g. LoDTensor
+  inline void SetType(framework::proto::VarType::Type type) { type_ = type; }
+  inline framework::proto::VarType::Type Type() const { return type_; }
  inline void SetStopGradient(bool stop_gradient) {
    stop_gradient_ = stop_gradient;
@@ -238,7 +258,7 @@ class VarBase {
  }
  std::string name_;
-  framework::proto::VarType::Type dtype_;
+  framework::proto::VarType::Type type_;
  platform::Place place_;
  framework::Variable* var_;
@@ -294,17 +314,23 @@ class PYBIND11_HIDDEN OpBase {
  void InvokeBackwardHooks();
-  void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) {
+  void TrackPreOp(const std::string& inp_name,
-    if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
+                  const std::vector<VarBase*>& inputs) {
-      VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
+    auto& pre_ops_list = pre_ops_[inp_name];
-              << inp_name;
+    pre_ops_list.reserve(inputs.size());
-      pre_ops_[inp_name].push_back(inp_var->PreOp());
+    auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name];
-      pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx());
+    for (VarBase* inp_var : inputs) {
-    } else {
+      if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
-      VLOG(3) << "no pre op in slot " << inp_name
+        VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
-              << " input var stop_gradient: " << inp_var->IsStopGradient();
+                << inp_name;
-      pre_ops_[inp_name].push_back(nullptr);
+        pre_ops_list.emplace_back(inp_var->PreOp());
-      // pre_ops_out_idx_[inp_name].push_back(-1);
+        pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx());
+      } else {
+        VLOG(3) << "no pre op in slot " << inp_name
+                << " input var stop_gradient: " << inp_var->IsStopGradient();
+        pre_ops_list.emplace_back(nullptr);
+        // pre_ops_out_idx_list.push_back(-1);
+      }
    }
  }
@@ -328,11 +354,13 @@ class PYBIND11_HIDDEN OpBase {
  std::map<std::string, std::vector<int>> pre_ops_out_idx_;
  // Inputs to a vector of bwd ops.
-  std::vector<framework::VariableValueMap> grad_input_vars_;
+  std::vector<VarBasePtrMap> grad_input_vars_;
  // Outputs to a vector of bwd ops.
-  std::vector<framework::VariableValueMap> grad_output_vars_;
+  std::vector<VarBasePtrMap> grad_output_vars_;
  std::vector<py::object> backward_hooks_;
+  framework::AttributeMap attrs_;
 };
 class Layer {
@@ -359,12 +387,131 @@ class PyLayer {
  static std::vector<framework::Variable*> Apply(
      int func_id, const std::vector<VarBase*>& inputs);
-  static std::vector<framework::Variable*> ApplyGrad(
+  static std::vector<VarBase*> ApplyGrad(int func_id,
-      int func_id, const std::vector<framework::Variable*>& inputs);
+                                         const std::vector<VarBase*>& inputs);
 private:
  static std::vector<framework::Variable*> CallPythonFunc(
-      const py::object& callable, const std::vector<framework::Variable*>& ins);
+      const py::object& callable, const std::vector<VarBase*>& ins);
+};
+// Infer-var-type context for imperative mode. It answers the
+// framework::InferVarTypeContext queries from the VarBase objects bound to an
+// op's input/output slots; the base class is constructed with null op and
+// block pointers.
+class PYBIND11_HIDDEN RuntimeInferVarTypeContext
+    : public framework::InferVarTypeContext {
+ public:
+  // inputs/outputs map slot names to the variables bound to them and
+  // attrs_map holds the op attributes. Only the pointers are stored, so the
+  // pointed-to maps must stay valid for as long as the context is used.
+  RuntimeInferVarTypeContext(const imperative::VarBasePtrMap* inputs,
+                             imperative::VarBasePtrMap* outputs,
+                             const framework::AttributeMap* attrs_map)
+      : InferVarTypeContext(nullptr, nullptr),
+        inputs_(inputs),
+        outputs_(outputs),
+        attrs_(attrs_map),
+        input_names_(),
+        output_names_(),
+        var_set_() {
+    // Both maps are dereferenced below, so reject null pointers up front.
+    PADDLE_ENFORCE_NOT_NULL(inputs_);
+    PADDLE_ENFORCE_NOT_NULL(outputs_);
+    input_names_.reserve(inputs_->size());
+    for (const auto& it : *inputs_) {
+      // Register the slot even when it is empty so that Input() agrees with
+      // HasInput().
+      auto& names = input_names_[it.first];
+      names.reserve(it.second.size());
+      for (imperative::VarBase* var : it.second) {
+        names.emplace_back(var->Name());
+        var_set_[var->Name()] = var;
+      }
+    }
+    output_names_.reserve(outputs_->size());
+    for (const auto& it : *outputs_) {
+      // Same as above: keep Output() consistent with HasOutput().
+      auto& names = output_names_[it.first];
+      names.reserve(it.second.size());
+      for (imperative::VarBase* var : it.second) {
+        names.emplace_back(var->Name());
+        var_set_[var->Name()] = var;
+      }
+    }
+  }
+  virtual ~RuntimeInferVarTypeContext() {}
+  // Returns the attribute stored under `name`; at() throws if it is missing.
+  framework::Attribute GetAttr(const std::string& name) const override {
+    PADDLE_ENFORCE_NOT_NULL(attrs_);
+    return attrs_->at(name);
+  }
+  // True when a variable called `name` is bound to any input or output slot.
+  bool HasVar(const std::string& name) const override {
+    return var_set_.count(name) > 0;
+  }
+  // True when the input map has a slot called `name`.
+  bool HasInput(const std::string& name) const override {
+    PADDLE_ENFORCE_NOT_NULL(inputs_);
+    return inputs_->count(name) > 0;
+  }
+  // True when the output map has a slot called `name`.
+  bool HasOutput(const std::string& name) const override {
+    PADDLE_ENFORCE_NOT_NULL(outputs_);
+    return outputs_->count(name) > 0;
+  }
+  // Names of the variables bound to input slot `name`.
+  const std::vector<std::string>& Input(
+      const std::string& name) const override {
+    return input_names_.at(name);
+  }
+  // Names of the variables bound to output slot `name`.
+  const std::vector<std::string>& Output(
+      const std::string& name) const override {
+    return output_names_.at(name);
+  }
+  framework::proto::VarType::Type GetType(
+      const std::string& name) const override {
+    return var_set_.at(name)->Type();
+  }
+  void SetType(const std::string& name,
+               framework::proto::VarType::Type type) override {
+    // at() rather than operator[]: an unknown name must not insert a null
+    // entry into var_set_ and then be dereferenced.
+    var_set_.at(name)->SetType(type);
+  }
+  framework::proto::VarType::Type GetDataType(
+      const std::string& name) const override {
+    return var_set_.at(name)->DataType();
+  }
+  void SetDataType(const std::string& name,
+                   framework::proto::VarType::Type type) override {
+    // at() for the same reason as in SetType.
+    var_set_.at(name)->SetDataType(type);
+  }
+  // The remaining queries are not supported by this context and always throw.
+  std::vector<framework::proto::VarType::Type> GetDataTypes(
+      const std::string& name) const override {
+    PADDLE_THROW("GetDataTypes is not supported in runtime InferVarType");
+  }
+  void SetDataTypes(const std::string& name,
+                    const std::vector<framework::proto::VarType::Type>&
+                        multiple_data_type) override {
+    PADDLE_THROW("SetDataTypes is not supported in runtime InferVarType");
+  }
+  std::vector<int64_t> GetShape(const std::string& name) const override {
+    PADDLE_THROW("Do not handle Shape in runtime InferVarType");
+  }
+  void SetShape(const std::string& name,
+                const std::vector<int64_t>& dims) override {
+    PADDLE_THROW("Do not handle Shape in runtime InferVarType");
+  }
+  int32_t GetLoDLevel(const std::string& name) const override {
+    PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType");
+  }
+  void SetLoDLevel(const std::string& name, int32_t lod_level) override {
+    PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType");
+  }
+ private:
+  const imperative::VarBasePtrMap* inputs_;
+  imperative::VarBasePtrMap* outputs_;
+  const framework::AttributeMap* attrs_;
+  // slot name -> names of the variables bound to that slot
+  std::unordered_map<std::string, std::vector<std::string>> input_names_;
+  std::unordered_map<std::string, std::vector<std::string>> output_names_;
+  // variable name -> variable, over all input and output slots
+  std::unordered_map<std::string, imperative::VarBase*> var_set_;
 };
 }  // namespace imperative

--- a/paddle/fluid/imperative/profiler.cc
+++ b/paddle/fluid/imperative/profiler.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/imperative/profiler.h"
+#ifdef WITH_GPERFTOOLS
+#include "gperftools/profiler.h"
+#endif
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <mutex>   // NOLINT
+#include <thread>  // NOLINT
+DEFINE_string(
+    tracer_profile_fname, "xxgperf",
+    "Profiler filename for imperative tracer, which generated by gperftools."
+    "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
+namespace paddle {
+namespace imperative {
+static std::once_flag gTracerProfileOnce;
+#ifdef WITH_GPERFTOOLS
+static bool gTracerProfilerStarted = false;
+#endif
+void StartProfile() {  // starts the gperftools profiler, at most once
+  if (!FLAGS_tracer_profile_fname.empty()) {  // an empty flag disables it
+    std::call_once(gTracerProfileOnce, [] {  // later calls skip this body
+#ifdef WITH_GPERFTOOLS
+      ProfilerStart(FLAGS_tracer_profile_fname.c_str());  // flag = file name
+      gTracerProfilerStarted = true;
+#else
+      LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                      "FLAGS_tracer_profile_fname will be ignored";
+#endif
+    });
+  }
+}
+void StopProfile() {  // only flushes; ProfilerStop is never called here
+#ifdef WITH_GPERFTOOLS
+  ProfilerFlush();  // NOTE(review): not guarded by gTracerProfilerStarted, unlike the removed tracer.cc code; confirm this is intended
+#else
+  LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                  "FLAGS_tracer_profile_fname will be ignored";
+#endif
+}
+}  // namespace imperative
+}  // namespace paddle
--- a/paddle/fluid/imperative/profiler.h
+++ b/paddle/fluid/imperative/profiler.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+namespace paddle {
+namespace imperative {
+extern void StartProfile();
+extern void StopProfile();
+}  // namespace imperative
+}  // namespace paddle
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -19,38 +19,26 @@
 #include <unordered_map>
 #include <unordered_set>
+#include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
-#ifdef WITH_GPERFTOOLS
-#include "gperftools/profiler.h"
-#endif
-DEFINE_string(
-    tracer_profile_fname, "",
-    "Profiler filename for imperative tracer, which generated by gperftools."
-    "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
 namespace paddle {
 namespace imperative {
-static std::once_flag gTracerProfileOnce;
-#ifdef WITH_GPERFTOOLS
-static bool gTracerProfilerStarted = false;
-#endif
 void CreateGradOp(const framework::OpDesc& op_desc,
                  const std::unordered_set<std::string>& no_grad_set,
                  const std::vector<framework::BlockDesc*>& grad_sub_block,
                  std::vector<framework::OpDesc*>* grad_op_descs,
                  std::unordered_map<std::string, std::string>* grad_to_var) {
  PADDLE_ENFORCE(grad_op_descs->empty());
-  std::vector<std::unique_ptr<framework::OpDesc>> descs =
+  const framework::OpInfo& op_info =
-      framework::OpInfoMap::Instance()
+      framework::OpInfoMap::Instance().Get(op_desc.Type());
-          .Get(op_desc.Type())
+  if (!op_info.grad_op_maker_) return;
-          .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
+  std::vector<std::unique_ptr<framework::OpDesc>> descs =
+      op_info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block);
  for (auto& desc : descs) {
    grad_op_descs->emplace_back(desc.release());
  }
@@ -145,31 +133,13 @@ framework::VariableNameMap CreateOutputVarNameMap(
  return result;
 }
-Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
+Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
-  if (!FLAGS_tracer_profile_fname.empty()) {
-    std::call_once(gTracerProfileOnce, [] {
-#ifdef WITH_GPERFTOOLS
-      ProfilerStart(FLAGS_tracer_profile_fname.c_str());
-      gTracerProfilerStarted = true;
-#else
-      LOG(WARNING) << "Paddle is not compiled with gperftools. "
-                      "FLAGS_tracer_profile_fname will be ignored";
-#endif
-    });
-  }
-}
 std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
-                                    const VarBasePtrMap& outputs,
+                                    VarBasePtrMap* outputs,
                                    framework::AttributeMap attrs_map,
                                    const platform::Place expected_place,
                                    const bool stop_gradient) {
-#ifdef WITH_GPERFTOOLS
-  if (gTracerProfilerStarted) {
-    ProfilerFlush();
-  }
-#endif
  framework::VariableValueMap invars_map;
  framework::VariableValueMap outvars_map;
@@ -184,7 +154,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
                              inp->Name());
      invars.emplace_back(inp->var_);
-      op->TrackPreOp(inp, it.first);
      if (!stop_gradient) {
        current_vars_map[inp->Name()] = inp;
      }
@@ -192,9 +161,10 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
              << " inited: " << inp->var_->IsInitialized()
              << " stop_grad: " << inp->IsStopGradient();
    }
+    op->TrackPreOp(it.first, it.second);
  }
-  op->output_vars_ = outputs;
+  op->output_vars_ = *outputs;
  for (auto it : op->output_vars_) {
    auto& outvars = outvars_map[it.first];
    const std::vector<VarBase*>& outputs = it.second;
@@ -217,7 +187,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
  framework::VariableNameMap invars_name_map =
      CreateInputVarNameMap(op, inputs);
  framework::VariableNameMap outvars_name_map =
-      CreateOutputVarNameMap(op, outputs);
+      CreateOutputVarNameMap(op, *outputs);
  auto& info = framework::OpInfoMap::Instance().Get(op->Type());
  if (info.Checker() != nullptr) {
@@ -228,6 +198,11 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
      framework::OpRegistry::CreateOp(op->Type(), invars_name_map,
                                      outvars_name_map, attrs_map);
+  if (info.infer_var_type_) {
+    RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, outputs, &attrs_map);
+    info.infer_var_type_(&infer_var_type_ctx);
+  }
  // TODO(minqiyang): Support infer var type in imperative mode
  // Run forward op
  VLOG(3) << "tracer running " << op->Type();
@@ -252,6 +227,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
    VLOG(5) << "start construct backward op";
    // construct grad op descs
+    op->attrs_ = attrs_map;
    std::unique_ptr<framework::OpDesc> fwd_op_desc(new framework::OpDesc(
        op->Type(), invars_name_map, outvars_name_map, attrs_map));
    std::unique_ptr<std::unordered_map<std::string, std::string>> grad_to_var(
@@ -278,12 +254,12 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
            auto fwd_var_it = current_vars_map.find(grad_invar);
            PADDLE_ENFORCE(fwd_var_it != current_vars_map.end());
            // Forward inputs or outputs.
-            grad_in_vars.emplace_back(fwd_var_it->second->var_);
+            grad_in_vars.emplace_back(fwd_var_it->second);
          } else {
            VarBase* var = current_vars_map[var_it->second];
            InitGrad(var, prepared_op.GetDeviceContext());
            // Douts.
-            grad_in_vars.emplace_back(var->grads_->var_);
+            grad_in_vars.emplace_back(var->grads_);
          }
          vars_saved_for_backward.insert(it.first);
@@ -300,7 +276,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
                         op->Type());
          VarBase* var = current_vars_map[var_it->second];
          InitGrad(var, prepared_op.GetDeviceContext());
-          grad_out_vars.push_back(var->grads_->var_);
+          grad_out_vars.push_back(var->grads_);
        }
      }
    }
@@ -319,9 +295,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
  std::vector<framework::Variable*> ret_vars =
      PyLayer::Apply(op->forward_id_, inputs);
-  for (VarBase* inp : inputs) {
+  op->TrackPreOp(PyLayer::kFwdInp, inputs);
-    op->TrackPreOp(inp, PyLayer::kFwdInp);
-  }
  std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut];
  outputs.reserve(ret_vars.size());
@@ -342,23 +316,23 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
    auto& grad_output_vars =
        op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)];
-    for (const VarBase* inp : inputs) {
+    for (VarBase* inp : inputs) {
-      grad_input_vars.push_back(inp->var_);
+      grad_input_vars.push_back(inp);
    }
    for (VarBase* out : outputs) {
-      grad_input_vars.push_back(out->var_);
+      grad_input_vars.push_back(out);
    }
    // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now
    platform::CPUPlace place;
    for (VarBase* out : outputs) {
      InitGrad(out, platform::DeviceContextPool::Instance().Get(place));
-      grad_input_vars.push_back(out->grads_->var_);
+      grad_input_vars.push_back(out->grads_);
    }
    for (VarBase* inp : inputs) {
      InitGrad(inp, platform::DeviceContextPool::Instance().Get(place));
-      grad_output_vars.push_back(inp->grads_->var_);
+      grad_output_vars.push_back(inp->grads_);
    }
  }
  return outputs;

--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -48,7 +48,7 @@ class Tracer {
  virtual ~Tracer() {}
  std::set<std::string> Trace(OpBase* op, const VarBasePtrMap& inputs,
-                              const VarBasePtrMap& outputs,
+                              VarBasePtrMap* outputs,  // NOLINT
                              framework::AttributeMap attrs_map,
                              const platform::Place expected_place,
                              const bool stop_gradient = false);

--- a/paddle/fluid/imperative/type_defs.h
+++ b/paddle/fluid/imperative/type_defs.h
@@ -25,6 +25,7 @@ class VarBase;
 class OpBase;
 typedef std::map<std::string, std::vector<VarBase*>> VarBasePtrMap;
+typedef std::map<std::string, std::vector<const VarBase*>> ConstVarBasePtrMap;
 typedef std::map<std::string, std::vector<OpBase*>> OpBasePtrMap;
 }  // namespace imperative

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -91,5 +91,5 @@ if(WITH_TESTING)
  add_subdirectory(tests/book)
  if(WITH_INFERENCE_API_TEST)
    add_subdirectory(tests/api)
-  endif()  
+  endif()
 endif()
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -27,6 +27,7 @@
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
@@ -38,7 +39,10 @@
 namespace paddle {
 namespace inference {
 namespace analysis {
 using framework::ir::Graph;
+using VarQuantScale =
+    std::unordered_map<std::string, std::pair<bool, framework::LoDTensor>>;
 /*
 * The argument definition of both Pass and PassManagers.
@@ -127,6 +131,8 @@ struct Argument {
  // Pass a set of op types to enable its mkldnn kernel
  DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                      std::unordered_set<std::string>);
+  // Scales for variables to be quantized
+  DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale);
  // Passed from config.
  DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);

--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -14,6 +14,7 @@
 #include "paddle/fluid/inference/analysis/ir_pass_manager.h"
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph.h"
@@ -55,14 +56,14 @@ void IRPassManager::CreatePasses(Argument *argument,
                                  ".dot";
      pass->Set("graph_viz_path", new std::string(std::move(dot_file_path)));
      pass_num++;
-    }
+    } else if (pass_name == "mkldnn_placement_pass") {
-    if (pass_name == "mkldnn_placement_pass") {
      pass->Set("mkldnn_enabled_op_types",
                new std::unordered_set<std::string>(
                    argument->mkldnn_enabled_op_types()));
-    }
+    } else if (pass_name == "cpu_quantize_pass") {
+      pass->Set("quant_var_scales",
-    if (pass_name == "tensorrt_subgraph_pass") {
+                new VarQuantScale(argument->quant_var_scales()));
+    } else if (pass_name == "tensorrt_subgraph_pass") {
      pass->Set("workspace_size", new int(argument->tensorrt_workspace_size()));
      pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size()));
      pass->Set("min_subgraph_size",

--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -118,6 +118,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  CP_MEMBER(serialized_info_cache_);
+  // framework related.
+  CP_MEMBER(enable_runtime_context_cache_);
  if (use_gpu_) {
    pass_builder_.reset(new GpuPassStrategy(
        *static_cast<GpuPassStrategy *>(other.pass_builder())));
@@ -219,12 +222,23 @@ void AnalysisConfig::Update() {
  }
  if (enable_memory_optim_) {
-    pass_builder()->AppendAnalysisPass("memory_optimize_pass");
+    auto analysis_passes = pass_builder()->AnalysisPasses();
+    auto memory_opti_pass_name = "memory_optimize_pass";
+    bool already_exists =
+        std::find(analysis_passes.begin(), analysis_passes.end(),
+                  memory_opti_pass_name) != analysis_passes.end();
+    if (!already_exists) {
+      pass_builder()->AppendAnalysisPass(memory_opti_pass_name);
+    }
  }
  if (ir_debug_) {
    pass_builder()->TurnOnDebug();
  }
+  if (enable_runtime_context_cache_) {
+    pass_builder()->AppendPass("runtime_context_cache_pass");
+  }
 }
 std::string AnalysisConfig::SerializeInfoCache() {
@@ -258,6 +272,7 @@ std::string AnalysisConfig::SerializeInfoCache() {
  ss << specify_input_name_;
  ss << cpu_math_library_num_threads_;
+  ss << enable_runtime_context_cache_;
  return ss.str();
 }

--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -194,6 +194,23 @@ struct AnalysisConfig {
  /** Tell whether the memory optimization is activated. */
  bool enable_memory_optim() const;
+  // framework related
+  /** \brief Control whether to perform runtime context cache optimization.
+   *
+   * If turned off, a RuntimeContext is built on every execution of an Op to
+   * relate the input/output names of that Op to the corresponding variables
+   * in the Scope.
+   */
+  void SwitchRuntimeContextCache(int x = true) {
+    enable_runtime_context_cache_ = x;
+  }
+  /** A boolean state telling whether the runtime context cache optimization
+   * is activated.
+   */
+  bool runtime_context_cache_enabled() const {
+    return enable_runtime_context_cache_;
+  }
  friend class ::paddle::AnalysisPredictor;
  /** NOTE just for developer, not an official API, easily to be broken.
@@ -254,6 +271,15 @@ struct AnalysisConfig {
  int cpu_math_library_num_threads_{1};
+  // framework related
+  // RuntimeContext is used to relate input/output names of Operator with
+  // the corresponding variables in Scope.
+  // If enable_runtime_context_cache_ is true, it means that in a same Scope,
+  // since the input/output names of this Op do not change in the execution,
+  // RuntimeContext could be created only at the first iteration of this Op's
+  // execution to save the elapsed time.
+  bool enable_runtime_context_cache_{false};
  // A runtime cache, shouldn't be transferred to others.
  std::string serialized_info_cache_;

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -110,7 +110,7 @@ set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
 download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz")
 inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc 
  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8)
+  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 SERIAL)
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")

--- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
@@ -107,6 +107,7 @@ void SetConfig(AnalysisConfig *cfg) {
  cfg->DisableGpu();
  cfg->SwitchSpecifyInputNames();
  cfg->SwitchIrOptim();
+  cfg->SwitchRuntimeContextCache();
  if (FLAGS_zero_copy) {
    cfg->SwitchUseFeedFetchOps(false);
  }

--- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
@@ -183,10 +183,13 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
 }
 // Easy for profiling independently.
-TEST(Analyzer_Transformer, profile) {
+void profile(bool use_mkldnn = false) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
@@ -194,6 +197,11 @@ TEST(Analyzer_Transformer, profile) {
                 input_slots_all, &outputs, FLAGS_num_threads);
 }
+TEST(Analyzer_Transformer, profile) { profile(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); }
+#endif
 // Check the fuse status
 TEST(Analyzer_Transformer, fuse_statis) {
  AnalysisConfig cfg;
@@ -206,9 +214,12 @@ TEST(Analyzer_Transformer, fuse_statis) {
 }
 // Compare result of NativeConfig and AnalysisConfig
-TEST(Analyzer_Transformer, compare) {
+void compare(bool use_mkldnn = false) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
@@ -216,5 +227,10 @@ TEST(Analyzer_Transformer, compare) {
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
 }
+TEST(Analyzer_Transformer, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -72,7 +72,8 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) {
  }
  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
     << "\n";
-  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
+  os << GenSpaces(num_spaces)
+     << "use_runtime_context_cache: " << config.runtime_context_cache_enabled()
     << "\n";
  os << GenSpaces(num_spaces)
     << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n";

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -58,8 +58,10 @@ if (WITH_GPU)
        op_library(conv_fusion_op)
        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
    endif()
-    op_library(sync_batch_norm_op)
+    if (NOT WIN32)
-    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
+        op_library(sync_batch_norm_op)
+        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
+    endif()
 else()
    op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()

--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -178,10 +178,10 @@ Beam Search Decode Operator. This Operator constructs the full hypotheses for
 each source sentence by walking back along the LoDTensorArray Input(ids)
 whose lods can be used to restore the path in the beam search tree.
-The Output(SentenceIds) and Output(SentenceScores) separately contain the 
+The Output(SentenceIds) and Output(SentenceScores) separately contain the
-generated id sequences and the corresponding scores. The shapes and lods of the 
+generated id sequences and the corresponding scores. The shapes and lods of the
-two LodTensor are same. The lod level is 2 and the two levels separately 
+two LodTensor are same. The lod level is 2 and the two levels separately
-indicate how many hypotheses each source sentence has and how many ids each 
+indicate how many hypotheses each source sentence has and how many ids each
 hypothesis has.
 )DOC");
  }
@@ -203,15 +203,12 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase {
 class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override {
-                  framework::BlockDesc* block) const override {
+    for (auto& o : ctx->Output("SentenceIds")) {
-    for (auto& o : op_desc.Output("SentenceIds")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
-      auto& sentence_ids = block->FindRecursiveOrCreateVar(o);
-      sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR);
    }
-    for (auto& o : op_desc.Output("SentenceScores")) {
+    for (auto& o : ctx->Output("SentenceScores")) {
-      auto& sentence_scores = block->FindRecursiveOrCreateVar(o);
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
-      sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR);
    }
  }
 };

--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -65,7 +65,7 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault(true);
    AddComment(R"DOC(
-This operator does the search in beams for one time step. 
+This operator does the search in beams for one time step.
 Specifically, it selects the top-K candidate word ids of current step from
 Input(ids) according to their Input(scores) for all source sentences,
 where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results
@@ -120,15 +120,12 @@ class BeamSearchOp : public framework::OperatorWithKernel {
 class BeamSearchInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    for (auto &o : ctx->Output("selected_ids")) {
-    for (auto &o : op_desc.Output("selected_ids")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
-      auto &selected_ids = block->FindRecursiveOrCreateVar(o);
-      selected_ids.SetType(framework::proto::VarType::LOD_TENSOR);
    }
-    for (auto &o : op_desc.Output("selected_scores")) {
+    for (auto &o : ctx->Output("selected_scores")) {
-      auto &selected_scores = block->FindRecursiveOrCreateVar(o);
+      ctx->SetType(o, framework::proto::VarType::LOD_TENSOR);
-      selected_scores.SetType(framework::proto::VarType::LOD_TENSOR);
    }
  }
 };

--- a/paddle/fluid/operators/controlflow/get_places_op.cc
+++ b/paddle/fluid/operators/controlflow/get_places_op.cc
@@ -93,11 +93,9 @@ execution.
 class GetPlacesInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    for (auto &o_name : ctx->Output("Out")) {
-    for (auto &o_name : op_desc.Output("Out")) {
+      ctx->SetType(o_name, framework::proto::VarType::PLACE_LIST);
-      block->FindRecursiveOrCreateVar(o_name).SetType(
-          framework::proto::VarType::PLACE_LIST);
    }
  }
 };

--- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
@@ -100,16 +100,13 @@ class WriteToArrayInferShape : public framework::InferShapeBase {
 class WriteToArrayInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    auto x_name = ctx->Input("X")[0];
-    auto x_name = op_desc.Input("X")[0];
+    auto out_name = ctx->Output("Out")[0];
-    auto out_name = op_desc.Output("Out")[0];
    VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
-    auto &out = block->FindRecursiveOrCreateVar(out_name);
+    ctx->SetType(out_name, framework::proto::VarType::LOD_TENSOR_ARRAY);
-    out.SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
+    if (ctx->HasVar(x_name)) {
-    auto *x = block->FindVarRecursive(x_name);
+      ctx->SetDataType(out_name, ctx->GetDataType(x_name));
-    if (x != nullptr) {
-      out.SetDataType(x->GetDataType());
    }
  }
 };

--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -365,19 +365,16 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
 class WhileGradOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    auto p_names = ctx->Input(kX);
-    auto p_names = op_desc.Input(kX);
+    auto pg_ig_names = ctx->Output(framework::GradVarName(kX));
-    auto pg_ig_names = op_desc.Output(framework::GradVarName(kX));
    for (size_t i = 0; i < p_names.size(); ++i) {
-      auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
+      if (ctx->HasVar(pg_ig_names[i])) {
-      auto *g_var = block->FindVarRecursive(pg_ig_names[i]);
-      if (g_var != nullptr) {  // Gradient could be @EMPTY@
        VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i]
-                << " type: " << p_var.GetType();
+                << " type: " << ctx->GetType(p_names[i]);
-        g_var->SetType(p_var.GetType());
+        ctx->SetType(pg_ig_names[i], ctx->GetType(p_names[i]));
-        g_var->SetDataType(p_var.GetDataType());
+        ctx->SetDataType(pg_ig_names[i], ctx->GetDataType(p_names[i]));
      }
    }
  }

--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/conv_op.h"
+#include <memory>
 #include <string>
 #include <vector>
@@ -194,6 +195,12 @@ void Conv2DOpMaker::Make() {
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
+  AddAttr<bool>("use_quantizer",
+                "(bool, default false) "
+                "Set to true for operators that should be quantized and use "
+                "int8 kernel. "
+                "Only used on CPU.")
+      .SetDefault(false);
  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
  AddAttr<bool>("fuse_residual_connection",

--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -33,6 +33,7 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
+detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
 if(WITH_GPU)

--- a/paddle/fluid/operators/detection/box_coder_op.cc
+++ b/paddle/fluid/operators/detection/box_coder_op.cc
@@ -60,14 +60,15 @@ class BoxCoderOp : public framework::OperatorWithKernel {
    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
      PADDLE_ENFORCE_EQ(target_box_dims.size(), 3,
                        "The rank of Input TargetBox must be 3");
-      if (axis == 0) {
+      PADDLE_ENFORCE(axis == 0 || axis == 1, "axis must be 0 or 1");
-        PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
+      if (ctx->IsRuntime()) {
-      } else if (axis == 1) {
+        if (axis == 0) {
-        PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]);
+          PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]);
-      } else {
+        } else if (axis == 1) {
-        PADDLE_THROW("axis must be 0 or 1.");
+          PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]);
+        }
+        PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
      }
-      PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]);
      ctx->ShareDim("TargetBox", /*->*/ "OutputBox");
    }

--- a/paddle/fluid/operators/detection/yolo_box_op.cc
+++ b/paddle/fluid/operators/detection/yolo_box_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/fluid/operators/detection/yolo_box_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+// Operator definition for yolo_box: validates the inputs and attributes and
+// infers the shapes of the Boxes and Scores outputs.
+class YoloBoxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires X to be 4-D with dim[1] == anchor_num * (5 + class_num) and
+  // ImgSize to be 2-D with dim[1] == 2, then sets
+  //   Boxes  -> [X.dim[0], box_num, 4]
+  //   Scores -> [X.dim[0], box_num, class_num]
+  // where box_num == X.dim[2] * X.dim[3] * anchor_num and anchor_num is half
+  // the length of Attr(anchors).
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of YoloBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ImgSize"),
+                   "Input(ImgSize) of YoloBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Boxes"),
+                   "Output(Boxes) of YoloBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Scores"),
+                   "Output(Scores) of YoloBoxOp should not be null.");
+    auto dim_x = ctx->GetInputDim("X");
+    auto dim_imgsize = ctx->GetInputDim("ImgSize");
+    auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
+    auto class_num = ctx->Attrs().Get<int>("class_num");
+    // Validate the attributes before anything is derived from them, so that
+    // e.g. an empty anchors list (the attribute's default) is reported as an
+    // attribute error rather than as a channel-count mismatch on Input(X).
+    PADDLE_ENFORCE_GT(anchors.size(), 0,
+                      "Attr(anchors) length should be greater than 0.");
+    PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
+                      "Attr(anchors) length should be even integer.");
+    PADDLE_ENFORCE_GT(class_num, 0,
+                      "Attr(class_num) should be an integer greater than 0.");
+    // Anchors are stored as consecutive (width, height) pairs.
+    int anchor_num = anchors.size() / 2;
+    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        dim_x[1], anchor_num * (5 + class_num),
+        "Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
+        "+ class_num)).");
+    PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2,
+                      "Input(ImgSize) should be a 2-D tensor.");
+    // The batch sizes can only be compared once both are known; while the
+    // program is being built either may still be a non-positive placeholder.
+    if (ctx->IsRuntime() || (dim_imgsize[0] > 0 && dim_x[0] > 0)) {
+      PADDLE_ENFORCE_EQ(
+          dim_imgsize[0], dim_x[0],
+          "Input(ImgSize) dim[0] and Input(X) dim[0] should be same.");
+    }
+    PADDLE_ENFORCE_EQ(dim_imgsize[1], 2, "Input(ImgSize) dim[1] should be 2.");
+    // If H or W is still unknown, the box count is unknown as well. Multiplying
+    // two negative placeholders would otherwise yield a bogus positive count.
+    int box_num = -1;
+    if (ctx->IsRuntime() || (dim_x[2] > 0 && dim_x[3] > 0)) {
+      box_num = dim_x[2] * dim_x[3] * anchor_num;
+    }
+    std::vector<int64_t> dim_boxes({dim_x[0], box_num, 4});
+    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_boxes));
+    std::vector<int64_t> dim_scores({dim_x[0], box_num, class_num});
+    ctx->SetOutputDim("Scores", framework::make_ddim(dim_scores));
+  }
+ protected:
+  // The kernel is selected from the data type of Input(X) and the place the
+  // op is executed on.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace());
+  }
+};
+// Declares the proto of the yolo_box operator: inputs X and ImgSize, outputs
+// Boxes and Scores, attributes class_num, anchors, downsample_ratio and
+// conf_thresh, plus the user-facing operator comment.
+class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  // Registers every input, output and attribute together with its
+  // description. anchors defaults to an empty list, downsample_ratio to 32
+  // and conf_thresh to 0.01; class_num has no default.
+  void Make() override {
+    AddInput("X",
+             "The input tensor of YoloBox operator is a 4-D tensor with "
+             "shape of [N, C, H, W]. The second dimension(C) stores "
+             "box locations, confidence score and classification one-hot "
+             "keys of each anchor box. Generally, X should be the output "
+             "of YOLOv3 network.");
+    AddInput("ImgSize",
+             "The image size tensor of YoloBox operator, "
+             "This is a 2-D tensor with shape of [N, 2]. This tensor holds "
+             "height and width of each input image used for resizing output "
+             "box in input image scale.");
+    AddOutput("Boxes",
+              "The output tensor of detection boxes of YoloBox operator, "
+              "This is a 3-D tensor with shape of [N, M, 4], N is the "
+              "batch num, M is output box number, and the 3rd dimension "
+              "stores [xmin, ymin, xmax, ymax] coordinates of boxes.");
+    AddOutput("Scores",
+              "The output tensor of detection boxes scores of YoloBox "
+              "operator, This is a 3-D tensor with shape of "
+              "[N, M, :attr:`class_num`], N is the batch num, M is "
+              "output box number.");
+    AddAttr<int>("class_num", "The number of classes to predict.");
+    AddAttr<std::vector<int>>("anchors",
+                              "The anchor width and height, "
+                              "it will be parsed pair by pair.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("downsample_ratio",
+                 "The downsample ratio from network input to YoloBox operator "
+                 "input, so 32, 16, 8 should be set for the first, second, "
+                 "and thrid YoloBox operators.")
+        .SetDefault(32);
+    AddAttr<float>("conf_thresh",
+                   "The confidence scores threshold of detection boxes. "
+                   "Boxes with confidence scores under threshold should "
+                   "be ignored.")
+        .SetDefault(0.01);
+    // The text below is the operator comment attached to the op proto.
+    AddComment(R"DOC(
+         This operator generates YOLO detection boxes from output of YOLOv3 network.
+         The output of previous network is in shape [N, C, H, W], while H and W
+         should be the same, H and W specify the grid size, each grid point predict 
+         given number boxes, this given number, which following will be represented as S,
+         is specified by the number of anchors. In the second dimension(the channel
+         dimension), C should be equal to S * (5 + class_num), class_num is the object 
+         category number of source dataset(such as 80 in coco dataset), so the 
+         second(channel) dimension, apart from 4 box location coordinates x, y, w, h, 
+         also includes confidence score of the box and class one-hot key of each anchor 
+         box.
+         Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box 
+         predictions should be as follows:
+         $$
+         b_x = \\sigma(t_x) + c_x
+         $$
+         $$
+         b_y = \\sigma(t_y) + c_y
+         $$
+         $$
+         b_w = p_w e^{t_w}
+         $$
+         $$
+         b_h = p_h e^{t_h}
+         $$
+         in the equation above, :math:`c_x, c_y` is the left top corner of current grid
+         and :math:`p_w, p_h` is specified by anchors.
+         The logistic regression value of the 5th channel of each anchor prediction boxes
+         represents the confidence score of each prediction box, and the logistic
+         regression value of the last :attr:`class_num` channels of each anchor prediction 
+         boxes represents the classifcation scores. Boxes with confidence scores less than
+         :attr:`conf_thresh` should be ignored, and box final scores is the product of 
+         confidence scores and classification scores.
+         $$
+         score_{pred} = score_{conf} * score_{class}
+         $$
+         )DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// Register the yolo_box operator with its proto maker. EmptyGradOpMaker is
+// passed as the grad-op maker (presumably meaning no backward op is
+// generated -- confirm against the framework).
+REGISTER_OPERATOR(yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+// CPU kernels are provided for float and double.
+REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel<float>,
+                       ops::YoloBoxKernel<double>);
--- a/paddle/fluid/operators/detection/yolo_box_op.cu
+++ b/paddle/fluid/operators/detection/yolo_box_op.cu
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/detection/yolo_box_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+// CUDA kernel computing yolo_box outputs. Each thread starts at its global
+// index and advances by the total number of launched threads until all
+// n * box_num candidate boxes are visited. Candidates whose confidence is
+// below conf_thresh are skipped, so their output entries are left untouched.
+template <typename T>
+__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes,
+                            T* scores, const float conf_thresh,
+                            const int* anchors, const int n, const int h,
+                            const int w, const int an_num, const int class_num,
+                            const int box_num, int input_size) {
+  const int first = blockIdx.x * blockDim.x + threadIdx.x;
+  const int step = blockDim.x * gridDim.x;
+  const int total = n * box_num;
+  T box[4];
+  for (int idx = first; idx < total; idx += step) {
+    const int grid_num = h * w;
+    // Decompose the flat index into (image, anchor, row, column).
+    const int img = idx / box_num;
+    const int an = (idx % box_num) / grid_num;
+    const int row = (idx % grid_num) / w;
+    const int col = idx % w;
+    const int hw = row * w + col;
+    const int an_stride = (5 + class_num) * grid_num;
+    // ImgSize stores (height, width) per image.
+    const int img_height = imgsize[2 * img];
+    const int img_width = imgsize[2 * img + 1];
+    // Entry 4 of an anchor's channel group holds the confidence logit.
+    const int obj_idx =
+        GetEntryIndex(img, an, hw, an_num, an_stride, grid_num, 4);
+    const T conf = sigmoid<T>(input[obj_idx]);
+    if (conf < conf_thresh) {
+      continue;
+    }
+    // Entries 0..3 hold the raw box location values.
+    const int loc_idx =
+        GetEntryIndex(img, an, hw, an_num, an_stride, grid_num, 0);
+    GetYoloBox<T>(box, input, anchors, col, row, an, h, input_size, loc_idx,
+                  grid_num, img_height, img_width);
+    const int out_box_idx = (img * box_num + an * grid_num + row * w + col) * 4;
+    CalcDetectionBox<T>(boxes, box, out_box_idx, img_height, img_width);
+    // Entries 5.. hold the per-class logits.
+    const int label_idx =
+        GetEntryIndex(img, an, hw, an_num, an_stride, grid_num, 5);
+    const int score_idx =
+        (img * box_num + an * grid_num + row * w + col) * class_num;
+    CalcLabelScore<T>(scores, input, label_idx, score_idx, class_num, conf,
+                      grid_num);
+  }
+}
+// CUDA implementation of the yolo_box operator: copies the anchors to the
+// device, zero-fills both outputs and launches KeYoloBoxFw on the device
+// context's stream.
+template <typename T>
+class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* im_size = ctx.Input<Tensor>("ImgSize");
+    auto* out_boxes = ctx.Output<Tensor>("Boxes");
+    auto* out_scores = ctx.Output<Tensor>("Scores");
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    int class_num = ctx.Attr<int>("class_num");
+    float conf_thresh = ctx.Attr<float>("conf_thresh");
+    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+    const int batch = x->dims()[0];
+    const int height = x->dims()[2];
+    const int width = x->dims()[3];
+    const int box_num = out_boxes->dims()[1];
+    const int an_num = anchors.size() / 2;
+    // The network input size is derived from the feature-map height only.
+    int input_size = downsample_ratio * height;
+    auto& dev_ctx = ctx.cuda_device_context();
+    // Stage the anchors in temporary device memory so the kernel can read
+    // them.
+    auto& tmp_allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    int anchor_bytes = sizeof(int) * anchors.size();
+    auto anchor_buf = tmp_allocator.Allocate(sizeof(int) * anchors.size());
+    int* dev_anchors = reinterpret_cast<int*>(anchor_buf->ptr());
+    const auto dst_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    const auto src_place = platform::CPUPlace();
+    memory::Copy(dst_place, dev_anchors, src_place, anchors.data(),
+                 anchor_bytes, dev_ctx.stream());
+    const T* x_data = x->data<T>();
+    const int* im_size_data = im_size->data<int>();
+    T* boxes_data =
+        out_boxes->mutable_data<T>({batch, box_num, 4}, ctx.GetPlace());
+    T* scores_data = out_scores->mutable_data<T>({batch, box_num, class_num},
+                                                 ctx.GetPlace());
+    // The kernel skips low-confidence candidates, so both outputs are
+    // zero-filled first.
+    math::SetConstant<platform::CUDADeviceContext, T> fill;
+    fill(dev_ctx, out_boxes, static_cast<T>(0));
+    fill(dev_ctx, out_scores, static_cast<T>(0));
+    // 512 threads per block, at most 8 blocks; the kernel loops over any
+    // remaining candidates.
+    const int threads = 512;
+    int blocks = (batch * box_num + threads - 1) / threads;
+    if (blocks > 8) {
+      blocks = 8;
+    }
+    KeYoloBoxFw<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+        x_data, im_size_data, boxes_data, scores_data, conf_thresh,
+        dev_anchors, batch, height, width, an_num, class_num, box_num,
+        input_size);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// CUDA kernels for yolo_box are provided for float and double.
+REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel<float>,
+                        ops::YoloBoxOpCUDAKernel<double>);
--- a/paddle/fluid/operators/detection/yolo_box_op.h
+++ b/paddle/fluid/operators/detection/yolo_box_op.h
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/hostdevice.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+// Logistic function 1 / (1 + exp(-x)). The arithmetic around std::exp uses
+// double literals; the result is converted back to T on return.
+template <typename T>
+HOSTDEVICE inline T sigmoid(T x) {
+  const auto denom = 1.0 + std::exp(-x);
+  return 1.0 / denom;
+}
+// Decodes one raw prediction into a (center x, center y, width, height) box
+// scaled to the image.
+//   box        - output array of 4 values
+//   x          - raw input data; the 4 location values for this cell sit at
+//                index, index + stride, index + 2*stride, index + 3*stride
+//   anchors    - (width, height) pairs; an_idx selects the pair
+//   i, j       - column and row offsets added to the sigmoid of the x / y
+//                location values
+//   grid_size  - divisor for both center coordinates (callers pass the
+//                feature-map height)
+//   input_size - divisor for the anchor-scaled width and height
+template <typename T>
+HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i,
+                                  int j, int an_idx, int grid_size,
+                                  int input_size, int index, int stride,
+                                  int img_height, int img_width) {
+  const int anchor_w = anchors[2 * an_idx];
+  const int anchor_h = anchors[2 * an_idx + 1];
+  // Box center.
+  box[0] = (i + sigmoid<T>(x[index])) * img_width / grid_size;
+  box[1] = (j + sigmoid<T>(x[index + stride])) * img_height / grid_size;
+  // Box extent.
+  box[2] = std::exp(x[index + 2 * stride]) * anchor_w * img_width / input_size;
+  box[3] =
+      std::exp(x[index + 3 * stride]) * anchor_h * img_height / input_size;
+}
+// Flat offset of channel `entry` at spatial position `hw_idx` for anchor
+// `an_idx` of image `batch`. `an_stride` is the element count of one
+// anchor's channel group and `stride` the element count of one channel.
+HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
+                                    int an_num, int an_stride, int stride,
+                                    int entry) {
+  const int anchor_base = (batch * an_num + an_idx) * an_stride;
+  const int channel_offset = entry * stride;
+  return anchor_base + channel_offset + hw_idx;
+}
+// Converts a (center x, center y, width, height) box into
+// [xmin, ymin, xmax, ymax], writes it to boxes[box_idx .. box_idx + 3] and
+// clamps it to [0, img_width - 1] x [0, img_height - 1].
+template <typename T>
+HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx,
+                                        const int img_height,
+                                        const int img_width) {
+  T* out = boxes + box_idx;
+  out[0] = box[0] - box[2] / 2;
+  out[1] = box[1] - box[3] / 2;
+  out[2] = box[0] + box[2] / 2;
+  out[3] = box[1] + box[3] / 2;
+  // Lower bound: any value that is not strictly positive becomes 0.
+  if (!(out[0] > 0)) {
+    out[0] = static_cast<T>(0);
+  }
+  if (!(out[1] > 0)) {
+    out[1] = static_cast<T>(0);
+  }
+  // Upper bound: any value not strictly below the last pixel index is
+  // replaced by that index.
+  if (!(out[2] < img_width - 1)) {
+    out[2] = static_cast<T>(img_width - 1);
+  }
+  if (!(out[3] < img_height - 1)) {
+    out[3] = static_cast<T>(img_height - 1);
+  }
+}
+// Writes class_num scores starting at scores[score_idx]; each one is conf
+// multiplied by the sigmoid of the matching class value, read from input at
+// label_idx with a spacing of `stride` elements between classes.
+template <typename T>
+HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input,
+                                      const int label_idx, const int score_idx,
+                                      const int class_num, const T conf,
+                                      const int stride) {
+  T* dst = scores + score_idx;
+  for (int cls = 0; cls < class_num; ++cls) {
+    const T cls_prob = sigmoid<T>(input[label_idx + cls * stride]);
+    dst[cls] = conf * cls_prob;
+  }
+}
+// CPU implementation of the yolo_box operator. Both outputs are zero-filled
+// first; then, for every (image, anchor, grid cell) whose confidence reaches
+// conf_thresh, the decoded box and the per-class scores are written.
+template <typename T>
+class YoloBoxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* im_size = ctx.Input<Tensor>("ImgSize");
+    auto* out_boxes = ctx.Output<Tensor>("Boxes");
+    auto* out_scores = ctx.Output<Tensor>("Scores");
+    auto anchors = ctx.Attr<std::vector<int>>("anchors");
+    int class_num = ctx.Attr<int>("class_num");
+    float conf_thresh = ctx.Attr<float>("conf_thresh");
+    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+    const int batch = x->dims()[0];
+    const int height = x->dims()[2];
+    const int width = x->dims()[3];
+    const int box_num = out_boxes->dims()[1];
+    const int an_num = anchors.size() / 2;
+    // The network input size is derived from the feature-map height only.
+    int input_size = downsample_ratio * height;
+    const int grid_num = height * width;
+    const int an_stride = (class_num + 5) * grid_num;
+    // Copy the anchors into a scratch tensor allocated on the op's place.
+    Tensor anchor_buf;
+    auto anchor_ptr =
+        anchor_buf.mutable_data<int>({an_num * 2}, ctx.GetPlace());
+    std::copy(anchors.begin(), anchors.end(), anchor_ptr);
+    const T* x_data = x->data<T>();
+    const int* im_size_data = im_size->data<int>();
+    // Skipped candidates must read as zero, so clear both outputs up front.
+    T* boxes_data =
+        out_boxes->mutable_data<T>({batch, box_num, 4}, ctx.GetPlace());
+    std::fill_n(boxes_data, out_boxes->numel(), static_cast<T>(0));
+    T* scores_data = out_scores->mutable_data<T>({batch, box_num, class_num},
+                                                 ctx.GetPlace());
+    std::fill_n(scores_data, out_scores->numel(), static_cast<T>(0));
+    T box[4];
+    for (int img = 0; img < batch; ++img) {
+      // ImgSize stores (height, width) per image.
+      const int img_height = im_size_data[2 * img];
+      const int img_width = im_size_data[2 * img + 1];
+      for (int an = 0; an < an_num; ++an) {
+        // hw walks the grid row by row: hw == row * width + col.
+        for (int hw = 0; hw < grid_num; ++hw) {
+          const int row = hw / width;
+          const int col = hw % width;
+          // Entry 4 of an anchor's channel group holds the confidence logit.
+          const int obj_idx =
+              GetEntryIndex(img, an, hw, an_num, an_stride, grid_num, 4);
+          const T conf = sigmoid<T>(x_data[obj_idx]);
+          if (conf < conf_thresh) {
+            continue;
+          }
+          // Entries 0..3 hold the raw box location values.
+          const int loc_idx =
+              GetEntryIndex(img, an, hw, an_num, an_stride, grid_num, 0);
+          GetYoloBox<T>(box, x_data, anchor_ptr, col, row, an, height,
+                        input_size, loc_idx, grid_num, img_height, img_width);
+          const int cell = img * box_num + an * grid_num + hw;
+          CalcDetectionBox<T>(boxes_data, box, cell * 4, img_height,
+                              img_width);
+          // Entries 5.. hold the per-class logits.
+          const int label_idx =
+              GetEntryIndex(img, an, hw, an_num, an_stride, grid_num, 5);
+          CalcLabelScore<T>(scores_data, x_data, label_idx, cell * class_num,
+                            class_num, conf, grid_num);
+        }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/detection/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc
@@ -10,6 +10,7 @@
   limitations under the License. */
 #include "paddle/fluid/operators/detection/yolov3_loss_op.h"
+#include <memory>
 #include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
@@ -72,6 +73,18 @@ class Yolov3LossOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_GT(class_num, 0,
                      "Attr(class_num) should be an integer greater then 0.");
+    if (ctx->HasInput("GTScore")) {
+      auto dim_gtscore = ctx->GetInputDim("GTScore");
+      PADDLE_ENFORCE_EQ(dim_gtscore.size(), 2,
+                        "Input(GTScore) should be a 2-D tensor");
+      PADDLE_ENFORCE_EQ(
+          dim_gtscore[0], dim_gtbox[0],
+          "Input(GTBox) and Input(GTScore) dim[0] should be same");
+      PADDLE_ENFORCE_EQ(
+          dim_gtscore[1], dim_gtbox[1],
+          "Input(GTBox) and Input(GTScore) dim[1] should be same");
+    }
    std::vector<int64_t> dim_out({dim_x[0]});
    ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
@@ -112,6 +125,12 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
             "This is a 2-D tensor with shape of [N, max_box_num], "
             "and each element should be an integer to indicate the "
             "box class id.");
+    AddInput("GTScore",
+             "The score of GTLabel, This is a 2-D tensor in same shape "
+             "GTLabel, and score values should in range (0, 1). This "
+             "input is for GTLabel score can be not 1.0 in image mixup "
+             "augmentation.")
+        .AsDispensable();
    AddOutput("Loss",
              "The output yolov3 loss tensor, "
              "This is a 1-D tensor with shape of [N]");
@@ -143,6 +162,9 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<float>("ignore_thresh",
                   "The ignore threshold to ignore confidence loss.")
        .SetDefault(0.7);
+    AddAttr<bool>("use_label_smooth",
+                  "Whether to use label smooth. Default True.")
+        .SetDefault(true);
    AddComment(R"DOC(
         This operator generates yolov3 loss based on given predict result and ground
         truth boxes.
@@ -204,6 +226,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
         loss = (loss_{xy} + loss_{wh}) * weight_{box}
              + loss_{conf} + loss_{class}
         $$
+         While :attr:`use_label_smooth` is set to be :attr:`True`, the classification
+         target will be smoothed when calculating classification loss, target of 
+         positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of
+         negative samples will be smoothed to :math:`1.0 / class\_num`.
+         While :attr:`GTScore` is given, which means the mixup score of ground truth 
+         boxes, all losses incurred by a ground truth box will be multiplied by its
+         mixup score.
         )DOC");
  }
 };
@@ -240,6 +271,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
    op->SetInput("X", Input("X"));
    op->SetInput("GTBox", Input("GTBox"));
    op->SetInput("GTLabel", Input("GTLabel"));
+    op->SetInput("GTScore", Input("GTScore"));
    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
    op->SetInput("ObjectnessMask", Output("ObjectnessMask"));
    op->SetInput("GTMatchMask", Output("GTMatchMask"));
@@ -249,6 +281,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetOutput(framework::GradVarName("GTBox"), {});
    op->SetOutput(framework::GradVarName("GTLabel"), {});
+    op->SetOutput(framework::GradVarName("GTScore"), {});
    return std::unique_ptr<framework::OpDesc>(op);
  }
 };

--- a/paddle/fluid/operators/detection/yolov3_loss_op.h
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.h
@@ -37,8 +37,8 @@ static T SigmoidCrossEntropy(T x, T label) {
 }
 template <typename T>
-static T L2Loss(T x, T y) {
+static T L1Loss(T x, T y) {
-  return 0.5 * (y - x) * (y - x);
+  return std::abs(y - x);
 }
 template <typename T>
@@ -47,8 +47,8 @@ static T SigmoidCrossEntropyGrad(T x, T label) {
 }
 template <typename T>
-static T L2LossGrad(T x, T y) {
+static T L1LossGrad(T x, T y) {
-  return x - y;
+  return x > y ? 1.0 : -1.0;
 }
 static int GetMaskIndex(std::vector<int> mask, int val) {
@@ -121,47 +121,49 @@ template <typename T>
 static void CalcBoxLocationLoss(T* loss, const T* input, Box<T> gt,
                                std::vector<int> anchors, int an_idx,
                                int box_idx, int gi, int gj, int grid_size,
-                                int input_size, int stride) {
+                                int input_size, int stride, T score) {
  T tx = gt.x * grid_size - gi;
  T ty = gt.y * grid_size - gj;
  T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
  T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
-  T scale = (2.0 - gt.w * gt.h);
+  T scale = (2.0 - gt.w * gt.h) * score;
  loss[0] += SigmoidCrossEntropy<T>(input[box_idx], tx) * scale;
  loss[0] += SigmoidCrossEntropy<T>(input[box_idx + stride], ty) * scale;
-  loss[0] += L2Loss<T>(input[box_idx + 2 * stride], tw) * scale;
+  loss[0] += L1Loss<T>(input[box_idx + 2 * stride], tw) * scale;
-  loss[0] += L2Loss<T>(input[box_idx + 3 * stride], th) * scale;
+  loss[0] += L1Loss<T>(input[box_idx + 3 * stride], th) * scale;
 }
 template <typename T>
 static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input,
                                    Box<T> gt, std::vector<int> anchors,
                                    int an_idx, int box_idx, int gi, int gj,
-                                    int grid_size, int input_size, int stride) {
+                                    int grid_size, int input_size, int stride,
+                                    T score) {
  T tx = gt.x * grid_size - gi;
  T ty = gt.y * grid_size - gj;
  T tw = std::log(gt.w * input_size / anchors[2 * an_idx]);
  T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]);
-  T scale = (2.0 - gt.w * gt.h);
+  T scale = (2.0 - gt.w * gt.h) * score;
  input_grad[box_idx] =
      SigmoidCrossEntropyGrad<T>(input[box_idx], tx) * scale * loss;
  input_grad[box_idx + stride] =
      SigmoidCrossEntropyGrad<T>(input[box_idx + stride], ty) * scale * loss;
  input_grad[box_idx + 2 * stride] =
-      L2LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
+      L1LossGrad<T>(input[box_idx + 2 * stride], tw) * scale * loss;
  input_grad[box_idx + 3 * stride] =
-      L2LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
+      L1LossGrad<T>(input[box_idx + 3 * stride], th) * scale * loss;
 }
 template <typename T>
 static inline void CalcLabelLoss(T* loss, const T* input, const int index,
                                 const int label, const int class_num,
-                                 const int stride) {
+                                 const int stride, const T pos, const T neg,
+                                 T score) {
  for (int i = 0; i < class_num; i++) {
    T pred = input[index + i * stride];
-    loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? 1.0 : 0.0);
+    loss[0] += SigmoidCrossEntropy<T>(pred, (i == label) ? pos : neg) * score;
  }
 }
@@ -169,11 +171,13 @@ template <typename T>
 static inline void CalcLabelLossGrad(T* input_grad, const T loss,
                                     const T* input, const int index,
                                     const int label, const int class_num,
-                                     const int stride) {
+                                     const int stride, const T pos, const T neg,
+                                     T score) {
  for (int i = 0; i < class_num; i++) {
    T pred = input[index + i * stride];
    input_grad[index + i * stride] =
-        SigmoidCrossEntropyGrad<T>(pred, (i == label) ? 1.0 : 0.0) * loss;
+        SigmoidCrossEntropyGrad<T>(pred, (i == label) ? pos : neg) * score *
+        loss;
  }
 }
@@ -188,8 +192,8 @@ static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness,
        for (int l = 0; l < w; l++) {
          T obj = objness[k * w + l];
          if (obj > 1e-5) {
-            // positive sample: obj = 1
+            // positive sample: obj = mixup score
-            loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0);
+            loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 1.0) * obj;
          } else if (obj > -0.5) {
            // negetive sample: obj = 0
            loss[i] += SigmoidCrossEntropy<T>(input[k * w + l], 0.0);
@@ -215,7 +219,8 @@ static inline void CalcObjnessLossGrad(T* input_grad, const T* loss,
          T obj = objness[k * w + l];
          if (obj > 1e-5) {
            input_grad[k * w + l] =
-                SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * loss[i];
+                SigmoidCrossEntropyGrad<T>(input[k * w + l], 1.0) * obj *
+                loss[i];
          } else if (obj > -0.5) {
            input_grad[k * w + l] =
                SigmoidCrossEntropyGrad<T>(input[k * w + l], 0.0) * loss[i];
@@ -252,6 +257,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
    auto* input = ctx.Input<Tensor>("X");
    auto* gt_box = ctx.Input<Tensor>("GTBox");
    auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto* gt_score = ctx.Input<Tensor>("GTScore");
    auto* loss = ctx.Output<Tensor>("Loss");
    auto* objness_mask = ctx.Output<Tensor>("ObjectnessMask");
    auto* gt_match_mask = ctx.Output<Tensor>("GTMatchMask");
@@ -260,6 +266,7 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
    int class_num = ctx.Attr<int>("class_num");
    float ignore_thresh = ctx.Attr<float>("ignore_thresh");
    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+    bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");
    const int n = input->dims()[0];
    const int h = input->dims()[2];
@@ -272,6 +279,13 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
    const int stride = h * w;
    const int an_stride = (class_num + 5) * stride;
+    T label_pos = 1.0;
+    T label_neg = 0.0;
+    if (use_label_smooth) {
+      label_pos = 1.0 - 1.0 / static_cast<T>(class_num);
+      label_neg = 1.0 / static_cast<T>(class_num);
+    }
    const T* input_data = input->data<T>();
    const T* gt_box_data = gt_box->data<T>();
    const int* gt_label_data = gt_label->data<int>();
@@ -283,6 +297,19 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
    int* gt_match_mask_data =
        gt_match_mask->mutable_data<int>({n, b}, ctx.GetPlace());
+    // GTScore is optional: when it is not fed, a {n, b} tensor filled with 1.0
+    // is used instead. The fallback tensor is declared at function scope so it
+    // outlives the `if` block; gt_score_data is read later in this function.
+    Tensor gtscore;
+    const T* gt_score_data;
+    if (!gt_score) {
+      gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
+      math::SetConstant<platform::CPUDeviceContext, T>()(
+          ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
+          static_cast<T>(1.0));
+      gt_score = &gtscore;
+      gt_score_data = gtscore.data<T>();
+    } else {
+      gt_score_data = gt_score->data<T>();
+    }
    // calc valid gt box mask, avoid calc duplicately in following code
    Tensor gt_valid_mask;
    bool* gt_valid_mask_data =
@@ -355,19 +382,20 @@ class Yolov3LossKernel : public framework::OpKernel<T> {
        int mask_idx = GetMaskIndex(anchor_mask, best_n);
        gt_match_mask_data[i * b + t] = mask_idx;
        if (mask_idx >= 0) {
+          T score = gt_score_data[i * b + t];
          int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                      an_stride, stride, 0);
          CalcBoxLocationLoss<T>(loss_data + i, input_data, gt, anchors, best_n,
-                                 box_idx, gi, gj, h, input_size, stride);
+                                 box_idx, gi, gj, h, input_size, stride, score);
          int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi;
-          obj_mask_data[obj_idx] = 1.0;
+          obj_mask_data[obj_idx] = score;
          int label = gt_label_data[i * b + t];
          int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                        an_stride, stride, 5);
          CalcLabelLoss<T>(loss_data + i, input_data, label_idx, label,
-                           class_num, stride);
+                           class_num, stride, label_pos, label_neg, score);
        }
      }
    }
@@ -384,6 +412,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
    auto* input = ctx.Input<Tensor>("X");
    auto* gt_box = ctx.Input<Tensor>("GTBox");
    auto* gt_label = ctx.Input<Tensor>("GTLabel");
+    auto* gt_score = ctx.Input<Tensor>("GTScore");
    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
    auto* objness_mask = ctx.Input<Tensor>("ObjectnessMask");
@@ -392,6 +421,7 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
    auto anchor_mask = ctx.Attr<std::vector<int>>("anchor_mask");
    int class_num = ctx.Attr<int>("class_num");
    int downsample_ratio = ctx.Attr<int>("downsample_ratio");
+    bool use_label_smooth = ctx.Attr<bool>("use_label_smooth");
    const int n = input_grad->dims()[0];
    const int c = input_grad->dims()[1];
@@ -404,6 +434,13 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
    const int stride = h * w;
    const int an_stride = (class_num + 5) * stride;
+    T label_pos = 1.0;
+    T label_neg = 0.0;
+    if (use_label_smooth) {
+      label_pos = 1.0 - 1.0 / static_cast<T>(class_num);
+      label_neg = 1.0 / static_cast<T>(class_num);
+    }
    const T* input_data = input->data<T>();
    const T* gt_box_data = gt_box->data<T>();
    const int* gt_label_data = gt_label->data<int>();
@@ -414,25 +451,41 @@ class Yolov3LossGradKernel : public framework::OpKernel<T> {
        input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
    memset(input_grad_data, 0, input_grad->numel() * sizeof(T));
+    // GTScore is optional: when it is not fed, a {n, b} tensor filled with 1.0
+    // is used instead. The fallback tensor is declared at function scope so it
+    // outlives the `if` block; gt_score_data is read in the gradient loop.
+    Tensor gtscore;
+    const T* gt_score_data;
+    if (!gt_score) {
+      gtscore.mutable_data<T>({n, b}, ctx.GetPlace());
+      math::SetConstant<platform::CPUDeviceContext, T>()(
+          ctx.template device_context<platform::CPUDeviceContext>(), &gtscore,
+          static_cast<T>(1.0));
+      gt_score = &gtscore;
+      gt_score_data = gtscore.data<T>();
+    } else {
+      gt_score_data = gt_score->data<T>();
+    }
    for (int i = 0; i < n; i++) {
      for (int t = 0; t < b; t++) {
        int mask_idx = gt_match_mask_data[i * b + t];
        if (mask_idx >= 0) {
+          T score = gt_score_data[i * b + t];
          Box<T> gt = GetGtBox(gt_box_data, i, b, t);
          int gi = static_cast<int>(gt.x * w);
          int gj = static_cast<int>(gt.y * h);
          int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                      an_stride, stride, 0);
-          CalcBoxLocationLossGrad<T>(
+          CalcBoxLocationLossGrad<T>(input_grad_data, loss_grad_data[i],
-              input_grad_data, loss_grad_data[i], input_data, gt, anchors,
+                                     input_data, gt, anchors,
-              anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride);
+                                     anchor_mask[mask_idx], box_idx, gi, gj, h,
+                                     input_size, stride, score);
          int label = gt_label_data[i * b + t];
          int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num,
                                        an_stride, stride, 5);
          CalcLabelLossGrad<T>(input_grad_data, loss_grad_data[i], input_data,
-                               label_idx, label, class_num, stride);
+                               label_idx, label, class_num, stride, label_pos,
+                               label_neg, score);
        }
      }
    }

--- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fake_init_op.cc
@@ -56,8 +56,7 @@ class FakeInitOp : public framework::OperatorBase {
 class FakeInitOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
-                  framework::BlockDesc *block) const override {}
 };
 class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker {

--- a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
+++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc
@@ -114,11 +114,10 @@ class MergeIdsOp : public framework::OperatorWithKernel {
 class MergeIdsOpInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    auto input_type = ctx->GetType(ctx->Input("Ids")[0]);
-    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
+    for (auto &out_var : ctx->Output("Out")) {
-    for (auto &out_var : op_desc.Output("Out")) {
+      ctx->SetType(out_var, input_type);
-      block->Var(out_var)->SetType(input_var->GetType());
    }
  }
 };

--- a/paddle/fluid/operators/distributed_ops/split_ids_op.cc
+++ b/paddle/fluid/operators/distributed_ops/split_ids_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed_ops/split_ids_op.h"
+#include <memory>
 namespace paddle {
 namespace operators {
@@ -71,11 +73,10 @@ class SplitIdsOp : public framework::OperatorWithKernel {
 class SplitIdsOpInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    auto input_type = ctx->GetType(ctx->Input("Ids")[0]);
-    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
+    for (auto &out_var : ctx->Output("Out")) {
-    for (auto &out_var : op_desc.Output("Out")) {
+      ctx->SetType(out_var, input_type);
-      block->Var(out_var)->SetType(input_var->GetType());
    }
  }
 };

--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -81,6 +81,30 @@ struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
 template struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, float>;
+template <typename T>
+struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const framework::Tensor& in_accum,
+                  const framework::Tensor& in_state, const T* cur_scale,
+                  const float rate, framework::Tensor* out_state,
+                  framework::Tensor* out_accum, framework::Tensor* out_scale) {
+    T accum = in_accum.data<T>()[0];
+    T state = in_state.data<T>()[0];
+    T scale = cur_scale[0];
+    state = rate * state + 1;
+    accum = rate * accum + scale;
+    scale = accum / state;
+    out_state->mutable_data<T>(ctx.GetPlace())[0] = state;
+    out_accum->mutable_data<T>(ctx.GetPlace())[0] = accum;
+    out_scale->mutable_data<T>(ctx.GetPlace())[0] = scale;
+  }
+};
+template struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext,
+                                               float>;
 class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
 public:
  FakeQuantizeAbsMaxOp(const std::string& type,
@@ -255,6 +279,78 @@ $$Out = round(X/scale * range)$$
  }
 };
+class FakeQuantizeMovingAverageAbsMaxOp : public framework::OperatorWithKernel {
+ public:
+  FakeQuantizeMovingAverageAbsMaxOp(const std::string& type,
+                                    const framework::VariableNameMap& inputs,
+                                    const framework::VariableNameMap& outputs,
+                                    const framework::AttributeMap& attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("X"),
+        "Input(X) of FakeQuantizeMovingAverageAbsMaxOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FakeQuantizeMovingAverageAbsMaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutScale"),
+                   "Output(OutScale) of FakeQuantizeMovingAverageAbsMaxOp "
+                   "should not be null");
+    if (ctx->HasOutput("OutState")) {
+      ctx->SetOutputDim("OutState", {1});
+    }
+    if (ctx->HasOutput("OutAccum")) {
+      ctx->SetOutputDim("OutAccum", {1});
+    }
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("OutScale", {1});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
+  }
+};
+class FakeQuantizeMovingAverageAbsMaxOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input is float data type.");
+    AddInput("InScale", "Last scale.");
+    AddInput("InAccum", "Last accum.").AsDispensable();
+    AddInput("InState", "Last state.").AsDispensable();
+    AddOutput("Out", "(Tensor) Output of quantized low level tensor.");
+    AddOutput("OutScale", " Current scale");
+    AddOutput("OutState", "(Tensor) state buffer.").AsDispensable();
+    AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable();
+    AddAttr<float>("moving_rate", "(float, default 0.9) moving rate.")
+        .SetDefault(0.9);
+    AddAttr<int>("bit_length", "(int, default 8), quantization bit number.")
+        .SetDefault(8)
+        .AddCustomChecker([](const int& bit_length) {
+          PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16,
+                         "'bit_length' should be between 1 and 16.");
+        });
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+FakeQuantize operator is used in static quantization.
+$$scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)$$
+$$range = 2^{bit_length - 1} - 1$$
+$$Out = round(X/scale * range)$$
+)DOC");
+  }
+};
 }  // namespace operators
 }  // namespace paddle
@@ -273,6 +369,12 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
 REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max,
                       ops::FakeQuantizeRangeAbsMaxKernel<CPU, float>);
+REGISTER_OPERATOR(fake_quantize_moving_average_abs_max,
+                  ops::FakeQuantizeMovingAverageAbsMaxOp,
+                  ops::FakeQuantizeMovingAverageAbsMaxOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max,
+                       ops::FakeQuantizeMovingAverageAbsMaxKernel<CPU, float>);
 REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max,
                  ops::FakeChannelWiseQuantizeAbsMaxOp,
                  ops::FakeChannelWiseQuantizeAbsMaxOpMaker,

--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -147,6 +147,41 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
 template struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, float>;
+template <typename T>
+struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  const framework::Tensor& in_accum,
+                  const framework::Tensor& in_state, const T* cur_scale,
+                  const float rate, framework::Tensor* out_state,
+                  framework::Tensor* out_accum, framework::Tensor* out_scale) {
+    const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    T accum;
+    memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data<T>(),
+                 sizeof(T), 0);
+    T state;
+    memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data<T>(),
+                 sizeof(T), 0);
+    T scale;
+    memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T),
+                 0);
+    state = rate * state + 1;
+    accum = rate * accum + scale;
+    scale = accum / state;
+    memory::Copy(gpu_place, out_accum->mutable_data<T>(gpu_place),
+                 platform::CPUPlace(), &accum, sizeof(T), 0);
+    memory::Copy(gpu_place, out_state->mutable_data<T>(gpu_place),
+                 platform::CPUPlace(), &state, sizeof(T), 0);
+    memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place),
+                 platform::CPUPlace(), &scale, sizeof(T), 0);
+  }
+};
+template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
+                                               float>;
 template <typename T>
 struct ClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
  void operator()(const platform::CUDADeviceContext& ctx,
@@ -178,3 +213,6 @@ REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max,
                        ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>);
 REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
                        ops::FakeQuantizeRangeAbsMaxKernel<CUDA, float>);
+REGISTER_OP_CUDA_KERNEL(
+    fake_quantize_moving_average_abs_max,
+    ops::FakeQuantizeMovingAverageAbsMaxKernel<CUDA, float>);
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -42,12 +42,20 @@ struct FindRangeAbsMaxFunctor {
                  framework::Tensor* scales_arr, framework::Tensor* out_scale);
 };
+// Primary template for the moving-average abs-max scale update. It is only
+// declared here; the CPU and CUDA specializations provide the body. The
+// signature matches those specializations and the kernel call site:
+// cur_scale points to the abs-max of the current input, rate is the
+// moving rate applied to the previous state and accum values.
+template <typename DeviceContext, typename T>
+struct FindMovingAverageAbsMaxFunctor {
+  void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
+                  const framework::Tensor& in_state, const T* cur_scale,
+                  const float rate, framework::Tensor* out_state,
+                  framework::Tensor* out_accum, framework::Tensor* out_scale);
+};
 template <typename DeviceContext, typename T>
 class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* in = context.Input<framework::Tensor>("X");
    auto* out = context.Output<framework::Tensor>("Out");
    auto* out_scale = context.Output<framework::Tensor>("OutScale");
    T* out_s = out_scale->mutable_data<T>(context.GetPlace());
@@ -138,5 +146,54 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
  }
 };
+template <typename DeviceContext, typename T>
+class FakeQuantizeMovingAverageAbsMaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
+    auto* in_scale = context.Input<framework::Tensor>("InScale");
+    auto* out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+    bool is_test = context.Attr<bool>("is_test");
+    int bit_length = context.Attr<int>("bit_length");
+    int bin_cnt = std::pow(2, bit_length - 1) - 1;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    // testing
+    if (is_test) {
+      ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *in_scale,
+                                                  bin_cnt, out);
+      return;
+    }
+    // training
+    auto* in_accum = context.Input<framework::Tensor>("InAccum");
+    auto* in_state = context.Input<framework::Tensor>("InState");
+    auto& allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx);
+    auto cur_scale = allocator.Allocate(1 * sizeof(T));
+    T* cur_scale_data = static_cast<T*>(cur_scale->ptr());
+    FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in->data<T>(), in->numel(),
+                                          cur_scale_data);
+    auto* out_state = context.Output<framework::Tensor>("OutState");
+    auto* out_accum = context.Output<framework::Tensor>("OutAccum");
+    auto* out_scale = context.Output<framework::Tensor>("OutScale");
+    out_state->mutable_data<T>(context.GetPlace());
+    out_accum->mutable_data<T>(context.GetPlace());
+    out_scale->mutable_data<T>(context.GetPlace());
+    float moving_rate = context.Attr<float>("moving_rate");
+    FindMovingAverageAbsMaxFunctor<DeviceContext, T>()(
+        dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state,
+        out_accum, out_scale);
+    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
+                                                bin_cnt, out);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -55,17 +55,8 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
      "The input tensor Input's rank of FCOp should be larger than "
      "in_num_col_dims.");
-  auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
-  PADDLE_ENFORCE_EQ(
-      in_mat_dims[1], w_dims[0],
-      "Fully Connected input and weigth size do not match. %s, %s");
  std::vector<int64_t> output_dims;
-  output_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
+  FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims);
-  for (int i = 0; i < in_num_col_dims; ++i) {
-    output_dims.push_back(in_dims[i]);
-  }
-  output_dims.push_back(w_dims[1]);
  ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
  ctx->ShareLoD("Input", "Out");
@@ -128,6 +119,9 @@ void FCOpMaker::Make() {
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
+  AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                "Skip calling InferShape() function in the runtime.")
+      .SetDefault(true);
  AddComment(R"DOC(
  Fully Connected Operator.
@@ -142,13 +136,20 @@ class FCOpKernel : public framework::OpKernel<T> {
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");
-    auto input = ctx.Input<Tensor>("Input");
+    auto input = ctx.Input<framework::LoDTensor>("Input");
    auto w = ctx.Input<Tensor>("W");
    auto bias = ctx.Input<Tensor>("Bias");
-    auto output = ctx.Output<Tensor>("Out");
+    auto output = ctx.Output<framework::LoDTensor>("Out");
+    int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
    auto w_dims = w->dims();
+    std::vector<int64_t> output_dims;
+    FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims);
+    output->Resize(framework::make_ddim(output_dims));
+    output->set_lod(input->lod());
    auto out_dims = output->dims();
-    int M = framework::product(out_dims) / out_dims[out_dims.size() - 1];
+    int M = framework::product(out_dims) / w_dims[1];
    const T* input_data = input->data<T>();
    const T* w_data = w->data<T>();

--- a/paddle/fluid/operators/fc_op.h
+++ b/paddle/fluid/operators/fc_op.h
@@ -48,5 +48,21 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override;
 };
+inline void FCOutputSize(const framework::DDim& in_dims,
+                         const framework::DDim& w_dims,
+                         std::vector<int64_t>& out_dims,  // NOLINT
+                         int in_num_col_dims) {
+  auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
+  PADDLE_ENFORCE_EQ(
+      in_mat_dims[1], w_dims[0],
+      "Fully Connected input and weigth size do not match. %s, %s");
+  out_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
+  for (int i = 0; i < in_num_col_dims; ++i) {
+    out_dims.push_back(in_dims[i]);
+  }
+  out_dims.push_back(w_dims[1]);
+}
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -39,12 +39,11 @@ class FillConstantOp : public framework::OperatorWithKernel {
 class FillConstantOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override {
-                  framework::BlockDesc* block) const override {
    auto data_type = static_cast<framework::proto::VarType::Type>(
-        boost::get<int>(op_desc.GetAttr("dtype")));
+        boost::get<int>(ctx->GetAttr("dtype")));
-    auto& out_var_name = op_desc.Output("Out").front();
+    auto& out_var_name = ctx->Output("Out").front();
-    block->Var(out_var_name)->SetDataType(data_type);
+    ctx->SetDataType(out_var_name, data_type);
  }
 };

--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc
@@ -88,7 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(boolean, default false) "
                  "Sparse update.")
        .SetDefault(false);
-    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                  "Skip calling InferShape() function in the runtime.")
        .SetDefault(true);
    AddComment(R"DOC(
 FusedEmbeddingSeqPool Operator.
@@ -137,22 +138,20 @@ class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel {
 class FusedEmbeddingSeqPoolOpGradVarTypeInference
    : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override {
-                  framework::BlockDesc* block) const override {
+    auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
-    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto attr = ctx->GetAttr("is_sparse");
-    auto attr = op_desc.GetAttr("is_sparse");
    bool is_sparse = boost::get<bool>(attr);
    if (is_sparse) {
      VLOG(3) << "fused_embedding_seq_pool_grad op "
              << framework::GradVarName("W") << " is set to SelectedRows";
-      block->Var(out_var_name)
+      ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
    } else {
      VLOG(3) << "fused_embedding_seq_pool_grad op "
              << framework::GradVarName("W") << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
    }
-    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
+    ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
  }
 };

--- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
+++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc
@@ -81,15 +81,13 @@ GetTensorFromSelectedRows is used to get the tensor from SelectedRows.
 class GetTensorFromSelectedRowsOpVarTypeInference
    : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    // Out is always a LoDTensor that carries the data type of X.
-                  framework::BlockDesc *block) const final {
+    auto out_var_name = ctx->Output("Out").front();
-    auto out_var_name = op_desc.Output("Out").front();
+    auto in_var_name = ctx->Input("X").front();
-    auto in_var_name = op_desc.Input("X").front();
+    ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
-    auto out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name));
-    auto in_var = block->FindRecursiveOrCreateVar(in_var_name);
-    out_var.SetType(framework::proto::VarType::LOD_TENSOR);
-    out_var.SetDataType(in_var.GetDataType());
  }
 };

--- a/paddle/fluid/operators/hash_op.cc
+++ b/paddle/fluid/operators/hash_op.cc
@@ -54,7 +54,8 @@ $$Out = scale * X$$
 )DOC");
    AddAttr<int>("num_hash", "").SetDefault(1);
    AddAttr<int>("mod_by", "").SetDefault(100000);
-    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                  "Skip calling InferShape() function in the runtime.")
        .SetDefault(true);
  }
 };

--- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc
@@ -197,38 +197,32 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
 class HierarchicalSigmoidGradOpGradVarTypeInference
    : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override {
-                  framework::BlockDesc* block) const override {
+    auto w_grad_var_name = ctx->Output(framework::GradVarName("W")).front();
-    auto w_grad_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto bias_grad_var_name_vec = ctx->Output(framework::GradVarName("Bias"));
-    auto bias_grad_var_name_vec =
-        op_desc.Output(framework::GradVarName("Bias"));
    std::string bias_grad_var_name;
    bool hasBias = false;
    if (bias_grad_var_name_vec.size()) {
      hasBias = true;
-      bias_grad_var_name =
+      bias_grad_var_name = ctx->Output(framework::GradVarName("Bias")).front();
-          op_desc.Output(framework::GradVarName("Bias")).front();
    }
-    auto attr = op_desc.GetAttr("is_sparse");
+    auto attr = ctx->GetAttr("is_sparse");
    bool is_sparse = boost::get<bool>(attr);
    if (is_sparse) {
      VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
               << " is set to SelectedRows";
-      block->Var(w_grad_var_name)
+      ctx->SetType(w_grad_var_name, framework::proto::VarType::SELECTED_ROWS);
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
    } else {
      VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
               << " is set to LoDTensor";
-      block->Var(w_grad_var_name)
+      ctx->SetType(w_grad_var_name, framework::proto::VarType::LOD_TENSOR);
-          ->SetType(framework::proto::VarType::LOD_TENSOR);
    }
    if (hasBias) {
      VLOG(30) << "hierarchical_sigmoid_grad op "
               << framework::GradVarName("Bias") << " is set to LoDTensor";
-      block->Var(bias_grad_var_name)
+      ctx->SetType(bias_grad_var_name, framework::proto::VarType::LOD_TENSOR);
-          ->SetType(framework::proto::VarType::LOD_TENSOR);
    }
-    block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType());
+    ctx->SetDataType(w_grad_var_name, ctx->GetDataType(ctx->Input("W")[0]));
  }
 };

--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
@@ -64,11 +64,9 @@ class LoDRankTableInferShape : public framework::InferShapeBase {
 class LoDRankTableInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    for (auto &o : ctx->Output("Out")) {
-    for (auto &o : op_desc.Output("Out")) {
+      ctx->SetType(o, framework::proto::VarType::LOD_RANK_TABLE);
-      block->FindRecursiveOrCreateVar(o).SetType(
-          framework::proto::VarType::LOD_RANK_TABLE);
    }
  }
 };

--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -201,10 +201,9 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase {
 class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    for (auto &out_var : ctx->Output("Out")) {
-    for (auto &out_var : op_desc.Output("Out")) {
+      ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY);
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
    }
  }
 };

--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -147,22 +147,20 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
 class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override {
-                  framework::BlockDesc* block) const override {
+    auto out_var_name = ctx->Output(framework::GradVarName("W")).front();
-    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto attr = ctx->GetAttr("is_sparse");
-    auto attr = op_desc.GetAttr("is_sparse");
    bool is_sparse = boost::get<bool>(attr);
    if (is_sparse) {
      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
              << " is set to SelectedRows";
-      block->Var(out_var_name)
+      ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS);
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
    } else {
      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
              << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
    }
-    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
+    ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0]));
  }
 };

--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -592,6 +592,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
          platform::SetDstMemoryHandler<uint8_t>(ctx, output, handler,
                                                 &dst_memory_p);
        } else {
+          need_s8_to_u8 = fuse_relu;
          platform::SetDstMemoryHandler<int8_t>(ctx, output, handler,
                                                &dst_memory_p);
        }

--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -123,7 +123,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();
-    auto input = ctx.Input<Tensor>("Input");
+    auto input = ctx.Input<framework::LoDTensor>("Input");
    auto w = ctx.Input<Tensor>("W");
    auto bias = ctx.Input<Tensor>("Bias");
@@ -151,7 +151,13 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    const T* input_data = input->data<T>();
    const T* w_data = w->data<T>();
-    auto output = ctx.Output<Tensor>("Out");
+    auto output = ctx.Output<framework::LoDTensor>("Out");
+    int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
+    std::vector<int64_t> output_dims;
+    FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims);
+    output->Resize(framework::make_ddim(output_dims));
+    output->set_lod(input->lod());
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
    auto dst_memory = mem.dst(output_data);
@@ -204,19 +210,21 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* w_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
+    const Tensor* input = ctx.Input<Tensor>("Input");
+    const T* input_data = input->data<T>();
+    const Tensor* w = ctx.Input<Tensor>("W");
+    const T* w_data = w->data<T>();
    if (input_grad) {
+      input_grad->Resize(input->dims());
      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
    }
    if (w_grad) {
+      w_grad->Resize(w->dims());
      w_grad_data = w_grad->mutable_data<T>(ctx.GetPlace());
    }
-    const Tensor* input = ctx.Input<Tensor>("Input");
-    const T* input_data = input->data<T>();
-    const Tensor* w = ctx.Input<Tensor>("W");
-    const T* w_data = w->data<T>();
    const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
    const T* out_grad_data = out_grad->data<T>();

--- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc
@@ -73,6 +73,29 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  }
 };
+template <typename T>
+class TransposeINT8MKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    std::vector<int> axis_int8 = {0, 2, 3, 1};
+    if (axis.size() != 1) {
+      PADDLE_ENFORCE_EQ(axis.size(), axis_int8.size());
+      for (size_t i = 0; i < axis.size(); i++) {
+        PADDLE_ENFORCE_EQ(axis[i], axis_int8[i],
+                          "Current INT8 MKLDNN Transpose kernel only surpport "
+                          "axis with [0, 2, 3, 1] due to MKL-DNN kernel "
+                          "implementation.");
+      }
+    }
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    output->ShareDataWith(*input);
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(input->format());
+  }
+};
 template <typename T>
 class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 public:
@@ -140,7 +163,10 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
 namespace ops = paddle::operators;
 REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::TransposeMKLDNNOpKernel<float>);
+                   ops::TransposeMKLDNNOpKernel<float>,
+                   ops::TransposeINT8MKLDNNOpKernel<uint8_t>,
+                   ops::TransposeINT8MKLDNNOpKernel<int8_t>);
 REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::TransposeMKLDNNOpKernel<float>);

--- a/paddle/fluid/operators/nccl/nccl_op.cc
+++ b/paddle/fluid/operators/nccl/nccl_op.cc
@@ -60,12 +60,9 @@ class NCCLInitOp : public framework::OperatorBase {
 class NCCLInitOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    auto out_var_name = ctx->Output("Communicator").front();
-    auto out_var_name = op_desc.Output("Communicator").front();
+    ctx->SetType(out_var_name, framework::proto::VarType::RAW);
-    auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
  }
 };

--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -237,23 +237,21 @@ class NCEOpGrad : public framework::OperatorWithKernel {
 class NCEOpGradVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    auto weight_grad = ctx->Output(framework::GradVarName("Weight")).front();
-    auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front();
-    auto attr = op_desc.GetAttr("is_sparse");
+    auto attr = ctx->GetAttr("is_sparse");
    bool is_sparse = boost::get<bool>(attr);
    if (is_sparse) {
      VLOG(3) << "nce_op_grad op " << weight_grad << " and "
              << " is set to SelectedRows";
-      block->Var(weight_grad)
+      ctx->SetType(weight_grad, framework::proto::VarType::SELECTED_ROWS);
-          ->SetType(framework::proto::VarType::SELECTED_ROWS);
    } else {
      VLOG(3) << "nce_op_grad op " << weight_grad << " and "
              << " is set to LoDTensor";
-      block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(weight_grad, framework::proto::VarType::LOD_TENSOR);
    }
-    block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType());
+    ctx->SetDataType(weight_grad, ctx->GetDataType(ctx->Input("Input")[0]));
  }
 };

--- a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc
@@ -37,8 +37,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
 class NgraphEngineInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
-                  framework::BlockDesc *block) const override {}
 };
 }  // namespace operators

--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <math.h>  // for sqrt in CPU and CUDA
 #include <Eigen/Dense>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
@@ -311,17 +312,17 @@ struct SparseAdamFunctor<T, CPUAdam> {
    T beta1_pow = *beta1_pow_;
    T beta2_pow = *beta2_pow_;
    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
-    size_t row_count = numel / row_numel_;
+    int64_t row_count = static_cast<int64_t>(numel / row_numel_);
-    for (size_t i = 0U, j = 0U; i != row_count; ++i) {
+    for (int64_t i = 0, j = 0; i != row_count; ++i) {
      if (i == *(rows_ + j)) {
-        for (size_t k = 0U; k != row_numel_; ++k) {
+        for (int64_t k = 0; k != row_numel_; ++k) {
          T g = grad_[j * row_numel_ + k];
          adam_update(i * row_numel_ + k, g);
        }
        ++j;
      } else {
-        for (size_t k = 0U; k != row_numel_; ++k) {
+        for (int64_t k = 0; k != row_numel_; ++k) {
          T mom1 = moment1_[i * row_numel_ + k];
          T mom2 = moment2_[i * row_numel_ + k];
          T p = param_[i * row_numel_ + k];
@@ -427,43 +428,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
        }
      }
-      framework::SelectedRows cpu_grad_merge;
+      framework::SelectedRows tmp_grad_merge;
      const framework::SelectedRows* grad_merge_ptr;
      if (is_strict_sorted) {
        grad_merge_ptr = &grad;
      } else {
        // merge duplicated rows if any.
        // The rows of grad_merge have been sorted inside MergeAdd functor
-        framework::SelectedRows* grad_merge_var;
        scatter::MergeAdd<DeviceContext, T> merge_func;
-        if (platform::is_cpu_place(ctx.GetPlace())) {
-          grad_merge_var = &cpu_grad_merge;
-        } else {
-          // FIXME(qiao): GPU also need to fix this
-          grad_merge_var = const_cast<framework::Scope&>(ctx.scope())
-                               .Var()
-                               ->GetMutable<framework::SelectedRows>();
-        }
        merge_func(ctx.template device_context<DeviceContext>(), grad,
-                   grad_merge_var, true);
+                   &tmp_grad_merge, true);
-        grad_merge_ptr = grad_merge_var;
+        grad_merge_ptr = &tmp_grad_merge;
      }
      auto& grad_merge = *grad_merge_ptr;
      auto& grad_tensor = grad_merge.value();
      const T* grad_data = grad_tensor.template data<T>();
-      const int64_t* rows = nullptr;
+      const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace());
-// When compiled without CUDA, the CUDAData() interface should not be
-// provided.
-#if defined(PADDLE_WITH_CUDA)
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = grad_merge.rows().CUDAData(ctx.GetPlace());
-      } else {
-#endif
-        rows = grad_merge.rows().data();
-#if defined(PADDLE_WITH_CUDA)
-      }
-#endif
      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
      if (platform::is_cpu_place(ctx.GetPlace())) {
@@ -488,7 +469,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
          }
        }
 #ifndef _WIN32
-        else if (FLAGS_inner_op_parallelism > 1 &&
+        else if (FLAGS_inner_op_parallelism > 1 &&  // NOLINT
                 min_row_size_to_use_multithread > 0 &&
                 param.dims()[0] > min_row_size_to_use_multithread) {
          VLOG(3) << "use multi thread, inner_op_parallelism="
@@ -516,11 +497,11 @@ class AdamOpKernel : public framework::OpKernel<T> {
          for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
            int64_t start = i * line_in_each_thread;
            int64_t end = (i + 1) * line_in_each_thread;
-            if (start >= param_row_count) {
+            if (start >= static_cast<int64_t>(param_row_count)) {
              break;
            }
-            if (end > param_row_count) {
+            if (end > static_cast<int64_t>(param_row_count)) {
-              end = param_row_count;
+              end = static_cast<int64_t>(param_row_count);
            }
            fs.push_back(
                framework::Async([&functor, &row_id_to_grad_row_offset,
@@ -545,8 +526,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
          }
          for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
        }
-#endif  // !_WIN32
+#endif          // !_WIN32
-        else {
+        else {  // NOLINT
          functor(param.numel());
        }
      } else if (platform::is_gpu_place(ctx.GetPlace())) {

--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc
@@ -56,9 +56,9 @@ This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each
 weight using a local learning rate:
 $$
-local\_lr = \eta  * 
+local\_lr = \eta  *
    \frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\
-velocity = mu * velocity + 
+velocity = mu * velocity +
    local\_lr * (grad + \beta * param) \\
 param = param - velocity. \\
 $$
@@ -72,8 +72,7 @@ use L2 regularizers in case of using LARS.
 class LarsMomentumOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override {}
-                  framework::BlockDesc *block) const override {}
 };
 }  // namespace operators
 }  // namespace paddle

--- a/paddle/fluid/operators/optimizers/momentum_op.cc
+++ b/paddle/fluid/operators/optimizers/momentum_op.cc
@@ -21,18 +21,14 @@ using Tensor = framework::Tensor;
 class MomentumOpInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override {
-                  framework::BlockDesc* block) const override {
+    auto& input_var = ctx->Input("Param")[0];
-    auto input_var = op_desc.Input("Param")[0];
+    for (auto& out_var : ctx->Output("ParamOut")) {
-    for (auto& out_var : op_desc.Output("ParamOut")) {
+      if (ctx->GetType(input_var) == framework::proto::VarType::SELECTED_ROWS) {
-      if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
+        ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS);
-          framework::proto::VarType::SELECTED_ROWS) {
+      } else if (ctx->GetType(input_var) ==
-        block->FindRecursiveOrCreateVar(out_var).SetType(
-            framework::proto::VarType::SELECTED_ROWS);
-      } else if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
                 framework::proto::VarType::LOD_TENSOR) {
-        block->FindRecursiveOrCreateVar(out_var).SetType(
+        ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR);
-            framework::proto::VarType::LOD_TENSOR);
      } else {
        PADDLE_THROW(
            "Only support LodTensor and SelectedRows, Unexpected Input Type.");

--- a/paddle/fluid/operators/optimizers/momentum_op.h
+++ b/paddle/fluid/operators/optimizers/momentum_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <memory>
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -69,6 +70,7 @@ class MomentumOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("ParamOut", param_dim);
    ctx->SetOutputDim("VelocityOut", param_dim);
  }
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
@@ -351,23 +353,14 @@ class MomentumOpKernel : public framework::OpKernel<T> {
        VLOG(3) << "Grad SelectedRows contains no data!";
        return;
      }
-      auto* merged_grad = const_cast<framework::Scope&>(ctx.scope())
-                              .Var()
+      framework::SelectedRows tmp_merged_grad;
-                              ->GetMutable<framework::SelectedRows>();
+      framework::SelectedRows* merged_grad = &tmp_merged_grad;
      math::scatter::MergeAdd<DeviceContext, T> merge_func;
      merge_func(ctx.template device_context<DeviceContext>(), *grad,
                 merged_grad);
-      const int64_t* rows = nullptr;
+      const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace());
-#ifdef PADDLE_WITH_CUDA
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = merged_grad->rows().CUDAData(ctx.GetPlace());
-      } else {
-#endif
-        rows = merged_grad->rows().data();
-#ifdef PADDLE_WITH_CUDA
-      }
-#endif
      int64_t row_numel =
          merged_grad->value().numel() / merged_grad->rows().size();
      platform::ForRange<DeviceContext> for_range(

--- a/paddle/fluid/operators/optimizers/rmsprop_op.h
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.h
@@ -216,24 +216,14 @@ class RmspropOpKernel : public framework::OpKernel<T> {
      }
    } else if (grad_var->IsType<framework::SelectedRows>()) {
      auto &grad = grad_var->Get<framework::SelectedRows>();
-      auto *merged_grad = const_cast<framework::Scope &>(ctx.scope())
+      framework::SelectedRows tmp_merged_grad;
-                              .Var()
+      framework::SelectedRows *merged_grad = &tmp_merged_grad;
-                              ->GetMutable<framework::SelectedRows>();
      math::scatter::MergeAdd<DeviceContext, T> merge_func;
      merge_func(dev_ctx, grad, merged_grad);
      platform::ForRange<DeviceContext> for_range(dev_ctx, limit);
-      const int64_t *rows;
+      const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace());
-#ifdef PADDLE_WITH_CUDA
-      if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = merged_grad->rows().CUDAData(ctx.GetPlace());
-      } else {
-#endif
-        rows = merged_grad->rows().data();
-#ifdef PADDLE_WITH_CUDA
-      }
-#endif
      auto &merged_tensor = merged_grad->value();
      int64_t row_count = merged_grad->rows().size();
      int64_t row_numel = merged_tensor.numel() / row_count;

--- a/paddle/fluid/operators/optimizers/sgd_op.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op.cc
@@ -50,20 +50,18 @@ class SGDOp : public framework::OperatorWithKernel {
 class SGDOpInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    auto &input_var_n = ctx->Input("Param")[0];
-    auto input_var_n = op_desc.Input("Param")[0];
+    auto in_var_type = ctx->GetType(input_var_n);
-    auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType();
    PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS ||
                       in_var_type == framework::proto::VarType::LOD_TENSOR,
                   "The input Var's type should be LoDtensor or SelectedRows,"
                   " but the received var(%s)'s type is %s",
                   input_var_n, in_var_type);
-    for (auto &out_var_n : op_desc.Output("ParamOut")) {
+    for (auto &out_var_n : ctx->Output("ParamOut")) {
-      auto &out_var = block->FindRecursiveOrCreateVar(out_var_n);
+      if (ctx->GetType(out_var_n) != in_var_type) {
-      if (out_var.GetType() != in_var_type) {
+        ctx->SetType(out_var_n, in_var_type);
-        out_var.SetType(in_var_type);
      }
    }
  }

--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/pool_op.h"
+#include <unordered_map>
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
@@ -212,6 +213,12 @@ void Pool2dOpMaker::Make() {
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
+  AddAttr<bool>("use_quantizer",
+                "(bool, default false) "
+                "Set to true for operators that should be quantized and use "
+                "int8 kernel. "
+                "Only used on CPU.")
+      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "

--- a/paddle/fluid/operators/py_func_op.cc
+++ b/paddle/fluid/operators/py_func_op.cc
@@ -14,8 +14,11 @@
 #include "paddle/fluid/operators/py_func_op.h"
+#include <memory>
 #include <set>
 #include <string>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
@@ -91,15 +94,12 @@ static void CallPythonFunc(py::object *callable,
  }
 }
-class PyFuncOpVarTypInference : public framework::VarTypeInference {
+class PyFuncOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    bool has_out = (ctx->HasOutput("Out") && !ctx->Output("Out").empty());
-    auto &outs = op.Outputs();
-    bool has_out = (outs.count("Out") > 0 && !outs.at("Out").empty());
-    auto &ins = op.Inputs();
+    bool has_in = (ctx->HasInput("X") && !ctx->Input("X").empty());
-    bool has_in = (ins.count("X") > 0 && !ins.at("X").empty());
    /**
     * X or Out can be empty, so that py_func can be more flexible
@@ -107,8 +107,8 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
     */
    PADDLE_ENFORCE(has_in || has_out, "Input(X) or Output(Out) must exist");
-    PADDLE_ENFORCE_GE(boost::get<int>(op.GetAttr(kForwardPythonCallableId)), 0,
+    PADDLE_ENFORCE_GE(boost::get<int>(ctx->GetAttr(kForwardPythonCallableId)),
-                      "Function id cannot be less than 0");
+                      0, "Function id cannot be less than 0");
    if (!has_out) return;
@@ -118,7 +118,7 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
     * the corresponding forward variable
     */
    const std::string kGradVarSuffix = framework::kGradVarSuffix;
-    auto &out_var_names = outs.at("Out");
+    auto &out_var_names = ctx->Output("Out");
    for (auto &out_var_name : out_var_names) {
      if (out_var_name == framework::kEmptyVarName ||
          out_var_name.size() < kGradVarSuffix.size()) {
@@ -128,18 +128,17 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference {
      size_t len = out_var_name.size() - kGradVarSuffix.size();
      if (out_var_name.substr(len) == kGradVarSuffix) {
        auto fwd_var_name = out_var_name.substr(0, len);
-        auto *out_var_desc = block->FindVarRecursive(out_var_name);
+        PADDLE_ENFORCE(ctx->HasVar(out_var_name),
-        auto *fwd_var_desc = block->FindVarRecursive(fwd_var_name);
+                       "Backward variable %s not found", out_var_name);
-        PADDLE_ENFORCE_NOT_NULL(out_var_desc, "Backward variable %s not found",
+        PADDLE_ENFORCE(ctx->HasVar(fwd_var_name),
-                                out_var_name);
+                       "Forward variable %s not found", fwd_var_name);
-        PADDLE_ENFORCE_NOT_NULL(fwd_var_desc, "Forward variable %s not found",
-                                fwd_var_name);
        VLOG(10) << "Infer var_desc of Output(" << out_var_name << ") as Input("
                 << fwd_var_name << ")";
-        out_var_desc->SetShape(fwd_var_desc->GetShape());
-        out_var_desc->SetDataType(fwd_var_desc->GetDataType());
+        ctx->SetShape(out_var_name, ctx->GetShape(fwd_var_name));
-        out_var_desc->SetLoDLevel(fwd_var_desc->GetLoDLevel());
+        ctx->SetDataType(out_var_name, ctx->GetDataType(fwd_var_name));
-        out_var_desc->SetType(fwd_var_desc->GetType());
+        ctx->SetLoDLevel(out_var_name, ctx->GetLoDLevel(fwd_var_name));
+        ctx->SetType(out_var_name, ctx->GetType(fwd_var_name));
      }
    }
  }
@@ -309,5 +308,5 @@ class PyFuncOp : public framework::OperatorBase {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker,
-                  ops::PyFuncOpVarTypInference, ops::PyFuncOpShapeInference,
+                  ops::PyFuncOpVarTypeInference, ops::PyFuncOpShapeInference,
                  ops::PyFuncOpGradDescMaker);
--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -85,10 +85,10 @@ class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
    AddComment(R"DOC(
      CreateCustomReader Operator
-      A custom reader can be used for input data preprocessing. 
+      A custom reader can be used for input data preprocessing.
-      A custom reader holds its own sub-block, which will be executed in CPU 
+      A custom reader holds its own sub-block, which will be executed in CPU
-      in its 'ReadNext()' function. Users can configurate their own 
+      in its 'ReadNext()' function. Users can configurate their own
-      preprocessing pipelines by inserting operators into custom reader's 
+      preprocessing pipelines by inserting operators into custom reader's
      sub-block.
    )DOC");
  }
@@ -123,23 +123,22 @@ class CustomReaderInferShape : public framework::InferShapeBase {
 class CustomReaderInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override {
-                  framework::BlockDesc* block) const override {
+    auto& out_var_name = ctx->Output("Out")[0];
-    framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]);
+    PADDLE_ENFORCE(ctx->HasVar(out_var_name));
-    PADDLE_ENFORCE_NOT_NULL(out_reader);
+    ctx->SetType(out_var_name, framework::proto::VarType::READER);
-    out_reader->SetType(framework::proto::VarType::READER);
    auto sink_var_names =
-        boost::get<std::vector<std::string>>(op_desc.GetAttr("sink_var_names"));
+        boost::get<std::vector<std::string>>(ctx->GetAttr("sink_var_names"));
    const auto* sub_block =
-        boost::get<framework::BlockDesc*>(op_desc.GetAttr("sub_block"));
+        boost::get<framework::BlockDesc*>(ctx->GetAttr("sub_block"));
    std::vector<framework::proto::VarType::Type> res_data_types;
    for (const std::string& var_name : sink_var_names) {
      framework::VarDesc* var = sub_block->FindVar(var_name);
      PADDLE_ENFORCE_NOT_NULL(var);
      res_data_types.emplace_back(var->GetDataType());
    }
-    out_reader->SetDataTypes(res_data_types);
+    ctx->SetDataTypes(out_var_name, res_data_types);
  }
 };

--- a/paddle/fluid/operators/reader/read_op.cc
+++ b/paddle/fluid/operators/reader/read_op.cc
@@ -51,19 +51,16 @@ class ReadInferShape : public framework::InferShapeBase {
 class ReadInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override {
-                  framework::BlockDesc* block) const override {
+    bool infer_out = boost::get<bool>(ctx->GetAttr("infer_out"));
-    bool infer_out = boost::get<bool>(op_desc.GetAttr("infer_out"));
    if (infer_out) {
-      std::string reader_name = op_desc.Input("Reader")[0];
+      std::string reader_name = ctx->Input("Reader")[0];
-      std::vector<std::string> out_names = op_desc.Output("Out");
+      std::vector<std::string> out_names = ctx->Output("Out");
-      framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+      auto dtypes = ctx->GetDataTypes(reader_name);
-      auto dtypes = reader->GetDataTypes();
      PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size());
      for (size_t i = 0; i < dtypes.size(); ++i) {
-        framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]);
+        ctx->SetType(out_names[i], framework::proto::VarType::LOD_TENSOR);
-        out.SetType(framework::proto::VarType::LOD_TENSOR);
+        ctx->SetDataType(out_names[i], dtypes[i]);
-        out.SetDataType(dtypes[i]);
      }
    }
  }

--- a/paddle/fluid/operators/reader/reader_op_registry.cc
+++ b/paddle/fluid/operators/reader/reader_op_registry.cc
@@ -98,11 +98,10 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
  }
 }
-void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
+void FileReaderInferVarType::operator()(
-                                        framework::BlockDesc* block) const {
+    framework::InferVarTypeContext* ctx) const {
-  std::string reader_name = op_desc.Output("Out")[0];
+  std::string reader_name = ctx->Output("Out")[0];
-  framework::VarDesc* reader = block->FindVarRecursive(reader_name);
+  ctx->SetType(reader_name, framework::proto::VarType::READER);
-  reader->SetType(framework::proto::VarType::READER);
 }
 void DecoratedReaderInferShape::operator()(
@@ -125,13 +124,11 @@ void DecoratedReaderInferShape::operator()(
 }
 void DecoratedReaderInferVarType::operator()(
-    const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
+    framework::InferVarTypeContext* ctx) const {
-  std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
+  const std::string& in_reader_name = ctx->Input("UnderlyingReader")[0];
-  framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
+  const std::string& out_reader_name = ctx->Output("Out")[0];
-  std::string out_reader_name = op_desc.Output("Out")[0];
+  ctx->SetType(out_reader_name, framework::proto::VarType::READER);
-  framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
+  ctx->SetDataTypes(out_reader_name, ctx->GetDataTypes(in_reader_name));
-  out_reader->SetType(framework::proto::VarType::READER);
-  out_reader->SetDataTypes(in_reader->GetDataTypes());
 }
 void DecoratedReaderMakerBase::Make() {

--- a/paddle/fluid/operators/reader/reader_op_registry.h
+++ b/paddle/fluid/operators/reader/reader_op_registry.h
@@ -14,7 +14,9 @@
 #pragma once
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
@@ -59,8 +61,7 @@ class FileReaderInferShape : public framework::InferShapeBase {
 class FileReaderInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override;
-                  framework::BlockDesc* block) const override;
 };
 // general infershape for decorated reader
@@ -72,8 +73,7 @@ class DecoratedReaderInferShape : public framework::InferShapeBase {
 // general var type inference for decorated reader
 class DecoratedReaderInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override;
-                  framework::BlockDesc* block) const override;
 };
 class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {

--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
@@ -159,12 +159,9 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file
 class SaveOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    auto out_var_name = ctx->Output(LOOKUP_TABLE_PATH).front();
-    auto out_var_name = op_desc.Output(LOOKUP_TABLE_PATH).front();
+    ctx->SetType(out_var_name, framework::proto::VarType::RAW);
-    auto &out_var = block->FindRecursiveOrCreateVar(out_var_name);
-    auto var_type = framework::proto::VarType::RAW;
-    out_var.SetType(var_type);
  }
 };

--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/scale_op.h"
+#include <memory>
 #include <string>
 #include "paddle/fluid/operators/detail/safe_ref.h"
@@ -69,17 +70,13 @@ $$Out = scale*(X + bias)$$
 class ScaleOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    auto &in_var_name = ctx->Input("X").front();
-    auto &in_var_name = op_desc.Input("X").front();
+    auto out_var_name = ctx->Output("Out").front();
-    auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name));
-    auto out_var_name = op_desc.Output("Out").front();
-    auto *out_var = block->FindVarRecursive(out_var_name);
    if (in_var_name != out_var_name) {
-      out_var->SetType(in_var.GetType());
+      ctx->SetType(out_var_name, ctx->GetType(in_var_name));
-      out_var->SetDataType(in_var.GetDataType());
+      ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name));
    }
  }
 };

--- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc
@@ -59,7 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
        });
    AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
        .SetDefault(0);
-    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
+    AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
+                  "Skip calling InferShape() function in the runtime.")
        .SetDefault(true);
    AddComment(R"DOC(
 Sequence Enumerate Operator.

--- a/paddle/fluid/operators/slice_op.cu
+++ b/paddle/fluid/operators/slice_op.cu
@@ -31,18 +31,18 @@ __global__ void Padding(const paddle::platform::float16* d_out,
                        paddle::platform::float16* d_in) {
  int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
  if (out_idx < n) {
+    int64_t out_idx_tmp = out_idx;
    int coords[D] = {0};
    for (int i = D - 1; i >= 0; --i) {
-      coords[i] = out_idx % out_dims[i];
+      coords[i] = out_idx_tmp % out_dims[i];
-      out_idx /= out_dims[i];
+      out_idx_tmp /= out_dims[i];
      coords[i] += offsets[i];
    }
    int64_t in_idx = 0;
-    for (int i = 0; i < D - 1; ++i) {
+    for (int i = 0; i < D; ++i) {
-      in_idx += coords[i] * in_dims[i + 1];
+      in_idx = in_idx * in_dims[i] + coords[i];
    }
-    in_idx += coords[D - 1];
    d_in[in_idx] = d_out[out_idx];
  }
@@ -80,8 +80,8 @@ class SliceGradKernel<paddle::platform::CUDADeviceContext,
    set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
    int64_t numel = d_out->numel();
-    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1, 1, 1);
+    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
-    dim3 threads(PADDLE_CUDA_NUM_THREADS, 1, 1);
+    dim3 threads(PADDLE_CUDA_NUM_THREADS);
    auto stream = ctx.cuda_device_context().stream();
    auto out_shape = framework::vectorize2int(out_dims);

--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
        context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
    Tensor* logit_grad =
        context.Output<Tensor>(framework::GradVarName("Logits"));
-    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
+    framework::TensorCopy(*context.Input<Tensor>("Softmax"), context.GetPlace(),
+                          context.device_context(), logit_grad);
    T* logit_grad_data = logit_grad->data<T>();
    const int batch_size = logit_grad->dims()[0];

--- a/paddle/fluid/operators/split_selected_rows_op.cc
+++ b/paddle/fluid/operators/split_selected_rows_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/split_selected_rows_op.h"
+#include <memory>
 namespace paddle {
 namespace operators {
@@ -60,10 +62,9 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel {
 class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    for (auto &out_var : ctx->Output("Out")) {
-    for (auto &out_var : op_desc.Output("Out")) {
+      ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS);
-      block->Var(out_var)->SetType(framework::proto::VarType::SELECTED_ROWS);
    }
  }
 };

--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -94,6 +94,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
  }
 };
+// TODO(paddle-dev): Should use OpKernel.
 class SqueezeOp : public framework::OperatorBase {
 public:
  using OperatorBase::OperatorBase;

--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -12,6 +12,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/sum_op.h"
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <vector>
@@ -159,24 +160,20 @@ the LoD information with the first input.
 class SumOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc& op_desc,
+  void operator()(framework::InferVarTypeContext* ctx) const override {
-                  framework::BlockDesc* block) const override {
+    auto& inputs = ctx->Input("X");
-    auto& inputs = op_desc.Input("X");
    auto var_type = framework::proto::VarType::SELECTED_ROWS;
-    for (auto& name : op_desc.Input("X")) {
+    for (auto& name : ctx->Input("X")) {
-      VLOG(10) << name << " "
+      VLOG(10) << name << " " << ctx->GetType(name);
-               << block->FindRecursiveOrCreateVar(name).GetType();
    }
    bool any_input_is_lod_tensor = std::any_of(
-        inputs.begin(), inputs.end(), [block](const std::string& name) {
+        inputs.begin(), inputs.end(), [ctx](const std::string& name) {
-          return block->FindRecursiveOrCreateVar(name).GetType() ==
+          return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR;
-                 framework::proto::VarType::LOD_TENSOR;
        });
-    auto is_tensor_array = [block](const std::string& name) {
+    auto is_tensor_array = [ctx](const std::string& name) {
-      return block->FindRecursiveOrCreateVar(name).GetType() ==
+      return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR_ARRAY;
-             framework::proto::VarType::LOD_TENSOR_ARRAY;
    };
    bool any_input_is_tensor_array =
@@ -188,8 +185,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
      if (!all_inputs_are_tensor_array) {
        std::ostringstream os;
        for (auto& each : inputs) {
-          os << "    " << each << " type is "
+          os << "    " << each << " type is " << ctx->GetType(each) << "\n";
-             << block->FindRecursiveOrCreateVar(each).GetType() << "\n";
        }
        PADDLE_ENFORCE(all_inputs_are_tensor_array,
                       "Not all inputs are tensor array:\n%s", os.str());
@@ -199,11 +195,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
      var_type = framework::proto::VarType::LOD_TENSOR;
    }
-    auto out_var_name = op_desc.Output("Out").front();
+    auto out_var_name = ctx->Output("Out").front();
-    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    ctx->SetType(out_var_name, var_type);
-    out_var.SetType(var_type);
+    ctx->SetDataType(out_var_name, ctx->GetDataType(inputs.front()));
-    auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front()));
-    out_var.SetDataType(in_var.GetDataType());
  }
 };

--- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc
+++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
@@ -177,10 +177,9 @@ class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase {
 class LoDTensorArray2TensorGradInferVarType
    : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    for (auto &out_var : ctx->Output(framework::GradVarName("X"))) {
-    for (auto &out_var : op_desc.Output(framework::GradVarName("X"))) {
+      ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY);
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY);
    }
  }
 };

--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -46,8 +46,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
 class TensorRTEngineInferVarType : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {}
-                  framework::BlockDesc *block) const override {}
 };
 }  // namespace operators

--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -112,17 +112,16 @@ uniform distribution. The random result is in set [min, max].
 class UniformRandomOpVarTypeInference : public framework::VarTypeInference {
 public:
-  void operator()(const framework::OpDesc &op_desc,
+  void operator()(framework::InferVarTypeContext *ctx) const override {
-                  framework::BlockDesc *block) const override {
+    auto out_var_name = ctx->Output("Out").front();
-    auto out_var_name = op_desc.Output("Out").front();
    auto var_data_type = static_cast<framework::proto::VarType::Type>(
-        boost::get<int>(op_desc.GetAttr("dtype")));
+        boost::get<int>(ctx->GetAttr("dtype")));
-    auto out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    if (ctx->GetType(out_var_name) !=
-    if (out_var.GetType() != framework::proto::VarType::SELECTED_ROWS) {
+        framework::proto::VarType::SELECTED_ROWS) {
-      out_var.SetType(framework::proto::VarType::LOD_TENSOR);
+      ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR);
    }
-    out_var.SetDataType(var_data_type);
+    ctx->SetDataType(out_var_name, var_data_type);
  }
 };

--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py
--- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py
+++ b/python/paddle/fluid/contrib/utils/lookup_table_utils.py
--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/imperative/__init__.py
+++ b/python/paddle/fluid/imperative/__init__.py
--- a/python/paddle/fluid/imperative/profiler.py
+++ b/python/paddle/fluid/imperative/profiler.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
--- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
--- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py
--- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py
--- a/python/paddle/reader/__init__.py
+++ b/python/paddle/reader/__init__.py
--- a/python/paddle/reader/creator.py
+++ b/python/paddle/reader/creator.py
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
--- a/tools/manylinux1/build_scripts/build.sh
+++ b/tools/manylinux1/build_scripts/build.sh