merge develop

test=develop

merge develop
test=develop
d231e550 · sneaxiy · cf8d2e67 · 080740b3 · d231e550 · d231e550
171 changed files
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -118,9 +118,10 @@ paddle.fluid.layers.label_smooth ArgSpec(args=['label', 'prior_dist', 'epsilon',
 paddle.fluid.layers.roi_pool ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0))
 paddle.fluid.layers.roi_align ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None))
 paddle.fluid.layers.dice_loss ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,))
-paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR'))
+paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None))
 paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',))
-paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.layers.resize_nearest ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,))
@@ -178,6 +179,7 @@ paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], vara
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
+paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
@@ -200,6 +202,7 @@ paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'],
 paddle.fluid.layers.create_parameter ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None))
 paddle.fluid.layers.create_global_var ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None))
 paddle.fluid.layers.cast ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.tensor_array_to_tensor ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.concat ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.sums ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.assign ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,))

--- a/paddle/fluid/framework/data_device_transform.cc
+++ b/paddle/fluid/framework/data_device_transform.cc
@@ -18,8 +18,8 @@ namespace framework {

 void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
                     Tensor *out) {
-  VLOG(3) << "DeviceTransform in, src_place " << in.place()
-          << " dst_place: " << dst_place;
+  VLOG(30) << "DeviceTransform in, src_place " << in.place()
+           << " dst_place: " << dst_place;

  PADDLE_ENFORCE_NE(
      in.place().which(), dst_place.which(),

--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -49,10 +49,10 @@ class TestOpWithKernel : public OperatorWithKernel {
  OpKernelType GetExpectedKernelType(
      const ExecutionContext& ctx) const override {
    if (Attr<bool>("use_gpu")) {
-      VLOG(3) << "force use gpu kernel";
+      VLOG(30) << "force use gpu kernel";
      return OpKernelType(proto::VarType::FP32, platform::CUDAPlace(0));
    } else {
-      VLOG(3) << "use default kernel";
+      VLOG(30) << "use default kernel";
      return OpKernelType(proto::VarType::FP32,
                          ctx.Input<Tensor>("input")->place());
    }
@@ -148,7 +148,7 @@ TEST(Operator, CPUtoGPU) {
  // get output
  auto* output2 = scope.Var("OUT2");
  gpu_op->Run(scope, cuda_place);
-  VLOG(3) << "after gpu_op run";
+  VLOG(30) << "after gpu_op run";

  // auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
  paddle::platform::DeviceContextPool& pool =

--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -60,7 +60,7 @@ void BroadcastOpHandle::BroadcastOneVar(
  PADDLE_ENFORCE_NOT_NULL(in_var);
  Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
  if (UNLIKELY(!in_tensor.IsInitialized())) {
-    VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!";
+    VLOG(30) << "in var " << in_var_handle.name_ << "not inited, return!";
    return;
  }


--- a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
+++ b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc
@@ -45,8 +45,8 @@ std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
        IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view);
    compute_op->SetLockAndRecordEventFree(is_lock_and_record_event_free);
    if (is_lock_and_record_event_free) {
-      VLOG(10) << "Set is_lock_and_record_event_free be true in op "
-               << compute_op->DebugString();
+      VLOG(100) << "Set is_lock_and_record_event_free be true in op "
+                << compute_op->DebugString();
    }
  }
  return ir_graph;

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -399,7 +399,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
              for (size_t i = 0; i < backward_vars.size(); i += 2) {
                auto &p_name = backward_vars[i];
                auto &g_name = backward_vars[i + 1];
-                VLOG(10) << "Bcast " << g_name << " for parameter " << p_name;
+                VLOG(100) << "Bcast " << g_name << " for parameter " << p_name;

                switch (strategy_.reduce_) {
                  case BuildStrategy::ReduceStrategy::kReduce:
@@ -809,8 +809,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
          node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
      PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U);
      op_dev_id = GetAppropriateDeviceID({send_param_grad[1]});
-      VLOG(10) << "send grad " << input_var_names[0] << " origin "
-               << send_param_grad[1] << " place: " << op_dev_id;
+      VLOG(100) << "send grad " << input_var_names[0] << " origin "
+                << send_param_grad[1] << " place: " << op_dev_id;
      for (auto &varname : input_var_names) {
        sharded_var_device->emplace(varname, op_dev_id);
      }
@@ -826,9 +826,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
    if (recv_param_grad.size() == 2U) {
      op_dev_id =
          GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device);
-      VLOG(10) << "recv param " << recv_param_grad[0]
-               << " get grad place: " << recv_param_grad[1]
-               << " place: " << op_dev_id;
+      VLOG(100) << "recv param " << recv_param_grad[0]
+                << " get grad place: " << recv_param_grad[1]
+                << " place: " << op_dev_id;
    } else {
      op_dev_id = GetAppropriateDeviceID(output_var_names);
    }

--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -140,8 +140,8 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
        if (next_compute_op != nullptr) {
          if (compute_ref_cnt_map.count(next_compute_op)) {
            compute_ref_cnt_map[next_compute_op]->AddVar(var_name);
-            VLOG(5) << "Add reference count of " << var_name << " to Operator "
-                    << next_compute_op->Name();
+            VLOG(50) << "Add reference count of " << var_name << " to Operator "
+                     << next_compute_op->Name();
          } else {
            // Create new reference_count_op_handle
            ir::Node *ref_cnt_node = graph->CreateEmptyNode(

--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
                        ->stream();
      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
-      VLOG(10) << place_ << "RUN Scale loss grad op";
+      VLOG(100) << place_ << "RUN Scale loss grad op";
    });
 #endif
  }

--- a/paddle/fluid/framework/details/sequential_execution_pass.cc
+++ b/paddle/fluid/framework/details/sequential_execution_pass.cc
@@ -94,8 +94,8 @@ std::unique_ptr<ir::Graph> SequentialExecutionPass::ApplyImpl(
    op_node_list[i - 1]->outputs.push_back(dep_var);
    dep_var->outputs.push_back(op_node_list[i]);
    dep_var->inputs.push_back(op_node_list[i - 1]);
-    VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name()
-             << " and " << op_node_list[i]->Name();
+    VLOG(100) << "Add dependencies between " << op_node_list[i - 1]->Name()
+              << " and " << op_node_list[i]->Name();
  }
  return graph;
 }

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -210,16 +210,16 @@ void ThreadedSSAGraphExecutor::RunOp(
    details::OpHandleBase *op) {
  auto op_run = [ready_var_q, op, this] {
    try {
-      if (VLOG_IS_ON(10)) {
-        VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
+      if (VLOG_IS_ON(100)) {
+        VLOG(100) << op << " " << op->Name() << " : " << op->DebugString();
      }
      if (LIKELY(!strategy_.dry_run_)) {
        op->Run(strategy_.use_cuda_);
      }
-      VLOG(10) << op << " " << op->Name() << " Done ";
+      VLOG(100) << op << " " << op->Name() << " Done ";
      running_ops_--;
      ready_var_q->Extend(op->Outputs());
-      VLOG(10) << op << " " << op->Name() << "Signal posted";
+      VLOG(100) << op << " " << op->Name() << "Signal posted";
    } catch (...) {
      exception_holder_.Catch(std::current_exception());
    }

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -43,7 +43,7 @@ ExecutorPrepareContext::ExecutorPrepareContext(
 }

 ExecutorPrepareContext::~ExecutorPrepareContext() {
-  VLOG(5) << "destroy ExecutorPrepareContext";
+  VLOG(50) << "destroy ExecutorPrepareContext";
 }

 template <typename RefCntMap>
@@ -60,7 +60,7 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
        if ((it->second)-- == 1) {
          auto* var = scope.FindVar(name);
          if (var != nullptr) {
-            VLOG(10) << "Erase tensor \'" << name << "\'";
+            VLOG(100) << "Erase tensor \'" << name << "\'";
            if (var->IsType<LoDTensor>()) {
              erase_tensors.insert(var->GetMutable<LoDTensor>());
            } else if (var->IsType<SelectedRows>()) {
@@ -141,21 +141,21 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
      if (var->Persistable()) {
        auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " global, which pointer is " << ptr;
+        VLOG(30) << "Create Variable " << var->Name()
+                 << " global, which pointer is " << ptr;
      } else {
        auto* ptr = scope->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " locally, which pointer is " << ptr;
+        VLOG(30) << "Create Variable " << var->Name()
+                 << " locally, which pointer is " << ptr;
      }
    }
  } else {
    for (auto& var : global_block.AllVars()) {
      auto* ptr = scope->Var(var->Name());
      InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-              << ptr;
+      VLOG(30) << "Create variable " << var->Name() << ", which pointer is "
+               << ptr;
    }
  }
 }
@@ -286,7 +286,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
    int i = 0;
    for (auto& feed_target : (*feed_targets)) {
      std::string var_name = feed_target.first;
-      VLOG(3) << "feed target's name: " << var_name;
+      VLOG(30) << "feed target's name: " << var_name;

      // prepend feed op
      auto* op = global_block->PrependOp();
@@ -309,7 +309,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
    int i = 0;
    for (auto& fetch_target : (*fetch_targets)) {
      std::string var_name = fetch_target.first;
-      VLOG(3) << "fetch target's name: " << var_name;
+      VLOG(30) << "fetch target's name: " << var_name;

      // append fetch op
      auto* op = global_block->AppendOp();
@@ -459,7 +459,7 @@ void Executor::RunPreparedContext(

 void Executor::EnableMKLDNN(const ProgramDesc& program) {
 #ifdef PADDLE_WITH_MKLDNN
-  VLOG(3) << "use_mkldnn=True";
+  VLOG(30) << "use_mkldnn=True";
  for (size_t bid = 0; bid < program.Size(); ++bid) {
    auto* block = const_cast<ProgramDesc&>(program).MutableBlock(bid);
    for (auto* op : block->AllOps()) {

--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -25,7 +25,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input,
                     const std::string& var_name, size_t index) {
  // If var_name Variable is not found in GlobalScope, a new variable will
  // be created.
-  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
+  VLOG(30) << "SetFeedVariable name=" << var_name << " index=" << index;
  Variable* g_feed_value = scope->Var(var_name);
  auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
  if (index >= feed_inputs.size()) {
@@ -47,8 +47,8 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
                 typeid(FeedFetchList).name());
  auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
  auto& tensor = fetch_outputs[index];
-  VLOG(3) << "Fetch " << var_name << " with index " << index
-          << " shape= " << tensor.dims();
+  VLOG(30) << "Fetch " << var_name << " with index " << index
+           << " shape= " << tensor.dims();
  PADDLE_ENFORCE_LT(index, fetch_outputs.size());
  return tensor;
 }

--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -147,19 +147,19 @@ void PrepareParameters(Graph* graph, const Param& param) {
  scope->Var(param.LSTMX)->GetMutable<LoDTensor>();
  scope->Var(param.LSTMOUT)->GetMutable<LoDTensor>();

-#define GATE_W(name__)                                               \
-  auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0");            \
-  auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1");            \
-  auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0");            \
-  CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0);       \
-  VLOG(4) << #name__ "_w0"                                           \
-          << " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
-  VLOG(4) << #name__ "_w1"                                           \
-          << " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
-  VLOG(4) << #name__ "_b0"                                           \
-          << " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
-  auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>();       \
-  auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>();       \
+#define GATE_W(name__)                                                \
+  auto* W_##name__##_w0 = scope->FindVar(#name__ ".w_0");             \
+  auto* W_##name__##_w1 = scope->FindVar(#name__ ".w_1");             \
+  auto* W_##name__##_b0 = scope->FindVar(#name__ ".b_0");             \
+  CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0);        \
+  VLOG(40) << #name__ "_w0"                                           \
+           << " shape: " << W_##name__##_w0->Get<LoDTensor>().dims(); \
+  VLOG(40) << #name__ "_w1"                                           \
+           << " shape: " << W_##name__##_w1->Get<LoDTensor>().dims(); \
+  VLOG(40) << #name__ "_b0"                                           \
+           << " shape: " << W_##name__##_b0->Get<LoDTensor>().dims(); \
+  auto& W_##name__##_w0_t = W_##name__##_w0->Get<LoDTensor>();        \
+  auto& W_##name__##_w1_t = W_##name__##_w1->Get<LoDTensor>();        \
  auto& W_##name__##_b0_t = W_##name__##_b0->Get<LoDTensor>();

  GATE_W(forget);
@@ -208,7 +208,7 @@ void PrepareLSTMWeight(const LoDTensor& W_forget_w0,
  int D = W_forget_w0.dims()[0];
  int M = W_forget_w1.dims()[0];
  out->Resize(make_ddim({D + M, 4 * D}));
-  VLOG(3) << "LSTMWeight resized to " << out->dims();
+  VLOG(30) << "LSTMWeight resized to " << out->dims();

  float* out_data = out->mutable_data<float>(platform::CPUPlace());
  std::array<const float*, 4> tensors(

--- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc
@@ -57,7 +57,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
  int found_conv_bias_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
-    VLOG(4) << "handle ConvBias fuse";
+    VLOG(40) << "handle ConvBias fuse";
    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
                              conv_bias_pattern);                      // Filter
    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern);  // tmp
@@ -74,7 +74,7 @@ std::unique_ptr<ir::Graph> ConvBiasFusePass::ApplyImpl(
    // check if fuse can be done and if MKL-DNN should be used
    FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
    if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) {
-      VLOG(3) << "do not perform conv+bias fuse";
+      VLOG(30) << "do not perform conv+bias fuse";
      return;
    }


--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -121,7 +121,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
  int found_conv_bn_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
-    VLOG(4) << "handle ConvBN fuse";
+    VLOG(40) << "handle ConvBN fuse";

    // conv, batch_norm,
    // conv_weight, conv_out,
@@ -133,7 +133,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
    // check if fuse can be done and if MKL-DNN should be used
    FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm);
    if (fuse_option == DO_NOT_FUSE) {
-      VLOG(3) << "do not perform conv+bn fuse";
+      VLOG(30) << "do not perform conv+bn fuse";
      return;
    }

@@ -241,7 +241,7 @@ std::unique_ptr<ir::Graph> ConvEltwiseAddBNFusePass::ApplyImpl(
  int found_conv_bn_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
-    VLOG(4) << "handle ConvBN fuse";
+    VLOG(40) << "handle ConvBN fuse";

    // conv, batch_norm,
    // conv_weight, conv_out,

--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
  int found_conv_relu_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
-    VLOG(4) << "handle ConvReLU fuse";
+    VLOG(40) << "handle ConvReLU fuse";
    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
                              conv_relu_pattern);                      // Filter
    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern);  // tmp
@@ -48,7 +48,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(

    FuseOptions fuse_option = FindFuseOption(*conv, *relu);
    if (fuse_option == DO_NOT_FUSE) {
-      VLOG(3) << "do not perform conv+relu fuse";
+      VLOG(30) << "do not perform conv+relu fuse";
      return;
    }


--- a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc
@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> DepthwiseConvMKLDNNPass::ApplyImpl(
  int found_depthwise_conv_mkldnn_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
-    VLOG(3) << "handle DepthwiseConvMKLDNN fuse";
+    VLOG(30) << "handle DepthwiseConvMKLDNN fuse";
    GET_NODE(depthwise_conv, (*pattern));
    depthwise_conv->Op()->SetType("conv2d");
    found_depthwise_conv_mkldnn_count++;

--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -39,7 +39,7 @@ std::unique_ptr<ir::Graph> FCFusePass::ApplyImpl(
  int found_fc_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
-    VLOG(4) << "handle FC fuse";
+    VLOG(40) << "handle FC fuse";
    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);

--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
@@ -61,7 +61,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddAct fuse";
+    VLOG(40) << "handle FuseElewiseAddAct fuse";
    GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
                              elewise_add_act_pattern);
@@ -77,10 +77,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
    Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
        g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n);

-    VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
-            << ele_add->Name() << " -> " << ele_out_n << "\n"
-            << "\t " << ele_out_n << " -> " << act->Name() << " -> "
-            << act_out_n;
+    VLOG(40) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
+             << ele_add->Name() << " -> " << ele_out_n << "\n"
+             << "\t " << ele_out_n << " -> " << act->Name() << " -> "
+             << act_out_n;

    ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node);
    found_elewise_add_act_count++;
@@ -113,7 +113,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddAct fuse";
+    VLOG(40) << "handle FuseElewiseAddAct fuse";
    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
@@ -129,9 +129,9 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
    Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
        g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n);

-    VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
-            << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
-            << ele_add->Name() << " -> " << elewise_add_out_n;
+    VLOG(40) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
+             << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
+             << ele_add->Name() << " -> " << elewise_add_out_n;

    ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node);
    found_elewise_add_act_count++;
@@ -165,7 +165,7 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
-    VLOG(4) << "handle FuseElewiseAddActGrad1 fuse";
+    VLOG(40) << "handle FuseElewiseAddActGrad1 fuse";
    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out,
@@ -208,10 +208,10 @@ std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(

    auto fused_node = g->CreateOpNode(&desc);

-    VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
-            << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
-            << d_itermediate_out_n << " and " << act_out_n << " -> "
-            << ele_add_grad->Name() << " -> " << d_itermediate_out_n;
+    VLOG(40) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
+             << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
+             << d_itermediate_out_n << " and " << act_out_n << " -> "
+             << ele_add_grad->Name() << " -> " << d_itermediate_out_n;

    ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node);
    found_elewise_add_act_count++;

--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -92,7 +92,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {

 std::map<std::string, std::vector<ir::Node *>> Graph::InitFromProgram(
    const ProgramDesc &program) {
-  VLOG(3) << "block in program:" << program_.Size();
+  VLOG(30) << "block in program:" << program_.Size();
  std::unordered_map<std::string, VarDesc *> all_vars;
  // var nodes for each var name, will have multiple versions in SSA
  std::map<std::string, std::vector<ir::Node *>> var_nodes;
@@ -160,7 +160,7 @@ void Graph::ResolveHazard(
    auto it_old = versions.rbegin();
    ++it_old;
    for (; it_old != versions.rend(); it_new = it_old, ++it_old) {
-      VLOG(3) << "deal with var: " << (*it_new)->Name();
+      VLOG(30) << "deal with var: " << (*it_new)->Name();
      ir::Node *write_op =
          (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0];
      const auto &read_ops = (*it_old)->outputs;

--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -89,7 +89,7 @@ class Graph {
                   attr_name);
    attrs_[attr_name] = attr;
    attr_dels_[attr_name] = [attr, attr_name]() {
-      VLOG(3) << "deleting " << attr_name;
+      VLOG(30) << "deleting " << attr_name;
      delete attr;
    };
  }

--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -33,8 +33,9 @@ void SortHelper(
    }
  }

-  VLOG(3) << "topology sort insert: " << node->Name()
-          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
+  VLOG(30) << "topology sort insert: " << node->Name()
+           << reinterpret_cast<void *>(node) << " input "
+           << node->inputs.size();
  ret->push_back(node);
 }

@@ -103,9 +104,9 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
    for (auto &var : n->inputs) {
      for (auto &adj_n : var->inputs) {
        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
-        VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
-                << " -> " << n->Name() << reinterpret_cast<void *>(n)
-                << "  via " << var->Name() << reinterpret_cast<void *>(var);
+        VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
+                 << " -> " << n->Name() << reinterpret_cast<void *>(n)
+                 << "  via " << var->Name() << reinterpret_cast<void *>(var);
        adj_list[n].insert(adj_n);
      }
    }
@@ -163,10 +164,10 @@ size_t GraphNum(const Graph &graph) {
    graph_nodes.emplace_back(g_nodes);
  }

-  if (VLOG_IS_ON(10)) {
-    VLOG(10) << "graph_num: " << graph_nodes.size();
+  if (VLOG_IS_ON(100)) {
+    VLOG(100) << "graph_num: " << graph_nodes.size();
    for (auto &g_n : graph_nodes) {
-      VLOG(10) << "graph_nodes: " << g_n.size();
+      VLOG(100) << "graph_nodes: " << g_n.size();
      if (g_n.size() < 10) {
        std::stringstream out;
        for (auto &node : g_n) {
@@ -180,7 +181,7 @@ size_t GraphNum(const Graph &graph) {
          }
          out << "]";
        }
-        VLOG(10) << out.str();
+        VLOG(100) << out.str();
      }
    }
  }

--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <algorithm>
 #include <array>
 #include <string>
 #include <vector>
@@ -91,19 +92,19 @@ void GraphPatternDetector::operator()(Graph *graph,
  PrettyLogEndl(Style::detail(), "---  detect %d subgraphs", subgraphs.size());
  int id = 0;
  for (auto &g : subgraphs) {
-    VLOG(3) << "optimizing #" << id++ << " subgraph";
+    VLOG(30) << "optimizing #" << id++ << " subgraph";
    handler(g, graph);
  }
 }

 bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
-  VLOG(3) << "mark pdnodes in graph";
+  VLOG(30) << "mark pdnodes in graph";
  if (graph.Nodes().empty()) return false;

  for (auto &node : GraphTraits::DFS(graph)) {
    for (const auto &pdnode : pattern_.nodes()) {
      if (pdnode->Tell(&node)) {
-        VLOG(4) << "pdnode " << pdnode->name() << " marked";
+        VLOG(40) << "pdnode " << pdnode->name() << " marked";
        pdnodes2nodes_[pdnode.get()].insert(&node);
      }
    }
@@ -111,7 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
  // Check to early stop if some PDNode can't find matched Node.
  for (auto &pdnode : pattern_.nodes()) {
    if (!pdnodes2nodes_.count(pdnode.get())) {
-      VLOG(4) << pdnode->name() << " can't find matched Node, early stop";
+      VLOG(40) << pdnode->name() << " can't find matched Node, early stop";
      // return false;
    }
  }
@@ -120,7 +121,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
      GetMarkedNodes(const_cast<Graph *>(&graph)).insert(n);
    }
  }
-  VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
+  VLOG(30) << pdnodes2nodes_.size() << " nodes marked";

  return !pdnodes2nodes_.empty();
 }
@@ -213,7 +214,7 @@ GraphPatternDetector::DetectPatterns() {
  // Extend a PDNode to subgraphs by deducing the connection relations defined
  // in edges of PDNodes.
  for (const auto &edge : pattern_.edges()) {
-    VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
+    VLOG(40) << "check " << edge.first->name() << " -> " << edge.second->name();
    // TODO(Superjomn) Fix bug here, the groups might be duplicate here.
    // Each role has two PDNodes, which indicates two roles.
    // Detect two Nodes that can match these two roles and they are connected.
@@ -224,7 +225,7 @@ GraphPatternDetector::DetectPatterns() {
    // source -> target
    for (Node *source : pdnodes2nodes_[edge.first]) {
      for (Node *target : pdnodes2nodes_[edge.second]) {
-        VLOG(8) << "check " << source->id() << " -- " << target->id();
+        VLOG(80) << "check " << source->id() << " -- " << target->id();
        // TODO(Superjomn) add some prune strategies.
        for (const auto &group : pre_groups) {
          HitGroup new_group = group;
@@ -240,12 +241,13 @@ GraphPatternDetector::DetectPatterns() {
        }
      }
    }
-    VLOG(3) << "step " << step << " get records: " << cur_groups.size();
+    VLOG(30) << "step " << step << " get records: " << cur_groups.size();
    for (auto &group : cur_groups) {
      for (auto &item : group.roles) {
-        VLOG(4) << "node " << item.second->id() << " as " << item.first->name();
+        VLOG(40) << "node " << item.second->id() << " as "
+                 << item.first->name();
      }
-      VLOG(4) << "=========================================================";
+      VLOG(40) << "=========================================================";
    }
  }


--- a/paddle/fluid/framework/ir/graph_viz_pass.cc
+++ b/paddle/fluid/framework/ir/graph_viz_pass.cc
@@ -41,7 +41,7 @@ std::string FormatName(const Node* node) {
 std::unique_ptr<ir::Graph> GraphVizPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string graph_viz_path = Get<std::string>(kGraphVizPath);
-  VLOG(3) << "draw IR graph viz to " << graph_viz_path;
+  VLOG(30) << "draw IR graph viz to " << graph_viz_path;
  std::unique_ptr<std::ostream> fout(new std::ofstream(graph_viz_path));
  PADDLE_ENFORCE(fout->good());
  std::ostream& sout = *fout;

--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@@ -20,7 +20,7 @@ namespace ir {

 std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
-  VLOG(3) << "Aplies MKL-DNN placement strategy.";
+  VLOG(30) << "Aplies MKL-DNN placement strategy.";
  for (const Node* n : graph->Nodes()) {
    if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
      n->Op()->SetAttr("use_mkldnn", true);

--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@@ -62,7 +62,7 @@ VarDesc UpdateGradVarDesc(
        string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat);
    VarDesc repeated_var = CopyVarDesc(var_desc);
    repeated_var.SetName(new_gname);
-    VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat;
+    VLOG(30) << "update " << var_desc->Name() << " to repeat " << repeat;
    return repeated_var;
  }
  return *var_desc;
@@ -78,7 +78,7 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(

  std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
  auto origin_nodes = graph->ReleaseNodes();
-  VLOG(3) << "origin nodes count: " << origin_nodes.size();
+  VLOG(30) << "origin nodes count: " << origin_nodes.size();
  ir::Graph& result = *graph;

  // 1. record op nodes of different roles
@@ -137,8 +137,8 @@ std::unique_ptr<Graph> BatchMergePass::ApplyImpl(
            "%s.repeat.%d", repeated_op.Input("Variance")[0], i);
        bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]);
        bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]);
-        VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to "
-                << new_mean_name;
+        VLOG(30) << "renaming " << repeated_op.Input("Mean")[0] << " to "
+                 << new_mean_name;
        repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name);
        repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name);
        repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0],

--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -76,7 +76,7 @@ class Pass {
                   attr_name);
    attrs_[attr_name] = attr;
    attr_dels_[attr_name] = [attr, attr_name]() {
-      VLOG(3) << "deleting " << attr_name;
+      VLOG(30) << "deleting " << attr_name;
      delete attr;
    };
  }

--- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc
@@ -12,10 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
+#include <set>
+#include <string>
+
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h"
 #include "paddle/fluid/framework/lod_tensor.h"

 namespace paddle {
@@ -159,10 +162,7 @@ PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) {

  std::set<std::string> acts({"sigmoid", "tanh", "relu", "identity"});
  PDNode* act = pattern->NewNode(
-      [=](Node* x) {
-        return x && x->IsOp() && acts.count(x->Op()->Type());
-
-      },
+      [=](Node* x) { return x && x->IsOp() && acts.count(x->Op()->Type()); },
      "act");

  PDNode* fc_out = pattern->NewNode(
@@ -196,7 +196,7 @@ std::unique_ptr<ir::Graph> SeqConcatFcFusePass::ApplyImpl(

  detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph,
                            Graph* graph) {
-    VLOG(4) << "get one concat pattern";
+    VLOG(40) << "get one concat pattern";
    // fc
    GET_NODE(fc_w, detector.pattern());
    GET_NODE(fc_bias, detector.pattern());

--- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc
@@ -60,7 +60,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) {

  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
-    VLOG(4) << "handle SeqConv EltAdd Relu fuse";
+    VLOG(40) << "handle SeqConv EltAdd Relu fuse";
    GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern);

--- a/paddle/fluid/framework/lod_rank_table.cc
+++ b/paddle/fluid/framework/lod_rank_table.cc
@@ -31,7 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
    TableItem item;
    item.index = i;
    item.length = vec[i + 1] - vec[i];
-    VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
+    VLOG(100) << "Add item to rank table " << item.index << " " << item.length;
    items_.emplace_back(item);
  }
  // NOTE(yuyang18):

--- a/paddle/fluid/framework/mixed_vector_test.cc
+++ b/paddle/fluid/framework/mixed_vector_test.cc
@@ -51,7 +51,7 @@ TEST(mixed_vector, InitWithCount) {
 TEST(mixed_vector, ForEach) {
  vec<int> tmp;
  for (auto& v : tmp) {
-    VLOG(3) << v;
+    VLOG(30) << v;
  }
 }


--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -71,7 +71,7 @@ void NaiveExecutor::Prepare(Scope *parent_scope,

 void NaiveExecutor::Run() {
  for (auto &op : ops_) {
-    VLOG(4) << "run " << op->Type();
+    VLOG(40) << "run " << op->Type();
    op->Run(*scope_, place_);
  }
 }
@@ -95,21 +95,21 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
      if (var->Persistable()) {
        auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " global, which pointer is " << ptr;
+        VLOG(30) << "Create Variable " << var->Name()
+                 << " global, which pointer is " << ptr;
      } else {  // Create temporary variables in local scope.
        auto *ptr = scope->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
-        VLOG(3) << "Create Variable " << var->Name()
-                << " locally, which pointer is " << ptr;
+        VLOG(30) << "Create Variable " << var->Name()
+                 << " locally, which pointer is " << ptr;
      }
    }
  } else {
    for (auto &var : global_block.AllVars()) {
      auto *ptr = scope->Var(var->Name());
      InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
-              << ptr;
+      VLOG(30) << "Create variable " << var->Name() << ", which pointer is "
+               << ptr;
    }
  }
 }

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -82,7 +82,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
    auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
    auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
    if (in_var->GetType() != proto::VarType::LOD_TENSOR) {
-      VLOG(3) << "input " << in << " is not LodTensor";
+      VLOG(30) << "input " << in << " is not LodTensor";
      return;
    }
    out_var->SetLoDLevel(in_var->GetLoDLevel());
@@ -241,32 +241,32 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
    const proto::OpProto::Attr &attr = GetProtoAttr(name);
    switch (attr.type()) {
      case proto::AttrType::BOOLEANS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to BOOLEANS";
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from INTS to BOOLEANS";
        this->attrs_[name] = std::vector<bool>();
        break;
      }
      case proto::AttrType::INTS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to INTS";
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from INTS to INTS";
        this->attrs_[name] = std::vector<int>();
        break;
      }
      case proto::AttrType::FLOATS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to FLOATS";
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from INTS to FLOATS";
        this->attrs_[name] = std::vector<float>();
        break;
      }
      case proto::AttrType::STRINGS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to STRINGS";
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from INTS to STRINGS";
        this->attrs_[name] = std::vector<std::string>();
        break;
      }
      case proto::AttrType::BLOCKS: {
-        VLOG(11) << "SetAttr: " << Type() << ", " << name
-                 << " from INTS to BLOCKS";
+        VLOG(110) << "SetAttr: " << Type() << ", " << name
+                  << " from INTS to BLOCKS";
        this->SetBlocksAttr(name, std::vector<BlockDesc *>());
        return;
      }
@@ -499,13 +499,13 @@ void OpDesc::CheckAttrs() {
 }

 void OpDesc::InferShape(const BlockDesc &block) const {
-  VLOG(3) << "CompileTime infer shape on " << Type();
+  VLOG(30) << "CompileTime infer shape on " << Type();
  InitInferShapeFuncs();
  auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
  PADDLE_ENFORCE(static_cast<bool>(infer_shape),
                 "%s's infer_shape has not been registered", this->Type());
  CompileTimeInferShapeContext ctx(*this, block);
-  if (VLOG_IS_ON(10)) {
+  if (VLOG_IS_ON(100)) {
    std::ostringstream sout;
    auto inames = this->InputArgumentNames();
    sout << " From [";
@@ -516,7 +516,7 @@ void OpDesc::InferShape(const BlockDesc &block) const {
    std::copy(onames.begin(), onames.end(),
              std::ostream_iterator<std::string>(sout, ", "));
    sout << "]";
-    VLOG(10) << sout.str();
+    VLOG(100) << sout.str();
  }
  infer_shape(&ctx);
 }
@@ -607,7 +607,7 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
    auto shape = var->GetShape();
    res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape);
  } catch (...) {
-    VLOG(5) << "GetDim of variable " << name << " error";
+    VLOG(50) << "GetDim of variable " << name << " error";
    std::rethrow_exception(std::current_exception());
  }
  return res;
@@ -624,7 +624,7 @@ std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
      res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s));
    }
  } catch (...) {
-    VLOG(5) << "GetRepeatedDim of variable " << name << " error.";
+    VLOG(50) << "GetRepeatedDim of variable " << name << " error.";
    std::rethrow_exception(std::current_exception());
  }
  return res;

--- a/paddle/fluid/framework/op_registry.cc
+++ b/paddle/fluid/framework/op_registry.cc
@@ -46,9 +46,9 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(

 std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
    const proto::OpDesc& op_desc) {
-  VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be"
-             "used in unit tests. Use CreateOp(const OpDesc& op_desc) "
-             "instead.";
+  VLOG(10) << "CreateOp directly from OpDesc is deprecated. It should only be"
+              "used in unit tests. Use CreateOp(const OpDesc& op_desc) "
+              "instead.";
  VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
  VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
  AttributeMap attrs;

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -140,7 +140,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 }

 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
-  VLOG(4) << place << " " << DebugStringEx(&scope);
+  VLOG(40) << place << " " << DebugStringEx(&scope);
  if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
    PADDLE_THROW("Cannot run operator on place %s", place);
@@ -160,7 +160,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
  } else {
    RunImpl(scope, place);
  }
-  VLOG(3) << place << " " << DebugStringEx(&scope);
+  VLOG(30) << place << " " << DebugStringEx(&scope);
 }

 bool OperatorBase::HasInputs(const std::string& name) const {
@@ -259,6 +259,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
          if (row_size >= 0) {
            ss << "[row_size=" << row_size << "]";
          }
+          std::string dtype = GetDtype(*scope, output.second[i]);
+          ss << ":" << dtype;
          ss << "[" << GetDims(*scope, var_name, true) << "]";
          ss << "(" << GetLoD(*scope, var_name) << ")";
        }
@@ -715,14 +717,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,

  auto expected_kernel_key =
      this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+  VLOG(30) << "expected_kernel_key:" << expected_kernel_key;

  auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
  if (kernel_iter == kernels.end() &&
      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
-    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+    VLOG(30) << "missing MKLDNN kernel: fallbacking to PLAIN one";
    expected_kernel_key.library_type_ = LibraryType::kPlain;
    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
    kernel_iter = kernels.find(expected_kernel_key);
@@ -774,7 +776,8 @@ void OperatorWithKernel::TransferInplaceVarsBack(
    const Scope& scope, const std::vector<std::string>& inplace_vars,
    const Scope& transfer_scope) const {
  for (auto& var_name : inplace_vars) {
-    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
+    VLOG(30) << "share inplace var " + var_name +
+                    " back to it's original scope";
    auto* original_tensor =
        GetMutableLoDTensorOrSelectedRowsValueFromVar(scope.FindVar(var_name));
    auto* var = transfer_scope.FindVar(var_name);
@@ -815,8 +818,8 @@ Scope* OperatorWithKernel::TryTransferData(
        transfered_inplace_vars->emplace_back(var_name);
      }

-      VLOG(3) << "Transform Variable " << var_name << " from "
-              << kernel_type_for_var << " to " << expected_kernel_key;
+      VLOG(30) << "Transform Variable " << var_name << " from "
+               << kernel_type_for_var << " to " << expected_kernel_key;

      if (new_scope == nullptr) {
        new_scope = &scope.NewScope();

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -199,7 +199,7 @@ void ParallelExecutor::BCastParamsToDevices(

    auto &main_tensor = main_var->Get<LoDTensor>();
    if (!main_tensor.IsInitialized()) {
-      VLOG(3) << "one in var not inited, return!";
+      VLOG(30) << "one in var not inited, return!";
      continue;
    }
    auto &dims = main_tensor.dims();

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -149,7 +149,7 @@ Variable* Scope::VarInternal(const std::string& name) {

  v = new Variable();
  vars_[name].reset(v);
-  VLOG(3) << "Create variable " << name;
+  VLOG(30) << "Create variable " << name;
  v->name_ = &(vars_.find(name)->first);
  return v;
 }

--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -176,7 +176,7 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
  PADDLE_ENFORCE(value->IsInitialized(),
                 "The value tensor should be initialized.");
  if (ids.numel() == 0) {
-    VLOG(3) << "keys is empty, please check data!";
+    VLOG(30) << "keys is empty, please check data!";
  } else {
    int64_t value_width = value_->numel() / value_->dims()[0];
    PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],

--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -23,8 +23,8 @@ namespace framework {

 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                const platform::DeviceContext& ctx, Tensor* dst) {
-  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
-          << dst_place;
+  VLOG(30) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
+           << dst_place;
  src.check_memory_size();

  dst->Resize(src.dims());
@@ -38,8 +38,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,

  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-              << dst_place;
+      VLOG(30) << "Skip copy the same data async from " << src_place << " to "
+               << dst_place;
      return;
    }
    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@@ -78,8 +78,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
    if (platform::is_same_place(src_place, dst_place)) {
      if (src_ptr == dst_ptr) {
-        VLOG(3) << "Skip copy the same data async from " << src_place << " to "
-                << dst_place;
+        VLOG(30) << "Skip copy the same data async from " << src_place << " to "
+                 << dst_place;
        return;
      }
      memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
@@ -115,8 +115,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,

 void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                    Tensor* dst) {
-  VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place()
-          << " to " << dst_place;
+  VLOG(30) << "TensorCopySync " << src.dims() << " from " << src.place()
+           << " to " << dst_place;
  src.check_memory_size();
  dst->Resize(src.dims());
  dst->set_layout(src.layout());
@@ -126,8 +126,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
  auto size = src.numel() * SizeOfType(src.type());
  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
    if (src_ptr == dst_ptr) {
-      VLOG(3) << "Skip copy the same data from " << src_place << " to "
-              << dst_place;
+      VLOG(30) << "Skip copy the same data from " << src_place << " to "
+               << dst_place;
      return;
    }
    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@@ -147,8 +147,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
  } else if (platform::is_gpu_place(src_place) &&
             platform::is_gpu_place(dst_place)) {
    if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) {
-      VLOG(3) << "Skip copy the same data from " << src_place << " to "
-              << dst_place;
+      VLOG(30) << "Skip copy the same data from " << src_place << " to "
+               << dst_place;
      return;
    }
    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);

--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -39,7 +39,7 @@ void ThreadPool::Init() {
    int num_threads = std::thread::hardware_concurrency();
    if (FLAGS_dist_threadpool_size > 0) {
      num_threads = FLAGS_dist_threadpool_size;
-      VLOG(1) << "set dist_threadpool_size to " << num_threads;
+      VLOG(10) << "set dist_threadpool_size to " << num_threads;
    }
    PADDLE_ENFORCE_GT(num_threads, 0);
    threadpool_.reset(new ThreadPool(num_threads));

--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
@@ -61,10 +61,10 @@ size_t VarDesc::GetTensorDescNum() const {
 void VarDesc::SetShapes(
    const std::vector<std::vector<int64_t>> &multiple_dims) {
  if (multiple_dims.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
+    VLOG(30) << "WARNING: The number of given shapes(" << multiple_dims.size()
+             << ") doesn't match the existing tensor number("
+             << GetTensorDescNum()
+             << "). The Reader is going to be reinitialized.";
    SetTensorDescNum(multiple_dims.size());
  }
  std::vector<proto::VarType::TensorDesc *> tensors = mutable_tensor_descs();
@@ -94,11 +94,11 @@ void VarDesc::SetDataType(proto::VarType::Type data_type) {
 void VarDesc::SetDataTypes(
    const std::vector<proto::VarType::Type> &multiple_data_type) {
  if (multiple_data_type.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given data types("
-            << multiple_data_type.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
+    VLOG(30) << "WARNING: The number of given data types("
+             << multiple_data_type.size()
+             << ") doesn't match the existing tensor number("
+             << GetTensorDescNum()
+             << "). The Reader is going to be reinitialized.";
    SetTensorDescNum(multiple_data_type.size());
  }
  std::vector<proto::VarType::TensorDesc *> tensor_descs =
@@ -139,11 +139,11 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {

 void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
  if (multiple_lod_level.size() != GetTensorDescNum()) {
-    VLOG(3) << "WARNING: The number of given lod_levels("
-            << multiple_lod_level.size()
-            << ") doesn't match the existing tensor number("
-            << GetTensorDescNum()
-            << "). The Reader is going to be reinitialized.";
+    VLOG(30) << "WARNING: The number of given lod_levels("
+             << multiple_lod_level.size()
+             << ") doesn't match the existing tensor number("
+             << GetTensorDescNum()
+             << "). The Reader is going to be reinitialized.";
    SetTensorDescNum(multiple_lod_level.size());
  }
  switch (desc_.type().type()) {

--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/block_desc.h"
+#include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/type_defs.h"

 namespace paddle {
@@ -24,5 +28,36 @@ class VarTypeInference {
  virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
 };

+// Var-type inference helper that propagates variable metadata from an
+// operator's inputs to its outputs. For every (input slot, output slot) pair
+// returned by GetInputOutputWithSameType(), the first variable of the output
+// slot receives the type and the data type of the first variable of the
+// input slot.
+class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const final {
+    auto in_out_var_names = this->GetInputOutputWithSameType();
+
+    for (auto& i_o_n : in_out_var_names) {
+      // Only the first argument of each slot is used. NOTE(review): Input()
+      // and Output() presumably return std::vector, so at(0) throws on an
+      // empty slot -- confirm.
+      auto& x_name = op_desc.Input(i_o_n.first).at(0);
+      auto& out_name = op_desc.Output(i_o_n.second).at(0);
+
+      auto& x = block->FindRecursiveOrCreateVar(x_name);
+      auto& out = block->FindRecursiveOrCreateVar(out_name);
+      out.SetType(x.GetType());
+      out.SetDataType(x.GetDataType());
+    }
+  }
+
+ protected:
+  // Returns the {input slot name -> output slot name} pairs consumed by
+  // operator() above. Must be implemented by subclasses.
+  virtual std::unordered_map<std::string, std::string>
+  GetInputOutputWithSameType() const = 0;
+};
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -60,7 +60,7 @@ class DfgPassManagerImpl final : public DfgPassManager {

 private:
  void AddPass(const std::string& name, AnalysisPass* pass) {
-    VLOG(3) << "Adding pass " << name;
+    VLOG(30) << "Adding pass " << name;
    Register(name, pass);
    AddGraphvizDebugerPass(pass);
  }
@@ -104,7 +104,7 @@ void Analyzer::Run(Argument* argument) {
  passes.push_back("graph_viz_pass");  // add graphviz for debug.
 #ifdef PADDLE_WITH_MKLDNN
  if (use_mkldnn_) {
-    VLOG(3) << "Adding MKL-DNN placement pass";
+    VLOG(30) << "Adding MKL-DNN placement pass";
    passes.push_back("mkldnn_placement_pass");
  }
 #endif
@@ -113,7 +113,9 @@ void Analyzer::Run(Argument* argument) {
  passes.push_back("infer_clean_graph_pass");
  passes.push_back("graph_viz_pass");  // add graphviz for debug.
  for (auto& pass : ir_passes_) {
-    if (!disabled_ir_passes_.count(pass)) {
+    // Skip passes whose name contains "mkldnn" when use_mkldnn_ is false.
+    bool skip_pass = (!use_mkldnn_) && pass.find("mkldnn") != std::string::npos;
+    if (!disabled_ir_passes_.count(pass) && !skip_pass) {
      passes.push_back(pass);
      passes.push_back("graph_viz_pass");  // add graphviz for debug.
    }

--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -68,8 +68,8 @@ struct Argument {
                   key);
    attrs_[key] = data;
    attr_deleters_[key] = [data, key]() {
-      VLOG(3) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
-      VLOG(3) << "argument delete attr: " << key;
+      VLOG(30) << "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
+      VLOG(30) << "argument delete attr: " << key;
      delete data;
    };
  }

--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -132,7 +132,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
    Node *x{nullptr};
    if (ir_node->IsOp()) {
      PADDLE_ENFORCE(ir_node->Op());
-      VLOG(4) << "get op " << ir_node << " " << ir_node->Name();
+      VLOG(40) << "get op " << ir_node << " " << ir_node->Name();
      x = nodes.Create(Node::Type::kFunction);
      x->attr("ir_node").Pointer() = ir_node;
      PADDLE_ENFORCE(ir_node->Op()->Proto());
@@ -141,7 +141,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
    } else if (ir_node->IsVar()) {
      // Not create a Node for IR ControlDepVar, considering Inference currently
      // just used in single thread scenerio.
-      VLOG(4) << "get var " << ir_node->Name();
+      VLOG(40) << "get var " << ir_node->Name();
      x = nodes.Create(Node::Type::kValue);
      x->attr("ir_node").Pointer() = ir_node;
      x->SetName(ir_node->Name());
@@ -151,9 +151,9 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
    }
    ir_node_map.emplace(ir_node, x);
  }
-  VLOG(4) << "finish creating Nodes";
+  VLOG(40) << "finish creating Nodes";

-  VLOG(4) << "to create edge";
+  VLOG(40) << "to create edge";
  // Create links
  for (auto *ir_node : graph.Nodes()) {
    auto it = ir_node_map.find(ir_node);
@@ -175,7 +175,7 @@ void DataFlowGraph::Build(const framework::ir::Graph &graph) {
                 "Can't deduce any inputs from the graph, Is the graph empty?");

  ir_graph = &graph;
-  VLOG(3) << "finished build from IR";
+  VLOG(30) << "finished build from IR";
 }

 void DataFlowGraph::Clean() {

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -239,9 +239,10 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
  framework::BlockDesc block_desc(nullptr, &proto);
  block_desc.Proto()->set_parent_idx(-1);
  block_desc.Proto()->set_idx(0);
-  VLOG(4) << "origin variable size: "
-          << argument_->origin_program_desc->blocks(0).vars().size();
-  VLOG(4) << "transformed variable size: " << block_desc.Proto()->vars().size();
+  VLOG(40) << "origin variable size: "
+           << argument_->origin_program_desc->blocks(0).vars().size();
+  VLOG(40) << "transformed variable size: "
+           << block_desc.Proto()->vars().size();
  // copy ops.

  for (auto *node : block_node->subgraph) {

--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc
@@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) {

  auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png";
  std::string message;
-  VLOG(3) << "draw to " << png_path;
+  VLOG(30) << "draw to " << png_path;
  ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message);
 }


--- a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc
@@ -29,7 +29,7 @@ void FluidToIrPass::EnableParamModify(const std::string &model_dir,
  PADDLE_ENFORCE(argument_);
  argument_->Set(framework::ir::kParamScopeAttr, new framework::Scope);
  // Load parameters.
-  VLOG(3) << "Loading parameters from " << model_dir;
+  VLOG(30) << "Loading parameters from " << model_dir;
  LoadParams(&argument_->Get<framework::Scope>(framework::ir::kParamScopeAttr),
             model_dir, prog_file, param_file);
 }

--- a/paddle/fluid/inference/analysis/model_store_pass.cc
+++ b/paddle/fluid/inference/analysis/model_store_pass.cc
@@ -35,21 +35,21 @@ void ModelStorePass::Run(DataFlowGraph *x) {
  std::stringstream ss;
  // NOTE these commands only works on linux.
  ss << "mkdir -p " << *argument_->model_output_store_path;
-  VLOG(3) << "run command: " << ss.str();
+  VLOG(30) << "run command: " << ss.str();
  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);
  ss.str("");

  ss << "cp " << *argument_->fluid_model_dir << "/*"
     << " " << *argument_->model_output_store_path;
-  VLOG(3) << "run command: " << ss.str();
+  VLOG(30) << "run command: " << ss.str();
  PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0);

  // Store program
  PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc,
                          "program desc is not transformed, should call "
                          "DataFlowGraphToFluidPass first.");
-  VLOG(3) << "store analyzed program to "
-          << *argument_->model_output_store_path;
+  VLOG(30) << "store analyzed program to "
+           << *argument_->model_output_store_path;
  const std::string program_output_path =
      *argument_->model_output_store_path + "/__model__";
  std::ofstream file(program_output_path, std::ios::binary);

--- a/paddle/fluid/inference/analysis/pass_manager.cc
+++ b/paddle/fluid/inference/analysis/pass_manager.cc
@@ -23,7 +23,7 @@ namespace analysis {
 bool PassManager::Initialize(Argument* argument) {
  argument_ = argument;
  for (auto& pass : data_) {
-    VLOG(3) << "Initializing pass [" << pass->repr() << "]";
+    VLOG(30) << "Initializing pass [" << pass->repr() << "]";
    if (!pass->Initialize(argument)) {
      LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]";
      return false;
@@ -34,7 +34,7 @@ bool PassManager::Initialize(Argument* argument) {

 void DfgPassManager::RunAll() {
  PADDLE_ENFORCE(argument_);
-  VLOG(3) << "Total " << data_.size() << " Analysys passes";
+  VLOG(30) << "Total " << data_.size() << " Analysys passes";
  for (auto& pass : data_) {
    string::PrettyLogEndl(string::Style::H1(), "* Running Analysis pass [%s]",
                          pass->repr());

--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -232,7 +232,7 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
    BriefNode *brief_node = itr.second;

    if (!brief_node->node->attr(kMarkerAttrName).Bool()) {
-      VLOG(4) << brief_node->node->id() << " node not a trt candicate.";
+      VLOG(40) << brief_node->node->id() << " node not a trt candicate.";
      continue;
    }


--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
@@ -25,9 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(

 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
  SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)();
-  VLOG(4) << "debug info "
-          << graph->HumanReadableInfo(false /*show_values*/,
-                                      true /*show_functions*/);
+  VLOG(40) << "debug info "  // emitted after the fuse functor above has run
+           << graph->HumanReadableInfo(false /*show_values*/,
+                                       true /*show_functions*/);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -38,7 +38,7 @@ using contrib::AnalysisConfig;
 bool AnalysisPredictor::Init(
    const std::shared_ptr<framework::Scope> &parent_scope,
    const std::shared_ptr<framework::ProgramDesc> &program) {
-  VLOG(3) << "Predictor::init()";
+  VLOG(30) << "Predictor::init()";
 #if !defined(_WIN32)
  if (FLAGS_profile) {
    LOG(WARNING) << "Profiler is actived, might affect the performance";
@@ -89,7 +89,7 @@ bool AnalysisPredictor::Init(
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                            std::vector<PaddleTensor> *output_data,
                            int batch_size) {
-  VLOG(3) << "Predictor::predict";
+  VLOG(30) << "Predictor::predict";
  inference::Timer timer;
  timer.tic();
  // set feed variable
@@ -109,7 +109,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
    LOG(ERROR) << "fail to get fetches";
    return false;
  }
-  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  VLOG(30) << "predict cost: " << timer.toc() << "ms";

  // Fix TensorArray reuse not cleaned bug.
  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
@@ -119,7 +119,7 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,

 bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                framework::Scope *scope) {
-  VLOG(3) << "Predictor::set_feed";
+  VLOG(30) << "Predictor::set_feed";
  if (inputs.size() != feeds_.size()) {
    LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get "
               << inputs.size();
@@ -184,7 +184,7 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch,

 bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                                 framework::Scope *scope) {
-  VLOG(3) << "Predictor::get_fetch";
+  VLOG(30) << "Predictor::get_fetch";
  outputs->resize(fetchs_.size());
  for (size_t i = 0; i < fetchs_.size(); ++i) {
    int idx = boost::get<int>(fetchs_[i]->GetAttr("col"));
@@ -246,7 +246,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }

  CHECK(argument_.transformed_program_desc);
-  VLOG(5) << "to prepare executor";
+  VLOG(50) << "to prepare executor";
  inference_program_.reset(
      new framework::ProgramDesc(*argument_.transformed_program_desc));
  if (argument_.Has(framework::ir::kParamScopeAttr)) {
@@ -260,7 +260,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
    AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
-  VLOG(3) << "create AnalysisConfig";
+  VLOG(30) << "create AnalysisConfig";
  if (config.use_gpu) {
    // 1. GPU memeroy
    PADDLE_ENFORCE_GT(
@@ -274,7 +274,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
      std::string flag = "--fraction_of_gpu_memory_to_use=" +
                         std::to_string(config.fraction_of_gpu_memory);
      flags.push_back(flag);
-      VLOG(3) << "set flag: " << flag;
+      VLOG(30) << "set flag: " << flag;
      framework::InitGflags(flags);
    }
  }

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -13,6 +13,8 @@
 // limitations under the License.

 #pragma once
+#include <algorithm>
+#include <map>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/naive_executor.h"

--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -16,7 +16,6 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle_inference_api.h"

 namespace paddle {


--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -157,7 +157,7 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
    LOG(ERROR) << "fail to get fetches";
    return false;
  }
-  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+  VLOG(30) << "predict cost: " << timer.toc() << "ms";

  // Fix TensorArray reuse not cleaned bug.
  tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());

--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -34,7 +34,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {

  bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
    FLAGS_IA_enable_tensorrt_subgraph_engine = true;
-    VLOG(3) << "Predictor::init()";
+    VLOG(30) << "Predictor::init()";
    if (config_.use_gpu) {
      place_ = paddle::platform::CUDAPlace(config_.device);
    } else {
@@ -70,7 +70,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
    OptimizeInferenceProgram();
    ctx_ = executor_->Prepare(*inference_program_, 0);

-    VLOG(5) << "to create variables";
+    VLOG(50) << "to create variables";
    executor_->CreateVariables(*inference_program_,
                               sub_scope_ ? sub_scope_ : scope_.get(), 0);
    // Get the feed_target_names and fetch_target_names
@@ -114,9 +114,9 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
        new ProgramDesc(*inference_program_->Proto()));
    Singleton<Analyzer>::Global().Run(&argument);
    CHECK(argument.transformed_program_desc);
-    VLOG(5) << "transformed program:\n"
-            << argument.transformed_program_desc->SerializeAsString();
-    VLOG(5) << "to prepare executor";
+    VLOG(50) << "transformed program:\n"
+             << argument.transformed_program_desc->SerializeAsString();
+    VLOG(50) << "to prepare executor";
    inference_program_.reset(
        new framework::ProgramDesc(*argument.transformed_program_desc));
  }
@@ -129,7 +129,7 @@ template <>
 std::unique_ptr<PaddlePredictor>
 CreatePaddlePredictor<MixedRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
    const MixedRTConfig& config) {
-  VLOG(3) << "create TensorRTSubgraphPredictor";
+  VLOG(30) << "create TensorRTSubgraphPredictor";
  if (config.use_gpu) {
    // 1. GPU memeroy
    PADDLE_ENFORCE_GT(
@@ -143,7 +143,7 @@ CreatePaddlePredictor<MixedRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
      std::string flag = "--fraction_of_gpu_memory_to_use=" +
                         std::to_string(config.fraction_of_gpu_memory);
      flags.push_back(flag);
-      VLOG(3) << "set flag: " << flag;
+      VLOG(30) << "set flag: " << flag;
      framework::InitGflags(flags);
    }
  }

--- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc
@@ -45,7 +45,7 @@ void Main() {
  config.fraction_of_gpu_memory = 0.1;  // set by yourself
  predictor = CreatePaddlePredictor<paddle::contrib::MixedRTConfig>(config);

-  VLOG(3) << "begin to process data";
+  VLOG(30) << "begin to process data";
  // Just a single batch of data.
  std::string line;
  std::ifstream file(FLAGS_data);
@@ -60,13 +60,13 @@ void Main() {
      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
  input.dtype = PaddleDType::FLOAT32;

-  VLOG(3) << "run executor";
+  VLOG(30) << "run executor";
  std::vector<PaddleTensor> output;
  predictor->Run({input}, &output, 1);

-  VLOG(3) << "output.size " << output.size();
+  VLOG(30) << "output.size " << output.size();
  auto& tensor = output.front();
-  VLOG(3) << "output: " << SummaryTensor(tensor);
+  VLOG(30) << "output: " << SummaryTensor(tensor);

  // compare with reference result
  CheckOutput(FLAGS_refer, tensor);

--- a/paddle/fluid/inference/api/demo_ci/utils.h
+++ b/paddle/fluid/inference/api/demo_ci/utils.h
@@ -47,7 +47,7 @@ static void split(const std::string& str, char sep,
 }

 Record ProcessALine(const std::string& line) {
-  VLOG(3) << "process a line";
+  VLOG(30) << "process a line";
  std::vector<std::string> columns;
  split(line, '\t', &columns);
  CHECK_EQ(columns.size(), 2UL)
@@ -65,8 +65,8 @@ Record ProcessALine(const std::string& line) {
  for (auto& s : shape_strs) {
    record.shape.push_back(std::stoi(s));
  }
-  VLOG(3) << "data size " << record.data.size();
-  VLOG(3) << "data shape size " << record.shape.size();
+  VLOG(30) << "data size " << record.data.size();
+  VLOG(30) << "data shape size " << record.shape.size();
  return record;
 }

@@ -78,8 +78,8 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
  file.close();

  size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
-  VLOG(3) << "predictor output numel " << numel;
-  VLOG(3) << "reference output numel " << refer.data.size();
+  VLOG(30) << "predictor output numel " << numel;
+  VLOG(30) << "reference output numel " << refer.data.size();
  CHECK_EQ(numel, refer.data.size());
  switch (output.dtype) {
    case PaddleDType::INT64: {

--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -49,11 +49,11 @@ void Main(bool use_gpu) {
    config.fraction_of_gpu_memory = 0.1;  // set by yourself
  }

-  VLOG(3) << "init predictor";
+  VLOG(30) << "init predictor";
  predictor = CreatePaddlePredictor<NativeConfig>(config);
  analysis_predictor = CreatePaddlePredictor<AnalysisConfig>(config);

-  VLOG(3) << "begin to process data";
+  VLOG(30) << "begin to process data";
  // Just a single batch of data.
  std::string line;
  std::ifstream file(FLAGS_data);
@@ -68,13 +68,13 @@ void Main(bool use_gpu) {
      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
  input.dtype = PaddleDType::FLOAT32;

-  VLOG(3) << "run executor";
+  VLOG(30) << "run executor";
  std::vector<PaddleTensor> output, analysis_output;
  predictor->Run({input}, &output, 1);

-  VLOG(3) << "output.size " << output.size();
+  VLOG(30) << "output.size " << output.size();
  auto& tensor = output.front();
-  VLOG(3) << "output: " << SummaryTensor(tensor);
+  VLOG(30) << "output: " << SummaryTensor(tensor);

  // compare with reference result
  CheckOutput(FLAGS_refer, tensor);

--- a/paddle/fluid/inference/api/details/reset_tensor_array.cc
+++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc
@@ -26,7 +26,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
      // parameter.
      if (var_name == "feed" || var_name == "fetch") continue;
      if (var->Type() == typeid(framework::LoDTensorArray)) {
-        VLOG(4) << "collect " << var_name;
+        VLOG(40) << "collect " << var_name;
        arrays_.push_back(var->GetMutable<framework::LoDTensorArray>());
      }
    }
@@ -34,7 +34,7 @@ void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) {
      CollectTensorArrays(kid);
    }

-    VLOG(3) << "Collect " << arrays_.size() << " arrays";
+    VLOG(30) << "Collect " << arrays_.size() << " arrays";
    flag_ = false;
  }
 }

--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -16,13 +16,14 @@

 #include <glog/logging.h>
 #include <sys/time.h>
+#include <algorithm>
 #include <chrono>  // NOLINT
 #include <numeric>
 #include <sstream>
 #include <string>
 #include <vector>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
-#include "paddle_inference_api.h"

 namespace paddle {
 namespace inference {

--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -59,7 +59,8 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
 bool IsPersistable(const framework::VarDesc* var) {
  if (var->Persistable() &&
      var->GetType() != framework::proto::VarType::FEED_MINIBATCH &&
-      var->GetType() != framework::proto::VarType::FETCH_LIST) {
+      var->GetType() != framework::proto::VarType::FETCH_LIST &&
+      var->GetType() != framework::proto::VarType::RAW) {
    return true;
  }
  return false;
@@ -77,7 +78,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,

  for (auto* var : global_block.AllVars()) {
    if (IsPersistable(var)) {
-      VLOG(3) << "persistable variable's name: " << var->Name();
+      VLOG(30) << "persistable variable's name: " << var->Name();

      framework::VarDesc* new_var = load_block->Var(var->Name());
      new_var->SetShape(var->GetShape());
@@ -120,7 +121,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
                                             const std::string& dirname) {
  std::string model_filename = dirname + "/__model__";
  std::string program_desc_str;
-  VLOG(3) << "loading model from " << model_filename;
+  VLOG(30) << "loading model from " << model_filename;
  ReadBinaryFile(model_filename, &program_desc_str);

  std::unique_ptr<framework::ProgramDesc> main_program(

--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
@@ -25,7 +25,7 @@ class ConcatOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
+    VLOG(40) << "convert a fluid concat op to tensorrt concat layer";

    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs

--- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
@@ -25,7 +25,7 @@ class DropoutOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid dropout op to tensorrt dropout layer";
+    VLOG(40) << "convert a fluid dropout op to tensorrt dropout layer";
    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);

--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -52,7 +52,7 @@ class FcOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid fc op to tensorrt fc layer without bias";
+    VLOG(40) << "convert a fluid fc op to tensorrt fc layer without bias";

    framework::OpDesc op_desc(op, nullptr);
    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);

--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -25,7 +25,7 @@ class MulOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
+    VLOG(40) << "convert a fluid mul op to tensorrt mul layer without bias";

    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs

--- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc
@@ -25,7 +25,7 @@ class PadOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(4) << "convert a fluid transpose op to tensorrt tranpose layer";
+    VLOG(40) << "convert a fluid pad op to tensorrt padding layer";

    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs

--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -25,7 +25,7 @@ class Pool2dOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(4)
+    VLOG(40)
        << "convert a fluid pool2d op to tensorrt pool2d layer without bias";
    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs

--- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
@@ -25,7 +25,7 @@ class SoftMaxOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(4)
+    VLOG(40)
        << "convert a fluid softmax op to tensorrt softmax layer without bias";
    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -134,7 +134,7 @@ class TensorRTEngine : public EngineBase {
  std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
      weight_map;

-  // TODO: (NHZLX)
+  // TODO(NHZLX)
  // In the normal case, the paddle-trt exists bug when runing the googlenet.
  // When there are more than two convolutions of 1 * 1 with the same input, the
  // paddle-tensorrt will do the merging optimization, which fuse those conv

--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -27,7 +27,7 @@ struct Record {
 };

 Record ProcessALine(const std::string &line) {
-  VLOG(3) << "process a line";
+  VLOG(30) << "process a line";
  std::vector<std::string> columns;
  split(line, '\t', &columns);
  CHECK_EQ(columns.size(), 2UL)
@@ -45,8 +45,8 @@ Record ProcessALine(const std::string &line) {
  for (auto &s : shape_strs) {
    record.shape.push_back(std::stoi(s));
  }
-  VLOG(3) << "data size " << record.data.size();
-  VLOG(3) << "data shape size " << record.shape.size();
+  VLOG(30) << "data size " << record.data.size();
+  VLOG(30) << "data shape size " << record.shape.size();
  return record;
 }


--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -32,11 +32,11 @@ BuddyAllocator::BuddyAllocator(
      system_allocator_(std::move(system_allocator)) {}

 BuddyAllocator::~BuddyAllocator() {
-  VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these "
-              "have actually been freed";
+  VLOG(100) << "BuddyAllocator Disconstructor makes sure that all of these "
+               "have actually been freed";
  while (!pool_.empty()) {
    auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
-    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
+    VLOG(100) << "Free from block (" << block << ", " << max_chunk_size_ << ")";

    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
    cache_.invalidate(block);
@@ -57,12 +57,12 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
  // acquire the allocator lock
  std::lock_guard<std::mutex> lock(mutex_);

-  VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size "
-           << size;
+  VLOG(100) << "Allocate " << unaligned_size << " bytes from chunk size "
+            << size;

  // if the allocation is huge, send directly to the system allocator
  if (size > max_chunk_size_) {
-    VLOG(10) << "Allocate from system allocator.";
+    VLOG(100) << "Allocate from system allocator.";
    return SystemAlloc(size);
  }

@@ -77,9 +77,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
      return nullptr;
    }
  } else {
-    VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
-             << " at address "
-             << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
+    VLOG(100) << "Allocation from existing memory block " << std::get<2>(*it)
+              << " at address "
+              << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
  }

  total_used_ += size;
@@ -96,10 +96,10 @@ void BuddyAllocator::Free(void* p) {
  // Acquire the allocator lock
  std::lock_guard<std::mutex> lock(mutex_);

-  VLOG(10) << "Free from address " << block;
+  VLOG(100) << "Free from address " << block;

  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
-    VLOG(10) << "Free directly from system allocator";
+    VLOG(100) << "Free directly from system allocator";
    system_allocator_->Free(block, block->total_size(cache_),
                            block->index(cache_));

@@ -116,8 +116,8 @@ void BuddyAllocator::Free(void* p) {

  // Trying to merge the right buddy
  if (block->has_right_buddy(cache_)) {
-    VLOG(10) << "Merging this block " << block << " with its right buddy "
-             << block->right_buddy(cache_);
+    VLOG(100) << "Merging this block " << block << " with its right buddy "
+              << block->right_buddy(cache_);

    auto right_buddy = block->right_buddy(cache_);

@@ -134,8 +134,8 @@ void BuddyAllocator::Free(void* p) {

  // Trying to merge the left buddy
  if (block->has_left_buddy(cache_)) {
-    VLOG(10) << "Merging this block " << block << " with its left buddy "
-             << block->left_buddy(cache_);
+    VLOG(100) << "Merging this block " << block << " with its left buddy "
+              << block->left_buddy(cache_);

    auto left_buddy = block->left_buddy(cache_);

@@ -151,8 +151,8 @@ void BuddyAllocator::Free(void* p) {
  }

  // Dumping this block into pool
-  VLOG(10) << "Inserting free block (" << block << ", "
-           << block->total_size(cache_) << ")";
+  VLOG(100) << "Inserting free block (" << block << ", "
+            << block->total_size(cache_) << ")";
  pool_.insert(
      IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));

@@ -174,7 +174,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
  size_t index = 0;
  void* p = system_allocator_->Alloc(&index, size);

-  VLOG(10) << "Allocated " << p << " from system allocator.";
+  VLOG(100) << "Allocated " << p << " from system allocator.";

  if (p == nullptr) return nullptr;

@@ -200,8 +200,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {

  if (p == nullptr) return pool_.end();

-  VLOG(10) << "Creating and inserting new block " << p
-           << " from system allocator";
+  VLOG(100) << "Creating and inserting new block " << p
+            << " from system allocator";

  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
                                     max_chunk_size_, nullptr, nullptr);
@@ -245,19 +245,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
  auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
  pool_.erase(it);

-  VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_)
-           << ") into";
+  VLOG(100) << "Split block (" << block << ", " << block->total_size(cache_)
+            << ") into";
  block->split(&cache_, size);

-  VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_)
-           << ")";
+  VLOG(100) << "Left block (" << block << ", " << block->total_size(cache_)
+            << ")";
  block->set_type(&cache_, MemoryBlock::ARENA_CHUNK);

  // the rest of memory if exist
  if (block->has_right_buddy(cache_)) {
    if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
-      VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", "
-               << block->right_buddy(cache_)->total_size(cache_) << ")";
+      VLOG(100) << "Insert right block (" << block->right_buddy(cache_) << ", "
+                << block->right_buddy(cache_)->total_size(cache_) << ")";

      pool_.insert(
          IndexSizeAddress(block->right_buddy(cache_)->index(cache_),
@@ -284,7 +284,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() {
      return;
    }

-    VLOG(10) << "Return block " << block << " to fallback allocator.";
+    VLOG(100) << "Return block " << block << " to fallback allocator.";

    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
    cache_.invalidate(block);
@@ -320,7 +320,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() {

    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));

-    VLOG(10) << "Return block " << block << " to base allocator.";
+    VLOG(100) << "Return block " << block << " to base allocator.";

    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
    cache_.invalidate(block);

--- a/paddle/fluid/memory/detail/meta_cache.cc
+++ b/paddle/fluid/memory/detail/meta_cache.cc
@@ -29,7 +29,7 @@ MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
    return existing_desc->second;
  } else {
    auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
-    VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type;
+    VLOG(100) << "Load MemoryBlock::Desc type=" << desc->type;
    PADDLE_ASSERT(desc->check_guards());
    return *reinterpret_cast<const MemoryBlock::Desc*>(block);
  }

--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -78,7 +78,7 @@ void* Alloc<platform::CPUPlace>(const platform::CPUPlace& place, size_t size) {
  if (FLAGS_init_allocated_mem) {
    memset(p, 0xEF, size);
  }
-  VLOG(10) << "  pointer=" << p;
+  VLOG(100) << "  pointer=" << p;
  return p;
 }

@@ -111,12 +111,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
          std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)),
          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());

-      VLOG(10) << "\n\nNOTE: each GPU device use "
-               << FLAGS_fraction_of_gpu_memory_to_use * 100
-               << "% of GPU memory.\n"
-               << "You can set GFlags environment variable '"
-               << "FLAGS_fraction_of_gpu_memory_to_use"
-               << "' to change the fraction of GPU usage.\n\n";
+      VLOG(100) << "\n\nNOTE: each GPU device use "
+                << FLAGS_fraction_of_gpu_memory_to_use * 100
+                << "% of GPU memory.\n"
+                << "You can set GFlags environment variable '"
+                << "FLAGS_fraction_of_gpu_memory_to_use"
+                << "' to change the fraction of GPU usage.\n\n";
    }
  });


--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -317,6 +317,7 @@ op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
 op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
+op_library(tensor_array_to_tensor_op DEPS concat_op)
 op_library(concat_op DEPS concat_and_split)

 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})

--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -91,16 +91,12 @@ class ActivationOp : public framework::OperatorWithKernel {
  }
 };

-class ActivationOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc& op_desc,
-                  framework::BlockDesc* block) const override {
-    auto x_name = op_desc.Input("X")[0];
-    auto out_name = op_desc.Output("Out")[0];
-    auto& x = block->FindRecursiveOrCreateVar(x_name);
-    auto& out = block->FindRecursiveOrCreateVar(out_name);
-    out.SetType(x.GetType());
-    out.SetDataType(x.GetDataType());
+class ActivationOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:  // NOTE(review): presumably the base class consumes this map to copy X's var type/dtype onto Out - confirm
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {  // pairs input slot "X" with output slot "Out"
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
  }
 };


--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -95,7 +95,7 @@ class ActivationGradKernel
      auto x = framework::EigenVector<T>::Flatten(*X);
      functor(*place, x, out, dout, dx);
    } else {
-      VLOG(10) << " Inplace activation ";
+      VLOG(100) << " Inplace activation ";
      auto x = framework::EigenVector<T>::Flatten(*dX);
      functor(*place, x, out, dout, dx);
    }

--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -297,7 +297,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
      auto& grad =
          Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
      if (grad.rows().size() == 0) {
-        VLOG(3) << "grad row size is 0!!";
+        VLOG(30) << "grad row size is 0!!";
        return;
      }


--- a/paddle/fluid/operators/add_position_encoding_op.h
+++ b/paddle/fluid/operators/add_position_encoding_op.h
@@ -66,9 +66,10 @@ class AddPositionEncodingKernel : public framework::OpKernel<T> {
          x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i];
      for (int j = 0; j < max_length; ++j) {
        for (int k = 0; k < half_size; ++k) {
-          const double val = (half_size > 1)
-                                 ? j / pow(10000.0, double(k) / (half_size - 1))
-                                 : j / 10000.0;
+          const double val =
+              (half_size > 1)
+                  ? j / pow(10000.0, static_cast<double>(k) / (half_size - 1))
+                  : j / 10000.0;
          dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta;
          dst_ptr[half_size + k] =
              src_ptr[half_size + k] * alpha + cos(val) * beta;

--- a/paddle/fluid/operators/array_operator.h
+++ b/paddle/fluid/operators/array_operator.h
@@ -49,7 +49,7 @@ class ArrayOp : public framework::OperatorBase {
    } else {
      offset = static_cast<size_t>(*i_tensor.data<int64_t>());
    }
-    VLOG(10) << " Offset = " << offset;
+    VLOG(100) << " Offset = " << offset;
    return offset;
  }
 };

--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -148,8 +148,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {

        size_t start_offset = lod_and_offset.second.first;
        size_t end_offset = lod_and_offset.second.second;
-        VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " ["
-                 << ", " << end_offset << "]";
+        VLOG(100) << "idx=" << idx << " x_idx=" << x_idx << " ["
+                  << start_offset << ", " << end_offset << "]";
        // Copy data
        PADDLE_ENFORCE_GE(end_offset, start_offset);
        size_t len = end_offset - start_offset;

--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -170,6 +170,15 @@ The required data format for this layer is one of the following:
  }
 };

+// Var-type inference for batch_norm: reports the {"X" -> "Y"} pairing to the
+// PassInDtypeAndVarTypeToOutput base class.
+class BatchNormOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override { return {{"X", /*->*/ "Y"}}; }
+};
+
 template <typename T>
 class BatchNormKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
@@ -525,7 +534,7 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
-                  ops::BatchNormGradMaker);
+                  ops::BatchNormOpInferVarType, ops::BatchNormGradMaker);
 REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);

 REGISTER_OP_CPU_KERNEL(

--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
@@ -96,7 +96,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
    mode_ = CUDNN_BATCHNORM_SPATIAL;
 #endif

-    VLOG(3) << "Setting descriptors.";
+    VLOG(30) << "Setting descriptors.";
    std::vector<int> dims;
    std::vector<int> strides;
    if (data_layout == DataLayout::kNCHW) {

--- a/paddle/fluid/operators/beam_search_op.cc
+++ b/paddle/fluid/operators/beam_search_op.cc
@@ -33,11 +33,11 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,

  auto items = SelectTopBeamSizeItems(pre_ids, pre_scores);
  auto selected_items = ToMap(items, high_level.back());
-  VLOG(3) << "selected_items:";
+  VLOG(30) << "selected_items:";
  for (size_t i = 0; i < selected_items.size(); ++i) {
-    VLOG(3) << "offset:" << i;
+    VLOG(30) << "offset:" << i;
    for (auto &item : selected_items[i]) {
-      VLOG(3) << ItemToString(item);
+      VLOG(30) << ItemToString(item);
    }
  }

@@ -138,11 +138,11 @@ std::vector<std::vector<BeamSearch::Item>> BeamSearch::SelectTopBeamSizeItems(
    }
    result.emplace_back(items);
  }
-  VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
+  VLOG(30) << "SelectTopBeamSizeItems result size " << result.size();
  for (auto &items : result) {
-    VLOG(3) << "item set:";
+    VLOG(30) << "item set:";
    for (auto &item : items) {
-      VLOG(3) << ItemToString(item);
+      VLOG(30) << ItemToString(item);
    }
  }


--- a/paddle/fluid/operators/bilinear_interp_op.h
+++ b/paddle/fluid/operators/bilinear_interp_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-
-template <typename T>
-class BilinearInterpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input_t = ctx.Input<Tensor>("X");      // float tensor
-    auto* output_t = ctx.Output<Tensor>("Out");  // float tensor
-    auto out_dims = output_t->dims();
-    auto* input = input_t->data<T>();
-    int out_h = ctx.Attr<int>("out_h");
-    int out_w = ctx.Attr<int>("out_w");
-    auto out_size_t = ctx.Input<Tensor>("OutSize");
-    if (out_size_t != nullptr) {
-      auto out_size_data = out_size_t->data<int>();
-      out_h = out_size_data[0];
-      out_w = out_size_data[1];
-    }
-    auto* output = output_t->mutable_data<T>(
-        {out_dims[0], out_dims[1], out_h, out_w}, ctx.GetPlace());
-    int batch_size = input_t->dims()[0];
-    int channels = input_t->dims()[1];
-    int in_h = input_t->dims()[2];
-    int in_w = input_t->dims()[3];
-
-    int in_hw = in_h * in_w;
-    int out_hw = out_h * out_w;
-    int in_chw = channels * in_hw;
-    int out_chw = channels * out_hw;
-
-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
-
-    if (in_h == out_h && in_w == out_w) {
-      memcpy(output, input, input_t->numel() * sizeof(T));
-    } else {
-      for (int k = 0; k < batch_size; ++k) {  // loop for batches
-        for (int i = 0; i < out_h; ++i) {     // loop for images
-          int h = ratio_h * i;
-          int hid = (h < in_h - 1) ? 1 : 0;
-          float h1lambda = ratio_h * i - h;
-          float h2lambda = 1.f - h1lambda;
-
-          for (int j = 0; j < out_w; ++j) {
-            int w = ratio_w * j;
-            int wid = (w < in_w - 1) ? 1 : 0;
-            float w1lambda = ratio_w * j - w;
-            float w2lambda = 1.f - w1lambda;
-            // calculate four position for bilinear interpolation
-            const T* in_pos = &input[k * in_chw + h * in_w + w];
-            T* out_pos = &output[k * out_chw + i * out_w + j];
-
-            for (int c = 0; c < channels; ++c) {  // loop for channels
-              // bilinear interpolation
-              out_pos[0] = static_cast<T>(
-                  h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[wid]) +
-                  h1lambda * (w2lambda * in_pos[hid * in_w] +
-                              w1lambda * in_pos[hid * in_w + wid]));
-              in_pos += in_hw;
-              out_pos += out_hw;
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-template <typename T>
-class BilinearInterpGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* d_input_t = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* d_output_t = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_output = d_output_t->data<T>();
-    auto* d_input = d_input_t->mutable_data<T>(ctx.GetPlace());
-    auto& device_ctx =
-        ctx.template device_context<platform::CPUDeviceContext>();
-    math::SetConstant<platform::CPUDeviceContext, T> zero;
-    zero(device_ctx, d_input_t, static_cast<T>(0.0));
-
-    int out_h = ctx.Attr<int>("out_h");
-    int out_w = ctx.Attr<int>("out_w");
-
-    auto out_size_t = ctx.Input<Tensor>("OutSize");
-    if (out_size_t != nullptr) {
-      auto out_size_data = out_size_t->data<int>();
-      out_h = out_size_data[0];
-      out_w = out_size_data[1];
-    }
-
-    int batch_size = d_input_t->dims()[0];
-    int channels = d_input_t->dims()[1];
-    int in_h = d_input_t->dims()[2];
-    int in_w = d_input_t->dims()[3];
-
-    int in_hw = in_h * in_w;
-    int out_hw = out_h * out_w;
-    int in_chw = channels * in_hw;
-    int out_chw = channels * out_hw;
-
-    float ratio_h =
-        (out_h > 1) ? static_cast<float>(in_h - 1) / (out_h - 1) : 0.f;
-    float ratio_w =
-        (out_w > 1) ? static_cast<float>(in_w - 1) / (out_w - 1) : 0.f;
-
-    if (in_h == out_h && in_w == out_w) {
-      memcpy(d_input, d_output, d_input_t->numel() * sizeof(T));
-    } else {
-      for (int k = 0; k < batch_size; ++k) {  // loop for batches
-        for (int i = 0; i < out_h; ++i) {     // loop for images
-          int h = ratio_h * i;
-          int hid = (h < in_h - 1) ? 1 : 0;
-          float h1lambda = ratio_h * i - h;
-          float h2lambda = 1 - h1lambda;
-
-          for (int j = 0; j < out_w; ++j) {
-            int w = ratio_w * j;
-            int wid = (w < in_w - 1) ? 1 : 0;
-            float w1lambda = ratio_w * j - w;
-            float w2lambda = 1 - w1lambda;
-            T* in_pos = &d_input[k * in_chw + h * in_w + w];
-            const T* out_pos = &d_output[k * out_chw + i * out_w + j];
-
-            for (int c = 0; c < channels; ++c) {  // loop for channels
-              in_pos[0] += static_cast<T>(h2lambda * w2lambda * out_pos[0]);
-              in_pos[wid] += static_cast<T>(h2lambda * w1lambda * out_pos[0]);
-              in_pos[hid * in_w] +=
-                  static_cast<T>(h1lambda * w2lambda * out_pos[0]);
-              in_pos[hid * in_w + wid] +=
-                  static_cast<T>(h1lambda * w1lambda * out_pos[0]);
-              in_pos += in_hw;
-              out_pos += out_hw;
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/checkpoint_notify_op.cc
+++ b/paddle/fluid/operators/checkpoint_notify_op.cc
@@ -46,8 +46,8 @@ class CheckpointNotifyOp : public framework::OperatorBase {
      auto lookup_table_save_dir =
          string::Sprintf("%s/%s_%d", dir, lookup_table_name, i);
      rpc_client->AsyncCheckpointNotify(epmap[i], lookup_table_save_dir);
-      VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name
-              << " and dir:" << dir << " to " << epmap[i];
+      VLOG(30) << "checkpoint notify sending lookup table: "
+               << lookup_table_name << " and dir:" << dir << " to " << epmap[i];
    }
    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
  }

--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -37,7 +37,7 @@ class ConcatOp : public framework::OperatorWithKernel {

    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
    if (n == 1) {
-      VLOG(3) << "Warning: concat op have only one input, may waste memory";
+      VLOG(30) << "Warning: concat op have only one input, may waste memory";
    }

    auto out_dims = ins[0];

--- a/paddle/fluid/operators/conv_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc
@@ -15,15 +15,22 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cudnn_helper.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/profiler.h"

 DEFINE_bool(cudnn_deterministic, false,
            "Whether allow using an autotuning algorithm for convolution "
            "operator. The autotuning algorithm may be non-deterministic. If "
            "true, the algorithm is deterministic.");
+DEFINE_uint64(conv_workspace_size_limit, 4096,
+              "cuDNN convolution workspace limit in MB unit.");
+DEFINE_bool(cudnn_exhaustive_search, false,
+            "Whether enable exhaustive search for cuDNN convolution or "
+            "not, defalut is False.");

 namespace paddle {
 namespace operators {
@@ -36,13 +43,25 @@ using DataLayout = platform::DataLayout;
 template <typename T>
 using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;

+static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache";
+static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache";
+static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache";
+
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
    static_cast<size_t>(1024) * 1024 * 1024;

+// Sizes of the perf-result arrays passed to the cudnnFind*AlgorithmEx calls.
+static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
+static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS =
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT;
+static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS =
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT;
+
 template <typename T>
 class CUDNNConvOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use CUDAPlace.");
    auto* input = ctx.Input<Tensor>("Input");
@@ -55,6 +74,8 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
    int groups = ctx.Attr<int>("groups");
    int64_t user_workspace_size =
        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
+    bool exhaustive_search =
+        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");

    const T* input_data = input->data<T>();
    const T* filter_data = filter->data<T>();
@@ -120,19 +141,19 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
    // ------------------- cudnn conv workspace ---------------------
    size_t workspace_size_in_bytes;  // final workspace to allocate.
    size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (user_workspace_size > 0) {
-      workspace_size_limit = user_workspace_size * 1024 * 1024;
+    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
+      int64_t max_user_size =
+          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
+                   user_workspace_size);
+      workspace_size_limit = max_user_size * 1024 * 1024;
    }
+
    // ------------------- cudnn conv algorithm ---------------------
    cudnnConvolutionFwdAlgo_t algo;
-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();

-    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
-        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
-        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-        workspace_size_limit, &algo));
-
+    bool half_float = false;
 #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
    // Tensor core is supported since the volta GPU and
    // is only enabled when input and filter data are float16
@@ -143,14 +164,66 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
          cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
      // Currently tensor core is only enabled using this algo
      algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-      VLOG(5) << "use cudnn_tensor_op_math";
+      half_float = true;
+      VLOG(50) << "use cudnn_tensor_op_math";
    } else {
      CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
          cudnn_conv_desc, CUDNN_DEFAULT_MATH));
-      VLOG(5) << "NOT use cudnn_tensor_op_math";
+      VLOG(50) << "NOT use cudnn_tensor_op_math";
    }
 #endif

+    auto x_dims = framework::vectorize(input->dims());
+    auto f_dims = framework::vectorize(filter->dims());
+    if ((!exhaustive_search) && (!half_float)) {
+      CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+          handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+          cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+          workspace_size_limit, &algo));
+      VLOG(3) << "cuDNN forward algo " << algo;
+    } else if (exhaustive_search && (!half_float)) {
+      AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* algo_cache = nullptr;
+      if (ctx.scope().FindVar(kCUDNNFwdAlgoCache)) {
+        algo_cache =
+            ctx.scope()
+                .FindVar(kCUDNNFwdAlgoCache)
+                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
+      } else {
+        algo_cache =
+            const_cast<framework::Scope&>(ctx.scope())
+                .Var(kCUDNNFwdAlgoCache)
+                ->GetMutable<AlgorithmsCache<cudnnConvolutionFwdAlgo_t>>();
+      }
+      algo = algo_cache->GetAlgorithm(
+          x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
+            int returned_algo_count;
+            std::array<cudnnConvolutionFwdAlgoPerf_t, kNUM_CUDNN_FWD_ALGS>
+                fwd_perf_stat;
+            auto cudnn_find_func = [&](void* cudnn_workspace) {
+              CUDNN_ENFORCE(
+                  platform::dynload::cudnnFindConvolutionForwardAlgorithmEx(
+                      handle, cudnn_input_desc, input_data, cudnn_filter_desc,
+                      filter_data, cudnn_conv_desc, cudnn_output_desc,
+                      output_data, kNUM_CUDNN_FWD_ALGS, &returned_algo_count,
+                      fwd_perf_stat.data(), cudnn_workspace,
+                      workspace_size_limit));
+            };
+            workspace_handle.RunFunc(cudnn_find_func, workspace_size_limit);
+
+            VLOG(3) << "Perf result: (algo: stat, time, memory)";
+            for (int i = 0; i < returned_algo_count; ++i) {
+              const auto& stat = fwd_perf_stat[i];
+              VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
+                      << " " << stat.memory;
+            }
+            return fwd_perf_stat[0].algo;
+          });
+      VLOG(3) << "choose algo " << algo;
+    } else {
+      PADDLE_ENFORCE(half_float,
+                     "cuDNN exhaustive search doesn't support half float.");
+    }
+
    // get workspace size able to allocate
    CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
@@ -162,7 +235,6 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {

    // ------------------- cudnn conv forward ---------------------
    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
    for (int i = 0; i < groups; i++) {
      auto cudnn_func = [&](void* cudnn_workspace) {
        CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
@@ -180,6 +252,7 @@ template <typename T>
 class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
                   "It must use CUDAPlace.");
    auto input = ctx.Input<Tensor>("Input");
@@ -198,6 +271,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
    int groups = ctx.Attr<int>("groups");
    int64_t user_workspace_size =
        static_cast<size_t>(ctx.Attr<int>("workspace_size_MB"));
+    bool exhaustive_search =
+        FLAGS_cudnn_exhaustive_search || ctx.Attr<bool>("exhaustive_search");
+    if (exhaustive_search && FLAGS_cudnn_deterministic) {
+      PADDLE_THROW(
+          "Cann't set exhaustive_search True and "
+          "FLAGS_cudnn_deterministic True at same time.");
+    }

    // ------------------- cudnn descriptors ---------------------
    ScopedTensorDescriptor input_desc;
@@ -265,14 +345,66 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
    cudnnConvolutionBwdFilterAlgo_t filter_algo;
    size_t workspace_size_in_bytes = 0, tmp_size = 0;
    size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
-    if (user_workspace_size > 0) {
-      workspace_size_limit = user_workspace_size * 1024 * 1024;
+    if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) {
+      int64_t max_user_size =
+          std::max(static_cast<int64_t>(FLAGS_conv_workspace_size_limit),
+                   user_workspace_size);
+      workspace_size_limit = max_user_size * 1024 * 1024;
    }

-    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto x_dims = framework::vectorize(input->dims());
+    auto f_dims = framework::vectorize(filter->dims());
    auto handle = dev_ctx.cudnn_handle();
+    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
    if (input_grad) {
-      if (!FLAGS_cudnn_deterministic) {
+      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+      if (exhaustive_search) {
+        AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>* data_algo_cache;
+        if (ctx.scope().FindVar(kCUDNNBwdDataAlgoCache)) {
+          data_algo_cache =
+              ctx.scope()
+                  .FindVar(kCUDNNBwdDataAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
+        } else {
+          data_algo_cache =
+              const_cast<framework::Scope&>(ctx.scope())
+                  .Var(kCUDNNBwdDataAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t>>();
+        }
+        data_algo = data_algo_cache->GetAlgorithm(
+            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
+              int returned_algo_count;
+              std::array<cudnnConvolutionBwdDataAlgoPerf_t,
+                         kNUM_CUDNN_BWD_DATA_ALGS>
+                  data_perf_stat;
+              auto cudnn_find_bd_data_func = [&](void* cudnn_workspace) {
+                CUDNN_ENFORCE(
+                    platform::dynload::
+                        cudnnFindConvolutionBackwardDataAlgorithmEx(
+                            handle, cudnn_filter_desc, filter_data,
+                            cudnn_output_grad_desc, output_grad_data,
+                            cudnn_conv_desc, cudnn_input_desc, input_grad_data,
+                            kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count,
+                            data_perf_stat.data(), cudnn_workspace,
+                            workspace_size_limit));
+              };
+              workspace_handle.RunFunc(cudnn_find_bd_data_func,
+                                       workspace_size_limit);
+
+              VLOG(3) << "Perf result: (algo: stat, time, memory)";
+              for (int i = 0; i < returned_algo_count; ++i) {
+                const auto& stat = data_perf_stat[i];
+                VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time
+                        << " " << stat.memory;
+              }
+              return data_perf_stat[0].algo;
+            });
+        VLOG(3) << "cuDNN backward data algo " << data_algo;
+      } else if (FLAGS_cudnn_deterministic) {
+        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+      } else {
        CUDNN_ENFORCE(
            platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
                handle, cudnn_filter_desc,
@@ -285,10 +417,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
                cudnn_input_desc,
                CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                workspace_size_limit, &data_algo));
-      } else {
-        data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
      }
-
      CUDNN_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
              handle, cudnn_filter_desc, cudnn_output_grad_desc,
@@ -297,17 +426,54 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
    }

    if (filter_grad) {
-      if (!FLAGS_cudnn_deterministic) {
+      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
+      if (exhaustive_search) {
+        AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>* f_algo_cache;
+        if (ctx.scope().FindVar(kCUDNNBwdFilterAlgoCache)) {
+          f_algo_cache =
+              ctx.scope()
+                  .FindVar(kCUDNNBwdFilterAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
+        } else {
+          f_algo_cache =
+              const_cast<framework::Scope&>(ctx.scope())
+                  .Var(kCUDNNBwdFilterAlgoCache)
+                  ->GetMutable<
+                      AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t>>();
+        }
+        filter_algo = f_algo_cache->GetAlgorithm(
+            x_dims, f_dims, strides, paddings, dilations, 0, [&]() {
+              int returned_algo_count;
+              std::array<cudnnConvolutionBwdFilterAlgoPerf_t,
+                         kNUM_CUDNN_BWD_FILTER_ALGS>
+                  filter_perf_stat;
+              auto cudnn_find_bd_f_func = [&](void* cudnn_workspace) {
+                CUDNN_ENFORCE(
+                    platform::dynload::
+                        cudnnFindConvolutionBackwardFilterAlgorithmEx(
+                            handle, cudnn_input_desc, input_data,
+                            cudnn_output_grad_desc, output_grad_data,
+                            cudnn_conv_desc, cudnn_filter_desc,
+                            filter_grad_data, kNUM_CUDNN_BWD_FILTER_ALGS,
+                            &returned_algo_count, filter_perf_stat.data(),
+                            cudnn_workspace, workspace_size_limit));
+              };
+              workspace_handle.RunFunc(cudnn_find_bd_f_func,
+                                       workspace_size_limit);
+              return filter_perf_stat[0].algo;
+            });
+        VLOG(3) << "cuDNN backward filter algo " << filter_algo;
+      } else if (FLAGS_cudnn_deterministic) {
+        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+      } else {
        CUDNN_ENFORCE(
            platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
                handle, cudnn_input_desc, cudnn_output_grad_desc,
                cudnn_conv_desc, cudnn_filter_desc,
                CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
                workspace_size_limit, &filter_algo));
-      } else {
-        filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
      }
-
      CUDNN_ENFORCE(
          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
@@ -317,7 +483,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {

    // ------------------- cudnn conv backward data ---------------------
    ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
    if (input_grad) {
      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
      // Because beta is zero, it is unnecessary to reset input_grad.

--- a/paddle/fluid/operators/conv_cudnn_op_cache.h
+++ b/paddle/fluid/operators/conv_cudnn_op_cache.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+// Memoizes one TAlgorithm value per convolution configuration.
+// GetAlgorithm() holds mutex_ for its whole body, so concurrent callers are
+// serialized.
+template <typename TAlgorithm>
+class AlgorithmsCache {
+ public:
+  // Returns the algorithm cached for the given combination of tensor
+  // dimensions, strides, paddings, dilations and flags. When no entry exists,
+  // gen_func is invoked and its result is cached for later calls (a
+  // configuration whose combined hash is 0 is never cached; gen_func runs on
+  // every call in that case).
+  TAlgorithm GetAlgorithm(
+      const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
+      const std::vector<int>& strides, const std::vector<int>& paddings,
+      const std::vector<int>& dilations,
+      int algorithmFlags,  // can set for different data type
+      std::function<TAlgorithm()> gen_func);
+
+ private:
+  // Maps the combined hash of the GetAlgorithm() arguments to the cached
+  // algorithm. Only the hash is kept as the key, so two configurations whose
+  // hashes collide share one entry.
+  std::unordered_map<int64_t, TAlgorithm> hash_;
+  // Locked for the duration of every GetAlgorithm() call.
+  std::mutex mutex_;
+};
+
+// Looks up, or computes and stores, the algorithm for one configuration.
+//
+// Every argument except gen_func is folded into a single hash that serves as
+// the cache key. gen_func is called, with mutex_ held, only when no entry
+// exists for that key; its result is stored and returned. When the combined
+// hash happens to be 0, gen_func is called on every invocation and nothing is
+// cached.
+template <typename TAlgorithm>
+TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
+    const std::vector<int64_t>& dims1, const std::vector<int64_t>& dims2,
+    const std::vector<int>& strides, const std::vector<int>& paddings,
+    const std::vector<int>& dilations, int algorithmFlags,
+    std::function<TAlgorithm()> gen_func) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  // Accumulate in an unsigned integer: the mixing step shifts the running
+  // value left, which is undefined for negative signed integers.
+  uint64_t seed = 0;
+  std::hash<int64_t> hashFn;
+  // do hash like boost
+  // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
+  // `salt` keeps the argument groups distinguishable from one another.
+  auto combine = [&seed, &hashFn](int64_t value, uint64_t salt) {
+    seed ^= hashFn(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + salt;
+  };
+  for (const auto num : dims1) combine(num, 0);
+  for (const auto num : dims2) combine(num, 1);
+  for (const auto num : strides) combine(num, 2);
+  for (const auto num : paddings) combine(num, 3);
+  for (const auto num : dilations) combine(num, 4);
+  combine(algorithmFlags, 5);
+
+  if (seed == 0) return gen_func();
+
+  // hash_ is keyed by int64_t, so convert the accumulated value once.
+  const int64_t key = static_cast<int64_t>(seed);
+  auto iter = hash_.find(key);
+  if (iter == hash_.end()) {
+    // One insertion on a miss; gen_func() runs before anything is stored.
+    iter = hash_.emplace(key, gen_func()).first;
+  }
+  return iter->second;
+}
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -189,6 +189,11 @@ void Conv2DOpMaker::Make() {
               "workspace size can increase performance but also requires "
               "better hardware. This size should be chosen carefully.")
      .SetDefault(4096);
+  AddAttr<bool>("exhaustive_search",
+                "(bool, default false) cuDNN has many algorithm to calculation "
+                "convolution, whether enable exhaustive search "
+                "for cuDNN convolution or not, defalut is False.")
+      .SetDefault(false);  // the literals above form one comment argument
  AddComment(R"DOC(
 Convolution Operator.

@@ -219,6 +224,15 @@ $$
 )DOC");
 }

+// Used by conv2d/conv3d: reports that "Output" has the same type as "Input".
+class ConvOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return {{"Input", /*->*/ "Output"}};
+  }
+};
+
 void Conv3DOpMaker::Make() {
  AddInput(
      "Input",
@@ -283,7 +297,11 @@ void Conv3DOpMaker::Make() {
               "workspace size can increase performance but also requires "
               "better hardware. This size should be chosen carefully.")
      .SetDefault(4096);
-
+  AddAttr<bool>("exhaustive_search",
+                "(bool, default false) cuDNN has many algorithm to calculation "
+                "convolution, whether enable exhaustive search "
+                "for cuDNN convolution or not, defalut is False.")
+      .SetDefault(false);  // description is one concatenated string literal
  AddComment(R"DOC(
 Convolution3D Operator.

@@ -356,6 +374,7 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+                  ops::ConvOpInferVarType,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);

@@ -363,7 +382,9 @@ REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
 REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
+
 REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
+                  ops::ConvOpInferVarType,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad);


--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include <string>

 namespace paddle {
 namespace operators {
@@ -179,6 +180,15 @@ or not. But the output only shares the LoD information with input X.
 )DOC");
  }
 };
+
+// Var-type inference for cross_entropy: reports to the base class that
+// output "Y" has the same type as input "X".
+class CrossEntropyOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override { return {{"X", /*->*/ "Y"}}; }
+};
 }  // namespace operators
 }  // namespace paddle

@@ -186,6 +196,7 @@ namespace ops = paddle::operators;
 using CPUCtx = paddle::platform::CPUDeviceContext;

 REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
+                  ops::CrossEntropyOpInferVarType,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,

--- a/paddle/fluid/operators/distributed/brpc_server.cc
+++ b/paddle/fluid/operators/distributed/brpc_server.cc
@@ -133,10 +133,10 @@ void AsyncBRPCServer::StartServer() {
 void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); }

 void AsyncBRPCServer::WaitServerReady() {
-  VLOG(3) << "AsyncGRPCServer is wait server ready";
+  VLOG(30) << "AsyncGRPCServer is wait server ready";
  std::unique_lock<std::mutex> lock(this->mutex_ready_);
  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-  VLOG(3) << "AsyncGRPCServer WaitSeverReady";
+  VLOG(30) << "AsyncGRPCServer WaitSeverReady";
 }

 };  // namespace distributed

--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -38,7 +38,7 @@ void GRPCClient::SendComplete() {
  std::unique_lock<std::mutex> lk(completed_mutex_);
  if (!completed_) {
    for (auto& it : channels_) {
-      VLOG(3) << "send complete message to " << it.first;
+      VLOG(30) << "send complete message to " << it.first;
      this->AsyncSendComplete(it.first);
    }
    PADDLE_ENFORCE(this->Wait(), "internal grpc error");
@@ -81,7 +81,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
    ::grpc::ByteBuffer req;
    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_);

-    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
+    VLOG(30) << s->GetVarHandlePtr()->String() << " begin";

    // stub context
    s->response_call_back_ = nullptr;
@@ -142,7 +142,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
    ::grpc::ByteBuffer buf;
    RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);

-    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
+    VLOG(30) << s->GetVarHandlePtr()->String() << " begin";

    // stub context
    s->response_call_back_ = ProcGetResponse;
@@ -190,7 +190,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
    ::grpc::ByteBuffer req;
    SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);

-    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
+    VLOG(30) << s->GetVarHandlePtr()->String() << " begin";

    // stub context
    s->response_call_back_ = ProcGetResponse;
@@ -328,14 +328,14 @@ void GRPCClient::Proceed() {
  void* tag = nullptr;
  bool ok = false;

-  VLOG(3) << "GRPCClient Proceed begin";
+  VLOG(30) << "GRPCClient Proceed begin";
  while (!stopped_ && cq_.Next(&tag, &ok)) {
    BaseProcessor* c = static_cast<BaseProcessor*>(tag);
    GPR_ASSERT(ok);
    PADDLE_ENFORCE(c);

    if (c->status_.ok()) {
-      VLOG(3) << c->GetVarHandlePtr()->String() << " process";
+      VLOG(30) << c->GetVarHandlePtr()->String() << " process";
      c->Process();
    } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
      // FIXME(gongwb): parse error_details?
@@ -370,7 +370,7 @@ void GRPCClient::Proceed() {
      sync_cond_.notify_all();
    }
  }
-  VLOG(3) << "GRPCClient Proceed end";
+  VLOG(30) << "GRPCClient Proceed end";
 }

 std::shared_ptr<grpc::Channel> GRPCClient::GetChannel(const std::string& ep) {

--- a/paddle/fluid/operators/distributed/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc_server.cc
@@ -98,7 +98,7 @@ class RequestSend final : public RequestBase {

  void Process() override {
    std::string varname = GetReqName();
-    VLOG(4) << "RequestSend var_name:" << varname;
+    VLOG(40) << "RequestSend var_name:" << varname;

    auto scope = request_->GetMutableLocalScope();
    auto invar = request_->GetVar();
@@ -135,7 +135,7 @@ class RequestGet final : public RequestBase {
    // proc request.
    std::string varname = request_.varname();
    int trainer_id = request_.trainer_id();
-    VLOG(4) << "RequestGet " << varname;
+    VLOG(40) << "RequestGet " << varname;

    auto scope = request_handler_->scope();
    auto invar = scope->FindVar(varname);
@@ -182,8 +182,8 @@ class RequestPrefetch final : public RequestBase {
    std::string in_var_name = request_->Varname();
    std::string out_var_name = request_->OutVarname();
    int trainer_id = request_->GetTrainerId();
-    VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
-            << " out_var_name: " << out_var_name;
+    VLOG(40) << "RequestPrefetch, in_var_name: " << in_var_name
+             << " out_var_name: " << out_var_name;

    auto scope = request_->GetMutableLocalScope();
    auto invar = scope->FindVar(in_var_name);
@@ -231,8 +231,8 @@ class RequestCheckpointNotify final : public RequestBase {
    std::string checkpoint_dir = request_->OutVarname();
    int trainer_id = request_->GetTrainerId();

-    VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
-            << ", dir: " << checkpoint_dir;
+    VLOG(40) << "RequestCheckpointNotify notify: " << checkpoint_notify
+             << ", dir: " << checkpoint_dir;

    request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr,
                             trainer_id, checkpoint_dir);
@@ -246,10 +246,10 @@ class RequestCheckpointNotify final : public RequestBase {
 };

 void AsyncGRPCServer::WaitServerReady() {
-  VLOG(4) << "AsyncGRPCServer is wait server ready";
+  VLOG(40) << "AsyncGRPCServer is wait server ready";
  std::unique_lock<std::mutex> lock(this->mutex_ready_);
  condition_ready_.wait(lock, [=] { return this->ready_ == 1; });
-  VLOG(4) << "AsyncGRPCServer WaitSeverReady";
+  VLOG(40) << "AsyncGRPCServer WaitSeverReady";
 }

 void AsyncGRPCServer::StartServer() {
@@ -282,14 +282,15 @@ void AsyncGRPCServer::StartServer() {
    reqs.reserve(kRequestBufSize);

    for (int i = 0; i < kRequestBufSize; i++) {
-      VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i;
+      VLOG(60) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
+               << " I: " << i;
      TryToRegisterNewOne(rpc_name, i);
    }

    for (int i = 0; i < threadnum; i++) {
      rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind(
          &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f)));
-      VLOG(4) << t.first << " creates threads!";
+      VLOG(40) << t.first << " creates threads!";
    }
  }

@@ -306,7 +307,7 @@ void AsyncGRPCServer::StartServer() {
    auto& threads = t.second;
    for (size_t i = 0; i < threads.size(); ++i) {
      threads[i]->join();
-      VLOG(4) << t.first << " threads ends!";
+      VLOG(40) << t.first << " threads ends!";
    }
  }
 }
@@ -314,7 +315,7 @@ void AsyncGRPCServer::StartServer() {
 void AsyncGRPCServer::ShutdownQueue() {
  for (auto& t : rpc_cq_) {
    t.second->Shutdown();
-    VLOG(4) << t.first << " queue shutdown!";
+    VLOG(40) << t.first << " queue shutdown!";
  }
 }

@@ -323,7 +324,7 @@ void AsyncGRPCServer::ShutDownImpl() {
  is_shut_down_ = true;
  ShutdownQueue();

-  VLOG(4) << "server_ shutdown!";
+  VLOG(40) << "server_ shutdown!";
  server_->Shutdown();
 }

@@ -331,12 +332,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
                                          int req_id) {
  std::unique_lock<std::mutex> lock(cq_mutex_);
  if (is_shut_down_) {
-    VLOG(4) << "shutdown, do not TryToRegisterNewSendOne";
+    VLOG(40) << "shutdown, do not TryToRegisterNewSendOne";
    return;
  }

-  VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
-          << " REQ ID: " << req_id;
+  VLOG(40) << "TryToRegisterNewOne on RPC NAME: " << rpc_name
+           << " REQ ID: " << req_id;

  auto& reqs = rpc_reqs_[rpc_name];
  auto& handler = rpc_call_map_[rpc_name];
@@ -357,7 +358,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,

  reqs[req_id] = b;

-  VLOG(4) << "Create RequestSend status:" << b->Status();
+  VLOG(40) << "Create RequestSend status:" << b->Status();
 }

 void AsyncGRPCServer::HandleRequest(
@@ -367,15 +368,15 @@ void AsyncGRPCServer::HandleRequest(
  bool ok = false;

  while (true) {
-    VLOG(4) << "HandleRequest " << rpc_name << " wait next";
+    VLOG(40) << "HandleRequest " << rpc_name << " wait next";
    if (!cq->Next(&tag, &ok)) {
-      VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!";
+      VLOG(30) << "CompletionQueue " << rpc_name << " shutdown!";
      break;
    }

    int req_id = static_cast<int>(reinterpret_cast<intptr_t>(tag));
-    VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id
-            << " get next";
+    VLOG(40) << "HandleRequest " << rpc_name << ", req_id:" << req_id
+             << " get next";

    auto& reqs = rpc_reqs_[rpc_name];
    RequestBase* base = nullptr;
@@ -385,7 +386,7 @@ void AsyncGRPCServer::HandleRequest(
      base = reqs[req_id];
    }

-    VLOG(3) << base->Status2String(rpc_name);
+    VLOG(30) << base->Status2String(rpc_name);

    // reference:
    // https://github.com/tensorflow/tensorflow/issues/5596

--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -75,7 +75,7 @@ class VarHandle {
      wait_cond_.wait(lk, [this] { return status_ != kDefaultState; });
      ret = status_;
    }
-    VLOG(7) << "VarHandle wait:" << ret;
+    VLOG(70) << "VarHandle wait:" << ret;
    return ret != kErrorState;
  }

@@ -84,7 +84,7 @@ class VarHandle {
      std::unique_lock<std::mutex> lk(sync_mutex_);
      status_ = ok ? kFinishState : kErrorState;
    }
-    VLOG(7) << "VarHandle finish:" << ok;
+    VLOG(70) << "VarHandle finish:" << ok;
    wait_cond_.notify_all();
  }


--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -38,19 +38,19 @@ bool RequestSendHandler::Handle(const std::string& varname,
                                framework::Variable** outvar,
                                const int trainer_id,
                                const std::string& out_var_name) {
-  VLOG(4) << "RequestSendHandler:" << varname;
+  VLOG(40) << "RequestSendHandler:" << varname;

  // Sync
  if (varname == BATCH_BARRIER_MESSAGE) {
-    VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
+    VLOG(30) << "sync: recv BATCH_BARRIER_MESSAGE";
    rpc_server_->IncreaseBatchBarrier(kRequestSend);
  } else if (varname == COMPLETE_MESSAGE) {
-    VLOG(3) << "sync: recv complete message";
+    VLOG(30) << "sync: recv complete message";
    rpc_server_->Complete();
  } else {
    // Async
    if (!sync_mode_) {
-      VLOG(3) << "async process var: " << varname;
+      VLOG(30) << "async process var: " << varname;
      try {
        executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
                                      scope);
@@ -61,7 +61,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
      return true;
    } else {  // sync
      rpc_server_->WaitCond(kRequestSend);
-      VLOG(3) << "sync: processing received var: " << varname;
+      VLOG(30) << "sync: processing received var: " << varname;

      if (invar == nullptr) {
        LOG(FATAL) << "sync: Can not find server side var: " << varname;
@@ -78,10 +78,10 @@ bool RequestGetHandler::Handle(const std::string& varname,
                               framework::Variable** outvar,
                               const int trainer_id,
                               const std::string& out_var_name) {
-  VLOG(4) << "RequestGetHandler:" << varname;
+  VLOG(40) << "RequestGetHandler:" << varname;
  if (sync_mode_) {
    if (varname == FETCH_BARRIER_MESSAGE) {
-      VLOG(3) << "sync: recv fetch barrier message";
+      VLOG(30) << "sync: recv fetch barrier message";
      rpc_server_->IncreaseBatchBarrier(kRequestGet);
    } else {
      rpc_server_->WaitCond(kRequestGet);
@@ -93,13 +93,14 @@ bool RequestGetHandler::Handle(const std::string& varname,
        // NOTE: the format is determined by distributed_transpiler.py
        std::string param_bak_name =
            string::Sprintf("%s.trainer_%d_bak", varname, trainer_id);
-        VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id;
+        VLOG(30) << "getting " << param_bak_name << " trainer_id "
+                 << trainer_id;
        auto var = scope_->FindVar(varname);
        auto t_orig = var->Get<framework::LoDTensor>();
        auto param_bak = scope_->Var(param_bak_name);
        auto t = param_bak->GetMutable<framework::LoDTensor>();
        t->mutable_data(dev_ctx_->GetPlace(), t_orig.type());
-        VLOG(3) << "copying " << varname << " to " << param_bak_name;
+        VLOG(30) << "copying " << varname << " to " << param_bak_name;
        framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
      }
      *outvar = scope_->FindVar(varname);
@@ -114,7 +115,7 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
                                    framework::Variable** outvar,
                                    const int trainer_id,
                                    const std::string& out_var_name) {
-  VLOG(4) << "RequestPrefetchHandler " << varname;
+  VLOG(40) << "RequestPrefetchHandler " << varname;

  auto var_desc = program_->Block(0).FindVar(out_var_name);
  InitializeVariable(*outvar, var_desc->GetType());
@@ -138,8 +139,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
  auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
  lt_var->clear();
  lt_var->append(out_var_name);
-  VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: "
-          << out_var_name;
+  VLOG(40) << "RequestCheckpointHandler update var kLookupTablePath to: "
+           << out_var_name;
  executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_);
  return true;
 }

--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -39,7 +39,7 @@ void RPCServer::SavePort() const {
  port_file.open(file_path);
  port_file << selected_port_;
  port_file.close();
-  VLOG(4) << "selected port written to " << file_path;
+  VLOG(40) << "selected port written to " << file_path;
 }

 void RPCServer::WaitBarrier(const std::string& rpc_name) {
@@ -49,12 +49,12 @@ void RPCServer::WaitBarrier(const std::string& rpc_name) {
            exit_flag_.load());
  });

-  VLOG(3) << "batch_barrier_: " << rpc_name << " "
-          << barrier_counter_[rpc_name];
+  VLOG(30) << "batch_barrier_: " << rpc_name << " "
+           << barrier_counter_[rpc_name];
 }

 void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) {
-  VLOG(4) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
+  VLOG(40) << "RPCServer begin IncreaseBatchBarrier " << rpc_name;
  int b = 0;
  std::unique_lock<std::mutex> lock(mutex_);
  b = ++barrier_counter_[rpc_name];
@@ -71,7 +71,7 @@ void RPCServer::Complete() {
    client_num_--;
    need_reset_all_vars_ = true;

-    VLOG(4) << "decrease client_num to: " << client_num_;
+    VLOG(40) << "decrease client_num to: " << client_num_;
    if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) {
      barrier_counter_[kRequestGet]--;
    }
@@ -90,7 +90,7 @@ int RPCServer::GetClientNum() {
 }

 void RPCServer::ResetBarrierCounter() {
-  VLOG(3) << "RPCServer ResetBarrierCounter ";
+  VLOG(30) << "RPCServer ResetBarrierCounter ";
  std::unique_lock<std::mutex> lock(mutex_);
  for (auto& t : barrier_counter_) {
    t.second = 0;
@@ -105,12 +105,12 @@ void RPCServer::RegisterRPC(const std::string& rpc_name,

  static int cond = -1;
  rpc_cond_map_[rpc_name] = ++cond;
-  VLOG(4) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
-          << ", cond:" << rpc_cond_map_[rpc_name];
+  VLOG(40) << "RegisterRPC rpc_name:" << rpc_name << ", handler:" << handler
+           << ", cond:" << rpc_cond_map_[rpc_name];
 }

 void RPCServer::SetCond(const std::string& rpc_name) {
-  VLOG(3) << "RPCServer SetCond " << rpc_name;
+  VLOG(30) << "RPCServer SetCond " << rpc_name;
  {
    std::unique_lock<std::mutex> lock(mutex_);
    cur_cond_ = rpc_cond_map_[rpc_name];
@@ -120,7 +120,7 @@ void RPCServer::SetCond(const std::string& rpc_name) {
 }

 void RPCServer::WaitCond(const std::string& rpc_name) {
-  VLOG(4) << "RPCServer WaitCond " << rpc_name;
+  VLOG(40) << "RPCServer WaitCond " << rpc_name;
  int cond = 0;
  {
    std::unique_lock<std::mutex> lock(mutex_);

--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -50,7 +50,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
        size_to_write = length - total_written;
      }
      // This log is useful to see how long a internal block size is of rpc.
-      VLOG(7) << "copy " << size_to_write << " data to CUDAPlace";
+      VLOG(70) << "copy " << size_to_write << " data to CUDAPlace";
      memory::Copy(boost::get<platform::CUDAPlace>(place),
                   reinterpret_cast<void*>(p), cpu, data, size_to_write,
                   gpu_dev_ctx.stream());
@@ -79,7 +79,7 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
    // TODO(gongwb): can we avoid copy?
    platform::CPUPlace cpu;
    // This log is useful to see how long a internal block size is of rpc.
-    VLOG(7) << "copy " << size_to_write << " data to CPUPlace";
+    VLOG(70) << "copy " << size_to_write << " data to CPUPlace";
    memory::Copy(cpu, reinterpret_cast<void*>(p), cpu, data, size_to_write);

    p += size_to_write;
@@ -198,8 +198,8 @@ bool VariableResponse::ProcSerializedField(
 #endif
  }

-  VLOG(7) << "ProcSerializedField:" << meta_.varname()
-          << ", type:" << meta_.type() << std::endl;
+  VLOG(70) << "ProcSerializedField:" << meta_.varname()
+           << ", type:" << meta_.type() << std::endl;
  framework::DDim dims = GetDims(meta_.dims());
  if (meta_.type() == sendrecv::LOD_TENSOR) {
    PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");

--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -75,16 +75,12 @@ class ElementwiseOp : public framework::OperatorWithKernel {
  }
 };

-class ElementwiseOpInferVarType : public framework::VarTypeInference {
- public:
-  void operator()(const framework::OpDesc &op_desc,
-                  framework::BlockDesc *block) const override {
-    auto x_name = op_desc.Input("X")[0];
-    auto out_name = op_desc.Output("Out")[0];
-    auto &x = block->FindRecursiveOrCreateVar(x_name);
-    auto &out = block->FindRecursiveOrCreateVar(out_name);
-    out.SetType(x.GetType());
-    out.SetDataType(x.GetDataType());
+class ElementwiseOpInferVarType
+    : public framework::PassInDtypeAndVarTypeToOutput {
+ protected:
+  std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
+      const override {
+    return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
  }
 };


--- a/paddle/fluid/operators/feed_op.cc
+++ b/paddle/fluid/operators/feed_op.cc
--- a/paddle/fluid/operators/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/fetch_barrier_op.cc
--- a/paddle/fluid/operators/fetch_op.cc
+++ b/paddle/fluid/operators/fetch_op.cc
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ b/paddle/fluid/operators/grid_sampler_op.h
--- a/paddle/fluid/operators/bilinear_interp_op.cc
+++ b/paddle/fluid/operators/bilinear_interp_op.cc
--- a/paddle/fluid/operators/bilinear_interp_op.cu
+++ b/paddle/fluid/operators/bilinear_interp_op.cu
--- a/paddle/fluid/operators/interpolate_op.h
+++ b/paddle/fluid/operators/interpolate_op.h
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
--- a/paddle/fluid/operators/lod_rank_table_op.cc
+++ b/paddle/fluid/operators/lod_rank_table_op.cc
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
--- a/paddle/fluid/operators/math/cpu_vec_test.cc
+++ b/paddle/fluid/operators/math/cpu_vec_test.cc
--- a/paddle/fluid/operators/math/jit_kernel_test.cc
+++ b/paddle/fluid/operators/math/jit_kernel_test.cc
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
--- a/paddle/fluid/operators/nccl_op.cu.cc
+++ b/paddle/fluid/operators/nccl_op.cu.cc
--- a/paddle/fluid/operators/nccl_op_test.cu.cc
+++ b/paddle/fluid/operators/nccl_op_test.cu.cc
--- a/paddle/fluid/operators/parallel_do_op.cc
+++ b/paddle/fluid/operators/parallel_do_op.cc
--- a/paddle/fluid/operators/pool_op.cc
+++ b/paddle/fluid/operators/pool_op.cc
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
--- a/paddle/fluid/operators/save_op.cc
+++ b/paddle/fluid/operators/save_op.cc
--- a/paddle/fluid/operators/send_barrier_op.cc
+++ b/paddle/fluid/operators/send_barrier_op.cc
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
--- a/paddle/fluid/operators/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_mask_op.h
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
--- a/paddle/fluid/operators/similarity_focus_op.cc
+++ b/paddle/fluid/operators/similarity_focus_op.cc
--- a/paddle/fluid/operators/similarity_focus_op.h
+++ b/paddle/fluid/operators/similarity_focus_op.h
--- a/paddle/fluid/operators/softmax_op.cc
+++ b/paddle/fluid/operators/softmax_op.cc
--- a/paddle/fluid/operators/split_byref_op.h
+++ b/paddle/fluid/operators/split_byref_op.h
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
--- a/paddle/fluid/operators/sum_mkldnn_op.cc
+++ b/paddle/fluid/operators/sum_mkldnn_op.cc
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
--- a/paddle/fluid/operators/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/tensor_array_read_write_op.cc
--- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc
+++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
--- a/paddle/fluid/train/demo/demo_trainer.cc
+++ b/paddle/fluid/train/demo/demo_trainer.cc
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/paddle/testing/TestUtil.cpp
+++ b/paddle/testing/TestUtil.cpp
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/distribute_lookup_table.py
+++ b/python/paddle/fluid/distribute_lookup_table.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_similarity_focus_op.py
--- a/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py