diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index bdf7e7c1248d8378c9e66389dc4297294982f1c7..ddbcff7b398bee1748f358d6ea66037a072ba7ec 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -299,16 +299,19 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
   std::unique_ptr<ExecutorPrepareContext> ctx(
       new ExecutorPrepareContext(program, block_id));
   VLOG(3) << "after create prepare";
-  // PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
+  // PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
   VLOG(3) << "before create op_desc";
   auto& block = program.Block(block_id);
-  VLOG(3) << "create before" << ctx->ops_.size() << " " << block.AllOps().size();
+  VLOG(3) << "create before" << ctx->ops_.size() << " "
+          << block.AllOps().size();
   int counter = 0;
   for (auto& op_desc : block.AllOps()) {
     ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
-    VLOG(3) << "create op " << "index " << ++counter << " type " << op_desc->Type();
+    VLOG(3) << "create op "
+            << "index " << ++counter << " type " << op_desc->Type();
   }
-  VLOG(3) << "create finished" << ctx->ops_.size() << " " << block.AllOps().size();
+  VLOG(3) << "create finished" << ctx->ops_.size() << " "
+          << block.AllOps().size();
   return ctx;
 }
 
@@ -320,22 +323,25 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
   for (auto& bid : block_ids) {
     VLOG(3) << "block id" << bid;
     auto* ctx = new ExecutorPrepareContext(program, bid);
-    //PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
+    // PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
     auto& block = program.Block(bid);
     int counter = 0;
-    VLOG(3) << "create before" << ctx->ops_.size() << " " << block.AllOps().size();
+    VLOG(3) << "create before" << ctx->ops_.size() << " "
+            << block.AllOps().size();
     for (auto& op_desc : block.AllOps()) {
-      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
-      VLOG(3) << "create op " << "index " << ++counter << " type " << op_desc->Type();
+      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
+      VLOG(3) << "create op "
+              << "index " << ++counter << " type " << op_desc->Type();
     }
-    VLOG(3) << "create finished" << ctx->ops_.size() << " " << block.AllOps().size();
+    VLOG(3) << "create finished" << ctx->ops_.size() << " "
+            << block.AllOps().size();
     result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
   }
   return result;
 }
 
-// void CheckResult(const std::string op_type, ExecutorPrepareContext* ctx, Scope* local_scope) {
+// void CheckResult(const std::string op_type, ExecutorPrepareContext* ctx,
+//                  Scope* local_scope) {
 //   VLOG(3) << "before checking result";
 //   auto& dev_ctx = *platform::DeviceContextPool::Instance().Get(place_);
 //   std::vector outputs;
@@ -343,7 +349,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
 //   bool found = false;
 //   framework::OpDesc* myop = nullptr;
 //   for(auto& op : block.AllOps()) {
-//     if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() == "feed") return;
+//     if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() ==
+//     "feed") return;
 //     if (op->Type() == op_type) {
 //       found = true;
 //       myop = op;
@@ -370,7 +377,8 @@ std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
 //   for(size_t i=0; i < check.numel(); ++i) {
 //     sum += check.data<float>()[i];
 //   }
-//   VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum " << sum;
+//   VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum "
+//   << sum;
 //   VLOG(3) << "after checking result";
 // }
@@ -389,11 +397,14 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   VLOG(3) << "Scope ptr " << local_scope;
   for (auto& op : ctx->ops_) {
     op->Run(*local_scope, place_);
-    // CheckResult(op->Type(), ctx, local_scope);
-    if (FLAGS_benchmark) {
-      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
-              << memory::memory_usage(place_);
-    }
+    // CheckResult(op->Type(), ctx, local_scope);
+    // if (FLAGS_benchmark) {
+    //   VLOG(2) << "Memory used after operator " + op->Type() + " running: "
+    //           << memory::memory_usage(place_);
+    // }
+    VLOG(2) << "Memory used after operator " + op->Type() + " running: "
+            << memory::memory_usage(place_);
+    // platform::DeviceContextPool::Instance().Get(place_)->Wait();
   }
   platform::DeviceContextPool::Instance().Get(place_)->Wait();
@@ -403,13 +414,15 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
 
   // auto& block = ctx->prog_.Block(0);
   // for(auto& op : block.AllOps()) {
-  //   if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() == "feed") continue;
+  //   if(op->Type() == "load_combine" || op->Type() == "fetch" || op->Type() ==
+  //   "feed") continue;
   //   // for(auto& real_op : ctx->ops_) {
   //   //   if(real_op->Type() == op->Type()) {
-  //   //     VLOG(3) << real_op->Type() << " " << place_ << " " << real_op->DebugStringEx(local_scope);
+  //   //     VLOG(3) << real_op->Type() << " " << place_ << " "
+  //   //             << real_op->DebugStringEx(local_scope);
   //   //   }
   //   // }
-
+
   //   //VLOG(3) << "start op output" << op->Type();
   //   for(auto var_name: op->InputArgumentNames()) {
   //     auto* var = local_scope->Var(var_name);
@@ -418,19 +431,21 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   //     auto* tensor = var->GetMutable<framework::LoDTensor>();
   //     framework::Tensor check;
   //     VLOG(3) << "before tensor copy";
-
+
   //     framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
-
+
   //     VLOG(3) << "after tensor copy";
   //     float sum = .0;
   //     for(size_t i=0; i < check.numel(); ++i) {
-  //       if(std::type_index(check.type()) == std::type_index(typeid(int64_t))) {
+  //       if(std::type_index(check.type()) == std::type_index(typeid(int64_t)))
+  //       {
   //         sum += static_cast<float>(check.data<int64_t>()[i]);
   //       } else {
   //         sum += check.data<float>()[i];
   //       }
   //     }
-  //     VLOG(3) << "op " << op->Type() << " input var " << var_name << " sum " << sum;
+  //     VLOG(3) << "op " << op->Type() << " input var " << var_name << " sum "
+  //     << sum;
   //   }
 
   //   VLOG(3) << "op " << op->Type() << "input finished";
@@ -442,23 +457,28 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   //     framework::Tensor check;
   //     VLOG(3) << "before tensor copy";
   //     if(op->Type() == "batch_norm" && platform::is_gpu_place(place_)) {
-  //       VLOG(3) << "op " << op->Type() << " output var " << var_name << " " << tensor->numel();
+  //       VLOG(3) << "op " << op->Type() << " output var " << var_name << " "
+  //       << tensor->numel();
   //       tensor->mutable_data<float>(place_);
-  //       framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+  //       framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx,
+  //       &check);
   //     } else {
-  //       framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx, &check);
+  //       framework::TensorCopy(*tensor, platform::CPUPlace(), dev_ctx,
+  //       &check);
   //     }
-
+
   //     VLOG(3) << "after tensor copy";
   //     float sum = .0;
   //     for(size_t i=0; i < check.numel(); ++i) {
-  //       if(std::type_index(check.type()) == std::type_index(typeid(int64_t))) {
+  //       if(std::type_index(check.type()) == std::type_index(typeid(int64_t)))
+  //       {
   //         sum += static_cast<float>(check.data<int64_t>()[i]);
   //       } else {
   //         sum += check.data<float>()[i];
   //       }
   //     }
-  //     VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum " << sum;
+  //     VLOG(3) << "op " << op->Type() << " output var " << var_name << " sum "
+  //     << sum;
   //   }
   // }
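Note on executor.cc: the RunPreparedContext change above makes the per-operator memory VLOG(2) unconditional. If that proves too noisy outside of benchmarking runs, the original FLAGS_benchmark guard (still visible in the commented-out block) can simply be restored; both symbols already exist in executor.cc:

    for (auto& op : ctx->ops_) {
      op->Run(*local_scope, place_);
      if (FLAGS_benchmark) {
        VLOG(2) << "Memory used after operator " + op->Type() + " running: "
                << memory::memory_usage(place_);
      }
    }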
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 555faba9624b9c76a9efdf4a62cd319f9682566e..c293cf92b4f3d530109c76850df184af9cad7399 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -50,19 +50,41 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   const std::vector<std::string> &Outputs(
       const std::string &name) const override;
 
+  void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
+                size_t j = 0) override {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    const std::string &input_n = Inputs(in)[i];
+    const std::string &output_n = Outputs(out)[j];
+
+    PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@",
+                   in, i);
+    PADDLE_ENFORCE(output_n != framework::kEmptyVarName,
+                   "The %s[%d] is @EMPTY@", out, j);
+
+    auto *in_var = block_.FindVarRecursive(input_n);
+    auto *out_var = block_.FindVarRecursive(output_n);
+
+    PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(),
+                   "The type of %s and %s is not the same.", input_n, output_n);
+
+    SetDim(output_n, GetDim(input_n));
+  }
+
   void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) const override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size());
     PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName,
+                   "The %s[%d] is @EMPTY@", in, i);
+    PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName,
+                   "The %s[%d] is @EMPTY@", out, j);
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != proto::VarType::LOD_TENSOR) {
       VLOG(3) << "input " << in << " is not LodTensor";
       return;
     }
-    PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarType::LOD_TENSOR,
-                      "The %d-th output of Output(%s) must be LoDTensor.", j,
-                      out);
     out_var->SetLoDLevel(in_var->GetLoDLevel());
   }
 
@@ -441,7 +463,10 @@ static void InitInferShapeFuncs() {
 
     for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) {
       auto op_type = kern_pair.first;
-      auto &op_info = info_map.at(op_type);
+      auto it = info_map.find(op_type);
+      PADDLE_ENFORCE(it != info_map.end(), "%s has not been registered",
+                     op_type);
+      auto &op_info = it->second;
       auto op = static_cast<OperatorWithKernel *>(op_info.Creator()(
           "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
       if (op_info.infer_shape_) {  // infer_shape has been registered.
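The compile-time ShareDim added above only verifies that the two variables have the same type and then copies the input's dims to the output. A minimal sketch of how an operator's InferShape would consume the new interface (MyCopyOp is hypothetical; only ShareDim/ShareLoD themselves come from this patch):

    // Illustrative only: a copy-like operator propagating X's type, dims, LoD.
    #include "paddle/fluid/framework/op_registry.h"

    namespace paddle {
    namespace operators {

    class MyCopyOp : public framework::OperatorWithKernel {
     public:
      using framework::OperatorWithKernel::OperatorWithKernel;

      void InferShape(framework::InferShapeContext* ctx) const override {
        PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must be set.");
        PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) must be set.");
        ctx->ShareDim("X", "Out");  // same var type + same dims
        ctx->ShareLoD("X", "Out");  // same LoD level when X is a LoDTensor
      }
    };

    }  // namespace operators
    }  // namespace paddle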
@@ -490,20 +515,14 @@ void OpDesc::InferShape(const BlockDesc &block) const {
 }
 
 void OpDesc::InferVarType(BlockDesc *block) const {
+  // There are a few places where a var's type can be set.
+  // When a VarDesc is created, it defaults to LOD_TENSOR.
+  // When an output variable is created, it likewise defaults to LOD_TENSOR.
+  // We restrict this to be the only place where an operator defines its
+  // customized var type inference, so no "default" setting happens here.
   auto &info = OpInfoMap::Instance().Get(this->Type());
   if (info.infer_var_type_) {
     info.infer_var_type_(*this, block);
-  } else {
-    // all output type is LoDTensor by default
-    VLOG(10) << this->Type()
-             << " has not registered InferVarType. Set output variables to "
-                "LOD_TENSOR";
-    for (auto &out_pair : this->outputs_) {
-      for (auto &out_var_name : out_pair.second) {
-        block->FindRecursiveOrCreateVar(out_var_name)
-            .SetType(proto::VarType::LOD_TENSOR);
-      }
-    }
   }
 }
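With the LOD_TENSOR fallback deleted, any operator whose outputs are not plain LoDTensors now has to register its own var type inference. A sketch against the VarTypeInference interface of this codebase (MySparseOpVarTypeInference and the SELECTED_ROWS choice are illustrative, not part of this patch):

    // Illustrative only: an operator whose "Out" is SelectedRows must say so.
    class MySparseOpVarTypeInference : public framework::VarTypeInference {
     public:
      void operator()(const framework::OpDesc& op_desc,
                      framework::BlockDesc* block) const override {
        for (auto& name : op_desc.Output("Out")) {
          block->FindRecursiveOrCreateVar(name).SetType(
              framework::proto::VarType::SELECTED_ROWS);
        }
      }
    };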
diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h
index b4205aba83e774fb9c08193124adb93935c00157..440e0509be727ec2b84abc76fca44edda11f8a0a 100644
--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -100,16 +100,6 @@ class OpDesc {
   std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
   std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
 
-  void SetInputMap(const VariableNameMap &input) {
-    this->inputs_ = input;
-    this->need_update_ = true;
-  }
-
-  void SetOutputMap(const VariableNameMap &output) {
-    this->outputs_ = output;
-    this->need_update_ = true;
-  }
-
   const VariableNameMap &Inputs() const { return inputs_; }
   const VariableNameMap &Outputs() const { return outputs_; }
 
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 3b4a620f8ce2ed6a4197063b54111a0e78d5c438..ea060ebc60dc83a9b7c2c43ed5e72ed0525f8453 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -62,7 +62,7 @@ static DDim GetDims(const Scope& scope, const std::string& name,
 
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return DDim({-1});
     }
     return tensor.dims();
@@ -91,13 +91,13 @@ static std::string GetDtype(const Scope& scope, const std::string& name) {
 
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return "";
     }
     return DataTypeToString(ToDataType(tensor.type()));
   } else if (var->IsType<SelectedRows>()) {
     auto tensor = var->Get<SelectedRows>().value();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return "uninited";
     } else {
       return DataTypeToString(ToDataType(tensor.type()));
@@ -130,7 +130,7 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
 
   if (var->IsType<LoDTensor>()) {
     const LoDTensor& tensor = var->Get<LoDTensor>();
-    if (!tensor.IsInitialized()) {
+    if (UNLIKELY(!tensor.IsInitialized())) {
       return default_lod;
     }
     return tensor.lod();
@@ -149,11 +149,13 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     platform::SetDeviceId(dev_id);
 #endif
   }
-  VLOG(3) << "start pool";
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  platform::RecordEvent record_event(Type(), pool.Get(place));
-  VLOG(3) << "start RunImpl";
+
+  // The profiler has a process-wide mutex, which causes a serious performance
+  // issue in concurrent scenarios. Here we use an `if` to avoid it.
+  // Please do not remove the `if`; ask @Superjomn if there is any concern.
+
   RunImpl(scope, place);
+
   VLOG(3) << place << " " << DebugStringEx(&scope);
 }
 
@@ -206,7 +208,6 @@ const std::vector<std::string>& OperatorBase::Outputs(
 }
 
 std::string OperatorBase::DebugStringEx(const Scope* scope) const {
-  VLOG(3) << this->Type() << " scope ptr " << scope;
   std::stringstream ss;
   ss << "Op(" << type_ << "), inputs:{";
   for (auto it = inputs_.begin(); it != inputs_.end();) {
@@ -470,35 +471,35 @@ class RuntimeInferShapeContext : public InferShapeContext {
       : op_(op), scope_(scope) {}
 
   bool HasInput(const std::string& name) const override {
-    if (!op_.HasInputs(name)) {
+    // has only one input
+    const auto& ins = op_.Inputs();
+    auto it = ins.find(name);
+    if (it == ins.end()) {
       return false;
     }
-    auto& ins = Inputs(name);
-    size_t length = ins.size();
-    if (length == 0) {
+    const auto& in = it->second;
+    if (in.size() == 0 || in[0] == kEmptyVarName) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL,
+    PADDLE_ENFORCE_EQ(in.size(), 1UL,
                       "Input %s should not have more than one inputs", name);
-    auto ipt = ins[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
+    return scope_.FindVar(in[0]) != nullptr;
   }
 
   bool HasOutput(const std::string& name) const override {
-    if (!op_.HasOutputs(name)) {
+    // has only one output
+    const auto& outs = op_.Outputs();
+    auto it = outs.find(name);
+    if (it == outs.end()) {
       return false;
     }
-    auto& outs = Outputs(name);
-    size_t length = outs.size();
-    if (length == 0) {
+    const auto& out = it->second;
+    if (out.size() == 0 || out[0] == kEmptyVarName) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(length, 1UL,
-                      "Output %s should not have more than one inputs", name);
-    auto ipt = outs[0];
-    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-    return var != nullptr;
+    PADDLE_ENFORCE_EQ(out.size(), 1UL,
+                      "Output %s should not have more than one outputs", name);
+    return scope_.FindVar(out[0]) != nullptr;
   }
 
   bool HasInputs(const std::string& name) const override {
@@ -545,13 +546,45 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return op_.Outputs(name);
   }
 
-  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
-                size_t j = 0) const override {
+  void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size());
     PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+    const std::string& input_n = Inputs(in)[i];
+    const std::string& output_n = Outputs(out)[j];
+
+    Variable* in_var = scope_.FindVar(input_n);
+    Variable* out_var = scope_.FindVar(output_n);
+    PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
+                   "The type of %s and %s is not the same.", input_n, output_n);
+
+    if (in_var->IsType<framework::SelectedRows>()) {
+      auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
+      auto out_sele_rows = out_var->GetMutable<framework::SelectedRows>();
+      out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
+      out_sele_rows->set_rows(in_sele_rows.rows());
+      out_sele_rows->set_height(in_sele_rows.height());
+    } else if (in_var->IsType<framework::LoDTensor>()) {
+      auto& in_lod_tensor = in_var->Get<framework::LoDTensor>();
+      auto* out_lod_tensor = out_var->GetMutable<framework::LoDTensor>();
+      out_lod_tensor->Resize(in_lod_tensor.dims());
+    } else {
+      PADDLE_THROW(
+          "Currently, the input type of ShareDim only can be LoDTensor "
+          "or SelectedRows.");
+    }
+  }
+
+  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) const override {
+    const std::vector<std::string>& inputs = Inputs(in);
+    const std::vector<std::string>& outputs = Outputs(out);
+    PADDLE_ENFORCE_LT(i, inputs.size());
+    PADDLE_ENFORCE_LT(j, outputs.size());
+    Variable* in_var = scope_.FindVar(inputs.at(i));
     if (!in_var->IsType<LoDTensor>()) return;
+    Variable* out_var = scope_.FindVar(outputs.at(j));
     PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
                    "The %d-th output of Output(%s) must be LoDTensor.", j, out);
     auto in_tensor = in_var->Get<LoDTensor>();
@@ -579,20 +612,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
     out_tensor->set_layout(in_tensor.layout());
   }
 
-  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
-                   size_t j = 0) const {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
-    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
-    if (!in_var->IsType<LoDTensor>()) return;
-    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
-                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
-    auto in_tensor = in_var->Get<LoDTensor>();
-    auto* out_tensor = out_var->GetMutable<LoDTensor>();
-    out_tensor->set_layout(in_tensor.layout());
-  }
-
   bool IsRuntime() const override { return true; }
 
  protected:
@@ -663,16 +682,12 @@ static void CheckTensorNANOrInf(const std::string& name,
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-  VLOG(3) << "start Infershape";
   this->InferShape(&infer_shape_ctx);
-  VLOG(3) << "Infershape Pass";
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
   // check if op[type] has kernel registered.
-  VLOG(3) << "Start Kernels";
   auto& all_op_kernels = AllOpKernels();
-  VLOG(3) << "Kernel map finish";
   auto kernels_iter = all_op_kernels.find(type_);
   if (kernels_iter == all_op_kernels.end()) {
     PADDLE_THROW(
@@ -690,7 +705,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   auto expected_kernel_key =
       this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-  VLOG(3) << "expected_kernel_key: " << expected_kernel_key;
+  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
index 5f497cafa0f75f7c23d550ef767d55274de7c900..280bc19dce7b604d67aefdc572de96b479b8d2d7 100644
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -56,6 +56,9 @@ class InferShapeContext {
   virtual const std::vector<std::string> &Outputs(
       const std::string &name) const = 0;
 
+  virtual void ShareDim(const std::string &in, const std::string &out,
+                        size_t i = 0, size_t j = 0) = 0;
+
   virtual void ShareLoD(const std::string &in, const std::string &out,
                         size_t i = 0, size_t j = 0) const = 0;
 
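For reference, what the runtime ShareDim above copies for each of the two supported variable types, written out as two plain helpers. This is a sketch using only calls that appear in the patch itself, not a new framework API:

    #include "paddle/fluid/framework/lod_tensor.h"
    #include "paddle/fluid/framework/selected_rows.h"

    namespace fw = paddle::framework;

    // LoDTensor: only the shape is propagated.
    void ShareLoDTensorDim(const fw::LoDTensor& in, fw::LoDTensor* out) {
      out->Resize(in.dims());
    }

    // SelectedRows: the shape of the underlying value, plus rows and height.
    void ShareSelectedRowsDim(const fw::SelectedRows& in, fw::SelectedRows* out) {
      out->mutable_value()->Resize(in.value().dims());
      out->set_rows(in.rows());
      out->set_height(in.height());
    }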
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index aaf6d5a4f3051ff5f3f6b1afc047c9ebd936fb5b..c778529cc4b6f98902a2629298ffb0eafb8734c2 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -112,11 +112,11 @@ bool NativePaddlePredictor::Init(
 
   auto &block = inference_program_->Block(0);
   for (auto *op_desc : block.AllOps()) {
-    if (op_desc->HasAttr("use_cudnn")) {
-      op_desc->SetAttr("use_cudnn", false);
-    }
+    // if (op_desc->HasAttr("use_cudnn")) {
+    //   op_desc->SetAttr("use_cudnn", false);
+    // }
     if (op_desc->HasAttr("workspace_size_MB")) {
-      op_desc->SetAttr("workspace_size_MB", 0);
+      op_desc->SetAttr("workspace_size_MB", 1024);
     }
   }
 
diff --git a/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc b/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc
index 1b6463a333cffafc67540ffbb537c9c96ee36903..c7db21d093f88c09604f7e82c232a08222b7a9b3 100644
--- a/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc
+++ b/paddle/fluid/inference/api/demo_ci/real_data_icnet_tester.cc
@@ -27,8 +27,8 @@ NativeConfig GetConfig() {
   NativeConfig config;
 
   // config.model_dir = FLAGS_dirname;
-  config.prog_file = "hs_lb_without_bn/__model__";
-  config.param_file = "hs_lb_without_bn/__params__";
+  config.prog_file = "hs_lb_without_bn_cudnn/__model__";
+  config.param_file = "hs_lb_without_bn_cudnn/__params__";
   // config.prog_file = "hs_lb_without_bn_cuda/__model__";
   // config.param_file = "hs_lb_without_bn_cuda/__params__";
   config.fraction_of_gpu_memory = 0.0;
@@ -106,7 +106,7 @@ void test_naive(int batch_size) {
   std::cout << "batch: " << batch_size
             << " predict cost: " << time_diff(time1, time2) / steps << "ms"
             << std::endl;
-  std::cout << outputs.size() << std::endl;
+  std::cout << outputs.size() << std::endl;
   int64_t* data_o = static_cast<int64_t*>(outputs[0].data.data());
   int64_t sum_out = 0;
   for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t); ++j) {
diff --git a/paddle/fluid/inference/api/demo_ci/thread_icnet_test.cc b/paddle/fluid/inference/api/demo_ci/thread_icnet_test.cc
index 9a018ee347e821035d14a27bbab236fd01c2d48b..e1ce46b3bbe72c50abb7a6f1a44cbf8ef352c037 100644
--- a/paddle/fluid/inference/api/demo_ci/thread_icnet_test.cc
+++ b/paddle/fluid/inference/api/demo_ci/thread_icnet_test.cc
@@ -21,12 +21,12 @@
 #include <cassert>
 #include <fstream>
 #include <thread>  // NOLINT
+#include <chrono>  // NOLINT
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 
 #define ASSERT_TRUE(x) x
 #define ASSERT_EQ(x, y) assert(x == y)
-
 // DEFINE_string(dirname, "./LB_icnet_model",
 //              "Directory of the inference model.");
 namespace paddle {
@@ -34,7 +34,7 @@ NativeConfig GetConfig() {
   NativeConfig config;
   config.prog_file = "./hs_lb_without_bn_cuda/__model__";
   config.param_file = "./hs_lb_without_bn_cuda/__params__";
-  config.fraction_of_gpu_memory = 0.5;
+  config.fraction_of_gpu_memory = 0.0;
   config.use_gpu = true;
   config.device = 0;
   return config;
@@ -54,7 +54,7 @@ void test_naive(int batch_size, std::string model_path) {
   int height = 449;
   int width = 581;
   std::vector<float> data;
-  for(int i=0; i < 3 * height * width; ++i) {
+  for (int i = 0; i < 3 * height * width; ++i) {
     data.push_back(0.0);
   }
 
@@ -86,47 +86,61 @@ void test_naive(int batch_size, std::string model_path) {
   // in_img.close();
   // std::cout << "sum: " << sum_n << std::endl;
-  PaddleTensor tensor;
-  tensor.shape = std::vector<int>({batch_size, 3, height, width});
-  tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width);
-  std::copy(data.begin(), data.end(),
-            static_cast<float*>(tensor.data.data()));
-  tensor.dtype = PaddleDType::FLOAT32;
-  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
-
-  constexpr int num_jobs = 2;  // each job run 1 batch
-  std::vector<std::thread> threads;
-
+  PaddleTensor tensor;
+  tensor.shape = std::vector<int>({batch_size, 3, height, width});
+  tensor.data.Resize(sizeof(float) * batch_size * 3 * height * width);
+  std::copy(data.begin(), data.end(), static_cast<float*>(tensor.data.data()));
+  tensor.dtype = PaddleDType::FLOAT32;
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+
+  constexpr int num_jobs = 5;  // each job run 1 batch
+  std::vector<std::thread> threads;
+  // using PtrPred = std::vector<std::unique_ptr<PaddlePredictor>>;
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    auto pred = CreatePaddlePredictor<NativeConfig>(config);
+    predictors.emplace_back(std::move(pred));
+  }
 
-  for (int tid = 0; tid < num_jobs; ++tid) {
-    threads.emplace_back([&, tid]() {
+  using namespace std::chrono_literals;
+  // std::this_thread::sleep_for(std::chrono::seconds(20));
+  std::cout << "before start predict";
+
+  int epoches = 100000;
+  for (int tid = 0; tid < num_jobs; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // auto predictor = CreatePaddlePredictor<NativeConfig>(config);
+      auto& predictor = predictors[tid];
+      // std::this_thread::sleep_for(std::chrono::seconds(20));
       PaddleTensor tensor_out;
       std::vector<PaddleTensor> outputs(1, tensor_out);
-      auto predictor = CreatePaddlePredictor<NativeConfig>(config);
-      for (size_t i = 0; i < 1000; i++) {
+      for (size_t i = 0; i < epoches; i++) {
         ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
         VLOG(0) << "tid : " << tid << " run: " << i << "finished";
-        // std::cout <<"tid : " << tid << " run: " << i << "finished" << std::endl;
+        // std::cout <<"tid : " << tid << " run: " << i << "finished" <<
+        // std::endl;
         ASSERT_EQ(outputs.size(), 1UL);
         // int64_t* data_o = static_cast<int64_t*>(outputs[0].data.data());
         // int64_t sum_out = 0;
         // for (size_t j = 0; j < outputs[0].data.length() / sizeof(int64_t);
         //      ++j) {
         //   sum_out += data_o[j];
         // }
         // std::cout << "tid : " << tid << "pass : " << i << " " << sum_out
         //           << std::endl;
       }
     });
   }
   for (int i = 0; i < num_jobs; ++i) {
     threads[i].join();
   }
 }
 // }
-}  // namespace paddle
+}  // namespace paddle
 
-  int main(int argc, char** argv) {
-  paddle::test_naive(1 << 0, "");
-  return 0;
+int main(int argc, char** argv) {
+  paddle::test_naive(1 << 0, "");
+  return 0;
 }
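A note on the threading test above: CreatePaddlePredictor returns a std::unique_ptr<PaddlePredictor> by value, so the result has to be stored by value (binding it to a non-const lvalue reference does not compile) before the worker threads borrow it. In outline, the pattern the test now follows:

    // Create and own the predictors up front; each thread then borrows one.
    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
    for (int tid = 0; tid < num_jobs; ++tid) {
      predictors.emplace_back(CreatePaddlePredictor<NativeConfig>(config));
    }
    for (int tid = 0; tid < num_jobs; ++tid) {
      threads.emplace_back([&, tid]() { /* use predictors[tid] only */ });
    }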
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index c2f45fdc99b87bc12c2aadf1985de6e98a24fce7..dad5c8257a9f3f89d4494097023ac3a598ba146d 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -11,7 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
+#define GLOG_NO_ABBREVIATED_SEVERITIES
+#define GOOGLE_GLOG_DLL_DECL
 #include "paddle/fluid/memory/detail/buddy_allocator.h"
 #include "glog/logging.h"
 
diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc
index b86e4f38c42a26e155f276f9b73cbed1d0d83f7d..2a283733f5c25b0f666b1067c3e5151ce9f75873 100644
--- a/paddle/fluid/memory/detail/meta_cache.cc
+++ b/paddle/fluid/memory/detail/meta_cache.cc
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#define GLOG_NO_ABBREVIATED_SEVERITIES
+#define GOOGLE_GLOG_DLL_DECL
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/memory_block.h"
 #include "paddle/fluid/platform/assert.h"
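Background on the two defines added in both memory files: on Windows, <windows.h> defines an ERROR macro that collides with glog's abbreviated severity constants, and GLOG_NO_ABBREVIATED_SEVERITIES suppresses those constants; defining GOOGLE_GLOG_DLL_DECL empty strips the dllimport attribute so glog is consumed as a static library. A minimal MSVC-only illustration (not part of this patch):

    // Without GLOG_NO_ABBREVIATED_SEVERITIES, wingdi.h's `#define ERROR 0`
    // breaks glog's severity constants on MSVC.
    #define GLOG_NO_ABBREVIATED_SEVERITIES
    #define GOOGLE_GLOG_DLL_DECL
    #include <windows.h>
    #include "glog/logging.h"

    void Demo() { LOG(ERROR) << "severity names survive windows.h"; }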
diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc
index c17d1afc309c65035063348d4934ea1783b018ed..4a8ac441cfaf642fde58ee30865a22e83c065498 100644
--- a/paddle/fluid/operators/top_k_op.cc
+++ b/paddle/fluid/operators/top_k_op.cc
@@ -50,7 +50,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "(Tensor) The input of Topk op");
-    AddOutput("Out", "(Tensor) The output tensor of Topk op");
+    AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X");
     AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
     AddComment(R"DOC(
 Top K operator
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index 0cad224ca8860b0e4bc2e3f2bc1659235aadfe2d..9da8551eb2d7ea66ad434c42b54522432095ce29 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -256,65 +256,36 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
  * 3. go to the second setp, until one thread's topk value is null;
  * 4. go to the first setp, until get the topk value.
  */
-
 template <typename T, int MaxLength, int BlockSize>
 __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
-                             const T* src, int lds, int dim, int k,
-                             int grid_dim, int num) {
+                             const T* src, int lds, int dim, int k) {
   __shared__ Pair<T> sh_topk[BlockSize];
+  __shared__ int maxid[BlockSize / 2];
   const int tid = threadIdx.x;
   const int warp = threadIdx.x / 32;
+  output += blockIdx.x * output_stride;
+  indices += blockIdx.x * k;
 
-  const int bid = blockIdx.x;
-  for (int i = bid; i < num; i += grid_dim) {
-    int top_num = k;
-    __shared__ int maxid[BlockSize / 2];
-    T* out = output + i * output_stride;
-    int64_t* inds = indices + i * k;
-    Pair<T> topk[MaxLength];
-    int beam = MaxLength;
-    Pair<T> max;
-    bool is_empty = false;
-    bool firststep = true;
-
-    for (int j = 0; j < MaxLength; j++) {
-      topk[j].set(-INFINITY, -1);
-    }
-    while (top_num) {
-      ThreadGetTopK<T, MaxLength, BlockSize>(
-          topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid);
-
-      sh_topk[tid] = topk[0];
-      BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &out, &inds,
-                                           &beam, &top_num, tid, warp);
-    }
+  Pair<T> topk[MaxLength];
+  int beam = MaxLength;
+  Pair<T> max;
+  bool is_empty = false;
+  bool firststep = true;
+
+  for (int k = 0; k < MaxLength; k++) {
+    topk[k].set(-INFINITY, -1);
   }
-}
-
-inline static int GetDesiredBlockDim(int dim) {
-  if (dim > 128) {
-    return 256;
-  } else if (dim > 64) {
-    return 128;
-  } else if (dim > 32) {
-    return 64;
-  } else {
-    return 32;
+  while (k) {
+    ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
+                                           src + blockIdx.x * lds, &firststep,
+                                           &is_empty, &max, dim, tid);
+
+    sh_topk[tid] = topk[0];
+    BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
+                                         &indices, &beam, &k, tid, warp);
   }
 }
 
-#define FIXED_BLOCK_DIM_BASE(dim, ...) \
-  case (dim): {                        \
-    constexpr auto kBlockDim = (dim);  \
-    __VA_ARGS__;                       \
-  } break
-
-#define FIXED_BLOCK_DIM(...)                \
-  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
-
 template <typename T>
 class TopkOpCUDAKernel : public framework::OpKernel<T> {
  public:
@@ -327,38 +298,30 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     size_t k = static_cast<int>(ctx.Attr<int>("k"));
 
     const T* input_data = input->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
     // FIXME(typhoonzero): data is always converted to type T?
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
 
-    framework::DDim inputdims = input->dims();
-    const size_t input_height = framework::product(
-        framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
-    const size_t input_width = inputdims[inputdims.size() - 1];
-
+    size_t input_height = input->dims()[0];
+    size_t input_width = input->dims()[1];
     if (k > input_width) k = input_width;
 
     // NOTE: pass lds and dim same to input width.
     // NOTE: old matrix implementation of stride is different to eigen.
     // TODO(typhoonzero): refine this kernel.
-    const int kMaxHeight = 2048;
-    int gridx = input_height < kMaxHeight ? input_height : kMaxHeight;
-    auto& dev_ctx = ctx.cuda_device_context();
-    switch (GetDesiredBlockDim(input_width)) {
-      FIXED_BLOCK_DIM(
-          KeMatrixTopK<T, 5,
-                       kBlockDim><<<gridx, kBlockDim, 0, dev_ctx.stream()>>>(
-              output_data, k, indices_data, input_data, input_width,
-              input_width, static_cast<int>(k), gridx, input_height));
-      default:
-        PADDLE_THROW("Error");
-    }
+    dim3 threads(256, 1);
+    dim3 grid(input_height, 1);
+
+    KeMatrixTopK<T, 5, 256><<<
+        grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              ctx.device_context())
+                              .stream()>>>(
+        output_data, output->dims()[1], indices_data, input_data, input_width,
+        input_width, static_cast<int>(k));
   }
 };
 
-#undef FIXED_BLOCK_DIM_BASE
-#undef FIXED_BLOCK_DIM
-
 }  // namespace operators
 }  // namespace paddle
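For reference, the contract both versions of the CUDA kernel implement: for every row of a row-major [rows x cols] matrix, emit the k largest values together with their column indices. A small self-contained host-side sketch of the same semantics (illustrative only; it mirrors neither kernel's implementation details):

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    void TopKRowwise(const std::vector<float>& in, size_t rows, size_t cols,
                     size_t k, std::vector<float>* out_vals,
                     std::vector<int64_t>* out_inds) {
      out_vals->resize(rows * k);
      out_inds->resize(rows * k);
      for (size_t r = 0; r < rows; ++r) {
        const float* row = in.data() + r * cols;
        std::vector<int64_t> idx(cols);
        std::iota(idx.begin(), idx.end(), 0);
        // Order the first k index slots by descending value.
        std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                          [row](int64_t a, int64_t b) { return row[a] > row[b]; });
        for (size_t j = 0; j < k; ++j) {
          (*out_vals)[r * k + j] = row[idx[j]];
          (*out_inds)[r * k + j] = idx[j];
        }
      }
    }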
diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h
index 76ece57b39919148da04caecaa43ea9d2b9d95df..054dd481994d03f71b0ed5dc73e103085f6c91aa 100644
--- a/paddle/fluid/operators/top_k_op.h
+++ b/paddle/fluid/operators/top_k_op.h
@@ -34,6 +34,7 @@ class TopkKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
+    // FIXME: only deal with matrix(2d tensor).
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
     auto* indices = ctx.Output<Tensor>("Indices");
@@ -43,6 +44,8 @@ class TopkKernel : public framework::OpKernel<T> {
     T* output_data = output->mutable_data<T>(ctx.GetPlace());
     int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
 
+    auto eg_input = EigenMatrix<T>::From(*input);
+
     // reshape input to a flattern matrix(like flat_inner_dims)
     framework::DDim inputdims = input->dims();
     const size_t row = framework::product(
@@ -50,7 +53,7 @@ class TopkKernel : public framework::OpKernel<T> {
     const size_t col = inputdims[inputdims.size() - 1];
     Eigen::DSizes<int, 2> flat2dims(row, col);
     // NOTE: eigen shape doesn't affect paddle tensor.
-    auto eg_input = EigenMatrix<T>::Reshape(*input, inputdims.size() - 1);
+    eg_input.reshape(flat2dims);
 
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 4e2b3ac0e3eddd664159658d3b16a7ca823b589c..9ac8ae2ac7b78bef6019399eac866f70dc43bb4e 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -27,6 +27,12 @@ ENDIF()
 cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS})
 cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
+set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
+set(MYDEPS ${MYDEPS} libcmt shlwapi)
+set(MYDEPS ${MYDEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX})
+set(MYDEPS ${MYDEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX})
+set(MYDEPS ${MYDEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX})
+
 nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
 
 cc_library(place SRCS place.cc DEPS enforce boost)
@@ -58,6 +64,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 cc_test(init_test SRCS init_test.cc DEPS device_context)
 
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
+target_link_libraries(cudnn_helper_test ${MYDEPS})
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index b6e15862c16d7b3f3a0b6a1ae8fb2312789a4d21..8fe6c20be13ff43f4bdffd3cd16b4fcac08c7516 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -68,7 +68,14 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) {
     }                                                     \
   } while (false)
 #else
-#define CUDNN_ENFORCE(condition)
+// windows
+#define CUDNN_ENFORCE(condition)                                    \
+  do {                                                              \
+    cudnnStatus_t status = condition;                               \
+    if (status != CUDNN_STATUS_SUCCESS) {                           \
+      std::cerr << ::paddle::platform::cudnnGetErrorString(status); \
+    }                                                               \
+  } while (false)
 #endif
 
 enum class DataLayout {  // Not use
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index baa123fd0f21d057f34350036a5f77c78409db28..241f79d8e75ae03933205848e3d43433eb2885d9 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -127,7 +127,7 @@ struct EOFException : public std::exception {
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
 #else
 // there is no equivalent intrinsics in msvc.
-#define UNLIKELY(condition) (condition == 0)
+#define UNLIKELY(condition) (condition)
 #endif
 
 template <typename... Args>
@@ -309,7 +309,6 @@ inline void throw_on_error(T e) {
 #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
   __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
-
 #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                  \
   do {                                                       \
     if (UNLIKELY(nullptr == (__VAL))) {                      \
@@ -330,26 +329,26 @@ inline void throw_on_error(T e) {
     }                                                        \
   } while (0)
 #else
-#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0)==(__VAL1))
-#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0)!=(__VAL1))
-#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0)>(__VAL1))
-#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0)>=(__VAL1))
-#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0)<(__VAL1))
-#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0)<=(__VAL1))
-
-#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
-  do {                                                                 \
-    if (!((__VAL0)__CMP(__VAL1))) {                                    \
-      PADDLE_THROW("Windows disable the enforce. Enforce failed.");    \
-    }                                                                  \
-  } while(0)
-#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...)                        \
-  do {                                                              \
-    if (nullptr == (__VAL1)) {                                      \
-      PADDLE_THROW("Windows disable the enforce. Enforce failed");  \
-    }                                                               \
-  } while(0)
-#endif  // !_WIN32
+#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1))
+#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1))
+#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1))
+#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1))
+#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1))
+#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1))
+
+#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
+  do {                                                                 \
+    if (!((__VAL0)__CMP(__VAL1))) {                                    \
+      PADDLE_THROW("Enforce is disabled on Windows; check failed.");   \
+    }                                                                  \
+  } while (0)
+#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...)                           \
+  do {                                                                 \
+    if (nullptr == (__VAL1)) {                                         \
+      PADDLE_THROW("Enforce is disabled on Windows; check failed.");   \
+    }                                                                  \
+  } while (0)
+#endif  // !_WIN32
 
 }  // namespace platform
 }  // namespace paddle
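On the UNLIKELY fallback: the macro is only ever consumed as a boolean test (for example if (UNLIKELY(!tensor.IsInitialized())) in operator.cc above), so the MSVC branch must preserve the truth value of its argument; a definition that compares against zero inverts every such test, which is why the fallback is reduced to a plain pass-through here. A minimal sketch of both branches (illustrative, outside the Paddle tree):

    #include <cstdio>

    #if !defined(_WIN32)
    #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
    #else
    // No __builtin_expect intrinsic on MSVC: degrade to a non-inverting test.
    #define UNLIKELY(condition) (condition)
    #endif

    int main() {
      const int* p = nullptr;
      if (UNLIKELY(p == nullptr)) {
        std::puts("null-pointer branch taken, as expected");
      }
      return 0;
    }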