diff --git a/CMakeLists.txt b/CMakeLists.txt
index cb646d3ce5d660734a27c0ac9f18ad54cd3e1c1b..c31f51a3f7371bd7b1b0ca3234091a35868806ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -208,10 +208,10 @@ include(external/xxhash)    # download xxhash
 include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
+include(external/warpctc)   # download, build, install warpctc
 
 if (NOT WIN32)
-# there is no official support of warpctc, nccl, cupti in windows
-include(external/warpctc)   # download, build, install warpctc
+# there is no official support of nccl, cupti in windows
 include(cupti)
 include(external/gzstream)
 endif (NOT WIN32)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 07e1137e16afc1e4e9ab9640e1ccaea8008a0cd2..7b937c93febdfa1d7d5f4c73fc2a5830322688e5 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
 # Used in unit test test_WarpCTCLayer
 SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
     CACHE PATH "Warp-ctc Library Directory" FORCE)
-SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
-    CACHE FILEPATH "Warp-ctc Library" FORCE)
 
-IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
+IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32)
     SET(USE_OMP OFF)
 ELSE()
     SET(USE_OMP ON)
 ENDIF()
 
+IF(WIN32)
+    SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git")
+ELSE()
+    SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git")
+ENDIF()
+
 ExternalProject_Add(
     extern_warpctc
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/dzhwinter/warp-ctc.git"
+    GIT_REPOSITORY ${WARPCTC_REPOSITORY}
     PREFIX          ${WARPCTC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                     -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
                     -DWITH_GPU=${WITH_GPU}
                     -DWITH_OMP=${USE_OMP}
@@ -59,6 +67,18 @@ ExternalProject_Add(
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
+IF(WIN32)
+    IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}")
+        add_custom_command(TARGET extern_warpctc POST_BUILD
+                COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}
+                )
+    ENDIF()
+    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+            CACHE FILEPATH "Warp-ctc Library" FORCE)
+else(WIN32)
+    SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+            CACHE FILEPATH "Warp-ctc Library" FORCE)
+ENDIF(WIN32)
 
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
 INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers.
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 2ced43f9e6c60da642f7a6252f889d9c9ab9748f..70d159b4f3549662e080794efad8af943ce1f0bc 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,7 +84,7 @@ function(op_library TARGET)
     endif()
     if (WIN32)
     # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
         if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
           return()
         endif()
diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc
index a5acfd70449e92663cb66ef90a141c087ff6ec88..5fcb17b9f3ac390548aba33db7d0b8350cde7e00 100644
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -16,100 +16,25 @@ limitations under the License. */
 #include <functional>
 #include <vector>
 
+#include "ngraph/ngraph.hpp"
 #include "paddle/fluid/framework/ngraph_bridge.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/ngraph/ngraph_ops.h"
 #include "paddle/fluid/platform/enforce.h"
-
-#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
 
 namespace paddle {
 namespace framework {
 
-static std::shared_ptr<ngraph::Node> GetNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string name,
-    const VariableNameMap& var_map,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto& var_names = var_map.at(name);
-  PADDLE_ENFORCE_EQ(var_names.size(), 1,
-                    "op %s name %s expects one associated var", op->Type(),
-                    name);
-  if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
-    return (*ngb_node_map)[var_names[0]];
-  } else {
-    return nullptr;
-  }
-}
-
-static std::shared_ptr<ngraph::Node> GetInputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string name,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  return GetNode(op, name, op->Inputs(), ngb_node_map);
-}
-
-static std::shared_ptr<ngraph::Node> GetOutputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string name,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  return GetNode(op, name, op->Outputs(), ngb_node_map);
-}
-
-static void SetOutputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string name,
-    std::shared_ptr<ngraph::Node> node,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto& var_names = op->Outputs().at(name);
-  if (var_names.size() == 1) {
-    (*ngb_node_map)[var_names[0]] = node;
-  } else if (var_names.size() == 0) {
-    (*ngb_node_map)[""] = node;
-  } else {
-    PADDLE_THROW("name %s has more than 1 var_names.", name);
-  }
-}
-
-static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
-                      const std::string name) {
-  auto& outputs = op->Outputs();
-  if (outputs.find(name) == outputs.end()) return false;
-  return outputs.at(name).size() > 0;
-}
-
-template <typename T>
-static void BuildBinaryNode(
-    const std::shared_ptr<OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto x = GetInputNode(op, "X", ngb_node_map);
-  auto y = GetInputNode(op, "Y", ngb_node_map);
-  auto out = std::make_shared<T>(x, y);
-  SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
-template <typename T>
-static void BuildUnaryNode(
-    const std::shared_ptr<OperatorBase>& op,
-    std::shared_ptr<
-        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-        ngb_node_map) {
-  auto input = GetInputNode(op, "X", ngb_node_map);
-  auto out = std::make_shared<T>(input);
-  SetOutputNode(op, "Out", out, ngb_node_map);
-}
-
 std::map<std::string,
          std::function<void(const std::shared_ptr<OperatorBase>&,
                             std::shared_ptr<std::unordered_map<
                                 std::string, std::shared_ptr<ngraph::Node>>>)>>
-    NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
-                                 {"tanh", BuildUnaryNode<ngraph::op::Tanh>}};
+    NgraphBridge::NG_NODE_MAP = {
+        {"mul", paddle::operators::ngraphs::BuildMulNode},
+        {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode},
+        {"relu", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Relu>},
+        {"tanh", paddle::operators::ngraphs::BuildUnaryNode<ngraph::op::Tanh>}};
 
 void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
   auto& op_type = op->Type();
diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc
index e2cdfc845fe531733acc65746e661bfc2eed4890..23f681ce886fd0d8c113ffe4e80e25e6a803e31b 100644
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -278,7 +278,8 @@ std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
     ngraph::runtime::Backend::create("CPU");
 
 void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
-  op->RuntimeInferShape(scope_, place_);
+  RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_);
+  op->RuntimeInferShape(scope_, place_, ctx);
   for (auto& var_name_item : op->Inputs()) {
     for (auto& var_name : var_name_item.second) {
       auto* var = scope_.FindVar(var_name);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index a62afe248baa2ab57edabfaac05bb858a1a1280c..8c83748668e9f91e9c333beb8494c3ef1db875dc 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -137,6 +137,23 @@ static LoD GetLoD(const Scope& scope, const std::string& name) {
   }
 }
 
+RuntimeContext::RuntimeContext(const VariableNameMap& innames,
+                               const VariableNameMap& outnames,
+                               const Scope& scope) {
+  for (auto& var_name_item : innames) {
+    std::vector<Variable*>& input_vars = inputs[var_name_item.first];
+    for (auto& var_name : var_name_item.second) {
+      input_vars.push_back(scope.FindVar(var_name));
+    }
+  }
+  for (auto& var_name_item : outnames) {
+    std::vector<Variable*>& output_vars = outputs[var_name_item.first];
+    for (auto& var_name : var_name_item.second) {
+      output_vars.push_back(scope.FindVar(var_name));
+    }
+  }
+}
+
 void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
   VLOG(4) << place << " " << DebugStringEx(&scope);
   if (platform::is_gpu_place(place)) {
@@ -412,11 +429,48 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
   return var != nullptr;
 }
 
+const Variable* ExecutionContext::InputVar(const std::string& name) const {
+  auto it = ctx_.inputs.find(name);
+  if (it == ctx_.inputs.end()) return nullptr;
+
+  PADDLE_ENFORCE_LE(it->second.size(), 1UL,
+                    "Operator %s's input %s should contain only one variable.",
+                    op_.Type(), name);
+  return it->second.empty() ? nullptr : it->second[0];
+}
+
+const Variable* ExecutionContext::LegacyInputVar(
+    const std::string& name) const {
+  auto ipt = op_.Input(name);
+  return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+}
+
+Variable* ExecutionContext::OutputVar(const std::string& name) const {
+  auto it = ctx_.outputs.find(name);
+  if (it == ctx_.outputs.end()) return nullptr;
+
+  PADDLE_ENFORCE_LE(it->second.size(), 1UL,
+                    "Operator %s's output %s should contain only one variable.",
+                    op_.Type(), name);
+  return it->second.empty() ? nullptr : it->second[0];
+}
+
+Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const {
+  auto opt = op_.Output(name);
+  return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
+}
+
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
   return Input<LoDTensor>(name);
 }
 
+template <>
+const Tensor* ExecutionContext::LegacyInput<Tensor>(
+    const std::string& name) const {
+  return LegacyInput<LoDTensor>(name);
+}
+
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const {
@@ -441,6 +495,11 @@ Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
   return Output<LoDTensor>(name);
 }
 
+template <>
+Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const {
+  return LegacyOutput<LoDTensor>(name);
+}
+
 template <>
 std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
     const std::string& name) const {
@@ -477,23 +536,22 @@ bool OpSupportGPU(const std::string& op_type) {
 
 class RuntimeInferShapeContext : public InferShapeContext {
  public:
-  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
-      : op_(op), scope_(scope) {}
+  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope,
+                           const RuntimeContext& ctx)
+      : op_(op), scope_(scope), ctx_(ctx) {}
 
   bool HasInput(const std::string& name) const override {
     // has only one input
-    const auto& ins = op_.Inputs();
+    const auto& ins = ctx_.inputs;
     auto it = ins.find(name);
     if (it == ins.end()) {
       return false;
     }
     const auto& in = it->second;
-    if (in.size() == 0 || in[0] == kEmptyVarName) {
-      return false;
-    }
+    if (in.size() == 0) return false;
     PADDLE_ENFORCE_EQ(in.size(), 1UL,
                       "Input %s should not have more than one inputs", name);
-    return scope_.FindVar(in[0]) != nullptr;
+    return in[0] != nullptr;
   }
 
   bool HasOutput(const std::string& name) const override {
@@ -678,6 +736,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
  private:
   const OperatorBase& op_;
   const Scope& scope_;
+  const RuntimeContext& ctx_;
 };
 
 static void CheckTensorNANOrInf(const std::string& name,
@@ -696,15 +755,15 @@ static void CheckTensorNANOrInf(const std::string& name,
 }
 
 void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
-                                           const platform::Place& place) const {
-  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+                                           const platform::Place& place,
+                                           const RuntimeContext& ctx) const {
+  RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx);
   this->InferShape(&infer_shape_ctx);
 }
 
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-  this->InferShape(&infer_shape_ctx);
+  RuntimeContext ctx(Inputs(), Outputs(), scope);
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
@@ -718,15 +777,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   OpKernelMap& kernels = kernels_iter->second;
 
-  // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
-  // transform functions are ready.
-
-  // for (auto& candidate : kKernelPriority) {
-  //   Do selection
-  // }
-
-  auto expected_kernel_key =
-      this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
+  auto expected_kernel_key = this->GetExpectedKernelType(
+      ExecutionContext(*this, scope, *dev_ctx, ctx));
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
@@ -748,7 +800,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
   auto* transfer_scope =
-      TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
+      PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx);
 
   // exec scope is the scope that kernel actually executed on.
   const Scope& exec_scope =
@@ -758,7 +810,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     dev_ctx = pool.Get(expected_kernel_key.place_);
   }
 
-  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
+  RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx);
+  this->InferShape(&infer_shape_ctx);
+  // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
+  // not Scope. Imperative mode only pass inputs and get outputs.
+  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx));
 
   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
@@ -782,6 +838,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     }
   }
 }
+
 void OperatorWithKernel::TransferInplaceVarsBack(
     const Scope& scope, const std::vector<std::string>& inplace_vars,
     const Scope& transfer_scope) const {
@@ -797,13 +854,19 @@ void OperatorWithKernel::TransferInplaceVarsBack(
   }
 }
 
-Scope* OperatorWithKernel::TryTransferData(
+Scope* OperatorWithKernel::PrepareData(
     const Scope& scope, const OpKernelType& expected_kernel_key,
-    std::vector<std::string>* transfered_inplace_vars) const {
+    std::vector<std::string>* transfered_inplace_vars,
+    RuntimeContext* ctx) const {
   Scope* new_scope = nullptr;
   for (auto& var_name_item : Inputs()) {
-    for (auto& var_name : var_name_item.second) {
+    std::vector<Variable*>& input_vars = ctx->inputs[var_name_item.first];
+
+    for (size_t i = 0; i < var_name_item.second.size(); ++i) {
+      auto& var_name = var_name_item.second[i];
       auto* var = scope.FindVar(var_name);
+      input_vars[i] = var;
+
       // Only tensor can be tranfer to another device.
       if (var == nullptr || !VarIsTensor(*var)) {
         continue;
@@ -851,6 +914,7 @@ Scope* OperatorWithKernel::TryTransferData(
       }
 
       auto* trans_var = new_scope->Var(var_name);
+      input_vars[i] = trans_var;
 
       Tensor out;
       TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 0a6a28a5bce01d71cf56f25f5556033db94452c2..39190d07b4ccdd5ffd03e2d50bb0e577ac00af75 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -70,6 +70,15 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
 class OperatorBase;
 class ExecutionContext;
 
+class RuntimeContext {
+ public:
+  RuntimeContext(const VariableNameMap& innames,
+                 const VariableNameMap& outnames, const Scope& scope);
+
+  VariableValueMap inputs;
+  VariableValueMap outputs;
+};
+
 /**
  * OperatorBase has the basic elements that Net will call to do computation.
  * Only CreateOperator from OpRegistry will new Operator directly. User
@@ -129,7 +138,8 @@ class OperatorBase {
 
   void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
   virtual void RuntimeInferShape(const Scope& scope,
-                                 const platform::Place& place) const {}
+                                 const platform::Place& place,
+                                 const RuntimeContext& ctx) const {}
 
  protected:
   std::string type_;
@@ -156,8 +166,9 @@ class OperatorBase {
 class ExecutionContext {
  public:
   ExecutionContext(const OperatorBase& op, const Scope& scope,
-                   const platform::DeviceContext& device_context)
-      : op_(op), scope_(scope), device_context_(device_context) {}
+                   const platform::DeviceContext& device_context,
+                   const RuntimeContext& ctx)
+      : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {}
 
   const OperatorBase& op() const { return op_; }
 
@@ -180,15 +191,9 @@ class ExecutionContext {
     return op_.Outputs(name).size();
   }
 
-  const Variable* InputVar(const std::string& name) const {
-    auto ipt = op_.Input(name);
-    return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-  }
+  const Variable* InputVar(const std::string& name) const;
 
-  Variable* OutputVar(const std::string& name) const {
-    auto opt = op_.Output(name);
-    return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
-  }
+  Variable* OutputVar(const std::string& name) const;
 
   const std::vector<const Variable*> MultiInputVar(
       const std::string& name) const {
@@ -227,6 +232,22 @@ class ExecutionContext {
     return var == nullptr ? nullptr : var->GetMutable<T>();
   }
 
+  template <typename T>
+  const T* LegacyInput(const std::string& name) const {
+    auto* var = LegacyInputVar(name);
+    return var == nullptr ? nullptr : &var->Get<T>();
+  }
+
+  template <typename T>
+  T* LegacyOutput(const std::string& name) const {
+    auto var = LegacyOutputVar(name);
+    return var == nullptr ? nullptr : var->GetMutable<T>();
+  }
+
+  const Variable* LegacyInputVar(const std::string& name) const;
+
+  Variable* LegacyOutputVar(const std::string& name) const;
+
   template <typename T>
   const std::vector<const T*> MultiInput(const std::string& name) const {
     auto names = op_.Inputs(name);
@@ -286,11 +307,16 @@ class ExecutionContext {
   const OperatorBase& op_;
   const Scope& scope_;
   const platform::DeviceContext& device_context_;
+  const RuntimeContext& ctx_;
 };
 
 template <>
 const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
 
+template <>
+const Tensor* ExecutionContext::LegacyInput<Tensor>(
+    const std::string& name) const;
+
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const;
@@ -298,6 +324,9 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
 template <>
 Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
 
+template <>
+Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const;
+
 template <>
 std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
     const std::string& name) const;
@@ -350,8 +379,8 @@ class OperatorWithKernel : public OperatorBase {
     OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
   }
 
-  void RuntimeInferShape(const Scope& scope,
-                         const platform::Place& place) const override;
+  void RuntimeInferShape(const Scope& scope, const platform::Place& place,
+                         const RuntimeContext& ctx) const override;
 
  protected:
   virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
@@ -371,9 +400,10 @@ class OperatorWithKernel : public OperatorBase {
    *
    * * transfered_inplace_vars is a output vector.
    */
-  Scope* TryTransferData(
-      const Scope& scope, const OpKernelType& expected_kernel_key,
-      std::vector<std::string>* transfered_inplace_vars) const;
+  Scope* PrepareData(const Scope& scope,
+                     const OpKernelType& expected_kernel_key,
+                     std::vector<std::string>* transfered_inplace_vars,
+                     RuntimeContext* ctx) const;
 
   void TransferInplaceVarsBack(const Scope& scope,
                                const std::vector<std::string>& inplace_vars,
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 2de6233a9e0d320ec9a06d547db3575eb61925c0..938e2024c3359c2acd65a1aa4af875a8350e4c58 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -28,8 +28,11 @@ class OperatorBase;
 class OpDesc;
 class InferShapeContext;
 class BlockDesc;
+class Variable;
 
 using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+// TODO(panyx0718): Replace vector with something like gtl::Vector.
+using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
 
 // The order should be as same as framework.proto
 using Attribute =
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 257bfc0a3f926d20abc4647b27e8e9cc2c49e014..d9b0c66e5727e80486423ab065dccf9105775127 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -64,9 +64,7 @@ endif()
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
-if (NOT WIN32)
-    set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
-endif()
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc
index ae9765b76138a34935619b662a8ffb7f46c8300c..7f2bde55c98277b9fd4b3374657001c42d673d43 100644
--- a/paddle/fluid/operators/beam_search_decode_op.cc
+++ b/paddle/fluid/operators/beam_search_decode_op.cc
@@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& dev_ctx = *pool.Get(dev_place);
 
-    framework::ExecutionContext ctx(*this, scope, dev_ctx);
+    framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope);
+    framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx);
 
     const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
     const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc
index 6fed9ba92c16477c79509a30911920e67b9f9bdd..e4604db3a381616c7420f816f0b49a015c925bd4 100644
--- a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc
+++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <nccl.h>
 #endif
 #include <sys/time.h>
+#include <limits>
 #include <thread>  // NOLINT
 
 #include "paddle/fluid/framework/data_type.h"
@@ -31,7 +32,12 @@ namespace distributed {
 
 class IOBufWriter {
  public:
-  static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) {
+  static void Append(const std::string& varname, butil::IOBuf* iobuf, int k,
+                     const char* v, int64_t vlen) {
+    if (vlen >= std::numeric_limits<int>::max() || vlen < 0) {
+      LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen;
+    }
+
     iobuf->append(reinterpret_cast<char*>(&k), 4);
     iobuf->append(reinterpret_cast<char*>(&vlen), 8);
     iobuf->append(v, vlen);
@@ -87,6 +93,10 @@ class IOBufWriter {
                              int k, const char* v, int64_t vlen,
                              bool in_cuda_pinned, void (*destroy)(void*),
                              void* user_data) {
+    if (vlen >= std::numeric_limits<int>::max() || vlen < 0) {
+      LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen;
+    }
+
 #ifdef PADDLE_WITH_BRPC_RDMA
     IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned,
                                     destroy, user_data);
@@ -134,7 +144,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
     request->set_type(::sendrecv::NCCL_ID);
     const ncclUniqueId& uid = var->Get<ncclUniqueId>();
     // TODO(gongwb): use append_zero to avoid data copy.
-    IOBufWriter::Append(iobuf,
+    IOBufWriter::Append(name, iobuf,
                         sendrecv::VariableMessage::kSerializedFieldNumber,
                         uid.internal, NCCL_UNIQUE_ID_BYTES);
     return;
@@ -149,7 +159,7 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
   // FIXME(gongwb): it seems that can use zero copy.
   if (var_is_not_stable) {
     IOBufWriter::Append(
-        iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
+        name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
         static_cast<const char*>(payload->ptr()), payload->memory_size());
   } else {
     if (platform::is_gpu_place(ctx.GetPlace())) {
@@ -171,10 +181,11 @@ void SerializeToIOBuf(const std::string& name, framework::Variable* var,
 
   if (var->IsType<framework::SelectedRows>()) {
     auto* slr = var->GetMutable<framework::SelectedRows>();
-    size_t rows_memory_size =
-        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
+    PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name());
+    size_t rows_memory_size = slr->rows().size() * sizeof(int64_t);
 
-    IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber,
+    IOBufWriter::Append(name, iobuf,
+                        ::sendrecv::VariableMessage::kRowsFieldNumber,
                         reinterpret_cast<const char*>(slr->rows().data()),
                         static_cast<int64_t>(rows_memory_size));
   }
diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc
index 299dfe35438c35ec922dcdc75bf774a00f111bbb..a9dea9cfd2eeaa7e7ed8f052d2f51f5893c1e2e3 100644
--- a/paddle/fluid/operators/distributed/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc_serde.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include <nccl.h>
 #endif
+#include <limits>
 #include <thread>  // NOLINT
 
 #include "google/protobuf/io/coded_stream.h"
@@ -102,6 +103,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
 
   e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
                             payload->memory_size());
+  if (payload->memory_size() >= std::numeric_limits<int>::max()) {
+    LOG(FATAL) << "AppendZeroCopy varname:" << name
+               << ", vlen:" << payload->memory_size();
+  }
   // steal reference of tensor data
   ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
   int num_slices = 2;       // only SelectedRows have rows buffer
@@ -115,7 +120,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   if (var->IsType<framework::SelectedRows>()) {
     auto* slr = var->GetMutable<framework::SelectedRows>();
     ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
+
+    PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name());
     size_t rows_memory_size = slr->rows().size() * sizeof(int64_t);
+
     e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
     slices[2] = ::grpc::Slice(e2.size());
     memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
index 33eded0e6c0d90dadeeb63594983224d795fa244..6a87178be5daa02444c41a26f6e6c067713dd96f 100644
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <iostream>
 #include <string>
+#include <typeindex>
 #include <vector>
 
 #include "paddle/fluid/framework/data_type.h"
@@ -23,9 +24,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/platform/port.h"
-
 #include "paddle/fluid/operators/distributed/send_recv.pb.h"
+#include "paddle/fluid/platform/port.h"
 
 namespace paddle {
 namespace operators {
@@ -83,6 +83,11 @@ inline framework::proto::VarType::Type ToVarType(
   }
 }
 
+template <template <typename> class T, typename Elem>
+std::string VectorElemName(const T<Elem>& arg) {
+  return typeid(Elem).name();
+}
+
 }  // namespace distributed
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc
index 921c96b5839a37a4024b17b9ff71442f31b5b8f0..47ff568a1135f2f0a146faa4d5d6fc422a344f51 100644
--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -118,7 +118,7 @@ bool VariableResponse::CopyLodTensorData(
 
   VLOG(6) << "Tensor.memory_size = " << tensor->memory_size()
           << ", Buffer Size = " << length;
-  PADDLE_ENFORCE_EQ(tensor->memory_size(), length);
+  PADDLE_ENFORCE_EQ(tensor->memory_size(), static_cast<unsigned int>(length));
   return ReadRaw(input, ctx, tensor->place(), tensor_data, length);
 }
 
diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ed77ff5577cf4f45a8865db9b42e8bda9839478
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ngraph_ops.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/*
+ * This file contains the list of the ngraph operators for Paddle.
+ *
+ * ATTENTION: It requires some C++11 features, for lower version C++ or C, we
+ * might release another API.
+ */
+
+#pragma once
+
+#include "ops/binary_unnary_op.h"
+#include "ops/mul_op.h"
diff --git a/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e2f5e231c16cd0fad6db287aa19430c56b534fd
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/binary_unnary_op.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#pragma once
+
+#include <string>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+template <typename T>
+static void BuildBinaryNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
+  auto out = std::make_shared<T>(x, y);
+  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
+template <typename T>
+static void BuildUnaryNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto out = std::make_shared<T>(input);
+  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/operators/ngraph/ops/mul_op.h b/paddle/fluid/operators/ngraph/ops/mul_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e12e5d7c3da04706907c7ae63ce8046ce667f25
--- /dev/null
+++ b/paddle/fluid/operators/ngraph/ops/mul_op.h
@@ -0,0 +1,134 @@
+/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#pragma once
+
+#include <string>
+#include "ngraph/ngraph.hpp"
+#include "paddle/fluid/platform/ngraph_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace ngraphs {
+
+static void BuildMulNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  int x_num_col_dims = op_attrs.Get<int>("x_num_col_dims");
+  int y_num_col_dims = op_attrs.Get<int>("y_num_col_dims");
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
+
+  auto x_reshape = x;
+  auto y_reshape = y;
+
+  if (x->get_shape().size() > 2) {
+    auto x_2d = paddle::platform::FlattenTo2d(x->get_shape(), x_num_col_dims);
+    x_reshape = paddle::platform::NgReshaper(x, x_2d);
+  }
+
+  if (y->get_shape().size() > 2) {
+    auto y_2d = paddle::platform::FlattenTo2d(y->get_shape(), y_num_col_dims);
+    y_reshape = paddle::platform::NgReshaper(y, y_2d);
+  }
+
+  std::shared_ptr<ngraph::Node> out =
+      std::make_shared<ngraph::op::Dot>(x_reshape, y_reshape);
+
+  auto dummy_out = paddle::platform::GetOutputNode(op, "Out", ngb_node_map);
+  if (dummy_out && dummy_out->get_shape() != out->get_shape()) {
+    out = paddle::platform::NgReshaper(out, dummy_out->get_shape());
+  }
+  paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
+static void BuildMulGradNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
+  int x_num_col_dims = op_attrs.Get<int>("x_num_col_dims");
+  int y_num_col_dims = op_attrs.Get<int>("y_num_col_dims");
+  auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map);
+  auto y = paddle::platform::GetInputNode(op, "Y", ngb_node_map);
+  auto dout = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map);
+
+  bool is_dx = paddle::platform::HasOutput(op, "X@GRAD") ? true : false;
+  bool is_dy = paddle::platform::HasOutput(op, "Y@GRAD") ? true : false;
+
+  auto x_shape = x->get_shape();
+  auto y_shape = y->get_shape();
+
+  auto x_reshape = x;
+  auto y_reshape = y;
+
+  if (x_shape.size() > 2) {
+    auto x_2d_shape = paddle::platform::FlattenTo2d(x_shape, x_num_col_dims);
+    x_reshape = paddle::platform::NgReshaper(x, x_2d_shape);
+  }
+
+  if (y_shape.size() > 2) {
+    auto y_2d_shape = paddle::platform::FlattenTo2d(y_shape, y_num_col_dims);
+    y_reshape = paddle::platform::NgReshaper(y, y_2d_shape);
+  }
+
+  auto x_reshape_shape = x_reshape->get_shape();
+  std::reverse(x_reshape_shape.begin(), x_reshape_shape.end());
+  auto x_transpose = std::make_shared<ngraph::op::Reshape>(
+      x_reshape, ngraph::AxisVector{1, 0}, x_reshape_shape);
+
+  auto y_reshape_shape = y_reshape->get_shape();
+  std::reverse(y_reshape_shape.begin(), y_reshape_shape.end());
+  auto y_transpose = std::make_shared<ngraph::op::Reshape>(
+      y_reshape, ngraph::AxisVector{1, 0}, y_reshape_shape);
+
+  if (is_dx) {
+    if (dout->get_shape().size() > 2) {
+      auto dout_2d_shape = paddle::platform::FlattenTo2d(dout->get_shape(), 2);
+      dout = paddle::platform::NgReshaper(dout, dout_2d_shape);
+    }
+    auto dx = std::make_shared<ngraph::op::Dot>(dout, y_transpose);
+
+    if (dx->get_shape() == x_shape) {
+      paddle::platform::SetOutputNode(op, "X@GRAD", dx, ngb_node_map);
+    } else {
+      auto dx_reshape = paddle::platform::NgReshaper(dx, x_shape);
+      paddle::platform::SetOutputNode(op, "X@GRAD", dx_reshape, ngb_node_map);
+    }
+  }
+
+  if (is_dy) {
+    if (dout->get_shape().size() > 2) {
+      auto dout_2d_shape = paddle::platform::FlattenTo2d(dout->get_shape(), 2);
+      dout = paddle::platform::NgReshaper(dout, dout_2d_shape);
+    }
+    auto dy = std::make_shared<ngraph::op::Dot>(x_transpose, dout);
+
+    if (dy->get_shape() == y_shape) {
+      paddle::platform::SetOutputNode(op, "Y@GRAD", dy, ngb_node_map);
+    } else {
+      auto dy_reshape = paddle::platform::NgReshaper(dy, y_shape);
+      paddle::platform::SetOutputNode(op, "Y@GRAD", dy_reshape, ngb_node_map);
+    }
+  }
+}
+}  // namespace ngraphs
+}  // namespace operators
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/operators/transpose_mkldnn_op.cc b/paddle/fluid/operators/transpose_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..37f1cadc7d2ff248e8b6dcb3f0c8ba09f8ccd8b5
--- /dev/null
+++ b/paddle/fluid/operators/transpose_mkldnn_op.cc
@@ -0,0 +1,124 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/framework/data_layout_transform.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using framework::DataLayout;
+
+template <typename T>
+class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    const bool is_test = ctx.Attr<bool>("is_test");
+    PADDLE_ENFORCE(
+        is_test == true,
+        "ConvTransposeMKLDNN works only for inference!. Set is_test = True");
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+    std::vector<int> axis = ctx.Attr<std::vector<int>>("axis");
+    int ndims = axis.size();
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    const T* input_data = input->data<T>();
+
+    if (ndims == 1) {
+      output->ShareDataWith(*input);
+      return;
+    }
+
+    std::vector<int> nchw_axis(ndims, 0);
+    for (size_t i = 0; i < nchw_axis.size(); ++i) {
+      nchw_axis[i] = i;
+    }
+
+    std::vector<int> nchw_tz = paddle::framework::vectorize2int(input->dims());
+    std::string data_format = ctx.Attr<std::string>("data_format");
+
+    auto src_md =
+        input->format() != mkldnn::memory::format::nchw
+            ? platform::MKLDNNMemDesc(nchw_tz, platform::MKLDNNGetDataType<T>(),
+                                      input->format())
+            : Axis2MemoryDesc(nchw_tz, nchw_axis);
+
+    this->TransposeKernel(ctx.GetPlace(), Axis2MemoryDesc(nchw_tz, axis),
+                          src_md, output, input_data, nchw_tz, mkldnn_engine);
+  }
+
+ protected:
+  mkldnn::memory::desc Axis2MemoryDesc(std::vector<int>& nchw_tz,
+                                       std::vector<int>& axis) const {
+    mkldnn_memory_desc_t mem_fmt;
+
+    mem_fmt.primitive_kind = mkldnn_memory;
+    mem_fmt.ndims = axis.size();
+    for (unsigned int i = 0; i < nchw_tz.size(); ++i) {
+      mem_fmt.dims[i] = nchw_tz[i];  // logical dimensions (nchw format,
+                                     // regardless physical layout)
+    }
+    mem_fmt.data_type = mkldnn_f32;
+    mem_fmt.format = mkldnn_blocked;
+
+    unsigned int total_stride = 1;
+    for (int i = nchw_tz.size() - 1; i >= 0; --i) {
+      mem_fmt.layout_desc.blocking.padding_dims[i] =
+          nchw_tz[i];  // logical dimensions (nchw format, regardless physical
+                       // layout)
+      mem_fmt.layout_desc.blocking.block_dims[i] = 1;
+      mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0;  // no offset
+      mem_fmt.layout_desc.blocking.strides[0][axis[i]] = total_stride;
+      mem_fmt.layout_desc.blocking.strides[1][axis[i]] = 1;
+      total_stride *= nchw_tz[axis[i]];
+    }
+    mem_fmt.layout_desc.blocking.offset_padding = 0;  // no initial offset
+    return mem_fmt;
+  }
+
+  void TransposeKernel(platform::Place place, mkldnn::memory::desc md_o,
+                       mkldnn::memory::desc md_i, Tensor* output,
+                       const T* data_i, std::vector<int>& nchw_dims,
+                       const mkldnn::engine& eng) const {
+    // Make Memory primitive descriptors
+    auto mpd_o = mkldnn::memory::primitive_desc(md_o, eng);
+    auto mpd_i = mkldnn::memory::primitive_desc(md_i, eng);
+
+    auto data_o = output->mutable_data<T>(
+        place, paddle::memory::Allocator::kDefault, mpd_o.get_size());
+
+    auto src = mkldnn::memory(mpd_i, (T*)(data_i));
+    auto dst = mkldnn::memory(mpd_o, data_o);
+
+    auto r = mkldnn::reorder(src, dst);
+    mkldnn::stream(mkldnn::stream::kind::eager).submit({r}).wait();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNOpKernel<float>);
+REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::TransposeMKLDNNOpKernel<float>);
diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc
index bc1f59bc1a7641764d1a76fc54ebe835f50aee3d..b3b379d16ff099ba244fc92ed149a0089c2750e4 100644
--- a/paddle/fluid/operators/transpose_op.cc
+++ b/paddle/fluid/operators/transpose_op.cc
@@ -16,6 +16,10 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
@@ -53,11 +57,32 @@ class TransposeOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim("Out", out_dims);
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace(), layout_, library_);
+  }
 };
 
 class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
     AddInput(
         "X",
         "(Tensor) The input tensor, tensors with rank up to 6 are supported.");
@@ -67,6 +92,16 @@ class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
         "(vector<int>) A list of values, and the size of the list should be "
         "the same with the input tensor rank. This operator permutes the input "
         "tensor's axes according to the values given.");
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "data_format",
+        "(string, default NCHW) Only used in "
+        "An optional string from: \"NHWC\", \"NCHW\". "
+        "Defaults to \"NHWC\". Specify the data format of the output data, "
+        "the input will be transformed automatically. ")
+        .SetDefault("AnyLayout");
     AddComment(R"DOC(
 Transpose Operator.
 
@@ -144,8 +179,18 @@ class Transpose2Op : public TransposeOp {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    std::string data_format = ctx.Attr<std::string>("data_format");
+    framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+#ifdef PADDLE_WITH_MKLDNN
+    if (library_ == framework::LibraryType::kPlain &&
+        platform::CanMKLDNNBeUsed(ctx)) {
+      library_ = framework::LibraryType::kMKLDNN;
+      layout_ = framework::DataLayout::kMKLDNN;
+    }
+#endif
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.GetPlace(), layout_, library_);
   }
 };
 
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 5939c500c946c44579d1de645ac9700c7701a4e9..07159d4a12ef4b628f7705ed206d3334be46dfc8 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -16,9 +16,7 @@ if (CUPTI_FOUND)
     list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
-if (NOT WIN32)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
-endif(NOT WIN32)
 if (WITH_MKLML)
     cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
 endif()
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 550fe2edee13d628e761eca194809823537a4024..2f4f8101e4b957634d68fb0d64649ff8afba7c54 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -34,7 +34,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
 #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
   struct DynLoad__##__name {                                               \
     template <typename... Args>                                            \
-    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
       using cudnn_func = decltype(&::__name);                              \
       std::call_once(cudnn_dso_flag, []() {                                \
         cudnn_dso_handle = paddle::platform::dynload::GetCUDNNDsoHandle(); \
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index cc5cda6106c188f3156d33480b5d3641eed32556..eddebfe92ae80be7e70090aca041df1c6ea4cd11 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -201,6 +201,8 @@ void* GetCurandDsoHandle() {
 void* GetWarpCTCDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
   return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib");
+#elif defined(_WIN32)
+  return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "warpctc.dll");
 #else
   return GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so");
 #endif
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index 84fd2ce9987628a5ed29e4125a03dedb96e416c1..edb4c649addfaf941a00588395d9191038217979 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -18,6 +18,12 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
+#ifndef _WIN32
+#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__))
+#else
+#define DECLARE_TYPE(__name, ...) decltype(auto)
+#endif
+
 void* GetCublasDsoHandle();
 void* GetCUDNNDsoHandle();
 void* GetCUPTIDsoHandle();
diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h
index c3f9433503accf98d30ccaa57b9b4b8f3c68666a..d0619293acf2d2df0d925e969bdeb8e45cda6e2b 100644
--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -34,7 +34,7 @@ extern void* mklml_dso_handle;
 #define DYNAMIC_LOAD_MKLML_WRAP(__name)                                    \
   struct DynLoad__##__name {                                               \
     template <typename... Args>                                            \
-    auto operator()(Args... args) -> decltype(__name(args...)) {           \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {       \
       using mklmlFunc = decltype(&::__name);                               \
       std::call_once(mklml_dso_flag, []() {                                \
         mklml_dso_handle = paddle::platform::dynload::GetMKLMLDsoHandle(); \
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index 5d67658b94af75680a100e13eed7b6b052162e00..751aa54b1ad1a3864f3a2aa956a7051dd8bd3628 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -33,7 +33,7 @@ extern void* tensorrt_dso_handle;
 #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                      \
   struct DynLoad__##__name {                                            \
     template <typename... Args>                                         \
-    auto operator()(Args... args) -> decltype(__name(args...)) {        \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {    \
       using tensorrt_func = decltype(__name(args...)) (*)(Args...);     \
       std::call_once(tensorrt_dso_flag, []() {                          \
         tensorrt_dso_handle =                                           \
diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h
index 18ed9956f1841874b27c2493e2f3e22fdfbf0448..bc1977b05de5da062fae5662dfb51d4a74868c8a 100644
--- a/paddle/fluid/platform/dynload/warpctc.h
+++ b/paddle/fluid/platform/dynload/warpctc.h
@@ -34,7 +34,7 @@ extern void* warpctc_dso_handle;
 #define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                                      \
   struct DynLoad__##__name {                                                   \
     template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {           \
       using warpctcFunc = decltype(&::__name);                                 \
       std::call_once(warpctc_dso_flag, []() {                                  \
         warpctc_dso_handle = paddle::platform::dynload::GetWarpCTCDsoHandle(); \
diff --git a/paddle/fluid/platform/ngraph_helper.h b/paddle/fluid/platform/ngraph_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..889fb55c87d752a0a149529e6144d1fc345b8f63
--- /dev/null
+++ b/paddle/fluid/platform/ngraph_helper.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_NGRAPH
+#pragma once
+
+#include <functional>
+#include <string>
+#include <vector>
+#include "ngraph/ngraph.hpp"
+
+namespace paddle {
+namespace platform {
+
+static ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) {
+  auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1,
+                            std::multiplies<size_t>());
+  auto x2 = std::accumulate(std::begin(sh) + num, std::end(sh), 1,
+                            std::multiplies<size_t>());
+  size_t x1_l = static_cast<size_t>(x1);
+  size_t x2_l = static_cast<size_t>(x2);
+  return ngraph::Shape{x1_l, x2_l};
+}
+
+static std::shared_ptr<ngraph::Node> NgReshaper(
+    std::shared_ptr<ngraph::Node> input, ngraph::Shape shape) {
+  std::vector<size_t> input_order(input->get_shape().size());
+  std::iota(std::begin(input_order), std::end(input_order), 0);
+  return std::make_shared<ngraph::op::Reshape>(
+      input, ngraph::AxisVector(input_order), shape);
+}
+
+static std::shared_ptr<ngraph::Node> GetNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    const std::string prm, const paddle::framework::VariableNameMap& var_map,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto& var_names = var_map.at(prm);
+  PADDLE_ENFORCE_EQ(var_names.size(), 1,
+                    "op %s prm %s expects one associated var", op->Type(), prm);
+  if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
+    return (*ngb_node_map)[var_names[0]];
+  } else {
+    return nullptr;
+  }
+}
+
+static std::shared_ptr<ngraph::Node> GetInputNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    const std::string prm,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, prm, op->Inputs(), ngb_node_map);
+}
+
+static std::shared_ptr<ngraph::Node> GetOutputNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    const std::string prm,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, prm, op->Outputs(), ngb_node_map);
+}
+
+static void SetOutputNode(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    const std::string prm, std::shared_ptr<ngraph::Node> node,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto& var_names = op->Outputs().at(prm);
+  if (var_names.size() == 1) {
+    (*ngb_node_map)[var_names[0]] = node;
+  } else if (var_names.size() == 0) {
+    (*ngb_node_map)[""] = node;
+  } else {
+    PADDLE_THROW("prm %s has more than 1 var_names.", prm);
+  }
+}
+
+static bool HasOutput(
+    const std::shared_ptr<paddle::framework::OperatorBase>& op,
+    const std::string prm) {
+  auto& outputs = op->Outputs();
+  if (outputs.find(prm) == outputs.end()) return false;
+  return outputs.at(prm).size() > 0;
+}
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index ad070171df32fd436f24613561d9bc384f79195a..c1b81159aca979efe4b46777a1cef49e44b95e27 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -55,7 +55,6 @@ static void *dlsym(void *handle, const char *symbol_name) {
 
 static void *dlopen(const char *filename, int flag) {
   std::string file_name(filename);
-  file_name.replace(0, file_name.size() - 1, '/', '\\');
   HMODULE hModule = LoadLibrary(file_name.c_str());
   if (!hModule) {
     throw std::runtime_error(file_name + " not found.");
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index b00510d4438673f2d27c82fca1fcd4fc16bc7b68..8f3660ca387ba10309f829bd04ac2ffdc573f3d6 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -102,6 +102,13 @@ def __bootstrap__():
     import sys
     import os
     import platform
+
+    if os.name == 'nt':
+        third_lib_path = os.path.abspath(os.path.dirname(
+            __file__)) + os.sep + '..' + os.sep + 'libs'
+        os.environ['path'] += ';' + third_lib_path
+        sys.path.append(third_lib_path)
+
     from . import core
 
     in_test = 'unittest' in sys.modules
@@ -128,13 +135,12 @@ def __bootstrap__():
         'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
         'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
         'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir', 'pe_profile_fname'
+        'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
 
     if os.name != 'nt':
-        read_env_flags.append('warpctc_dir')
         read_env_flags.append('cpu_deterministic')
 
     if core.is_compiled_with_dist():
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index d0bd78454db91d811b01e629e5668f818e79e992..b5d603d4781a7fa0b964b635e8f94f2557aeccd8 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 
 import collections
 import contextlib
+import os
 import re
 import six
 import sys
@@ -27,11 +28,18 @@ from .proto import framework_pb2
 try:
     from . import core
 except ImportError as e:
-    raise ImportError(
-        """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
-    if you encounters \"libmkldnn.so not found\" errors. If you have python
-    installed in other directory, replace \"/usr/local/lib\" with your own
-    directory. The original error is: \n""" + cpt.get_exception_message(e))
+    if os.name == 'nt':
+        raise ImportError(
+            """NOTE: You may need to run \"set PATH=c:\python27\lib:%PATH%\"
+        if you encounters \"mkldnn.dll not found\" errors. If you have python
+        installed in other directory, replace \"c:\python27\lib" with your own
+        directory. The original error is: \n""" + cpt.get_exception_message(e))
+    else:
+        raise ImportError(
+            """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
+        if you encounters \"libmkldnn.so not found\" errors. If you have python
+        installed in other directory, replace \"/usr/local/lib\" with your own
+        directory. The original error is: \n""" + cpt.get_exception_message(e))
 except Exception as e:
     raise e
 from . import unique_name
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..2bd9bf843039573862a22c85557d416bf82b41f6
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py
@@ -0,0 +1,58 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh
+
+
+class TestNGRAPHReluDim2(TestRelu):
+    def setUp(self):
+        super(TestNGRAPHReluDim2, self).setUp()
+
+
+class TestNGRAPHTanhDim2(TestTanh):
+    def setUp(self):
+        super(TestNGRAPHTanhDim2, self).setUp()
+
+
+class TestNGRAPHReluDim4(TestRelu):
+    def setUp(self):
+        super(TestNGRAPHReluDim4, self).setUp()
+
+        x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
+        # The same reason with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        out = np.maximum(x, 0)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+
+class TestNGRAPHTanhDim4(TestTanh):
+    def setUp(self):
+        super(TestNGRAPHTanhDim4, self).setUp()
+
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
+        }
+        self.outputs = {'Out': np.tanh(self.inputs['X'])}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6aba62f7c08e3fe646372c851622f2e321b3aee2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from paddle.fluid.tests.unittests.test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2
+
+
+class TestNGRAPHMulOp(TestMulOp):
+    def init_dtype_type(self):
+        pass
+
+
+class TestNGRAPHMulOp2(TestMulOp2):
+    def init_dtype_type(self):
+        pass
+
+
+class TestNGRAPHFP16MulOp1(TestFP16MulOp1):
+    def init_dtype_type(self):
+        pass
+
+
+class TestNGRAPHFP16MulOp2(TestFP16MulOp2):
+    def init_dtype_type(self):
+        pass
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..61ac8790112ceadfdef7b18aad70af77644581cd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_transpose_mkldnn_op.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+
+from test_transpose_op import TestTransposeOp
+
+
+class TestTransposeMKLDNN(TestTransposeOp):
+    def init_op_type(self):
+        self.op_type = "transpose2"
+        self.use_mkldnn = True
+        self.is_test = True
+        return
+
+    def test_check_grad(self):
+        return
+
+    def test_check_grad_no_input(self):
+        return
+
+    def test_check_grad_no_filter(self):
+        return
+
+
+class TestCase0MKLDNN(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (3, )
+        self.axis = (0, )
+
+
+class TestCase1a(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (3, 4, 5)
+        self.axis = (0, 2, 1)
+
+
+class TestCase1b(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (3, 4, 5)
+        self.axis = (2, 1, 0)
+
+
+class TestCase2(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5)
+        self.axis = (0, 2, 3, 1)
+
+
+class TestCase3(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.axis = (4, 2, 3, 1, 0)
+
+
+class TestCase4(TestTransposeMKLDNN):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6, 1)
+        self.axis = (4, 2, 3, 1, 0, 5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py
index bbcabb751f0761705ff268c4408dc8673bb01b81..93be9d28da7a73f4fa972acf0dbd95167e7dfca3 100644
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
@@ -21,15 +21,24 @@ from op_test import OpTest
 
 class TestTransposeOp(OpTest):
     def setUp(self):
+        self.init_op_type()
         self.initTestCase()
-        self.op_type = "transpose2"
         self.inputs = {'X': np.random.random(self.shape).astype("float32")}
-        self.attrs = {'axis': list(self.axis)}
+        self.attrs = {
+            'axis': list(self.axis),
+            'use_mkldnn': self.use_mkldnn,
+            'is_test': self.is_test,
+        }
         self.outputs = {
             'XShape': np.random.random(self.shape).astype("float32"),
             'Out': self.inputs['X'].transpose(self.axis)
         }
 
+    def init_op_type(self):
+        self.op_type = "transpose2"
+        self.use_mkldnn = False
+        self.is_test = False
+
     def test_check_output(self):
         self.check_output(no_check_set=['XShape'])
 
diff --git a/python/setup.py.in b/python/setup.py.in
index cf8f28bd250d82c18b296a4d01c3a9856d801b8c..22b9537a90e79c2571f61ec0dc156b602df784d6 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -160,10 +160,11 @@ if '${WITH_FLUID_ONLY}'== 'OFF':
 
 # put all thirdparty libraries in paddle.libs
 libs_path='${PADDLE_BINARY_DIR}/python/paddle/libs'
-if os.name != 'nt':
-    package_data['paddle.libs']= []
-    package_data['paddle.libs']=['libwarpctc' + ext_name]
-    shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+
+package_data['paddle.libs']= []
+package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name]
+shutil.copy('${WARPCTC_LIBRARIES}', libs_path)
+
 if '${WITH_MKL}' == 'ON':
     shutil.copy('${MKLML_LIB}', libs_path)
     shutil.copy('${MKLML_IOMP_LIB}', libs_path)