diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 0d9c460628e17186152462c313937aff5490e723..323e743087ffbc0f979768bb9a8b8dd7eaec25b2 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -132,6 +132,22 @@ if(WITH_MKLDNN)
   pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn)
 endif()
 
+if(WITH_IPU)
+  pass_library(forward_graph_extract_pass base DIR ipu)
+  pass_library(optimizer_extract_pass base DIR ipu)
+  pass_library(optimizer_state_align_pass base DIR ipu)
+  pass_library(ipu_graph_builder_pass base DIR ipu)
+  pass_library(ipu_runtime_replacer_pass base DIR ipu)
+  pass_library(inference_process_pass base DIR ipu)
+  pass_library(inference_postprocess_pass base DIR ipu)
+  pass_library(popart_canonicalization_pass base DIR ipu)
+  pass_library(ipu_inplace_pass base DIR ipu)
+  pass_library(infer_shape_pass base DIR ipu)
+  pass_library(delete_scale_op_pass base DIR ipu)
+  pass_library(avg_shard_pass base DIR ipu)
+  pass_library(transfer_cast_op_pass base DIR ipu)
+endif()
+
 cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector )
 cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector )
 cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index ae61b7388d1b071779add74f70d3c76fc97e1136..087a817d03af1c5bffd15965071dc48b4a299e9f 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1350,6 +1350,16 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
     kernel_iter = kernels.find(expected_kernel_key);
   }
 #endif
+#ifdef PADDLE_WITH_IPU
+  if (kernel_iter == kernels.end() &&
+      platform::is_ipu_place(expected_kernel_key.place_)) {
+    VLOG(3) << "missing IPU kernel: " << type_
+            << ", expected_kernel_key:" << expected_kernel_key
+            << ", fallbacking to CPU one!";
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
+#endif
 #ifdef PADDLE_WITH_ASCEND_CL
   if (kernel_iter == kernels.end() &&
       platform::is_npu_place(expected_kernel_key.place_)) {
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 6d348ceb87c83de1bb201a6b57477d764b58a2ba..d2ab438fd2946701c70ea0bebf35ac33fbfb521e 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -57,33 +57,6 @@ void Copy(platform::IPUPlace dst_place,
   std::memcpy(dst, src, num);
 }
 
-// NOTE: only for CPUPlace and IPUPlace.
-template <>
-void Copy(pten::Place dst_place, void* dst,
-          pten::Place src_place, const void* src,
-          size_t num) {
-  if (src_place.GetType() == pten::AllocationType::CPU &&
-      dst_place.GetType() == pten::AllocationType::CPU) {
-    platform::CPUPlace place_dst, place_src;
-    return Copy(place_dst, dst, place_src, src, num);
-  } else if (src_place.GetType() == pten::AllocationType::CPU &&
-             dst_place.GetType() == pten::AllocationType::IPU) {
-    platform::IPUPlace place_dst(dst_place.GetDeviceId());
-    platform::CPUPlace place_src;
-    return Copy(place_dst, dst, place_src, src, num);
-  } else if (src_place.GetType() == pten::AllocationType::IPU &&
-             dst_place.GetType() == pten::AllocationType::CPU) {
-    platform::IPUPlace place_src(src_place.GetDeviceId());
-    platform::CPUPlace place_dst;
-    return Copy(place_dst, dst, place_src, src, num);
-  } else if (src_place.GetType() == pten::AllocationType::IPU &&
-             dst_place.GetType() == pten::AllocationType::IPU) {
-    platform::IPUPlace place_src(src_place.GetDeviceId());
-    platform::IPUPlace place_dst(dst_place.GetDeviceId());
-    return Copy(place_dst, dst, place_src, src, num);
-  }
-}
-
 // NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace).
 template <>
 void Copy(pten::IPUPlace dst_place, void* dst,
@@ -1039,6 +1012,24 @@ void Copy(pten::Place dst_place, void* dst,
     return Copy(place_dst, dst, place_src, src, num);
   }
 #endif
+#ifdef PADDLE_WITH_IPU
+  else if (src_place.GetType() == pten::AllocationType::CPU &&
+           dst_place.GetType() == pten::AllocationType::IPU) {
+    platform::IPUPlace place_dst(dst_place.GetDeviceId());
+    platform::CPUPlace place_src;
+    return Copy(place_dst, dst, place_src, src, num);
+  } else if (src_place.GetType() == pten::AllocationType::IPU &&
+             dst_place.GetType() == pten::AllocationType::CPU) {
+    platform::IPUPlace place_src(src_place.GetDeviceId());
+    platform::CPUPlace place_dst;
+    return Copy(place_dst, dst, place_src, src, num);
+  } else if (src_place.GetType() == pten::AllocationType::IPU &&
+             dst_place.GetType() == pten::AllocationType::IPU) {
+    platform::IPUPlace place_src(src_place.GetDeviceId());
+    platform::IPUPlace place_dst(dst_place.GetDeviceId());
+    return Copy(place_dst, dst, place_src, src, num);
+  }
+#endif
 }
 
 // NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace).
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 550278950c39170f46cf52d32ca9f50968f3e1ca..eb7057bcd50addd8053738b81b79cf6d0a915941 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -72,7 +72,7 @@ IF(WITH_GPU OR WITH_ROCM)
 ENDIF()
 
 IF(WITH_IPU)
-  set(IPU_CTX_DEPS ipu_backend)
+  set(IPU_CTX_DEPS ipu_info)
 ELSE()
   set(IPU_CTX_DEPS)
 ENDIF(WITH_IPU)
diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h
index 43408ca207d1d2c10ba29b32b487e8a7ea99917f..4f8bbb2d2689eb6ffee1119c6eb14ef27de7a2c8 100644
--- a/paddle/fluid/platform/device/device_wrapper.h
+++ b/paddle/fluid/platform/device/device_wrapper.h
@@ -34,3 +34,7 @@ limitations under the License. */
*/ #include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif + +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index 5f711937a8098b1d8d83ac0d9f284883191fc796..d54c6a33ecbf53071956aaf4b9d342efa5746f65 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -1,19 +1,22 @@ IF(WITH_IPU) FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc) list(APPEND PADDLE_IPU_SRC ${POPART_CANONICALIZATION_SRC}) - set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "") - set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "") set(IPU_BACKEND_SRC - "ipu_device.cc" "ipu_strategy.cc" "ipu_executor.cc" "ipu_compiler.cc" "ipu_backend.cc" "ipu_utils.cc" ) + set(IPU_INFO_SRC + "ipu_info.cc" + "ipu_device.cc" + ) - cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph framework_proto enforce graph_helper timer) - cc_library(ipu_info SRCS ipu_info.cc DEPS ipu_backend) - cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart) + cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph graph_helper) + cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart enforce) + cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart graph_helper) add_dependencies(paddle_ipu ipu_backend) + set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "") + set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "") ENDIF() diff --git a/paddle/fluid/platform/device/ipu/ipu_device.cc b/paddle/fluid/platform/device/ipu/ipu_device.cc index cd2a628c9abe2bf8e391fcfc7b9d37b293d19936..2459f5140eb5b25af82381366f25c714beb69aaf 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.cc +++ b/paddle/fluid/platform/device/ipu/ipu_device.cc @@ -13,12 +13,26 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device/ipu/ipu_device.h" -#include "paddle/fluid/platform/device/ipu/ipu_utils.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { namespace ipu { +// TODO(alleng) merge with ipu_utils +static bool GetBoolEnv(std::string str) { + char* str_val = getenv(str.c_str()); + if (str_val == NULL) { + return false; + } else { + bool val = false; + if (strcmp(str_val, "1") == 0 || strcmp(str_val, "true") == 0 || + strcmp(str_val, "True") == 0 || strcmp(str_val, "TRUE") == 0) + val = true; + return val; + } +} + int GetNumDevices() { bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); if (ipu_model) { diff --git a/paddle/fluid/platform/device/ipu/ipu_device.h b/paddle/fluid/platform/device/ipu/ipu_device.h index 3da13a522e19a3f6526751e48c70bdd8562d1b6c..d39feffc92655b52dae1792fab0a5ef95bb6075f 100644 --- a/paddle/fluid/platform/device/ipu/ipu_device.h +++ b/paddle/fluid/platform/device/ipu/ipu_device.h @@ -15,7 +15,6 @@ limitations under the License. 
 #pragma once
 
 #include <vector>
-#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/device/ipu/ipu_info.cc b/paddle/fluid/platform/device/ipu/ipu_info.cc
index 4506bfbf972248fd0539927c483b3e23114a6750..9e6951c37139db2bbca6a1eab7f521e850dba6db 100644
--- a/paddle/fluid/platform/device/ipu/ipu_info.cc
+++ b/paddle/fluid/platform/device/ipu/ipu_info.cc
@@ -16,12 +16,10 @@ namespace paddle {
 namespace platform {
 
 //! Get a list of device ids from environment variable or use all.
-std::vector<int> GetSelectedIPUDevices() {
-  return platform::ipu::GetDeviceIds();
-}
+std::vector<int> GetSelectedIPUDevices() { return ipu::GetDeviceIds(); }
 
 //! Get the total number of IPU devices in system.
-int GetIPUDeviceCount() { return platform::ipu::GetNumDevices(); }
+int GetIPUDeviceCount() { return ipu::GetNumDevices(); }
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
index 67012e8d4b92d8d6336f1b192a7b19828511c08e..d4a14a6d8409f9b50247f747016f5284f11037da 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc
@@ -32,7 +32,7 @@ Node *mean_handler(Graph *graph, Node *node) {
 
 Node *pow_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
-  if (op->HasInput("FactorTensor") && !op->Input("FactorTensor").empty()) {
+  if (!op->Input("FactorTensor").empty()) {
     return CreateBaseOp(
         graph, node, "popart_pow",
         {GetInputVarNode("X", node), GetInputVarNode("FactorTensor", node)},
@@ -161,7 +161,7 @@ Node *scale_handler(Graph *graph, Node *node) {
       static_cast<int>(framework::proto::VarType::FP32));
 
   Node *result = nullptr;
-  if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) {
+  if (!op->Input("ScaleTensor").empty()) {
     auto scale = GetInputVarNode("ScaleTensor", node);
     if (is_float_equal(bias_, 0.0)) {
       result = CreateBaseOp(graph, node, "popart_mul",
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
index b7412000107d3157c6b5c38d7c456af3bd36aabd..b731ba532d60c743278b73754deb884c800fe4d1 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
@@ -34,7 +34,7 @@ Node *conv2d_handler(Graph *graph, Node *node) {
   auto pads = std::vector<int64_t>{pads_.begin(), pads_.end()};
   auto stride_ = BOOST_GET_CONST(std::vector<int>, op->GetAttr("strides"));
   auto stride = std::vector<int64_t>{stride_.begin(), stride_.end()};
-  if (op->HasInput("Bias") && !op->Input("Bias").empty()) {
+  if (!op->Input("Bias").empty()) {
     return CreateConv(
         graph, node,
         {
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc
index 662660c23b4a6a357d27565a8c6b37b25db9c9be..539053f2fb67bae4652e61a52bc3254f233d3417 100644
--- a/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/search_ops.cc
@@ -65,7 +65,7 @@ Node *topk_handler(Graph *graph, Node *node) {
 
   Node *var_x = GetInputVarNode("X", node);
   Node *var_k = nullptr;
-  if (op->HasInput("K") && !op->Input("K").empty()) {
+  if (!op->Input("K").empty()) {
     var_k = GetInputVarNode("K", node);
   } else {
     auto k = BOOST_GET_CONST(int, op->GetAttr("k"));
op->GetAttr("k")); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 296668890ebe5a0f1550e41aff4424b0f87b4f95..db429d2f6228455bd4ca1a47d117ddf2ad286e65 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -23,7 +23,7 @@ namespace { Node *fill_constant_handler(Graph *graph, Node *node) { auto *op = node->Op(); - if (op->HasInput("ShapeTensor") && !op->Input("ShapeTensor").empty()) { + if (!op->Input("ShapeTensor").empty()) { PADDLE_THROW( platform::errors::Unimplemented("op fill_constant with ShapeTensor")); } @@ -328,7 +328,7 @@ Node *shape_handler(Graph *graph, Node *node) { Node *slice_handler(Graph *graph, Node *node) { auto *op = node->Op(); Node *starts = nullptr; - if (op->HasInput("StartsTensor") && !op->Input("StartsTensor").empty()) { + if (!op->Input("StartsTensor").empty()) { starts = GetInputVarNode("StartsTensor", node); } else { auto starts_ = BOOST_GET_CONST(std::vector, op->GetAttr("starts")); @@ -338,7 +338,7 @@ Node *slice_handler(Graph *graph, Node *node) { starts = starts->outputs[0]; } Node *ends = nullptr; - if (op->HasInput("EndsTensor") && !op->Input("EndsTensor").empty()) { + if (!op->Input("EndsTensor").empty()) { ends = GetInputVarNode("EndsTensor", node); } else { auto ends_ = BOOST_GET_CONST(std::vector, op->GetAttr("ends")); @@ -384,14 +384,13 @@ Node *slice_handler(Graph *graph, Node *node) { Node *expand_handler(Graph *graph, Node *node) { auto *op = node->Op(); - if (op->HasInput("expand_times_tensor") && - !op->Input("expand_times_tensor").empty()) { + if (!op->Input("expand_times_tensor").empty()) { PADDLE_THROW( platform::errors::Unimplemented("Expand op with expand_times_tensor")); } Node *expand_times = nullptr; - if (op->HasInput("ExpandTimes") && !op->Input("ExpandTimes").empty()) { + if (!op->Input("ExpandTimes").empty()) { // cast to int64 expand_times = CreateCast(graph, node, {GetInputVarNode("ExpandTimes", node)}, {}, diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index bfb1f572068e0aa5b46bbf967d5482e6627332a6..142e30d161ccadf3c3cb55eee430597e60d50624 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -21,9 +21,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device/mlu/device_context.h" #include "paddle/fluid/platform/device/mlu/device_context_allocator.h" #endif -#ifdef PADDLE_WITH_IPU -#include "paddle/fluid/platform/ipu/ipu_backend.h" -#endif #include "glog/logging.h" #include "paddle/fluid/framework/expect.h" #include "paddle/fluid/platform/profiler.h" @@ -230,14 +227,10 @@ CPUDeviceContext::CPUDeviceContext() : pten::CPUContext() {} CPUDeviceContext::CPUDeviceContext(CPUPlace place) : pten::CPUContext() {} #ifdef PADDLE_WITH_IPU -IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) { - int id = place.GetDeviceId(); - std::shared_ptr ipu_backend = - platform::ipu::IpuBackend::GetInstance(); - device_ = ipu_backend->GetDevice(id); -} +IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {} Place IPUDeviceContext::GetPlace() const { return place_; } + void IPUDeviceContext::Wait() const { /*! \brief Wait for all operations completion in the stream. 
 }
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 52f17cd986ce2f45ec9fccd4e009d2bae2db8ad2..17b22907b15328ef8fe610ce126639b0a5f927e7 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -65,9 +65,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/npu/enforce_npu.h"
 #include "paddle/fluid/platform/device/npu/npu_stream.h"
 #endif
-#ifdef PADDLE_WITH_IPU
-#include "paddle/fluid/platform/device/ipu/device.h"
-#endif
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace Eigen {
@@ -151,11 +148,9 @@ class IPUDeviceContext : public DeviceContext {
   Place GetPlace() const override;
   /*! \brief  Wait for all operations completion in the stream. */
   void Wait() const override;
-  int DeviceId() const { return device_.getId(); }
 
  private:
   IPUPlace place_;
-  platform::ipu::Device device_;
 };
 template <>
 struct DefaultDeviceContextType<platform::IPUPlace> {