From 45fa6861c023e7dc3a4f5d7a3ba3b38141a569e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com>
Date: Thu, 30 Jul 2020 19:04:42 +0800
Subject: [PATCH] Cherry-pick of lite engine, test=release/1.8 (#25817)

* ignore warnings of external libraries, test=develop (#24193)

* fix repeat definitions in liengine.cc, test=develop (#25020)

* remove paddle_use_kernel and paddle_use_op. test=develop (#25189)

* fix compile for lite subgraph. test=develop (#25285)

* [CI] [Lite-Subgraph] CI add lite subgraph check. (#25346)

* supports xpu runtime, test=develop (#25554)

* fix cmake of lite, test=develop (#25680)

* change commit files, test=release/1.8

Co-authored-by: Wilber
---
 CMakeLists.txt                                |  4 +-
 cmake/external/lite.cmake                     | 22 ++++++++--
 paddle/fluid/inference/analysis/argument.h    |  4 ++
 .../inference/analysis/ir_pass_manager.cc     |  4 ++
 .../analysis/ir_passes/lite_subgraph_pass.cc  | 22 +++++++++-
 paddle/fluid/inference/api/analysis_config.cc | 31 ++++++++++++-
 .../fluid/inference/api/analysis_predictor.cc |  3 ++
 .../inference/api/paddle_analysis_config.h    |  8 ++++
 paddle/fluid/inference/lite/CMakeLists.txt    |  8 +++-
 paddle/fluid/inference/lite/engine.cc         | 20 ++++++---
 paddle/fluid/inference/lite/engine.h          | 10 +++++
 paddle/fluid/inference/lite/op_teller.cc      | 12 +++--
 paddle/fluid/inference/lite/tensor_utils.cc   | 30 +++++++++++++
 paddle/fluid/inference/lite/tensor_utils.h    | 18 +++++++-
 .../fluid/inference/lite/test_tensor_utils.cc | 44 ++++++++++++++++---
 paddle/fluid/operators/lite/lite_engine_op.h  | 14 +++---
 .../operators/lite/lite_engine_op_test.cc     |  1 +
 paddle/fluid/pybind/inference_api.cc          |  1 +
 python/setup.py.in                            |  4 ++
 19 files changed, 222 insertions(+), 38 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 04fbb3178b..e4d9060339 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -154,6 +154,9 @@ if(WITH_BRPC_RDMA)
     endif()
 endif()
 
+# lite subgraph compilation depends on CUDNN_ROOT,
+# so include(cudnn) needs to be in front of include(third_party/lite)
+include(cudnn)              # set cudnn libraries, must before configure
 include(third_party)        # download, build, install third_party
 
 if(WITH_DISTRIBUTE)
@@ -173,7 +176,6 @@ if(NOT WIN32)
 endif()
 
 include(flags)              # set paddle compile flags
-include(cudnn)              # set cudnn libraries, must before configure
 
 if(WITH_GPU)
   include(cuda)
diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index 70c11d37f9..b541d73bc6 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -18,6 +18,15 @@ if(NOT LINUX OR NOT WITH_MKL)
   return()
 endif()
 
+if(XPU_SDK_ROOT)
+  set(LITE_WITH_XPU ON)
+  include_directories("${XPU_SDK_ROOT}/XTDK/include")
+  include_directories("${XPU_SDK_ROOT}/XTCL/include")
+  add_definitions(-DPADDLE_WITH_XPU)
+  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/")
+  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/")
+endif()
+
 if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   include(ExternalProject)
   set(LITE_PROJECT extern_lite)
@@ -25,7 +34,11 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 34c29406c27ee00cef033a98887403443eb2565f)
+    set(LITE_GIT_TAG 42ab4d559f6659edfc35040fb30fdcec3dc3f8aa)
+  endif()
+
+  if(NOT CUDA_ARCH_NAME)
+    set(CUDA_ARCH_NAME "Auto")
   endif()
 
   # No quotes, so cmake can resolve it as a command with arguments.
@@ -43,6 +56,8 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
                         -DCUDNN_ROOT=${CUDNN_ROOT}
                         -DLITE_WITH_STATIC_CUDA=OFF
                         -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME}
+                        -DLITE_WITH_XPU=${LITE_WITH_XPU}
+                        -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
                         -DLITE_WITH_ARM=OFF)
 
   ExternalProject_Add(
@@ -79,7 +94,7 @@ message(STATUS "Paddle-lite SOURCE_DIR: ${LITE_SOURCE_DIR}")
 include_directories(${LITE_SOURCE_DIR})
 include_directories(${LITE_BINARY_DIR})
 
-function(external_lite_static_libs alias path)
+function(external_lite_libs alias path)
   add_library(${alias} SHARED IMPORTED GLOBAL)
   SET_PROPERTY(TARGET ${alias} PROPERTY IMPORTED_LOCATION
     ${path})
@@ -88,7 +103,8 @@ function(external_lite_static_libs alias path)
   endif()
 endfunction()
 
-external_lite_static_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
 
 add_definitions(-DPADDLE_WITH_LITE)
 add_definitions(-DLITE_WITH_LOG)
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 2fc7f81bf8..27bae7a71e 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -200,6 +200,10 @@ struct Argument {
   DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector<std::string>);
   DECL_ARGUMENT_FIELD(lite_precision_mode, LitePrecisionMode,
                       AnalysisConfig::Precision);
+  DECL_ARGUMENT_FIELD(lite_zero_copy, LiteZeroCopy, bool);
+
+  DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool);
+  DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int);
 
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 4a79a3cf30..cd8d86d729 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -146,6 +146,10 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("predictor_id", new int(argument->predictor_id()));
       pass->Set("enable_int8", new bool(enable_int8));
       pass->Set("use_gpu", new bool(argument->use_gpu()));
+      pass->Set("zero_copy", new bool(argument->lite_zero_copy()));
+      pass->Set("use_xpu", new bool(argument->use_xpu()));
+      pass->Set("xpu_l3_workspace_size",
+                new int(argument->xpu_l3_workspace_size()));
     }
     disable_logs_ = argument->disable_logs();
     if (pass_name == "fc_fuse_pass") {
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index 91d0aec3f4..6b16a481dd 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -242,16 +242,33 @@ void LiteSubgraphPass::SetUpEngine(
 
   bool use_gpu = Get<bool>("use_gpu");
   bool enable_int8 = Get<bool>("enable_int8");
-  lite_api::TargetType target_type = use_gpu ? TARGET(kCUDA) : TARGET(kX86);
+  bool use_xpu = Get<bool>("use_xpu");
+  int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
+
+  lite_api::TargetType target_type;
+  if (use_gpu) {
+    target_type = TARGET(kCUDA);
+  } else if (use_xpu) {
+    target_type = TARGET(kXPU);
+  } else {
+    target_type = TARGET(kX86);
+  }
+
   paddle::lite_api::PrecisionType precision_type =
-      enable_int8 ? PRECISION(kInt8) : PRECISION(kInt64);
+      enable_int8 ? PRECISION(kInt8) : PRECISION(kFloat);
+
   serialize_params(&config.param, scope, repetitive_params);
   config.model = program->Proto()->SerializeAsString();
   config.valid_places = {
+      // Notice: The ordering here determines the device where the
+      // input tensor of the Lite engine is located, and then affects
+      // whether tensor sharing is feasible.
       paddle::lite::Place({target_type, precision_type}),
+      paddle::lite::Place({target_type, PRECISION(kInt64)}),
       paddle::lite::Place({target_type, PRECISION(kFloat)}),
       paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
   };
+  config.xpu_l3_workspace_size = xpu_l3_workspace_size;
   if (dump_model) {
     lite::StrToBinaryFile("./model.bin", config.model);
     lite::StrToBinaryFile("./param.bin", config.param);
@@ -283,6 +300,7 @@ void LiteSubgraphPass::BuildOperator(
   op_desc->SetAttr("engine_key", unique_key);
   op_desc->SetAttr("enable_int8", Get<bool>("enable_int8"));
   op_desc->SetAttr("use_gpu", Get<bool>("use_gpu"));
+  op_desc->SetAttr("zero_copy", Get<bool>("zero_copy"));
 }
 
 void LiteSubgraphPass::ApplyImpl(framework::ir::Graph* graph) const {
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 8a047e5296..39c5cbff1f 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -88,6 +88,12 @@ void AnalysisConfig::DisableFCPadding() {
   Update();
 }
 
+void AnalysisConfig::EnableXpu(int l3_workspace_size) {
+  use_xpu_ = true;
+  xpu_l3_workspace_size_ = l3_workspace_size;
+  Update();
+}
+
 AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 #define CP_MEMBER(member__) member__ = other.member__;
 
@@ -132,6 +138,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(lite_precision_mode_);
   CP_MEMBER(lite_passes_filter_);
   CP_MEMBER(lite_ops_filter_);
+  CP_MEMBER(lite_zero_copy_);
+
+  CP_MEMBER(use_xpu_);
+  CP_MEMBER(xpu_l3_workspace_size_);
 
   // profile related.
   CP_MEMBER(with_profile_);
 
@@ -342,6 +352,22 @@ void AnalysisConfig::Update() {
     }
   }
 
+  if (use_xpu_) {
+#ifndef PADDLE_WITH_XPU
+    PADDLE_THROW(platform::errors::Unavailable(
+        "You tried to use an XPU device, but Paddle was not compiled "
+        "with XPU-runtime."));
+#endif
+    if (!use_lite_) {
+      LOG(WARNING) << "Because XPU currently only works in Paddle-Lite "
+                      "subgraph mode, please make sure you have enabled it.";
+    }
+    PADDLE_ENFORCE_EQ(use_gpu_, false,
+                      platform::errors::Unavailable(
+                          "Currently, XPU and GPU cannot be enabled in the "
+                          "same analysis configuration."));
+  }
+
   if (ir_debug_) {
     pass_builder()->TurnOnDebug();
   }
@@ -385,6 +411,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << cpu_math_library_num_threads_;
 
   ss << use_lite_;
+  ss << use_xpu_;
+  ss << xpu_l3_workspace_size_;
 
   return ss.str();
 }
@@ -460,13 +488,14 @@ void AnalysisConfig::DisableGlogInfo() {
 }
 
 void AnalysisConfig::EnableLiteEngine(
-    AnalysisConfig::Precision precision_mode,
+    AnalysisConfig::Precision precision_mode, bool zero_copy,
     const std::vector<std::string> &passes_filter,
     const std::vector<std::string> &ops_filter) {
   use_lite_ = true;
   lite_precision_mode_ = precision_mode;
   lite_passes_filter_ = passes_filter;
   lite_ops_filter_ = ops_filter;
+  lite_zero_copy_ = zero_copy;
   Update();
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index d21f0292d9..de3f9ab239 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -447,6 +447,9 @@ void AnalysisPredictor::PrepareArgument() {
     argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
     argument_.SetLitePassesFilter(config_.lite_passes_filter_);
     argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
+    argument_.SetLiteZeroCopy(config_.lite_zero_copy_);
+    argument_.SetUseXpu(config_.use_xpu_);
+    argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
     LOG(INFO) << "Lite subgraph engine is enabled";
   }
 
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 2002d1f76a..39346414a8 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -176,6 +176,8 @@ struct AnalysisConfig {
   ///
   ///
   void DisableGpu();
+
+  void EnableXpu(int l3_workspace_size = 0xfffc00);
   ///
   /// \brief A boolean state telling whether the GPU is turned on.
   ///
@@ -319,6 +321,7 @@ struct AnalysisConfig {
   ///
   void EnableLiteEngine(
       AnalysisConfig::Precision precision_mode = Precision::kFloat32,
+      bool zero_copy = false,
       const std::vector<std::string>& passes_filter = {},
       const std::vector<std::string>& ops_filter = {});
 
@@ -562,6 +565,11 @@ struct AnalysisConfig {
   std::vector<std::string> lite_passes_filter_;
   std::vector<std::string> lite_ops_filter_;
   Precision lite_precision_mode_;
+  bool lite_zero_copy_;
+
+  bool thread_local_stream_{false};
+  bool use_xpu_{false};
+  int xpu_l3_workspace_size_;
 
   // mkldnn related.
   int mkldnn_cache_capacity_{0};
 
diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt
index 1d95704814..fd513b5958 100644
--- a/paddle/fluid/inference/lite/CMakeLists.txt
+++ b/paddle/fluid/inference/lite/CMakeLists.txt
@@ -1,5 +1,9 @@
+if(XPU_SDK_ROOT)
+  set(XPU_DEPS xpuapi xpurt)
+endif()
+
 cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash)
-cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto)
-cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost)
+cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS})
+cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context)
 cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
 cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index edc4f5220a..8e88c94493 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -16,12 +16,11 @@
 #define LITE_WITH_CUDA 1
 #endif
 
-#include "paddle/fluid/inference/lite/engine.h"
-#include "lite/core/context.h"
-#include "lite/core/device_info.h"
+#ifdef PADDLE_WITH_XPU
+#define LITE_WITH_XPU 1
+#endif
 
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
+#include "paddle/fluid/inference/lite/engine.h"
 #include "lite/api/paddle_use_passes.h"
 
 namespace paddle {
@@ -43,10 +42,17 @@ paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
 
 paddle::lite::Predictor* EngineManager::Create(const std::string& name,
                                                const EngineConfig& cfg) {
-  auto* p = new paddle::lite::Predictor();
+  if (cfg.valid_places.front().target == TARGET(kCUDA)) {
 #ifdef PADDLE_WITH_CUDA
-  paddle::lite::Env::Init();
+    paddle::lite::Env::Init();
 #endif
+  } else if (cfg.valid_places.front().target == TARGET(kXPU)) {
+#ifdef PADDLE_WITH_XPU
+    paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread =
+        cfg.xpu_l3_workspace_size;
+#endif
+  }
+  auto* p = new paddle::lite::Predictor();
   p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
            cfg.model_type, cfg.model_from_memory);
   engines_[name].reset(p);
diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h
index f29607490e..345eb682e9 100644
--- a/paddle/fluid/inference/lite/engine.h
+++ b/paddle/fluid/inference/lite/engine.h
@@ -20,7 +20,16 @@
 #include
 #include
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wall"
 #include "lite/api/cxx_api.h"
+#include "lite/api/paddle_place.h"
+#include "lite/core/context.h"
+#include "lite/core/device_info.h"
+#include "lite/core/memory.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+#pragma GCC diagnostic pop
 
 namespace paddle {
 namespace inference {
@@ -34,6 +43,7 @@ struct EngineConfig {
   std::vector<std::string> neglected_passes;
   lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
   bool model_from_memory{true};
+  size_t xpu_l3_workspace_size;
 };
 
 class EngineManager {
diff --git a/paddle/fluid/inference/lite/op_teller.cc b/paddle/fluid/inference/lite/op_teller.cc
index c5f1eccc33..3a162c3fde 100644
--- a/paddle/fluid/inference/lite/op_teller.cc
+++ b/paddle/fluid/inference/lite/op_teller.cc
@@ -16,10 +16,9 @@
 #include "paddle/fluid/framework/block_desc.h"
"paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/lite/op_teller.h" -#include "lite/core/op_registry.h" - namespace paddle { namespace inference { namespace lite { @@ -27,15 +26,14 @@ namespace lite { // Just tell by the op_types. struct SimpleOpTeller : public Teller { SimpleOpTeller() { - const std::map& op2path = - paddle::lite::GetOp2PathDict(); + std::vector lite_ops = paddle::lite::GetAllOps(); auto is_non_inst = [](const std::string& op) -> bool { const std::vector ops = {"feed", "fetch", "while"}; return std::find(ops.begin(), ops.end(), op) != ops.end(); }; - for (const auto& op : op2path) { - if (!is_non_inst(op.first)) { - ops_.insert(op.first); + for (const auto& op : lite_ops) { + if (!is_non_inst(op)) { + ops_.insert(op); } } } diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 6138e64e2d..0b738c1fb8 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -14,8 +14,10 @@ #include "paddle/fluid/inference/lite/tensor_utils.h" #include +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/inference/lite/engine.h" +#include "paddle/fluid/memory/allocation/allocator.h" namespace paddle { namespace inference { @@ -46,6 +48,9 @@ platform::Place GetNativePlace(const TargetType& type, int id = 0) { return platform::CPUPlace(); case TargetType::kCUDA: return platform::CUDAPlace(id); + case TargetType::kXPU: + LOG(ERROR) << "No corresponding device for XPU yet."; + return platform::Place(); default: LOG(FATAL) << "Error target type."; return platform::Place(); @@ -181,6 +186,31 @@ void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src, VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size(); } +template <> +void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) { + const size_t bytes = + static_cast(src->numel()) * framework::SizeOfType(src->type()); + auto buf = std::make_shared(paddle::lite::Buffer( + src->data(), GetLiteTargetType(src->place()), src->memory_size())); + dst->Resize(framework::vectorize(src->dims())); + dst->set_precision(GetLitePrecisionType(src->type())); + SetLoD(dst->mutable_lod(), src->lod()); + dst->ResetBuffer(buf, bytes); +} + +template <> +void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) { + constexpr framework::proto::VarType::Type dtype = + framework::proto::VarType_Type_FP32; + void* src_raw_data = src->raw_data(); + std::shared_ptr holder( + new memory::allocation::Allocation(src_raw_data, src->memory_size(), + GetNativePlace(src->target()))); + dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize())); + SetLoD(dst->mutable_lod(), src->lod()); + dst->ResetHolderWithType(holder, dtype); +} + } // namespace utils } // namespace lite } // namespace inference diff --git a/paddle/fluid/inference/lite/tensor_utils.h b/paddle/fluid/inference/lite/tensor_utils.h index 95fe8ae903..1b2923bc28 100644 --- a/paddle/fluid/inference/lite/tensor_utils.h +++ b/paddle/fluid/inference/lite/tensor_utils.h @@ -14,9 +14,8 @@ #pragma once -#include "lite/api/paddle_place.h" -#include "lite/core/tensor.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/lite/engine.h" namespace paddle { namespace inference { @@ -27,6 +26,21 @@ template void TensorCopyAsync(DstTensor* dst, const SrcTensor& src, const 
platform::DeviceContext& ctx); +template +void TensorDataShare(DstTensor* dst, SrcTensor* src); + +template +void TensorCopy(DstTensor* dst, SrcTensor* src, + const platform::DeviceContext& ctx, bool shared = true) { + if (shared) { + VLOG(3) << "TensorDataShare is running"; + TensorDataShare(dst, src); + } else { + VLOG(3) << "TensorCopyAsync is running"; + TensorCopyAsync(dst, *src, ctx); + } +} + } // namespace utils } // namespace lite } // namespace inference diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index 48ae1bd71d..eef7bfb68f 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -30,7 +30,7 @@ TEST(LiteEngineOp, GetNativePlace) { platform::Place GetNativePlace(const TargetType& type, int id = 0); EXPECT_TRUE(platform::is_cpu_place(GetNativePlace(TargetType::kHost))); EXPECT_TRUE(platform::is_gpu_place(GetNativePlace(TargetType::kCUDA))); - ASSERT_DEATH(GetNativePlace(TargetType::kUnk), ""); + EXPECT_ANY_THROW(GetNativePlace(TargetType::kUnk)); } TEST(LiteEngineOp, GetLiteTargetType) { @@ -48,8 +48,8 @@ TEST(LiteEngineOp, GetLitePrecisionType) { PrecisionType::kInt8); ASSERT_EQ(GetLitePrecisionType(framework::proto::VarType_Type_INT32), PrecisionType::kInt32); - ASSERT_DEATH( - GetLitePrecisionType(framework::proto::VarType_Type_SELECTED_ROWS), ""); + EXPECT_ANY_THROW( + GetLitePrecisionType(framework::proto::VarType_Type_SELECTED_ROWS)); } TEST(LiteEngineOp, GetNativePrecisionType) { @@ -62,7 +62,7 @@ TEST(LiteEngineOp, GetNativePrecisionType) { framework::proto::VarType_Type_INT8); ASSERT_EQ(GetNativePrecisionType(PrecisionType::kInt32), framework::proto::VarType_Type_INT32); - ASSERT_DEATH(GetNativePrecisionType(PrecisionType::kUnk), ""); + EXPECT_ANY_THROW(GetNativePrecisionType(PrecisionType::kUnk)); } TEST(LiteEngineOp, GetNativeLayoutType) { @@ -70,14 +70,14 @@ TEST(LiteEngineOp, GetNativeLayoutType) { framework::DataLayout GetNativeLayoutType(const DataLayoutType& type); ASSERT_EQ(GetNativeLayoutType(DataLayoutType::kNCHW), framework::DataLayout::kNCHW); - ASSERT_DEATH(GetNativeLayoutType(DataLayoutType::kNHWC), ""); + EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC)); } void test_tensor_copy(const platform::DeviceContext& ctx) { // Create LoDTensor. std::vector vector({1, 2, 3, 4}); framework::LoDTensor lod_tensor; - framework::TensorFromVector(vector, &lod_tensor); + framework::TensorFromVector(vector, ctx, &lod_tensor); framework::LoD lod({{0, 2, 4}}); lod_tensor.Resize({4, 1}); lod_tensor.set_lod(lod); @@ -94,7 +94,26 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { } #endif std::vector result; - TensorToVector(lod_tensor_n, &result); + TensorToVector(lod_tensor_n, ctx, &result); + ASSERT_EQ(result, vector); + ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod()); +} + +void test_tensor_share(const platform::DeviceContext& ctx) { + std::vector vector({1, 2, 3, 4}); + framework::LoDTensor lod_tensor; + framework::TensorFromVector(vector, ctx, &lod_tensor); + framework::LoD lod({{0, 2, 4}}); + lod_tensor.Resize({4, 1}); + lod_tensor.set_lod(lod); + // Create lite::Tensor and share. + paddle::lite::Tensor lite_tensor; + TensorDataShare(&lite_tensor, &lod_tensor); + // Copy to LoDTensor. 
+  framework::LoDTensor lod_tensor_n;
+  TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
+  std::vector<float> result;
+  TensorToVector(lod_tensor_n, ctx, &result);
+  ASSERT_EQ(result, vector);
+  ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
+}
@@ -110,6 +129,17 @@ TEST(LiteEngineOp, TensorCopyAsync) {
 #endif
 }
 
+TEST(LiteEngineOp, TensorShare) {
+  auto* ctx_cpu =
+      platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
+  test_tensor_share(*ctx_cpu);
+#ifdef PADDLE_WITH_CUDA
+  auto* ctx_gpu =
+      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0));
+  test_tensor_share(*ctx_gpu);
+#endif
+}
+
 }  // namespace utils
 }  // namespace lite
 }  // namespace inference
diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h
index 3b48615338..a920bf7c3f 100644
--- a/paddle/fluid/operators/lite/lite_engine_op.h
+++ b/paddle/fluid/operators/lite/lite_engine_op.h
@@ -42,6 +42,7 @@ class LiteEngineOp : public framework::OperatorBase {
   paddle::lite::Predictor *engine_;
   framework::proto::VarType::Type precision_;
   bool use_gpu_;
+  bool zero_copy_;
 
  public:
   LiteEngineOp(const std::string &type,
@@ -60,6 +61,7 @@ class LiteEngineOp : public framework::OperatorBase {
       precision_ = framework::proto::VarType_Type_FP32;
     }
     use_gpu_ = Attr<bool>("use_gpu");
+    zero_copy_ = Attr<bool>("zero_copy");
   }
 
  protected:
@@ -73,13 +75,13 @@ class LiteEngineOp : public framework::OperatorBase {
     const platform::DeviceContext *ctx =
         platform::DeviceContextPool::Instance().Get(dev_place);
     for (size_t i = 0; i < in_names_.size(); i++) {
-      const framework::LoDTensor &src_t =
+      framework::LoDTensor src_t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope,
                                                                  in_names_[i]);
       paddle::lite::Tensor *dst_t = engine_->GetInput(i);
-      VLOG(3) << "[Copy] fluid -> lite (" << in_names_[i] << " -> "
+      VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> "
              << engine_->GetInputNames()[i] << ")";
-      inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx);
+      inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_);
     }
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(dev_place)) {
@@ -91,13 +93,13 @@ class LiteEngineOp : public framework::OperatorBase {
 
     engine_->Run();
     VLOG(3) << "lite engine run done";
     for (size_t i = 0; i < out_names_.size(); i++) {
-      const paddle::lite::Tensor &src_t = *(engine_->GetOutput(i));
+      paddle::lite::Tensor src_t = *(engine_->GetOutput(i));
       framework::LoDTensor *dst_t = &inference::analysis::GetFromScope<framework::LoDTensor>(
          scope, out_names_[i]);
-      VLOG(3) << "[Copy] lite -> fluid (" << out_names_[i] << " -> "
+      VLOG(3) << "== lite -> fluid (" << out_names_[i] << " -> "
              << engine_->GetOutputNames()[i] << ")";
-      inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx);
+      inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_);
     }
 #ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(dev_place)) {
diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc
index 3812911e91..fb5c0dcb35 100644
--- a/paddle/fluid/operators/lite/lite_engine_op_test.cc
+++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc
@@ -100,6 +100,7 @@ TEST(LiteEngineOp, engine_op) {
   engine_op_desc.SetAttr("engine_key", engine_key);
   engine_op_desc.SetAttr("enable_int8", false);
   engine_op_desc.SetAttr("use_gpu", true);
+  engine_op_desc.SetAttr("zero_copy", true);
   engine_op_desc.SetBlockAttr("sub_block", &block_desc);
   inference::Singleton<inference::lite::EngineManager>::Global().Create(
       engine_key, config);
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index e4927977aa..a7d5b36bfc 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -425,6 +425,7 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("disable_trt_plugin_fp16") = false)
       .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
       .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine,
            py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
+           py::arg("zero_copy") = false,
            py::arg("passes_filter") = std::vector<std::string>(),
            py::arg("ops_filter") = std::vector<std::string>())
diff --git a/python/setup.py.in b/python/setup.py.in
index 7370c38ecf..5658638854 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -258,6 +258,10 @@ else:
     shutil.copy('${OPENBLAS_SHARED_LIB}', libs_path)
     package_data['paddle.libs'] += ['openblas' + ext_name]
 
+if '${WITH_LITE}' == 'ON':
+    shutil.copy('${LITE_SHARED_LIB}', libs_path)
+    package_data['paddle.libs']+=['libpaddle_full_api_shared' + ext_name]
+
 if '${WITH_PSLIB}' == 'ON':
     shutil.copy('${PSLIB_LIB}', libs_path)
    if os.path.exists('${PSLIB_VERSION_PY}'):
-- 
GitLab
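Note: below is a minimal usage sketch of the configuration surface this patch adds (AnalysisConfig::EnableXpu plus the new zero_copy flag of EnableLiteEngine). It is not part of the patch itself; the include path and the model directory are placeholders.

// Usage sketch (illustrative only; assumes an inference build of Paddle that
// contains this patch).
#include "paddle_inference_api.h"  // header name as shipped with the inference library

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet_v1");                 // placeholder model directory
  config.EnableXpu(/*l3_workspace_size=*/0xfffc00);  // EnableXpu is introduced by this patch
  // zero_copy is the new second parameter added by this patch; false keeps the
  // original copy-based tensor transfer between fluid and the Lite engine.
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32,
                          /*zero_copy=*/false);
  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor != nullptr ? 0 : 1;
}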