Cherry-pick. (#28454)

0a42986c · Wilber · GitHub · 78d68d59 · 0a42986c · 0a42986c
19 changed files
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-if(NOT LINUX OR NOT WITH_MKL)
+if(NOT LINUX)
-  message("Paddle-lite will not build because the required Linux and MKL do not exist.")
+  message("Paddle-lite will not build because the required Linux do not exist.")
  set(WITH_LITE OFF)
  return()
 endif()
@@ -22,9 +22,11 @@ if(XPU_SDK_ROOT)
  set(LITE_WITH_XPU ON)
  include_directories("${XPU_SDK_ROOT}/XTDK/include")
  include_directories("${XPU_SDK_ROOT}/XTCL/include")
-  add_definitions(-DPADDLE_WITH_XPU)
+  add_definitions(-DLITE_SUBGRAPH_WITH_XPU)
  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/")
  LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/")
+  set(XPURT_LIB ${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so)
+  set(XPUAPI_LIB ${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so)
 endif()
 if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
@@ -42,7 +44,50 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
  endif()
  # No quotes, so cmake can resolve it as a command with arguments.
+  if(WITH_ARM)
    set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j)
+    message(WARNING "BUILD_COMMAND: ${LITE_BUILD_COMMAND}")
+    set(LITE_OPTIONAL_ARGS -DWITH_MKL=OFF
+                           -DLITE_WITH_CUDA=OFF
+                           -DWITH_MKLDNN=OFF
+                           -DLITE_WITH_X86=OFF
+                           -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON
+                           -DLITE_WITH_PROFILE=OFF
+                           -DARM_TARGET_OS=armlinux
+                           -DWITH_LITE=ON
+                           -DWITH_PYTHON=OFF
+                           -DWITH_TESTING=OFF
+                           -DLITE_BUILD_EXTRA=ON
+                           -DLITE_WITH_XPU=${LITE_WITH_XPU}
+                           -DXPU_SDK_ROOT=${XPU_SDK_ROOT}
+                           -DLITE_WITH_ARM=ON)
+    ExternalProject_Add(
+      ${LITE_PROJECT}
+      ${EXTERNAL_PROJECT_LOG_ARGS}
+      GIT_REPOSITORY      "https://github.com/PaddlePaddle/Paddle-Lite.git"
+      GIT_TAG             ${LITE_GIT_TAG}
+      PREFIX              ${LITE_SOURCES_DIR}
+      PATCH_COMMAND       mkdir -p ${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code && touch ${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc
+      UPDATE_COMMAND      ""
+      BUILD_COMMAND       ${LITE_BUILD_COMMAND}
+      INSTALL_COMMAND     ""
+      CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                          -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
+                          -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                          -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                          -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                          -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                          -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                          -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                          ${EXTERNAL_OPTIONAL_ARGS}
+                          ${LITE_OPTIONAL_ARGS}
+    )
+    set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8)
+  else()
+    set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j)
+    set(LITE_OUTPUT_BIN_DIR inference_lite_lib)
    set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON
                           -DLITE_WITH_CUDA=${WITH_GPU}
                           -DWITH_MKLDNN=OFF
@@ -82,6 +127,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
                            ${EXTERNAL_OPTIONAL_ARGS}
                            ${LITE_OPTIONAL_ARGS}
    )
+  endif()
  ExternalProject_Get_property(${LITE_PROJECT} BINARY_DIR)
  ExternalProject_Get_property(${LITE_PROJECT} SOURCE_DIR)
  set(LITE_BINARY_DIR ${BINARY_DIR})
@@ -103,8 +149,8 @@ function(external_lite_libs alias path)
  endif()
 endfunction()
+# LITE_OUTPUT_BIN_DIR is assigned only while the external Lite project is being
+# configured. When LITE_SOURCE_DIR and LITE_BINARY_DIR are both supplied that
+# step is skipped, so fall back to the same per-platform directory name here
+# instead of expanding an empty path component below.
+if(NOT LITE_OUTPUT_BIN_DIR)
+  if(WITH_ARM)
+    set(LITE_OUTPUT_BIN_DIR inference_lite_lib.armlinux.armv8)
+  else()
+    set(LITE_OUTPUT_BIN_DIR inference_lite_lib)
+  endif()
+endif()
-external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+external_lite_libs(lite_full_static ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so)
-set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/cxx/lib/libpaddle_full_api_shared.so)
 add_definitions(-DPADDLE_WITH_LITE)
 add_definitions(-DLITE_WITH_LOG)
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -125,7 +125,7 @@ function(copy_part_of_thrid_party TARGET DST)
    if (LITE_BINARY_DIR)
        set(dst_dir "${DST}/third_party/install/lite")
        copy(${TARGET}
-                SRCS ${LITE_BINARY_DIR}/inference_lite_lib/*
+                SRCS ${LITE_BINARY_DIR}/${LITE_OUTPUT_BIN_DIR}/*
                DSTS ${dst_dir})
    endif()
 endfunction()

--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -219,6 +219,10 @@ struct Argument {
  DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t);
+  // Only used in paddle-lite subgraph.
+  DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads,
+                      int);
 private:
  std::unordered_set<std::string> valid_fields_;
 };

--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -151,6 +151,8 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("use_xpu", new bool(argument->use_xpu()));
      pass->Set("xpu_l3_workspace_size",
                new int(argument->xpu_l3_workspace_size()));
+      pass->Set("cpu_math_library_num_threads",
+                new int(argument->cpu_math_library_num_threads()));
    }
    disable_logs_ = argument->disable_logs();
    if (pass_name == "fc_fuse_pass") {

--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -244,6 +244,7 @@ void LiteSubgraphPass::SetUpEngine(
  bool enable_int8 = Get<bool>("enable_int8");
  bool use_xpu = Get<bool>("use_xpu");
  int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
+  int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
  lite_api::TargetType target_type;
  if (use_gpu) {
@@ -251,7 +252,11 @@ void LiteSubgraphPass::SetUpEngine(
  } else if (use_xpu) {
    target_type = TARGET(kXPU);
  } else {
+#ifdef PADDLE_WITH_ARM
+    target_type = TARGET(kARM);
+#else
    target_type = TARGET(kX86);
+#endif
  }
  paddle::lite_api::PrecisionType precision_type =
@@ -263,11 +268,12 @@ void LiteSubgraphPass::SetUpEngine(
      // Notice: The ordering here determines the device where the
      // input tensor of the Lite engine is located, and then affects
      // whether tensor sharing is feasible.
-      paddle::lite::Place({target_type, precision_type}),
+      paddle::lite_api::Place({target_type, precision_type}),
-      paddle::lite::Place({target_type, PRECISION(kInt64)}),
+      paddle::lite_api::Place({target_type, PRECISION(kInt64)}),
-      paddle::lite::Place({target_type, PRECISION(kFloat)}),
+      paddle::lite_api::Place({target_type, PRECISION(kFloat)}),
-      paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}),
  };
+  config.cpu_math_library_num_threads = cpu_math_library_num_threads;
  config.xpu_l3_workspace_size = xpu_l3_workspace_size;
  if (dump_model) {
    lite::StrToBinaryFile("./model.bin", config.model);

--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -356,7 +356,7 @@ void AnalysisConfig::Update() {
  }
  if (use_xpu_) {
-#ifndef PADDLE_WITH_XPU
+#ifndef LITE_SUBGRAPH_WITH_XPU
    PADDLE_THROW(platform::errors::Unavailable(
        "You tried to use an XPU device, but Paddle was not compiled "
        "with XPU-runtime."));

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -232,8 +232,17 @@ bool AnalysisPredictor::PrepareExecutor() {
 void AnalysisPredictor::MkldnnPreSet(const std::vector<PaddleTensor> &inputs) {
 #ifdef PADDLE_WITH_MKLDNN
-  VLOG(2) << "AnalysisPredictor::Run get_cur_mkldnn_session_id="
+  std::vector<std::vector<int>> inputs_shape;
-          << platform::get_cur_mkldnn_session_id();
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs_shape.emplace_back(inputs[i].shape);
+  }
+  MkldnnPreSet(inputs_shape);
+#endif
+}
+void AnalysisPredictor::MkldnnPreSet(
+    const std::vector<std::vector<int>> &inputs_shape) {
+#ifdef PADDLE_WITH_MKLDNN
  // In cache clearing mode.
  if (config_.mkldnn_cache_capacity_ > 0) {
    VLOG(2) << "In mkldnn cache clear mode.";
@@ -243,9 +252,9 @@ void AnalysisPredictor::MkldnnPreSet(const std::vector<PaddleTensor> &inputs) {
        config_.mkldnn_cache_capacity_);
    // Set current_input_shape for caching dynamic shape.
    std::stringstream ss;
-    for (size_t i = 0; i < inputs.size(); ++i) {
+    for (size_t i = 0; i < inputs_shape.size(); ++i) {
-      for (size_t j = 0; j < inputs[i].shape.size(); ++j) {
+      for (size_t j = 0; j < inputs_shape[i].size(); ++j) {
-        ss << inputs[i].shape[j] << "-";
+        ss << inputs_shape[i][j] << "-";
      }
    }
    VLOG(2) << "Set input shape=" << ss.str();
@@ -445,6 +454,8 @@ void AnalysisPredictor::PrepareArgument() {
  }
  if (config_.lite_engine_enabled()) {
+    argument_.SetCpuMathLibraryNumThreads(
+        config_.cpu_math_library_num_threads());
    argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
    argument_.SetLitePassesFilter(config_.lite_passes_filter_);
    argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
@@ -656,6 +667,18 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
 bool AnalysisPredictor::ZeroCopyRun() {
  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+#ifdef PADDLE_WITH_MKLDNN
+  if (config_.use_mkldnn_) {
+    std::vector<std::vector<int>> shape_vector;
+    auto names = GetInputNames();
+    for (size_t i = 0; i < names.size(); ++i) {
+      auto in_tensor = GetInputTensor(names[i]);
+      shape_vector.emplace_back(in_tensor->shape());
+    }
+    MkldnnPreSet(shape_vector);
+  }
+#endif
  executor_->Run();
  // Fix TensorArray reuse not cleaned bug.
  tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
@@ -664,6 +687,9 @@ bool AnalysisPredictor::ZeroCopyRun() {
  // recover the cpu_math_library_num_threads to 1, in order to avoid thread
  // conflict when integrating it into deployment service.
  paddle::platform::SetNumThreads(1);
+#ifdef PADDLE_WITH_MKLDNN
+  if (config_.use_mkldnn_) MkldnnPostReset();
+#endif
 #if defined(PADDLE_WITH_MKLML) && defined(_LINUX)
  // Frees unused memory allocated by the Intel® MKL Memory Allocator to
  // avoid memory leak. See:

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -311,6 +311,17 @@ class AnalysisPredictor : public PaddlePredictor {
  /// \param[in] inputs tensors
  ///
  void MkldnnPreSet(const std::vector<PaddleTensor> &inputs);
+  ///
+  /// \brief PreSet for Mkldnn multi-thread and dynamic shape input.
+  ///
+  /// Called by the PaddleTensor overload of MkldnnPreSet() and by
+  /// AnalysisPredictor::ZeroCopyRun().
+  ///
+  /// \param[in] inputs_shape shapes of the input tensors
+  ///
+  void MkldnnPreSet(const std::vector<std::vector<int>> &inputs_shape);
  ///
  /// \brief PostReset for Mkldnn multi-thread and dynamic shape input.
  ///

--- a/paddle/fluid/inference/lite/CMakeLists.txt
+++ b/paddle/fluid/inference/lite/CMakeLists.txt
@@ -4,6 +4,6 @@ endif()
 cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash)
 cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS})
-cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context)
+cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context ${XPU_DEPS})
 cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
 cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -16,12 +16,16 @@
 #define LITE_WITH_CUDA 1
 #endif
-#ifdef PADDLE_WITH_XPU
+#ifdef LITE_SUBGRAPH_WITH_XPU
 #define LITE_WITH_XPU 1
 #endif
+#ifndef PADDLE_WITH_ARM
+#define LITE_WITH_X86 1
+#endif
 #include "paddle/fluid/inference/lite/engine.h"
-#include "lite/api/paddle_use_passes.h"
+#include <utility>
 namespace paddle {
 namespace inference {
@@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const {
  return engines_.at(name).get() != nullptr;
 }
-paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
+paddle::lite_api::PaddlePredictor* EngineManager::Get(
+    const std::string& name) const {
  return engines_.at(name).get();
 }
+// Creates a Lite predictor from `cfg`, stores it in `engines_` under `name`
+// and returns a non-owning pointer to it; the manager keeps ownership.
+// The model and parameters are read from the in-memory buffers `cfg.model`
+// and `cfg.param`.
-paddle::lite::Predictor* EngineManager::Create(const std::string& name,
+paddle::lite_api::PaddlePredictor* EngineManager::Create(
-                                               const EngineConfig& cfg) {
+    const std::string& name, const EngineConfig& cfg) {
-  if (cfg.valid_places.front().target == TARGET(kCUDA)) {
+  // config info for predictor.
-#ifdef PADDLE_WITH_CUDA
+  paddle::lite_api::CxxConfig lite_cxx_config;
-    paddle::lite::Env<TARGET(kCUDA)>::Init();
+  lite_cxx_config.set_model_buffer(cfg.model.c_str(), cfg.model.size(),
+                                   cfg.param.c_str(), cfg.param.size());
+  lite_cxx_config.set_valid_places(cfg.valid_places);
+  // The CPU thread count is configured through a different setter on ARM
+  // than on x86; both are members of the same config object.
+#ifdef PADDLE_WITH_ARM
+  lite_cxx_config.set_threads(cfg.cpu_math_library_num_threads);
+#else
+  lite_cxx_config.set_x86_math_library_num_threads(
+      cfg.cpu_math_library_num_threads);
 #endif
-  } else if (cfg.valid_places.front().target == TARGET(kXPU)) {
-#ifdef PADDLE_WITH_XPU
+#ifdef LITE_SUBGRAPH_WITH_XPU
-    paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread =
+  lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
-        cfg.xpu_l3_workspace_size;
+      cfg.xpu_l3_workspace_size);
 #endif
-  }
-  auto* p = new paddle::lite::Predictor();
+  // create predictor
-  p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
+  std::shared_ptr<paddle::lite_api::PaddlePredictor> p =
-           cfg.model_type, cfg.model_from_memory);
+      paddle::lite_api::CreatePaddlePredictor(lite_cxx_config);
-  engines_[name].reset(p);
+  engines_[name] = std::move(p);
-  return p;
+  return engines_[name].get();
 }
 void EngineManager::DeleteAll() {
  for (auto& item : engines_) {
-    item.second.reset(nullptr);
+    item.second.reset();
  }
 }

--- a/paddle/fluid/inference/lite/engine.h
+++ b/paddle/fluid/inference/lite/engine.h
@@ -23,12 +23,9 @@
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wall"
 #include "lite/api/cxx_api.h"
+#include "lite/api/paddle_api.h"
 #include "lite/api/paddle_place.h"
-#include "lite/core/context.h"
+#include "lite/api/paddle_use_passes.h"
-#include "lite/core/device_info.h"
-#include "lite/core/memory.h"
-#include "lite/core/op_registry.h"
-#include "lite/core/tensor.h"
 #pragma GCC diagnostic pop
 namespace paddle {
@@ -38,25 +35,33 @@ namespace lite {
 struct EngineConfig {
  std::string model;
  std::string param;
-  paddle::lite::Place prefer_place;
+  std::vector<paddle::lite_api::Place> valid_places;
-  std::vector<paddle::lite::Place> valid_places;
  std::vector<std::string> neglected_passes;
  lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
  bool model_from_memory{true};
+  // for xpu
  size_t xpu_l3_workspace_size;
+  // for x86 or arm
+  int cpu_math_library_num_threads{1};
+  // for cuda
+  bool use_multi_stream{false};
 };
 class EngineManager {
 public:
  bool Empty() const;
  bool Has(const std::string& name) const;
-  paddle::lite::Predictor* Get(const std::string& name) const;
+  paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
-  paddle::lite::Predictor* Create(const std::string& name,
+  paddle::lite_api::PaddlePredictor* Create(const std::string& name,
                                            const EngineConfig& cfg);
  void DeleteAll();
 private:
-  std::unordered_map<std::string, std::unique_ptr<paddle::lite::Predictor>>
+  std::unordered_map<std::string,
+                     std::shared_ptr<paddle::lite_api::PaddlePredictor>>
      engines_;
 };

--- a/paddle/fluid/inference/lite/tensor_utils.cc
+++ b/paddle/fluid/inference/lite/tensor_utils.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/inference/lite/tensor_utils.h"
+#include <functional>
 #include <map>
 #include <memory>
 #include "paddle/fluid/framework/data_type.h"
@@ -45,6 +46,7 @@ platform::Place GetNativePlace(const TargetType& type, int id = 0) {
  switch (type) {
    case TargetType::kHost:
    case TargetType::kX86:
+    case TargetType::kARM:
      return platform::CPUPlace();
    case TargetType::kCUDA:
      return platform::CUDAPlace(id);
@@ -134,16 +136,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
  }
 }
-void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) {
+void* GetLiteTensorDataPtr(paddle::lite_api::Tensor* src,
+                           PrecisionType precision_type,
+                           TargetType target_type) {
+  void* res{nullptr};
+  switch (precision_type) {
+    case PrecisionType::kFloat:
+      res = static_cast<void*>(src->mutable_data<float>(target_type));
+      break;
+    case PrecisionType::kInt8:
+      res = static_cast<void*>(src->mutable_data<int8_t>(target_type));
+      break;
+    case PrecisionType::kInt32:
+      res = static_cast<void*>(src->mutable_data<int32_t>(target_type));
+      break;
+    case PrecisionType::kInt64:
+      res = static_cast<void*>(src->mutable_data<int64_t>(target_type));
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported precision type. Now only supports FP32, INT8, INT32 and "
+          "INT64."));
+      break;
+  }
+  return res;
+}
+// Returns the number of elements of a Lite tensor: the product of all
+// entries of tensor.shape() (1 when the shape is empty).
+int64_t GetLiteTensorNumel(const paddle::lite_api::Tensor& tensor) {
+  auto shape = tensor.shape();
+  // std::accumulate keeps its running result in the type of the initial
+  // value, so the seed must be int64_t; a plain `1` would narrow every
+  // partial product to int.
+  int64_t numel =
+      std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1),
+                      std::multiplies<int64_t>());
+  return numel;
+}
+void InitDstTensor(paddle::lite_api::Tensor* dst,
+                   const framework::LoDTensor& src) {
  // Currently, Lite needs to explicitly specify the target type of
  // the input tensor.
  constexpr int empty_size = 0;
-  dst->mutable_data(GetLiteTargetType(src.place()), empty_size);
+  dst->Resize({empty_size});
-  dst->set_precision(GetLitePrecisionType(src.type()));
+  GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
-  SetLoD(dst->mutable_lod(), src.lod());
+                       GetLiteTargetType(src.place()));
+  dst->SetPrecision(GetLitePrecisionType(src.type()));
+  paddle::lite::LoD lite_lod;
+  SetLoD(&lite_lod, src.lod());
+  dst->SetLoD(lite_lod);
 }
-void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
+void InitDstTensor(framework::LoDTensor* dst,
+                   const paddle::lite_api::Tensor& src) {
  constexpr framework::proto::VarType::Type dtype =
      framework::proto::VarType_Type_FP32;
  dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()),
@@ -152,7 +193,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
 }
 template <>
-void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
+void TensorCopyAsync(paddle::lite_api::Tensor* dst,
+                     const framework::LoDTensor& src,
                     const platform::DeviceContext& ctx) {
  InitDstTensor(dst, src);
  const platform::Place& src_place = src.place();
@@ -161,52 +203,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
      static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type());
  dst->Resize(framework::vectorize(src.dims()));
  const void* src_data = src.data<void>();
-  void* dst_data = dst->mutable_data(bytes);
+  void* dst_data{nullptr};
+  dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
+                                  GetLiteTargetType(src.place()));
  VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src
          << ", dst = " << dst << ", src_type = " << src.type();
  MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
-  VLOG(3) << "[Lite memory size] Bytes = " << dst->memory_size();
+  VLOG(3) << "[Lite memory size] Bytes = " << bytes;
 }
 template <>
-void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src,
+void TensorCopyAsync(framework::LoDTensor* dst,
+                     const paddle::lite_api::Tensor& src,
                     const platform::DeviceContext& ctx) {
-  dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize()));
+  dst->Resize(paddle::framework::make_ddim(src.shape()));
  InitDstTensor(dst, src);
  const platform::Place& src_place = GetNativePlace(src.target());
  const platform::Place& dst_place = dst->place();
-  const size_t bytes =
+  int64_t src_numel = GetLiteTensorNumel(src);
-      static_cast<size_t>(src.numel()) * framework::SizeOfType(dst->type());
+  const size_t bytes = src_numel * framework::SizeOfType(dst->type());
-  const void* src_data = src.raw_data();
+  const void* src_data = src.data<void>();
  // When Lite is ready, the source type needs to be modified here.
  void* dst_data = dst->mutable_data(dst_place, dst->type());
  VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src
          << ", dst = " << dst << ", src_type = " << dst->type();
  MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
-  VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size();
+  VLOG(3) << "[Lite memory size] Bytes = " << bytes;
 }
 template <>
-void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) {
+void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) {
-  const size_t bytes =
-      static_cast<size_t>(src->numel()) * framework::SizeOfType(src->type());
-  auto buf = std::make_shared<paddle::lite::Buffer>(paddle::lite::Buffer(
-      src->data<void>(), GetLiteTargetType(src->place()), src->memory_size()));
  dst->Resize(framework::vectorize(src->dims()));
-  dst->set_precision(GetLitePrecisionType(src->type()));
+  dst->ShareExternalMemory(src->data<void>(), src->memory_size(),
-  SetLoD(dst->mutable_lod(), src->lod());
+                           GetLiteTargetType(src->place()));
-  dst->ResetBuffer(buf, bytes);
+  dst->SetPrecision(GetLitePrecisionType(src->type()));
+  paddle::lite::LoD lite_lod;
+  SetLoD(&lite_lod, src->lod());
+  dst->SetLoD(lite_lod);
 }
 template <>
-void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) {
+void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) {
  constexpr framework::proto::VarType::Type dtype =
      framework::proto::VarType_Type_FP32;
-  void* src_raw_data = src->raw_data();
+  void* src_raw_data =
+      GetLiteTensorDataPtr(src, GetLitePrecisionType(dtype), src->target());
+  size_t memory_size = GetLiteTensorNumel(*src) * sizeof(float);
  std::shared_ptr<memory::allocation::Allocation> holder(
-      new memory::allocation::Allocation(src_raw_data, src->memory_size(),
+      new memory::allocation::Allocation(src_raw_data, memory_size,
                                         GetNativePlace(src->target())));
-  dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize()));
+  dst->Resize(paddle::framework::make_ddim(src->shape()));
  SetLoD(dst->mutable_lod(), src->lod());
  dst->ResetHolderWithType(holder, dtype);
 }

--- a/paddle/fluid/inference/lite/test_engine.cc
+++ b/paddle/fluid/inference/lite/test_engine.cc
@@ -101,10 +101,10 @@ TEST(EngineManager, engine) {
  config.model_from_memory = true;
  config.valid_places = {
 #ifdef PADDLE_WITH_CUDA
-      paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
 #endif
-      paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
-      paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}),
+      paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
  };
  LOG(INFO) << "Create EngineManager";
@@ -117,7 +117,7 @@ TEST(EngineManager, engine) {
  ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has(
                unique_key),
            true);
-  paddle::lite::Predictor* engine_0 =
+  paddle::lite_api::PaddlePredictor* engine_0 =
      inference::Singleton<inference::lite::EngineManager>::Global().Get(
          unique_key);
  CHECK_NOTNULL(engine_0);

--- a/paddle/fluid/inference/lite/test_tensor_utils.cc
+++ b/paddle/fluid/inference/lite/test_tensor_utils.cc
@@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) {
  EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC));
 }
+template <typename T>
+void test_lite_tensor_data_ptr(PrecisionType precision_type) {
+  void* GetLiteTensorDataPtr(paddle::lite_api::Tensor * src,
+                             PrecisionType precision_type,
+                             TargetType target_type);
+  const int count = 4;
+  paddle::lite::Tensor lite_tensor;
+  lite_tensor.Resize({count});
+  auto* lite_tensor_data = lite_tensor.mutable_data<T>();
+  for (size_t i = 0; i < count; ++i) {
+    lite_tensor_data[i] = i;
+  }
+  paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
+  T* data = static_cast<T*>(GetLiteTensorDataPtr(
+      &lite_api_tensor, precision_type, TargetType::kHost));
+  for (size_t i = 0; i < count; ++i) {
+    CHECK_EQ(data[i], static_cast<T>(i)) << "the i-th num is not correct.";
+  }
+}
+TEST(LiteEngineOp, GetLiteTensorDataPtr) {
+  test_lite_tensor_data_ptr<int64_t>(PrecisionType::kInt64);
+  test_lite_tensor_data_ptr<int32_t>(PrecisionType::kInt32);
+  test_lite_tensor_data_ptr<int8_t>(PrecisionType::kInt8);
+  EXPECT_ANY_THROW(test_lite_tensor_data_ptr<double>(PrecisionType::kUnk));
+}
 void test_tensor_copy(const platform::DeviceContext& ctx) {
  // Create LoDTensor.
  std::vector<float> vector({1, 2, 3, 4});
@@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
  lod_tensor.set_lod(lod);
  // Create lite::Tensor and copy.
  paddle::lite::Tensor lite_tensor;
-  TensorCopyAsync(&lite_tensor, lod_tensor, ctx);
+  paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
+  TensorCopyAsync(&lite_api_tensor, lod_tensor, ctx);
  // Copy to LoDTensor.
  framework::LoDTensor lod_tensor_n;
-  TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
+  TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
 #ifdef PADDLE_WITH_CUDA
  if (platform::is_gpu_place(ctx.GetPlace())) {
    platform::GpuStreamSync(
@@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) {
  lod_tensor.set_lod(lod);
  // Create lite::Tensor and share.
  paddle::lite::Tensor lite_tensor;
-  TensorDataShare(&lite_tensor, &lod_tensor);
+  paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
+  TensorDataShare(&lite_api_tensor, &lod_tensor);
  // Copy to LoDTensor.
  framework::LoDTensor lod_tensor_n;
-  TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
+  TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
  std::vector<float> result;
  TensorToVector(lod_tensor_n, ctx, &result);
  ASSERT_EQ(result, vector);

--- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
@@ -25,9 +25,13 @@ namespace inference {
 TEST(AnalysisPredictor, use_gpu) {
  std::string model_dir = FLAGS_infer_model + "/" + "model";
  AnalysisConfig config;
+#if defined(PADDLE_WITH_CUDA)
  config.EnableUseGpu(100, 0);
+#elif defined(LITE_SUBGRAPH_WITH_XPU)
+  config.EnableXpu(100);
+#endif
  config.SetModel(model_dir + "/model", model_dir + "/params");
-  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
+  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true);
  std::vector<PaddleTensor> inputs;
  auto predictor = CreatePaddlePredictor(config);
@@ -39,7 +43,7 @@ TEST(AnalysisPredictor, use_gpu) {
  std::vector<float> input(input_num, 1);
  PaddleTensor in;
-  in.shape = {1, 3, 318, 318};
+  in.shape = {batch, channel, height, width};
  in.data =
      PaddleBuf(static_cast<void*>(input.data()), input_num * sizeof(float));
  in.dtype = PaddleDType::FLOAT32;

--- a/paddle/fluid/operators/lite/lite_engine_op.h
+++ b/paddle/fluid/operators/lite/lite_engine_op.h
@@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase {
 private:
  std::vector<std::string> in_names_;
  std::vector<std::string> out_names_;
-  paddle::lite::Predictor *engine_;
+  paddle::lite_api::PaddlePredictor *engine_;
  framework::proto::VarType::Type precision_;
  bool use_gpu_;
  bool zero_copy_;
@@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase {
      framework::LoDTensor src_t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope,
                                                                  in_names_[i]);
-      paddle::lite::Tensor *dst_t = engine_->GetInput(i);
+      paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i));
      VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> "
              << engine_->GetInputNames()[i] << ")";
-      inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_);
+      inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_);
    }
 #ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(dev_place)) {
@@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase {
    engine_->Run();
    VLOG(3) << "lite engine run done";
    for (size_t i = 0; i < out_names_.size(); i++) {
-      paddle::lite::Tensor src_t = *(engine_->GetOutput(i));
+      paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i));
      framework::LoDTensor *dst_t =
          &inference::analysis::GetFromScope<framework::LoDTensor>(
              scope, out_names_[i]);

--- a/paddle/fluid/operators/lite/lite_engine_op_test.cc
+++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc
@@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) {
  inference::lite::EngineConfig config;
  config.valid_places = {
 #ifdef PADDLE_WITH_CUDA
-      paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
 #endif
-      paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}),
+      paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
-      paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
  };
  serialize_params(&(config.param), &scope, repetitive_params);
  config.model = program.Proto()->SerializeAsString();

--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -387,6 +387,8 @@ void BindAnalysisConfig(py::module *m) {
      .def("params_file", &AnalysisConfig::params_file)
      .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu,
           py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0)
+      .def("enable_xpu", &AnalysisConfig::EnableXpu,
+           py::arg("l3_workspace_size"))
      .def("disable_gpu", &AnalysisConfig::DisableGpu)
      .def("use_gpu", &AnalysisConfig::use_gpu)
      .def("gpu_device_id", &AnalysisConfig::gpu_device_id)
@@ -427,8 +429,8 @@ void BindAnalysisConfig(py::module *m) {
      .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled)
      .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
      .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine,
-           py::arg("zero_copy") = false,
           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
+           py::arg("zero_copy") = false,
           py::arg("passes_filter") = std::vector<std::string>(),
           py::arg("ops_filter") = std::vector<std::string>())
      .def("lite_engine_enabled", &AnalysisConfig::lite_engine_enabled)

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -261,6 +261,10 @@ else:
 if '${WITH_LITE}' == 'ON':
    shutil.copy('${LITE_SHARED_LIB}', libs_path)
    package_data['paddle.libs']+=['libpaddle_full_api_shared' + ext_name]
+    if '${XPU_SDK_ROOT}':
+        shutil.copy('${XPUAPI_LIB}', libs_path)
+        shutil.copy('${XPURT_LIB}', libs_path)
+        package_data['paddle.libs'] += ['libxpuapi.so', 'libxpurt.so']
 if '${WITH_PSLIB}' == 'ON':
    shutil.copy('${PSLIB_LIB}', libs_path)