BaiXuePrincess / Paddle · forked from PaddlePaddle / Paddle
Commit ad0dfb17 (unverified)
Authored on Jan 09, 2020 by 石晓伟; committed via GitHub on Jan 09, 2020

[Feature] Lite subgraph (#22114)

Parent: 7d10edc5
Showing 39 changed files with 1,909 additions and 11 deletions (+1909, -11).
Changed files:

CMakeLists.txt  (+1, -0)
cmake/external/lite.cmake  (+87, -0)
cmake/flags.cmake  (+0, -5)
cmake/system.cmake  (+5, -0)
cmake/third_party.cmake  (+4, -0)
paddle/fluid/inference/CMakeLists.txt  (+4, -0)
paddle/fluid/inference/analysis/argument.h  (+6, -0)
paddle/fluid/inference/analysis/ir_pass_manager.cc  (+11, -0)
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt  (+9, -0)
paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc  (+336, -0)
paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h  (+45, -0)
paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc  (+59, -0)
paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc  (+51, -1)
paddle/fluid/inference/analysis/ir_passes/subgraph_util.h  (+13, -2)
paddle/fluid/inference/api/analysis_config.cc  (+33, -0)
paddle/fluid/inference/api/analysis_predictor.cc  (+7, -0)
paddle/fluid/inference/api/paddle_analysis_config.h  (+14, -0)
paddle/fluid/inference/api/paddle_pass_builder.cc  (+7, -1)
paddle/fluid/inference/api/paddle_pass_builder.h  (+1, -0)
paddle/fluid/inference/lite/CMakeLists.txt  (+5, -0)
paddle/fluid/inference/lite/engine.cc  (+64, -0)
paddle/fluid/inference/lite/engine.h  (+55, -0)
paddle/fluid/inference/lite/op_teller.cc  (+92, -0)
paddle/fluid/inference/lite/op_teller.h  (+70, -0)
paddle/fluid/inference/lite/tensor_utils.cc  (+181, -0)
paddle/fluid/inference/lite/tensor_utils.h  (+33, -0)
paddle/fluid/inference/lite/test_engine.cc  (+132, -0)
paddle/fluid/inference/lite/test_tensor_utils.cc  (+116, -0)
paddle/fluid/inference/tests/api/CMakeLists.txt  (+6, -1)
paddle/fluid/inference/tests/api/lite_mul_model_test.cc  (+58, -0)
paddle/fluid/operators/CMakeLists.txt  (+5, -0)
paddle/fluid/operators/lite/CMakeLists.txt  (+2, -0)
paddle/fluid/operators/lite/lite_engine_op.cc  (+44, -0)
paddle/fluid/operators/lite/lite_engine_op.h  (+110, -0)
paddle/fluid/operators/lite/lite_engine_op_test.cc  (+115, -0)
paddle/fluid/operators/lite/ut_helper.h  (+111, -0)
paddle/fluid/platform/gpu_info.cc  (+11, -0)
paddle/fluid/platform/gpu_info.h  (+3, -0)
paddle/scripts/paddle_build.sh  (+3, -1)
CMakeLists.txt

@@ -86,6 +86,7 @@ option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface"
option(PY_VERSION       "Compile PaddlePaddle with python3 support"            ${PY_VERSION})
option(WITH_DGC         "Use DGC(Deep Gradient Compression) or not"            ${WITH_DISTRIBUTE})
option(SANITIZER_TYPE   "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
option(WITH_LITE        "Compile Paddle Fluid with Lite Engine" OFF)

# PY_VERSION
if(NOT PY_VERSION)
cmake/external/lite.cmake — new file (mode 100644)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LINUX OR NOT WITH_MKL)
  message("Paddle-lite will not build because the required Linux and MKL do not exist.")
  set(WITH_LITE OFF)
  return()
endif()

if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
  include(ExternalProject)
  set(LITE_PROJECT extern_lite)
  set(LITE_SOURCES_DIR ${THIRD_PARTY_PATH}/lite)
  set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)

  # No quotes, so cmake can resolve it as a command with arguments.
  set(LITE_BUILD_COMMAND $(MAKE) -j)
  set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON
                         -DLITE_WITH_CUDA=${WITH_GPU}
                         -DWITH_MKLDNN=OFF
                         -DLITE_WITH_X86=ON
                         -DLITE_WITH_PROFILE=OFF
                         -DWITH_LITE=OFF
                         -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF
                         -DWITH_PYTHON=OFF
                         -DWITH_TESTING=ON
                         -DLITE_BUILD_EXTRA=ON
                         -DCUDNN_ROOT=${CUDNN_ROOT}
                         -DLITE_WITH_ARM=OFF)

  ExternalProject_Add(
      ${LITE_PROJECT}
      ${EXTERNAL_PROJECT_LOG_ARGS}
      GIT_REPOSITORY      "https://github.com/PaddlePaddle/Paddle-Lite.git"
      GIT_TAG             947cda26637d46dc23f4e39d2b52e7d9a1fa6eef
      PREFIX              ${LITE_SOURCES_DIR}
      UPDATE_COMMAND      ""
      BUILD_COMMAND       ${LITE_BUILD_COMMAND}
      INSTALL_COMMAND     ""
      CMAKE_ARGS          -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                          -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                          -DCMAKE_CXX_FLAGS=${LITE_CMAKE_CXX_FLAGS}
                          -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
                          -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                          -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
                          -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
                          -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
                          -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                          -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                          ${EXTERNAL_OPTIONAL_ARGS}
                          ${LITE_OPTIONAL_ARGS})

  ExternalProject_Get_property(${LITE_PROJECT} BINARY_DIR)
  ExternalProject_Get_property(${LITE_PROJECT} SOURCE_DIR)
  set(LITE_BINARY_DIR ${BINARY_DIR})
  set(LITE_SOURCE_DIR ${SOURCE_DIR})
endif()

message(STATUS "Paddle-lite BINARY_DIR: ${LITE_BINARY_DIR}")
message(STATUS "Paddle-lite SOURCE_DIR: ${LITE_SOURCE_DIR}")
include_directories(${LITE_SOURCE_DIR})
include_directories(${LITE_BINARY_DIR})

function(external_lite_static_libs alias path)
  add_library(${alias} STATIC IMPORTED GLOBAL)
  SET_PROPERTY(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path})
  if(LITE_PROJECT)
    add_dependencies(${alias} ${LITE_PROJECT})
  endif()
endfunction()

external_lite_static_libs(lite_full_static ${LITE_BINARY_DIR}/lite/api/libapi_full_static.a)

add_definitions(-DPADDLE_WITH_LITE)
cmake/flags.cmake

@@ -207,11 +207,6 @@ if(LINUX)
      ${GPU_COMMON_FLAGS})
endif(LINUX)

if(UNIX AND NOT APPLE)
  # except apple from nix*Os family
  set(LINUX TRUE)
endif(UNIX AND NOT APPLE)

foreach(flag ${COMMON_FLAGS})
  safe_set_cflag(CMAKE_C_FLAGS ${flag})
  safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
cmake/system.cmake

@@ -20,6 +20,11 @@
# for instance, protobuf libs path is <install_dir>/lib64
# on CentOS, but <install_dir>/lib on other systems.

if(UNIX AND NOT APPLE)
  # except apple from nix*Os family
  set(LINUX TRUE)
endif(UNIX AND NOT APPLE)

IF(WIN32)
  SET(HOST_SYSTEM "win32")
ELSE(WIN32)
cmake/third_party.cmake

@@ -284,4 +284,8 @@ if(WITH_DGC)
    list(APPEND third_party_deps extern_dgc)
endif()

if(WITH_LITE)
    include(external/lite)
endif(WITH_LITE)

add_custom_target(third_party DEPENDS ${third_party_deps})
paddle/fluid/inference/CMakeLists.txt

@@ -21,6 +21,10 @@ if (ANAKIN_SUBGRAPH)
  add_subdirectory(anakin)
endif()

if (WITH_LITE)
  add_subdirectory(lite)
endif()

get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS)
paddle/fluid/inference/analysis/argument.h

@@ -197,6 +197,12 @@ struct Argument {
  DECL_ARGUMENT_FIELD(anakin_ops_filter, AnakinOpsFilter,
                      std::vector<std::string>);

  DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter,
                      std::vector<std::string>);
  DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector<std::string>);
  DECL_ARGUMENT_FIELD(lite_precision_mode, LitePrecisionMode,
                      AnalysisConfig::Precision);

  // Memory optimized related.
  DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
paddle/fluid/inference/analysis/ir_pass_manager.cc

@@ -128,6 +128,17 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("program",
                new framework::ProgramDesc *(&argument->main_program()));
    }
    if (pass_name == "lite_subgraph_pass") {
      bool enable_int8 =
          argument->lite_precision_mode() == AnalysisConfig::Precision::kInt8;
      pass->Set("program",
                new framework::ProgramDesc *(&argument->main_program()));
      pass->Set("lite_ops_filter",
                new std::vector<std::string>(argument->lite_ops_filter()));
      pass->Set("predictor_id", new int(argument->predictor_id()));
      pass->Set("enable_int8", new bool(enable_int8));
      pass->Set("use_gpu", new bool(argument->use_gpu()));
    }
    if (pass_name == "anakin_subgraph_pass") {
      pass->Set("program",
                new framework::ProgramDesc *(&argument->main_program()));
paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt

@@ -23,3 +23,12 @@ if (ANAKIN_SUBGRAPH)
  file(APPEND ${pass_file} "USE_PASS(anakin_subgraph_pass);\n")
  set(INFER_IR_PASSES ${INFER_IR_PASSES} anakin_subgraph_pass CACHE INTERNAL "")
endif()

if (WITH_LITE)
  cc_library(lite_subgraph_pass SRCS lite_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util lite_op_teller)
  set(analysis_deps ${analysis_deps} subgraph_util lite_subgraph_pass CACHE INTERNAL "")
  set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
  file(APPEND ${pass_file} "USE_PASS(lite_subgraph_pass);\n")
  set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "")
  cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog)
endif()
paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc — new file (mode 100644)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <fstream>
#include <iostream>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/lite/op_teller.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/inference/lite/engine.h"
namespace paddle {
namespace inference {
namespace analysis {

using framework::ir::Node;
using framework::ir::Agent;
using framework::ir::SubGraphFuser;
using framework::ir::Graph;

namespace lite {

std::string UniqueKey(const std::vector<std::string>& engine_inputs,
                      const std::vector<std::string>& engine_outputs,
                      const std::string& id) {
  std::string engine_hash_key = "";
  for (auto name : engine_inputs) {
    engine_hash_key += name;
  }
  for (auto name : engine_outputs) {
    engine_hash_key += name;
  }
  engine_hash_key += id;
  auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
  return engine_key;
}

std::vector<std::string> IOVarsFilter(const std::vector<Node*>& nodes) {
  std::set<std::string> names;
  for (const auto& node : nodes) {
    if (node->IsVar() && !node->Var()->Persistable()) {
      names.insert(node->Name());
    }
  }
  return std::vector<std::string>(names.begin(), names.end());
}

void StrToBinaryFile(const std::string& path, const std::string& str) {
  std::ofstream file(path.c_str(), std::ios::binary);
  file.write(str.c_str(), str.size());
  file.close();
}

void ModifyHostSubgraphOps(
    framework::ProgramDesc* host_program, framework::BlockDesc* host_sub_block,
    const std::vector<framework::OpDesc*>& subgraph_ops) {
  for (auto* op_desc : subgraph_ops) {
    auto* sub_block_op = host_sub_block->AppendOp();
    sub_block_op->CopyFrom(*op_desc);
    if (op_desc->HasAttr("sub_block")) {
      int32_t global_sub_id = host_sub_block->ID();
      auto* op_sub_block =
          host_program->MutableBlock(op_desc->GetBlockAttrId("sub_block"));
      op_sub_block->Proto()->set_parent_idx(global_sub_id);
    }
  }
}

void ModifyHostProgram(framework::ProgramDesc* host_program,
                       framework::BlockDesc* host_sub_block,
                       const std::unordered_set<Node*>& io_var_nodes,
                       const std::vector<framework::OpDesc*>& subgraph_ops) {
  for (auto* var_node : io_var_nodes) {
    auto* sub_block_var = host_sub_block->Var(var_node->Name());
    sub_block_var->Proto()->CopyFrom(*var_node->Var()->Proto());
  }
  ModifyHostSubgraphOps(host_program, host_sub_block, subgraph_ops);
}

void AppendLiteSubBlocks(const std::vector<framework::OpDesc*>& subgraph_ops,
                         framework::ProgramDesc* engine_program,
                         framework::ProgramDesc* host_program,
                         const int32_t host_sub_id) {
  std::unordered_map<int32_t, int32_t> sub_blocks_map;
  std::unordered_set<int32_t> copied_host_ids;
  sub_blocks_map[host_sub_id] = framework::kRootBlockIndex;
  std::function<void(const std::vector<framework::OpDesc*>&)> append_sub_blocks;
  append_sub_blocks = [&](const std::vector<framework::OpDesc*>& ops) {
    for (auto* op_desc : ops) {
      if (op_desc->HasAttr("sub_block")) {
        int32_t host_op_sub_id = op_desc->GetBlockAttrId("sub_block");
        if (copied_host_ids.count(host_op_sub_id)) continue;
        size_t engine_block_size = engine_program->Size();
        auto* host_op_sub_block = host_program->MutableBlock(host_op_sub_id);
        auto* engine_op_sub_block =
            engine_program->AppendBlock(*(op_desc->Block()));
        for (auto* var : host_op_sub_block->AllVars()) {
          auto* engine_var = engine_op_sub_block->Var(var->Name());
          engine_var->Proto()->CopyFrom(*var->Proto());
        }
        for (auto* op : host_op_sub_block->AllOps()) {
          auto* engine_op = engine_op_sub_block->AppendOp();
          engine_op->Proto()->CopyFrom(*op->Proto());
        }
        sub_blocks_map[host_op_sub_id] = engine_block_size;
        append_sub_blocks(host_op_sub_block->AllOps());
      }
    }
  };
  append_sub_blocks(subgraph_ops);
  for (size_t i = 0; i < engine_program->Size(); i++) {
    for (auto* op_desc : engine_program->Block(i).AllOps()) {
      if (op_desc->HasAttr("sub_block")) {
        int32_t id = op_desc->GetBlockAttrId("sub_block");
        op_desc->SetAttr("sub_block", sub_blocks_map[id]);
      }
    }
  }
}

// The modification of pass should be a process of framework::desc
// (initial) -> proto::desc (flush) -> framework::desc (final).
// Ir::Graph is limited to changing the main block, so the sub block
// needs to be processed here.
void ModifyEngineProgram(Node* merged_node,
                         framework::ProgramDesc* host_program,
                         framework::ProgramDesc* engine_program,
                         const int32_t host_sub_block_id,
                         const std::unordered_set<Node*>& io_var_nodes,
                         const std::vector<framework::OpDesc*>& subgraph_ops) {
  // 1. Fill the main block of lite program.
  framework::BlockDesc* engine_global_block =
      engine_program->MutableBlock(framework::kRootBlockIndex);
  PrependFeedOps(engine_global_block, IOVarsFilter(merged_node->inputs));
  for (auto* var_node : io_var_nodes) {
    framework::VarDesc* sub_block_var =
        engine_global_block->Var(var_node->Name());
    sub_block_var->Proto()->CopyFrom(*var_node->Var()->Proto());
  }
  for (auto* op_desc : subgraph_ops) {
    auto* sub_block_op = engine_global_block->AppendOp();
    sub_block_op->CopyFrom(*op_desc);
  }
  PrependFetchOps(engine_global_block, IOVarsFilter(merged_node->outputs));

  // 2. Append sub blocks in the lite program.
  AppendLiteSubBlocks(subgraph_ops, engine_program, host_program,
                      host_sub_block_id);
}

void OrganizeProgram(Node* merged_node, framework::ProgramDesc* host_program,
                     framework::ProgramDesc* engine_program,
                     std::vector<std::string>* repetitive_params) {
  std::vector<framework::ir::Node*>& subgraph = *Agent(merged_node).subgraph();
  PADDLE_ENFORCE_EQ(subgraph.empty(), false,
                    platform::errors::NotFound(
                        "No subgraph found in lite subgraph pass. Please use "
                        "the full model call from Analysis Predictor."));

  const framework::BlockDesc& host_global_block =
      host_program->Block(framework::kRootBlockIndex);
  framework::BlockDesc* host_sub_block =
      host_program->AppendBlock(host_global_block);

  string::PrettyLogDetail("--- detect a sub-graph with %d nodes",
                          subgraph.size());

  std::unordered_set<Node*> io_var_nodes = GetRelatedIOVarNodes(subgraph);
  for (const auto* node : io_var_nodes) {
    VLOG(3) << "IO Variable Name: " << node->Name();
  }

  std::vector<framework::OpDesc*> subgraph_ops;
  for (auto* op_node : subgraph) {
    subgraph_ops.push_back(op_node->Op());
  }

  ModifyHostProgram(host_program, host_sub_block, io_var_nodes, subgraph_ops);
  ModifyEngineProgram(merged_node, host_program, engine_program,
                      host_sub_block->ID(), io_var_nodes, subgraph_ops);
  *repetitive_params = ExtractParameters(io_var_nodes, true);
  for (const auto& param : *repetitive_params) {
    VLOG(3) << "Repetitive param: " << param;
  }

  host_program->Flush();
  engine_program->Flush();
}
}  // namespace lite

void LiteSubgraphPass::SetUpEngine(
    framework::ProgramDesc* program,
    const std::vector<std::string>& repetitive_params,
    const std::string& unique_key, bool dump_model) const {
  inference::lite::EngineConfig config;
  auto* scope = param_scope();

  // When the pass is started, only the persistent variables of the
  // main block are read. Fluid seems to allow persistence variables
  // in the sub block, but they are controlled by context, so the
  // support is suspended here.
  auto serialize_params = [](std::string* str, framework::Scope* scope,
                             const std::vector<std::string>& params) {
    std::ostringstream os;
    platform::CPUDeviceContext ctx;
    for (const auto& param : params) {
      VLOG(3) << "Serialize param: " << param;
      PADDLE_ENFORCE_NOT_NULL(
          scope->FindVar(param),
          platform::errors::NotFound(
              "Block should already have a '%s' variable", param));
      auto* tensor = scope->FindVar(param)->GetMutable<framework::LoDTensor>();
      framework::SerializeToStream(os, *tensor, ctx);
    }
    *str = os.str();
  };

  bool use_gpu = Get<bool>("use_gpu");
  bool enable_int8 = Get<bool>("enable_int8");
  lite_api::TargetType target_type = use_gpu ? TARGET(kCUDA) : TARGET(kX86);
  paddle::lite_api::PrecisionType precision_type =
      enable_int8 ? PRECISION(kInt8) : PRECISION(kInt64);
  serialize_params(&config.param, scope, repetitive_params);
  config.model = program->Proto()->SerializeAsString();
  config.valid_places = {
      paddle::lite::Place({target_type, precision_type}),
      paddle::lite::Place({target_type, PRECISION(kFloat)}),
      paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
  };
  if (dump_model) {
    lite::StrToBinaryFile("./model.bin", config.model);
    lite::StrToBinaryFile("./param.bin", config.param);
  }
  inference::Singleton<inference::lite::EngineManager>::Global().Create(
      unique_key, config);
}

void LiteSubgraphPass::BuildOperator(
    Node* merged_node, framework::ProgramDesc* global_program,
    std::vector<std::string>* repetitive_params) const {
  framework::ProgramDesc engine_program;

  const std::string id = std::to_string(Get<int>("predictor_id"));
  const std::vector<std::string> input_names =
      lite::IOVarsFilter(merged_node->inputs);
  const std::vector<std::string> output_names =
      lite::IOVarsFilter(merged_node->outputs);
  const std::string unique_key = lite::UniqueKey(input_names, output_names, id);

  lite::OrganizeProgram(merged_node, global_program, &engine_program,
                        repetitive_params);
  SetUpEngine(&engine_program, *repetitive_params, unique_key);

  auto* op_desc = merged_node->Op();
  op_desc->SetInput("Xs", input_names);
  op_desc->SetOutput("Ys", output_names);
  op_desc->SetType("lite_engine");
  op_desc->SetAttr("engine_key", unique_key);
  op_desc->SetAttr("enable_int8", Get<bool>("enable_int8"));
  op_desc->SetAttr("use_gpu", Get<bool>("use_gpu"));
}

void LiteSubgraphPass::ApplyImpl(framework::ir::Graph* graph) const {
  framework::ir::FusePassBase::Init("lite_subgraph_pass", graph);
  framework::ProgramDesc* global_program =
      Get<framework::ProgramDesc*>("program");

  auto& lite_ops_filter = Get<std::vector<std::string>>("lite_ops_filter");

  auto teller = [&lite_ops_filter](const Node* node) {
    if (!node->IsOp() || !node->Op())
      return false;
    else if (node->Op()->Type() == "feed" || node->Op()->Type() == "fetch")
      return false;
    else if (std::find(lite_ops_filter.begin(), lite_ops_filter.end(),
                       node->Op()->Type()) != lite_ops_filter.end())
      return false;
    return inference::lite::OpTeller::Global().Tell(node->Op()->Type(),
                                                    *node->Op());
  };

  SubGraphFuser fuser(graph, teller, 0 /* min_subgraph_size */, "lite_engine");
  fuser();

  std::vector<std::string> repetitive_params;
  for (auto* node : graph->Nodes()) {
    if (node->IsOp() && !Agent(node).subgraph()->empty()) {
      BuildOperator(node, global_program, &repetitive_params);
      std::unordered_set<const Node*> nodes2remove(
          Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
      framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
    }
  }

  std::unordered_set<const Node*> nodes2remove;
  for (auto* node : graph->Nodes()) {
    if (node->IsOp() && Agent(node).deleted()) {
      nodes2remove.insert(node);
    }
  }
  framework::ir::GraphSafeRemoveNodes(graph, nodes2remove);
  graph->Set(framework::ir::kRepetitiveParamAttr,
             new std::vector<std::string>(repetitive_params));
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle

REGISTER_PASS(lite_subgraph_pass,
              paddle::inference::analysis::LiteSubgraphPass);
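
The net effect of this pass is that each fused subgraph is replaced by a single lite_engine operator carrying the same Xs/Ys and an engine_key attribute, exactly as BuildOperator sets above. The following hand-built sketch only illustrates the shape of that resulting op description; the variable names and key value are hypothetical, not taken from the diff.

#include <string>
#include <vector>
#include "paddle/fluid/framework/program_desc.h"

// Sketch: an OpDesc shaped like the one the pass leaves behind.
void BuildLiteEngineOpSketch(paddle::framework::ProgramDesc* program) {
  auto* block = program->MutableBlock(0);
  auto* op = block->AppendOp();
  op->SetType("lite_engine");
  op->SetInput("Xs", std::vector<std::string>({"x", "y"}));  // hypothetical inputs
  op->SetOutput("Ys", std::vector<std::string>({"z"}));      // hypothetical output
  op->SetAttr("engine_key", std::string("engine_0"));        // hypothetical key
  op->SetAttr("enable_int8", false);
  op->SetAttr("use_gpu", false);
}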
paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h — new file (mode 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h"
namespace paddle {
namespace inference {
namespace analysis {

class LiteSubgraphPass : public framework::ir::FusePassBase {
 public:
  void ApplyImpl(framework::ir::Graph* graph) const override;

 private:
  void BuildOperator(framework::ir::Node* merged_node,
                     framework::ProgramDesc* global_program,
                     std::vector<std::string>* repetitive_params) const;

  void SetUpEngine(framework::ProgramDesc* program,
                   const std::vector<std::string>& repetitive_params,
                   const std::string& unique_key,
                   bool dump_model = false) const;
};

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass_tester.cc — new file (mode 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.h"
#include <gtest/gtest.h>
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/inference/lite/op_teller.h"
namespace paddle {
namespace inference {
namespace analysis {

namespace lite {
void StrToBinaryFile(const std::string& path, const std::string& str);
void ModifyHostSubgraphOps(framework::ProgramDesc* host_program,
                           framework::BlockDesc* host_sub_block,
                           const std::vector<framework::OpDesc*>& subgraph_ops);
void AppendLiteSubBlocks(const std::vector<framework::OpDesc*>& subgraph_ops,
                         framework::ProgramDesc* engine_program,
                         framework::ProgramDesc* host_program,
                         const int32_t host_sub_id);
}

TEST(LiteSubgraphPass, basic) {
  framework::ProgramDesc host_program;
  framework::ProgramDesc engine_program;
  framework::BlockDesc* host_main_block = host_program.MutableBlock(0);
  framework::BlockDesc* host_sub_block =
      host_program.AppendBlock(*host_main_block);
  framework::OpDesc* host_while_op = host_main_block->AppendOp();
  host_main_block->Var("var_main");
  host_sub_block->Var("var_sub");
  host_while_op->SetType("while");
  host_while_op->SetAttr("sub_block", host_sub_block);
  framework::OpDesc* host_sub_block_op = host_sub_block->AppendOp();
  host_sub_block_op->SetType("leaky_relu");

  CHECK(inference::lite::OpTeller::Global().Tell("while", *host_while_op))
      << "Lite operator teller test failed.";

  lite::AppendLiteSubBlocks({host_while_op}, &engine_program, &host_program,
                            host_sub_block->ID());
  lite::ModifyHostSubgraphOps(&host_program, host_sub_block, {host_while_op});
  lite::StrToBinaryFile("./", "test");
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc

@@ -26,7 +26,7 @@ namespace analysis {
using framework::ir::Node;

std::vector<std::string> ExtractParameters(
    const std::unordered_set<Node *> &nodes) {
    const std::unordered_set<Node *> &nodes, bool sorted) {
  // We can judge whether a variable is a parameter by
  // its presistable property, but sometimes the presistable
  // of the feed op output is true, so we have to identify it.

@@ -50,9 +50,59 @@ std::vector<std::string> ExtractParameters(
      parameters.push_back(node->Name());
    }
  }
  if (sorted) {
    std::sort(parameters.begin(), parameters.end());
    parameters.erase(std::unique(parameters.begin(), parameters.end()),
                     parameters.end());
  }
  return parameters;
}

std::unordered_set<Node *> GetRelatedIOVarNodes(
    const std::vector<Node *> &nodes) {
  std::unordered_set<Node *> io_nodes;
  for (const auto &node : nodes) {
    if (!node->IsOp()) continue;
    for (const auto &in : node->inputs) {
      io_nodes.insert(in);
    }
    for (const auto &out : node->outputs) {
      io_nodes.insert(out);
    }
  }
  return io_nodes;
}

void PrependFeedOps(framework::BlockDesc *global_block,
                    const std::vector<std::string> &feed_target_names,
                    std::string feed_holder_name) {
  framework::VarDesc *feed_var = global_block->Var(feed_holder_name);
  feed_var->SetType(paddle::framework::proto::VarType::FEED_MINIBATCH);
  feed_var->SetPersistable(true);
  for (size_t i = 0; i < feed_target_names.size(); i++) {
    framework::OpDesc *feed_op = global_block->AppendOp();
    feed_op->SetType("feed");
    feed_op->SetInput("X", {feed_holder_name});
    feed_op->SetOutput("Out", {feed_target_names[i]});
    feed_op->SetAttr("col", static_cast<int>(i));
  }
}

void PrependFetchOps(framework::BlockDesc *global_block,
                     const std::vector<std::string> &fetch_target_names,
                     std::string fetch_holder_name) {
  framework::VarDesc *fetch_var = global_block->Var(fetch_holder_name);
  fetch_var->SetType(paddle::framework::proto::VarType::FETCH_LIST);
  fetch_var->SetPersistable(true);
  for (size_t i = 0; i < fetch_target_names.size(); i++) {
    framework::OpDesc *fetch_op = global_block->AppendOp();
    fetch_op->SetType("fetch");
    fetch_op->SetInput("X", {fetch_target_names[i]});
    fetch_op->SetOutput("Out", {fetch_holder_name});
    fetch_op->SetAttr("col", static_cast<int>(i));
  }
}

void RenameAndGetOutputs(
    const std::vector<framework::ir::Node *> &subgraph_nodes,
    framework::BlockDesc *block_desc,
paddle/fluid/inference/analysis/ir_passes/subgraph_util.h

@@ -30,10 +30,21 @@ limitations under the License. */
namespace paddle {
namespace inference {
namespace analysis {
using framework::ir::Node;

std::vector<std::string> ExtractParameters(
    const std::unordered_set<Node *> &nodes);
    const std::unordered_set<framework::ir::Node *> &nodes,
    bool sorted = false);

std::unordered_set<framework::ir::Node *> GetRelatedIOVarNodes(
    const std::vector<framework::ir::Node *> &nodes);

void PrependFeedOps(framework::BlockDesc *global_block,
                    const std::vector<std::string> &feed_target_names,
                    std::string feed_holder_name = "feed");

void PrependFetchOps(framework::BlockDesc *global_block,
                     const std::vector<std::string> &fetch_target_names,
                     std::string fetch_holder_name = "fetch");

void RenameAndGetOutputs(
    const std::vector<framework::ir::Node *> &subgraph_nodes,
paddle/fluid/inference/api/analysis_config.cc

@@ -23,6 +23,7 @@
namespace paddle {

extern const std::vector<std::string> kTRTSubgraphPasses;
extern const std::vector<std::string> kAnakinSubgraphPasses;
extern const std::vector<std::string> kLiteSubgraphPasses;

PassStrategy *AnalysisConfig::pass_builder() const {
  if (!pass_builder_.get()) {

@@ -128,6 +129,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  CP_MEMBER(anakin_passes_filter_);
  CP_MEMBER(anakin_ops_filter_);

  CP_MEMBER(use_lite_);
  CP_MEMBER(lite_precision_mode_);
  CP_MEMBER(lite_passes_filter_);
  CP_MEMBER(lite_ops_filter_);

  // profile related.
  CP_MEMBER(with_profile_);

@@ -351,6 +357,20 @@ void AnalysisConfig::Update() {
    }
  }

  if (use_lite_) {
#ifndef PADDLE_WITH_LITE
    LOG(WARNING) << "You tried to enable the lite subgraph "
                    "but did not have the option -DWITH_LITE compiled.";
#endif
    pass_builder()->ClearPasses();
    for (const auto &pass : kLiteSubgraphPasses) {
      if (std::find(lite_passes_filter_.begin(), lite_passes_filter_.end(),
                    pass) == lite_passes_filter_.end()) {
        pass_builder()->AppendPass(pass);
      }
    }
  }

  if (ir_debug_) {
    pass_builder()->TurnOnDebug();
  }

@@ -395,6 +415,8 @@ std::string AnalysisConfig::SerializeInfoCache() {
  ss << cpu_math_library_num_threads_;

  ss << use_anakin_;
  ss << anakin_min_subgraph_size_;

  ss << use_lite_;

  return ss.str();
}

@@ -484,6 +506,17 @@ void AnalysisConfig::EnableAnakinEngine(
  Update();
}

void AnalysisConfig::EnableLiteEngine(
    AnalysisConfig::Precision precision_mode,
    const std::vector<std::string> &passes_filter,
    const std::vector<std::string> &ops_filter) {
  use_lite_ = true;
  lite_precision_mode_ = precision_mode;
  lite_passes_filter_ = passes_filter;
  lite_ops_filter_ = ops_filter;
  Update();
}

void AnalysisConfig::PartiallyRelease() {
  prog_file_.clear();
  prog_file_.shrink_to_fit();
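
For context, the new EnableLiteEngine switch is meant to be set on an AnalysisConfig before the predictor is built; Update() then replaces the pass list with kLiteSubgraphPasses. A minimal usage sketch follows; the model path and the CreatePaddlePredictor call are based on the existing AnalysisConfig API and are not part of this diff.

#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void EnableLiteSketch() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model");  // hypothetical model directory
  // Route supported operators to the Lite engine; both filters left empty.
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
  if (config.lite_engine_enabled()) {
    auto predictor = paddle::CreatePaddlePredictor(config);
    // ... run inference with the predictor as usual.
  }
}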
paddle/fluid/inference/api/analysis_predictor.cc

@@ -428,6 +428,13 @@ void AnalysisPredictor::PrepareArgument() {
    LOG(INFO) << "Anakin subgraph engine is enabled";
  }

  if (config_.lite_engine_enabled()) {
    argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
    argument_.SetLitePassesFilter(config_.lite_passes_filter_);
    argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
    LOG(INFO) << "Lite subgraph engine is enabled";
  }

  if (config_.use_mkldnn_) {
    LOG(INFO) << "MKLDNN is enabled";
    argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
paddle/fluid/inference/api/paddle_analysis_config.h

@@ -171,10 +171,19 @@ struct AnalysisConfig {
      std::vector<std::string> passes_filter = {},
      std::vector<std::string> ops_filter = {});

  void EnableLiteEngine(
      AnalysisConfig::Precision precision_mode = Precision::kFloat32,
      const std::vector<std::string>& passes_filter = {},
      const std::vector<std::string>& ops_filter = {});

  /** A boolean state indicating whether the Anakin sub-graph engine is used.
   */
  bool anakin_engine_enabled() const { return use_anakin_; }

  /** A boolean state indicating whether the Lite sub-graph engine is used.
   */
  bool lite_engine_enabled() const { return use_lite_; }

  /** \brief Control whether to debug IR graph analysis phase.
   *
   * This will generate DOT files for visualizing the computation graph after

@@ -350,6 +359,11 @@ struct AnalysisConfig {
  std::vector<std::string> anakin_passes_filter_;
  std::vector<std::string> anakin_ops_filter_;

  bool use_lite_{false};
  std::vector<std::string> lite_passes_filter_;
  std::vector<std::string> lite_ops_filter_;
  Precision lite_precision_mode_;

  // mkldnn related.
  int mkldnn_cache_capacity_{0};
  bool use_mkldnn_quantizer_{false};
paddle/fluid/inference/api/paddle_pass_builder.cc

@@ -105,6 +105,12 @@ const std::vector<std::string> kAnakinSubgraphPasses({
    "fc_gru_fuse_pass",              //
});

const std::vector<std::string> kLiteSubgraphPasses({
#ifdef PADDLE_WITH_LITE
    "lite_subgraph_pass",
#endif
});

GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
  passes_.assign({
      //   "identity_scale_op_clean_pass",           //

@@ -123,7 +129,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
        "conv_elementwise_add2_act_fuse_pass",   //
        "conv_elementwise_add_fuse_pass",        //
#endif                                           //
        "transpose_flatten_concat_fuse_pass",
        "transpose_flatten_concat_fuse_pass",  //
        // following pass should be located in the last, since it will
        // work on all fused ops.
        "runtime_context_cache_pass"
paddle/fluid/inference/api/paddle_pass_builder.h

@@ -163,5 +163,6 @@ class GpuPassStrategy : public PassStrategy {
extern const std::vector<std::string> kTRTSubgraphPasses;
extern const std::vector<std::string> kAnakinSubgraphPasses;
extern const std::vector<std::string> kLiteSubgraphPasses;

}  // namespace paddle
paddle/fluid/inference/lite/CMakeLists.txt — new file (mode 100644)

cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash)
cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto)
cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost)
cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)
paddle/fluid/inference/lite/engine.cc — new file (mode 100644)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_CUDA
#define LITE_WITH_CUDA 1
#endif
#include "paddle/fluid/inference/lite/engine.h"
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
namespace paddle {
namespace inference {
namespace lite {

bool EngineManager::Empty() const { return engines_.size() == 0; }

bool EngineManager::Has(const std::string& name) const {
  if (engines_.count(name) == 0) {
    return false;
  }
  return engines_.at(name).get() != nullptr;
}

paddle::lite::Predictor* EngineManager::Get(const std::string& name) const {
  return engines_.at(name).get();
}

paddle::lite::Predictor* EngineManager::Create(const std::string& name,
                                               const EngineConfig& cfg) {
  auto* p = new paddle::lite::Predictor();
#ifdef PADDLE_WITH_CUDA
  paddle::lite::Env<TARGET(kCUDA)>::Init();
#endif
  p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes,
           cfg.model_type, cfg.model_from_memory);
  engines_[name].reset(p);
  return p;
}

void EngineManager::DeleteAll() {
  for (auto& item : engines_) {
    item.second.reset(nullptr);
  }
}

}  // namespace lite
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/lite/engine.h — new file (mode 100644)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/api/cxx_api.h"
namespace paddle {
namespace inference {
namespace lite {

struct EngineConfig {
  std::string model;
  std::string param;
  paddle::lite::Place prefer_place;
  std::vector<paddle::lite::Place> valid_places;
  std::vector<std::string> neglected_passes;
  lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
  bool model_from_memory{true};
};

class EngineManager {
 public:
  bool Empty() const;
  bool Has(const std::string& name) const;
  paddle::lite::Predictor* Get(const std::string& name) const;
  paddle::lite::Predictor* Create(const std::string& name,
                                  const EngineConfig& cfg);
  void DeleteAll();

 private:
  std::unordered_map<std::string, std::unique_ptr<paddle::lite::Predictor>>
      engines_;
};

}  // namespace lite
}  // namespace inference
}  // namespace paddle
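
EngineManager is used as a process-wide singleton keyed by the engine_key the pass generates. A minimal sketch, assuming a serialized protobuf program and params are already in memory (the key name is hypothetical; see test_engine.cc below for how the unit test fabricates the model):

#include <string>
#include "paddle/fluid/inference/lite/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"

void EngineManagerSketch(const std::string& model, const std::string& param) {
  paddle::inference::lite::EngineConfig config;
  config.model = model;              // serialized ProgramDesc protobuf
  config.param = param;              // serialized persistable tensors
  config.model_from_memory = true;
  config.valid_places = {
      paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
      paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
  };
  auto& manager =
      paddle::inference::Singleton<paddle::inference::lite::EngineManager>::Global();
  // "engine_0" is a hypothetical key; the pass derives it from a hash of I/O names.
  paddle::lite::Predictor* engine = manager.Create("engine_0", config);
  // ... feed inputs through the Lite predictor, then tear everything down.
  (void)engine;
  manager.DeleteAll();
}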
paddle/fluid/inference/lite/op_teller.cc — new file (mode 100644)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <map>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/lite/op_teller.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace inference {
namespace lite {

// Just tell by the op_types.
struct SimpleOpTeller : public Teller {
  SimpleOpTeller() {
    const std::map<std::string, std::string>& op2path =
        OpKernelInfoCollector::Global().GetOp2PathDict();
    auto is_non_inst = [](const std::string& op) -> bool {
      const std::vector<std::string> ops = {"feed", "fetch", "while"};
      return std::find(ops.begin(), ops.end(), op) != ops.end();
    };
    for (const auto& op : op2path) {
      if (!is_non_inst(op.first)) {
        ops_.insert(op.first);
      }
    }
  }

  bool operator()(const std::string& op_type,
                  const framework::OpDesc& op_desc) override {
    return ops_.count(op_type);
  }

 private:
  std::unordered_set<std::string> ops_{};
};

struct SingleBlockOpTeller : public Teller {
  SingleBlockOpTeller() { ops_.insert("while"); }

  bool operator()(const std::string& op_type,
                  const framework::OpDesc& op_desc) override {
    if (ops_.count(op_type)) {
      SimpleOpTeller supported;
      const int id = op_desc.GetBlockAttrId("sub_block");
      const framework::BlockDesc& block_desc =
          op_desc.Block()->Program()->Block(id);
      const std::vector<framework::OpDesc*>& ops_sub_block =
          block_desc.AllOps();
      for (auto* op : ops_sub_block) {
        if (!supported(op->Type(), *op) &&
            !this->operator()(op->Type(), *op)) {
          return false;
        }
      }
      return true;
    }
    return false;
  }

 private:
  std::unordered_set<std::string> ops_;
};

bool OpTeller::Tell(const std::string& op_type,
                    const framework::OpDesc& desc) {
  for (auto& teller : tellers_) {
    if ((*teller)(op_type, desc)) return true;
  }
  return false;
}

OpTeller::OpTeller() {
  tellers_.emplace_back(new SimpleOpTeller);
  tellers_.emplace_back(new SingleBlockOpTeller);
}

}  // namespace lite
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/lite/op_teller.h — new file (mode 100644)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
namespace paddle {
namespace inference {
namespace lite {

/*
 * Single Op teller definition.
 * One can override this and define a more complex tell logic, considerring more
 * issues such as op_desc.
 */
struct Teller {
  virtual bool operator()(const std::string& op_type,
                          const framework::OpDesc& desc) = 0;

  virtual ~Teller() = default;
};

/*
 * A real example:
 *
 * struct SomeTeller : public Teller {
 *   bool operator()(const std::string& op_type,
 *                   const framework::OpDesc& desc) override {
 *     return op_type == "fc" && desc.Inputs().size() == 2;
 *   }
 * };
 */

/*
 * class OpTeller helps to tell whether a fluid
 * operator can be transformed to a TensorRT layer.
 */
class OpTeller {
 public:
  static OpTeller& Global() {
    static std::unique_ptr<OpTeller> x(new OpTeller);
    return *x;
  }

  bool Tell(const std::string& op_type, const framework::OpDesc& desc);

 private:
  OpTeller();

 private:
  std::vector<std::unique_ptr<Teller>> tellers_;
};

}  // namespace lite
}  // namespace inference
}  // namespace paddle
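
The subgraph pass queries OpTeller once per operator node. A small standalone sketch of such a query, mirroring the check done in lite_subgraph_pass_tester.cc; the op type probed here is only an illustration:

#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/inference/lite/op_teller.h"

bool CanOffloadReluSketch() {
  paddle::framework::ProgramDesc program;
  auto* block = program.MutableBlock(0);
  auto* op = block->AppendOp();
  op->SetType("relu");  // hypothetical op type to probe
  // Returns true only if some registered Teller accepts the op.
  return paddle::inference::lite::OpTeller::Global().Tell("relu", *op);
}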
paddle/fluid/inference/lite/tensor_utils.cc — new file (mode 100644)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/lite/tensor_utils.h"
#include <map>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/inference/lite/engine.h"
namespace paddle {
namespace inference {
namespace lite {
namespace utils {

using paddle::lite_api::TargetType;
using paddle::lite_api::PrecisionType;
using paddle::lite_api::DataLayoutType;

template <typename DstLoD, typename SrcLoD>
void SetLoD(DstLoD* dst, const SrcLoD& src) {
  dst->reserve(src.size());
  dst->clear();
  for (auto&& v : src) {
    dst->emplace_back(v);
  }
}
template void SetLoD<paddle::lite::LoD, framework::LoD>(
    paddle::lite::LoD* dst, const framework::LoD& src);
template void SetLoD<framework::LoD, paddle::lite::LoD>(
    framework::LoD* dst, const paddle::lite::LoD& src);

platform::Place GetNativePlace(const TargetType& type, int id = 0) {
  switch (type) {
    case TargetType::kHost:
    case TargetType::kX86:
      return platform::CPUPlace();
    case TargetType::kCUDA:
      return platform::CUDAPlace(id);
    default:
      LOG(FATAL) << "Error target type.";
      return platform::Place();
  }
}

TargetType GetLiteTargetType(const platform::Place& place) {
  if (platform::is_cpu_place(place)) {
    return TargetType::kHost;
  }
  return TargetType::kCUDA;
}

PrecisionType GetLitePrecisionType(framework::proto::VarType::Type type) {
  switch (type) {
    case framework::proto::VarType_Type_FP32:
      return PrecisionType::kFloat;
    case framework::proto::VarType_Type_INT8:
      return PrecisionType::kInt8;
    case framework::proto::VarType_Type_INT32:
      return PrecisionType::kInt32;
    case framework::proto::VarType_Type_INT64:
      return PrecisionType::kInt64;
    default:
      LOG(FATAL) << "Error precision type.";
      return PrecisionType::kUnk;
  }
}

framework::proto::VarType::Type GetNativePrecisionType(
    const PrecisionType& type) {
  switch (type) {
    case PrecisionType::kFloat:
      return framework::proto::VarType_Type_FP32;
    case PrecisionType::kInt8:
      return framework::proto::VarType_Type_INT8;
    case PrecisionType::kInt32:
      return framework::proto::VarType_Type_INT32;
    case PrecisionType::kInt64:
      return framework::proto::VarType_Type_INT64;
    default:
      LOG(FATAL) << "Error precision type.";
      return static_cast<framework::proto::VarType::Type>(-1);
  }
}

framework::DataLayout GetNativeLayoutType(const DataLayoutType& type) {
  switch (type) {
    case DataLayoutType::kNCHW:
      return framework::DataLayout::kNCHW;
    default:
      LOG(FATAL) << "Error layout type.";
      return static_cast<framework::DataLayout>(-1);
  }
}

void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
                     const platform::Place& src_place, const void* src_data,
                     const size_t size, const platform::DeviceContext& ctx) {
  const platform::CPUPlace cpu_place;
  if (platform::is_cpu_place(dst_place) && platform::is_cpu_place(src_place)) {
    memory::Copy(cpu_place, dst_data, cpu_place, src_data, size);
  } else {
#ifdef PADDLE_WITH_CUDA
    if (platform::is_cpu_place(dst_place) &&
        platform::is_gpu_place(src_place)) {
      LOG(FATAL) << "lite::MemoryCopy GPU->CPU is not yet implemented.";
    } else if (platform::is_gpu_place(dst_place) &&
               platform::is_cpu_place(src_place)) {
      LOG(FATAL) << "lite::MemoryCopy CPU->GPU is not yet implemented.";
    } else if (platform::is_gpu_place(dst_place) &&
               platform::is_gpu_place(src_place)) {
      auto gpu_place = boost::get<platform::CUDAPlace>(src_place);
      memory::Copy(
          gpu_place, dst_data, gpu_place, src_data, size,
          static_cast<const platform::CUDADeviceContext&>(ctx).stream());
    }
#else
    LOG(FATAL) << "You must define PADDLE_WITH_CUDA for using CUDAPlace.";
#endif
  }
}

void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) {
  // Currently, Lite needs to explicitly specify the target type of
  // the input tensor.
  constexpr int empty_size = 0;
  dst->mutable_data(GetLiteTargetType(src.place()), empty_size);
  dst->set_precision(GetLitePrecisionType(src.type()));
  SetLoD(dst->mutable_lod(), src.lod());
}

void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
  constexpr framework::proto::VarType::Type dtype =
      framework::proto::VarType_Type_FP32;
  dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()),
                    dtype);
  SetLoD(dst->mutable_lod(), src.lod());
}

template <>
void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
                     const platform::DeviceContext& ctx) {
  InitDstTensor(dst, src);
  const platform::Place& src_place = src.place();
  const platform::Place& dst_place = GetNativePlace(dst->target());
  const size_t bytes =
      static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type());
  dst->Resize(framework::vectorize(src.dims()));
  const void* src_data = src.data<void>();
  void* dst_data = dst->mutable_data(bytes);
  MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
}

template <>
void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src,
                     const platform::DeviceContext& ctx) {
  InitDstTensor(dst, src);
  const platform::Place& src_place = GetNativePlace(src.target());
  const platform::Place& dst_place = dst->place();
  dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize()));
  const size_t bytes =
      static_cast<size_t>(src.numel()) * framework::SizeOfType(dst->type());
  const void* src_data = src.raw_data();
  // When Lite is ready, the source type needs to be modified here.
  void* dst_data = dst->mutable_data(dst_place, dst->type());
  MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
}

}  // namespace utils
}  // namespace lite
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/lite/tensor_utils.h — new file (mode 100644)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/api/paddle_place.h"
#include "lite/core/tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
namespace paddle {
namespace inference {
namespace lite {
namespace utils {

template <typename DstTensor, typename SrcTensor>
void TensorCopyAsync(DstTensor* dst, const SrcTensor& src,
                     const platform::DeviceContext& ctx);

}  // namespace utils
}  // namespace lite
}  // namespace inference
}  // namespace paddle
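
TensorCopyAsync is the bridge between fluid LoDTensor and Lite tensors in both directions. A CPU-only sketch of the fluid-to-Lite direction, assuming float data and directly constructing a paddle::lite::Tensor (an assumption for illustration; inside Paddle the Lite tensors normally come from the Lite predictor):

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/lite/tensor_utils.h"
#include "paddle/fluid/platform/device_context.h"

void CopyToLiteSketch() {
  paddle::framework::LoDTensor src;
  src.Resize(paddle::framework::make_ddim({2, 4}));
  paddle::platform::CPUPlace place;
  float* data = src.mutable_data<float>(place);
  for (int i = 0; i < 8; ++i) data[i] = 1.0f;  // fill with dummy values

  paddle::lite::Tensor dst;
  paddle::platform::CPUDeviceContext ctx;
  // Initializes dst (target, precision, LoD) and copies the 2x4 float buffer.
  paddle::inference::lite::utils::TensorCopyAsync(&dst, src, ctx);
}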
paddle/fluid/inference/lite/test_engine.cc — new file (mode 100644)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/inference/lite/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/operators/lite/ut_helper.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace inference {
namespace lite {

using inference::lite::AddTensorToBlockDesc;
using inference::lite::CreateTensor;
using inference::lite::serialize_params;

void make_fake_model(std::string* model, std::string* param) {
  framework::ProgramDesc program;
  LOG(INFO) << "program.block size is " << program.Size();
  auto* block_ = program.Proto()->mutable_blocks(0);
  LOG(INFO) << "create block desc";
  framework::BlockDesc block_desc(&program, block_);

  auto* feed0 = block_desc.AppendOp();
  feed0->SetType("feed");
  feed0->SetInput("X", {"feed"});
  feed0->SetOutput("Out", {"x"});
  feed0->SetAttr("col", 0);
  auto* feed1 = block_desc.AppendOp();
  feed1->SetType("feed");
  feed1->SetInput("X", {"feed"});
  feed1->SetOutput("Out", {"y"});
  feed1->SetAttr("col", 1);

  LOG(INFO) << "create elementwise_add op";
  auto* elt_add = block_desc.AppendOp();
  elt_add->SetType("elementwise_add");
  elt_add->SetInput("X", std::vector<std::string>({"x"}));
  elt_add->SetInput("Y", std::vector<std::string>({"y"}));
  elt_add->SetOutput("Out", std::vector<std::string>({"z"}));
  elt_add->SetAttr("axis", -1);

  LOG(INFO) << "create fetch op";
  auto* fetch = block_desc.AppendOp();
  fetch->SetType("fetch");
  fetch->SetInput("X", std::vector<std::string>({"z"}));
  fetch->SetOutput("Out", std::vector<std::string>({"out"}));
  fetch->SetAttr("col", 0);

  // Set inputs' variable shape in BlockDesc
  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4}), true);
  AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({2, 4}), true);
  AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 4}), false);
  AddTensorToBlockDesc(block_, "out", std::vector<int64_t>({2, 4}), false);

  *block_->add_ops() = *feed0->Proto();
  *block_->add_ops() = *feed1->Proto();
  *block_->add_ops() = *elt_add->Proto();
  *block_->add_ops() = *fetch->Proto();

  framework::Scope scope;
#ifdef PADDLE_WITH_CUDA
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
#else
  platform::CPUPlace place;
  platform::CPUDeviceContext ctx(place);
#endif
  // Prepare variables.
  std::vector<std::string> repetitive_params{"x", "y"};
  CreateTensor(&scope, "x", std::vector<int64_t>({2, 4}));
  CreateTensor(&scope, "y", std::vector<int64_t>({2, 4}));
  ASSERT_EQ(block_->ops_size(), 4);
  *model = program.Proto()->SerializeAsString();
  serialize_params(param, &scope, repetitive_params);
}

TEST(EngineManager, engine) {
  ASSERT_EQ(
      inference::Singleton<inference::lite::EngineManager>::Global().Empty(),
      true);

  inference::lite::EngineConfig config;
  make_fake_model(&(config.model), &(config.param));
  LOG(INFO) << "prepare config";

  const std::string unique_key("engine_0");
  config.model_from_memory = true;
  config.valid_places = {
#ifdef PADDLE_WITH_CUDA
      paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}),
#endif
      paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
      paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}),
  };

  LOG(INFO) << "Create EngineManager";
  inference::Singleton<inference::lite::EngineManager>::Global().Create(
      unique_key, config);
  LOG(INFO) << "Create EngineManager done";
  ASSERT_EQ(
      inference::Singleton<inference::lite::EngineManager>::Global().Empty(),
      false);
  ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has(
                unique_key),
            true);
  paddle::lite::Predictor* engine_0 =
      inference::Singleton<inference::lite::EngineManager>::Global().Get(
          unique_key);
  CHECK_NOTNULL(engine_0);
  inference::Singleton<inference::lite::EngineManager>::Global().DeleteAll();
  CHECK(inference::Singleton<inference::lite::EngineManager>::Global().Get(
            unique_key) == nullptr)
      << "the engine_0 should be nullptr";
}

}  // namespace lite
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/lite/test_tensor_utils.cc
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/lite/tensor_utils.h"
namespace paddle {
namespace inference {
namespace lite {
namespace utils {

using paddle::lite_api::TargetType;
using paddle::lite_api::PrecisionType;
using paddle::lite_api::DataLayoutType;

TEST(LiteEngineOp, GetNativePlace) {
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  platform::Place GetNativePlace(const TargetType& type, int id = 0);
  EXPECT_TRUE(platform::is_cpu_place(GetNativePlace(TargetType::kHost)));
  EXPECT_TRUE(platform::is_gpu_place(GetNativePlace(TargetType::kCUDA)));
  ASSERT_DEATH(GetNativePlace(TargetType::kUnk), "");
}

TEST(LiteEngineOp, GetLiteTargetType) {
  TargetType GetLiteTargetType(const platform::Place& place);
  ASSERT_EQ(GetLiteTargetType(platform::CPUPlace()), TargetType::kHost);
  ASSERT_EQ(GetLiteTargetType(platform::CUDAPlace(0)), TargetType::kCUDA);
}

TEST(LiteEngineOp, GetLitePrecisionType) {
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  PrecisionType GetLitePrecisionType(framework::proto::VarType::Type type);
  ASSERT_EQ(GetLitePrecisionType(framework::proto::VarType_Type_FP32),
            PrecisionType::kFloat);
  ASSERT_EQ(GetLitePrecisionType(framework::proto::VarType_Type_INT8),
            PrecisionType::kInt8);
  ASSERT_EQ(GetLitePrecisionType(framework::proto::VarType_Type_INT32),
            PrecisionType::kInt32);
  ASSERT_DEATH(
      GetLitePrecisionType(framework::proto::VarType_Type_SELECTED_ROWS), "");
}

TEST(LiteEngineOp, GetNativePrecisionType) {
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  framework::proto::VarType::Type GetNativePrecisionType(
      const PrecisionType& type);
  ASSERT_EQ(GetNativePrecisionType(PrecisionType::kFloat),
            framework::proto::VarType_Type_FP32);
  ASSERT_EQ(GetNativePrecisionType(PrecisionType::kInt8),
            framework::proto::VarType_Type_INT8);
  ASSERT_EQ(GetNativePrecisionType(PrecisionType::kInt32),
            framework::proto::VarType_Type_INT32);
  ASSERT_DEATH(GetNativePrecisionType(PrecisionType::kUnk), "");
}

TEST(LiteEngineOp, GetNativeLayoutType) {
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  framework::DataLayout GetNativeLayoutType(const DataLayoutType& type);
  ASSERT_EQ(GetNativeLayoutType(DataLayoutType::kNCHW),
            framework::DataLayout::kNCHW);
  ASSERT_DEATH(GetNativeLayoutType(DataLayoutType::kNHWC), "");
}

void test_tensor_copy(const platform::DeviceContext& ctx) {
  // Create LoDTensor.
  std::vector<float> vector({1, 2, 3, 4});
  framework::LoDTensor lod_tensor;
  framework::TensorFromVector(vector, &lod_tensor);
  framework::LoD lod({{0, 2, 4}});
  lod_tensor.Resize({4, 1});
  lod_tensor.set_lod(lod);
  // Create lite::Tensor and copy.
  paddle::lite::Tensor lite_tensor;
  TensorCopyAsync(&lite_tensor, lod_tensor, ctx);
  // Copy to LoDTensor.
  framework::LoDTensor lod_tensor_n;
  TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx);
#ifdef PADDLE_WITH_CUDA
  if (platform::is_gpu_place(ctx.GetPlace())) {
    platform::GpuStreamSync(
        static_cast<const platform::CUDADeviceContext&>(ctx).stream());
  }
#endif
  std::vector<float> result;
  TensorToVector(lod_tensor_n, &result);
  ASSERT_EQ(result, vector);
  ASSERT_EQ(lod_tensor_n.lod(), lod_tensor.lod());
}

TEST(LiteEngineOp, TensorCopyAsync) {
  auto* ctx_cpu =
      platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
  test_tensor_copy(*ctx_cpu);
#ifdef PADDLE_WITH_CUDA
  auto* ctx_gpu =
      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0));
  test_tensor_copy(*ctx_gpu);
#endif
}

}  // namespace utils
}  // namespace lite
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/tests/api/CMakeLists.txt
...
...
@@ -352,7 +352,6 @@ if(WITH_GPU AND TENSORRT_FOUND)
  inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc
    EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
    ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models)
  set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant_small_model")
  if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR})
    inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "quant_small_model.tar.gz")
...
...
@@ -362,6 +361,12 @@ if(WITH_GPU AND TENSORRT_FOUND)
    ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR})
endif()

set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
download_data(${LITE_MODEL_INSTALL_DIR} "mul_model_fp32.tgz")
inference_analysis_test(lite_mul_model_test SRCS lite_mul_model_test.cc
  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
  ARGS --infer_model=${LITE_MODEL_INSTALL_DIR})

inference_analysis_test(test_analyzer_capi SRCS analyzer_capi_tester.cc
  EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_fluid_c
  ARGS --infer_model=${RESNET50_MODEL_DIR}/model)
...
...
paddle/fluid/inference/tests/api/lite_mul_model_test.cc
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
namespace paddle {
namespace inference {

TEST(AnalysisPredictor, use_gpu) {
  std::string model_dir = FLAGS_infer_model + "/" + "mul_model";
  AnalysisConfig config;
  config.EnableUseGpu(100, 0);
  config.SetModel(model_dir);
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);

  std::vector<PaddleTensor> inputs;
  auto predictor = CreatePaddlePredictor(config);
  std::vector<float> input({1});

  PaddleTensor in;
  in.shape = {1, 1};
  in.data = PaddleBuf(static_cast<void*>(input.data()), 1 * sizeof(float));
  in.dtype = PaddleDType::FLOAT32;
  inputs.emplace_back(in);

  std::vector<PaddleTensor> outputs;
  ASSERT_TRUE(predictor->Run(inputs, &outputs));

  const std::vector<float> truth_values = {
      -0.00621776, -0.00620937, 0.00990623,  -0.0039817, -0.00074315,
      0.61229795,  -0.00491806, -0.00068755, 0.18409646, 0.30090684};

  const size_t expected_size = 1;
  EXPECT_EQ(outputs.size(), expected_size);
  float* data_o = static_cast<float*>(outputs[0].data.data());
  for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) {
    EXPECT_LT(std::abs(data_o[j] - truth_values[j]), 10e-6);
  }
}

}  // namespace inference
}  // namespace paddle
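For reference, the configuration this test performs before running inference can be condensed into the short sketch below; it simply repeats the calls used in the test, and the model directory string is a placeholder.

// Sketch of enabling the Lite subgraph engine on an AnalysisConfig (mirrors the test above).
paddle::AnalysisConfig config;
config.EnableUseGpu(100 /* init memory pool in MB */, 0 /* gpu id */);
config.SetModel("/path/to/mul_model");  // placeholder model directory
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32);
auto predictor = paddle::CreatePaddlePredictor(config);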
paddle/fluid/operators/CMakeLists.txt
...
...
@@ -38,7 +38,12 @@ if (ANAKIN_SUBGRAPH)
  add_subdirectory(anakin)
endif()

if (WITH_LITE)
  add_subdirectory(lite)
endif()

SET(OP_HEADER_DEPS xxhash executor)
if (WITH_GPU)
  SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
endif()
...
...
paddle/fluid/operators/lite/CMakeLists.txt
0 → 100644
op_library(lite_engine_op DEPS lite_engine lite_tensor_utils)
cc_test(test_lite_engine_op SRCS lite_engine_op_test.cc DEPS lite_engine_op analysis)
paddle/fluid/operators/lite/lite_engine_op.cc
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/lite/lite_engine_op.h"
#include <string>
#include <vector>
namespace paddle {
namespace operators {

class LiteEngineOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("Xs", "A list of inputs.").AsDuplicable();
    AddOutput("Ys", "A list of outputs.").AsDuplicable();
    AddAttr<std::string>(
        "engine_key",
        "The engine_key here is used to distinguish different Lite Engines");
    AddComment("Lite engine operator.");
  }
};

class LiteInferVarType : public framework::VarTypeInference {
 public:
  void operator()(framework::InferVarTypeContext *ctx) const override {}
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(lite_engine, ops::LiteEngineOp, ops::LiteEngineOpMaker);
paddle/fluid/operators/lite/lite_engine_op.h
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/inference/lite/engine.h"
#include "paddle/fluid/inference/lite/tensor_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
namespace operators {

class LiteEngineOp : public framework::OperatorBase {
 private:
  std::vector<std::string> in_names_;
  std::vector<std::string> out_names_;
  paddle::lite::Predictor *engine_;
  framework::proto::VarType::Type precision_;
  bool use_gpu_;

 public:
  LiteEngineOp(const std::string &type,
               const framework::VariableNameMap &inputs,
               const framework::VariableNameMap &outputs,
               const framework::AttributeMap &attrs)
      : framework::OperatorBase(type, inputs, outputs, attrs) {
    in_names_ = Inputs("Xs");
    out_names_ = Outputs("Ys");
    engine_ =
        inference::Singleton<inference::lite::EngineManager>::Global().Get(
            Attr<std::string>("engine_key"));
    if (Attr<bool>("enable_int8")) {
      precision_ = framework::proto::VarType_Type_INT8;
    } else {
      precision_ = framework::proto::VarType_Type_FP32;
    }
    use_gpu_ = Attr<bool>("use_gpu");
  }

 protected:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &dev_place) const override {
    Execute(scope, dev_place);
  }

  void Execute(const framework::Scope &scope,
               const platform::Place &dev_place) const {
    const platform::DeviceContext *ctx =
        platform::DeviceContextPool::Instance().Get(dev_place);
    for (size_t i = 0; i < in_names_.size(); i++) {
      const framework::LoDTensor &src_t =
          inference::analysis::GetFromScope<framework::LoDTensor>(
              scope, in_names_[i]);
      paddle::lite::Tensor *dst_t = engine_->GetInput(i);
      VLOG(3) << "fluid -> lite: " << in_names_[i];
      inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx);
    }
#ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(dev_place)) {
      platform::GpuStreamSync(
          static_cast<const platform::CUDADeviceContext *>(ctx)->stream());
    }
#endif
    VLOG(3) << "lite engine run";
    engine_->Run();
    VLOG(3) << "lite engine run done";
    for (size_t i = 0; i < out_names_.size(); i++) {
      const paddle::lite::Tensor &src_t = *(engine_->GetOutput(i));
      framework::LoDTensor *dst_t =
          &inference::analysis::GetFromScope<framework::LoDTensor>(
              scope, out_names_[i]);
      VLOG(3) << "lite -> fluid: " << out_names_[i];
      inference::lite::utils::TensorCopyAsync(dst_t, src_t, *ctx);
    }
#ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(dev_place)) {
      platform::GpuStreamSync(
          static_cast<const platform::CUDADeviceContext *>(ctx)->stream());
    }
#endif
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/lite/lite_engine_op_test.cc
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/operators/lite/lite_engine_op.h"
#include "paddle/fluid/operators/lite/ut_helper.h"
USE_NO_KERNEL_OP(lite_engine)

using paddle::inference::lite::AddTensorToBlockDesc;
using paddle::inference::lite::CreateTensor;
using paddle::inference::lite::serialize_params;

namespace paddle {
namespace operators {

TEST(LiteEngineOp, engine_op) {
  framework::ProgramDesc program;
  auto* block_ = program.Proto()->mutable_blocks(0);
  framework::BlockDesc block_desc(&program, block_);

  auto* feed0 = block_desc.AppendOp();
  feed0->SetType("feed");
  feed0->SetInput("X", {"feed"});
  feed0->SetOutput("Out", {"x"});
  feed0->SetAttr("col", 0);

  auto* feed1 = block_desc.AppendOp();
  feed1->SetType("feed");
  feed1->SetInput("X", {"feed"});
  feed1->SetOutput("Out", {"y"});
  feed1->SetAttr("col", 1);

  LOG(INFO) << "create elementwise_add op";
  auto* elt_add = block_desc.AppendOp();
  elt_add->SetType("elementwise_add");
  elt_add->SetInput("X", std::vector<std::string>({"x"}));
  elt_add->SetInput("Y", std::vector<std::string>({"y"}));
  elt_add->SetOutput("Out", std::vector<std::string>({"z"}));
  elt_add->SetAttr("axis", -1);

  LOG(INFO) << "create fetch op";
  auto* fetch = block_desc.AppendOp();
  fetch->SetType("fetch");
  fetch->SetInput("X", std::vector<std::string>({"z"}));
  fetch->SetOutput("Out", std::vector<std::string>({"out"}));
  fetch->SetAttr("col", 0);

  // Set inputs' variable shape in BlockDesc
  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4}), true);
  AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({2, 4}), true);
  AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 4}), false);
  AddTensorToBlockDesc(block_, "out", std::vector<int64_t>({2, 4}), false);

  *block_->add_ops() = *feed1->Proto();
  *block_->add_ops() = *feed0->Proto();
  *block_->add_ops() = *elt_add->Proto();
  *block_->add_ops() = *fetch->Proto();

  framework::Scope scope;
#ifdef PADDLE_WITH_CUDA
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
#else
  platform::CPUPlace place;
  platform::CPUDeviceContext ctx(place);
#endif
  // Prepare variables.
  CreateTensor(&scope, "x", std::vector<int64_t>({2, 4}), false);
  CreateTensor(&scope, "y", std::vector<int64_t>({2, 4}), false);
  CreateTensor(&scope, "out", std::vector<int64_t>({2, 4}), false);

  ASSERT_EQ(block_->ops_size(), 4);

  std::vector<std::string> repetitive_params{"x", "y"};
  inference::lite::EngineConfig config;
  config.valid_places = {
#ifdef PADDLE_WITH_CUDA
      paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}),
#endif
      paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}),
      paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}),
  };
  serialize_params(&(config.param), &scope, repetitive_params);
  config.model = program.Proto()->SerializeAsString();

  LOG(INFO) << "create lite_engine desc";
  framework::OpDesc engine_op_desc(nullptr);
  engine_op_desc.SetType("lite_engine");
  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x", "y"}));
  engine_op_desc.SetOutput("Ys", std::vector<std::string>({"out"}));
  std::string engine_key = "engine_0";
  engine_op_desc.SetAttr("engine_key", engine_key);
  engine_op_desc.SetAttr("enable_int8", false);
  engine_op_desc.SetAttr("use_gpu", true);
  engine_op_desc.SetBlockAttr("sub_block", &block_desc);

  inference::Singleton<inference::lite::EngineManager>::Global().Create(
      engine_key, config);
  LOG(INFO) << "create engine op";
  auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
  LOG(INFO) << "engine_op " << engine_op.get();
  // Execute them.
  LOG(INFO) << "engine_op run";
  engine_op->Run(scope, place);
  LOG(INFO) << "done";
}

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/lite/ut_helper.h
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <gtest/gtest.h>
#include <random>
#include <string>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/analysis/helper.h"
namespace paddle {
namespace inference {
namespace lite {

void AddTensorToBlockDesc(framework::proto::BlockDesc* block,
                          const std::string& name,
                          const std::vector<int64_t>& shape,
                          bool persistable = false) {
  using framework::proto::VarType;
  auto* var = block->add_vars();
  framework::VarDesc desc(name);
  desc.SetType(VarType::LOD_TENSOR);
  desc.SetDataType(VarType::FP32);
  desc.SetShape(shape);
  desc.SetPersistable(persistable);
  *var = *desc.Proto();
}

void serialize_params(std::string* str, framework::Scope* scope,
                      const std::vector<std::string>& params) {
  std::ostringstream os;
#ifdef PADDLE_WITH_CUDA
  platform::CUDAPlace place;
  platform::CUDADeviceContext ctx(place);
#else
  platform::CPUDeviceContext ctx;
#endif
  for (const auto& param : params) {
    PADDLE_ENFORCE_NOT_NULL(
        scope->FindVar(param),
        platform::errors::NotFound("Block should already have a '%s' variable",
                                   param));
    auto* tensor = scope->FindVar(param)->GetMutable<framework::LoDTensor>();
    framework::SerializeToStream(os, *tensor, ctx);
  }
  *str = os.str();
}

/*
 * Get a random float value between [low, high]
 */
float random(float low, float high) {
  // static std::random_device rd;
  static std::mt19937 mt(100);
  std::uniform_real_distribution<double> dist(low, high);
  return dist(mt);
}

void RandomizeTensor(framework::LoDTensor* tensor,
                     const platform::Place& place) {
  auto dims = tensor->dims();
  size_t num_elements = analysis::AccuDims(dims, dims.size());
  PADDLE_ENFORCE_GT(num_elements, 0,
                    platform::errors::InvalidArgument(
                        "The input tensor dimension of the randomized tensor "
                        "function should be greater than zero."));
  platform::CPUPlace cpu_place;
  framework::LoDTensor temp_tensor;
  temp_tensor.Resize(dims);
  auto* temp_data = temp_tensor.mutable_data<float>(cpu_place);
  for (size_t i = 0; i < num_elements; i++) {
    *(temp_data + i) = random(0., 1.);
  }
  TensorCopySync(temp_tensor, place, tensor);
}

void CreateTensor(framework::Scope* scope, const std::string& name,
                  const std::vector<int64_t>& shape, bool in_cuda = true) {
  auto* var = scope->Var(name);
  auto* tensor = var->GetMutable<framework::LoDTensor>();
  auto dims = framework::make_ddim(shape);
  tensor->Resize(dims);
  platform::Place place;
  if (in_cuda) {
#ifdef PADDLE_WITH_CUDA
    place = platform::CUDAPlace(0);
#else
    LOG(FATAL) << "You must define PADDLE_WITH_CUDA for using CUDAPlace.";
#endif
  } else {
    place = platform::CPUPlace();
  }
  RandomizeTensor(tensor, place);
}

}  // namespace lite
}  // namespace inference
}  // namespace paddle
paddle/fluid/platform/gpu_info.cc
...
...
@@ -349,6 +349,16 @@ void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
      error_code, CudaErrorWebsite());
}

void GpuStreamSync(cudaStream_t stream) {
  auto error_code = cudaStreamSynchronize(stream);
  PADDLE_ENFORCE_CUDA_SUCCESS(
      error_code,
      platform::errors::External(
          "cudaStreamSynchronize failed in paddle::platform::GpuStreamSync "
          "error code : %d, %s",
          error_code, CudaErrorWebsite()));
}

void RaiseNonOutOfMemoryError(cudaError_t *status) {
  if (*status == cudaErrorMemoryAllocation) {
    *status = cudaSuccess;
...
...
@@ -363,5 +373,6 @@ void RaiseNonOutOfMemoryError(cudaError_t *status) {
  PADDLE_ENFORCE_CUDA_SUCCESS(*status);
}

}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/gpu_info.h
...
...
@@ -101,6 +101,9 @@ void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
//! Set memory dst with value count size asynchronously
void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);

//! Blocks until stream has completed all operations.
void GpuStreamSync(cudaStream_t stream);

//! Raise error if status is not cudaSuccess or OOM, otherwise reset status.
void RaiseNonOutOfMemoryError(cudaError_t *status);
...
...
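A brief hedged sketch of how the newly declared GpuStreamSync is used elsewhere in this change (mirroring lite_engine_op.h); SyncAfterCopy is a hypothetical wrapper and the snippet assumes a CUDA build:

#ifdef PADDLE_WITH_CUDA
// Sketch only: block until the device stream used for the async tensor copies has drained.
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/gpu_info.h"

void SyncAfterCopy(const paddle::platform::Place& place) {
  auto* ctx = paddle::platform::DeviceContextPool::Instance().Get(place);
  if (paddle::platform::is_gpu_place(place)) {
    paddle::platform::GpuStreamSync(
        static_cast<const paddle::platform::CUDADeviceContext*>(ctx)->stream());
  }
}
#endif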
paddle/scripts/paddle_build.sh
...
...
@@ -203,6 +203,7 @@ function cmake_base() {
        -DPY_VERSION=${PY_VERSION:-2.7}
        -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
        -DWITH_GRPC=${grpc_flag}
        -DWITH_LITE=${WITH_LITE:-OFF}
    ========================================
EOF
    # Disable UNITTEST_USE_VIRTUALENV in docker because
...
...
@@ -234,7 +235,8 @@ EOF
        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
        -DPY_VERSION=${PY_VERSION:-2.7} \
        -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \
        -DWITH_GRPC=${grpc_flag}
        -DWITH_GRPC=${grpc_flag} \
        -DWITH_LITE=${WITH_LITE:-OFF}
}
...
...