Unverified commit 76e74ef1, authored by sangoly, committed by GitHub

add python api (#2225)

* [python api] init add python api test=develop
Parent 61a89ef2
......@@ -47,29 +47,14 @@ include(simd)
################################ Exposed Configurations #######################################
lite_option(WITH_DSO "Compile PaddlePaddle with dynamically linked CUDA" ON)
lite_option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ON IF ${AVX_FOUND})
lite_option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
lite_option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
lite_option(WITH_MKL "Compile PaddlePaddle with MKL support." ON IF ${AVX_FOUND})
lite_option(WITH_ARM_DOTPROD "Compile PaddlePaddle with ARM dot product" ON)
lite_option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed later.
if(ANDROID OR IOS OR ARMLINUX)
set(WITH_GPU OFF CACHE STRING
"Disable GPU when cross-compiling for Android and iOS" FORCE)
set(WITH_DSO OFF CACHE STRING
"Disable DSO when cross-compiling for Android and iOS" FORCE)
set(WITH_AVX OFF CACHE STRING
"Disable AVX when cross-compiling for Android and iOS" FORCE)
set(WITH_PYTHON OFF CACHE STRING
"Disable PYTHON when cross-compiling for Android and iOS" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL when cross-compiling for Android and iOS" FORCE)
endif()
# for lite, both server and mobile framework.
lite_option(LITE_WITH_JAVA "Enable Java JNI lib in lite mode" OFF)
lite_option(LITE_WITH_PYTHON "Enable Python api lib in lite mode" OFF)
lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
......@@ -86,6 +71,22 @@ lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
# publish options
lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed later.
if(ANDROID OR IOS OR ARMLINUX)
set(WITH_GPU OFF CACHE STRING
"Disable GPU when cross-compiling for Android and iOS" FORCE)
set(WITH_DSO OFF CACHE STRING
"Disable DSO when cross-compiling for Android and iOS" FORCE)
set(WITH_AVX OFF CACHE STRING
"Disable AVX when cross-compiling for Android and iOS" FORCE)
set(LITE_WITH_PYTHON OFF CACHE STRING
"Disable PYTHON when cross-compiling for Android and iOS" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL when cross-compiling for Android and iOS" FORCE)
endif()
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
"A path setting third party libraries download & build directories.")
......@@ -110,6 +111,12 @@ include_directories("${PADDLE_SOURCE_DIR}")
set(LITE_GENERATED_INCLUDE_DIR "${CMAKE_BINARY_DIR}")
include_directories("${LITE_GENERATED_INCLUDE_DIR}")
if (LITE_WITH_PYTHON)
include(external/python) # download, build, install python
include(external/pybind11) # download, build, install pybind11
endif()
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
message(STATUS "Building the mobile framework")
......
......@@ -5,7 +5,7 @@ endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs9 "30 35 50 52 60 61 62 70")
set(paddle_known_gpu_archs10 "30 35 50 52 60 61 62 70 75")
######################################################################################
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_PYTHON)
return()
endif()
include(ExternalProject)
set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
ExternalProject_Add(
extern_pybind
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/pybind/pybind11.git"
GIT_TAG "v2.2.4"
PREFIX ${PYBIND_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
add_library(pybind STATIC ${dummyfile})
else()
add_library(pybind INTERFACE)
endif()
add_dependencies(pybind extern_pybind)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT LITE_WITH_PYTHON)
return()
ENDIF()
INCLUDE(python_module)
FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED)
FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED)
if(WIN32)
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"from distutils import sysconfig as s;import sys;import struct;
print(sys.prefix);
print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'));
"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE _PYTHON_VALUES
ERROR_VARIABLE _PYTHON_ERROR_VALUE)
if(NOT _PYTHON_SUCCESS MATCHES 0)
set(PYTHONLIBS_FOUND FALSE)
return()
endif()
# Convert the process output into a list
string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES})
string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES})
list(GET _PYTHON_VALUES 0 PYTHON_PREFIX)
list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX)
# Make sure all directory separators are '/'
string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX})
set(PYTHON_LIBRARY
"${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
# when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the
# original python installation. They may be found relative to PYTHON_INCLUDE_DIR.
if(NOT EXISTS "${PYTHON_LIBRARY}")
get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY)
set(PYTHON_LIBRARY
"${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib")
endif()
# raise an error if the python libs are still not found.
if(NOT EXISTS "${PYTHON_LIBRARY}")
message(FATAL_ERROR "Python libraries not found")
endif()
SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
endif(WIN32)
# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
SET(py_env "")
IF(PYTHONINTERP_FOUND)
find_python_module(pip REQUIRED)
find_python_module(numpy REQUIRED)
#find_python_module(wheel REQUIRED)
#find_python_module(google.protobuf REQUIRED)
FIND_PACKAGE(NumPy REQUIRED)
#IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
# MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
# "please use pip to upgrade protobuf. pip install -U protobuf")
#ENDIF()
ENDIF(PYTHONINTERP_FOUND)
INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
# Find if a Python module is installed
# Found at http://www.cmake.org/pipermail/cmake/2011-January/041666.html
# To use do: find_python_module(PyQt4 REQUIRED)
function(find_python_module module)
string(TOUPPER ${module} module_upper)
if(NOT PY_${module_upper})
if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED")
set(${module}_FIND_REQUIRED TRUE)
else()
set(${module}_FIND_REQUIRED FALSE)
endif()
# A module's location is usually a directory, but for binary modules
# it's a .so file.
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"import re, ${module}; print(re.compile('/__init__.py.*').sub('',${module}.__file__))"
RESULT_VARIABLE _${module}_status
OUTPUT_VARIABLE _${module}_location
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT _${module}_status)
set(PY_${module_upper} ${_${module}_location} CACHE STRING
"Location of Python module ${module}")
endif(NOT _${module}_status)
endif(NOT PY_${module_upper})
find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper})
if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED)
message(FATAL_ERROR "python module ${module} is not found")
endif()
execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
"import sys, ${module}; sys.stdout.write(${module}.__version__)"
OUTPUT_VARIABLE _${module}_version
RESULT_VARIABLE _${module}_status
ERROR_QUIET
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT _${module}_status)
set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING
"Version of Python module ${module}")
endif(NOT _${module}_status)
set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE)
set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE)
endfunction(find_python_module)
......@@ -222,6 +222,10 @@ if (LITE_WITH_JAVA AND LITE_WITH_ARM)
add_subdirectory(android)
endif()
if (LITE_WITH_PYTHON)
add_subdirectory(python)
endif()
if (LITE_ON_TINY_PUBLISH)
return()
endif()
......
......@@ -53,13 +53,9 @@ lite::Tensor *Predictor::GetInput(size_t offset) {
}
// get input names
const std::vector<std::string> &Predictor::GetInputNames() {
return input_names_;
}
std::vector<std::string> Predictor::GetInputNames() { return input_names_; }
// get output names
const std::vector<std::string> &Predictor::GetOutputNames() {
return output_names_;
}
std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
// append the names of inputs and outputs into input_names_ and output_names_
void Predictor::PrepareFeedFetch() {
auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
......
......@@ -74,8 +74,8 @@ class LITE_API Predictor {
// get input by name.
lite::Tensor* GetInputByName(const std::string& name);
// get input names and output names.
const std::vector<std::string>& GetInputNames();
const std::vector<std::string>& GetOutputNames();
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
void PrepareFeedFetch();
// Get offset-th col of fetch results.
......@@ -111,6 +111,40 @@ class LITE_API Predictor {
std::vector<std::string> output_names_;
};
class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
public:
CxxPaddleApiImpl() {}
/// Create a new predictor from a config.
void Init(const lite_api::CxxConfig& config);
std::unique_ptr<lite_api::Tensor> GetInput(int i) override;
std::unique_ptr<const lite_api::Tensor> GetOutput(int i) const override;
void Run() override;
std::string GetVersion() const override;
// get input names and output names
std::vector<std::string> GetInputNames() override;
std::vector<std::string> GetOutputNames() override;
std::unique_ptr<const lite_api::Tensor> GetTensor(
const std::string& name) const override;
// Get input tensor by name
std::unique_ptr<lite_api::Tensor> GetInputByName(
const std::string& name) override;
void SaveOptimizedModel(const std::string& model_dir,
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kProtobuf) override;
private:
Predictor raw_predictor_;
};
/*
* An executor for training.
*
......
......@@ -21,42 +21,6 @@
namespace paddle {
namespace lite {
class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
public:
CxxPaddleApiImpl();
/// Create a new predictor from a config.
void Init(const lite_api::CxxConfig &config);
std::unique_ptr<lite_api::Tensor> GetInput(int i) override;
std::unique_ptr<const lite_api::Tensor> GetOutput(int i) const override;
void Run() override;
std::string GetVersion() const override;
// get input names and output names
const std::vector<std::string> &GetInputNames() override;
const std::vector<std::string> &GetOutputNames() override;
std::unique_ptr<const lite_api::Tensor> GetTensor(
const std::string &name) const override;
// Get input tensor by name
std::unique_ptr<lite_api::Tensor> GetInputByName(
const std::string &name) override;
void SaveOptimizedModel(const std::string &model_dir,
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kProtobuf) override;
private:
Predictor raw_predictor_;
};
CxxPaddleApiImpl::CxxPaddleApiImpl() {}
void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
#ifdef LITE_WITH_CUDA
Env<TARGET(kCUDA)>::Init();
......@@ -76,11 +40,11 @@ std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetOutput(
return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
}
const std::vector<std::string> &CxxPaddleApiImpl::GetInputNames() {
std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
return raw_predictor_.GetInputNames();
}
const std::vector<std::string> &CxxPaddleApiImpl::GetOutputNames() {
std::vector<std::string> CxxPaddleApiImpl::GetOutputNames() {
return raw_predictor_.GetOutputNames();
}
......
......@@ -81,11 +81,11 @@ const Tensor* LightPredictor::GetOutput(size_t offset) {
return out_var->GetMutable<lite::Tensor>();
}
// get input names
const std::vector<std::string>& LightPredictor::GetInputNames() {
std::vector<std::string> LightPredictor::GetInputNames() {
return input_names_;
}
// get output names
const std::vector<std::string>& LightPredictor::GetOutputNames() {
std::vector<std::string> LightPredictor::GetOutputNames() {
return output_names_;
}
// append the names of inputs and outputs into input_names_ and output_names_
......
......@@ -64,8 +64,8 @@ class LITE_API LightPredictor {
}
// get input names and output names.
const std::vector<std::string>& GetInputNames();
const std::vector<std::string>& GetOutputNames();
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
void PrepareFeedFetch();
private:
......@@ -86,5 +86,31 @@ class LITE_API LightPredictor {
std::vector<std::string> output_names_;
};
class LightPredictorImpl : public lite_api::PaddlePredictor {
public:
LightPredictorImpl() = default;
std::unique_ptr<lite_api::Tensor> GetInput(int i) override;
std::unique_ptr<const lite_api::Tensor> GetOutput(int i) const override;
void Run() override;
std::string GetVersion() const override;
std::vector<std::string> GetInputNames() override;
std::vector<std::string> GetOutputNames() override;
std::unique_ptr<const lite_api::Tensor> GetTensor(
const std::string& name) const override;
// Get input tensor by name
std::unique_ptr<lite_api::Tensor> GetInputByName(
const std::string& name) override;
void Init(const lite_api::MobileConfig& config);
private:
std::unique_ptr<lite::LightPredictor> raw_predictor_;
};
} // namespace lite
} // namespace paddle
......@@ -19,77 +19,60 @@
#include "lite/model_parser/model_parser.h"
namespace paddle {
namespace lite_api {
class LightPredictorImpl : public PaddlePredictor {
public:
LightPredictorImpl() = default;
std::unique_ptr<Tensor> GetInput(int i) override;
std::unique_ptr<const Tensor> GetOutput(int i) const override;
void Run() override;
std::string GetVersion() const override;
const std::vector<std::string>& GetInputNames() override;
const std::vector<std::string>& GetOutputNames() override;
namespace lite {
std::unique_ptr<const Tensor> GetTensor(
const std::string& name) const override;
// Get input tensor by name
std::unique_ptr<Tensor> GetInputByName(const std::string& name) override;
void Init(const MobileConfig& config);
private:
std::unique_ptr<lite::LightPredictor> raw_predictor_;
};
void LightPredictorImpl::Init(const MobileConfig& config) {
void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
// LightPredictor only supports the NaiveBuffer backend in the publish lib
raw_predictor_.reset(new lite::LightPredictor(config.model_dir(),
config.model_buffer(),
config.param_buffer(),
config.model_from_memory(),
LiteModelType::kNaiveBuffer));
raw_predictor_.reset(
new LightPredictor(config.model_dir(),
config.model_buffer(),
config.param_buffer(),
config.model_from_memory(),
lite_api::LiteModelType::kNaiveBuffer));
}
std::unique_ptr<Tensor> LightPredictorImpl::GetInput(int i) {
return std::unique_ptr<Tensor>(new Tensor(raw_predictor_->GetInput(i)));
std::unique_ptr<lite_api::Tensor> LightPredictorImpl::GetInput(int i) {
return std::unique_ptr<lite_api::Tensor>(
new lite_api::Tensor(raw_predictor_->GetInput(i)));
}
std::unique_ptr<const Tensor> LightPredictorImpl::GetOutput(int i) const {
return std::unique_ptr<Tensor>(new Tensor(raw_predictor_->GetOutput(i)));
std::unique_ptr<const lite_api::Tensor> LightPredictorImpl::GetOutput(
int i) const {
return std::unique_ptr<lite_api::Tensor>(
new lite_api::Tensor(raw_predictor_->GetOutput(i)));
}
void LightPredictorImpl::Run() { raw_predictor_->Run(); }
std::string LightPredictorImpl::GetVersion() const { return lite::version(); }
std::unique_ptr<const Tensor> LightPredictorImpl::GetTensor(
std::unique_ptr<const lite_api::Tensor> LightPredictorImpl::GetTensor(
const std::string& name) const {
return std::unique_ptr<const Tensor>(
new Tensor(raw_predictor_->GetTensor(name)));
return std::unique_ptr<const lite_api::Tensor>(
new lite_api::Tensor(raw_predictor_->GetTensor(name)));
}
std::unique_ptr<Tensor> LightPredictorImpl::GetInputByName(
std::unique_ptr<lite_api::Tensor> LightPredictorImpl::GetInputByName(
const std::string& name) {
return std::unique_ptr<Tensor>(
new Tensor(raw_predictor_->GetInputByName(name)));
return std::unique_ptr<lite_api::Tensor>(
new lite_api::Tensor(raw_predictor_->GetInputByName(name)));
}
const std::vector<std::string>& LightPredictorImpl::GetInputNames() {
std::vector<std::string> LightPredictorImpl::GetInputNames() {
return raw_predictor_->GetInputNames();
}
const std::vector<std::string>& LightPredictorImpl::GetOutputNames() {
std::vector<std::string> LightPredictorImpl::GetOutputNames() {
return raw_predictor_->GetOutputNames();
}
} // namespace lite
namespace lite_api {
template <>
std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(
const MobileConfig& config) {
auto x = std::make_shared<LightPredictorImpl>();
auto x = std::make_shared<lite::LightPredictorImpl>();
x->Init(config);
return x;
}
......
......@@ -37,13 +37,13 @@ TEST(LightAPI, load) {
}
predictor.PrepareFeedFetch();
const std::vector<std::string>& inputs = predictor.GetInputNames();
const std::vector<std::string> inputs = predictor.GetInputNames();
LOG(INFO) << "input size: " << inputs.size();
for (int i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputnames: " << inputs[i];
}
const std::vector<std::string>& outputs = predictor.GetOutputNames();
const std::vector<std::string> outputs = predictor.GetOutputNames();
for (int i = 0; i < outputs.size(); i++) {
LOG(INFO) << "outputnames: " << outputs[i];
}
......
......@@ -14,8 +14,13 @@
#include "lite/api/paddle_api.h"
#include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
#ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/target_wrapper.h"
#endif
namespace paddle {
namespace lite_api {
......@@ -42,6 +47,11 @@ const int8_t *Tensor::data() const {
return ctensor(raw_tensor_)->data<int8_t>();
}
template <>
const int32_t *Tensor::data() const {
return ctensor(raw_tensor_)->data<int32_t>();
}
template <>
int *Tensor::mutable_data(TargetType type) const {
return tensor(raw_tensor_)->mutable_data<int>(type);
......@@ -55,10 +65,81 @@ int8_t *Tensor::mutable_data(TargetType type) const {
return tensor(raw_tensor_)->mutable_data<int8_t>(type);
}
template <typename T, TargetType type>
void Tensor::CopyFromCpu(const T *src_data) {
T *data = tensor(raw_tensor_)->mutable_data<T>(type);
int64_t num = tensor(raw_tensor_)->numel();
CHECK(num > 0) << "You should call the Resize interface first";
if (type == TargetType::kHost || type == TargetType::kARM) {
lite::TargetWrapperHost::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::HtoH);
} else if (type == TargetType::kCUDA) {
#ifdef LITE_WITH_CUDA
lite::TargetWrapperCuda::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
#else
LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else {
LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA";
}
}
template <typename T>
void Tensor::CopyToCpu(T *data) {
const T *src_data = tensor(raw_tensor_)->data<T>();
int64_t num = tensor(raw_tensor_)->numel();
CHECK(num > 0) << "You should call the Resize interface first";
auto type = tensor(raw_tensor_)->target();
if (type == TargetType::kHost || type == TargetType::kARM) {
lite::TargetWrapperHost::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::HtoH);
} else if (type == TargetType::kCUDA) {
#ifdef LITE_WITH_CUDA
lite::TargetWrapperCuda::MemcpySync(
data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
#else
LOG(FATAL) << "Please compile the lib with CUDA.";
#endif
} else {
LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA";
}
}
template void Tensor::CopyFromCpu<int, TargetType::kHost>(const int *);
template void Tensor::CopyFromCpu<float, TargetType::kHost>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kHost>(const int8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kARM>(const int *);
template void Tensor::CopyFromCpu<float, TargetType::kARM>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kARM>(const int8_t *);
template void Tensor::CopyFromCpu<int, TargetType::kCUDA>(const int *);
template void Tensor::CopyFromCpu<float, TargetType::kCUDA>(const float *);
template void Tensor::CopyFromCpu<int8_t, TargetType::kCUDA>(const int8_t *);
template void Tensor::CopyToCpu(int8_t *);
template void Tensor::CopyToCpu(float *);
template void Tensor::CopyToCpu(int *);
shape_t Tensor::shape() const {
return ctensor(raw_tensor_)->dims().Vectorize();
}
TargetType Tensor::target() const {
auto type = ctensor(raw_tensor_)->target();
if (type == TargetType::kUnk) {
CHECK(false) << "This tensor was not initialized.";
}
return type;
}
PrecisionType Tensor::precision() const {
auto precision = ctensor(raw_tensor_)->precision();
if (precision == PrecisionType::kUnk) {
CHECK(false) << "This tensor was not initialized.";
}
return precision;
}
lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); }
void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); }
......
......@@ -45,8 +45,15 @@ struct LITE_API Tensor {
template <typename T>
T* mutable_data(TargetType type = TargetType::kHost) const;
template <typename T, TargetType type = TargetType::kHost>
void CopyFromCpu(const T* data);
template <typename T>
void CopyToCpu(T* data);
/// Shape of the tensor.
shape_t shape() const;
TargetType target() const;
PrecisionType precision() const;
// LoD of the tensor
lod_t lod() const;
......@@ -75,9 +82,9 @@ class LITE_API PaddlePredictor {
virtual std::string GetVersion() const = 0;
// Get input names
virtual const std::vector<std::string>& GetInputNames() = 0;
virtual std::vector<std::string> GetInputNames() = 0;
// Get output names
virtual const std::vector<std::string>& GetOutputNames() = 0;
virtual std::vector<std::string> GetOutputNames() = 0;
// Get Input by name
virtual std::unique_ptr<Tensor> GetInputByName(const std::string& name) = 0;
......
......@@ -37,12 +37,12 @@ TEST(CxxApi, run) {
LOG(INFO) << "Version: " << predictor->GetVersion();
auto& inputs = predictor->GetInputNames();
auto inputs = predictor->GetInputNames();
LOG(INFO) << "input size: " << inputs.size();
for (int i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputnames: " << inputs[i];
}
auto& outputs = predictor->GetOutputNames();
auto outputs = predictor->GetOutputNames();
for (int i = 0; i < outputs.size(); i++) {
LOG(INFO) << "outputnames: " << outputs[i];
}
......@@ -76,12 +76,12 @@ TEST(LightApi, run) {
auto predictor = lite_api::CreatePaddlePredictor(config);
auto& inputs = predictor->GetInputNames();
auto inputs = predictor->GetInputNames();
LOG(INFO) << "input size: " << inputs.size();
for (int i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputnames: " << inputs.at(i);
}
auto& outputs = predictor->GetOutputNames();
auto outputs = predictor->GetOutputNames();
for (int i = 0; i < outputs.size(); i++) {
LOG(INFO) << "outputnames: " << outputs.at(i);
}
......
if (NOT LITE_WITH_PYTHON)
return()
endif()
add_subdirectory(pybind)
#add_subdirectory(interface)
set(PYBIND_DEPS pybind python paddle_api_light paddle_api)
if (NOT LITE_ON_TINY_PUBLISH)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full)
endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/api/python/pybind/pybind.h"
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <cstring>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_passes.h"
#endif
#include "lite/api/light_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/tensor.h"
namespace py = pybind11;
namespace paddle {
namespace lite {
namespace pybind {
using lite_api::Tensor;
using lite_api::CxxConfig;
using lite_api::MobileConfig;
using lite_api::PowerMode;
using lite_api::TargetType;
using lite_api::PrecisionType;
using lite_api::DataLayoutType;
using lite_api::Place;
using lite::LightPredictorImpl;
#ifndef LITE_ON_TINY_PUBLISH
using lite::CxxPaddleApiImpl;
static void BindLiteCxxPredictor(py::module *m);
#endif
static void BindLiteLightPredictor(py::module *m);
static void BindLiteCxxConfig(py::module *m);
static void BindLiteMobileConfig(py::module *m);
static void BindLitePowerMode(py::module *m);
static void BindLitePlace(py::module *m);
static void BindLiteTensor(py::module *m);
void BindLiteApi(py::module *m) {
BindLiteCxxConfig(m);
BindLiteMobileConfig(m);
BindLitePowerMode(m);
BindLitePlace(m);
BindLiteTensor(m);
#ifndef LITE_ON_TINY_PUBLISH
BindLiteCxxPredictor(m);
#endif
BindLiteLightPredictor(m);
// Global helper methods
m->def("create_paddle_predictor",
[](const CxxConfig &config) -> std::unique_ptr<CxxPaddleApiImpl> {
auto x = std::unique_ptr<CxxPaddleApiImpl>(new CxxPaddleApiImpl());
x->Init(config);
return std::move(x);
});
m->def("create_paddle_predictor",
[](const MobileConfig &config) -> std::unique_ptr<LightPredictorImpl> {
auto x =
std::unique_ptr<LightPredictorImpl>(new LightPredictorImpl());
x->Init(config);
return std::move(x);
});
}
void BindLiteCxxConfig(py::module *m) {
py::class_<CxxConfig> cxx_config(*m, "CxxConfig");
cxx_config.def(py::init<>())
.def("set_model_dir", &CxxConfig::set_model_dir)
.def("model_dir", &CxxConfig::model_dir)
.def("set_model_file", &CxxConfig::set_model_file)
.def("model_file", &CxxConfig::model_file)
.def("set_param_file", &CxxConfig::set_param_file)
.def("param_file", &CxxConfig::param_file)
.def("set_valid_places", &CxxConfig::set_valid_places)
.def("set_model_buffer", &CxxConfig::set_model_buffer)
.def("model_from_memory", &CxxConfig::model_from_memory);
#ifdef LITE_WITH_ARM
cxx_config.def("set_threads", &CxxConfig::set_threads)
.def("threads", &CxxConfig::threads)
.def("set_power_mode", &CxxConfig::set_power_mode)
.def("power_mode", &CxxConfig::power_mode);
#endif
}
// TODO(sangoly): Should MobileConfig be renamed to LightConfig ??
void BindLiteMobileConfig(py::module *m) {
py::class_<MobileConfig> mobile_config(*m, "MobileConfig");
mobile_config.def(py::init<>())
.def("set_model_dir", &MobileConfig::set_model_dir)
.def("model_dir", &MobileConfig::model_dir)
.def("set_model_buffer", &MobileConfig::set_model_buffer)
.def("model_from_memory", &MobileConfig::model_from_memory);
#ifdef LITE_WITH_ARM
mobile_config.def("set_threads", &MobileConfig::set_threads)
.def("threads", &MobileConfig::threads)
.def("set_power_mode", &MobileConfig::set_power_mode)
.def("power_mode", &MobileConfig::power_mode);
#endif
}
void BindLitePowerMode(py::module *m) {
py::enum_<PowerMode>(*m, "PowerMode")
.value("LITE_POWER_HIGH", PowerMode::LITE_POWER_HIGH)
.value("LITE_POWER_LOW", PowerMode::LITE_POWER_LOW)
.value("LITE_POWER_FULL", PowerMode::LITE_POWER_FULL)
.value("LITE_POWER_NO_BIND", PowerMode::LITE_POWER_NO_BIND)
.value("LITE_POWER_RAND_HIGH", PowerMode::LITE_POWER_RAND_HIGH)
.value("LITE_POWER_RAND_LOW", PowerMode::LITE_POWER_RAND_LOW);
}
void BindLitePlace(py::module *m) {
// TargetType
py::enum_<TargetType>(*m, "TargetType")
.value("Host", TargetType::kHost)
.value("X86", TargetType::kX86)
.value("CUDA", TargetType::kCUDA)
.value("ARM", TargetType::kARM)
.value("OpenCL", TargetType::kOpenCL)
.value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU)
.value("Any", TargetType::kAny);
// PrecisionType
py::enum_<PrecisionType>(*m, "PrecisionType")
.value("FP16", PrecisionType::kFP16)
.value("FP32", PrecisionType::kFloat)
.value("INT8", PrecisionType::kInt8)
.value("INT16", PrecisionType::kInt16)
.value("INT32", PrecisionType::kInt32)
.value("INT64", PrecisionType::kInt64)
.value("BOOL", PrecisionType::kBool)
.value("Any", PrecisionType::kAny);
// DataLayoutType
py::enum_<DataLayoutType>(*m, "DataLayoutType")
.value("NCHW", DataLayoutType::kNCHW)
.value("NHWC", DataLayoutType::kNHWC)
.value("Any", DataLayoutType::kAny);
// Place
py::class_<Place>(*m, "Place")
.def(py::init<TargetType, PrecisionType, DataLayoutType, int16_t>(),
py::arg("target"),
py::arg("percision") = PrecisionType::kFloat,
py::arg("layout") = DataLayoutType::kNCHW,
py::arg("device") = 0)
.def("is_valid", &Place::is_valid);
}
void BindLiteTensor(py::module *m) {
auto data_size_func = [](const std::vector<int64_t> &shape) -> int64_t {
int64_t res = 1;
for (size_t i = 0; i < shape.size(); i++) {
res *= shape[i];
}
return res;
};
py::class_<Tensor> tensor(*m, "Tensor");
tensor.def("resize", &Tensor::Resize)
.def("shape", &Tensor::shape)
.def("target", &Tensor::target)
.def("precision", &Tensor::precision)
.def("lod", &Tensor::lod)
.def("set_lod", &Tensor::SetLoD);
#define DO_GETTER_ONCE(data_type__, name__) \
tensor.def(#name__, [=](Tensor &self) -> std::vector<data_type__> { \
std::vector<data_type__> data; \
auto shape = self.shape(); \
int64_t num = data_size_func(shape); \
data.resize(num); \
self.CopyToCpu<data_type__>(data.data()); \
return data; \
});
#define DO_SETTER_ONCE(data_type__, name__) \
tensor.def( \
#name__, \
[](Tensor &self, \
const std::vector<data_type__> &data, \
TargetType type = TargetType::kHost) { \
if (type == TargetType::kHost || type == TargetType::kARM) { \
self.CopyFromCpu<data_type__, TargetType::kHost>(data.data()); \
} else if (type == TargetType::kCUDA) { \
self.CopyFromCpu<data_type__, TargetType::kCUDA>(data.data()); \
} \
}, \
py::arg("data"), \
py::arg("type") = TargetType::kHost);
#define DATA_GETTER_SETTER_ONCE(data_type__, name__) \
DO_SETTER_ONCE(data_type__, set_##name__##_data) \
DO_GETTER_ONCE(data_type__, name__##_data)
DATA_GETTER_SETTER_ONCE(int8_t, int8);
DATA_GETTER_SETTER_ONCE(int32_t, int32);
DATA_GETTER_SETTER_ONCE(float, float);
#undef DO_GETTER_ONCE
#undef DO_SETTER_ONCE
#undef DATA_GETTER_SETTER_ONCE
}
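The DATA_GETTER_SETTER_ONCE instantiations above generate typed accessors named set_int8_data/int8_data, set_int32_data/int32_data and set_float_data/float_data, backed by the CopyFromCpu/CopyToCpu paths added earlier in this commit. A minimal hedged sketch of driving the bound Tensor from Python, assuming the compiled module is importable as lite_core and that `predictor` was already created through the create_paddle_predictor helpers bound earlier in BindLiteApi (both are illustrative assumptions, not part of this diff):
# Hedged sketch: the `lite_core` import path and `predictor` setup are assumptions.
inp = predictor.get_input(0)                      # lite_core Tensor bound above
inp.resize([1, 3, 224, 224])                      # resize before copying data in
inp.set_float_data([0.0] * (1 * 3 * 224 * 224))   # backed by CopyFromCpu<float, kHost>
predictor.run()
out = predictor.get_output(0)
print(out.shape(), out.precision())               # e.g. [1, 1000], PrecisionType.FP32
scores = out.float_data()                         # backed by CopyToCpu<float>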
#ifndef LITE_ON_TINY_PUBLISH
void BindLiteCxxPredictor(py::module *m) {
py::class_<CxxPaddleApiImpl>(*m, "CxxPredictor")
.def(py::init<>())
.def("get_input", &CxxPaddleApiImpl::GetInput)
.def("get_output", &CxxPaddleApiImpl::GetOutput)
.def("run", &CxxPaddleApiImpl::Run)
.def("get_version", &CxxPaddleApiImpl::GetVersion)
.def("save_optimized_model",
[](CxxPaddleApiImpl &self, const std::string &output_dir) {
self.SaveOptimizedModel(output_dir,
lite_api::LiteModelType::kNaiveBuffer);
});
}
#endif
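For context, a hedged end-to-end sketch of the CxxPredictor path exposed above; the lite_core module name comes from the PYBIND11_MODULE declaration later in this diff, but the import path and the ./mobilenet_v1 model directory are assumptions for illustration only:
import lite_core

# Full (protobuf-model) predictor; not available under LITE_ON_TINY_PUBLISH.
config = lite_core.CxxConfig()
config.set_model_dir("./mobilenet_v1")            # hypothetical model location
config.set_valid_places([lite_core.Place(lite_core.TargetType.X86,
                                         lite_core.PrecisionType.FP32)])
predictor = lite_core.create_paddle_predictor(config)   # returns a CxxPredictor
print(predictor.get_version())
# Persist the optimized model as NaiveBuffer for the light runtime.
predictor.save_optimized_model("./mobilenet_v1_opt")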
void BindLiteLightPredictor(py::module *m) {
py::class_<LightPredictorImpl>(*m, "LightPredictor")
.def(py::init<>())
.def("get_input", &LightPredictorImpl::GetInput)
.def("get_output", &LightPredictorImpl::GetOutput)
.def("run", &LightPredictorImpl::Run)
.def("get_version", &LightPredictorImpl::GetVersion)
.def("save_optimized_model",
[](LightPredictorImpl &self, const std::string &output_dir) {
self.SaveOptimizedModel(output_dir,
lite_api::LiteModelType::kNaiveBuffer);
});
}
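And the corresponding hedged sketch for the LightPredictor binding, which only loads NaiveBuffer models such as the one saved in the previous snippet (again, the lite_core import path and model directory are assumptions):
import lite_core

config = lite_core.MobileConfig()
config.set_model_dir("./mobilenet_v1_opt")        # NaiveBuffer model directory
predictor = lite_core.create_paddle_predictor(config)   # returns a LightPredictor
inp = predictor.get_input(0)
inp.resize([1, 3, 224, 224])
inp.set_float_data([1.0] * (1 * 3 * 224 * 224))
predictor.run()
print(predictor.get_output(0).float_data()[:10])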
} // namespace pybind
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Python.h>
#include <pybind11/pybind11.h>
namespace paddle {
namespace lite {
namespace pybind {
void BindLiteApi(pybind11::module *m);
PYBIND11_MODULE(lite_core, m) {
m.doc() = "C++ core of Paddle-Lite";
BindLiteApi(&m);
}
} // namespace pybind
} // namespace lite
} // namespace paddle
if(NOT LITE_WITH_CUDA)
return()
endif()
set(cuda_static_deps cudnn_static cublas_static curand_static
culibos_static cudart_static)
nv_library(target_wrapper_cuda SRCS target_wrapper.cc)
nv_library(cuda_blas SRCS blas.cc)
nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_static_deps})
nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_static_deps})
add_subdirectory(math)
......@@ -37,5 +37,4 @@ nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda)
#nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda)
nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda)
nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda )
nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda )
nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda)