add python inference api (#15248)

add python inference api

add python inference api (#15248)
add python inference api
d60751fb · flame · GitHub · 59ab98c9 · d60751fb · d60751fb
9 changed files
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -45,6 +45,7 @@ paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], vararg
 paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.CompiledProgram.__init__ ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.CompiledProgram.with_data_parallel ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from'], varargs=None, keywords=None, defaults=(None, None, None, None))
+paddle.fluid.CompiledProgram.with_inference_optimize ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.ExecutionStrategy) -> None
 paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None
 paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None

--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -45,6 +45,7 @@ using contrib::AnalysisConfig;
 class AnalysisPredictor : public PaddlePredictor {
 public:
  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}
+  ~AnalysisPredictor();
  bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
            const std::shared_ptr<framework::ProgramDesc> &program = nullptr);
@@ -95,7 +96,6 @@ class AnalysisPredictor : public PaddlePredictor {
  template <typename T>
  void GetFetchOne(const framework::LoDTensor &fetchs,
                   PaddleTensor *output_data);
-  ~AnalysisPredictor();
 // Some more detailed tests, they are made the friends of the predictor, so that
 // the all the details can be tested.

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
 set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
  feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
-  tracer)
+  tracer analysis_predictor)
 if(WITH_PYTHON)
  list(APPEND PYBIND_DEPS py_func_op)
 endif()
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc)
 if(WITH_PYTHON)
  if(WITH_AMD_GPU)

--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/pybind/inference_api.h"
+#include <pybind11/stl.h>
+#include <cstring>
+#include <iostream>
+#include <string>
+#include <vector>
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+namespace py = pybind11;
+namespace paddle {
+namespace pybind {
+using paddle::PaddleDType;
+using paddle::PaddleBuf;
+using paddle::PaddleTensor;
+using paddle::PaddlePlace;
+using paddle::PaddlePredictor;
+using paddle::NativeConfig;
+using paddle::NativePaddlePredictor;
+using paddle::AnalysisPredictor;
+using paddle::contrib::AnalysisConfig;
+static void BindPaddleDType(py::module *m);
+static void BindPaddleBuf(py::module *m);
+static void BindPaddleTensor(py::module *m);
+static void BindPaddlePlace(py::module *m);
+static void BindPaddlePredictor(py::module *m);
+static void BindNativeConfig(py::module *m);
+static void BindNativePredictor(py::module *m);
+static void BindAnalysisConfig(py::module *m);
+static void BindAnalysisPredictor(py::module *m);
+// Entry point of the inference bindings: runs every per-type binder on `m`,
+// then registers the module-level functions `create_paddle_predictor` (one
+// overload for AnalysisConfig, one for NativeConfig) and `paddle_dtype_size`.
+void BindInferenceApi(py::module *m) {
+  using Binder = void (*)(py::module *);
+  // Invoked front to back; the sequence is the same as the declaration order
+  // of the binders at the top of this file.
+  const Binder binders[] = {
+      BindPaddleDType,     BindPaddleBuf,      BindPaddleTensor,
+      BindPaddlePlace,     BindPaddlePredictor, BindNativeConfig,
+      BindNativePredictor, BindAnalysisConfig, BindAnalysisPredictor,
+  };
+  for (Binder bind : binders) {
+    bind(m);
+  }
+
+  py::module &module = *m;
+  module.def("create_paddle_predictor",
+             &paddle::CreatePaddlePredictor<AnalysisConfig>);
+  module.def("create_paddle_predictor",
+             &paddle::CreatePaddlePredictor<NativeConfig>);
+  module.def("paddle_dtype_size", &paddle::PaddleDtypeSize);
+}
+// Registers the `PaddleDType` enum on `m` with the members FLOAT32 and INT64.
+void BindPaddleDType(py::module *m) {
+  py::enum_<PaddleDType> dtype_enum(*m, "PaddleDType");
+  dtype_enum.value("FLOAT32", PaddleDType::FLOAT32);
+  dtype_enum.value("INT64", PaddleDType::INT64);
+}
+// Registers `PaddleBuf` on `m`.
+//
+// Python-visible surface:
+//   PaddleBuf(size)              - construct from a byte count
+//   PaddleBuf(list of float/int) - construct holding a copy of the values
+//   resize / reset / empty / length
+//   float_data() / int64_data()  - copy the bytes out as a typed list
+//
+// Every copy is sized from the source vector rather than from
+// `PaddleBuf::length()`, so the source is never read past its end even if
+// `Resize` leaves the buffer longer than requested (NOTE(review): confirm
+// whether `Resize` can shrink). Empty vectors skip `memcpy` altogether because
+// `data()` of an empty vector may be null.
+void BindPaddleBuf(py::module *m) {
+  py::class_<PaddleBuf>(*m, "PaddleBuf")
+      .def(py::init<size_t>())
+      .def(py::init([](const std::vector<float> &data) {
+        const size_t num_bytes = data.size() * sizeof(float);
+        PaddleBuf buf(num_bytes);
+        if (num_bytes != 0) {
+          std::memcpy(buf.data(), data.data(), num_bytes);
+        }
+        // Plain return of the local: eligible for copy elision, otherwise it
+        // is moved. Wrapping it in std::move would only inhibit elision.
+        return buf;
+      }))
+      .def(py::init([](const std::vector<int64_t> &data) {
+        const size_t num_bytes = data.size() * sizeof(int64_t);
+        PaddleBuf buf(num_bytes);
+        if (num_bytes != 0) {
+          std::memcpy(buf.data(), data.data(), num_bytes);
+        }
+        return buf;
+      }))
+      .def("resize", &PaddleBuf::Resize)
+      .def("reset",
+           [](PaddleBuf &self, const std::vector<float> &data) {
+             const size_t num_bytes = data.size() * sizeof(float);
+             self.Resize(num_bytes);
+             if (num_bytes != 0) {
+               std::memcpy(self.data(), data.data(), num_bytes);
+             }
+           })
+      .def("reset",
+           [](PaddleBuf &self, const std::vector<int64_t> &data) {
+             const size_t num_bytes = data.size() * sizeof(int64_t);
+             self.Resize(num_bytes);
+             if (num_bytes != 0) {
+               std::memcpy(self.data(), data.data(), num_bytes);
+             }
+           })
+      .def("empty", &PaddleBuf::empty)
+      .def("float_data",
+           [](PaddleBuf &self) -> std::vector<float> {
+             // Trailing bytes that do not fill a whole element are dropped by
+             // the integer division.
+             const float *first = static_cast<const float *>(self.data());
+             const size_t count = self.length() / sizeof(float);
+             return std::vector<float>(first, first + count);
+           })
+      .def("int64_data",
+           [](PaddleBuf &self) -> std::vector<int64_t> {
+             const int64_t *first = static_cast<const int64_t *>(self.data());
+             const size_t count = self.length() / sizeof(int64_t);
+             return std::vector<int64_t>(first, first + count);
+           })
+      .def("length", &PaddleBuf::length);
+}
+// Registers `PaddleTensor` on `m`: default-constructible, with read/write
+// attributes `name`, `shape`, `data`, `dtype` and `lod` mapped onto the members
+// of the same names.
+void BindPaddleTensor(py::module *m) {
+  py::class_<PaddleTensor> tensor_cls(*m, "PaddleTensor");
+  tensor_cls.def(py::init<>());
+  tensor_cls.def_readwrite("name", &PaddleTensor::name);
+  tensor_cls.def_readwrite("shape", &PaddleTensor::shape);
+  tensor_cls.def_readwrite("data", &PaddleTensor::data);
+  tensor_cls.def_readwrite("dtype", &PaddleTensor::dtype);
+  tensor_cls.def_readwrite("lod", &PaddleTensor::lod);
+}
+// Registers the `PaddlePlace` enum on `m`; the Python member names drop the
+// `k` prefix of the C++ enumerators (UNK, CPU, GPU).
+void BindPaddlePlace(py::module *m) {
+  py::enum_<PaddlePlace> place_enum(*m, "PaddlePlace");
+  place_enum.value("UNK", PaddlePlace::kUNK);
+  place_enum.value("CPU", PaddlePlace::kCPU);
+  place_enum.value("GPU", PaddlePlace::kGPU);
+}
+// Registers `PaddlePredictor` on `m`, plus its `Config` type, which is
+// registered in the scope of the predictor class rather than of the module.
+//
+// `run` takes the input tensors and hands back the output tensors as its
+// result instead of filling an out-parameter; whatever `Run` itself returns is
+// not inspected here.
+void BindPaddlePredictor(py::module *m) {
+  py::class_<PaddlePredictor> predictor_cls(*m, "PaddlePredictor");
+  predictor_cls.def(
+      "run",
+      [](PaddlePredictor &predictor, const std::vector<PaddleTensor> &feeds) {
+        std::vector<PaddleTensor> fetches;
+        predictor.Run(feeds, &fetches);
+        return fetches;
+      });
+  predictor_cls.def("get_input_tensor", &PaddlePredictor::GetInputTensor);
+  predictor_cls.def("get_output_tensor", &PaddlePredictor::GetOutputTensor);
+  predictor_cls.def("zero_copy_run", &PaddlePredictor::ZeroCopyRun);
+  predictor_cls.def("clone", &PaddlePredictor::Clone);
+
+  py::class_<PaddlePredictor::Config> config_cls(predictor_cls, "Config");
+  config_cls.def(py::init<>());
+  config_cls.def_readwrite("model_dir", &PaddlePredictor::Config::model_dir);
+}
+// Registers `NativeConfig` on `m` as a subclass of `PaddlePredictor::Config`.
+// Plain data members are exposed as read/write attributes; the CPU math
+// library thread count is exposed through a setter/getter method pair.
+void BindNativeConfig(py::module *m) {
+  py::class_<NativeConfig, PaddlePredictor::Config> config_cls(*m,
+                                                               "NativeConfig");
+  config_cls.def(py::init<>());
+
+  // Read/write attributes.
+  config_cls.def_readwrite("use_gpu", &NativeConfig::use_gpu);
+  config_cls.def_readwrite("device", &NativeConfig::device);
+  config_cls.def_readwrite("fraction_of_gpu_memory",
+                           &NativeConfig::fraction_of_gpu_memory);
+  config_cls.def_readwrite("prog_file", &NativeConfig::prog_file);
+  config_cls.def_readwrite("param_file", &NativeConfig::param_file);
+  config_cls.def_readwrite("specify_input_name",
+                           &NativeConfig::specify_input_name);
+
+  // Methods.
+  config_cls.def("set_cpu_math_library_num_threads",
+                 &NativeConfig::SetCpuMathLibraryNumThreads);
+  config_cls.def("cpu_math_library_num_threads",
+                 &NativeConfig::cpu_math_library_num_threads);
+}
+// Registers `NativePaddlePredictor` on `m` as a subclass of `PaddlePredictor`,
+// constructible from a `NativeConfig`.
+//
+// `run` returns the output tensors as its result; whatever `Run` itself
+// returns is not inspected here. `scope` uses the `reference` return policy,
+// so Python does not take ownership of the returned object.
+void BindNativePredictor(py::module *m) {
+  py::class_<NativePaddlePredictor, PaddlePredictor> predictor_cls(
+      *m, "NativePaddlePredictor");
+  predictor_cls.def(py::init<const NativeConfig &>());
+  predictor_cls.def("init", &NativePaddlePredictor::Init);
+  predictor_cls.def("run",
+                    [](NativePaddlePredictor &predictor,
+                       const std::vector<PaddleTensor> &feeds) {
+                      std::vector<PaddleTensor> fetches;
+                      predictor.Run(feeds, &fetches);
+                      return fetches;
+                    });
+  predictor_cls.def("get_input_tensor", &NativePaddlePredictor::GetInputTensor);
+  predictor_cls.def("get_output_tensor",
+                    &NativePaddlePredictor::GetOutputTensor);
+  predictor_cls.def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun);
+  predictor_cls.def("clone", &NativePaddlePredictor::Clone);
+  predictor_cls.def("scope", &NativePaddlePredictor::scope,
+                    py::return_value_policy::reference);
+}
+// Registers `AnalysisConfig` on `m`.
+//
+// Constructors: copy from another config, from one string, or from two
+// strings. Most members are exposed one-to-one under snake_case names.
+// `set_model` is overloaded on one or two string arguments; each overload is
+// selected with a `static_cast` to the exact member-function-pointer type so
+// the compiler verifies that such an overload exists (a C-style cast would
+// also accept unrelated pointer conversions). `pass_builder` uses the
+// `reference` return policy, so Python does not take ownership of the
+// returned object.
+void BindAnalysisConfig(py::module *m) {
+  py::class_<AnalysisConfig>(*m, "AnalysisConfig")
+      .def(py::init<const AnalysisConfig &>())
+      .def(py::init<const std::string &>())
+      .def(py::init<const std::string &, const std::string &>())
+      .def("set_model",
+           static_cast<void (AnalysisConfig::*)(const std::string &)>(
+               &AnalysisConfig::SetModel))
+      .def("set_model",
+           static_cast<void (AnalysisConfig::*)(const std::string &,
+                                                const std::string &)>(
+               &AnalysisConfig::SetModel))
+      .def("set_prog_file", &AnalysisConfig::SetProgFile)
+      .def("set_params_file", &AnalysisConfig::SetParamsFile)
+      .def("model_dir", &AnalysisConfig::model_dir)
+      .def("prog_file", &AnalysisConfig::prog_file)
+      .def("params_file", &AnalysisConfig::params_file)
+      .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu,
+           py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0)
+      .def("disable_gpu", &AnalysisConfig::DisableGpu)
+      .def("use_gpu", &AnalysisConfig::use_gpu)
+      .def("gpu_device_id", &AnalysisConfig::gpu_device_id)
+      .def("memory_pool_init_size_mb",
+           &AnalysisConfig::memory_pool_init_size_mb)
+      .def("fraction_of_gpu_memory_for_pool",
+           &AnalysisConfig::fraction_of_gpu_memory_for_pool)
+      .def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim,
+           py::arg("x") = true)
+      .def("ir_optim", &AnalysisConfig::ir_optim)
+      .def("switch_use_feed_fetch_ops", &AnalysisConfig::SwitchUseFeedFetchOps,
+           py::arg("x") = true)
+      .def("use_feed_fetch_ops_enabled",
+           &AnalysisConfig::use_feed_fetch_ops_enabled)
+      .def("switch_specify_input_names",
+           &AnalysisConfig::SwitchSpecifyInputNames, py::arg("x") = true)
+      .def("specify_input_name", &AnalysisConfig::specify_input_name)
+      .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
+           py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
+           py::arg("min_subgraph_size") = 3)
+      .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
+      .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
+           py::arg("x") = true)
+      .def("enable_mkldnn", &AnalysisConfig::EnableMKLDNN)
+      .def("mkldnn_enabled", &AnalysisConfig::mkldnn_enabled)
+      .def("set_cpu_math_library_num_threads",
+           &AnalysisConfig::SetCpuMathLibraryNumThreads)
+      .def("cpu_math_library_num_threads",
+           &AnalysisConfig::cpu_math_library_num_threads)
+      .def("to_native_config", &AnalysisConfig::ToNativeConfig)
+      .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp)
+      .def("set_model_buffer", &AnalysisConfig::SetModelBuffer)
+      .def("model_from_memory", &AnalysisConfig::model_from_memory)
+      .def("pass_builder", &AnalysisConfig::pass_builder,
+           py::return_value_policy::reference);
+}
+// Registers `AnalysisPredictor` on `m` as a subclass of `PaddlePredictor`,
+// constructible from an `AnalysisConfig`.
+//
+// `run` returns the output tensors as its result; whatever `Run` itself
+// returns is not inspected here. `scope` uses the `reference` return policy,
+// so Python does not take ownership of the returned object.
+void BindAnalysisPredictor(py::module *m) {
+  py::class_<AnalysisPredictor, PaddlePredictor> predictor_cls(
+      *m, "AnalysisPredictor");
+  predictor_cls.def(py::init<const AnalysisConfig &>());
+  predictor_cls.def("init", &AnalysisPredictor::Init);
+  predictor_cls.def("run",
+                    [](AnalysisPredictor &predictor,
+                       const std::vector<PaddleTensor> &feeds) {
+                      std::vector<PaddleTensor> fetches;
+                      predictor.Run(feeds, &fetches);
+                      return fetches;
+                    });
+  predictor_cls.def("get_input_tensor", &AnalysisPredictor::GetInputTensor);
+  predictor_cls.def("get_output_tensor", &AnalysisPredictor::GetOutputTensor);
+  predictor_cls.def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun);
+  predictor_cls.def("clone", &AnalysisPredictor::Clone);
+  predictor_cls.def("scope", &AnalysisPredictor::scope,
+                    py::return_value_policy::reference);
+}
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/fluid/pybind/inference_api.h
+++ b/paddle/fluid/pybind/inference_api.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <pybind11/pybind11.h>
+namespace paddle {
+namespace pybind {
+void BindInferenceApi(pybind11::module *m);
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -49,6 +49,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
 #include "paddle/fluid/pybind/imperative.h"
+#include "paddle/fluid/pybind/inference_api.h"
 #include "paddle/fluid/pybind/ir.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
@@ -1083,9 +1084,9 @@ All parameter, weight, gradient are variables in Paddle.
  BindRecordIOWriter(&m);
  BindAsyncExecutor(&m);
  BindGraph(&m);
  BindNode(&m);
+  BindInferenceApi(&m);
 }
 }  // namespace pybind
 }  // namespace paddle
--- a/python/paddle/fluid/compiler.py
+++ b/python/paddle/fluid/compiler.py
@@ -24,6 +24,8 @@ __all__ = ['CompiledProgram', 'ExecutionStrategy', 'BuildStrategy']
 ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
 BuildStrategy = core.ParallelExecutor.BuildStrategy
+InferNativeConfig = core.NativeConfig
+InferAnalysisConfig = core.AnalysisConfig
 def _place_obj(place):
@@ -70,6 +72,7 @@ class CompiledProgram(object):
        self._executor = None
        self._compiled = False
        self._is_data_parallel = False
+        self._is_inference = False
    def with_data_parallel(self,
                           loss_name=None,
@@ -109,10 +112,24 @@ class CompiledProgram(object):
            self._build_strategy = BuildStrategy()
        return self
-    def _with_distributed(self):
+    def with_inference_optimize(self, config):
-        raise NotImplementedError()
+        """ Add inference optimize
+        Args:
+            config: instance of `NativeConfig` or `AnalysisConfig` to create predictor
+        Returns:
+            self
+        """
+        assert any([
+            isinstance(config, InferNativeConfig),
+            isinstance(config, InferAnalysisConfig)
+        ])
+        self._is_data_parallel = False
+        self._is_inference = True
+        self._infer_config = config
+        return self
-    def _with_inference_optimize(self):
+    def _with_distributed(self):
        raise NotImplementedError()
    def _compile_data_parallel(self):
@@ -177,6 +194,10 @@ class CompiledProgram(object):
            if self._loss_name else six.u(''), self._scope, self._local_scopes,
            self._exec_strategy, self._build_strategy)
+    def _compile_inference(self):
+        assert self._is_data_parallel is False
+        return core.create_paddle_predictor(self._infer_config)
    def _compile(self, scope, place):
        """Compile the program based on the configs.
@@ -200,6 +221,8 @@ class CompiledProgram(object):
        self._place = place
        if self._is_data_parallel:
            self._executor = self._compile_data_parallel()
+        elif self._is_inference:
+            self._executor = self._compile_inference()
        else:
            p = _place_obj(self._place)
            self._executor = core.Executor(p)

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -27,6 +27,8 @@ from .. import compat as cpt
 __all__ = ['Executor', 'global_scope', 'scope_guard']
 g_scope = core.Scope()
+InferNativeConfig = core.NativeConfig
+InferAnalysisConfig = core.AnalysisConfig
 def global_scope():
@@ -533,6 +535,8 @@ class Executor(object):
                fetch_list=fetch_list,
                fetch_var_name=fetch_var_name,
                return_numpy=return_numpy)
+        elif program._is_inference:
+            return self._run_inference(program, feed)
        else:
            # TODO(panyx0718): Can compile program to optimize executor
            # performance.
@@ -590,3 +594,6 @@ class Executor(object):
        if return_numpy:
            outs = as_numpy(outs)
        return outs
+    def _run_inference(self, program, feed):
+        return self.executor.run(feed)
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -195,9 +195,34 @@ def infer(use_cuda, save_dirname=None):
                          },
                          fetch_list=fetch_targets,
                          return_numpy=False)
-        print(results[0].recursive_sequence_lengths())
+        def to_infer_tensor(lod_tensor):
+            infer_tensor = fluid.core.PaddleTensor()
+            infer_tensor.lod = lod_tensor.lod()
+            infer_tensor.data = fluid.core.PaddleBuf(np.array(lod_tensor))
+            infer_tensor.shape = lod_tensor.shape()
+            infer_tensor.dtype = fluid.core.PaddleDType.INT64
+            return infer_tensor
+        infer_inputs = [first_word, second_word, third_word, fourth_word]
+        infer_inputs = [to_infer_tensor(t) for t in infer_inputs]
+        infer_config = fluid.core.NativeConfig()
+        infer_config.model_dir = 'word2vec.inference.model'
+        infer_config.use_gpu = use_cuda
+        if use_cuda:
+            infer_config.device = 0
+            infer_config.fraction_of_gpu_memory = 0.15
+        compiled_program = fluid.compiler.CompiledProgram(inference_program)
+        compiled_program.with_inference_optimize(infer_config)
+        assert compiled_program._is_inference is True
+        infer_outputs = exe.run(compiled_program, feed=infer_inputs)
        np_data = np.array(results[0])
-        print("Inference Shape: ", np_data.shape)
+        infer_out = infer_outputs[0].data.float_data()
+        for a, b in zip(np_data[0], infer_out):
+            g_a = float("{:.6g}".format(a))
+            g_b = float("{:.6g}".format(b))
+            assert g_a == g_b
 def main(use_cuda, is_sparse, is_parallel):