Unverified commit 24faf9e8, authored by H Houjiang Chen, committed by GitHub

Support cross compilation for ARM (#16899)

Enable setting thread number in Anakin Config

Fix code style

Reduce cmake file changes

test=release/1.4
Parent e4e5bad6
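With this change, an ARM build is configured by enabling the new WITH_ARM_CPU option together with a CMake cross toolchain, e.g. cmake .. -DWITH_ARM_CPU=ON -DCMAKE_TOOLCHAIN_FILE=<your-arm-toolchain>.cmake (the toolchain file is an assumption, not part of this diff); the top-level CMakeLists.txt then descends only into paddle/fluid/inference/lite, as the first hunk below shows.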
......@@ -26,6 +26,12 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "AR tools: ${CMAKE_AR}")
option(WITH_ARM_CPU "Cross compile PaddlePaddle to support ARM CPU" OFF)
if (WITH_ARM_CPU)
add_subdirectory(paddle/fluid/inference/lite)
return()
endif()
if(WIN32)
set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
......
......@@ -20,6 +20,10 @@
# for instance, protobuf libs path is <install_dir>/lib64
# on CentOS, but <install_dir>/lib on other systems.
if (WITH_ARM_CPU)
return()
endif()
IF(WIN32)
SET(HOST_SYSTEM "win32")
ELSE(WIN32)
......
add_definitions(-DUSE_ARM_PLACE)
set(CMAKE_CXX_FLAGS "-std=c++11 -pie -fPIE -Wno-attributes ${CMAKE_CXX_FLAGS}")
if (NOT (${CMAKE_CXX_COMPILER} MATCHES "clang\\+\\+$"))
set(CMAKE_CXX_FLAGS "-fopenmp ${CMAKE_CXX_FLAGS}")
endif()
if (ANDROID)
set(CMAKE_CXX_FLAGS "-llog ${CMAKE_CXX_FLAGS}")
endif()
if (IOS)
set(CMAKE_CXX_FLAGS "-fembed-bitcode ${CMAKE_CXX_FLAGS}")
endif()
set(PADDLE_LITE_LIB paddle-lite)
set(PADDLE_LITE_SRCS api.cc api_anakin_engine.cc)
set(PADDLE_LITE_PATH ${PADDLE_SOURCE_DIR}/paddle/fluid/inference/lite)
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PADDLE_LITE_PATH} ${PADDLE_LITE_PATH}/output
${PADDLE_LITE_PATH}/output/saber)
if (BUILD_SHARED_LIBS)
add_library(${PADDLE_LITE_LIB} SHARED ${PADDLE_LITE_SRCS})
else()
add_library(${PADDLE_LITE_LIB} STATIC ${PADDLE_LITE_SRCS})
endif(BUILD_SHARED_LIBS)
#target_link_libraries(${PADDLE_LITE_LIB} )
#add_library(anakin SHARED IMPORTED)
#set_target_properties(anakin PROPERTIES IMPORTED_LOCATION
# ${PADDLE_LITE_PATH}/output/libanakin.so)
add_library(anakin STATIC IMPORTED)
set_target_properties(anakin PROPERTIES IMPORTED_LOCATION
${PADDLE_LITE_PATH}/output/libanakin_static.a)
add_library(saber_common STATIC IMPORTED)
set_target_properties(saber_common PROPERTIES IMPORTED_LOCATION
${PADDLE_LITE_PATH}/output/libanakin_saber_common.a)
add_library(protobuf STATIC IMPORTED)
set_target_properties(protobuf PROPERTIES IMPORTED_LOCATION
${PADDLE_LITE_PATH}/output/protobuf/lib/libprotobuf.a)
add_executable(test-benchmark benchmark/benchmark.cc)
target_link_libraries(test-benchmark paddle-lite "-Wl,--whole-archive"
saber_common anakin "-Wl,--no-whole-archive" protobuf)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cassert>
#include <memory>
#include <string>
#include <vector>
#include "paddle_api.h" // NOLINT
#include "utils/logger/logger.h"
namespace paddle {
namespace contrib {
// Configurations for Anakin engine.
struct AnakinConfig : public PaddlePredictor::Config {
enum TargetType { ARM = 0, GPU };
enum PrecisionType { FP32 = 0, FP16, INT8 };
std::string model_file;
int max_batch_size = 1;
int thread_num = 1;
TargetType target_type = ARM;
PrecisionType precision_type = FP32;
};
} // namespace contrib
} // namespace paddle
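The thread_num field above is the knob referred to by "Enable setting thread number in Anakin Config" in the commit message. A minimal usage sketch, with a hypothetical model path and an illustrative thread count (the factory call matches api_anakin_engine.cc below):

// Hypothetical usage sketch; model path and thread count are illustrative.
paddle::contrib::AnakinConfig config;
config.model_file = "./mobilenetv1.anakin.bin";  // assumed model file
config.max_batch_size = 1;
config.thread_num = 4;  // worker threads for the ARM backend (new in this commit)
config.target_type = paddle::contrib::AnakinConfig::ARM;
config.precision_type = paddle::contrib::AnakinConfig::FP32;
auto predictor =
    paddle::CreatePaddlePredictor<paddle::contrib::AnakinConfig,
                                  paddle::PaddleEngineKind::kAnakin>(config);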
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdlib.h>
#include <string.h>  // memcpy
#include <sstream>
#include "paddle/fluid/inference/lite/paddle_api.h"
namespace paddle {
int PaddleDtypeSize(PaddleDType dtype) {
switch (dtype) {
case PaddleDType::FLOAT32:
return sizeof(float);
case PaddleDType::INT64:
return sizeof(int64_t);
case PaddleDType::INT32:
return sizeof(int32_t);
default:
assert(false);
return -1;
}
}
PaddleBuf::PaddleBuf(PaddleBuf &&other)
: data_(other.data_),
length_(other.length_),
memory_owned_(other.memory_owned_) {
other.memory_owned_ = false;
other.data_ = nullptr;
other.length_ = 0;
}
PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; }
PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
if (!other.memory_owned_) {
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
} else {
Resize(other.length());
memcpy(data_, other.data(), other.length());
length_ = other.length();
memory_owned_ = true;
}
return *this;
}
PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) {
// steal the memory and ownership from the other buffer
data_ = other.data_;
length_ = other.length_;
memory_owned_ = other.memory_owned_;
other.data_ = nullptr;
other.length_ = 0;
other.memory_owned_ = false;
return *this;
}
void PaddleBuf::Resize(size_t length) {
// Only the owned memory can be reset; the external memory can't be changed.
if (length_ >= length) return;
if (memory_owned_) {
Free();
data_ = malloc(length);
length_ = length;
memory_owned_ = true;
} else {
// PADDLE_THROW("The memory is allocated externally, can not Resized");
}
}
void PaddleBuf::Reset(void *data, size_t length) {
Free();
memory_owned_ = false;
data_ = data;
length_ = length;
}
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
// PADDLE_ENFORCE_GT(length_, 0UL);
free(static_cast<char *>(data_));
data_ = nullptr;
length_ = 0;
}
}
} // namespace paddle
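To make the ownership rules implemented above concrete, a small illustrative sketch (values are assumptions): copying an owned buffer deep-copies the bytes, copying an externally backed buffer only aliases the pointer, and Resize never touches external memory.

// Illustrative sketch of the PaddleBuf ownership rules above.
float external[4] = {0.f, 1.f, 2.f, 3.f};
paddle::PaddleBuf owned(4 * sizeof(float));              // allocated and owned
paddle::PaddleBuf borrowed(external, sizeof(external));  // external, not owned
paddle::PaddleBuf deep_copy(owned);  // owned source: Resize() + memcpy()
paddle::PaddleBuf alias(borrowed);   // external source: shares the pointer
borrowed.Resize(1024);               // no-op: external memory is never resized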
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/lite/api_anakin_engine.h"
#include <string.h>   // memcpy
#include <algorithm>  // for_each
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <type_traits>  // std::is_same
#include <utility>
#include <vector>
#include "framework/core/net/net.h"
#include "framework/operators/ops.h"
#include "saber/funcs/timer.h"
namespace paddle {
using paddle::contrib::AnakinConfig;
template <typename Target, anakin::Precision Precision>
PaddleInferenceAnakinPredictor<Target, Precision>::
PaddleInferenceAnakinPredictor(const contrib::AnakinConfig &config) {
anakin::saber::Env<Target>::env_init();
#ifdef USE_ARM_PLACE
anakin::saber::Context<Target> ctx;
// set mode and thread number
anakin::saber::PowerMode mode = anakin::saber::SABER_POWER_HIGH;
ctx.set_run_mode(mode, config.thread_num);
// ctx.set_arch(anakin::A73);
// ctx.set_cache(32 * 1024, 512 * 1024, 0);
#endif
CHECK(Init(config));
}
template <typename Target, anakin::Precision Precision>
bool PaddleInferenceAnakinPredictor<Target, Precision>::Init(
const contrib::AnakinConfig &config) {
if (!(graph_.load(config.model_file))) {
LOG(INFO) << "fail to load graph from " << config.model_file;
return false;
}
auto inputs = graph_.get_ins();
for (auto &input_str : inputs) {
graph_.ResetBatchSize(input_str, config.max_batch_size);
max_batch_size_ = config.max_batch_size;
}
// optimize the graph
if (!(graph_.Optimize())) {
return false;
}
// construct the executor
if (executor_p_ == nullptr) {
executor_p_ = new anakin::Net<Target, Precision>(graph_, true);
}
return true;
}
template <typename Target, anakin::Precision Precision>
bool PaddleInferenceAnakinPredictor<Target, Precision>::Run(
const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data, int batch_size) {
for (const auto &input : inputs) {
if (input.dtype != PaddleDType::FLOAT32) {
LOG(INFO) << "Only support float type inputs. " << input.name
<< "'s type is not float";
return false;
}
auto d_tensor_in_p = executor_p_->get_in(input.name);
auto net_shape = d_tensor_in_p->shape();
if (net_shape.size() != input.shape.size()) {
LOG(INFO) << " input " << input.name
<< "'s shape size should be equal to that of net";
return false;
}
int sum = 1;
for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; });
if (sum > net_shape.count()) {
graph_.Reshape(input.name, input.shape);
delete executor_p_;
executor_p_ = new anakin::Net<Target, Precision>(graph_, true);
d_tensor_in_p = executor_p_->get_in(input.name);
}
anakin::saber::Shape tmp_shape;
for (auto s : input.shape) {
tmp_shape.push_back(s);
}
d_tensor_in_p->reshape(tmp_shape);
if (input.lod.size() > 0) {
if (input.lod.size() > 1) {
LOG(INFO) << " input lod first dim should <=1, but you set "
<< input.lod.size();
return false;
}
std::vector<int> offset(input.lod[0].begin(), input.lod[0].end());
d_tensor_in_p->set_seq_offset({offset});
LOG(INFO) << "offset.size(): " << offset.size();
for (int i = 0; i < offset.size(); i++) {
LOG(INFO) << offset[i];
}
}
void *d_data_p = d_tensor_in_p->mutable_data();
if (std::is_same<anakin::ARM, Target>::value) {
memcpy(d_data_p, static_cast<float *>(input.data.data()),
d_tensor_in_p->valid_size() * sizeof(float));
}
}
if (output_data->empty()) {
LOG(INFO) << "At least one output should be set with tensors' names.";
return false;
}
// run prediction
executor_p_->prediction();
for (auto &output : *output_data) {
auto *tensor = executor_p_->get_out(output.name);
output.shape = tensor->valid_shape();
if (output.data.length() < tensor->valid_size() * sizeof(float)) {
output.data.Resize(tensor->valid_size() * sizeof(float));
}
if (std::is_same<anakin::ARM, Target>::value) {
memcpy(output.data.data(), tensor->mutable_data(),
tensor->valid_size() * sizeof(float));
}
}
return true;
}
template <typename Target, anakin::Precision Precision>
anakin::Net<Target, Precision>
&PaddleInferenceAnakinPredictor<Target, Precision>::get_executer() {
return *executor_p_;
}
// The cloned Predictor shares the same network weights with the original
// Predictor.
template <typename Target, anakin::Precision Precision>
std::unique_ptr<PaddlePredictor>
PaddleInferenceAnakinPredictor<Target, Precision>::Clone() {
LOG(INFO) << "Anakin Predictor::clone";
std::unique_ptr<PaddlePredictor> cls(
new PaddleInferenceAnakinPredictor<Target, Precision>());
// construct the executor from the original graph
auto anakin_predictor_p =
dynamic_cast<PaddleInferenceAnakinPredictor<Target, Precision> *>(
cls.get());
if (!anakin_predictor_p) {
LOG(INFO) << "fail to call Init";
return nullptr;
}
anakin_predictor_p->get_executer().init(graph_);
return std::move(cls);
}
template class PaddleInferenceAnakinPredictor<anakin::ARM,
anakin::Precision::FP32>;
template class PaddleInferenceAnakinPredictor<anakin::ARM,
anakin::Precision::INT8>;
// A factory to help create different predictors.
template <>
std::unique_ptr<PaddlePredictor>
CreatePaddlePredictor<contrib::AnakinConfig, PaddleEngineKind::kAnakin>(
const contrib::AnakinConfig &config) {
if (config.target_type != contrib::AnakinConfig::ARM) {
LOG(INFO) << "Anakin Predictor: Only ARM platform is supported currently.";
return nullptr;
}
LOG(INFO) << "Anakin Predictor create.";
if (config.precision_type == contrib::AnakinConfig::FP32) {
LOG(INFO) << "Anakin Predictor create on [ FP32 ].";
std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor<anakin::ARM,
anakin::Precision::FP32>(config));
return x;
} else if (config.precision_type == contrib::AnakinConfig::INT8) {
LOG(INFO) << "Anakin Predictor create on [ INT8 ].";
std::unique_ptr<PaddlePredictor> x(
new PaddleInferenceAnakinPredictor<anakin::ARM,
anakin::Precision::INT8>(config));
return x;
} else {
LOG(INFO) << "Anakin Predictor create on unsupported precision.";
return nullptr;
}
}
#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER
template <typename Target, anakin::Precision Precision>
using executor_t = anakin::Net<Target, Precision>;
template <typename Target, anakin::Precision Precision>
void DisplayOpTimer(executor_t<Target, Precision> *net_executor, int epoch) {
std::vector<float> op_time = net_executor->get_op_time();
auto exec_funcs = net_executor->get_exec_funcs();
auto op_param = net_executor->get_op_param();
for (int i = 0; i < op_time.size(); i++) {
LOG(INFO) << "name: " << exec_funcs[i].name
<< " op_type: " << exec_funcs[i].op_name
<< " op_param: " << op_param[i] << " time " << op_time[i] / epoch;
}
std::map<std::string, float> op_map;
for (int i = 0; i < op_time.size(); i++) {
auto it = op_map.find(op_param[i]);
if (it != op_map.end())
op_map[op_param[i]] += op_time[i];
else
op_map.insert(std::pair<std::string, float>(op_param[i], op_time[i]));
}
for (auto it = op_map.begin(); it != op_map.end(); ++it) {
LOG(INFO) << it->first << " " << (it->second) / epoch << " ms";
}
}
#endif
template <typename Target, anakin::Precision Precision>
PaddleInferenceAnakinPredictor<Target,
Precision>::~PaddleInferenceAnakinPredictor() {
#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER
DisplayOpTimer<Target, Precision>(executor_p_, max_batch_size_);
#endif
delete executor_p_;
executor_p_ = nullptr;
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the implementation of the inference API with the Anakin
* engine embedded; this API can only support Anakin models.
*/
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "framework/core/net/net.h"
#include "framework/graph/graph.h"
#include "paddle/fluid/inference/lite/anakin_config.h"
#include "saber/core/shape.h"
#include "saber/saber_types.h"
namespace paddle {
using contrib::AnakinConfig;
template <typename Target, anakin::Precision Precision>
class PaddleInferenceAnakinPredictor : public PaddlePredictor {
public:
PaddleInferenceAnakinPredictor() {}
explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config);
// NOTE Unlike the native engine, the buffers of the Anakin engine's
// output_data should be allocated first.
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
int batch_size = -1) override;
std::vector<std::string> GetInputNames() override { return graph_.get_ins(); }
std::vector<std::string> GetOutputNames() override {
return graph_.get_outs();
}
std::unique_ptr<PaddlePredictor> Clone() override;
anakin::Net<Target, Precision>& get_executer();
~PaddleInferenceAnakinPredictor() override;
private:
bool Init(const AnakinConfig& config);
anakin::graph::Graph<Target, Precision> graph_;
anakin::Net<Target, Precision>* executor_p_{nullptr};
AnakinConfig config_;
int max_batch_size_{0};
};
} // namespace paddle
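Because Clone() shares the network weights with the original predictor (see the comment in api_anakin_engine.cc above) and cloned predictors should be thread-safe, one predictor per thread is the natural pattern. A hypothetical sketch; the helper below is an assumption, not part of this commit:

// Hypothetical per-thread cloning sketch; not part of this commit.
#include <memory>
#include <thread>
#include <vector>
void RunOnTwoThreads(paddle::PaddlePredictor *predictor,
                     const std::vector<paddle::PaddleTensor> &inputs,
                     std::vector<paddle::PaddleTensor> *out0,
                     std::vector<paddle::PaddleTensor> *out1) {
  // The clone shares weights with `predictor` but owns its own executor.
  std::unique_ptr<paddle::PaddlePredictor> clone = predictor->Clone();
  std::thread t0([&] { predictor->Run(inputs, out0); });
  std::thread t1([&] { clone->Run(inputs, out1); });
  t0.join();
  t1.join();
}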
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdio.h>   // sscanf
#include <stdlib.h>  // atoi
#include <time.h>    // clock_gettime
#include <unistd.h>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include "paddle/fluid/inference/lite/anakin_config.h"
#include "paddle/fluid/inference/lite/paddle_api.h"
namespace paddle {
void PrintShape(const std::vector<int> &shape) {
std::ostringstream os;
os << "Shape: ";
if (shape.size() > 0) {
os << shape[0];
for (int i = 1; i < shape.size(); ++i) {
os << ", " << shape[i];
}
}
LOG(INFO) << os.str();
}
int ShapeSize(const std::vector<int> &shape) {
int size = 1;
for (int j = 0; j < shape.size(); ++j) {
size *= shape[j];
}
return size;
}
template <typename T>
int InitTensorValFromFile(const std::string &file, PaddleTensor *tensor) {
int size = ShapeSize(tensor->shape);
void *tensor_data = tensor->data.data();
std::ifstream in(file, std::ios::in | std::ios::binary);
if (!in.is_open()) {
LOG(INFO) << "failed to open " << file;
return -1;
}
in.read(reinterpret_cast<char *>(tensor_data), size * sizeof(T));
in.close();
return 0;
}
int SetupTensors(const std::vector<std::vector<int>> &shapes,
const std::vector<std::string> &names,
std::vector<PaddleTensor> *outputs) {
while (outputs->size() < shapes.size()) {
outputs->emplace_back();
}
for (int i = 0; i < shapes.size(); ++i) {
int size = ShapeSize(shapes[i]);
outputs->at(i).name = names[i];
outputs->at(i).shape = shapes[i];
outputs->at(i).data.Resize(size * sizeof(float));
outputs->at(i).dtype = FLOAT32;
}
return 0;
}
int test(const char *model, const char *image, const char *image_shape,
const int quant, const int times) {
contrib::AnakinConfig config;
config.model_file = std::string(model);
// config.model_file = "./mobilenetv1.anakin.bin";
config.max_batch_size = 1;
config.precision_type =
(quant == 1) ? contrib::AnakinConfig::INT8 : contrib::AnakinConfig::FP32;
LOG(INFO) << "quant: " << quant;
std::unique_ptr<PaddlePredictor> predictor =
CreatePaddlePredictor<contrib::AnakinConfig, PaddleEngineKind::kAnakin>(
config);
LOG(INFO) << "create predictor success";
std::vector<std::string> in_names = predictor->GetInputNames();
std::vector<PaddleTensor> inputs, outputs;
std::vector<std::vector<int>> in_shapes;
std::vector<int> dim{1, 3, 224, 224};
sscanf(image_shape, "%d,%d,%d,%d", &dim[0], &dim[1], &dim[2], &dim[3]);
in_shapes.push_back(dim);
SetupTensors(in_shapes, in_names, &inputs);
PrintShape(dim);
// InitTensorValFromFile<float>("./test_image_1x3x224x224_float", &inputs[0]);
InitTensorValFromFile<float>(std::string(image), &inputs[0]);
LOG(INFO) << "init tensor value success";
std::vector<std::string> out_names = predictor->GetOutputNames();
LOG(INFO) << "output size: " << out_names.size();
outputs.resize(out_names.size());
for (int i = 0; i < out_names.size(); ++i) {
outputs[i].name = out_names[i];
}
LOG(INFO) << "start run prediction";
predictor->Run(inputs, &outputs);
struct timespec ts_begin, ts_end;
clock_gettime(CLOCK_MONOTONIC, &ts_begin);
for (int i = 0; i < times; ++i) {
predictor->Run(inputs, &outputs);
}
clock_gettime(CLOCK_MONOTONIC, &ts_end);
uint64_t elapsed = (ts_end.tv_sec - ts_begin.tv_sec) * 1e3 +
(ts_end.tv_nsec - ts_begin.tv_nsec) / 1e6;
LOG(INFO) << "elapsed: " << (1.f * elapsed) / times << " ms";
LOG(INFO) << "finish prediction";
for (int i = 0; i < outputs.size(); ++i) {
int size = ShapeSize(outputs[i].shape);
// int stride = (size + 19) / 20;
int stride = 1;
int loop = size / stride;
float *output_data = static_cast<float *>(outputs[i].data.data());
std::ostringstream os;
os << output_data[0];
for (int j = 1; j < loop; ++j) {
os << ", " << output_data[j * stride];
}
LOG(INFO) << os.str();
}
return 0;
}
} // namespace paddle
int main(int argc, char *argv[]) {
if (argc < 6) {
LOG(INFO) << "Usage: ./benchmark [model] [image] [image-shape] [8bit] "
"[run-times]";
LOG(INFO) << "Example:";
LOG(INFO) << " ./benchmark ./mobilenetv1.model ./test_image.bin "
"1,3,224,224 0 10";
return 1;
}
int quant_8bit = atoi(argv[4]);
int times = atoi(argv[5]);
return paddle::test(argv[1], argv[2], argv[3], quant_8bit, times);
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
/*! \file paddle_api.h
*/
/*! \mainpage Paddle Inference APIs
* \section intro_sec Introduction
* The Paddle inference library aims to offer a high-performance inference SDK
* for Paddle users.
*/
#include <cassert>
#include <memory>
#include <string>
#include <vector>
/*! \namespace paddle
*/
namespace paddle {
/** paddle data type.
*/
enum PaddleDType {
FLOAT32,
INT64,
INT32,
// TODO(Superjomn) support more data types if needed.
};
/**
* \brief Memory manager for `PaddleTensor`.
*
* The PaddleBuf holds a buffer for data input or output. The memory can be
* allocated by the user or by the PaddleBuf itself, but in any case, the
* PaddleBuf should be reused for better performance.
*
* For user-allocated memory, the following API can be used:
* - PaddleBuf(void* data, size_t length) to set an external memory by
*   specifying the memory address and length.
* - Reset(void* data, size_t length) to reset the PaddleBuf with an external
*   memory.
* ATTENTION: for user-allocated memory, deallocation should be done by the
* user externally after the program finishes. The PaddleBuf won't do any
* allocation or deallocation.
*
* To have the PaddleBuf allocate and manage the memory:
* - PaddleBuf(size_t length) will allocate a memory of size `length`.
* - Resize(size_t length) resizes the memory to no less than `length`.
*   ATTENTION: if the allocated memory is already larger than `length`,
*   nothing will be done.
*
* Usage:
*
* Let PaddleBuf manage the memory internally.
* \code{cpp}
* const int num_elements = 128;
* PaddleBuf buf(num_elements * sizeof(float));
* \endcode
*
* Or
* \code{cpp}
* PaddleBuf buf;
* buf.Resize(num_elements * sizeof(float));
* \endcode
* Works exactly the same.
*
* One can also make the `PaddleBuf` use the external memory.
* \code{cpp}
* PaddleBuf buf;
* void* external_memory = new float[num_elements];
* buf.Reset(external_memory, num_elements*sizeof(float));
* ...
* delete[] external_memory; // manage the memory lifetime outside.
* \endcode
*/
class PaddleBuf {
public:
/** PaddleBuf allocates memory internally and manages it.
*/
explicit PaddleBuf(size_t length)
: data_(new char[length]), length_(length), memory_owned_(true) {}
/** Set external memory, the PaddleBuf won't manage it.
*/
PaddleBuf(void* data, size_t length)
: data_(data), length_(length), memory_owned_{false} {}
/** Copy only available when memory is managed externally.
*/
explicit PaddleBuf(const PaddleBuf&);
/** Resize the memory.
*/
void Resize(size_t length);
/** Reset to external memory, with address and length set.
*/
void Reset(void* data, size_t length);
/** Tell whether the buffer is empty.
*/
bool empty() const { return length_ == 0; }
/** Get the data's memory address.
*/
void* data() const { return data_; }
/** Get the memory length.
*/
size_t length() const { return length_; }
~PaddleBuf() { Free(); }
PaddleBuf& operator=(const PaddleBuf&);
PaddleBuf& operator=(PaddleBuf&&);
PaddleBuf() = default;
PaddleBuf(PaddleBuf&& other);
private:
void Free();
void* data_{nullptr}; // pointer to the data memory.
size_t length_{0}; // number of memory bytes.
bool memory_owned_{true};
};
/** Basic input and output data structure for PaddlePredictor.
*/
struct PaddleTensor {
PaddleTensor() = default;
std::string name; // variable name.
std::vector<int> shape;
PaddleBuf data; // blob of data.
PaddleDType dtype;
std::vector<std::vector<size_t>> lod; // Tensor+LoD equals LoDTensor
};
enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
/** A zero-copy tensor; currently only supported by `AnalysisPredictor`.
*/
class ZeroCopyTensor {
public:
void Reshape(const std::vector<int>& shape);
/** Get the memory in CPU or GPU with a specific data type; Reshape should be
* called first to tell the data size.
* One can directly write to this memory to feed the data.
* This is for writing the input tensor.
*/
template <typename T>
T* mutable_data(PaddlePlace place);
/** Get the memory directly; the place and element size are returned by
* pointer.
* This is for reading the output tensor.
*/
template <typename T>
T* data(PaddlePlace* place, int* size) const;
template <typename T>
void copy_from_cpu(const T* data);
template <typename T>
void copy_to_cpu(T* data);
std::vector<int> shape() const;
void SetLoD(const std::vector<std::vector<size_t>>& x);
std::vector<std::vector<size_t>> lod() const;
const std::string& name() const { return name_; }
void SetPlace(PaddlePlace place, int device = -1) {
place_ = place;
device_ = device;
}
PaddleDType type() const;
protected:
explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
void SetName(const std::string& name) { name_ = name; }
void* FindTensor() const;
private:
std::string name_;
bool input_or_output_;
friend class AnalysisPredictor;
void* scope_{nullptr};
// The corresponding tensor pointer inside Paddle workspace is cached for
// performance.
mutable void* tensor_{nullptr};
PaddlePlace place_;
PaddleDType dtype_;
int device_;
};
/** A simple Inference API for Paddle.
*/
class PaddlePredictor {
public:
struct Config;
PaddlePredictor() = default;
PaddlePredictor(const PaddlePredictor&) = delete;
PaddlePredictor& operator=(const PaddlePredictor&) = delete;
/** Predict a record.
* The caller should be responsible for allocating and releasing the memory of
* `inputs`. `inputs` should be available until Run returns. Caller should be
* responsible for the output tensor's buffer, either allocated or passed from
* outside.
*/
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0;
/** \brief Get input names of the model
*/
virtual std::vector<std::string> GetInputNames() { return {}; }
/** \brief Get output names of the model
*/
virtual std::vector<std::string> GetOutputNames() { return {}; }
/** \brief Get a mutable tensor directly.
*
* NOTE Only works in AnalysisPredictor.
*
* One can also use this to modify any temporary variable related tensors in
* the predictor.
*
*/
virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string& name) {
return nullptr;
}
/**
* \brief Get an immutable tensor without copy.
*
* NOTE Only works in AnalysisPredictor.
* One can use this API to get any temporary tensors in the predictor and
* read it.
*/
virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
const std::string& name) {
return nullptr;
}
/**
* \brief Run the predictor with zero-copied inputs and outputs.
*
* NOTE Only works in AnalysisPredictor.
*
* This will save the IO copy for transferring inputs and outputs to the
* predictor workspace and gain some performance improvement.
* To use it, one should call the `AnalysisConfig.SwitchUseFeedFetchOp(true)`
* and then use the `GetInputTensor` and `GetOutputTensor` to directly write
* or read the input/output tensors.
*/
virtual bool ZeroCopyRun() { return false; }
/** Clone a predictor that shares the model weights; the cloned predictor
* should be thread-safe.
*/
virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
/** Destroy the Predictor.
*/
virtual ~PaddlePredictor() = default;
/** \brief Get the serialized model program that executes in inference phase.
* Its data type is ProgramDesc, which is a protobuf message.
*/
virtual std::string GetSerializedProgram() const {
assert(false); // Force raise error.
return "NotImplemented";
}
/** The common configs for all the predictors.
*/
struct Config {
std::string model_dir; /*!< path to the model directory. */
};
};
struct NativeConfig : public PaddlePredictor::Config {
// GPU related fields.
bool use_gpu{false};
int device{0};
float fraction_of_gpu_memory{
-1.f}; /*!< Change to a float in (0,1] if needed. */
// Specify the exact path of program and parameter files.
std::string prog_file;
std::string param_file;
/** Specify the variable's name of each input if input tensors don't follow
* the
* `feeds` and `fetches` of the phase `save_inference_model`.
*/
bool specify_input_name{false};
/** Set and get the number of cpu math library threads.
*/
void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) {
cpu_math_library_num_threads_ = cpu_math_library_num_threads;
}
int cpu_math_library_num_threads() const {
return cpu_math_library_num_threads_;
}
protected:
// number of cpu math library (such as MKL, OpenBlas) threads for each
// instance.
int cpu_math_library_num_threads_{1};
};
/*! \fn std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&
* config);
*
* \brief A factory to help create different predictors.
*
* Usage:
*
* \code{.cpp}
* NativeConfig config;
* ... // change the configs.
* auto native_predictor = CreatePaddlePredictor(config);
* \endcode
*
* FOR EXTENSION DEVELOPER:
* Different predictors are designated by config type. Similar configs can be
* merged, but there shouldn't be a huge config containing different fields for
* more than one kind of predictors.
*/
template <typename ConfigT>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
/** NOTE The following APIs are too trivial and will be discarded in future
* versions.
*/
enum class PaddleEngineKind {
kNative = 0, /*!< Use the native Fluid facility. */
kAutoMixedTensorRT, /*!< Automatically mix Fluid with TensorRT. */
kAnalysis, /*!< More optimization. */
kAnakin /*!< Use Anakin for inference, not mature yet. */
};
template <typename ConfigT, PaddleEngineKind engine>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
int PaddleDtypeSize(PaddleDType dtype);
std::string get_version();
} // namespace paddle