Commit 9141bee1, authored by cuichaowen, committed by Yan Chunwei

add Anakin api for paddle (#11228)

Parent d48172f2
paddle/contrib/inference/CMakeLists.txt

@@ -24,31 +24,37 @@ set(ANAKIN_LIBRARY "" CACHE STRING "path of Anakin library")
 set(inference_deps paddle_inference_api paddle_fluid_api)
 
 # if anakin is set enable anakin api implementation
-if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY)
+if(ANAKIN_INCLUDE AND ANAKIN_LIBRARY)
   set(ANAKIN_FOUND ON)
 else()
   set(ANAKIN_FOUND OFF)
 endif()
 
+function(fetch_include_recursively root_dir)
+  if (IS_DIRECTORY ${root_dir})
+    include_directories(${root_dir})
+  endif()
+
+  file(GLOB ALL_SUB RELATIVE ${root_dir} ${root_dir}/*)
+  foreach(sub ${ALL_SUB})
+    if (IS_DIRECTORY ${root_dir}/${sub})
+      fetch_include_recursively(${root_dir}/${sub})
+    endif()
+  endforeach()
+endfunction()
+
 if (ANAKIN_FOUND)
   # Anakin's code style doesn't follow google c style.
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=comment
-      -Wno-error=reorder
-      -Wno-error=format
-      -Wno-error=switch
-      -Wno-error=return-type
-      -Wno-error=non-virtual-dtor
-      -Wno-error=cpp")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=unused-variable -Wno-error=format-extra-args -Wno-error=comment -Wno-error=format -Wno-error=switch -Wno-error=return-type -Wno-error=non-virtual-dtor -Wno-reorder -Wno-error=cpp")
 
   message(STATUS "Anakin for inference is enabled")
   message(STATUS "Anakin is set INCLUDE:${ANAKIN_INCLUDE} LIBRARY:${ANAKIN_LIBRARY}")
-  include_directories("${ANAKIN_INCLUDE}")
-  # Anakin's source path is a mass, need to set sub-directories trivially.
-  include_directories("${ANAKIN_INCLUDE}/saber")
-  link_directories("${ANAKIN_LIBRARY}")
+  fetch_include_recursively(${ANAKIN_INCLUDE})
+  link_directories(${ANAKIN_LIBRARY})
 
-  nv_library(inference_anakin_api SRCS paddle_inference_api_anakin_engine.cc)
-  target_link_libraries(inference_anakin_api anakin)
+  nv_library(inference_anakin_api SHARED SRCS paddle_inference_api.cc paddle_inference_api_anakin_engine.cc)
+  target_link_libraries(inference_anakin_api anakin anakin_saber_common)
   list(APPEND inference_deps inference_anakin_api)
 endif()

@@ -73,7 +79,7 @@ function(inference_api_test TARGET_NAME)
 endfunction(inference_api_test)
 
 cc_library(paddle_inference_api
   SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
   DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
 cc_test(test_paddle_inference_api

@@ -84,8 +90,8 @@ inference_api_test(test_paddle_inference_api_impl
   ARGS test_word2vec test_image_classification)
 
 if (ANAKIN_FOUND)
-  nv_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
-      DEPS ${inference_deps} protobuf)
+  cc_test(inference_anakin_test SRCS paddle_inference_api_anakin_engine_tester.cc
+      DEPS ${inference_deps})
 endif()
 
 if(WITH_TESTING)
...
paddle/contrib/inference/paddle_inference_api.h

@@ -113,5 +113,4 @@ struct AnakinConfig : public PaddlePredictor::Config {
 // Similarly, each engine kind should map to a unique predictor implementation.
 template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
 
 }  // namespace paddle
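A caller-side sketch of how this factory is used with the Anakin engine: the engine is selected by the template arguments rather than a runtime flag. The config fields below are the ones exercised by the tester later in this commit; the model path is illustrative only.

    #include "paddle/contrib/inference/paddle_inference_api.h"

    int main() {
      paddle::AnakinConfig config;
      config.model_file = "./mobilenet_v2.anakin.bin";  // illustrative path
      config.device = 0;  // GPU id
      config.max_batch_size = 1;
      // Compile-time dispatch to the Anakin-backed predictor.
      auto predictor = paddle::CreatePaddlePredictor<
          paddle::AnakinConfig, paddle::PaddleEngineKind::kAnakin>(config);
      return predictor ? 0 : 1;
    }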
paddle/contrib/inference/paddle_inference_api_anakin_engine.cc

@@ -24,8 +24,16 @@ PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor(
 }
 
 bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
-  // TODO(Superjomn) Tell anakin to support return code.
-  engine_.Build(config.model_file, config.max_batch_size);
+  if (!(graph_.load(config.model_file))) {
+    return false;
+  }
+  graph_.ResetBatchSize("input_0", config.max_batch_size);
+  // optimization for graph
+  if (!(graph_.Optimize())) {
+    return false;
+  }
+  // construct executer
+  executor_.init(graph_);
   return true;
 }

@@ -38,24 +46,30 @@ bool PaddleInferenceAnakinPredictor::Run(
                  << "'s type is not float";
       return false;
     }
-    engine_.SetInputFromCPU(
-        input.name, static_cast<float *>(input.data.data), input.data.length);
+    auto d_tensor_in_p = executor_.get_in(input.name);
+    float *d_data_p = d_tensor_in_p->mutable_data();
+    if (cudaMemcpy(d_data_p,
+                   static_cast<float *>(input.data.data),
+                   d_tensor_in_p->valid_size() * sizeof(float),
+                   cudaMemcpyHostToDevice) != 0) {
+      LOG(ERROR) << "copy data from CPU to GPU error";
+      return false;
+    }
   }
-  // TODO(Superjomn) Tell anakin to support return code.
-  engine_.Execute();
+  executor_.prediction();
 
   if (output_data->empty()) {
     LOG(ERROR) << "At least one output should be set with tensors' names.";
     return false;
   }
   for (auto &output : *output_data) {
-    auto *tensor = engine_.GetOutputInGPU(output.name);
+    auto *tensor = executor_.get_out(output.name);
     output.shape = tensor->shape();
     // Copy data from GPU -> CPU
     if (cudaMemcpy(output.data.data,
-                   tensor->data(),
-                   tensor->size(),
+                   tensor->mutable_data(),
+                   tensor->valid_size() * sizeof(float),
                    cudaMemcpyDeviceToHost) != 0) {
       LOG(ERROR) << "copy data from GPU to CPU error";
       return false;

@@ -64,9 +78,26 @@ bool PaddleInferenceAnakinPredictor::Run(
   return true;
 }
 
-// TODO(Superjomn) To implement latter.
+anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+    &PaddleInferenceAnakinPredictor::get_executer() {
+  return executor_;
+}
+
+// the cloned new Predictor of anakin share the same net weights from original
+// Predictor
 std::unique_ptr<PaddlePredictor> PaddleInferenceAnakinPredictor::Clone() {
-  return nullptr;
+  VLOG(3) << "Anakin Predictor::clone";
+  std::unique_ptr<PaddlePredictor> cls(new PaddleInferenceAnakinPredictor());
+  // construct executer from other graph
+  auto anakin_predictor_p =
+      dynamic_cast<PaddleInferenceAnakinPredictor *>(cls.get());
+  if (!anakin_predictor_p) {
+    LOG(ERROR) << "fail to call Init";
+    return nullptr;
+  }
+  anakin_predictor_p->get_executer().init(graph_);
+  return std::move(cls);
 }
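Because the clone re-initializes its executor from the original predictor's graph_, the net weights are shared rather than loaded again. A minimal usage sketch (config populated as in GetConfig() in the tester below; keeping the original alive is required, since the clone borrows its graph):

    #include "paddle/contrib/inference/paddle_inference_api.h"

    void ServeTwo(const paddle::AnakinConfig &config) {
      auto base = paddle::CreatePaddlePredictor<
          paddle::AnakinConfig, paddle::PaddleEngineKind::kAnakin>(config);
      auto worker = base->Clone();  // shares net weights with `base`
      // `base` and `worker` can now run inference independently, e.g. on
      // different threads, without loading the model twice.
    }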
 // A factory to help create difference predictor.

@@ -74,6 +105,7 @@ template <>
 std::unique_ptr<PaddlePredictor>
 CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(
     const AnakinConfig &config) {
+  VLOG(3) << "Anakin Predictor create.";
   std::unique_ptr<PaddlePredictor> x(
       new PaddleInferenceAnakinPredictor(config));
   return x;
...
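Both cudaMemcpy calls in Run() above follow the same check-and-log pattern. A minimal sketch of that idiom as a standalone helper (hypothetical, not part of this patch):

    #include <cuda_runtime.h>  // cudaMemcpy, cudaMemcpyKind, cudaSuccess
    #include <glog/logging.h>

    // Copy `bytes` bytes in the given direction; log and return false on
    // failure, matching how Run() reports transfer errors to its caller.
    static bool CheckedCopy(void *dst, const void *src, size_t bytes,
                            cudaMemcpyKind kind) {
      if (cudaMemcpy(dst, src, bytes, kind) != cudaSuccess) {
        LOG(ERROR) << "cudaMemcpy of " << bytes << " bytes failed";
        return false;
      }
      return true;
    }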
paddle/contrib/inference/paddle_inference_api_anakin_engine.h

@@ -20,32 +20,42 @@ limitations under the License. */
 #pragma once
 
 // NOTE This header file do not have namespace.
-// TODO(Superjomn) Tell Anakin to provide better APIs.
-#include <test/framework/net/paddle_api.h>
+//#include <test/framework/net/paddle_api.h>
 #include "paddle/contrib/inference/paddle_inference_api.h"
+
+#include "framework/core/net/net.h"
+#include "saber/saber_types.h"
 
 namespace paddle {
 
 class PaddleInferenceAnakinPredictor : public PaddlePredictor {
  public:
+  PaddleInferenceAnakinPredictor() {}
+
   PaddleInferenceAnakinPredictor(const AnakinConfig& config);
 
   // NOTE Unlike the native engine, the buffers of anakin engine's output_data
   // should be allocated first.
-  // TODO(Superjomn) should unify all the behaviors of output_data accross all
-  // the engines.
   bool Run(const std::vector<PaddleTensor>& inputs,
            std::vector<PaddleTensor>* output_data) override;
 
   std::unique_ptr<PaddlePredictor> Clone() override;
 
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>&
+  get_executer();
+
+  ~PaddleInferenceAnakinPredictor() override{};
+
  private:
   bool Init(const AnakinConfig& config);
 
-  anakin::AnakinEngine<anakin::NV,
-                       anakin::saber::AK_FLOAT,
-                       anakin::Precision::FP32>
-      engine_;
+  anakin::graph::Graph<anakin::NV,
+                       anakin::saber::AK_FLOAT,
+                       anakin::Precision::FP32>
+      graph_;
+  anakin::Net<anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>
+      executor_;
+  AnakinConfig config_;
 };
 
 }  // namespace paddle
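The <anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32> parameter triple recurs on both members above. A small readability sketch (the aliases are an assumption, not part of this patch; they reuse only the headers the file already includes):

    #include "framework/core/net/net.h"
    #include "saber/saber_types.h"

    // Possible aliases keeping the template triple in one place.
    using AnakinGraph = anakin::graph::Graph<
        anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>;
    using AnakinNet = anakin::Net<
        anakin::NV, anakin::saber::AK_FLOAT, anakin::Precision::FP32>;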
paddle/contrib/inference/paddle_inference_api_anakin_engine_tester.cc

@@ -12,16 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/contrib/inference/paddle_inference_api.h"
+#include <glog/logging.h>
 #include <gtest/gtest.h>
+#include "gflags/gflags.h"
+#include "paddle/contrib/inference/paddle_inference_api.h"
 
 namespace paddle {
 
-TEST(inference, anakin) {
+AnakinConfig GetConfig() {
   AnakinConfig config;
+  config.model_file = "./mobilenet_v2.anakin.bin";
+  config.device = 0;
+  config.max_batch_size = 1;
+  return config;
+}
 
-  auto engine =
+TEST(inference, anakin) {
+  AnakinConfig config = GetConfig();
+  auto predictor =
       CreatePaddlePredictor<AnakinConfig, PaddleEngineKind::kAnakin>(config);
 
+  float data[1 * 3 * 224 * 224] = {1.0f};
+  PaddleBuf buf{.data = data, .length = sizeof(data)};
+  PaddleTensor tensor{.name = "input_0",
+                      .shape = std::vector<int>({1, 3, 224, 224}),
+                      .data = buf,
+                      .dtype = PaddleDType::FLOAT32};
+
+  // For simplicity, we set all the slots with the same data.
+  std::vector<PaddleTensor> paddle_tensor_feeds(1, tensor);
+
+  float data_out[1000];
+  PaddleBuf buf_out{.data = data_out, .length = sizeof(data)};
+  PaddleTensor tensor_out{.name = "prob_out",
+                          .shape = std::vector<int>({1000, 1}),
+                          .data = buf_out,
+                          .dtype = PaddleDType::FLOAT32};
+
+  std::vector<PaddleTensor> outputs(1, tensor_out);
+  ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
+
+  float* data_o = static_cast<float*>(outputs[0].data.data);
+  for (size_t j = 0; j < 1000; ++j) {
+    LOG(INFO) << "output[" << j << "]: " << data_o[j];
+  }
+}
 
 }  // namespace paddle
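Note that buf_out is declared with .length = sizeof(data), which is larger than the 1000-float data_out array it wraps; sizeof(data_out) would match the actual buffer. The test also logs all 1000 probabilities; for a quick classification smoke check the arg-max is usually enough, as in this small follow-on sketch (not part of the test):

    #include <algorithm>
    #include <cstddef>

    // Index of the most probable class in a dense probability vector.
    size_t Top1(const float *probs, size_t n) {
      return static_cast<size_t>(std::max_element(probs, probs + n) - probs);
    }
    // e.g. LOG(INFO) << "top-1 class: " << Top1(data_o, 1000);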