Commit d23d649b authored by storypku

Build: modules/perception/inference pass

Parent 3ddcd6ed
......@@ -42,7 +42,7 @@ cc_test(
)
cc_library(
name = "inference_factory_lib",
name = "inference_factory",
srcs = ["inference_factory.cc"],
hdrs = ["inference_factory.h"],
deps = [
......@@ -59,8 +59,9 @@ cc_test(
size = "small",
srcs = ["inference_factory_test.cc"],
deps = [
":inference_factory_lib",
":inference_factory",
"@com_google_googletest//:gtest_main",
"@paddlepaddle",
],
)
......
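Review note: the Bazel target is renamed from `inference_factory_lib` to `inference_factory` so it matches the source file name, and the test dep above is updated accordingly. For context, a minimal sketch of how the factory target is consumed (the `CreateInferenceByName` argument list is inferred from the sample call sites later in this commit):

```cpp
#include <string>
#include <vector>

#include "modules/perception/inference/inference.h"
#include "modules/perception/inference/inference_factory.h"

// Sketch of a factory call site; the argument list mirrors the samples in
// this commit (name, proto file, weight file, output names, input names).
apollo::perception::inference::Inference *MakeRTNet(
    const std::string &proto_file, const std::string &weight_file,
    const std::vector<std::string> &outputs,
    const std::vector<std::string> &inputs) {
  return apollo::perception::inference::CreateInferenceByName(
      "RTNet", proto_file, weight_file, outputs, inputs);
}
```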
......@@ -16,12 +16,13 @@
#pragma once
#include <boost/shared_ptr.hpp>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <boost/shared_ptr.hpp>
#include "modules/perception/base/blob.h"
namespace apollo {
......
......@@ -19,6 +19,7 @@
#include "gtest/gtest.h"
#include "modules/perception/inference/caffe/caffe_net.h"
#include "modules/perception/inference/paddlepaddle/paddle_net.h"
#include "modules/perception/inference/tensorrt/rt_net.h"
namespace apollo {
......
......@@ -69,21 +69,14 @@ license and copyright terms herein.
namespace apollo {
namespace perception {
namespace inference {
template<typename Dtype>
__global__ void ROIPoolForward(const int nthreads,
const Dtype *bottom_data,
const bool use_floor,
const Dtype spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const Dtype *bottom_rois,
Dtype *top_data,
int *argmax_data) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
index < (nthreads);
template <typename Dtype>
__global__ void ROIPoolForward(const int nthreads, const Dtype *bottom_data,
const bool use_floor, const Dtype spatial_scale,
const int channels, const int height,
const int width, const int pooled_height,
const int pooled_width, const Dtype *bottom_rois,
Dtype *top_data, int *argmax_data) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
index += blockDim.x * gridDim.x) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
......@@ -113,19 +106,15 @@ __global__ void ROIPoolForward(const int nthreads,
// Force malformed ROIs to be 1x1
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
Dtype bin_size_h = static_cast<Dtype>(roi_height)
/ static_cast<Dtype>(pooled_height);
Dtype bin_size_w = static_cast<Dtype>(roi_width)
/ static_cast<Dtype>(pooled_width);
int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
* bin_size_h));
int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
* bin_size_w));
int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
* bin_size_h));
int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
* bin_size_w));
Dtype bin_size_h =
static_cast<Dtype>(roi_height) / static_cast<Dtype>(pooled_height);
Dtype bin_size_w =
static_cast<Dtype>(roi_width) / static_cast<Dtype>(pooled_width);
int hstart = static_cast<int>(floor(static_cast<Dtype>(ph) * bin_size_h));
int wstart = static_cast<int>(floor(static_cast<Dtype>(pw) * bin_size_w));
int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1) * bin_size_h));
int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1) * bin_size_w));
// Add roi offsets and clip to input boundaries
hstart = min(max(hstart + roi_start_h, 0), height);
......@@ -152,11 +141,10 @@ __global__ void ROIPoolForward(const int nthreads,
argmax_data[index] = maxidx;
}
}
template<typename Dtype>
void ROIPoolingLayer<Dtype>::ForwardGPU(const std::vector<std::shared_ptr<
base::Blob<Dtype>>> &bottom,
const std::vector<std::shared_ptr<
base::Blob<Dtype>>> &top) {
template <typename Dtype>
void ROIPoolingLayer<Dtype>::ForwardGPU(
const std::vector<std::shared_ptr<base::Blob<Dtype>>> &bottom,
const std::vector<std::shared_ptr<base::Blob<Dtype>>> &top) {
auto feat_b = bottom[0];
auto roi_b = bottom[1];
channels_ = feat_b->channels();
......@@ -173,17 +161,17 @@ void ROIPoolingLayer<Dtype>::ForwardGPU(const std::vector<std::shared_ptr<
int count = top[0]->count();
const int thread_size = 512;
int block_size = (count + thread_size - 1) / thread_size;
ROIPoolForward<Dtype> << < block_size, thread_size >> > (
ROIPoolForward<Dtype><<<block_size, thread_size>>>(
count, bottom_data, use_floor_, spatial_scale_, channels_, height_,
width_,
pooled_height_, pooled_width_, bottom_rois, top_data, argmax_data);
width_, pooled_height_, pooled_width_, bottom_rois, top_data,
argmax_data);
}
template void ROIPoolingLayer<double>::ForwardGPU( \
const std::vector<std::shared_ptr<base::Blob<double>>> & bottom, \
const std::vector<std::shared_ptr<base::Blob<double>>> & top);
template void ROIPoolingLayer<float>::ForwardGPU( \
const std::vector<std::shared_ptr<base::Blob<float>>> & bottom, \
const std::vector<std::shared_ptr<base::Blob<float>>> & top);
template void ROIPoolingLayer<double>::ForwardGPU(
const std::vector<std::shared_ptr<base::Blob<double>>> &bottom,
const std::vector<std::shared_ptr<base::Blob<double>>> &top);
template void ROIPoolingLayer<float>::ForwardGPU(
const std::vector<std::shared_ptr<base::Blob<float>>> &bottom,
const std::vector<std::shared_ptr<base::Blob<float>>> &top);
} // namespace inference
} // namespace perception
......
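As a reading aid for the reformatted kernel above, here is a CPU mirror of the per-output-element computation (a sketch, not part of the commit; the ROI row layout `[batch_idx, x1, y1, x2, y2]`, the round-vs-floor corner scaling, and the `-FLT_MAX` sentinel are assumptions based on the standard Caffe ROIPooling layer, since the middle of the kernel is collapsed in this view):

```cpp
#include <algorithm>
#include <cfloat>
#include <cmath>

// CPU mirror of one ROIPoolForward output element (sketch only).
template <typename Dtype>
void RoiPoolOneElement(int index, const Dtype *bottom_data, bool use_floor,
                       Dtype spatial_scale, int channels, int height,
                       int width, int pooled_height, int pooled_width,
                       const Dtype *bottom_rois, Dtype *top_data,
                       int *argmax_data) {
  // (n, c, ph, pw) is an element in the pooled output.
  int pw = index % pooled_width;
  int ph = (index / pooled_width) % pooled_height;
  int c = (index / pooled_width / pooled_height) % channels;
  int n = index / pooled_width / pooled_height / channels;

  // Assumed ROI layout: one row of 5 values per ROI.
  const Dtype *roi = bottom_rois + n * 5;
  int roi_batch_ind = static_cast<int>(roi[0]);
  auto to_feat = [&](Dtype v) {  // scale ROI corner to feature-map coords
    Dtype s = v * spatial_scale;
    return use_floor ? static_cast<int>(std::floor(s))
                     : static_cast<int>(std::round(s));
  };
  int roi_start_w = to_feat(roi[1]), roi_start_h = to_feat(roi[2]);
  int roi_end_w = to_feat(roi[3]), roi_end_h = to_feat(roi[4]);

  // Force malformed ROIs to be 1x1, then split into pooled_h x pooled_w bins.
  int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
  int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
  Dtype bin_size_h =
      static_cast<Dtype>(roi_height) / static_cast<Dtype>(pooled_height);
  Dtype bin_size_w =
      static_cast<Dtype>(roi_width) / static_cast<Dtype>(pooled_width);
  int hstart = static_cast<int>(std::floor(ph * bin_size_h));
  int wstart = static_cast<int>(std::floor(pw * bin_size_w));
  int hend = static_cast<int>(std::ceil((ph + 1) * bin_size_h));
  int wend = static_cast<int>(std::ceil((pw + 1) * bin_size_w));

  // Add roi offsets and clip to input boundaries.
  hstart = std::min(std::max(hstart + roi_start_h, 0), height);
  hend = std::min(std::max(hend + roi_start_h, 0), height);
  wstart = std::min(std::max(wstart + roi_start_w, 0), width);
  wend = std::min(std::max(wend + roi_start_w, 0), width);

  // Max-pool over the bin; empty bins yield 0 with argmax -1.
  bool is_empty = (hend <= hstart) || (wend <= wstart);
  Dtype maxval = is_empty ? Dtype(0) : Dtype(-FLT_MAX);
  int maxidx = -1;
  const Dtype *feat =
      bottom_data + (roi_batch_ind * channels + c) * height * width;
  for (int h = hstart; h < hend; ++h) {
    for (int w = wstart; w < wend; ++w) {
      int bottom_index = h * width + w;
      if (feat[bottom_index] > maxval) {
        maxval = feat[bottom_index];
        maxidx = bottom_index;
      }
    }
  }
  top_data[index] = maxval;
  argmax_data[index] = maxidx;
}
```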
......@@ -33,7 +33,11 @@ PaddleNet::PaddleNet(const std::string &model_file,
bool PaddleNet::Init(const std::map<std::string, std::vector<int>> &shapes) {
paddle::AnalysisConfig config;
config.SetModel(model_file_, param_file_);
// TODO(storypku):
// paddle::AnalysisConfig seems to have no such method:
// SetModel(const string&, const string&)
// config.SetModel(model_file_, param_file_);
config.SetModel(model_file_);
config.SwitchUseFeedFetchOps(false);
if (gpu_id_ >= 0) {
config.EnableUseGpu(MemoryPoolInitSizeMb, gpu_id_);
......
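Review note on the TODO above: the workaround drops to the single-argument `SetModel` overload, which in the Paddle inference API takes a combined model path rather than a separate program/params pair. A minimal sketch of the init path this hunk produces (`MemoryPoolInitSizeMb` is a constant from the surrounding file, replaced by a placeholder value here; `CreatePaddlePredictor` is the usual entry point in this API family, so treat both as assumptions to check against the Paddle release in use):

```cpp
#include <memory>
#include <string>

#include "paddle/paddle_inference_api.h"

// Sketch of the config setup as shaped by this hunk; see the TODO about
// which AnalysisConfig::SetModel overloads exist in the target Paddle version.
std::unique_ptr<paddle::PaddlePredictor> InitPredictor(
    const std::string &model_file, int gpu_id) {
  paddle::AnalysisConfig config;
  config.SetModel(model_file);  // single-argument workaround from this commit
  config.SwitchUseFeedFetchOps(false);
  if (gpu_id >= 0) {
    config.EnableUseGpu(/*MemoryPoolInitSizeMb placeholder=*/100, gpu_id);
  }
  return paddle::CreatePaddlePredictor(config);
}
```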
......@@ -26,6 +26,7 @@
#include <utility>
#include <vector>
#include "paddle/paddle_analysis_config.h"
#include "paddle/paddle_inference_api.h"
#include "modules/perception/inference/inference.h"
......
......@@ -10,9 +10,9 @@ cc_library(
deps = [
"//cyber",
"//modules/perception/base:common",
"//modules/perception/proto:rt_proto",
"//modules/perception/proto:rt_cc_proto",
"@com_google_absl//absl/strings",
"@tensorrt",
"@local_config_cuda//cuda:cudnn_header",
],
)
......@@ -32,8 +32,8 @@ cc_library(
hdrs = ["rt_utils.h"],
deps = [
"//cyber",
"//modules/perception/proto:rt_proto",
"@tensorrt",
"//modules/perception/proto:rt_cc_proto",
"@local_config_tensorrt//:tensorrt",
],
)
......@@ -64,10 +64,8 @@ cc_library(
"//modules/perception/base",
"//modules/perception/inference:inference_lib",
"//modules/perception/inference/tensorrt/plugins:perception_inference_tensorrt_plugins",
"//modules/perception/proto:rt_proto",
"@caffe",
"@com_google_protobuf//:protobuf",
"@tensorrt",
],
)
......@@ -80,22 +78,24 @@ cc_test(
],
deps = [
":rt_net",
"//modules/perception/inference:inference_factory_lib",
"//modules/perception/inference/utils:inference_util_cuda_lib",
"//third_party:cblas",
"@com_google_googletest//:gtest_main",
],
)
cc_library(
name = "entropy_calibrator",
srcs = ["entropy_calibrator.cc"],
hdrs = ["entropy_calibrator.h"],
linkopts = ["-lopencv_core -lnvinfer_plugin -lopencv_imgproc -lopencv_highgui"],
deps = [
":batch_stream",
":rt_utils",
"@caffe",
"@com_google_protobuf//:protobuf",
"@local_config_cuda//cuda:cudart",
"@tensorrt",
"@local_config_tensorrt//:tensorrt",
"@opencv",
],
)
......@@ -116,14 +116,13 @@ cc_library(
name = "batch_stream",
srcs = ["batch_stream.cc"],
hdrs = ["batch_stream.h"],
linkopts = ["-lopencv_core -lnvinfer_plugin -lopencv_imgproc -lopencv_highgui"],
deps = [
"//cyber",
"//modules/perception/proto:rt_proto",
"//modules/perception/proto:rt_cc_proto",
"@caffe",
"@com_google_absl//absl/strings",
"@com_google_protobuf//:protobuf",
"@tensorrt",
"@opencv",
],
)
......@@ -131,6 +130,7 @@ cc_test(
name = "batch_stream_test",
size = "small",
srcs = ["batch_stream_test.cc"],
copts = ["-fno-access-control"],
data = [
"//modules/perception/inference:inference_test_data",
],
......
......@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*****************************************************************************/
#define private public
#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "gtest/gtest.h"
......
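Review note: the removed `#define private public` hack (which rewrites a keyword before including the header and is formally undefined behavior) is replaced by the per-target `copts = ["-fno-access-control"]` in the BUILD change above; that GCC/Clang flag disables access checking for the test's translation unit only. A tiny sketch of the pattern, with hypothetical class and test names:

```cpp
// Built with -fno-access-control, so the test may read private members
// directly; class and test names here are hypothetical.
#include "gtest/gtest.h"

class Counter {
 public:
  void Inc() { ++count_; }

 private:
  int count_ = 0;
};

TEST(CounterTest, IncBumpsPrivateCount) {
  Counter c;
  c.Inc();
  EXPECT_EQ(1, c.count_);  // compiles only because access control is off
}
```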
/******************************************************************************
* Copyright 2018 The Apollo Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*****************************************************************************/
#include "modules/perception/inference/tensorrt/entropy_calibrator.h"
#include <cuda_runtime_api.h>
#include <fstream>
namespace nvinfer1 {
DimsHW ICaffePoolOutputDimensionsFormula::compute(DimsHW input_dims,
DimsHW kernel_size,
DimsHW stride, DimsHW padding,
DimsHW dilation,
const char *layerName) const {
const int kernel_extent_h = dilation.d[0] * (kernel_size.d[0] - 1) + 1;
const int kernel_extent_w = dilation.d[1] * (kernel_size.d[1] - 1) + 1;
auto &&h_temp = (input_dims.d[0] + 2 * padding.d[0] - kernel_extent_h) * 1.0 /
stride.d[0];
auto &&w_temp = (input_dims.d[1] + 2 * padding.d[1] - kernel_extent_w) * 1.0 /
stride.d[1];
std::string str_name(layerName);
if (str_name.find("as_conv") == std::string::npos) {
return DimsHW(static_cast<int>(ceil(h_temp)) + 1,
static_cast<int>(ceil(w_temp)) + 1);
} else {
return DimsHW(static_cast<int>(h_temp) + 1, static_cast<int>(w_temp) + 1);
}
}
Int8EntropyCalibrator::Int8EntropyCalibrator(
const apollo::perception::inference::BatchStream &stream, int first_batch,
bool read_cache, std::string network)
: stream_(stream), read_cache_(read_cache), network_(network) {
DimsNCHW dims = stream_.getDims();
input_count_ = stream_.getBatchSize() * dims.c() * dims.h() * dims.w();
cudaMalloc(&device_input_, input_count_ * sizeof(float));
stream_.reset(first_batch);
}
Int8EntropyCalibrator::~Int8EntropyCalibrator() {
if (device_input_) {
(cudaFree(device_input_));
}
}
bool Int8EntropyCalibrator::getBatch(void *bindings[], const char *names[],
int nbBindings) {
if (!stream_.next()) {
return false;
}
(cudaMemcpy(device_input_, stream_.getBatch(), input_count_ * sizeof(float),
cudaMemcpyHostToDevice));
bindings[0] = device_input_;
return true;
}
const void *Int8EntropyCalibrator::readCalibrationCache(size_t &length) {
calibration_cache_.clear();
std::ifstream input(
apollo::perception::inference::locateFile(network_, "CalibrationTable"),
std::ios::binary);
input >> std::noskipws;
if (read_cache_ && input.good()) {
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
std::back_inserter(calibration_cache_));
}
length = calibration_cache_.size();
return length ? &calibration_cache_[0] : nullptr;
}
void Int8EntropyCalibrator::writeCalibrationCache(const void *cache,
size_t length) {
std::ofstream output(
apollo::perception::inference::locateFile(network_, "CalibrationTable"),
std::ios::binary);
output.write(reinterpret_cast<const char *>(cache), length);
}
} // namespace nvinfer1
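For context, a sketch of how this calibrator is typically handed to TensorRT during engine building (the builder calls follow the pre-TensorRT-7 API this code targets, and the BatchStream arguments mirror the samples later in this commit; treat both as assumptions):

```cpp
#include "NvInfer.h"

#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "modules/perception/inference/tensorrt/entropy_calibrator.h"

// Sketch: wire the calibrator into an INT8 build. The builder invokes
// getBatch() repeatedly and persists the result via writeCalibrationCache(),
// producing the "CalibrationTable" file read back on later runs.
void ConfigureInt8(nvinfer1::IBuilder *builder,
                   apollo::perception::inference::BatchStream *stream) {
  nvinfer1::Int8EntropyCalibrator calibrator(*stream, /*first_batch=*/0,
                                             /*read_cache=*/true,
                                             /*network=*/"./");
  builder->setInt8Mode(true);
  builder->setInt8Calibrator(&calibrator);
  // ... buildCudaEngine(...) must run while `calibrator` is still alive.
}
```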
......@@ -16,16 +16,14 @@
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
#include <fstream>
#include <cmath>
#include <string>
#include <vector>
#include "NvCaffeParser.h"
#include "NvInfer.h"
#include "NvInferPlugin.h"
#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "modules/perception/inference/tensorrt/rt_utils.h"
......@@ -34,85 +32,25 @@ class ICaffePoolOutputDimensionsFormula : public IOutputDimensionsFormula {
public:
virtual DimsHW compute(DimsHW input_dims, DimsHW kernel_size, DimsHW stride,
DimsHW padding, DimsHW dilation,
const char *layerName) const {
const int kernel_extent_h = dilation.d[0] * (kernel_size.d[0] - 1) + 1;
const int kernel_extent_w = dilation.d[1] * (kernel_size.d[1] - 1) + 1;
auto &&h_temp = (input_dims.d[0] + 2 * padding.d[0] - kernel_extent_h) *
1.0 / stride.d[0];
auto &&w_temp = (input_dims.d[1] + 2 * padding.d[1] - kernel_extent_w) *
1.0 / stride.d[1];
std::string str_name(layerName);
if (str_name.find("as_conv") == std::string::npos) {
return DimsHW(static_cast<int>(ceil(h_temp)) + 1,
static_cast<int>(ceil(w_temp)) + 1);
} else {
return DimsHW(static_cast<int>(h_temp) + 1, static_cast<int>(w_temp) + 1);
}
}
ICaffePoolOutputDimensionsFormula() {}
~ICaffePoolOutputDimensionsFormula() {}
const char *layerName) const;
ICaffePoolOutputDimensionsFormula() = default;
~ICaffePoolOutputDimensionsFormula() = default;
};
class Int8EntropyCalibrator : public IInt8EntropyCalibrator {
public:
Int8EntropyCalibrator(
const apollo::perception::inference::BatchStream &stream, int first_batch,
bool read_cache, std::string network)
: stream_(stream), read_cache_(read_cache), network_(network) {
DimsNCHW dims = stream_.getDims();
input_count_ = stream_.getBatchSize() * dims.c() * dims.h() * dims.w();
cudaMalloc(&device_input_, input_count_ * sizeof(float));
stream_.reset(first_batch);
}
virtual ~Int8EntropyCalibrator() {
if (device_input_) {
(cudaFree(device_input_));
}
}
bool read_cache, std::string network);
virtual ~Int8EntropyCalibrator();
int getBatchSize() const override { return stream_.getBatchSize(); }
bool getBatch(void *bindings[], const char *names[],
int nbBindings) override {
if (!stream_.next()) {
return false;
}
(cudaMemcpy(device_input_, stream_.getBatch(), input_count_ * sizeof(float),
cudaMemcpyHostToDevice));
bindings[0] = device_input_;
return true;
}
const void *readCalibrationCache(size_t &length) override {
calibration_cache_.clear();
std::ifstream input(
apollo::perception::inference::locateFile(network_, "CalibrationTable"),
std::ios::binary);
input >> std::noskipws;
if (read_cache_ && input.good()) {
std::copy(std::istream_iterator<char>(input),
std::istream_iterator<char>(),
std::back_inserter(calibration_cache_));
}
length = calibration_cache_.size();
return length ? &calibration_cache_[0] : nullptr;
}
bool getBatch(void *bindings[], const char *names[], int nbBindings) override;
void writeCalibrationCache(const void *cache, size_t length) override {
std::ofstream output(
apollo::perception::inference::locateFile(network_, "CalibrationTable"),
std::ios::binary);
output.write(reinterpret_cast<const char *>(cache), length);
}
const void *readCalibrationCache(size_t &length) override;
virtual CalibrationAlgoType getAlgorithm() {
return CalibrationAlgoType::kENTROPY_CALIBRATION;
}
void writeCalibrationCache(const void *cache, size_t length) override;
private:
apollo::perception::inference::BatchStream stream_;
......
......@@ -18,6 +18,11 @@ cc_library(
"*.h",
]),
linkstatic = True,
deps = [
"@local_config_cuda//cuda:cublas",
"@local_config_cuda//cuda:cudart",
"@local_config_cuda//cuda:cudnn",
],
)
cuda_library(
......@@ -28,7 +33,7 @@ cuda_library(
"//modules/perception/inference/tensorrt:rt_common",
"@eigen",
"@local_config_cuda//cuda:cudart",
"@tensorrt",
"@local_config_tensorrt//:tensorrt",
],
)
......@@ -40,7 +45,7 @@ cuda_library(
"//modules/perception/inference/tensorrt:rt_common",
"@eigen",
"@local_config_cuda//cuda:cudart",
"@tensorrt",
"@local_config_tensorrt//:tensorrt",
],
)
......@@ -52,7 +57,7 @@ cuda_library(
"//modules/perception/inference/tensorrt:rt_common",
"@eigen",
"@local_config_cuda//cuda:cudart",
"@tensorrt",
"@local_config_tensorrt//:tensorrt",
],
)
......@@ -64,7 +69,8 @@ cuda_library(
"//modules/perception/inference/tensorrt:rt_common",
"@eigen",
"@local_config_cuda//cuda:cudart",
"@tensorrt",
"@local_config_cuda//cuda:cudnn",
"@local_config_tensorrt//:tensorrt",
],
)
......@@ -87,6 +93,7 @@ cc_test(
":perception_inference_tensorrt_plugins",
"//modules/perception/inference/tensorrt:rt_common",
"@com_google_googletest//:gtest_main",
"@local_config_cuda//cuda:cublas",
],
)
......
......@@ -19,14 +19,9 @@
namespace apollo {
namespace perception {
namespace inference {
__global__ void
cmp(const int nthreads,
const float *in_data,
const int channels,
const int height,
const int width,
const bool out_max_val,
float *out_data) {
__global__ void cmp(const int nthreads, const float *in_data,
const int channels, const int height, const int width,
const bool out_max_val, float *out_data) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < nthreads) {
int w = idx % width;
......@@ -55,21 +50,18 @@ cmp(const int nthreads,
}
}
}
int
ArgMax1Plugin::enqueue(int batchSize,
const void *const *inputs,
void **outputs,
void *workspace,
cudaStream_t stream) {
int ArgMax1Plugin::enqueue(int batchSize, const void *const *inputs,
void **outputs, void *workspace,
cudaStream_t stream) {
const int thread_size = 512;
int block_size =
(input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize
+ thread_size - 1) / thread_size;
cmp << < block_size, thread_size >> >
(input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] *
batchSize, (const float *) inputs[0], input_dims_.d[0],
input_dims_.d[1], input_dims_.d[2], out_max_val_,
reinterpret_cast<float *>(outputs[0]));
(input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize +
thread_size - 1) /
thread_size;
cmp<<<block_size, thread_size>>>(
input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize,
(const float *)inputs[0], input_dims_.d[0], input_dims_.d[1],
input_dims_.d[2], out_max_val_, reinterpret_cast<float *>(outputs[0]));
return 0;
}
......
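A small aside on the launch configuration shared by these plugins: the grid size is the ceiling division of the element count by a fixed 512-thread block, and the kernel guards stragglers with an `idx < nthreads` check. As a worked example:

```cpp
// Ceiling division used for grid sizing: with count = 1000 and
// thread_size = 512, (1000 + 511) / 512 = 2 blocks, i.e. 1024 threads,
// of which the last 24 fail the `idx < nthreads` guard and do nothing.
inline int BlocksFor(int count, int thread_size = 512) {
  return (count + thread_size - 1) / thread_size;
}
```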
......@@ -22,7 +22,7 @@ namespace apollo {
namespace perception {
namespace inference {
template<typename Dtype>
template <typename Dtype>
__global__ void ReLU(const int nthreads, const Dtype *in_data,
const float negative_slope, Dtype *out_data) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -34,21 +34,19 @@ __global__ void ReLU(const int nthreads, const Dtype *in_data,
}
}
int ReLUPlugin::enqueue(int batchSize,
const void *const *inputs,
void **outputs,
void *workspace,
cudaStream_t stream) {
int ReLUPlugin::enqueue(int batchSize, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
const int thread_size = 512;
const int block_size =
(input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize
+ thread_size - 1) / thread_size;
const int nthreads = input_dims_.d[0] * input_dims_.d[1]
* input_dims_.d[2] * batchSize;
(input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize +
thread_size - 1) /
thread_size;
const int nthreads =
input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize;
ReLU<< < block_size, thread_size, 0, stream >> > (
nthreads, (const float *) (inputs[0]),
negative_slope_, reinterpret_cast<float *>(outputs[0]));
ReLU<<<block_size, thread_size, 0, stream>>>(
nthreads, (const float *)(inputs[0]), negative_slope_,
reinterpret_cast<float *>(outputs[0]));
return 1;
}
} // namespace inference
......
......@@ -23,7 +23,7 @@ namespace inference {
typedef int8_t int8;
template<typename Dtype>
template <typename Dtype>
__global__ void Slice(const int nthreads, const Dtype *in_data,
const int num_slices, const int slice_size,
const int bottom_slice_axis, const int top_slice_axis,
......@@ -33,17 +33,15 @@ __global__ void Slice(const int nthreads, const Dtype *in_data,
const int total_slice_size = slice_size * top_slice_axis;
const int slice_num = index / total_slice_size;
const int slice_index = index % total_slice_size;
const int bottom_index = slice_index
+ (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
const int bottom_index =
slice_index +
(slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
out_data[index] = in_data[bottom_index];
}
}
int SLICEPlugin::enqueue(int batchSize,
const void *const *inputs,
void **outputs,
void *workspace,
cudaStream_t stream) {
int SLICEPlugin::enqueue(int batchSize, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
int slice_size = 1;
for (size_t index = axis_ + 1; index < input_dims_.nbDims; index++) {
slice_size *= input_dims_.d[index];
......@@ -61,10 +59,10 @@ int SLICEPlugin::enqueue(int batchSize,
const int block_num = (nthreads + 511) / 512;
Slice // NOLINT_NEXT_LINE(whitespace/operators)
<< < block_num, 512, 0, stream >> > (
nthreads, (const float *) (inputs[0]), num_slices, slice_size,
input_dims_.d[axis_], top_slice_axis,
offset_slice_axis, reinterpret_cast<float *>(outputs[i]));
<<<block_num, 512, 0, stream>>>(
nthreads, (const float *)(inputs[0]), num_slices, slice_size,
input_dims_.d[axis_], top_slice_axis, offset_slice_axis,
reinterpret_cast<float *>(outputs[i]));
offset_slice_axis += top_slice_axis;
}
return 1;
......
......@@ -21,10 +21,8 @@ namespace apollo {
namespace perception {
namespace inference {
int SoftmaxPlugin::enqueue(int batch_size,
const void *const *inputs,
void **outputs,
void *workspace,
int SoftmaxPlugin::enqueue(int batch_size, const void *const *inputs,
void **outputs, void *workspace,
cudaStream_t stream) {
const float *in_data = reinterpret_cast<const float *>(inputs[0]);
float *out_data = reinterpret_cast<float *>(outputs[0]);
......@@ -37,37 +35,18 @@ int SoftmaxPlugin::enqueue(int batch_size,
int c_stride = h * h_stride;
int n_stride = c * c_stride;
cudnnSetTensor4dDescriptorEx(input_desc_,
CUDNN_DATA_FLOAT,
n,
c,
h,
w,
n_stride,
c_stride,
h_stride,
w_stride);
cudnnSetTensor4dDescriptorEx(output_desc_,
CUDNN_DATA_FLOAT,
n,
c,
h,
w,
n_stride,
c_stride,
h_stride,
w_stride);
cudnnSetTensor4dDescriptorEx(input_desc_, CUDNN_DATA_FLOAT, n, c, h, w,
n_stride, c_stride, h_stride, w_stride);
cudnnSetTensor4dDescriptorEx(output_desc_, CUDNN_DATA_FLOAT, n, c, h, w,
n_stride, c_stride, h_stride, w_stride);
float a = 1.0;
float b = 0.0;
cudnnSetStream(cudnn_, stream);
cudnnSoftmaxForward(cudnn_, \
CUDNN_SOFTMAX_ACCURATE, \
CUDNN_SOFTMAX_MODE_CHANNEL, \
(const void *) (&a), \
input_desc_, in_data, \
(const void *) (&b), \
output_desc_, out_data);
cudnnSoftmaxForward(cudnn_, CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_CHANNEL, (const void *)(&a),
input_desc_, in_data, (const void *)(&b), output_desc_,
out_data);
return 1;
}
......
......@@ -19,7 +19,6 @@
#include "gtest/gtest.h"
#include "gtest/gtest_prod.h"
#include "modules/perception/inference/inference_factory.h"
#include "modules/perception/inference/utils/util.h"
namespace apollo {
......
......@@ -8,7 +8,6 @@ cc_binary(
srcs = ["cal_table_generator.cc"],
linkstatic = False,
deps = [
"//modules/perception/inference:inference_factory_lib",
"//modules/perception/inference:inference_lib",
"//modules/perception/inference/tensorrt:batch_stream",
"//modules/perception/inference/tensorrt:entropy_calibrator",
......@@ -22,7 +21,6 @@ cc_binary(
srcs = ["denseline_sample.cc"],
linkstatic = False,
deps = [
"//modules/perception/inference:inference_factory_lib",
"//modules/perception/inference:inference_lib",
"//modules/perception/inference/tensorrt:batch_stream",
"//modules/perception/inference/tensorrt:entropy_calibrator",
......@@ -36,7 +34,6 @@ cc_binary(
srcs = ["lane_sample.cc"],
linkstatic = False,
deps = [
"//modules/perception/inference:inference_factory_lib",
"//modules/perception/inference:inference_lib",
"//modules/perception/inference/tensorrt:batch_stream",
"//modules/perception/inference/tensorrt:entropy_calibrator",
......@@ -50,7 +47,6 @@ cc_binary(
srcs = ["yolo_sample.cc"],
linkstatic = False,
deps = [
"//modules/perception/inference:inference_factory_lib",
"//modules/perception/inference:inference_lib",
"//modules/perception/inference/tensorrt:batch_stream",
"//modules/perception/inference/tensorrt:entropy_calibrator",
......
......@@ -19,7 +19,6 @@
#include "cyber/common/log.h"
#include "modules/perception/inference/inference.h"
#include "modules/perception/inference/inference_factory.h"
#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "modules/perception/inference/tensorrt/entropy_calibrator.h"
#include "modules/perception/inference/tensorrt/rt_net.h"
......@@ -77,8 +76,8 @@ int main(int argc, char **argv) {
proto_file, weight_file, outputs, inputs, calibrator);
} else {
AINFO << "fp32";
rt_net = apollo::perception::inference::CreateInferenceByName(
"RTNet", proto_file, weight_file, outputs, inputs);
rt_net = new apollo::perception::inference::RTNet(proto_file, weight_file,
outputs, inputs);
}
std::vector<int> shape = {1, 3, height, width};
std::map<std::string, std::vector<int>> shape_map{{input_blob_name, shape}};
......
......@@ -19,7 +19,6 @@
#include "cyber/common/log.h"
#include "modules/perception/inference/inference.h"
#include "modules/perception/inference/inference_factory.h"
#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "modules/perception/inference/tensorrt/entropy_calibrator.h"
#include "modules/perception/inference/tensorrt/rt_net.h"
......@@ -67,13 +66,11 @@ int main(int argc, char **argv) {
if (FLAGS_int8) {
apollo::perception::inference::BatchStream stream(2, 50, "./batches/");
nvinfer1::Int8EntropyCalibrator calibrator(stream, 0, true, "./");
std::cout << "int8" << std::endl;
rt_net = apollo::perception::inference::CreateInferenceByName(
"RTNetInt8", proto_file, weight_file, outputs, inputs, model_root);
rt_net = new apollo::perception::inference::RTNet(
proto_file, weight_file, outputs, inputs, model_root);
} else {
std::cout << "fp32" << std::endl;
rt_net = apollo::perception::inference::CreateInferenceByName(
"RTNet", proto_file, weight_file, outputs, inputs);
rt_net = new apollo::perception::inference::RTNet(proto_file, weight_file,
outputs, inputs);
}
const int height = 608;
const int width = 1024;
......
......@@ -18,7 +18,6 @@
#include "opencv2/opencv.hpp"
#include "modules/perception/inference/inference.h"
#include "modules/perception/inference/inference_factory.h"
#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "modules/perception/inference/tensorrt/entropy_calibrator.h"
#include "modules/perception/inference/tensorrt/rt_net.h"
......@@ -55,8 +54,8 @@ int main(int argc, char **argv) {
rt_net = new apollo::perception::inference::RTNet(
proto_file, weight_file, outputs, inputs, calibrator.get());
} else {
rt_net = apollo::perception::inference::CreateInferenceByName(
"RTNet", proto_file, weight_file, outputs, inputs);
rt_net = new apollo::perception::inference::RTNet(proto_file, weight_file,
outputs, inputs);
}
const int height = 576;
const int width = 1440;
......
......@@ -32,6 +32,7 @@ cuda_library(
hdrs = ["util.h"],
deps = [
"//cyber",
"//modules/perception/base",
"@eigen",
"@local_config_cuda//cuda:cudart",
],
......
......@@ -15,10 +15,11 @@
*****************************************************************************/
#pragma once
#include <boost/shared_ptr.hpp>
#include <map>
#include <string>
#include <boost/shared_ptr.hpp>
#include "modules/perception/base/blob.h"
namespace apollo {
......
......@@ -112,8 +112,8 @@ void GPUGemmFloat(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
cublasOperation_t cuTransB =
(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
ACHECK(cublasSgemm(CudaUtil::get_handler(), cuTransB, cuTransA, N, M, K,
&alpha, B, ldb, A, lda, &beta, C,
N) == CUBLAS_STATUS_SUCCESS);
&alpha, B, ldb, A, lda, &beta, C,
N) == CUBLAS_STATUS_SUCCESS);
}
} // namespace inference
......
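Review note on the hunk above (an indentation-only fix): the operand order it preserves, B before A with cuTransB before cuTransA and leading dimension N, is the standard trick for driving column-major cuBLAS with row-major data, because C = A·B over row-major arrays is exactly Cᵀ = Bᵀ·Aᵀ in column-major terms. A minimal sketch:

```cpp
#include <cublas_v2.h>

// Row-major C(M x N) = A(M x K) * B(K x N) on a column-major BLAS:
// compute C^T = B^T * A^T by passing B first and using N as C's
// leading dimension; handle and device pointers are assumed ready.
void GemmRowMajor(cublasHandle_t handle, int M, int N, int K, const float *A,
                  const float *B, float *C) {
  const float alpha = 1.0f;
  const float beta = 0.0f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, B, /*ldb=*/N,
              A, /*lda=*/K, &beta, C, /*ldc=*/N);
}
```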