Commit d23d649b authored by storypku

Build: modules/perception/inference pass

Parent 3ddcd6ed
......@@ -42,7 +42,7 @@ cc_test(
)
cc_library(
name = "inference_factory_lib",
name = "inference_factory",
srcs = ["inference_factory.cc"],
hdrs = ["inference_factory.h"],
deps = [
......@@ -59,8 +59,9 @@ cc_test(
size = "small",
srcs = ["inference_factory_test.cc"],
deps = [
":inference_factory_lib",
":inference_factory",
"@com_google_googletest//:gtest_main",
"@paddlepaddle",
],
)
......
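Review note: the Bazel target is renamed from `inference_factory_lib` to `inference_factory` so it matches the source file name, and the test dep above is updated accordingly. For context, a minimal sketch of how the factory target is consumed (the `CreateInferenceByName` argument list is inferred from the sample call sites later in this commit):

```cpp
#include <string>
#include <vector>

#include "modules/perception/inference/inference.h"
#include "modules/perception/inference/inference_factory.h"

// Sketch of a factory call site; the argument list mirrors the samples in
// this commit (name, proto file, weight file, output names, input names).
apollo::perception::inference::Inference *MakeRTNet(
    const std::string &proto_file, const std::string &weight_file,
    const std::vector<std::string> &outputs,
    const std::vector<std::string> &inputs) {
  return apollo::perception::inference::CreateInferenceByName(
      "RTNet", proto_file, weight_file, outputs, inputs);
}
```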
......@@ -16,12 +16,13 @@
#pragma once
#include <boost/shared_ptr.hpp>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include <boost/shared_ptr.hpp>
#include "modules/perception/base/blob.h"
namespace apollo {
......
......@@ -19,6 +19,7 @@
#include "gtest/gtest.h"
#include "modules/perception/inference/caffe/caffe_net.h"
#include "modules/perception/inference/paddlepaddle/paddle_net.h"
#include "modules/perception/inference/tensorrt/rt_net.h"
namespace apollo {
......
......@@ -69,21 +69,14 @@ license and copyright terms herein.
namespace apollo {
namespace perception {
namespace inference {
template<typename Dtype>
__global__ void ROIPoolForward(const int nthreads,
const Dtype *bottom_data,
const bool use_floor,
const Dtype spatial_scale,
const int channels,
const int height,
const int width,
const int pooled_height,
const int pooled_width,
const Dtype *bottom_rois,
Dtype *top_data,
int *argmax_data) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
index < (nthreads);
template <typename Dtype>
__global__ void ROIPoolForward(const int nthreads, const Dtype *bottom_data,
const bool use_floor, const Dtype spatial_scale,
const int channels, const int height,
const int width, const int pooled_height,
const int pooled_width, const Dtype *bottom_rois,
Dtype *top_data, int *argmax_data) {
for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
index += blockDim.x * gridDim.x) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
......@@ -113,19 +106,15 @@ __global__ void ROIPoolForward(const int nthreads,
// Force malformed ROIs to be 1x1
int roi_width = max(roi_end_w - roi_start_w + 1, 1);
int roi_height = max(roi_end_h - roi_start_h + 1, 1);
Dtype bin_size_h = static_cast<Dtype>(roi_height)
/ static_cast<Dtype>(pooled_height);
Dtype bin_size_w = static_cast<Dtype>(roi_width)
/ static_cast<Dtype>(pooled_width);
int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
* bin_size_h));
int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
* bin_size_w));
int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
* bin_size_h));
int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
* bin_size_w));
Dtype bin_size_h =
static_cast<Dtype>(roi_height) / static_cast<Dtype>(pooled_height);
Dtype bin_size_w =
static_cast<Dtype>(roi_width) / static_cast<Dtype>(pooled_width);
int hstart = static_cast<int>(floor(static_cast<Dtype>(ph) * bin_size_h));
int wstart = static_cast<int>(floor(static_cast<Dtype>(pw) * bin_size_w));
int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1) * bin_size_h));
int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1) * bin_size_w));
// Add roi offsets and clip to input boundaries
hstart = min(max(hstart + roi_start_h, 0), height);
......@@ -152,11 +141,10 @@ __global__ void ROIPoolForward(const int nthreads,
argmax_data[index] = maxidx;
}
}
template<typename Dtype>
void ROIPoolingLayer<Dtype>::ForwardGPU(const std::vector<std::shared_ptr<
base::Blob<Dtype>>> &bottom,
const std::vector<std::shared_ptr<
base::Blob<Dtype>>> &top) {
template <typename Dtype>
void ROIPoolingLayer<Dtype>::ForwardGPU(
const std::vector<std::shared_ptr<base::Blob<Dtype>>> &bottom,
const std::vector<std::shared_ptr<base::Blob<Dtype>>> &top) {
auto feat_b = bottom[0];
auto roi_b = bottom[1];
channels_ = feat_b->channels();
......@@ -173,17 +161,17 @@ void ROIPoolingLayer<Dtype>::ForwardGPU(const std::vector<std::shared_ptr<
int count = top[0]->count();
const int thread_size = 512;
int block_size = (count + thread_size - 1) / thread_size;
ROIPoolForward<Dtype> << < block_size, thread_size >> > (
ROIPoolForward<Dtype><<<block_size, thread_size>>>(
count, bottom_data, use_floor_, spatial_scale_, channels_, height_,
width_,
pooled_height_, pooled_width_, bottom_rois, top_data, argmax_data);
width_, pooled_height_, pooled_width_, bottom_rois, top_data,
argmax_data);
}
template void ROIPoolingLayer<double>::ForwardGPU( \
const std::vector<std::shared_ptr<base::Blob<double>>> & bottom, \
const std::vector<std::shared_ptr<base::Blob<double>>> & top);
template void ROIPoolingLayer<float>::ForwardGPU( \
const std::vector<std::shared_ptr<base::Blob<float>>> & bottom, \
const std::vector<std::shared_ptr<base::Blob<float>>> & top);
template void ROIPoolingLayer<double>::ForwardGPU(
const std::vector<std::shared_ptr<base::Blob<double>>> &bottom,
const std::vector<std::shared_ptr<base::Blob<double>>> &top);
template void ROIPoolingLayer<float>::ForwardGPU(
const std::vector<std::shared_ptr<base::Blob<float>>> &bottom,
const std::vector<std::shared_ptr<base::Blob<float>>> &top);
} // namespace inference
} // namespace perception
......
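As a reading aid for the reformatted kernel above, here is a CPU mirror of the per-output-element computation (a sketch, not part of the commit; the ROI row layout `[batch_idx, x1, y1, x2, y2]`, the round-vs-floor corner scaling, and the `-FLT_MAX` sentinel are assumptions based on the standard Caffe ROIPooling layer, since the middle of the kernel is collapsed in this view):

```cpp
#include <algorithm>
#include <cfloat>
#include <cmath>

// CPU mirror of one ROIPoolForward output element (sketch only).
template <typename Dtype>
void RoiPoolOneElement(int index, const Dtype *bottom_data, bool use_floor,
                       Dtype spatial_scale, int channels, int height,
                       int width, int pooled_height, int pooled_width,
                       const Dtype *bottom_rois, Dtype *top_data,
                       int *argmax_data) {
  // (n, c, ph, pw) is an element in the pooled output.
  int pw = index % pooled_width;
  int ph = (index / pooled_width) % pooled_height;
  int c = (index / pooled_width / pooled_height) % channels;
  int n = index / pooled_width / pooled_height / channels;

  // Assumed ROI layout: one row of 5 values per ROI.
  const Dtype *roi = bottom_rois + n * 5;
  int roi_batch_ind = static_cast<int>(roi[0]);
  auto to_feat = [&](Dtype v) {  // scale ROI corner to feature-map coords
    Dtype s = v * spatial_scale;
    return use_floor ? static_cast<int>(std::floor(s))
                     : static_cast<int>(std::round(s));
  };
  int roi_start_w = to_feat(roi[1]), roi_start_h = to_feat(roi[2]);
  int roi_end_w = to_feat(roi[3]), roi_end_h = to_feat(roi[4]);

  // Force malformed ROIs to be 1x1, then split into pooled_h x pooled_w bins.
  int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
  int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
  Dtype bin_size_h =
      static_cast<Dtype>(roi_height) / static_cast<Dtype>(pooled_height);
  Dtype bin_size_w =
      static_cast<Dtype>(roi_width) / static_cast<Dtype>(pooled_width);
  int hstart = static_cast<int>(std::floor(ph * bin_size_h));
  int wstart = static_cast<int>(std::floor(pw * bin_size_w));
  int hend = static_cast<int>(std::ceil((ph + 1) * bin_size_h));
  int wend = static_cast<int>(std::ceil((pw + 1) * bin_size_w));

  // Add roi offsets and clip to input boundaries.
  hstart = std::min(std::max(hstart + roi_start_h, 0), height);
  hend = std::min(std::max(hend + roi_start_h, 0), height);
  wstart = std::min(std::max(wstart + roi_start_w, 0), width);
  wend = std::min(std::max(wend + roi_start_w, 0), width);

  // Max-pool over the bin; empty bins yield 0 with argmax -1.
  bool is_empty = (hend <= hstart) || (wend <= wstart);
  Dtype maxval = is_empty ? Dtype(0) : Dtype(-FLT_MAX);
  int maxidx = -1;
  const Dtype *feat =
      bottom_data + (roi_batch_ind * channels + c) * height * width;
  for (int h = hstart; h < hend; ++h) {
    for (int w = wstart; w < wend; ++w) {
      int bottom_index = h * width + w;
      if (feat[bottom_index] > maxval) {
        maxval = feat[bottom_index];
        maxidx = bottom_index;
      }
    }
  }
  top_data[index] = maxval;
  argmax_data[index] = maxidx;
}
```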
......@@ -33,7 +33,11 @@ PaddleNet::PaddleNet(const std::string &model_file,
bool PaddleNet::Init(const std::map<std::string, std::vector<int>> &shapes) {
paddle::AnalysisConfig config;
config.SetModel(model_file_, param_file_);
// TODO(storypku):
// paddle::AnalysisConfig seems to have no such method:
// SetModel(const string&, const string&)
// config.SetModel(model_file_, param_file_);
config.SetModel(model_file_);
config.SwitchUseFeedFetchOps(false);
if (gpu_id_ >= 0) {
config.EnableUseGpu(MemoryPoolInitSizeMb, gpu_id_);
......
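Review note on the TODO above: the workaround drops to the single-argument `SetModel` overload, which in the Paddle inference API takes a combined model path rather than a separate program/params pair. A minimal sketch of the init path this hunk produces (`MemoryPoolInitSizeMb` is a constant from the surrounding file, replaced by a placeholder value here; `CreatePaddlePredictor` is the usual entry point in this API family, so treat both as assumptions to check against the Paddle release in use):

```cpp
#include <memory>
#include <string>

#include "paddle/paddle_inference_api.h"

// Sketch of the config setup as shaped by this hunk; see the TODO about
// which AnalysisConfig::SetModel overloads exist in the target Paddle version.
std::unique_ptr<paddle::PaddlePredictor> InitPredictor(
    const std::string &model_file, int gpu_id) {
  paddle::AnalysisConfig config;
  config.SetModel(model_file);  // single-argument workaround from this commit
  config.SwitchUseFeedFetchOps(false);
  if (gpu_id >= 0) {
    config.EnableUseGpu(/*MemoryPoolInitSizeMb placeholder=*/100, gpu_id);
  }
  return paddle::CreatePaddlePredictor(config);
}
```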
......@@ -26,6 +26,7 @@
#include <utility>
#include <vector>
#include "paddle/paddle_analysis_config.h"
#include "paddle/paddle_inference_api.h"
#include "modules/perception/inference/inference.h"
......
......@@ -10,9 +10,9 @@ cc_library(
deps = [
"//cyber",
"//modules/perception/base:common",
"//modules/perception/proto:rt_proto",
"//modules/perception/proto:rt_cc_proto",
"@com_google_absl//absl/strings",
"@tensorrt",
"@local_config_cuda//cuda:cudnn_header",
],
)
......@@ -32,8 +32,8 @@ cc_library(
hdrs = ["rt_utils.h"],
deps = [
"//cyber",
"//modules/perception/proto:rt_proto",
"@tensorrt",
"//modules/perception/proto:rt_cc_proto",
"@local_config_tensorrt//:tensorrt",
],
)
......@@ -64,10 +64,8 @@ cc_library(
"//modules/perception/base",
"//modules/perception/inference:inference_lib",
"//modules/perception/inference/tensorrt/plugins:perception_inference_tensorrt_plugins",
"//modules/perception/proto:rt_proto",
"@caffe",
"@com_google_protobuf//:protobuf",
"@tensorrt",
],
)
......@@ -80,22 +78,24 @@ cc_test(
],
deps = [
":rt_net",
"//modules/perception/inference:inference_factory_lib",
"//modules/perception/inference/utils:inference_util_cuda_lib",
"//third_party:cblas",
"@com_google_googletest//:gtest_main",
],
)
cc_library(
name = "entropy_calibrator",
srcs = ["entropy_calibrator.cc"],
hdrs = ["entropy_calibrator.h"],
linkopts = ["-lopencv_core -lnvinfer_plugin -lopencv_imgproc -lopencv_highgui"],
deps = [
":batch_stream",
":rt_utils",
"@caffe",
"@com_google_protobuf//:protobuf",
"@local_config_cuda//cuda:cudart",
"@tensorrt",
"@local_config_tensorrt//:tensorrt",
"@opencv",
],
)
......@@ -116,14 +116,13 @@ cc_library(
name = "batch_stream",
srcs = ["batch_stream.cc"],
hdrs = ["batch_stream.h"],
linkopts = ["-lopencv_core -lnvinfer_plugin -lopencv_imgproc -lopencv_highgui"],
deps = [
"//cyber",
"//modules/perception/proto:rt_proto",
"//modules/perception/proto:rt_cc_proto",
"@caffe",
"@com_google_absl//absl/strings",
"@com_google_protobuf//:protobuf",
"@tensorrt",
"@opencv",
],
)
......@@ -131,6 +130,7 @@ cc_test(
name = "batch_stream_test",
size = "small",
srcs = ["batch_stream_test.cc"],
copts = ["-fno-access-control"],
data = [
"//modules/perception/inference:inference_test_data",
],
......
......@@ -13,7 +13,6 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*****************************************************************************/
#define private public
#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "gtest/gtest.h"
......
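Review note: the removed `#define private public` hack (which rewrites a keyword before including the header and is formally undefined behavior) is replaced by the per-target `copts = ["-fno-access-control"]` in the BUILD change above; that GCC/Clang flag disables access checking for the test's translation unit only. A tiny sketch of the pattern, with hypothetical class and test names:

```cpp
// Built with -fno-access-control, so the test may read private members
// directly; class and test names here are hypothetical.
#include "gtest/gtest.h"

class Counter {
 public:
  void Inc() { ++count_; }

 private:
  int count_ = 0;
};

TEST(CounterTest, IncBumpsPrivateCount) {
  Counter c;
  c.Inc();
  EXPECT_EQ(1, c.count_);  // compiles only because access control is off
}
```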
/******************************************************************************
* Copyright 2018 The Apollo Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*****************************************************************************/
#include "modules/perception/inference/tensorrt/entropy_calibrator.h"
#include <cuda_runtime_api.h>
#include <fstream>
namespace nvinfer1 {
DimsHW ICaffePoolOutputDimensionsFormula::compute(DimsHW input_dims,
DimsHW kernel_size,
DimsHW stride, DimsHW padding,
DimsHW dilation,
const char *layerName) const {
const int kernel_extent_h = dilation.d[0] * (kernel_size.d[0] - 1) + 1;
const int kernel_extent_w = dilation.d[1] * (kernel_size.d[1] - 1) + 1;
auto &&h_temp = (input_dims.d[0] + 2 * padding.d[0] - kernel_extent_h) * 1.0 /
stride.d[0];
auto &&w_temp = (input_dims.d[1] + 2 * padding.d[1] - kernel_extent_w) * 1.0 /
stride.d[1];
std::string str_name(layerName);
if (str_name.find("as_conv") == std::string::npos) {
return DimsHW(static_cast<int>(ceil(h_temp)) + 1,
static_cast<int>(ceil(w_temp)) + 1);
} else {
return DimsHW(static_cast<int>(h_temp) + 1, static_cast<int>(w_temp) + 1);
}
}
Int8EntropyCalibrator::Int8EntropyCalibrator(
const apollo::perception::inference::BatchStream &stream, int first_batch,
bool read_cache, std::string network)
: stream_(stream), read_cache_(read_cache), network_(network) {
DimsNCHW dims = stream_.getDims();
input_count_ = stream_.getBatchSize() * dims.c() * dims.h() * dims.w();
cudaMalloc(&device_input_, input_count_ * sizeof(float));
stream_.reset(first_batch);
}
Int8EntropyCalibrator::~Int8EntropyCalibrator() {
if (device_input_) {
(cudaFree(device_input_));
}
}
bool Int8EntropyCalibrator::getBatch(void *bindings[], const char *names[],
int nbBindings) {
if (!stream_.next()) {
return false;
}
(cudaMemcpy(device_input_, stream_.getBatch(), input_count_ * sizeof(float),
cudaMemcpyHostToDevice));
bindings[0] = device_input_;
return true;
}
const void *Int8EntropyCalibrator::readCalibrationCache(size_t &length) {
calibration_cache_.clear();
std::ifstream input(
apollo::perception::inference::locateFile(network_, "CalibrationTable"),
std::ios::binary);
input >> std::noskipws;
if (read_cache_ && input.good()) {
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
std::back_inserter(calibration_cache_));
}
length = calibration_cache_.size();
return length ? &calibration_cache_[0] : nullptr;
}
void Int8EntropyCalibrator::writeCalibrationCache(const void *cache,
size_t length) {
std::ofstream output(
apollo::perception::inference::locateFile(network_, "CalibrationTable"),
std::ios::binary);
output.write(reinterpret_cast<const char *>(cache), length);
}
} // namespace nvinfer1
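For context, a sketch of how this calibrator is typically handed to TensorRT during engine building (the builder calls follow the pre-TensorRT-7 API this code targets, and the BatchStream arguments mirror the samples later in this commit; treat both as assumptions):

```cpp
#include "NvInfer.h"

#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "modules/perception/inference/tensorrt/entropy_calibrator.h"

// Sketch: wire the calibrator into an INT8 build. The builder invokes
// getBatch() repeatedly and persists the result via writeCalibrationCache(),
// producing the "CalibrationTable" file read back on later runs.
void ConfigureInt8(nvinfer1::IBuilder *builder,
                   apollo::perception::inference::BatchStream *stream) {
  nvinfer1::Int8EntropyCalibrator calibrator(*stream, /*first_batch=*/0,
                                             /*read_cache=*/true,
                                             /*network=*/"./");
  builder->setInt8Mode(true);
  builder->setInt8Calibrator(&calibrator);
  // ... buildCudaEngine(...) must run while `calibrator` is still alive.
}
```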
......@@ -16,16 +16,14 @@
#pragma once
#include <cuda_runtime_api.h>
#include <algorithm>
#include <fstream>
#include <cmath>
#include <string>
#include <vector>
#include "NvCaffeParser.h"
#include "NvInfer.h"
#include "NvInferPlugin.h"
#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "modules/perception/inference/tensorrt/rt_utils.h"
......@@ -34,85 +32,25 @@ class ICaffePoolOutputDimensionsFormula : public IOutputDimensionsFormula {
public:
virtual DimsHW compute(DimsHW input_dims, DimsHW kernel_size, DimsHW stride,
DimsHW padding, DimsHW dilation,
const char *layerName) const {
const int kernel_extent_h = dilation.d[0] * (kernel_size.d[0] - 1) + 1;
const int kernel_extent_w = dilation.d[1] * (kernel_size.d[1] - 1) + 1;
auto &&h_temp = (input_dims.d[0] + 2 * padding.d[0] - kernel_extent_h) *
1.0 / stride.d[0];
auto &&w_temp = (input_dims.d[1] + 2 * padding.d[1] - kernel_extent_w) *
1.0 / stride.d[1];
std::string str_name(layerName);
if (str_name.find("as_conv") == std::string::npos) {
return DimsHW(static_cast<int>(ceil(h_temp)) + 1,
static_cast<int>(ceil(w_temp)) + 1);
} else {
return DimsHW(static_cast<int>(h_temp) + 1, static_cast<int>(w_temp) + 1);
}
}
ICaffePoolOutputDimensionsFormula() {}
~ICaffePoolOutputDimensionsFormula() {}
const char *layerName) const;
ICaffePoolOutputDimensionsFormula() = default;
~ICaffePoolOutputDimensionsFormula() = default;
};
class Int8EntropyCalibrator : public IInt8EntropyCalibrator {
public:
Int8EntropyCalibrator(
const apollo::perception::inference::BatchStream &stream, int first_batch,
bool read_cache, std::string network)
: stream_(stream), read_cache_(read_cache), network_(network) {
DimsNCHW dims = stream_.getDims();
input_count_ = stream_.getBatchSize() * dims.c() * dims.h() * dims.w();
cudaMalloc(&device_input_, input_count_ * sizeof(float));
stream_.reset(first_batch);
}
virtual ~Int8EntropyCalibrator() {
if (device_input_) {
(cudaFree(device_input_));
}
}
bool read_cache, std::string network);
virtual ~Int8EntropyCalibrator();
int getBatchSize() const override { return stream_.getBatchSize(); }
bool getBatch(void *bindings[], const char *names[],
int nbBindings) override {
if (!stream_.next()) {
return false;
}
(cudaMemcpy(device_input_, stream_.getBatch(), input_count_ * sizeof(float),
cudaMemcpyHostToDevice));
bindings[0] = device_input_;
return true;
}
const void *readCalibrationCache(size_t &length) override {
calibration_cache_.clear();
std::ifstream input(
apollo::perception::inference::locateFile(network_, "CalibrationTable"),
std::ios::binary);
input >> std::noskipws;
if (read_cache_ && input.good()) {
std::copy(std::istream_iterator<char>(input),
std::istream_iterator<char>(),
std::back_inserter(calibration_cache_));
}
length = calibration_cache_.size();
return length ? &calibration_cache_[0] : nullptr;
}
bool getBatch(void *bindings[], const char *names[], int nbBindings) override;
void writeCalibrationCache(const void *cache, size_t length) override {
std::ofstream output(
apollo::perception::inference::locateFile(network_, "CalibrationTable"),
std::ios::binary);
output.write(reinterpret_cast<const char *>(cache), length);
}
const void *readCalibrationCache(size_t &length) override;
virtual CalibrationAlgoType getAlgorithm() {
return CalibrationAlgoType::kENTROPY_CALIBRATION;
}
void writeCalibrationCache(const void *cache, size_t length) override;
private:
apollo::perception::inference::BatchStream stream_;
......
......@@ -18,6 +18,11 @@ cc_library(
"*.h",
]),
linkstatic = True,
deps = [
"@local_config_cuda//cuda:cublas",
"@local_config_cuda//cuda:cudart",
"@local_config_cuda//cuda:cudnn",
],
)
cuda_library(
......@@ -28,7 +33,7 @@ cuda_library(
"//modules/perception/inference/tensorrt:rt_common",
"@eigen",
"@local_config_cuda//cuda:cudart",
"@tensorrt",
"@local_config_tensorrt//:tensorrt",
],
)
......@@ -40,7 +45,7 @@ cuda_library(
"//modules/perception/inference/tensorrt:rt_common",
"@eigen",
"@local_config_cuda//cuda:cudart",
"@tensorrt",
"@local_config_tensorrt//:tensorrt",
],
)
......@@ -52,7 +57,7 @@ cuda_library(
"//modules/perception/inference/tensorrt:rt_common",
"@eigen",
"@local_config_cuda//cuda:cudart",
"@tensorrt",
"@local_config_tensorrt//:tensorrt",
],
)
......@@ -64,7 +69,8 @@ cuda_library(
"//modules/perception/inference/tensorrt:rt_common",
"@eigen",
"@local_config_cuda//cuda:cudart",
"@tensorrt",
"@local_config_cuda//cuda:cudnn",
"@local_config_tensorrt//:tensorrt",
],
)
......@@ -87,6 +93,7 @@ cc_test(
":perception_inference_tensorrt_plugins",
"//modules/perception/inference/tensorrt:rt_common",
"@com_google_googletest//:gtest_main",
"@local_config_cuda//cuda:cublas",
],
)
......
......@@ -19,14 +19,9 @@
namespace apollo {
namespace perception {
namespace inference {
__global__ void
cmp(const int nthreads,
const float *in_data,
const int channels,
const int height,
const int width,
const bool out_max_val,
float *out_data) {
__global__ void cmp(const int nthreads, const float *in_data,
const int channels, const int height, const int width,
const bool out_max_val, float *out_data) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < nthreads) {
int w = idx % width;
......@@ -55,21 +50,18 @@ cmp(const int nthreads,
}
}
}
int
ArgMax1Plugin::enqueue(int batchSize,
const void *const *inputs,
void **outputs,
void *workspace,
cudaStream_t stream) {
int ArgMax1Plugin::enqueue(int batchSize, const void *const *inputs,
void **outputs, void *workspace,
cudaStream_t stream) {
const int thread_size = 512;
int block_size =
(input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize
+ thread_size - 1) / thread_size;
cmp << < block_size, thread_size >> >
(input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] *
batchSize, (const float *) inputs[0], input_dims_.d[0],
input_dims_.d[1], input_dims_.d[2], out_max_val_,
reinterpret_cast<float *>(outputs[0]));
(input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize +
thread_size - 1) /
thread_size;
cmp<<<block_size, thread_size>>>(
input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize,
(const float *)inputs[0], input_dims_.d[0], input_dims_.d[1],
input_dims_.d[2], out_max_val_, reinterpret_cast<float *>(outputs[0]));
return 0;
}
......
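A small aside on the launch configuration shared by these plugins: the grid size is the ceiling division of the element count by a fixed 512-thread block, and the kernel guards stragglers with an `idx < nthreads` check. As a worked example:

```cpp
// Ceiling division used for grid sizing: with count = 1000 and
// thread_size = 512, (1000 + 511) / 512 = 2 blocks, i.e. 1024 threads,
// of which the last 24 fail the `idx < nthreads` guard and do nothing.
inline int BlocksFor(int count, int thread_size = 512) {
  return (count + thread_size - 1) / thread_size;
}
```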
......@@ -22,7 +22,7 @@ namespace apollo {
namespace perception {
namespace inference {
template<typename Dtype>
template <typename Dtype>
__global__ void ReLU(const int nthreads, const Dtype *in_data,
const float negative_slope, Dtype *out_data) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -34,21 +34,19 @@ __global__ void ReLU(const int nthreads, const Dtype *in_data,
}
}
int ReLUPlugin::enqueue(int batchSize,
const void *const *inputs,
void **outputs,
void *workspace,
cudaStream_t stream) {
int ReLUPlugin::enqueue(int batchSize, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
const int thread_size = 512;
const int block_size =
(input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize
+ thread_size - 1) / thread_size;
const int nthreads = input_dims_.d[0] * input_dims_.d[1]
* input_dims_.d[2] * batchSize;
(input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize +
thread_size - 1) /
thread_size;
const int nthreads =
input_dims_.d[0] * input_dims_.d[1] * input_dims_.d[2] * batchSize;
ReLU<< < block_size, thread_size, 0, stream >> > (
nthreads, (const float *) (inputs[0]),
negative_slope_, reinterpret_cast<float *>(outputs[0]));
ReLU<<<block_size, thread_size, 0, stream>>>(
nthreads, (const float *)(inputs[0]), negative_slope_,
reinterpret_cast<float *>(outputs[0]));
return 1;
}
} // namespace inference
......
......@@ -23,7 +23,7 @@ namespace inference {
typedef int8_t int8;
template<typename Dtype>
template <typename Dtype>
__global__ void Slice(const int nthreads, const Dtype *in_data,
const int num_slices, const int slice_size,
const int bottom_slice_axis, const int top_slice_axis,
......@@ -33,17 +33,15 @@ __global__ void Slice(const int nthreads, const Dtype *in_data,
const int total_slice_size = slice_size * top_slice_axis;
const int slice_num = index / total_slice_size;
const int slice_index = index % total_slice_size;
const int bottom_index = slice_index
+ (slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
const int bottom_index =
slice_index +
(slice_num * bottom_slice_axis + offset_slice_axis) * slice_size;
out_data[index] = in_data[bottom_index];
}
}
int SLICEPlugin::enqueue(int batchSize,
const void *const *inputs,
void **outputs,
void *workspace,
cudaStream_t stream) {
int SLICEPlugin::enqueue(int batchSize, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
int slice_size = 1;
for (size_t index = axis_ + 1; index < input_dims_.nbDims; index++) {
slice_size *= input_dims_.d[index];
......@@ -61,10 +59,10 @@ int SLICEPlugin::enqueue(int batchSize,
const int block_num = (nthreads + 511) / 512;
Slice // NOLINT_NEXT_LINE(whitespace/operators)
<< < block_num, 512, 0, stream >> > (
nthreads, (const float *) (inputs[0]), num_slices, slice_size,
input_dims_.d[axis_], top_slice_axis,
offset_slice_axis, reinterpret_cast<float *>(outputs[i]));
<<<block_num, 512, 0, stream>>>(
nthreads, (const float *)(inputs[0]), num_slices, slice_size,
input_dims_.d[axis_], top_slice_axis, offset_slice_axis,
reinterpret_cast<float *>(outputs[i]));
offset_slice_axis += top_slice_axis;
}
return 1;
......
......@@ -21,10 +21,8 @@ namespace apollo {
namespace perception {
namespace inference {
int SoftmaxPlugin::enqueue(int batch_size,
const void *const *inputs,
void **outputs,
void *workspace,
int SoftmaxPlugin::enqueue(int batch_size, const void *const *inputs,
void **outputs, void *workspace,
cudaStream_t stream) {
const float *in_data = reinterpret_cast<const float *>(inputs[0]);
float *out_data = reinterpret_cast<float *>(outputs[0]);
......@@ -37,37 +35,18 @@ int SoftmaxPlugin::enqueue(int batch_size,
int c_stride = h * h_stride;
int n_stride = c * c_stride;
cudnnSetTensor4dDescriptorEx(input_desc_,
CUDNN_DATA_FLOAT,
n,
c,
h,
w,
n_stride,
c_stride,
h_stride,
w_stride);
cudnnSetTensor4dDescriptorEx(output_desc_,
CUDNN_DATA_FLOAT,
n,
c,
h,
w,
n_stride,
c_stride,
h_stride,
w_stride);
cudnnSetTensor4dDescriptorEx(input_desc_, CUDNN_DATA_FLOAT, n, c, h, w,
n_stride, c_stride, h_stride, w_stride);
cudnnSetTensor4dDescriptorEx(output_desc_, CUDNN_DATA_FLOAT, n, c, h, w,
n_stride, c_stride, h_stride, w_stride);
float a = 1.0;
float b = 0.0;
cudnnSetStream(cudnn_, stream);
cudnnSoftmaxForward(cudnn_, \
CUDNN_SOFTMAX_ACCURATE, \
CUDNN_SOFTMAX_MODE_CHANNEL, \
(const void *) (&a), \
input_desc_, in_data, \
(const void *) (&b), \
output_desc_, out_data);
cudnnSoftmaxForward(cudnn_, CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_CHANNEL, (const void *)(&a),
input_desc_, in_data, (const void *)(&b), output_desc_,
out_data);
return 1;
}
......
......@@ -19,7 +19,6 @@
#include "gtest/gtest.h"
#include "gtest/gtest_prod.h"
#include "modules/perception/inference/inference_factory.h"
#include "modules/perception/inference/utils/util.h"
namespace apollo {
......
......@@ -8,7 +8,6 @@ cc_binary(
srcs = ["cal_table_generator.cc"],
linkstatic = False,
deps = [
"//modules/perception/inference:inference_factory_lib",
"//modules/perception/inference:inference_lib",
"//modules/perception/inference/tensorrt:batch_stream",
"//modules/perception/inference/tensorrt:entropy_calibrator",
......@@ -22,7 +21,6 @@ cc_binary(
srcs = ["denseline_sample.cc"],
linkstatic = False,
deps = [
"//modules/perception/inference:inference_factory_lib",
"//modules/perception/inference:inference_lib",
"//modules/perception/inference/tensorrt:batch_stream",
"//modules/perception/inference/tensorrt:entropy_calibrator",
......@@ -36,7 +34,6 @@ cc_binary(
srcs = ["lane_sample.cc"],
linkstatic = False,
deps = [
"//modules/perception/inference:inference_factory_lib",
"//modules/perception/inference:inference_lib",
"//modules/perception/inference/tensorrt:batch_stream",
"//modules/perception/inference/tensorrt:entropy_calibrator",
......@@ -50,7 +47,6 @@ cc_binary(
srcs = ["yolo_sample.cc"],
linkstatic = False,
deps = [
"//modules/perception/inference:inference_factory_lib",
"//modules/perception/inference:inference_lib",
"//modules/perception/inference/tensorrt:batch_stream",
"//modules/perception/inference/tensorrt:entropy_calibrator",
......
......@@ -19,7 +19,6 @@
#include "cyber/common/log.h"
#include "modules/perception/inference/inference.h"
#include "modules/perception/inference/inference_factory.h"
#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "modules/perception/inference/tensorrt/entropy_calibrator.h"
#include "modules/perception/inference/tensorrt/rt_net.h"
......@@ -77,8 +76,8 @@ int main(int argc, char **argv) {
proto_file, weight_file, outputs, inputs, calibrator);
} else {
AINFO << "fp32";
rt_net = apollo::perception::inference::CreateInferenceByName(
"RTNet", proto_file, weight_file, outputs, inputs);
rt_net = new apollo::perception::inference::RTNet(proto_file, weight_file,
outputs, inputs);
}
std::vector<int> shape = {1, 3, height, width};
std::map<std::string, std::vector<int>> shape_map{{input_blob_name, shape}};
......
......@@ -19,7 +19,6 @@
#include "cyber/common/log.h"
#include "modules/perception/inference/inference.h"
#include "modules/perception/inference/inference_factory.h"
#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "modules/perception/inference/tensorrt/entropy_calibrator.h"
#include "modules/perception/inference/tensorrt/rt_net.h"
......@@ -67,13 +66,11 @@ int main(int argc, char **argv) {
if (FLAGS_int8) {
apollo::perception::inference::BatchStream stream(2, 50, "./batches/");
nvinfer1::Int8EntropyCalibrator calibrator(stream, 0, true, "./");
std::cout << "int8" << std::endl;
rt_net = apollo::perception::inference::CreateInferenceByName(
"RTNetInt8", proto_file, weight_file, outputs, inputs, model_root);
rt_net = new apollo::perception::inference::RTNet(
proto_file, weight_file, outputs, inputs, model_root);
} else {
std::cout << "fp32" << std::endl;
rt_net = apollo::perception::inference::CreateInferenceByName(
"RTNet", proto_file, weight_file, outputs, inputs);
rt_net = new apollo::perception::inference::RTNet(proto_file, weight_file,
outputs, inputs);
}
const int height = 608;
const int width = 1024;
......
......@@ -18,7 +18,6 @@
#include "opencv2/opencv.hpp"
#include "modules/perception/inference/inference.h"
#include "modules/perception/inference/inference_factory.h"
#include "modules/perception/inference/tensorrt/batch_stream.h"
#include "modules/perception/inference/tensorrt/entropy_calibrator.h"
#include "modules/perception/inference/tensorrt/rt_net.h"
......@@ -55,8 +54,8 @@ int main(int argc, char **argv) {
rt_net = new apollo::perception::inference::RTNet(
proto_file, weight_file, outputs, inputs, calibrator.get());
} else {
rt_net = apollo::perception::inference::CreateInferenceByName(
"RTNet", proto_file, weight_file, outputs, inputs);
rt_net = new apollo::perception::inference::RTNet(proto_file, weight_file,
outputs, inputs);
}
const int height = 576;
const int width = 1440;
......
......@@ -32,6 +32,7 @@ cuda_library(
hdrs = ["util.h"],
deps = [
"//cyber",
"//modules/perception/base",
"@eigen",
"@local_config_cuda//cuda:cudart",
],
......
......@@ -15,10 +15,11 @@
*****************************************************************************/
#pragma once
#include <boost/shared_ptr.hpp>
#include <map>
#include <string>
#include <boost/shared_ptr.hpp>
#include "modules/perception/base/blob.h"
namespace apollo {
......
......@@ -112,8 +112,8 @@ void GPUGemmFloat(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
cublasOperation_t cuTransB =
(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
ACHECK(cublasSgemm(CudaUtil::get_handler(), cuTransB, cuTransA, N, M, K,
&alpha, B, ldb, A, lda, &beta, C,
N) == CUBLAS_STATUS_SUCCESS);
&alpha, B, ldb, A, lda, &beta, C,
N) == CUBLAS_STATUS_SUCCESS);
}
} // namespace inference
......
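Review note on the hunk above (an indentation-only fix): the operand order it preserves, B before A with cuTransB before cuTransA and leading dimension N, is the standard trick for driving column-major cuBLAS with row-major data, because C = A·B over row-major arrays is exactly Cᵀ = Bᵀ·Aᵀ in column-major terms. A minimal sketch:

```cpp
#include <cublas_v2.h>

// Row-major C(M x N) = A(M x K) * B(K x N) on a column-major BLAS:
// compute C^T = B^T * A^T by passing B first and using N as C's
// leading dimension; handle and device pointers are assumed ready.
void GemmRowMajor(cublasHandle_t handle, int M, int N, int K, const float *A,
                  const float *B, float *C) {
  const float alpha = 1.0f;
  const float beta = 0.0f;
  cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, B, /*ldb=*/N,
              A, /*lda=*/K, &beta, C, /*ldc=*/N);
}
```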