提交 87b045f3 编写于 作者: G ghdawn 提交者: Jiangtao Hu

use caffe::SyncedMemory

上级 9330295a
......@@ -100,7 +100,7 @@ void compute_overlapped_by_idx_gpu(const int nthreads,
void apply_nms_gpu(const float *bbox_data, const float *conf_data,
const int num_bboxes, const float confidence_threshold,
const int top_k, const float nms_threshold, std::vector<int> *indices,
std::shared_ptr <SyncedMemory> overlapped, std::shared_ptr <SyncedMemory> idx_sm) {
std::shared_ptr <caffe::SyncedMemory> overlapped, std::shared_ptr <caffe::SyncedMemory> idx_sm) {
// Keep part of detections whose scores are higher than confidence threshold.
cudaDeviceSynchronize();
std::vector<int> idx;
......
......@@ -112,8 +112,8 @@ void apply_nms_gpu(const float *bbox_data, const float *conf_data,
const int num_bboxes, const float confidence_threshold,
const int top_k, const float nms_threshold,
std::vector<int> *indices,
std::shared_ptr<SyncedMemory> overlappe,
std::shared_ptr<SyncedMemory> idx_sm);
std::shared_ptr<caffe::SyncedMemory> overlappe,
std::shared_ptr<caffe::SyncedMemory> idx_sm);
void compute_overlapped_by_idx_gpu(const int nthreads, const float *bbox_data,
const float overlap_threshold,
const int *idx, const int num_idx,
......
......@@ -19,131 +19,6 @@
namespace apollo {
namespace perception {
SyncedMemory::~SyncedMemory() {
if (cpu_ptr_ && own_cpu_data_) {
PerceptionFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
}
if (gpu_ptr_ && own_gpu_data_) {
int initial_device = -1;
cudaGetDevice(&initial_device);
if (gpu_device_ != -1) {
CUDA_CHECK(cudaSetDevice(gpu_device_));
}
CUDA_CHECK(cudaFree(gpu_ptr_));
cudaSetDevice(initial_device);
}
}
inline void SyncedMemory::to_cpu() {
switch (head_) {
case UNINITIALIZED:
PerceptionMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
perception_memset(size_, 0, cpu_ptr_);
head_ = HEAD_AT_CPU;
own_cpu_data_ = true;
break;
case HEAD_AT_GPU:
if (cpu_ptr_ == NULL) {
PerceptionMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
own_cpu_data_ = true;
}
gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
head_ = SYNCED;
break;
case HEAD_AT_CPU:
case SYNCED:
break;
}
}
inline void SyncedMemory::to_gpu() {
switch (head_) {
case UNINITIALIZED:
CUDA_CHECK(cudaGetDevice(&gpu_device_));
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
perception_gpu_memset(size_, 0, gpu_ptr_);
head_ = HEAD_AT_GPU;
own_gpu_data_ = true;
break;
case HEAD_AT_CPU:
if (gpu_ptr_ == NULL) {
CUDA_CHECK(cudaGetDevice(&gpu_device_));
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
own_gpu_data_ = true;
}
gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
head_ = SYNCED;
break;
case HEAD_AT_GPU:
case SYNCED:
break;
}
}
const void *SyncedMemory::cpu_data() {
to_cpu();
return (const void *)cpu_ptr_;
}
void SyncedMemory::set_cpu_data(void *data) {
if (data == nullptr) {
return;
}
if (own_cpu_data_) {
PerceptionFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
}
cpu_ptr_ = data;
head_ = HEAD_AT_CPU;
own_cpu_data_ = false;
}
const void *SyncedMemory::gpu_data() {
to_gpu();
return (const void *)gpu_ptr_;
}
void SyncedMemory::set_gpu_data(void *data) {
if (data == nullptr) {
return;
}
if (own_gpu_data_) {
int initial_device = -1;
cudaGetDevice(&initial_device);
if (gpu_device_ != -1) {
CUDA_CHECK(cudaSetDevice(gpu_device_));
}
CUDA_CHECK(cudaFree(gpu_ptr_));
cudaSetDevice(initial_device);
}
gpu_ptr_ = data;
head_ = HEAD_AT_GPU;
own_gpu_data_ = false;
}
void *SyncedMemory::mutable_cpu_data() {
to_cpu();
head_ = HEAD_AT_CPU;
return cpu_ptr_;
}
void *SyncedMemory::mutable_gpu_data() {
to_gpu();
head_ = HEAD_AT_GPU;
return gpu_ptr_;
}
void SyncedMemory::async_gpu_push(const cudaStream_t &stream) {
CHECK(head_ == HEAD_AT_CPU);
if (gpu_ptr_ == NULL) {
CUDA_CHECK(cudaGetDevice(&gpu_device_));
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
own_gpu_data_ = true;
}
const cudaMemcpyKind put = cudaMemcpyHostToDevice;
CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream));
// Assume caller will synchronize on the stream before use
head_ = SYNCED;
}
} // namespace perception
} // namespace apollo
......@@ -126,7 +126,7 @@ void gpu_memcpy(const size_t N, const void *X, void *Y) {
}
}
void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemory> src_gpu,
void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <caffe::SyncedMemory> src_gpu,
int start_axis) {
int origin_width = frame.cols;
int origin_height = frame.rows;
......@@ -140,7 +140,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor
const dim3 grid(divup(width, block.x), divup(height, block.y));
if (src_gpu == nullptr) {
src_gpu.reset(
new SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char)));
new caffe::SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char)));
}
src_gpu->set_cpu_data(frame.data);
resize_linear_kernel << < grid, block >> > ((const unsigned char *) src_gpu->gpu_data(), dst
......@@ -148,7 +148,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor
}
void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemory> src_gpu,
void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <caffe::SyncedMemory> src_gpu,
int start_axis, const float mean_b, const float mean_g, const float mean_r,
const float scale) {
int origin_width = frame.cols;
......@@ -163,7 +163,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor
const dim3 grid(divup(width, block.x), divup(height, block.y));
if (src_gpu == nullptr) {
src_gpu.reset(
new SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char)));
new caffe::SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char)));
}
src_gpu->set_cpu_data(frame.data);
resize_linear_with_mean_scale_kernel << < grid, block >> > ((const unsigned char *) src_gpu
......
......@@ -55,71 +55,14 @@ inline void PerceptionFreeHost(void *ptr, bool use_cuda) {
return;
}
class SyncedMemory {
public:
SyncedMemory()
: cpu_ptr_(NULL),
gpu_ptr_(NULL),
size_(0),
head_(UNINITIALIZED),
own_cpu_data_(false),
cpu_malloc_use_cuda_(false),
own_gpu_data_(false),
gpu_device_(-1) {}
explicit SyncedMemory(size_t size)
: cpu_ptr_(NULL),
gpu_ptr_(NULL),
size_(size),
head_(UNINITIALIZED),
own_cpu_data_(false),
cpu_malloc_use_cuda_(false),
own_gpu_data_(false),
gpu_device_(-1) {}
~SyncedMemory();
const void *cpu_data();
void set_cpu_data(void *data);
const void *gpu_data();
void set_gpu_data(void *data);
void *mutable_cpu_data();
void *mutable_gpu_data();
enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
SyncedHead head() { return head_; }
size_t size() { return size_; }
void async_gpu_push(const cudaStream_t &stream);
private:
void to_cpu();
void to_gpu();
void *cpu_ptr_;
void *gpu_ptr_;
size_t size_;
SyncedHead head_;
bool own_cpu_data_;
bool cpu_malloc_use_cuda_;
bool own_gpu_data_;
int gpu_device_;
DISABLE_COPY_AND_ASSIGN(SyncedMemory);
}; // class SyncedMemory
int divup(int a, int b);
void resize(cv::Mat frame, caffe::Blob<float> *dst,
std::shared_ptr<SyncedMemory> src_gpu, int start_axis);
std::shared_ptr<caffe::SyncedMemory> src_gpu, int start_axis);
// resize with mean and scale
void resize(cv::Mat frame, caffe::Blob<float> *dst,
std::shared_ptr<SyncedMemory> src_gpu, int start_axis,
std::shared_ptr<caffe::SyncedMemory> src_gpu, int start_axis,
const float mean_b, const float mean_g, const float mean_r,
const float scale);
} // namespace perception
......
......@@ -66,7 +66,7 @@ void YoloCameraDetector::init_anchor(const string &yolo_root) {
yolo::load_anchors(anchors_file, &anchors);
num_anchors_ = anchors.size() / 2;
obj_size_ = output_height_ * output_width_ * anchors.size() / 2;
anchor_.reset(new SyncedMemory(anchors.size() * sizeof(float)));
anchor_.reset(new caffe::SyncedMemory(anchors.size() * sizeof(float)));
auto anchor_cpu_data = anchor_->mutable_cpu_data();
memcpy(anchor_cpu_data, anchors.data(), anchors.size() * sizeof(float));
......@@ -77,20 +77,20 @@ void YoloCameraDetector::init_anchor(const string &yolo_root) {
yolo::load_types(types_file, &types_);
res_box_tensor_.reset(
new SyncedMemory(obj_size_ * s_box_block_size * sizeof(float)));
new caffe::SyncedMemory(obj_size_ * s_box_block_size * sizeof(float)));
res_box_tensor_->cpu_data();
res_box_tensor_->gpu_data();
res_cls_tensor_.reset(
new SyncedMemory(types_.size() * obj_size_ * sizeof(float)));
new caffe::SyncedMemory(types_.size() * obj_size_ * sizeof(float)));
res_cls_tensor_->cpu_data();
res_cls_tensor_->gpu_data();
overlapped_.reset(new SyncedMemory(top_k_ * top_k_ * sizeof(bool)));
overlapped_.reset(new caffe::SyncedMemory(top_k_ * top_k_ * sizeof(bool)));
overlapped_->cpu_data();
overlapped_->gpu_data();
idx_sm_.reset(new SyncedMemory(top_k_ * sizeof(int)));
idx_sm_.reset(new caffe::SyncedMemory(top_k_ * sizeof(int)));
idx_sm_->cpu_data();
idx_sm_->gpu_data();
}
......@@ -148,7 +148,7 @@ void YoloCameraDetector::load_intrinsic(
int channel = 3;
image_data_.reset(
new SyncedMemory(roi_w * roi_h * channel * sizeof(unsigned char)));
new caffe::SyncedMemory(roi_w * roi_h * channel * sizeof(unsigned char)));
}
bool YoloCameraDetector::init_cnn(const string &yolo_root) {
......@@ -310,13 +310,13 @@ bool YoloCameraDetector::Detect(const cv::Mat &frame,
resize(frame(roi), input_blob.get(), image_data_, 0);
}
pre_time.Stop();
ADEBUG << "Pre-processing: " << pre_time.MilliSeconds() << " ms";
AINFO << "Pre-processing: " << pre_time.MilliSeconds() << " ms";
/////////////////////////// detection part ///////////////////////////
caffe::Timer det_time;
det_time.Start();
cnnadapter_->forward();
ADEBUG << "Running detection: " << det_time.MilliSeconds() << " ms";
AINFO << "Running detection: " << det_time.MilliSeconds() << " ms";
caffe::Timer post_time;
post_time.Start();
......@@ -363,8 +363,8 @@ bool YoloCameraDetector::Detect(const cv::Mat &frame,
temp_objects[i].reset();
}
temp_objects.clear();
ADEBUG << "Post-processing: " << post_time.MilliSeconds() << " ms";
ADEBUG << "Number of detected obstacles: " << objects->size();
AINFO << "Post-processing: " << post_time.MilliSeconds() << " ms";
AINFO << "Number of detected obstacles: " << objects->size();
Extract(objects);
yolo::recover_bbox(roi_w, roi_h, offset_y_, objects);
......
......@@ -86,13 +86,13 @@ class YoloCameraDetector : public BaseCameraDetector {
private:
std::shared_ptr<CNNAdapter> cnnadapter_;
std::shared_ptr<SyncedMemory> res_cls_tensor_ = nullptr;
std::shared_ptr<SyncedMemory> res_box_tensor_ = nullptr;
std::shared_ptr<caffe::SyncedMemory> res_cls_tensor_ = nullptr;
std::shared_ptr<caffe::SyncedMemory> res_box_tensor_ = nullptr;
std::shared_ptr<SyncedMemory> image_data_ = nullptr;
std::shared_ptr<SyncedMemory> overlapped_ = nullptr;
std::shared_ptr<SyncedMemory> idx_sm_ = nullptr;
std::shared_ptr<SyncedMemory> anchor_ = nullptr;
std::shared_ptr<caffe::SyncedMemory> image_data_ = nullptr;
std::shared_ptr<caffe::SyncedMemory> overlapped_ = nullptr;
std::shared_ptr<caffe::SyncedMemory> idx_sm_ = nullptr;
std::shared_ptr<caffe::SyncedMemory> anchor_ = nullptr;
int height_ = 0;
int width_ = 0;
float min_2d_height_ = 0.0f;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册