提交 87b045f3 编写于 作者: G ghdawn 提交者: Jiangtao Hu

use caffe::SyncedMemory

上级 9330295a
...@@ -100,7 +100,7 @@ void compute_overlapped_by_idx_gpu(const int nthreads, ...@@ -100,7 +100,7 @@ void compute_overlapped_by_idx_gpu(const int nthreads,
void apply_nms_gpu(const float *bbox_data, const float *conf_data, void apply_nms_gpu(const float *bbox_data, const float *conf_data,
const int num_bboxes, const float confidence_threshold, const int num_bboxes, const float confidence_threshold,
const int top_k, const float nms_threshold, std::vector<int> *indices, const int top_k, const float nms_threshold, std::vector<int> *indices,
std::shared_ptr <SyncedMemory> overlapped, std::shared_ptr <SyncedMemory> idx_sm) { std::shared_ptr <caffe::SyncedMemory> overlapped, std::shared_ptr <caffe::SyncedMemory> idx_sm) {
// Keep part of detections whose scores are higher than confidence threshold. // Keep part of detections whose scores are higher than confidence threshold.
cudaDeviceSynchronize(); cudaDeviceSynchronize();
std::vector<int> idx; std::vector<int> idx;
......
...@@ -112,8 +112,8 @@ void apply_nms_gpu(const float *bbox_data, const float *conf_data, ...@@ -112,8 +112,8 @@ void apply_nms_gpu(const float *bbox_data, const float *conf_data,
const int num_bboxes, const float confidence_threshold, const int num_bboxes, const float confidence_threshold,
const int top_k, const float nms_threshold, const int top_k, const float nms_threshold,
std::vector<int> *indices, std::vector<int> *indices,
std::shared_ptr<SyncedMemory> overlappe, std::shared_ptr<caffe::SyncedMemory> overlappe,
std::shared_ptr<SyncedMemory> idx_sm); std::shared_ptr<caffe::SyncedMemory> idx_sm);
void compute_overlapped_by_idx_gpu(const int nthreads, const float *bbox_data, void compute_overlapped_by_idx_gpu(const int nthreads, const float *bbox_data,
const float overlap_threshold, const float overlap_threshold,
const int *idx, const int num_idx, const int *idx, const int num_idx,
......
...@@ -19,131 +19,6 @@ ...@@ -19,131 +19,6 @@
namespace apollo { namespace apollo {
namespace perception { namespace perception {
SyncedMemory::~SyncedMemory() {
if (cpu_ptr_ && own_cpu_data_) {
PerceptionFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
}
if (gpu_ptr_ && own_gpu_data_) {
int initial_device = -1;
cudaGetDevice(&initial_device);
if (gpu_device_ != -1) {
CUDA_CHECK(cudaSetDevice(gpu_device_));
}
CUDA_CHECK(cudaFree(gpu_ptr_));
cudaSetDevice(initial_device);
}
}
inline void SyncedMemory::to_cpu() {
switch (head_) {
case UNINITIALIZED:
PerceptionMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
perception_memset(size_, 0, cpu_ptr_);
head_ = HEAD_AT_CPU;
own_cpu_data_ = true;
break;
case HEAD_AT_GPU:
if (cpu_ptr_ == NULL) {
PerceptionMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
own_cpu_data_ = true;
}
gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
head_ = SYNCED;
break;
case HEAD_AT_CPU:
case SYNCED:
break;
}
}
inline void SyncedMemory::to_gpu() {
switch (head_) {
case UNINITIALIZED:
CUDA_CHECK(cudaGetDevice(&gpu_device_));
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
perception_gpu_memset(size_, 0, gpu_ptr_);
head_ = HEAD_AT_GPU;
own_gpu_data_ = true;
break;
case HEAD_AT_CPU:
if (gpu_ptr_ == NULL) {
CUDA_CHECK(cudaGetDevice(&gpu_device_));
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
own_gpu_data_ = true;
}
gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
head_ = SYNCED;
break;
case HEAD_AT_GPU:
case SYNCED:
break;
}
}
const void *SyncedMemory::cpu_data() {
to_cpu();
return (const void *)cpu_ptr_;
}
void SyncedMemory::set_cpu_data(void *data) {
if (data == nullptr) {
return;
}
if (own_cpu_data_) {
PerceptionFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
}
cpu_ptr_ = data;
head_ = HEAD_AT_CPU;
own_cpu_data_ = false;
}
const void *SyncedMemory::gpu_data() {
to_gpu();
return (const void *)gpu_ptr_;
}
void SyncedMemory::set_gpu_data(void *data) {
if (data == nullptr) {
return;
}
if (own_gpu_data_) {
int initial_device = -1;
cudaGetDevice(&initial_device);
if (gpu_device_ != -1) {
CUDA_CHECK(cudaSetDevice(gpu_device_));
}
CUDA_CHECK(cudaFree(gpu_ptr_));
cudaSetDevice(initial_device);
}
gpu_ptr_ = data;
head_ = HEAD_AT_GPU;
own_gpu_data_ = false;
}
void *SyncedMemory::mutable_cpu_data() {
to_cpu();
head_ = HEAD_AT_CPU;
return cpu_ptr_;
}
void *SyncedMemory::mutable_gpu_data() {
to_gpu();
head_ = HEAD_AT_GPU;
return gpu_ptr_;
}
void SyncedMemory::async_gpu_push(const cudaStream_t &stream) {
CHECK(head_ == HEAD_AT_CPU);
if (gpu_ptr_ == NULL) {
CUDA_CHECK(cudaGetDevice(&gpu_device_));
CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
own_gpu_data_ = true;
}
const cudaMemcpyKind put = cudaMemcpyHostToDevice;
CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream));
// Assume caller will synchronize on the stream before use
head_ = SYNCED;
}
} // namespace perception } // namespace perception
} // namespace apollo } // namespace apollo
...@@ -126,7 +126,7 @@ void gpu_memcpy(const size_t N, const void *X, void *Y) { ...@@ -126,7 +126,7 @@ void gpu_memcpy(const size_t N, const void *X, void *Y) {
} }
} }
void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemory> src_gpu, void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <caffe::SyncedMemory> src_gpu,
int start_axis) { int start_axis) {
int origin_width = frame.cols; int origin_width = frame.cols;
int origin_height = frame.rows; int origin_height = frame.rows;
...@@ -140,7 +140,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor ...@@ -140,7 +140,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor
const dim3 grid(divup(width, block.x), divup(height, block.y)); const dim3 grid(divup(width, block.x), divup(height, block.y));
if (src_gpu == nullptr) { if (src_gpu == nullptr) {
src_gpu.reset( src_gpu.reset(
new SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char))); new caffe::SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char)));
} }
src_gpu->set_cpu_data(frame.data); src_gpu->set_cpu_data(frame.data);
resize_linear_kernel << < grid, block >> > ((const unsigned char *) src_gpu->gpu_data(), dst resize_linear_kernel << < grid, block >> > ((const unsigned char *) src_gpu->gpu_data(), dst
...@@ -148,7 +148,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor ...@@ -148,7 +148,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor
} }
void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemory> src_gpu, void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <caffe::SyncedMemory> src_gpu,
int start_axis, const float mean_b, const float mean_g, const float mean_r, int start_axis, const float mean_b, const float mean_g, const float mean_r,
const float scale) { const float scale) {
int origin_width = frame.cols; int origin_width = frame.cols;
...@@ -163,7 +163,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor ...@@ -163,7 +163,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor
const dim3 grid(divup(width, block.x), divup(height, block.y)); const dim3 grid(divup(width, block.x), divup(height, block.y));
if (src_gpu == nullptr) { if (src_gpu == nullptr) {
src_gpu.reset( src_gpu.reset(
new SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char))); new caffe::SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char)));
} }
src_gpu->set_cpu_data(frame.data); src_gpu->set_cpu_data(frame.data);
resize_linear_with_mean_scale_kernel << < grid, block >> > ((const unsigned char *) src_gpu resize_linear_with_mean_scale_kernel << < grid, block >> > ((const unsigned char *) src_gpu
......
...@@ -55,71 +55,14 @@ inline void PerceptionFreeHost(void *ptr, bool use_cuda) { ...@@ -55,71 +55,14 @@ inline void PerceptionFreeHost(void *ptr, bool use_cuda) {
return; return;
} }
class SyncedMemory {
public:
SyncedMemory()
: cpu_ptr_(NULL),
gpu_ptr_(NULL),
size_(0),
head_(UNINITIALIZED),
own_cpu_data_(false),
cpu_malloc_use_cuda_(false),
own_gpu_data_(false),
gpu_device_(-1) {}
explicit SyncedMemory(size_t size)
: cpu_ptr_(NULL),
gpu_ptr_(NULL),
size_(size),
head_(UNINITIALIZED),
own_cpu_data_(false),
cpu_malloc_use_cuda_(false),
own_gpu_data_(false),
gpu_device_(-1) {}
~SyncedMemory();
const void *cpu_data();
void set_cpu_data(void *data);
const void *gpu_data();
void set_gpu_data(void *data);
void *mutable_cpu_data();
void *mutable_gpu_data();
enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
SyncedHead head() { return head_; }
size_t size() { return size_; }
void async_gpu_push(const cudaStream_t &stream);
private:
void to_cpu();
void to_gpu();
void *cpu_ptr_;
void *gpu_ptr_;
size_t size_;
SyncedHead head_;
bool own_cpu_data_;
bool cpu_malloc_use_cuda_;
bool own_gpu_data_;
int gpu_device_;
DISABLE_COPY_AND_ASSIGN(SyncedMemory);
}; // class SyncedMemory
int divup(int a, int b); int divup(int a, int b);
void resize(cv::Mat frame, caffe::Blob<float> *dst, void resize(cv::Mat frame, caffe::Blob<float> *dst,
std::shared_ptr<SyncedMemory> src_gpu, int start_axis); std::shared_ptr<caffe::SyncedMemory> src_gpu, int start_axis);
// resize with mean and scale // resize with mean and scale
void resize(cv::Mat frame, caffe::Blob<float> *dst, void resize(cv::Mat frame, caffe::Blob<float> *dst,
std::shared_ptr<SyncedMemory> src_gpu, int start_axis, std::shared_ptr<caffe::SyncedMemory> src_gpu, int start_axis,
const float mean_b, const float mean_g, const float mean_r, const float mean_b, const float mean_g, const float mean_r,
const float scale); const float scale);
} // namespace perception } // namespace perception
......
...@@ -66,7 +66,7 @@ void YoloCameraDetector::init_anchor(const string &yolo_root) { ...@@ -66,7 +66,7 @@ void YoloCameraDetector::init_anchor(const string &yolo_root) {
yolo::load_anchors(anchors_file, &anchors); yolo::load_anchors(anchors_file, &anchors);
num_anchors_ = anchors.size() / 2; num_anchors_ = anchors.size() / 2;
obj_size_ = output_height_ * output_width_ * anchors.size() / 2; obj_size_ = output_height_ * output_width_ * anchors.size() / 2;
anchor_.reset(new SyncedMemory(anchors.size() * sizeof(float))); anchor_.reset(new caffe::SyncedMemory(anchors.size() * sizeof(float)));
auto anchor_cpu_data = anchor_->mutable_cpu_data(); auto anchor_cpu_data = anchor_->mutable_cpu_data();
memcpy(anchor_cpu_data, anchors.data(), anchors.size() * sizeof(float)); memcpy(anchor_cpu_data, anchors.data(), anchors.size() * sizeof(float));
...@@ -77,20 +77,20 @@ void YoloCameraDetector::init_anchor(const string &yolo_root) { ...@@ -77,20 +77,20 @@ void YoloCameraDetector::init_anchor(const string &yolo_root) {
yolo::load_types(types_file, &types_); yolo::load_types(types_file, &types_);
res_box_tensor_.reset( res_box_tensor_.reset(
new SyncedMemory(obj_size_ * s_box_block_size * sizeof(float))); new caffe::SyncedMemory(obj_size_ * s_box_block_size * sizeof(float)));
res_box_tensor_->cpu_data(); res_box_tensor_->cpu_data();
res_box_tensor_->gpu_data(); res_box_tensor_->gpu_data();
res_cls_tensor_.reset( res_cls_tensor_.reset(
new SyncedMemory(types_.size() * obj_size_ * sizeof(float))); new caffe::SyncedMemory(types_.size() * obj_size_ * sizeof(float)));
res_cls_tensor_->cpu_data(); res_cls_tensor_->cpu_data();
res_cls_tensor_->gpu_data(); res_cls_tensor_->gpu_data();
overlapped_.reset(new SyncedMemory(top_k_ * top_k_ * sizeof(bool))); overlapped_.reset(new caffe::SyncedMemory(top_k_ * top_k_ * sizeof(bool)));
overlapped_->cpu_data(); overlapped_->cpu_data();
overlapped_->gpu_data(); overlapped_->gpu_data();
idx_sm_.reset(new SyncedMemory(top_k_ * sizeof(int))); idx_sm_.reset(new caffe::SyncedMemory(top_k_ * sizeof(int)));
idx_sm_->cpu_data(); idx_sm_->cpu_data();
idx_sm_->gpu_data(); idx_sm_->gpu_data();
} }
...@@ -148,7 +148,7 @@ void YoloCameraDetector::load_intrinsic( ...@@ -148,7 +148,7 @@ void YoloCameraDetector::load_intrinsic(
int channel = 3; int channel = 3;
image_data_.reset( image_data_.reset(
new SyncedMemory(roi_w * roi_h * channel * sizeof(unsigned char))); new caffe::SyncedMemory(roi_w * roi_h * channel * sizeof(unsigned char)));
} }
bool YoloCameraDetector::init_cnn(const string &yolo_root) { bool YoloCameraDetector::init_cnn(const string &yolo_root) {
...@@ -310,13 +310,13 @@ bool YoloCameraDetector::Detect(const cv::Mat &frame, ...@@ -310,13 +310,13 @@ bool YoloCameraDetector::Detect(const cv::Mat &frame,
resize(frame(roi), input_blob.get(), image_data_, 0); resize(frame(roi), input_blob.get(), image_data_, 0);
} }
pre_time.Stop(); pre_time.Stop();
ADEBUG << "Pre-processing: " << pre_time.MilliSeconds() << " ms"; AINFO << "Pre-processing: " << pre_time.MilliSeconds() << " ms";
/////////////////////////// detection part /////////////////////////// /////////////////////////// detection part ///////////////////////////
caffe::Timer det_time; caffe::Timer det_time;
det_time.Start(); det_time.Start();
cnnadapter_->forward(); cnnadapter_->forward();
ADEBUG << "Running detection: " << det_time.MilliSeconds() << " ms"; AINFO << "Running detection: " << det_time.MilliSeconds() << " ms";
caffe::Timer post_time; caffe::Timer post_time;
post_time.Start(); post_time.Start();
...@@ -363,8 +363,8 @@ bool YoloCameraDetector::Detect(const cv::Mat &frame, ...@@ -363,8 +363,8 @@ bool YoloCameraDetector::Detect(const cv::Mat &frame,
temp_objects[i].reset(); temp_objects[i].reset();
} }
temp_objects.clear(); temp_objects.clear();
ADEBUG << "Post-processing: " << post_time.MilliSeconds() << " ms"; AINFO << "Post-processing: " << post_time.MilliSeconds() << " ms";
ADEBUG << "Number of detected obstacles: " << objects->size(); AINFO << "Number of detected obstacles: " << objects->size();
Extract(objects); Extract(objects);
yolo::recover_bbox(roi_w, roi_h, offset_y_, objects); yolo::recover_bbox(roi_w, roi_h, offset_y_, objects);
......
...@@ -86,13 +86,13 @@ class YoloCameraDetector : public BaseCameraDetector { ...@@ -86,13 +86,13 @@ class YoloCameraDetector : public BaseCameraDetector {
private: private:
std::shared_ptr<CNNAdapter> cnnadapter_; std::shared_ptr<CNNAdapter> cnnadapter_;
std::shared_ptr<SyncedMemory> res_cls_tensor_ = nullptr; std::shared_ptr<caffe::SyncedMemory> res_cls_tensor_ = nullptr;
std::shared_ptr<SyncedMemory> res_box_tensor_ = nullptr; std::shared_ptr<caffe::SyncedMemory> res_box_tensor_ = nullptr;
std::shared_ptr<SyncedMemory> image_data_ = nullptr; std::shared_ptr<caffe::SyncedMemory> image_data_ = nullptr;
std::shared_ptr<SyncedMemory> overlapped_ = nullptr; std::shared_ptr<caffe::SyncedMemory> overlapped_ = nullptr;
std::shared_ptr<SyncedMemory> idx_sm_ = nullptr; std::shared_ptr<caffe::SyncedMemory> idx_sm_ = nullptr;
std::shared_ptr<SyncedMemory> anchor_ = nullptr; std::shared_ptr<caffe::SyncedMemory> anchor_ = nullptr;
int height_ = 0; int height_ = 0;
int width_ = 0; int width_ = 0;
float min_2d_height_ = 0.0f; float min_2d_height_ = 0.0f;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册