use caffe::SyncedMemory

87b045f3 · ghdawn · Jiangtao Hu · 9330295a · 87b045f3 · 87b045f3
7 changed file
--- a/modules/perception/cuda_util/region_output.cu
+++ b/modules/perception/cuda_util/region_output.cu
@@ -100,7 +100,7 @@ void compute_overlapped_by_idx_gpu(const int nthreads,
 void apply_nms_gpu(const float *bbox_data, const float *conf_data,
                   const int num_bboxes, const float confidence_threshold,
                   const int top_k, const float nms_threshold, std::vector<int> *indices,
-                   std::shared_ptr <SyncedMemory> overlapped, std::shared_ptr <SyncedMemory> idx_sm) {
+                   std::shared_ptr <caffe::SyncedMemory> overlapped, std::shared_ptr <caffe::SyncedMemory> idx_sm) {
    // Keep part of detections whose scores are higher than confidence threshold.
    cudaDeviceSynchronize();
    std::vector<int> idx;

--- a/modules/perception/cuda_util/region_output.h
+++ b/modules/perception/cuda_util/region_output.h
@@ -112,8 +112,8 @@ void apply_nms_gpu(const float *bbox_data, const float *conf_data,
                   const int num_bboxes, const float confidence_threshold,
                   const int top_k, const float nms_threshold,
                   std::vector<int> *indices,
-                   std::shared_ptr<SyncedMemory> overlappe,
-                   std::shared_ptr<SyncedMemory> idx_sm);
+                   std::shared_ptr<caffe::SyncedMemory> overlappe,
+                   std::shared_ptr<caffe::SyncedMemory> idx_sm);
 void compute_overlapped_by_idx_gpu(const int nthreads, const float *bbox_data,
                                   const float overlap_threshold,
                                   const int *idx, const int num_idx,

--- a/modules/perception/cuda_util/util.cc
+++ b/modules/perception/cuda_util/util.cc
@@ -19,131 +19,6 @@
 namespace apollo {
 namespace perception {

-SyncedMemory::~SyncedMemory() {
-  if (cpu_ptr_ && own_cpu_data_) {
-    PerceptionFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
-  }
-  if (gpu_ptr_ && own_gpu_data_) {
-    int initial_device = -1;
-    cudaGetDevice(&initial_device);
-    if (gpu_device_ != -1) {
-      CUDA_CHECK(cudaSetDevice(gpu_device_));
-    }
-    CUDA_CHECK(cudaFree(gpu_ptr_));
-    cudaSetDevice(initial_device);
-  }
-}
-
-inline void SyncedMemory::to_cpu() {
-  switch (head_) {
-    case UNINITIALIZED:
-      PerceptionMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
-      perception_memset(size_, 0, cpu_ptr_);
-      head_ = HEAD_AT_CPU;
-      own_cpu_data_ = true;
-      break;
-    case HEAD_AT_GPU:
-      if (cpu_ptr_ == NULL) {
-        PerceptionMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
-        own_cpu_data_ = true;
-      }
-      gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
-      head_ = SYNCED;
-      break;
-    case HEAD_AT_CPU:
-    case SYNCED:
-      break;
-  }
-}
-
-inline void SyncedMemory::to_gpu() {
-  switch (head_) {
-    case UNINITIALIZED:
-      CUDA_CHECK(cudaGetDevice(&gpu_device_));
-      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
-      perception_gpu_memset(size_, 0, gpu_ptr_);
-      head_ = HEAD_AT_GPU;
-      own_gpu_data_ = true;
-      break;
-    case HEAD_AT_CPU:
-      if (gpu_ptr_ == NULL) {
-        CUDA_CHECK(cudaGetDevice(&gpu_device_));
-        CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
-        own_gpu_data_ = true;
-      }
-      gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
-      head_ = SYNCED;
-      break;
-    case HEAD_AT_GPU:
-    case SYNCED:
-      break;
-  }
-}
-
-const void *SyncedMemory::cpu_data() {
-  to_cpu();
-  return (const void *)cpu_ptr_;
-}
-
-void SyncedMemory::set_cpu_data(void *data) {
-  if (data == nullptr) {
-    return;
-  }
-  if (own_cpu_data_) {
-    PerceptionFreeHost(cpu_ptr_, cpu_malloc_use_cuda_);
-  }
-  cpu_ptr_ = data;
-  head_ = HEAD_AT_CPU;
-  own_cpu_data_ = false;
-}
-
-const void *SyncedMemory::gpu_data() {
-  to_gpu();
-  return (const void *)gpu_ptr_;
-}
-
-void SyncedMemory::set_gpu_data(void *data) {
-  if (data == nullptr) {
-    return;
-  }
-  if (own_gpu_data_) {
-    int initial_device = -1;
-    cudaGetDevice(&initial_device);
-    if (gpu_device_ != -1) {
-      CUDA_CHECK(cudaSetDevice(gpu_device_));
-    }
-    CUDA_CHECK(cudaFree(gpu_ptr_));
-    cudaSetDevice(initial_device);
-  }
-  gpu_ptr_ = data;
-  head_ = HEAD_AT_GPU;
-  own_gpu_data_ = false;
-}
-
-void *SyncedMemory::mutable_cpu_data() {
-  to_cpu();
-  head_ = HEAD_AT_CPU;
-  return cpu_ptr_;
-}
-
-void *SyncedMemory::mutable_gpu_data() {
-  to_gpu();
-  head_ = HEAD_AT_GPU;
-  return gpu_ptr_;
-}
-
-void SyncedMemory::async_gpu_push(const cudaStream_t &stream) {
-  CHECK(head_ == HEAD_AT_CPU);
-  if (gpu_ptr_ == NULL) {
-    CUDA_CHECK(cudaGetDevice(&gpu_device_));
-    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
-    own_gpu_data_ = true;
-  }
-  const cudaMemcpyKind put = cudaMemcpyHostToDevice;
-  CUDA_CHECK(cudaMemcpyAsync(gpu_ptr_, cpu_ptr_, size_, put, stream));
-  // Assume caller will synchronize on the stream before use
-  head_ = SYNCED;
-}

 }  // namespace perception
 }  // namespace apollo
--- a/modules/perception/cuda_util/util.cu
+++ b/modules/perception/cuda_util/util.cu
@@ -126,7 +126,7 @@ void gpu_memcpy(const size_t N, const void *X, void *Y) {
    }
 }

-void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemory> src_gpu,
+void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <caffe::SyncedMemory> src_gpu,
            int start_axis) {
    int origin_width = frame.cols;
    int origin_height = frame.rows;
@@ -140,7 +140,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor
    const dim3 grid(divup(width, block.x), divup(height, block.y));
    if (src_gpu == nullptr) {
        src_gpu.reset(
-                new SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char)));
+                new caffe::SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char)));
    }
    src_gpu->set_cpu_data(frame.data);
    resize_linear_kernel << < grid, block >> > ((const unsigned char *) src_gpu->gpu_data(), dst
@@ -148,7 +148,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor

 }

-void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemory> src_gpu,
+void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <caffe::SyncedMemory> src_gpu,
            int start_axis, const float mean_b, const float mean_g, const float mean_r, 
            const float scale) {
    int origin_width = frame.cols;
@@ -163,7 +163,7 @@ void resize(cv::Mat frame, caffe::Blob<float> *dst, std::shared_ptr <SyncedMemor
    const dim3 grid(divup(width, block.x), divup(height, block.y));
    if (src_gpu == nullptr) {
        src_gpu.reset(
-                new SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char)));
+                new caffe::SyncedMemory(origin_width * origin_height * channel * sizeof(unsigned char)));
    }
    src_gpu->set_cpu_data(frame.data);
    resize_linear_with_mean_scale_kernel << < grid, block >> > ((const unsigned char *) src_gpu

--- a/modules/perception/cuda_util/util.h
+++ b/modules/perception/cuda_util/util.h
@@ -55,71 +55,14 @@ inline void PerceptionFreeHost(void *ptr, bool use_cuda) {
  return;
 }

-class SyncedMemory {
- public:
-  SyncedMemory()
-      : cpu_ptr_(NULL),
-        gpu_ptr_(NULL),
-        size_(0),
-        head_(UNINITIALIZED),
-        own_cpu_data_(false),
-        cpu_malloc_use_cuda_(false),
-        own_gpu_data_(false),
-        gpu_device_(-1) {}
-  explicit SyncedMemory(size_t size)
-      : cpu_ptr_(NULL),
-        gpu_ptr_(NULL),
-        size_(size),
-        head_(UNINITIALIZED),
-        own_cpu_data_(false),
-        cpu_malloc_use_cuda_(false),
-        own_gpu_data_(false),
-        gpu_device_(-1) {}
-
-  ~SyncedMemory();
-
-  const void *cpu_data();
-
-  void set_cpu_data(void *data);
-
-  const void *gpu_data();
-
-  void set_gpu_data(void *data);
-
-  void *mutable_cpu_data();
-
-  void *mutable_gpu_data();
-
-  enum SyncedHead { UNINITIALIZED, HEAD_AT_CPU, HEAD_AT_GPU, SYNCED };
-  SyncedHead head() { return head_; }
-  size_t size() { return size_; }
-
-  void async_gpu_push(const cudaStream_t &stream);
-
- private:
-  void to_cpu();
-
-  void to_gpu();
-
-  void *cpu_ptr_;
-  void *gpu_ptr_;
-  size_t size_;
-  SyncedHead head_;
-  bool own_cpu_data_;
-  bool cpu_malloc_use_cuda_;
-  bool own_gpu_data_;
-  int gpu_device_;
-  DISABLE_COPY_AND_ASSIGN(SyncedMemory);
-};  // class SyncedMemory
-
 int divup(int a, int b);

 void resize(cv::Mat frame, caffe::Blob<float> *dst,
-            std::shared_ptr<SyncedMemory> src_gpu, int start_axis);
+            std::shared_ptr<caffe::SyncedMemory> src_gpu, int start_axis);

 // resize with mean and scale
 void resize(cv::Mat frame, caffe::Blob<float> *dst,
-            std::shared_ptr<SyncedMemory> src_gpu, int start_axis,
+            std::shared_ptr<caffe::SyncedMemory> src_gpu, int start_axis,
            const float mean_b, const float mean_g, const float mean_r,
            const float scale);
 }  // namespace perception

--- a/modules/perception/obstacle/camera/detector/yolo_camera_detector/yolo_camera_detector.cc
+++ b/modules/perception/obstacle/camera/detector/yolo_camera_detector/yolo_camera_detector.cc
@@ -66,7 +66,7 @@ void YoloCameraDetector::init_anchor(const string &yolo_root) {
  yolo::load_anchors(anchors_file, &anchors);
  num_anchors_ = anchors.size() / 2;
  obj_size_ = output_height_ * output_width_ * anchors.size() / 2;
-  anchor_.reset(new SyncedMemory(anchors.size() * sizeof(float)));
+  anchor_.reset(new caffe::SyncedMemory(anchors.size() * sizeof(float)));

  auto anchor_cpu_data = anchor_->mutable_cpu_data();
  memcpy(anchor_cpu_data, anchors.data(), anchors.size() * sizeof(float));
@@ -77,20 +77,20 @@ void YoloCameraDetector::init_anchor(const string &yolo_root) {
  yolo::load_types(types_file, &types_);

  res_box_tensor_.reset(
-      new SyncedMemory(obj_size_ * s_box_block_size * sizeof(float)));
+      new caffe::SyncedMemory(obj_size_ * s_box_block_size * sizeof(float)));
  res_box_tensor_->cpu_data();
  res_box_tensor_->gpu_data();

  res_cls_tensor_.reset(
-      new SyncedMemory(types_.size() * obj_size_ * sizeof(float)));
+      new caffe::SyncedMemory(types_.size() * obj_size_ * sizeof(float)));
  res_cls_tensor_->cpu_data();
  res_cls_tensor_->gpu_data();

-  overlapped_.reset(new SyncedMemory(top_k_ * top_k_ * sizeof(bool)));
+  overlapped_.reset(new caffe::SyncedMemory(top_k_ * top_k_ * sizeof(bool)));
  overlapped_->cpu_data();
  overlapped_->gpu_data();

-  idx_sm_.reset(new SyncedMemory(top_k_ * sizeof(int)));
+  idx_sm_.reset(new caffe::SyncedMemory(top_k_ * sizeof(int)));
  idx_sm_->cpu_data();
  idx_sm_->gpu_data();
 }
@@ -148,7 +148,7 @@ void YoloCameraDetector::load_intrinsic(

  int channel = 3;
  image_data_.reset(
-      new SyncedMemory(roi_w * roi_h * channel * sizeof(unsigned char)));
+      new caffe::SyncedMemory(roi_w * roi_h * channel * sizeof(unsigned char)));
 }

 bool YoloCameraDetector::init_cnn(const string &yolo_root) {
@@ -310,13 +310,13 @@ bool YoloCameraDetector::Detect(const cv::Mat &frame,
    resize(frame(roi), input_blob.get(), image_data_, 0);
  }
  pre_time.Stop();
-  ADEBUG << "Pre-processing: " << pre_time.MilliSeconds() << " ms";
+  AINFO << "Pre-processing: " << pre_time.MilliSeconds() << " ms";

  /////////////////////////// detection part ///////////////////////////
  caffe::Timer det_time;
  det_time.Start();
  cnnadapter_->forward();
-  ADEBUG << "Running detection: " << det_time.MilliSeconds() << " ms";
+  AINFO << "Running detection: " << det_time.MilliSeconds() << " ms";
  caffe::Timer post_time;
  post_time.Start();

@@ -363,8 +363,8 @@ bool YoloCameraDetector::Detect(const cv::Mat &frame,
    temp_objects[i].reset();
  }
  temp_objects.clear();
-  ADEBUG << "Post-processing: " << post_time.MilliSeconds() << " ms";
-  ADEBUG << "Number of detected obstacles: " << objects->size();
+  AINFO << "Post-processing: " << post_time.MilliSeconds() << " ms";
+  AINFO << "Number of detected obstacles: " << objects->size();

  Extract(objects);
  yolo::recover_bbox(roi_w, roi_h, offset_y_, objects);

--- a/modules/perception/obstacle/camera/detector/yolo_camera_detector/yolo_camera_detector.h
+++ b/modules/perception/obstacle/camera/detector/yolo_camera_detector/yolo_camera_detector.h
@@ -86,13 +86,13 @@ class YoloCameraDetector : public BaseCameraDetector {
 private:
  std::shared_ptr<CNNAdapter> cnnadapter_;

-  std::shared_ptr<SyncedMemory> res_cls_tensor_ = nullptr;
-  std::shared_ptr<SyncedMemory> res_box_tensor_ = nullptr;
+  std::shared_ptr<caffe::SyncedMemory> res_cls_tensor_ = nullptr;
+  std::shared_ptr<caffe::SyncedMemory> res_box_tensor_ = nullptr;

-  std::shared_ptr<SyncedMemory> image_data_ = nullptr;
-  std::shared_ptr<SyncedMemory> overlapped_ = nullptr;
-  std::shared_ptr<SyncedMemory> idx_sm_ = nullptr;
-  std::shared_ptr<SyncedMemory> anchor_ = nullptr;
+  std::shared_ptr<caffe::SyncedMemory> image_data_ = nullptr;
+  std::shared_ptr<caffe::SyncedMemory> overlapped_ = nullptr;
+  std::shared_ptr<caffe::SyncedMemory> idx_sm_ = nullptr;
+  std::shared_ptr<caffe::SyncedMemory> anchor_ = nullptr;
  int height_ = 0;
  int width_ = 0;
  float min_2d_height_ = 0.0f;