fix yolobox_cuda bug

* fix yolobox_cuda bug * update code format

fix yolobox_cuda bug
* fix yolobox_cuda bug * update code format
f4ac2768 · Wilber · GitHub · 4bad9853 · f4ac2768 · f4ac2768
隐藏空白更改
内联并排

Showing 2 changed files with 143 additions and 4 deletions

lite/backends/cuda/math/conv_op_cache_cudnn.h lite/backends/cuda/math/conv_op_cache_cudnn.h +133 -0

lite/kernels/cuda/yolo_box_compute.cu lite/kernels/cuda/yolo_box_compute.cu +10 -4

未找到文件。
--- a/lite/backends/cuda/math/conv_op_cache_cudnn.h
+++ b/lite/backends/cuda/math/conv_op_cache_cudnn.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <unordered_map>
+#include <vector>
+namespace paddle {
+namespace lite {
+namespace cuda {
+namespace math {
+// Memoizes algorithm choices keyed by a 64-bit integer so that the (caller
+// supplied) selection callback is not re-run for inputs already seen.
+// Both GetAlgorithm overloads store their results in the same map.
+// Not thread-safe. Should be owned per-kernel.
+template <typename TAlgorithm>
+class AlgorithmsCache {
+ public:
+  // Starts with an empty cache and a search counter of zero.
+  AlgorithmsCache() : hash_(), search_times_(0) {}
+
+  // Looks up (or generates through gen_func and stores) the algorithm for a
+  // combination of two dimension lists, strides, paddings, dilations and
+  // algorithmFlags. All of these inputs are folded into the cache key.
+  TAlgorithm GetAlgorithm(
+      const std::vector<int64_t>& dims1,
+      const std::vector<int64_t>& dims2,
+      const std::vector<int>& strides,
+      const std::vector<int>& paddings,
+      const std::vector<int>& dilations,
+      int algorithmFlags,  // can set for different data type
+      std::function<TAlgorithm()> gen_func);
+
+  // Looks up the algorithm stored under `area`. gen_func is only invoked
+  // while fewer than `search_times` searches have been performed through
+  // this overload; see the definition for what is returned afterwards.
+  TAlgorithm GetAlgorithm(int64_t area,
+                          int search_times,
+                          int algorithmFlags,
+                          std::function<TAlgorithm()> gen_func);
+
+ private:
+  // Number of gen_func searches performed by the area-based overload.
+  int search_times_;
+  // Cache key -> selected algorithm.
+  std::unordered_map<int64_t, TAlgorithm> hash_;
+};
+// Returns the algorithm cached for the given combination of inputs, calling
+// gen_func to produce (and store) it on the first request.
+//
+// dims1, dims2, strides, paddings, dilations and algorithmFlags are all
+// folded, in that order, into a single 64-bit cache key. Each group adds a
+// distinct constant (0..5) so that the same number contributes differently
+// depending on which group it belongs to.
+//
+// A key of exactly 0 is never cached: gen_func is called and its result is
+// returned directly.
+template <typename TAlgorithm>
+TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
+    const std::vector<int64_t>& dims1,
+    const std::vector<int64_t>& dims2,
+    const std::vector<int>& strides,
+    const std::vector<int>& paddings,
+    const std::vector<int>& dilations,
+    int algorithmFlags,
+    std::function<TAlgorithm()> gen_func) {
+  // Hash all of the inputs, use to try and look up a previously
+  // discovered algorithm, or fall back to generating a new one.
+  std::hash<int64_t> hashFn;
+  // The accumulator is unsigned on purpose: left-shifting a negative signed
+  // value is undefined behaviour before C++20, and the mixed value quickly
+  // has its top bit set.
+  uint64_t seed = 0;
+  // do hash like boost
+  // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
+  auto mix = [&seed, &hashFn](int64_t value, uint64_t salt) {
+    seed ^= static_cast<uint64_t>(hashFn(value)) + 0x9e3779b9 + (seed << 6) +
+            (seed >> 2) + salt;
+  };
+  for (const auto num : dims1) {
+    mix(num, 0);
+  }
+  for (const auto num : dims2) {
+    mix(num, 1);
+  }
+  for (const auto num : strides) {
+    mix(static_cast<int64_t>(num), 2);
+  }
+  for (const auto num : paddings) {
+    mix(static_cast<int64_t>(num), 3);
+  }
+  for (const auto num : dilations) {
+    mix(static_cast<int64_t>(num), 4);
+  }
+  mix(static_cast<int64_t>(algorithmFlags), 5);
+
+  // The map is keyed by int64_t, so reinterpret the mixed bits as signed.
+  const int64_t key = static_cast<int64_t>(seed);
+  VLOG(10) << "seed:" << key << ", hash_.size:" << hash_.size();
+  if (key == 0) return gen_func();
+
+  // Single lookup on a hit; on a miss generate once and insert.
+  auto cached = hash_.find(key);
+  if (cached == hash_.end()) {
+    cached = hash_.emplace(key, gen_func()).first;
+  }
+  return cached->second;
+}
+// Returns the algorithm associated with `area`.
+//
+// - If `area` is already cached, the cached algorithm is returned.
+// - Otherwise, while fewer than `search_times` searches have been performed
+//   through this overload, gen_func is called, its result is cached under
+//   `area`, and the search counter is incremented.
+// - Once the search budget is used up, gen_func is no longer called: the
+//   algorithm stored under the smallest key in the cache is returned, or a
+//   value-initialized TAlgorithm if the cache is empty.
+//
+// algorithmFlags is accepted for interface compatibility but is not used by
+// this overload.
+template <typename TAlgorithm>
+TAlgorithm AlgorithmsCache<TAlgorithm>::GetAlgorithm(
+    int64_t area,
+    int search_times,
+    int algorithmFlags,
+    std::function<TAlgorithm()> gen_func) {
+  const auto cached = hash_.find(area);
+  if (cached != hash_.end()) {
+    return cached->second;
+  }
+  if (search_times_ < search_times) {
+    auto algo = gen_func();
+    hash_[area] = algo;
+    ++search_times_;
+    return algo;
+  }
+  // Search budget exhausted: reuse the entry with the smallest key. Comparing
+  // the entries directly (instead of against an INT_MAX sentinel) also works
+  // when every cached key is INT_MAX or larger.
+  using Entry = typename std::unordered_map<int64_t, TAlgorithm>::value_type;
+  const auto smallest = std::min_element(
+      hash_.begin(), hash_.end(), [](const Entry& lhs, const Entry& rhs) {
+        return lhs.first < rhs.first;
+      });
+  if (smallest == hash_.end()) {
+    return TAlgorithm{};
+  }
+  return smallest->second;
+}
+}  // namespace math
+}  // namespace cuda
+}  // namespace lite
+}  // namespace paddle
--- a/lite/kernels/cuda/yolo_box_compute.cu
+++ b/lite/kernels/cuda/yolo_box_compute.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <vector>
 #include "lite/core/op_registry.h"
 #include "lite/kernels/cuda/yolo_box_compute.h"
+// #include "lite/core/target_wrapper.h"
 namespace paddle {
 namespace lite {
@@ -94,7 +95,7 @@ __host__ __device__ inline void CalcLabelScore(T* scores,
 template <typename T>
 __global__ void KeYoloBoxFw(const T* input,
-                            const T* imgsize,
+                            const int* imgsize,
                            T* boxes,
                            T* scores,
                            const float conf_thresh,
@@ -117,8 +118,8 @@ __global__ void KeYoloBoxFw(const T* input,
    int l = tid % w;
    int an_stride = (5 + class_num) * grid_num;
-    int img_height = static_cast<int>(imgsize[2 * i]);
+    int img_height = imgsize[2 * i];
-    int img_width = static_cast<int>(imgsize[2 * i + 1]);
+    int img_width = imgsize[2 * i + 1];
    int obj_idx =
        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4);
@@ -167,7 +168,7 @@ void YoloBoxCompute::Run() {
  int downsample_ratio = param.downsample_ratio;
  const float* input = X->data<float>();
-  const float* imgsize = ImgSize->data<float>();
+  const int* imgsize = ImgSize->data<int>();
  float* boxes = Boxes->mutable_data<float>(TARGET(kCUDA));
  float* scores = Scores->mutable_data<float>(TARGET(kCUDA));
@@ -180,6 +181,11 @@ void YoloBoxCompute::Run() {
  anchors_.Resize({static_cast<int64_t>(anchors.size())});
  int* d_anchors = anchors_.mutable_data<int>(TARGET(kCUDA));
+  // TargetWrapperCuda::MemcpyAsync(d_anchors,
+  //                               anchors.data(),
+  //                               sizeof(int) * anchors.size(),
+  //                               IoDirection::HtoD,
+  //                               stream);
  CopySync<TARGET(kCUDA)>(d_anchors,
                          anchors.data(),
                          sizeof(int) * anchors.size(),