akaze: fix T-API interfaces, disable OpenCL code

- OpenCL kernels don't handle matrices properly. Assumptions are not checked. - OpenCL/T-API integration is not correct.

akaze: fix T-API interfaces, disable OpenCL code
- OpenCL kernels don't handle matrices properly. Assumptions are not checked. - OpenCL/T-API integration is not correct.
e0489cb4 · Alexander Alekhin · 922ac1a1 · e0489cb4 · e0489cb4
Showing with 116 addition and 135 deletion

modules/features2d/src/kaze/AKAZEFeatures.cpp modules/features2d/src/kaze/AKAZEFeatures.cpp +112 -126

modules/features2d/src/kaze/AKAZEFeatures.h modules/features2d/src/kaze/AKAZEFeatures.h +4 -9

未找到文件。
--- a/modules/features2d/src/kaze/AKAZEFeatures.cpp
+++ b/modules/features2d/src/kaze/AKAZEFeatures.cpp
@@ -15,6 +15,10 @@

 #include <iostream>

+#ifdef HAVE_OPENCL // OpenCL is not well supported
+#undef HAVE_OPENCL
+#endif
+
 // Namespaces
 namespace cv
 {
@@ -251,38 +255,41 @@ private:

 #ifdef HAVE_OPENCL
 static inline bool
-ocl_non_linear_diffusion_step(const UMat& Lt, const UMat& Lf, UMat& Lstep, float step_size)
+ocl_non_linear_diffusion_step(InputArray Lt_, InputArray Lf_, OutputArray Lstep_, float step_size)
 {
-  if(!Lt.isContinuous())
-    return false;
+    if (!Lt_.isContinuous())
+        return false;
+
+    UMat Lt = Lt_.getUMat(), Lf = Lf_.getUMat(), Lstep = Lstep_.getUMat();

-  size_t globalSize[] = {(size_t)Lt.cols, (size_t)Lt.rows};
+    size_t globalSize[] = {(size_t)Lt.cols, (size_t)Lt.rows};

-  ocl::Kernel ker("AKAZE_nld_step_scalar", ocl::features2d::akaze_oclsrc);
-  if( ker.empty() )
-    return false;
+    ocl::Kernel ker("AKAZE_nld_step_scalar", ocl::features2d::akaze_oclsrc);
+    if (ker.empty())
+        return false;

-  return ker.args(
-    ocl::KernelArg::ReadOnly(Lt),
-    ocl::KernelArg::PtrReadOnly(Lf),
-    ocl::KernelArg::PtrWriteOnly(Lstep),
-    step_size).run(2, globalSize, 0, true);
+    return ker.args(
+            ocl::KernelArg::ReadOnly(Lt),
+            ocl::KernelArg::PtrReadOnly(Lf),
+            ocl::KernelArg::PtrWriteOnly(Lstep),
+            step_size)
+    .run(2, globalSize, 0, true);
 }
 #endif // HAVE_OPENCL

 static inline void
-non_linear_diffusion_step(const UMat& Lt, const UMat& Lf, UMat& Lstep, float step_size)
+non_linear_diffusion_step(InputArray Lt, InputArray Lf, OutputArray Lstep, float step_size)
 {
  CV_INSTRUMENT_REGION()

  Lstep.create(Lt.size(), Lt.type());

-  CV_OCL_RUN(true, ocl_non_linear_diffusion_step(Lt, Lf, Lstep, step_size));
+#ifdef HAVE_OPENCL
+  CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Lstep.isUMat()), ocl_non_linear_diffusion_step(Lt, Lf, Lstep, step_size));
+#endif

-  // when on CPU UMats should be already allocated on CPU so getMat here is basicallly no-op
-  Mat Mstep = Lstep.getMat(ACCESS_WRITE);
-  parallel_for_(Range(0, Lt.rows), NonLinearScalarDiffusionStep(Lt.getMat(ACCESS_READ),
-    Lf.getMat(ACCESS_READ), Mstep, step_size));
+  Mat Mstep = Lstep.getMat();
+  parallel_for_(Range(0, Lt.rows()), NonLinearScalarDiffusionStep(Lt.getMat(), Lf.getMat(), Mstep, step_size));
 }

 /**
@@ -347,25 +354,28 @@ compute_kcontrast(const cv::Mat& Lx, const cv::Mat& Ly, float perc, int nbins)

 #ifdef HAVE_OPENCL
 static inline bool
-ocl_pm_g2(const UMat& Lx, const UMat& Ly, UMat& Lflow, float kcontrast)
+ocl_pm_g2(InputArray Lx_, InputArray Ly_, OutputArray Lflow_, float kcontrast)
 {
-  int total = Lx.rows * Lx.cols;
-  size_t globalSize[] = {(size_t)total};
-
-  ocl::Kernel ker("AKAZE_pm_g2", ocl::features2d::akaze_oclsrc);
-  if( ker.empty() )
-    return false;
-
-  return ker.args(
-    ocl::KernelArg::PtrReadOnly(Lx),
-    ocl::KernelArg::PtrReadOnly(Ly),
-    ocl::KernelArg::PtrWriteOnly(Lflow),
-    kcontrast, total).run(1, globalSize, 0, true);
+    UMat Lx = Lx_.getUMat(), Ly = Ly_.getUMat(), Lflow = Lflow_.getUMat();
+
+    int total = Lx.rows * Lx.cols;
+    size_t globalSize[] = {(size_t)total};
+
+    ocl::Kernel ker("AKAZE_pm_g2", ocl::features2d::akaze_oclsrc);
+    if (ker.empty())
+        return false;
+
+    return ker.args(
+            ocl::KernelArg::PtrReadOnly(Lx),
+            ocl::KernelArg::PtrReadOnly(Ly),
+            ocl::KernelArg::PtrWriteOnly(Lflow),
+            kcontrast, total)
+    .run(1, globalSize, 0, true);
 }
 #endif // HAVE_OPENCL

 static inline void
-compute_diffusivity(const UMat& Lx, const UMat& Ly, UMat& Lflow, float kcontrast, int diffusivity)
+compute_diffusivity(InputArray Lx, InputArray Ly, OutputArray Lflow, float kcontrast, int diffusivity)
 {
  CV_INSTRUMENT_REGION()

@@ -376,7 +386,9 @@ compute_diffusivity(const UMat& Lx, const UMat& Ly, UMat& Lflow, float kcontrast
      pm_g1(Lx, Ly, Lflow, kcontrast);
    break;
    case KAZE::DIFF_PM_G2:
-      CV_OCL_RUN(true, ocl_pm_g2(Lx, Ly, Lflow, kcontrast));
+#ifdef HAVE_OPENCL
+      CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Lflow.isUMat()), ocl_pm_g2(Lx, Ly, Lflow, kcontrast));
+#endif
      pm_g2(Lx, Ly, Lflow, kcontrast);
    break;
    case KAZE::DIFF_WEICKERT:
@@ -391,32 +403,6 @@ compute_diffusivity(const UMat& Lx, const UMat& Ly, UMat& Lflow, float kcontrast
  }
 }

-/**
- * @brief Fetches pyramid from the gpu.
- * @details Setups mapping for matrices that might be probably on the GPU, if the
- * code executes with OpenCL. This will setup MLx, MLy, Mdet members in the pyramid with
- * mapping to respective UMats. This must be called before CPU-only parts of AKAZE, that work
- * only on these Mats.
- *
- * This prevents mapping/unmapping overhead (and possible uploads/downloads) that would occur, if
- * we just create Mats from UMats each time we need it later. This has devastating effects on OCL
- * performace.
- *
- * @param evolution Pyramid to download
- */
-static inline void downloadPyramid(std::vector<Evolution>& evolution)
-{
-  CV_INSTRUMENT_REGION()
-
-  for (size_t i = 0; i < evolution.size(); ++i) {
-    Evolution& e = evolution[i];
-    e.Mx = e.Lx.getMat(ACCESS_READ);
-    e.My = e.Ly.getMat(ACCESS_READ);
-    e.Mt = e.Lt.getMat(ACCESS_READ);
-    e.Mdet = e.Ldet.getMat(ACCESS_READ);
-  }
-}
-
 /**
 * @brief This method creates the nonlinear scale space for a given image
 * @param img Input image for which the nonlinear scale space needs to be created
@@ -435,12 +421,11 @@ void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
  if (evolution_.size() == 1) {
    // we don't need to compute kcontrast factor
    Compute_Determinant_Hessian_Response();
-    downloadPyramid(evolution_);
    return;
  }

  // derivatives, flow and diffusion step
-  UMat Lx, Ly, Lsmooth, Lflow, Lstep;
+  Mat Lx, Ly, Lsmooth, Lflow, Lstep;

  // compute derivatives for computing k contrast
  GaussianBlur(img, Lsmooth, Size(5, 5), 1.0f, 1.0f, BORDER_REPLICATE);
@@ -448,8 +433,7 @@ void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
  Scharr(Lsmooth, Ly, CV_32F, 0, 1, 1, 0, BORDER_DEFAULT);
  Lsmooth.release();
  // compute the kcontrast factor
-  float kcontrast = compute_kcontrast(Lx.getMat(ACCESS_READ), Ly.getMat(ACCESS_READ),
-    options_.kcontrast_percentile, options_.kcontrast_nbins);
+  float kcontrast = compute_kcontrast(Lx, Ly, options_.kcontrast_percentile, options_.kcontrast_nbins);

  // Now generate the rest of evolution levels
  for (size_t i = 1; i < evolution_.size(); i++) {
@@ -483,31 +467,30 @@ void AKAZEFeatures::Create_Nonlinear_Scale_Space(InputArray img)
  }

  Compute_Determinant_Hessian_Response();
-  downloadPyramid(evolution_);
-
-  return;
 }

 /* ************************************************************************* */

 #ifdef HAVE_OPENCL
 static inline bool
-ocl_compute_determinant(const UMat& Lxx, const UMat& Lxy, const UMat& Lyy,
-  UMat& Ldet, float sigma)
+ocl_compute_determinant(InputArray Lxx_, InputArray Lxy_, InputArray Lyy_, OutputArray Ldet_, float sigma)
 {
-  const int total = Lxx.rows * Lxx.cols;
-  size_t globalSize[] = {(size_t)total};
-
-  ocl::Kernel ker("AKAZE_compute_determinant", ocl::features2d::akaze_oclsrc);
-  if( ker.empty() )
-    return false;
-
-  return ker.args(
-    ocl::KernelArg::PtrReadOnly(Lxx),
-    ocl::KernelArg::PtrReadOnly(Lxy),
-    ocl::KernelArg::PtrReadOnly(Lyy),
-    ocl::KernelArg::PtrWriteOnly(Ldet),
-    sigma, total).run(1, globalSize, 0, true);
+    UMat Lxx = Lxx_.getUMat(), Lxy = Lxy_.getUMat(), Lyy = Lyy_.getUMat(), Ldet = Ldet_.getUMat();
+
+    const int total = Lxx.rows * Lxx.cols;
+    size_t globalSize[] = {(size_t)total};
+
+    ocl::Kernel ker("AKAZE_compute_determinant", ocl::features2d::akaze_oclsrc);
+    if (ker.empty())
+        return false;
+
+    return ker.args(
+            ocl::KernelArg::PtrReadOnly(Lxx),
+            ocl::KernelArg::PtrReadOnly(Lxy),
+            ocl::KernelArg::PtrReadOnly(Lyy),
+            ocl::KernelArg::PtrWriteOnly(Ldet),
+            sigma, total)
+    .run(1, globalSize, 0, true);
 }
 #endif // HAVE_OPENCL

@@ -521,27 +504,30 @@ ocl_compute_determinant(const UMat& Lxx, const UMat& Lxy, const UMat& Lyy,
 * @param Ldet output determinant
 * @param sigma determinant will be scaled by this sigma
 */
-static inline void compute_determinant(const UMat& Lxx, const UMat& Lxy, const UMat& Lyy,
-  UMat& Ldet, float sigma)
+static inline void compute_determinant(InputArray Lxx, InputArray Lxy, InputArray Lyy, OutputArray Ldet, float sigma)
 {
-  CV_INSTRUMENT_REGION()
+    CV_INSTRUMENT_REGION()

-  Ldet.create(Lxx.size(), Lxx.type());
+    Ldet.create(Lxx.size(), Lxx.type());

-  CV_OCL_RUN(true, ocl_compute_determinant(Lxx, Lxy, Lyy, Ldet, sigma));
-
-  // output determinant
-  Mat Mxx = Lxx.getMat(ACCESS_READ), Mxy = Lxy.getMat(ACCESS_READ), Myy = Lyy.getMat(ACCESS_READ);
-  Mat Mdet = Ldet.getMat(ACCESS_WRITE);
-  float *lxx = Mxx.ptr<float>();
-  float *lxy = Mxy.ptr<float>();
-  float *lyy = Myy.ptr<float>();
-  float *ldet = Mdet.ptr<float>();
-  const int total = Lxx.cols * Lxx.rows;
-  for (int j = 0; j < total; j++) {
-    ldet[j] = (lxx[j] * lyy[j] - lxy[j] * lxy[j]) * sigma;
-  }
+#ifdef HAVE_OPENCL
+    CV_OCL_RUN(OCL_PERFORMANCE_CHECK(Ldet.isUMat()), ocl_compute_determinant(Lxx, Lxy, Lyy, Ldet, sigma));
+#endif

+    // output determinant
+    Mat Mxx = Lxx.getMat(), Mxy = Lxy.getMat(), Myy = Lyy.getMat(), Mdet = Ldet.getMat();
+    const int W = Mxx.cols, H = Mxx.rows;
+    for (int y = 0; y < H; y++)
+    {
+        float *lxx = Mxx.ptr<float>(y);
+        float *lxy = Mxy.ptr<float>(y);
+        float *lyy = Myy.ptr<float>(y);
+        float *ldet = Mdet.ptr<float>(y);
+        for (int x = 0; x < W; x++)
+        {
+            ldet[x] = (lxx[x] * lyy[x] - lxy[x] * lxy[x]) * sigma;
+        }
+    }
 }

 class DeterminantHessianResponse : public ParallelLoopBody
@@ -554,7 +540,7 @@ public:

  void operator()(const Range& range) const
  {
-    UMat Lxx, Lxy, Lyy;
+    Mat Lxx, Lxy, Lyy;

    for (int i = range.start; i < range.end; i++)
    {
@@ -670,16 +656,16 @@ public:
      const Evolution &e = (*evolution_)[i];
      Mat &kpts = (*keypoints_by_layers_)[i];
      // this mask will hold positions of keypoints in this level
-      kpts = Mat::zeros(e.Mdet.size(), CV_8UC1);
+      kpts = Mat::zeros(e.Ldet.size(), CV_8UC1);

      // if border is too big we shouldn't search any keypoints
      if (e.border + 1 >= e.Ldet.rows)
        continue;

-      const float * prev = e.Mdet.ptr<float>(e.border - 1);
-      const float * curr = e.Mdet.ptr<float>(e.border    );
-      const float * next = e.Mdet.ptr<float>(e.border + 1);
-      const float * ldet = e.Mdet.ptr<float>();
+      const float * prev = e.Ldet.ptr<float>(e.border - 1);
+      const float * curr = e.Ldet.ptr<float>(e.border    );
+      const float * next = e.Ldet.ptr<float>(e.border + 1);
+      const float * ldet = e.Ldet.ptr<float>();
      uchar *mask = kpts.ptr<uchar>();
      const int search_radius = e.sigma_size; // size of keypoint in this level

@@ -743,8 +729,8 @@ void AKAZEFeatures::Find_Scale_Space_Extrema(std::vector<Mat>& keypoints_by_laye
    const Mat &keypoints = keypoints_by_layers[i];
    const uchar *const kpts = keypoints_by_layers[i].ptr<uchar>();
    uchar *const kpts_prev = keypoints_by_layers[i-1].ptr<uchar>();
-    const float *const ldet = evolution_[i].Mdet.ptr<float>();
-    const float *const ldet_prev = evolution_[i-1].Mdet.ptr<float>();
+    const float *const ldet = evolution_[i].Ldet.ptr<float>();
+    const float *const ldet_prev = evolution_[i-1].Ldet.ptr<float>();
    // ratios are just powers of 2
    const int diff_ratio = (int)evolution_[i].octave_ratio / (int)evolution_[i-1].octave_ratio;
    const int search_radius = evolution_[i].sigma_size * diff_ratio; // size of keypoint in this level
@@ -775,8 +761,8 @@ void AKAZEFeatures::Find_Scale_Space_Extrema(std::vector<Mat>& keypoints_by_laye
    const Mat &keypoints = keypoints_by_layers[i];
    const uchar *const kpts = keypoints_by_layers[i].ptr<uchar>();
    uchar *const kpts_next = keypoints_by_layers[i+1].ptr<uchar>();
-    const float *const ldet = evolution_[i].Mdet.ptr<float>();
-    const float *const ldet_next = evolution_[i+1].Mdet.ptr<float>();
+    const float *const ldet = evolution_[i].Ldet.ptr<float>();
+    const float *const ldet_next = evolution_[i+1].Ldet.ptr<float>();
    // ratios are just powers of 2, i+1 ratio is always greater or equal to i
    const int diff_ratio = (int)evolution_[i+1].octave_ratio / (int)evolution_[i].octave_ratio;
    const int search_radius = evolution_[i+1].sigma_size; // size of keypoints in upper level
@@ -814,7 +800,7 @@ void AKAZEFeatures::Do_Subpixel_Refinement(

  for (size_t i = 0; i < keypoints_by_layers.size(); i++) {
    const Evolution &e = evolution_[i];
-    const float * const ldet = e.Mdet.ptr<float>();
+    const float * const ldet = e.Ldet.ptr<float>();
    const float ratio = e.octave_ratio;
    const int cols = e.Ldet.cols;
    const Mat& keypoints = keypoints_by_layers[i];
@@ -1308,7 +1294,7 @@ void Compute_Main_Orientation(KeyPoint& kpt, const std::vector<Evolution>& evolu
  // Sample derivatives responses for the points within radius of 6*scale
  const int ang_size = 109;
  float resX[ang_size], resY[ang_size];
-  Sample_Derivative_Response_Radius6(e.Mx, e.My, x0, y0, scale, resX, resY);
+  Sample_Derivative_Response_Radius6(e.Lx, e.Ly, x0, y0, scale, resX, resY);

  // Compute the angle of each gradient vector
  float Ang[ang_size];
@@ -1445,8 +1431,8 @@ void MSURF_Upright_Descriptor_64_Invoker::Get_MSURF_Upright_Descriptor_64(const
  ratio = (float)(1 << kpt.octave);
  scale = cvRound(0.5f*kpt.size / ratio);
  const int level = kpt.class_id;
-  Mat Lx = evolution[level].Mx;
-  Mat Ly = evolution[level].My;
+  const Mat Lx = evolution[level].Lx;
+  const Mat Ly = evolution[level].Ly;
  yf = kpt.pt.y / ratio;
  xf = kpt.pt.x / ratio;

@@ -1575,8 +1561,8 @@ void MSURF_Descriptor_64_Invoker::Get_MSURF_Descriptor_64(const KeyPoint& kpt, f
  scale = cvRound(0.5f*kpt.size / ratio);
  angle = kpt.angle * static_cast<float>(CV_PI / 180.f);
  const int level = kpt.class_id;
-  Mat Lx = evolution[level].Mx;
-  Mat Ly = evolution[level].My;
+  const Mat Lx = evolution[level].Lx;
+  const Mat Ly = evolution[level].Ly;
  yf = kpt.pt.y / ratio;
  xf = kpt.pt.x / ratio;
  co = cos(angle);
@@ -1708,9 +1694,9 @@ void Upright_MLDB_Full_Descriptor_Invoker::Get_Upright_MLDB_Full_Descriptor(cons
  ratio = (float)(1 << kpt.octave);
  scale = cvRound(0.5f*kpt.size / ratio);
  const int level = kpt.class_id;
-  Mat Lx = evolution[level].Mx;
-  Mat Ly = evolution[level].My;
-  Mat Lt = evolution[level].Mt;
+  const Mat Lx = evolution[level].Lx;
+  const Mat Ly = evolution[level].Ly;
+  const Mat Lt = evolution[level].Lt;
  yf = kpt.pt.y / ratio;
  xf = kpt.pt.x / ratio;

@@ -1795,9 +1781,9 @@ void MLDB_Full_Descriptor_Invoker::MLDB_Fill_Values(float* values, int sample_st
    int pattern_size = options_->descriptor_pattern_size;
    int chan = options_->descriptor_channels;
    int valpos = 0;
-    Mat Lx = evolution[level].Mx;
-    Mat Ly = evolution[level].My;
-    Mat Lt = evolution[level].Mt;
+    const Mat Lx = evolution[level].Lx;
+    const Mat Ly = evolution[level].Ly;
+    const Mat Lt = evolution[level].Lt;

    for (int i = -pattern_size; i < pattern_size; i += sample_step) {
        for (int j = -pattern_size; j < pattern_size; j += sample_step) {
@@ -1944,9 +1930,9 @@ void MLDB_Descriptor_Subset_Invoker::Get_MLDB_Descriptor_Subset(const KeyPoint&
  int scale = cvRound(0.5f*kpt.size / ratio);
  float angle = kpt.angle * static_cast<float>(CV_PI / 180.f);
  const int level = kpt.class_id;
-  Mat Lx = evolution[level].Mx;
-  Mat Ly = evolution[level].My;
-  Mat Lt = evolution[level].Mt;
+  const Mat Lx = evolution[level].Lx;
+  const Mat Ly = evolution[level].Ly;
+  const Mat Lt = evolution[level].Lt;
  float yf = kpt.pt.y / ratio;
  float xf = kpt.pt.x / ratio;
  float co = cos(angle);
@@ -2051,9 +2037,9 @@ void Upright_MLDB_Descriptor_Subset_Invoker::Get_Upright_MLDB_Descriptor_Subset(
  float ratio = (float)(1 << kpt.octave);
  int scale = cvRound(0.5f*kpt.size / ratio);
  const int level = kpt.class_id;
-  Mat Lx = evolution[level].Mx;
-  Mat Ly = evolution[level].My;
-  Mat Lt = evolution[level].Mt;
+  const Mat Lx = evolution[level].Lx;
+  const Mat Ly = evolution[level].Ly;
+  const Mat Lt = evolution[level].Lt;
  float yf = kpt.pt.y / ratio;
  float xf = kpt.pt.x / ratio;


--- a/modules/features2d/src/kaze/AKAZEFeatures.h
+++ b/modules/features2d/src/kaze/AKAZEFeatures.h
@@ -29,15 +29,10 @@ struct Evolution
    border = 0;
  }

-  UMat Lx, Ly;           ///< First order spatial derivatives
-  UMat Lt;               ///< Evolution image
-  UMat Lsmooth;          ///< Smoothed image, used only for computing determinant, released afterwards
-  UMat Ldet;             ///< Detector response
-
-  // the same as above, holding CPU mapping to UMats above
-  Mat Mx, My;
-  Mat Mt;
-  Mat Mdet;
+  Mat Lx, Ly;           ///< First order spatial derivatives
+  Mat Lt;               ///< Evolution image
+  Mat Lsmooth;          ///< Smoothed image, used only for computing determinant, released afterwards
+  Mat Ldet;             ///< Detector response

  Size size;                ///< Size of the layer
  float etime;              ///< Evolution time