diff --git a/modules/features2d/src/kaze/AKAZEFeatures.cpp b/modules/features2d/src/kaze/AKAZEFeatures.cpp
index 97222dfa4f76e4d77a56ead5202a8ad5c3eca9bb..72569dad9296ec77efd41bebeb5651e065786719 100644
--- a/modules/features2d/src/kaze/AKAZEFeatures.cpp
+++ b/modules/features2d/src/kaze/AKAZEFeatures.cpp
@@ -47,8 +47,8 @@ void AKAZEFeatures::Allocate_Memory_Evolution(void) {
   int level_height = 0, level_width = 0;
 
   // Allocate the dimension of the matrices for the evolution
-  for (int i = 0; i <= options_.omax - 1; i++) {
-    rfactor = 1.0f / pow(2.f, i);
+  for (int i = 0, power = 1; i <= options_.omax - 1; i++, power *= 2) {
+    rfactor = 1.0f / power;
     level_height = (int)(options_.img_height*rfactor);
     level_width = (int)(options_.img_width*rfactor);
 
@@ -190,7 +190,7 @@ public:
 
     for (int i = range.start; i < range.end; i++)
     {
-      float ratio = pow(2.f, (float)evolution[i].octave);
+        float ratio = (float)fastpow(2, evolution[i].octave);
       int sigma_size_ = fRound(evolution[i].esigma * options_.derivative_factor / ratio);
 
       compute_scharr_derivatives(evolution[i].Lsmooth, evolution[i].Lx, 1, 0, sigma_size_);
@@ -272,7 +272,11 @@ void AKAZEFeatures::Find_Scale_Space_Extrema(std::vector<cv::KeyPoint>& kpts)
   }
 
   for (size_t i = 0; i < evolution_.size(); i++) {
+    float* prev = evolution_[i].Ldet.ptr<float>(0);
+    float* curr = evolution_[i].Ldet.ptr<float>(1);
     for (int ix = 1; ix < evolution_[i].Ldet.rows - 1; ix++) {
+      float* next = evolution_[i].Ldet.ptr<float>(ix + 1);
+
       for (int jx = 1; jx < evolution_[i].Ldet.cols - 1; jx++) {
         is_extremum = false;
         is_repeated = false;
@@ -281,21 +285,21 @@ void AKAZEFeatures::Find_Scale_Space_Extrema(std::vector<cv::KeyPoint>& kpts)
 
         // Filter the points with the detector threshold
         if (value > options_.dthreshold && value >= options_.min_dthreshold &&
-            value > *(evolution_[i].Ldet.ptr<float>(ix)+jx - 1) &&
-            value > *(evolution_[i].Ldet.ptr<float>(ix)+jx + 1) &&
-            value > *(evolution_[i].Ldet.ptr<float>(ix - 1) + jx - 1) &&
-            value > *(evolution_[i].Ldet.ptr<float>(ix - 1) + jx) &&
-            value > *(evolution_[i].Ldet.ptr<float>(ix - 1) + jx + 1) &&
-            value > *(evolution_[i].Ldet.ptr<float>(ix + 1) + jx - 1) &&
-            value > *(evolution_[i].Ldet.ptr<float>(ix + 1) + jx) &&
-            value > *(evolution_[i].Ldet.ptr<float>(ix + 1) + jx + 1)) {
+            value > curr[jx-1] &&
+            value > curr[jx+1] &&
+            value > prev[jx-1] &&
+            value > prev[jx] &&
+            value > prev[jx+1] &&
+            value > next[jx-1] &&
+            value > next[jx] &&
+            value > next[jx+1]) {
 
           is_extremum = true;
           point.response = fabs(value);
           point.size = evolution_[i].esigma*options_.derivative_factor;
           point.octave = (int)evolution_[i].octave;
           point.class_id = (int)i;
-          ratio = pow(2.f, point.octave);
+          ratio = (float)fastpow(2, point.octave);
           sigma_size_ = fRound(point.size / ratio);
           point.pt.x = static_cast<float>(jx);
           point.pt.y = static_cast<float>(ix);
@@ -305,8 +309,10 @@ void AKAZEFeatures::Find_Scale_Space_Extrema(std::vector<cv::KeyPoint>& kpts)
 
             if ((point.class_id - 1) == kpts_aux[ik].class_id ||
                 point.class_id == kpts_aux[ik].class_id) {
-              dist = sqrt(pow(point.pt.x*ratio - kpts_aux[ik].pt.x, 2) + pow(point.pt.y*ratio - kpts_aux[ik].pt.y, 2));
-              if (dist <= point.size) {
+              float distx = point.pt.x*ratio - kpts_aux[ik].pt.x;
+              float disty = point.pt.y*ratio - kpts_aux[ik].pt.y;
+              dist = distx * distx + disty * disty;
+              if (dist <= point.size * point.size) {
                 if (point.response > kpts_aux[ik].response) {
                   id_repeated = (int)ik;
                   is_repeated = true;
@@ -349,6 +355,8 @@ void AKAZEFeatures::Find_Scale_Space_Extrema(std::vector<cv::KeyPoint>& kpts)
           } //if is_extremum
         }
       } // for jx
+      prev = curr;
+      curr = next;
     } // for ix
   } // for i
 
@@ -361,8 +369,10 @@ void AKAZEFeatures::Find_Scale_Space_Extrema(std::vector<cv::KeyPoint>& kpts)
 
       // Compare response with the upper scale
       if ((pt.class_id + 1) == kpts_aux[j].class_id) {
-        dist = sqrt(pow(pt.pt.x - kpts_aux[j].pt.x, 2) + pow(pt.pt.y - kpts_aux[j].pt.y, 2));
-        if (dist <= pt.size) {
+        float distx = pt.pt.x - kpts_aux[j].pt.x;
+        float disty = pt.pt.y - kpts_aux[j].pt.y;
+        dist = distx * distx + disty * disty;
+        if (dist <= pt.size * pt.size) {
           if (pt.response < kpts_aux[j].response) {
             is_repeated = true;
             break;
@@ -386,12 +396,12 @@ void AKAZEFeatures::Do_Subpixel_Refinement(std::vector<cv::KeyPoint>& kpts)
   float Dx = 0.0, Dy = 0.0, ratio = 0.0;
   float Dxx = 0.0, Dyy = 0.0, Dxy = 0.0;
   int x = 0, y = 0;
-  cv::Mat A = cv::Mat::zeros(2, 2, CV_32F);
-  cv::Mat b = cv::Mat::zeros(2, 1, CV_32F);
-  cv::Mat dst = cv::Mat::zeros(2, 1, CV_32F);
+  Matx22f A(0, 0, 0, 0);
+  Vec2f b(0, 0);
+  Vec2f dst(0, 0);
 
   for (size_t i = 0; i < kpts.size(); i++) {
-    ratio = pow(2.f, kpts[i].octave);
+    ratio = (float)fastpow(2, kpts[i].octave);
     x = fRound(kpts[i].pt.x / ratio);
     y = fRound(kpts[i].pt.y / ratio);
 
@@ -416,19 +426,20 @@ void AKAZEFeatures::Do_Subpixel_Refinement(std::vector<cv::KeyPoint>& kpts)
         + (*(evolution_[kpts[i].class_id].Ldet.ptr<float>(y + 1) + x - 1)));
 
     // Solve the linear system
-    *(A.ptr<float>(0)) = Dxx;
-    *(A.ptr<float>(1) + 1) = Dyy;
-    *(A.ptr<float>(0) + 1) = *(A.ptr<float>(1)) = Dxy;
-    *(b.ptr<float>(0)) = -Dx;
-    *(b.ptr<float>(1)) = -Dy;
+    A(0, 0) = Dxx;
+    A(1, 1) = Dyy;
+    A(0, 1) = A(1, 0) = Dxy;
+    b(0) = -Dx;
+    b(1) = -Dy;
 
     cv::solve(A, b, dst, DECOMP_LU);
 
-    if (fabs(*(dst.ptr<float>(0))) <= 1.0f && fabs(*(dst.ptr<float>(1))) <= 1.0f) {
-      kpts[i].pt.x = x + (*(dst.ptr<float>(0)));
-      kpts[i].pt.y = y + (*(dst.ptr<float>(1)));
-      kpts[i].pt.x *= powf(2.f, (float)evolution_[kpts[i].class_id].octave);
-      kpts[i].pt.y *= powf(2.f, (float)evolution_[kpts[i].class_id].octave);
+    if (fabs(dst(0)) <= 1.0f && fabs(dst(1)) <= 1.0f) {
+        kpts[i].pt.x = x + dst(0);
+      kpts[i].pt.y = y + dst(1);
+      int power = fastpow(2, evolution_[kpts[i].class_id].octave);
+      kpts[i].pt.x *= power;
+      kpts[i].pt.y *= power;
       kpts[i].angle = 0.0;
 
       // In OpenCV the size of a keypoint its the diameter
@@ -637,6 +648,10 @@ public:
   }
 
   void Get_MLDB_Full_Descriptor(const cv::KeyPoint& kpt, unsigned char* desc) const;
+  void MLDB_Fill_Values(float* values, int sample_step, int level,
+                        float xf, float yf, float co, float si, float scale) const;
+  void MLDB_Binary_Comparisons(float* values, unsigned char* desc,
+                               int count, int& dpos) const;
 
 private:
   std::vector<cv::KeyPoint>* keypoints_;
@@ -754,7 +769,8 @@ void AKAZEFeatures::Compute_Main_Orientation(cv::KeyPoint& kpt, const std::vecto
 
   int ix = 0, iy = 0, idx = 0, s = 0, level = 0;
   float xf = 0.0, yf = 0.0, gweight = 0.0, ratio = 0.0;
-  std::vector<float> resX(109), resY(109), Ang(109);
+  const int ang_size = 109;
+  float resX[ang_size], resY[ang_size], Ang[ang_size];
   const int id[] = { 6, 5, 4, 3, 2, 1, 0, 1, 2, 3, 4, 5, 6 };
 
   // Variables for computing the dominant direction
@@ -778,17 +794,17 @@ void AKAZEFeatures::Compute_Main_Orientation(cv::KeyPoint& kpt, const std::vecto
         resX[idx] = gweight*(*(evolution_[level].Lx.ptr<float>(iy)+ix));
         resY[idx] = gweight*(*(evolution_[level].Ly.ptr<float>(iy)+ix));
 
-        Ang[idx] = getAngle(resX[idx], resY[idx]);
         ++idx;
       }
     }
   }
+  fastAtan2(resY, resX, Ang, ang_size, false);
   // Loop slides pi/3 window around feature point
   for (ang1 = 0; ang1 < (float)(2.0 * CV_PI); ang1 += 0.15f) {
     ang2 = (ang1 + (float)(CV_PI / 3.0) >(float)(2.0*CV_PI) ? ang1 - (float)(5.0*CV_PI / 3.0) : ang1 + (float)(CV_PI / 3.0));
     sumX = sumY = 0.f;
 
-    for (size_t k = 0; k < Ang.size(); ++k) {
+    for (size_t k = 0; k < ang_size; ++k) {
       // Get angle from the x-axis of the sample point
       const float & ang = Ang[k];
 
@@ -1281,304 +1297,112 @@ void Upright_MLDB_Full_Descriptor_Invoker::Get_Upright_MLDB_Full_Descriptor(cons
   }
 }
 
-/* ************************************************************************* */
-/**
- * @brief This method computes the descriptor of the provided keypoint given the
- * main orientation of the keypoint
- * @param kpt Input keypoint
- * @param desc Descriptor vector
- */
-void MLDB_Full_Descriptor_Invoker::Get_MLDB_Full_Descriptor(const cv::KeyPoint& kpt, unsigned char *desc) const {
-
-  float di = 0.0, dx = 0.0, dy = 0.0, ratio = 0.0;
-  float ri = 0.0, rx = 0.0, ry = 0.0, rrx = 0.0, rry = 0.0, xf = 0.0, yf = 0.0;
-  float sample_x = 0.0, sample_y = 0.0, co = 0.0, si = 0.0, angle = 0.0;
-  int x1 = 0, y1 = 0, sample_step = 0, pattern_size = 0;
-  int level = 0, nsamples = 0, scale = 0;
-  int dcount1 = 0, dcount2 = 0;
-
-  const AKAZEOptions & options = *options_;
-  const std::vector<TEvolution>& evolution = *evolution_;
-
-  // Matrices for the M-LDB descriptor
-  cv::Mat values_1 = cv::Mat::zeros(4, options.descriptor_channels, CV_32FC1);
-  cv::Mat values_2 = cv::Mat::zeros(9, options.descriptor_channels, CV_32FC1);
-  cv::Mat values_3 = cv::Mat::zeros(16, options.descriptor_channels, CV_32FC1);
-
-  // Get the information from the keypoint
-  ratio = (float)(1 << kpt.octave);
-  scale = fRound(0.5f*kpt.size / ratio);
-  angle = kpt.angle;
-  level = kpt.class_id;
-  yf = kpt.pt.y / ratio;
-  xf = kpt.pt.x / ratio;
-  co = cos(angle);
-  si = sin(angle);
-
-  // First 2x2 grid
-  pattern_size = options.descriptor_pattern_size;
-  sample_step = pattern_size;
-
-  for (int i = -pattern_size; i < pattern_size; i += sample_step) {
-    for (int j = -pattern_size; j < pattern_size; j += sample_step) {
-
-      di = dx = dy = 0.0;
-      nsamples = 0;
-
-      for (float k = (float)i; k < i + sample_step; k++) {
-        for (float l = (float)j; l < j + sample_step; l++) {
-
-          // Get the coordinates of the sample point
-          sample_y = yf + (l*scale*co + k*scale*si);
-          sample_x = xf + (-l*scale*si + k*scale*co);
-
-          y1 = fRound(sample_y);
-          x1 = fRound(sample_x);
-
-          ri = *(evolution[level].Lt.ptr<float>(y1)+x1);
-          rx = *(evolution[level].Lx.ptr<float>(y1)+x1);
-          ry = *(evolution[level].Ly.ptr<float>(y1)+x1);
-
-          di += ri;
-
-          if (options.descriptor_channels == 2) {
-            dx += sqrtf(rx*rx + ry*ry);
-          }
-          else if (options.descriptor_channels == 3) {
-            // Get the x and y derivatives on the rotated axis
-            rry = rx*co + ry*si;
-            rrx = -rx*si + ry*co;
-            dx += rrx;
-            dy += rry;
-          }
-
-          nsamples++;
-        }
-      }
-
-      di /= nsamples;
-      dx /= nsamples;
-      dy /= nsamples;
-
-      *(values_1.ptr<float>(dcount2)) = di;
-      if (options.descriptor_channels > 1) {
-        *(values_1.ptr<float>(dcount2)+1) = dx;
-      }
-
-      if (options.descriptor_channels > 2) {
-        *(values_1.ptr<float>(dcount2)+2) = dy;
-      }
-
-      dcount2++;
-    }
-  }
-
-  // Do binary comparison first level
-  for (int i = 0; i < 4; i++) {
-    for (int j = i + 1; j < 4; j++) {
-      if (*(values_1.ptr<float>(i)) > *(values_1.ptr<float>(j))) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-      }
-      dcount1++;
-    }
-  }
-
-  if (options.descriptor_channels > 1) {
-    for (int i = 0; i < 4; i++) {
-      for (int j = i + 1; j < 4; j++) {
-        if (*(values_1.ptr<float>(i)+1) > *(values_1.ptr<float>(j)+1)) {
-          desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-        }
-
-        dcount1++;
-      }
-    }
-  }
-
-  if (options.descriptor_channels > 2) {
-    for (int i = 0; i < 4; i++) {
-      for (int j = i + 1; j < 4; j++) {
-        if (*(values_1.ptr<float>(i)+2) > *(values_1.ptr<float>(j)+2)) {
-          desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-        }
-        dcount1++;
-      }
-    }
-  }
-
-  // Second 3x3 grid
-  sample_step = static_cast<int>(ceil(pattern_size*2. / 3.));
-  dcount2 = 0;
-
-  for (int i = -pattern_size; i < pattern_size; i += sample_step) {
-    for (int j = -pattern_size; j < pattern_size; j += sample_step) {
-
-      di = dx = dy = 0.0;
-      nsamples = 0;
-
-      for (int k = i; k < i + sample_step; k++) {
-        for (int l = j; l < j + sample_step; l++) {
-
-          // Get the coordinates of the sample point
-          sample_y = yf + (l*scale*co + k*scale*si);
-          sample_x = xf + (-l*scale*si + k*scale*co);
-
-          y1 = fRound(sample_y);
-          x1 = fRound(sample_x);
-
-          ri = *(evolution[level].Lt.ptr<float>(y1)+x1);
-          rx = *(evolution[level].Lx.ptr<float>(y1)+x1);
-          ry = *(evolution[level].Ly.ptr<float>(y1)+x1);
-          di += ri;
+void MLDB_Full_Descriptor_Invoker::MLDB_Fill_Values(float* values, int sample_step, int level,
+                                                    float xf, float yf, float co, float si, float scale) const
+{
+    const std::vector<TEvolution>& evolution = *evolution_;
+    int pattern_size = options_->descriptor_pattern_size;
+    int chan = options_->descriptor_channels;
+    int valpos = 0;
+
+    for (int i = -pattern_size; i < pattern_size; i += sample_step) {
+        for (int j = -pattern_size; j < pattern_size; j += sample_step) {
+            float di, dx, dy;
+            di = dx = dy = 0.0;
+            int nsamples = 0;
+
+            for (int k = i; k < i + sample_step; k++) {
+              for (int l = j; l < j + sample_step; l++) {
+                float sample_y = yf + (l*co * scale + k*si*scale);
+                float sample_x = xf + (-l*si * scale + k*co*scale);
+
+                int y1 = fRound(sample_y);
+                int x1 = fRound(sample_x);
+
+                float ri = *(evolution[level].Lt.ptr<float>(y1)+x1);
+                di += ri;
+
+                if(chan > 1) {
+                    float rx = *(evolution[level].Lx.ptr<float>(y1)+x1);
+                    float ry = *(evolution[level].Ly.ptr<float>(y1)+x1);
+                    if (chan == 2) {
+                      dx += sqrtf(rx*rx + ry*ry);
+                    }
+                    else {
+                      float rry = rx*co + ry*si;
+                      float rrx = -rx*si + ry*co;
+                      dx += rrx;
+                      dy += rry;
+                    }
+                }
+                nsamples++;
+              }
+            }
+            di /= nsamples;
+            dx /= nsamples;
+            dy /= nsamples;
 
-          if (options.descriptor_channels == 2) {
-            dx += sqrtf(rx*rx + ry*ry);
-          }
-          else if (options.descriptor_channels == 3) {
-            // Get the x and y derivatives on the rotated axis
-            rry = rx*co + ry*si;
-            rrx = -rx*si + ry*co;
-            dx += rrx;
-            dy += rry;
+            values[valpos] = di;
+            if (chan > 1) {
+                values[valpos + 1] = dx;
+            }
+            if (chan > 2) {
+              values[valpos + 2] = dy;
+            }
+            valpos += chan;
           }
-
-          nsamples++;
         }
-      }
-
-      di /= nsamples;
-      dx /= nsamples;
-      dy /= nsamples;
-
-      *(values_2.ptr<float>(dcount2)) = di;
-      if (options.descriptor_channels > 1) {
-        *(values_2.ptr<float>(dcount2)+1) = dx;
-      }
-
-      if (options.descriptor_channels > 2) {
-        *(values_2.ptr<float>(dcount2)+2) = dy;
-      }
-
-      dcount2++;
-    }
-  }
-
-  // Do binary comparison second level
-  for (int i = 0; i < 9; i++) {
-    for (int j = i + 1; j < 9; j++) {
-      if (*(values_2.ptr<float>(i)) > *(values_2.ptr<float>(j))) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-      }
-      dcount1++;
-    }
-  }
+}
 
-  if (options.descriptor_channels > 1) {
-    for (int i = 0; i < 9; i++) {
-      for (int j = i + 1; j < 9; j++) {
-        if (*(values_2.ptr<float>(i)+1) > *(values_2.ptr<float>(j)+1)) {
-          desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-        }
-        dcount1++;
-      }
+void MLDB_Full_Descriptor_Invoker::MLDB_Binary_Comparisons(float* values, unsigned char* desc,
+                                                           int count, int& dpos) const {
+    int chan = options_->descriptor_channels;
+    int* ivalues = (int*) values;
+    for(int i = 0; i < count * chan; i++) {
+        ivalues[i] = CV_TOGGLE_FLT(ivalues[i]);
     }
-  }
 
-  if (options.descriptor_channels > 2) {
-    for (int i = 0; i < 9; i++) {
-      for (int j = i + 1; j < 9; j++) {
-        if (*(values_2.ptr<float>(i)+2) > *(values_2.ptr<float>(j)+2)) {
-          desc[dcount1 / 8] |= (1 << (dcount1 % 8));
+    for(int pos = 0; pos < chan; pos++) {
+        for (int i = 0; i < count; i++) {
+            int ival = ivalues[chan * i + pos];
+            for (int j = i + 1; j < count; j++) {
+                int res = ival > ivalues[chan * j + pos];
+                desc[dpos >> 3] |= (res << (dpos & 7));
+                dpos++;
+            }
         }
-        dcount1++;
-      }
     }
-  }
-
-  // Third 4x4 grid
-  sample_step = pattern_size / 2;
-  dcount2 = 0;
-
-  for (int i = -pattern_size; i < pattern_size; i += sample_step) {
-    for (int j = -pattern_size; j < pattern_size; j += sample_step) {
-      di = dx = dy = 0.0;
-      nsamples = 0;
-
-      for (int k = i; k < i + sample_step; k++) {
-        for (int l = j; l < j + sample_step; l++) {
-
-          // Get the coordinates of the sample point
-          sample_y = yf + (l*scale*co + k*scale*si);
-          sample_x = xf + (-l*scale*si + k*scale*co);
-
-          y1 = fRound(sample_y);
-          x1 = fRound(sample_x);
-
-          ri = *(evolution[level].Lt.ptr<float>(y1)+x1);
-          rx = *(evolution[level].Lx.ptr<float>(y1)+x1);
-          ry = *(evolution[level].Ly.ptr<float>(y1)+x1);
-          di += ri;
-
-          if (options.descriptor_channels == 2) {
-            dx += sqrtf(rx*rx + ry*ry);
-          }
-          else if (options.descriptor_channels == 3) {
-            // Get the x and y derivatives on the rotated axis
-            rry = rx*co + ry*si;
-            rrx = -rx*si + ry*co;
-            dx += rrx;
-            dy += rry;
-          }
-
-          nsamples++;
-        }
-      }
-
-      di /= nsamples;
-      dx /= nsamples;
-      dy /= nsamples;
-
-      *(values_3.ptr<float>(dcount2)) = di;
-      if (options.descriptor_channels > 1)
-        *(values_3.ptr<float>(dcount2)+1) = dx;
+}
 
-      if (options.descriptor_channels > 2)
-        *(values_3.ptr<float>(dcount2)+2) = dy;
+/* ************************************************************************* */
+/**
+ * @brief This method computes the descriptor of the provided keypoint given the
+ * main orientation of the keypoint
+ * @param kpt Input keypoint
+ * @param desc Descriptor vector
+ */
+void MLDB_Full_Descriptor_Invoker::Get_MLDB_Full_Descriptor(const cv::KeyPoint& kpt, unsigned char *desc) const {
 
-      dcount2++;
-    }
-  }
+  const int max_channels = 3;
+  CV_Assert(options_->descriptor_channels <= max_channels);
+  float values[16*max_channels];
+  const double size_mult[3] = {1, 2.0/3.0, 1.0/2.0};
 
-  // Do binary comparison third level
-  for (int i = 0; i < 16; i++) {
-    for (int j = i + 1; j < 16; j++) {
-      if (*(values_3.ptr<float>(i)) > *(values_3.ptr<float>(j))) {
-        desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-      }
-      dcount1++;
-    }
-  }
+  float ratio = (float)(1 << kpt.octave);
+  float scale = (float)fRound(0.5f*kpt.size / ratio);
+  float xf = kpt.pt.x / ratio;
+  float yf = kpt.pt.y / ratio;
+  float co = cos(kpt.angle);
+  float si = sin(kpt.angle);
+  int pattern_size = options_->descriptor_pattern_size;
 
-  if (options.descriptor_channels > 1) {
-    for (int i = 0; i < 16; i++) {
-      for (int j = i + 1; j < 16; j++) {
-        if (*(values_3.ptr<float>(i)+1) > *(values_3.ptr<float>(j)+1)) {
-          desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-        }
-        dcount1++;
-      }
-    }
-  }
+  int dpos = 0;
+  for(int lvl = 0; lvl < 3; lvl++) {
 
-  if (options.descriptor_channels > 2) {
-    for (int i = 0; i < 16; i++) {
-      for (int j = i + 1; j < 16; j++) {
-        if (*(values_3.ptr<float>(i)+2) > *(values_3.ptr<float>(j)+2)) {
-          desc[dcount1 / 8] |= (1 << (dcount1 % 8));
-        }
-        dcount1++;
-      }
-    }
+      int val_count = (lvl + 2) * (lvl + 2);
+      int sample_step = static_cast<int>(ceil(pattern_size * size_mult[lvl]));
+      MLDB_Fill_Values(values, sample_step, kpt.class_id, xf, yf, co, si, scale);
+      MLDB_Binary_Comparisons(values, desc, val_count, dpos);
   }
 }
 
diff --git a/modules/features2d/src/kaze/nldiffusion_functions.cpp b/modules/features2d/src/kaze/nldiffusion_functions.cpp
index ea7cd8141a33a3418d3b35489460abd188fd73fb..1c0d70a75a4e8f1e74c72a2c4f92ea6a2cb441b5 100644
--- a/modules/features2d/src/kaze/nldiffusion_functions.cpp
+++ b/modules/features2d/src/kaze/nldiffusion_functions.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "nldiffusion_functions.h"
+#include <iostream>
 
 // Namespaces
 using namespace std;
@@ -92,7 +93,21 @@ namespace cv {
              * @param k Contrast factor parameter
              */
             void pm_g1(const cv::Mat& Lx, const cv::Mat& Ly, cv::Mat& dst, float k) {
-                cv::exp(-(Lx.mul(Lx) + Ly.mul(Ly)) / (k*k), dst);
+
+              Size sz = Lx.size();
+              float inv_k = 1.0f / (k*k);
+              for (int y = 0; y < sz.height; y++) {
+
+                const float* Lx_row = Lx.ptr<float>(y);
+                const float* Ly_row = Ly.ptr<float>(y);
+                float* dst_row = dst.ptr<float>(y);
+
+                for (int x = 0; x < sz.width; x++) {
+                  dst_row[x] = (-inv_k*(Lx_row[x]*Lx_row[x] + Ly_row[x]*Ly_row[x]));
+                }
+              }
+
+              exp(dst, dst);
             }
 
             /* ************************************************************************* */
@@ -105,9 +120,20 @@ namespace cv {
              * @param k Contrast factor parameter
              */
             void pm_g2(const cv::Mat &Lx, const cv::Mat& Ly, cv::Mat& dst, float k) {
-                dst = 1.0f / (1.0f + (Lx.mul(Lx) + Ly.mul(Ly)) / (k*k));
-            }
 
+                Size sz = Lx.size();
+                dst.create(sz, Lx.type());
+                float k2inv = 1.0f / (k * k);
+
+                for(int y = 0; y < sz.height; y++) {
+                    const float *Lx_row = Lx.ptr<float>(y);
+                    const float *Ly_row = Ly.ptr<float>(y);
+                    float* dst_row = dst.ptr<float>(y);
+                    for(int x = 0; x < sz.width; x++) {
+                        dst_row[x] = 1.0f / (1.0f + ((Lx_row[x] * Lx_row[x] + Ly_row[x] * Ly_row[x]) * k2inv));
+                    }
+                }
+            }
             /* ************************************************************************* */
             /**
              * @brief This function computes Weickert conductivity coefficient gw
@@ -120,12 +146,26 @@ namespace cv {
              * Proceedings of Algorithmy 2000
              */
             void weickert_diffusivity(const cv::Mat& Lx, const cv::Mat& Ly, cv::Mat& dst, float k) {
-                Mat modg;
-                cv::pow((Lx.mul(Lx) + Ly.mul(Ly)) / (k*k), 4, modg);
-                cv::exp(-3.315f / modg, dst);
-                dst = 1.0f - dst;
+
+              Size sz = Lx.size();
+              float inv_k = 1.0f / (k*k);
+              for (int y = 0; y < sz.height; y++) {
+
+                const float* Lx_row = Lx.ptr<float>(y);
+                const float* Ly_row = Ly.ptr<float>(y);
+                float* dst_row = dst.ptr<float>(y);
+
+                for (int x = 0; x < sz.width; x++) {
+                  float dL = inv_k*(Lx_row[x]*Lx_row[x] + Ly_row[x]*Ly_row[x]);
+                  dst_row[x] = -3.315f/(dL*dL*dL*dL);
+                }
+              }
+
+              exp(dst, dst);
+              dst = 1.0 - dst;
             }
 
+
             /* ************************************************************************* */
             /**
             * @brief This function computes Charbonnier conductivity coefficient gc
@@ -139,11 +179,23 @@ namespace cv {
             * Proceedings of Algorithmy 2000
             */
             void charbonnier_diffusivity(const cv::Mat& Lx, const cv::Mat& Ly, cv::Mat& dst, float k) {
-                Mat den;
-                cv::sqrt(1.0f + (Lx.mul(Lx) + Ly.mul(Ly)) / (k*k), den);
-                dst = 1.0f / den;
+
+              Size sz = Lx.size();
+              float inv_k = 1.0f / (k*k);
+              for (int y = 0; y < sz.height; y++) {
+
+                const float* Lx_row = Lx.ptr<float>(y);
+                const float* Ly_row = Ly.ptr<float>(y);
+                float* dst_row = dst.ptr<float>(y);
+
+                for (int x = 0; x < sz.width; x++) {
+                  float den = sqrt(1.0f+inv_k*(Lx_row[x]*Lx_row[x] + Ly_row[x]*Ly_row[x]));
+                  dst_row[x] = 1.0f / den;
+                }
+              }
             }
 
+
             /* ************************************************************************* */
             /**
              * @brief This function computes a good empirical value for the k contrast factor
@@ -160,7 +212,7 @@ namespace cv {
             float compute_k_percentile(const cv::Mat& img, float perc, float gscale, int nbins, int ksize_x, int ksize_y) {
 
                 int nbin = 0, nelements = 0, nthreshold = 0, k = 0;
-                float kperc = 0.0, modg = 0.0, lx = 0.0, ly = 0.0;
+                float kperc = 0.0, modg = 0.0;
                 float npoints = 0.0;
                 float hmax = 0.0;
 
@@ -181,10 +233,10 @@ namespace cv {
 
                 // Skip the borders for computing the histogram
                 for (int i = 1; i < gaussian.rows - 1; i++) {
+                    const float *lx = Lx.ptr<float>(i);
+                    const float *ly = Ly.ptr<float>(i);
                     for (int j = 1; j < gaussian.cols - 1; j++) {
-                        lx = *(Lx.ptr<float>(i)+j);
-                        ly = *(Ly.ptr<float>(i)+j);
-                        modg = sqrt(lx*lx + ly*ly);
+                        modg = lx[j]*lx[j] + ly[j]*ly[j];
 
                         // Get the maximum
                         if (modg > hmax) {
@@ -192,17 +244,17 @@ namespace cv {
                         }
                     }
                 }
-
+                hmax = sqrt(hmax);
                 // Skip the borders for computing the histogram
                 for (int i = 1; i < gaussian.rows - 1; i++) {
+                    const float *lx = Lx.ptr<float>(i);
+                    const float *ly = Ly.ptr<float>(i);
                     for (int j = 1; j < gaussian.cols - 1; j++) {
-                        lx = *(Lx.ptr<float>(i)+j);
-                        ly = *(Ly.ptr<float>(i)+j);
-                        modg = sqrt(lx*lx + ly*ly);
+                        modg = lx[j]*lx[j] + ly[j]*ly[j];
 
                         // Find the correspondent bin
                         if (modg != 0.0) {
-                            nbin = (int)floor(nbins*(modg / hmax));
+                            nbin = (int)floor(nbins*(sqrt(modg) / hmax));
 
                             if (nbin == nbins) {
                                 nbin--;
@@ -249,7 +301,7 @@ namespace cv {
             /* ************************************************************************* */
             /**
              * @brief Compute derivative kernels for sizes different than 3
-             * @param _kx Horizontal kernel values
+             * @param _kx Horizontal kernel ues
              * @param _ky Vertical kernel values
              * @param dx Derivative order in X-direction (horizontal)
              * @param dy Derivative order in Y-direction (vertical)
@@ -314,17 +366,25 @@ namespace cv {
 
                     for (int i = range.start; i < range.end; i++)
                     {
+                        const float *c_prev  = c.ptr<float>(i - 1);
+                        const float *c_curr  = c.ptr<float>(i);
+                        const float *c_next  = c.ptr<float>(i + 1);
+                        const float *ld_prev = Ld.ptr<float>(i - 1);
+                        const float *ld_curr = Ld.ptr<float>(i);
+                        const float *ld_next = Ld.ptr<float>(i + 1);
+
+                        float *dst  = Lstep.ptr<float>(i);
+
                         for (int j = 1; j < Lstep.cols - 1; j++)
                         {
-                            float xpos = ((*(c.ptr<float>(i)+j)) + (*(c.ptr<float>(i)+j + 1)))*((*(Ld.ptr<float>(i)+j + 1)) - (*(Ld.ptr<float>(i)+j)));
-                            float xneg = ((*(c.ptr<float>(i)+j - 1)) + (*(c.ptr<float>(i)+j)))*((*(Ld.ptr<float>(i)+j)) - (*(Ld.ptr<float>(i)+j - 1)));
-                            float ypos = ((*(c.ptr<float>(i)+j)) + (*(c.ptr<float>(i + 1) + j)))*((*(Ld.ptr<float>(i + 1) + j)) - (*(Ld.ptr<float>(i)+j)));
-                            float yneg = ((*(c.ptr<float>(i - 1) + j)) + (*(c.ptr<float>(i)+j)))*((*(Ld.ptr<float>(i)+j)) - (*(Ld.ptr<float>(i - 1) + j)));
-                            *(Lstep.ptr<float>(i)+j) = 0.5f*stepsize*(xpos - xneg + ypos - yneg);
+                            float xpos = (c_curr[j]   + c_curr[j+1])*(ld_curr[j+1] - ld_curr[j]);
+                            float xneg = (c_curr[j-1] + c_curr[j])  *(ld_curr[j]   - ld_curr[j-1]);
+                            float ypos = (c_curr[j]   + c_next[j])  *(ld_next[j]   - ld_curr[j]);
+                            float yneg = (c_prev[j]   + c_curr[j])  *(ld_curr[j]   - ld_prev[j]);
+                            dst[j] = 0.5f*stepsize*(xpos - xneg + ypos - yneg);
                         }
                     }
                 }
-
             private:
                 cv::Mat * _Ld;
                 const cv::Mat * _c;
@@ -345,39 +405,65 @@ namespace cv {
             */
             void nld_step_scalar(cv::Mat& Ld, const cv::Mat& c, cv::Mat& Lstep, float stepsize) {
 
-                cv::parallel_for_(cv::Range(1, Lstep.rows - 1), Nld_Step_Scalar_Invoker(Ld, c, Lstep, stepsize));
+                cv::parallel_for_(cv::Range(1, Lstep.rows - 1), Nld_Step_Scalar_Invoker(Ld, c, Lstep, stepsize), (double)Ld.total()/(1 << 16));
 
+                float xneg, xpos, yneg, ypos;
+                float* dst = Lstep.ptr<float>(0);
+                const float* cprv = NULL;
+                const float* ccur  = c.ptr<float>(0);
+                const float* cnxt  = c.ptr<float>(1);
+                const float* ldprv = NULL;
+                const float* ldcur = Ld.ptr<float>(0);
+                const float* ldnxt = Ld.ptr<float>(1);
                 for (int j = 1; j < Lstep.cols - 1; j++) {
-                    float xpos = ((*(c.ptr<float>(0) + j)) + (*(c.ptr<float>(0) + j + 1)))*((*(Ld.ptr<float>(0) + j + 1)) - (*(Ld.ptr<float>(0) + j)));
-                    float xneg = ((*(c.ptr<float>(0) + j - 1)) + (*(c.ptr<float>(0) + j)))*((*(Ld.ptr<float>(0) + j)) - (*(Ld.ptr<float>(0) + j - 1)));
-                    float ypos = ((*(c.ptr<float>(0) + j)) + (*(c.ptr<float>(1) + j)))*((*(Ld.ptr<float>(1) + j)) - (*(Ld.ptr<float>(0) + j)));
-                    *(Lstep.ptr<float>(0) + j) = 0.5f*stepsize*(xpos - xneg + ypos);
+                    xpos = (ccur[j]   + ccur[j+1]) * (ldcur[j+1] - ldcur[j]);
+                    xneg = (ccur[j-1] + ccur[j])   * (ldcur[j]   - ldcur[j-1]);
+                    ypos = (ccur[j]   + cnxt[j])   * (ldnxt[j]   - ldcur[j]);
+                    dst[j] = 0.5f*stepsize*(xpos - xneg + ypos);
                 }
 
+                dst = Lstep.ptr<float>(Lstep.rows - 1);
+                ccur = c.ptr<float>(Lstep.rows - 1);
+                cprv = c.ptr<float>(Lstep.rows - 2);
+                ldcur = Ld.ptr<float>(Lstep.rows - 1);
+                ldprv = Ld.ptr<float>(Lstep.rows - 2);
+
                 for (int j = 1; j < Lstep.cols - 1; j++) {
-                    float xpos = ((*(c.ptr<float>(Lstep.rows - 1) + j)) + (*(c.ptr<float>(Lstep.rows - 1) + j + 1)))*((*(Ld.ptr<float>(Lstep.rows - 1) + j + 1)) - (*(Ld.ptr<float>(Lstep.rows - 1) + j)));
-                    float xneg = ((*(c.ptr<float>(Lstep.rows - 1) + j - 1)) + (*(c.ptr<float>(Lstep.rows - 1) + j)))*((*(Ld.ptr<float>(Lstep.rows - 1) + j)) - (*(Ld.ptr<float>(Lstep.rows - 1) + j - 1)));
-                    float ypos = ((*(c.ptr<float>(Lstep.rows - 1) + j)) + (*(c.ptr<float>(Lstep.rows - 1) + j)))*((*(Ld.ptr<float>(Lstep.rows - 1) + j)) - (*(Ld.ptr<float>(Lstep.rows - 1) + j)));
-                    float yneg = ((*(c.ptr<float>(Lstep.rows - 2) + j)) + (*(c.ptr<float>(Lstep.rows - 1) + j)))*((*(Ld.ptr<float>(Lstep.rows - 1) + j)) - (*(Ld.ptr<float>(Lstep.rows - 2) + j)));
-                    *(Lstep.ptr<float>(Lstep.rows - 1) + j) = 0.5f*stepsize*(xpos - xneg + ypos - yneg);
+                    xpos = (ccur[j] + ccur[j+1]) * (ldcur[j+1] - ldcur[j]);
+                    xneg = (ccur[j-1] + ccur[j]) * (ldcur[j] - ldcur[j-1]);
+                    yneg = (cprv[j] + ccur[j])   * (ldcur[j] - ldprv[j]);
+                    dst[j] = 0.5f*stepsize*(xpos - xneg - yneg);
                 }
 
-                for (int i = 1; i < Lstep.rows - 1; i++) {
-                    float xpos = ((*(c.ptr<float>(i))) + (*(c.ptr<float>(i)+1)))*((*(Ld.ptr<float>(i)+1)) - (*(Ld.ptr<float>(i))));
-                    float xneg = ((*(c.ptr<float>(i))) + (*(c.ptr<float>(i))))*((*(Ld.ptr<float>(i))) - (*(Ld.ptr<float>(i))));
-                    float ypos = ((*(c.ptr<float>(i))) + (*(c.ptr<float>(i + 1))))*((*(Ld.ptr<float>(i + 1))) - (*(Ld.ptr<float>(i))));
-                    float yneg = ((*(c.ptr<float>(i - 1))) + (*(c.ptr<float>(i))))*((*(Ld.ptr<float>(i))) - (*(Ld.ptr<float>(i - 1))));
-                    *(Lstep.ptr<float>(i)) = 0.5f*stepsize*(xpos - xneg + ypos - yneg);
-                }
+                ccur = c.ptr<float>(1);
+                ldcur = Ld.ptr<float>(1);
+                cprv = c.ptr<float>(0);
+                ldprv = Ld.ptr<float>(0);
+
+                int r0 = Lstep.cols - 1;
+                int r1 = Lstep.cols - 2;
 
                 for (int i = 1; i < Lstep.rows - 1; i++) {
-                    float xneg = ((*(c.ptr<float>(i)+Lstep.cols - 2)) + (*(c.ptr<float>(i)+Lstep.cols - 1)))*((*(Ld.ptr<float>(i)+Lstep.cols - 1)) - (*(Ld.ptr<float>(i)+Lstep.cols - 2)));
-                    float ypos = ((*(c.ptr<float>(i)+Lstep.cols - 1)) + (*(c.ptr<float>(i + 1) + Lstep.cols - 1)))*((*(Ld.ptr<float>(i + 1) + Lstep.cols - 1)) - (*(Ld.ptr<float>(i)+Lstep.cols - 1)));
-                    float yneg = ((*(c.ptr<float>(i - 1) + Lstep.cols - 1)) + (*(c.ptr<float>(i)+Lstep.cols - 1)))*((*(Ld.ptr<float>(i)+Lstep.cols - 1)) - (*(Ld.ptr<float>(i - 1) + Lstep.cols - 1)));
-                    *(Lstep.ptr<float>(i)+Lstep.cols - 1) = 0.5f*stepsize*(-xneg + ypos - yneg);
+                    cnxt = c.ptr<float>(i + 1);
+                    ldnxt = Ld.ptr<float>(i + 1);
+                    dst = Lstep.ptr<float>(i);
+
+                    xpos = (ccur[0] + ccur[1]) * (ldcur[1] - ldcur[0]);
+                    ypos = (ccur[0] + cnxt[0]) * (ldnxt[0] - ldcur[0]);
+                    yneg = (cprv[0] + ccur[0]) * (ldcur[0] - ldprv[0]);
+                    dst[0] = 0.5f*stepsize*(xpos + ypos - yneg);
+
+                    xneg = (ccur[r1] + ccur[r0]) * (ldcur[r0] - ldcur[r1]);
+                    ypos = (ccur[r0] + cnxt[r0]) * (ldnxt[r0] - ldcur[r0]);
+                    yneg = (cprv[r0] + ccur[r0]) * (ldcur[r0] - ldprv[r0]);
+                    dst[r0] = 0.5f*stepsize*(-xneg + ypos - yneg);
+
+                    cprv = ccur;
+                    ccur = cnxt;
+                    ldprv = ldcur;
+                    ldcur = ldnxt;
                 }
-
-                Ld = Ld + Lstep;
+                Ld += Lstep;
             }
 
             /* ************************************************************************* */
@@ -434,4 +520,4 @@ namespace cv {
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/modules/features2d/src/kaze/utils.h b/modules/features2d/src/kaze/utils.h
index 13ae352474287ab593dbfc5970975c78ac2c2c6f..d9bfbe44721c7fdeb44ba471bcfa2e43e16a7fd3 100644
--- a/modules/features2d/src/kaze/utils.h
+++ b/modules/features2d/src/kaze/utils.h
@@ -74,4 +74,24 @@ inline int fRound(float flt) {
   return (int)(flt + 0.5f);
 }
 
+/* ************************************************************************* */
+/**
+ * @brief Exponentiation by squaring
+ * @param flt Exponentiation base
+ * @return dst Exponentiation value
+ */
+inline int fastpow(int base, int exp) {
+    int res = 1;
+    while(exp > 0) {
+        if(exp & 1) {
+            exp--;
+            res *= base;
+        } else {
+            exp /= 2;
+            base *= base;
+        }
+    }
+    return res;
+}
+
 #endif