Commit 3e3b214b authored by Liangliang He

Fix softmax performance issue

Parent 3236ad7c
@@ -264,7 +264,6 @@ void Conv2dNeonK1x1S1(const float *input,  // NCHW
                         bias ? bias[i] : 0);
     }
   }
-  // benchmark omp collapsed(2)
 #pragma omp parallel for collapse(2)
   for (index_t n = 0; n < batch; ++n) {
     for (index_t c = 0; c < round_up_channels; c += kOutputChannelBlockSize) {
@@ -326,7 +325,6 @@ void Conv2dNeonPixelK1x1S1(
   const index_t total_loops = total_pixels >> 3;
   const index_t loop_remaining = total_pixels & 7;
-  // benchmark omp collapsed(2)
 #pragma omp parallel for collapse(2)
   for (index_t n = 0; n < batch; ++n) {
     for (index_t c = 0; c < channels; ++c) {
...
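Note (not part of the commit itself): the two hunks above only delete stale "benchmark omp collapsed(2)" comments; the collapse(2) clause is kept. As a minimal standalone sketch with hypothetical shapes, collapse(2) fuses the batch and channel loops into a single iteration space, so OpenMP can spread work across both dimensions even when batch alone is too small to occupy every thread:

    #include <omp.h>
    #include <cstdio>

    int main() {
      const int batch = 2, channels = 8;
      // collapse(2) flattens the n/c nest into batch * channels iterations,
      // so threads stay busy even though batch is tiny.
    #pragma omp parallel for collapse(2)
      for (int n = 0; n < batch; ++n) {
        for (int c = 0; c < channels; ++c) {
          std::printf("thread %d -> (n=%d, c=%d)\n",
                      omp_get_thread_num(), n, c);
        }
      }
      return 0;
    }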
@@ -6,28 +6,32 @@
 #define MACE_KERNELS_SOFTMAX_H_

 #include "mace/core/future.h"
+#include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"
-#include "mace/core/runtime/opencl/cl2_header.h"
+#include "mace/utils/utils.h"

 namespace mace {
 namespace kernels {

 template <DeviceType D, typename T>
 struct SoftmaxFunctor {
-  void operator()(const Tensor *logits,
-                  Tensor *output,
-                  StatsFuture *future) {
+  void operator()(const Tensor *logits, Tensor *output, StatsFuture *future) {
     Tensor::MappingGuard logits_guard(logits);
     Tensor::MappingGuard output_guard(output);
     const T *logits_ptr = logits->data<T>();
     T *output_ptr = output->mutable_data<T>();
     auto &logits_shape = logits->shape();
-    const index_t batch_size = std::accumulate(logits_shape.begin(), logits_shape.end()-1,
-                                               1, std::multiplies<index_t>());
+    const index_t batch_size =
+        std::accumulate(logits_shape.begin(), logits_shape.end() - 1, 1,
+                        std::multiplies<index_t>());
     const index_t num_classes = logits_shape.back();

-#pragma omp parallel for
+#pragma omp parallel
+    {
+      // Allocate per thread buffer
+      std::vector<T> exp_data(num_classes);
+#pragma omp for
       for (index_t i = 0; i < batch_size; ++i) {
         const index_t pos = i * num_classes;
         T max_value = logits_ptr[pos];
@@ -36,7 +40,6 @@ struct SoftmaxFunctor {
         }
         // TODO: check overflow?
         T sum = 0;
-        std::vector<T> exp_data(num_classes);
         for (index_t c = 0; c < num_classes; ++c) {
           exp_data[c] = ::exp((logits_ptr[pos + c] - max_value));
           sum += exp_data[c];
@@ -46,15 +49,12 @@
         }
       }
+    }
   }
 };

-template<typename T>
+template <typename T>
 struct SoftmaxFunctor<DeviceType::OPENCL, T> {
-  void operator()(const Tensor *logits,
-                  Tensor *output,
-                  StatsFuture *future);
+  void operator()(const Tensor *logits, Tensor *output, StatsFuture *future);

   cl::Kernel kernel_;
 };
...
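The performance fix itself is in the hunks above: under the old single `#pragma omp parallel for`, every batch row constructed a fresh std::vector<T> exp_data(num_classes), putting a heap allocation inside the hot loop. Splitting it into a parallel region plus an omp for hoists that to one allocation per thread for the whole loop. A minimal self-contained sketch of the same pattern (a hypothetical float-only softmax helper, not the MACE code itself):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    void softmax(const float *logits, float *output, int batch_size,
                 int num_classes) {
    #pragma omp parallel
      {
        // Per-thread scratch buffer: allocated once per thread,
        // not once per batch row.
        std::vector<float> exp_data(num_classes);
    #pragma omp for
        for (int i = 0; i < batch_size; ++i) {
          const int pos = i * num_classes;
          // Subtract the row max before exp() to avoid overflow.
          const float max_value =
              *std::max_element(logits + pos, logits + pos + num_classes);
          float sum = 0.f;
          for (int c = 0; c < num_classes; ++c) {
            exp_data[c] = std::exp(logits[pos + c] - max_value);
            sum += exp_data[c];
          }
          for (int c = 0; c < num_classes; ++c) {
            output[pos + c] = exp_data[c] / sum;
          }
        }
      }
    }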
@@ -47,7 +47,8 @@ static void SoftmaxBenchmark(
 }

 #define BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE)                            \
-  static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(int iters) { \
+  static void BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE(           \
+      int iters) {                                                            \
     const int64_t tot = static_cast<int64_t>(iters) * N * C * H * W;          \
     mace::testing::MaccProcessed(tot);                                        \
     mace::testing::BytesProcessed(tot *(sizeof(TYPE)));                       \
@@ -60,9 +61,9 @@ static void SoftmaxBenchmark(
   BM_SOFTMAX_MACRO(N, C, H, W, float, OPENCL);                                \
   BM_SOFTMAX_MACRO(N, C, H, W, half, OPENCL);

-BM_SOFTMAX(1, 1, 512, 512);
+BM_SOFTMAX(1, 2, 512, 512);
+BM_SOFTMAX(1, 3, 128, 128);
 BM_SOFTMAX(1, 3, 512, 512);
-BM_SOFTMAX(1, 32, 112, 112);
-BM_SOFTMAX(1, 64, 256, 256);
-BM_SOFTMAX(1, 1024, 7, 7);
+BM_SOFTMAX(1, 4, 512, 512);
+BM_SOFTMAX(1, 10, 256, 256);

 }  // namespace mace
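For reference, the token pasting in BM_SOFTMAX_MACRO fuses the shape and the type/device tags into one benchmark name, so BM_SOFTMAX(1, 2, 512, 512) registers functions such as BM_SOFTMAX_1_2_512_512_float_OPENCL and BM_SOFTMAX_1_2_512_512_half_OPENCL (any further variants are in the elided macro lines). A compilable sketch of just the name pasting, where BM_NAME and its body are illustrative, not the MACE macro:

    #include <cstdint>
    #include <cstdio>

    // Same paste pattern as BM_SOFTMAX_MACRO: every argument becomes
    // part of the generated function's identifier.
    #define BM_NAME(N, C, H, W, TYPE, DEVICE) \
      BM_SOFTMAX_##N##_##C##_##H##_##W##_##TYPE##_##DEVICE

    static void BM_NAME(1, 2, 512, 512, float, OPENCL)(int iters) {
      // Each iteration touches N * C * H * W elements.
      const int64_t tot = static_cast<int64_t>(iters) * 1 * 2 * 512 * 512;
      std::printf("%lld elements\n", static_cast<long long>(tot));
    }

    int main() {
      BM_SOFTMAX_1_2_512_512_float_OPENCL(10);  // the pasted name
      return 0;
    }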