Commit 76af1f81 authored by liuqi

Fix tuning bug: add profiling=true in validate_gcn.sh.

Parent a038d23f
@@ -55,11 +55,13 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
return {{local_ws[0], local_ws[1]},
{local_ws[1], local_ws[0]},
{kwg_size / 4, 4},
{kwg_size / 16, 16},
{kwg_size / 32, 32},
{kwg_size / 64, 64},
{kwg_size / 128, 128},
{kwg_size / 256, 256},
{kwg_size / 512, 512},
{kwg_size, 1},
{1, kwg_size}
};
......
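Across every kernel touched by this commit the candidate list follows the same recipe: the clamped `local_ws` computed from the tensor shape comes first, followed by fixed power-of-two splits of the kernel's maximum work-group size. A minimal standalone sketch of the 2D variant used by AddN (hedged: `BuildCandidates2D` is a hypothetical helper name, and the `local_ws[0]` computation is assumed from surrounding code this hunk does not show):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical helper mirroring the AddN candidate list above: the clamped
// local_ws pair first, then power-of-two splits of the max work-group size.
// Candidates with a zero dimension (very small kwg_size) are assumed to be
// rejected by the tuner before launch.
std::vector<std::vector<uint32_t>> BuildCandidates2D(
    uint32_t width_pixels, uint32_t batch_height_pixels, uint32_t kwg_size) {
  std::vector<uint32_t> local_ws(2, 1);
  local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
  local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
  return {{local_ws[0], local_ws[1]},
          {local_ws[1], local_ws[0]},
          {kwg_size / 4, 4},
          {kwg_size / 16, 16},
          {kwg_size / 32, 32},
          {kwg_size / 64, 64},
          {kwg_size / 128, 128},
          {kwg_size / 256, 256},
          {kwg_size / 512, 512},
          {kwg_size, 1},
          {1, kwg_size}};
}
```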
@@ -65,9 +65,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{8, 128, 1}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
@@ -83,7 +81,9 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {8, 128, 1}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
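All of the 3D kernels below apply the same reorder: the hand-picked SNPE work-group size moves from the head of the candidate list to the tail, so the shape-derived `local_ws` is tried first. Each candidate feeds a timed launch lambda like the `func` shown above; a simplified, self-contained sketch of that hook (assumptions: the OpenCL C++ wrapper from CL/cl.hpp, and `RunWithParams` is a hypothetical name for what is an inline lambda in the real code):

```cpp
#include <CL/cl.hpp>
#include <cstdint>
#include <vector>

// Launch the kernel with one candidate local work-group size, wait for
// completion so the tuner's timer brackets the whole execution, and
// surface the OpenCL error code so invalid candidates can be skipped.
cl_int RunWithParams(cl::CommandQueue &queue, cl::Kernel &kernel,
                     const std::vector<uint32_t> &gws,
                     const std::vector<uint32_t> &params) {
  cl::Event event;
  cl_int error = queue.enqueueNDRangeKernel(
      kernel, cl::NullRange,
      cl::NDRange(gws[0], gws[1], gws[2]),
      cl::NDRange(params[0], params[1], params[2]),
      nullptr, &event);
  if (error == CL_SUCCESS) {
    event.wait();
  }
  return error;
}
```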
@@ -55,8 +55,7 @@ static void Concat2(const Tensor *input0,
local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{local_ws[2], local_ws[1], local_ws[0]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
@@ -73,7 +72,9 @@ static void Concat2(const Tensor *input0,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -72,9 +72,7 @@ void Conv1x1(const Tensor *input,
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size/16, 4, 4},
{kwg_size/32, 4, 8},
{kwg_size/32, 8, 4},
@@ -90,7 +88,9 @@ void Conv1x1(const Tensor *input,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t>& params)->cl_int {
......
@@ -66,8 +66,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{local_ws[2], local_ws[1], local_ws[0]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
@@ -84,7 +83,9 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -68,8 +68,7 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{local_ws[2], local_ws[1], local_ws[0]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
@@ -86,7 +85,9 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -67,9 +67,7 @@ static void Pooling(const Tensor *input,
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
@@ -85,7 +83,9 @@ static void Pooling(const Tensor *input,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -55,9 +55,7 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
@@ -73,7 +71,9 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -64,9 +64,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
@@ -78,7 +76,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
......
@@ -69,9 +69,12 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
BM_ADDN(2, 1, 240, 240, 256, float);
BM_ADDN(2, 1, 256, 256, 32, float);
BM_ADDN(2, 1, 128, 128, 32, float);
// BM_ADDN(2, 1, 240, 240, 256, half);
BM_ADDN(4, 1, 240, 240, 256, float);
BM_ADDN(4, 1, 128, 128, 3, float);
BM_ADDN(2, 1, 256, 256, 3, float);
BM_ADDN(2, 1, 512, 512, 3, float);
// BM_ADDN(4, 1, 240, 240, 256, half);
} // namespace mace
@@ -46,6 +46,8 @@ class Tuner {
// tune
std::vector<param_type> opt_param = default_param;
RetType res = Tune<RetType>(param_generator, func, timer, &opt_param);
+      VLOG(1) << "Tuning result. "
+              << param_key << ": " << internal::MakeString(opt_param);
param_table_[param_key] = opt_param;
return res;
} else {
......
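For context, the `TuneOrRun` entry point that gains the VLOG line roughly behaves as follows. This is a hedged reconstruction, not the actual implementation: only the tune branch is visible in the hunk, the run branch (table lookup with fallback to the default parameters) is an assumption, and the timing here uses std::chrono where the real code takes a `Timer *`:

```cpp
#include <chrono>
#include <functional>
#include <limits>
#include <map>
#include <string>
#include <vector>

// Sketch: when tuning, time every candidate and record the fastest under
// param_key; otherwise run with the previously tuned (or default) params.
template <typename param_type, typename RetType>
RetType TuneOrRunSketch(
    const std::string &param_key,
    const std::vector<param_type> &default_param,
    const std::function<std::vector<std::vector<param_type>>()> &param_generator,
    const std::function<RetType(const std::vector<param_type> &)> &func,
    std::map<std::string, std::vector<param_type>> *param_table,
    bool is_tuning) {
  if (is_tuning && param_generator != nullptr) {
    std::vector<param_type> opt_param = default_param;
    double best_time = std::numeric_limits<double>::max();
    RetType best_res{};
    for (const auto &candidate : param_generator()) {
      auto start = std::chrono::steady_clock::now();
      RetType res = func(candidate);
      double elapsed = std::chrono::duration<double>(
                           std::chrono::steady_clock::now() - start).count();
      if (elapsed < best_time) {  // keep only the fastest candidate
        best_time = elapsed;
        opt_param = candidate;
        best_res = res;
      }
    }
    (*param_table)[param_key] = opt_param;  // what the new VLOG reports
    return best_res;
  }
  // Run mode: use tuned parameters when present, else the default.
  auto iter = param_table->find(param_key);
  return func(iter != param_table->end() ? iter->second : default_param);
}
```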
@@ -9,10 +9,10 @@
namespace mace {
-class TunerTest: public ::testing::Test {
+class TunerTest : public ::testing::Test {
protected:
virtual void SetUp() {
remove( "/data/local/tmp/mace.config" );
remove("/data/local/tmp/mace.config");
setenv("MACE_RUN_PARAMETER_PATH", "/data/local/tmp/mace.config", 1);
setenv("MACE_TUNING", "1", 1);
}
@@ -20,7 +20,7 @@ class TunerTest: public ::testing::Test {
TEST_F(TunerTest, SimpleRun) {
int expect = 1;
-  auto TunerFunc = [&](const std::vector<int>& params)->int {
+  auto TunerFunc = [&](const std::vector<unsigned int> &params) -> int {
if (params.front() == 1) {
return expect;
} else {
@@ -29,19 +29,27 @@ TEST_F(TunerTest, SimpleRun) {
};
WallClockTimer timer;
-  std::vector<int> default_params(1, 1);
-  int res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
+  std::vector<unsigned int> default_params(1, 1);
+  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                         default_params,
+                                                                         nullptr,
+                                                                         TunerFunc,
+                                                                         &timer);
EXPECT_EQ(expect, res);
default_params[0] = 2;
-  res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
-  EXPECT_EQ(expect+1, res);
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                     default_params,
+                                                                     nullptr,
+                                                                     TunerFunc,
+                                                                     &timer);
+  EXPECT_EQ(expect + 1, res);
}
TEST_F(TunerTest, SimpleTune) {
int expect = 3;
-  auto TunerFunc = [&](const std::vector<int>& params)->int {
+  auto TunerFunc = [&](const std::vector<unsigned int> &params) -> int {
if (params.front() == expect) {
return expect;
} else {
@@ -50,17 +58,26 @@ TEST_F(TunerTest, SimpleTune) {
}
};
-  std::vector<int> default_params(1, 1);
-  auto params_generator = []()->std::vector<std::vector<int>> {
+  std::vector<unsigned int> default_params(1, 1);
+  auto params_generator = []() -> std::vector<std::vector<unsigned int>> {
return {{1}, {2}, {3}, {4}};
};
// tune
WallClockTimer timer;
-  int res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, *params_generator, TunerFunc, &timer);
+  int res =
+      Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                   default_params,
+                                                                   *params_generator,
+                                                                   TunerFunc,
+                                                                   &timer);
EXPECT_EQ(expect, res);
// run
-  res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                     default_params,
+                                                                     nullptr,
+                                                                     TunerFunc,
+                                                                     &timer);
EXPECT_EQ(expect, res);
}
......
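The fixture above wires the tuner to its two environment switches before each test. A hedged sketch of how those variables are plausibly consumed (the accessor names `IsTuning` and `ParameterFilePath` are assumptions, not confirmed API):

```cpp
#include <cstdlib>
#include <string>

// MACE_TUNING=1 flips TuneOrRun from lookup mode into tuning mode.
bool IsTuning() {
  const char *tuning = std::getenv("MACE_TUNING");
  return tuning != nullptr && std::string(tuning) == "1";
}

// MACE_RUN_PARAMETER_PATH names the file where tuned parameters persist
// between runs; the tests point it at /data/local/tmp/mace.config and
// delete that file in SetUp() for a clean slate.
std::string ParameterFilePath() {
  const char *path = std::getenv("MACE_RUN_PARAMETER_PATH");
  return path != nullptr ? std::string(path) : "";
}
```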
@@ -39,6 +39,15 @@ build_and_run()
PRODUCTION_MODE_BUILD_FLAGS="--define production=true"
fi
if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != true ]];then
TUNING_MODE_BUILD_FLAGS="--define profiling=true"
tuning_flag=1
round=0 # only warm up
else
tuning_flag=0
round=100
fi
bazel build --verbose_failures -c opt --strip always mace/examples:mace_run \
--crosstool_top=//external:android/crosstool \
--host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
@@ -47,7 +56,8 @@ build_and_run()
--copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
--copt="-Werror=return-type" \
--copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
-    $PRODUCTION_MODE_BUILD_FLAGS || exit -1
+    $PRODUCTION_MODE_BUILD_FLAGS \
+    $TUNING_MODE_BUILD_FLAGS || exit -1
adb shell "mkdir -p ${PHONE_DATA_DIR}" || exit -1
if [ "$PRODUCTION_MODE" = false ]; then
@@ -56,14 +66,6 @@ build_and_run()
adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} || exit -1
adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR} || exit -1
if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != true ]];then
tuning_flag=1
round=0 # only warm up
else
tuning_flag=0
round=2
fi
adb </dev/null shell MACE_TUNING=${tuning_flag} \
MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
......
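This relocation is the heart of the fix: in the old script the tuning check ran only after `bazel build`, so `TUNING_MODE_BUILD_FLAGS` was never set at build time and `--define profiling=true` was never passed, leaving the profiling support that tuning depends on out of the binary. Moving the block ahead of the build and threading `$TUNING_MODE_BUILD_FLAGS` into the bazel invocation fixes that; the non-tuning round count also changes from 2 to 100.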