diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 42359f45e9b3cd4eded47aa4ef15efe75bccaf79..360e2ba9e5a504927c072cf4b106c5ba65022172 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -55,11 +55,13 @@ static void AddN(const std::vector &input_tensors, local_ws[1] = std::min(batch_height_pixels, kwg_size / local_ws[0]); return {{local_ws[0], local_ws[1]}, {local_ws[1], local_ws[0]}, + {kwg_size / 4, 4}, {kwg_size / 16, 16}, {kwg_size / 32, 32}, {kwg_size / 64, 64}, {kwg_size / 128, 128}, {kwg_size / 256, 256}, + {kwg_size / 512, 512}, {kwg_size, 1}, {1, kwg_size} }; diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 513d73665f2d16e45393464b7faa7765a73763a6..76b74906b72601dc613827ef8e88d0d5e1a135f8 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -65,9 +65,7 @@ void BatchNormFunctor::operator()( local_ws[0] = std::min(channel_blocks, kwg_size); local_ws[1] = std::min(width, kwg_size / local_ws[0]); local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{8, 128, 1}, //SNPE size - {local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, + return {{local_ws[0], local_ws[1], local_ws[2]}, {kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8}, {kwg_size / 32, 8, 4}, @@ -83,7 +81,9 @@ void BatchNormFunctor::operator()( {7, 15, 9}, {9, 7, 15}, {15, 7, 9}, - {1, kwg_size, 1}}; + {1, kwg_size, 1}, + {8, 128, 1}, //SNPE size + }; }; cl::Event event; auto func = [&](const std::vector ¶ms) -> cl_int { diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 921b34ce0350aac8647986584d3a8f68bceb248c..8f8d38801200c862b433291bf280942222679753 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -55,8 +55,7 @@ static void Concat2(const Tensor *input0, local_ws[0] = std::min(channel_blk, kwg_size); local_ws[1] = std::min(width, kwg_size / local_ws[0]); local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{4, 15, 8}, //SNPE size - {local_ws[0], local_ws[1], local_ws[2]}, + return {{local_ws[0], local_ws[1], local_ws[2]}, {local_ws[2], local_ws[1], local_ws[0]}, {kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8}, @@ -73,7 +72,9 @@ static void Concat2(const Tensor *input0, {7, 15, 9}, {9, 7, 15}, {15, 7, 9}, - {1, kwg_size, 1}}; + {1, kwg_size, 1}, + {4, 15, 8}, //SNPE size + }; }; cl::Event event; auto func = [&](const std::vector ¶ms) -> cl_int { diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 49eea13d6420a066fd7fb3d8bb2cf5ba5fc7a348..e160ce6142be20a21922075b01996e2d369abb6a 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -72,9 +72,7 @@ void Conv1x1(const Tensor *input, local_ws[0] = std::min(channel_blocks, kwg_size); local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{4, 15, 8}, //SNPE size - {local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, + return {{local_ws[0], local_ws[1], local_ws[2]}, {kwg_size/16, 4, 4}, {kwg_size/32, 4, 8}, {kwg_size/32, 8, 4}, @@ -90,7 +88,9 @@ void Conv1x1(const Tensor *input, {7, 15, 9}, {9, 7, 15}, {15, 7, 9}, - {1, kwg_size, 1}}; + {1, kwg_size, 1}, + {4, 15, 8}, //SNPE size + }; }; cl::Event event; auto func = [&](const std::vector& params)->cl_int { diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index d108dea19f22ebdeda897c750b693792d6943d73..e42060527d44700e47b58be0f2b63a23b3a6d990 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -66,8 +66,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, local_ws[0] = std::min(channel_blocks, kwg_size); local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{4, 15, 8}, //SNPE size - {local_ws[0], local_ws[1], local_ws[2]}, + return {{local_ws[0], local_ws[1], local_ws[2]}, {local_ws[2], local_ws[1], local_ws[0]}, {kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8}, @@ -84,7 +83,9 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, {7, 15, 9}, {9, 7, 15}, {15, 7, 9}, - {1, kwg_size, 1}}; + {1, kwg_size, 1}, + {4, 15, 8}, //SNPE size + }; }; cl::Event event; auto func = [&](const std::vector ¶ms) -> cl_int { diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 89026e83b489a27bf894496adcec907e2597cb5a..6b6746f37cefd1c10800d300621de08ddf3dedf5 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -68,8 +68,7 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter, local_ws[0] = std::min(channel_blocks, kwg_size); local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{4, 15, 8}, //SNPE size - {local_ws[0], local_ws[1], local_ws[2]}, + return {{local_ws[0], local_ws[1], local_ws[2]}, {local_ws[2], local_ws[1], local_ws[0]}, {kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8}, @@ -86,7 +85,9 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter, {7, 15, 9}, {9, 7, 15}, {15, 7, 9}, - {1, kwg_size, 1}}; + {1, kwg_size, 1}, + {4, 15, 8}, //SNPE size + }; }; cl::Event event; auto func = [&](const std::vector ¶ms) -> cl_int { diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 6835af69df39236ce545c743410cc9fcf81a0258..0d1676337e26897b3e574085bb64212525edd66f 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -67,9 +67,7 @@ static void Pooling(const Tensor *input, local_ws[0] = std::min(channel_blocks, kwg_size); local_ws[1] = std::min(out_width, kwg_size / local_ws[0]); local_ws[2] = std::min(out_height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{4, 15, 8}, //SNPE size - {local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, + return {{local_ws[0], local_ws[1], local_ws[2]}, {kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8}, {kwg_size / 32, 8, 4}, @@ -85,7 +83,9 @@ static void Pooling(const Tensor *input, {7, 15, 9}, {9, 7, 15}, {15, 7, 9}, - {1, kwg_size, 1}}; + {1, kwg_size, 1}, + {4, 15, 8}, //SNPE size + }; }; cl::Event event; auto func = [&](const std::vector ¶ms) -> cl_int { diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc index 831197f132d0afd8dd754c734ec881850d7c1eb7..180b7317b01a6217c0b37264aa8d3ecc757a1592 100644 --- a/mace/kernels/opencl/relu_opencl.cc +++ b/mace/kernels/opencl/relu_opencl.cc @@ -55,9 +55,7 @@ void ReluFunctor::operator()(const Tensor *input, local_ws[0] = std::min(channel_blocks, kwg_size); local_ws[1] = std::min(width, kwg_size / local_ws[0]); local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{4, 15, 8}, //SNPE size - {local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, + return {{local_ws[0], local_ws[1], local_ws[2]}, {kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8}, {kwg_size / 32, 8, 4}, @@ -73,7 +71,9 @@ void ReluFunctor::operator()(const Tensor *input, {7, 15, 9}, {9, 7, 15}, {15, 7, 9}, - {1, kwg_size, 1}}; + {1, kwg_size, 1}, + {4, 15, 8}, //SNPE size + }; }; cl::Event event; auto func = [&](const std::vector ¶ms) -> cl_int { diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index 7d3af2233b4ae70044c687187434711072544531..3496a56332c02eb205eff48b14a4a3060d2c1f94 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -64,9 +64,7 @@ void ResizeBilinearFunctor::operator()( local_ws[0] = std::min(channel_blocks, kwg_size); local_ws[1] = std::min(out_width, kwg_size / local_ws[0]); local_ws[2] = std::min(out_height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{4, 15, 8}, //SNPE size - {local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, + return {{local_ws[0], local_ws[1], local_ws[2]}, {kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8}, {kwg_size / 32, 8, 4}, @@ -78,7 +76,9 @@ void ResizeBilinearFunctor::operator()( {1, kwg_size / 32, 32}, {1, kwg_size / 64, 64}, {1, kwg_size / 128, 128}, - {1, kwg_size, 1}}; + {1, kwg_size, 1}, + {4, 15, 8}, //SNPE size + }; }; cl::Event event; auto func = [&](const std::vector ¶ms) -> cl_int { diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc index 2d5293188f6b67ab48c8b6e09c233c9300350fb9..41fb6e9e96a2385288b37650b3882a93aa4d26b8 100644 --- a/mace/ops/addn_benchmark.cc +++ b/mace/ops/addn_benchmark.cc @@ -69,9 +69,12 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) { BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \ BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL); -BM_ADDN(2, 1, 240, 240, 256, float); +BM_ADDN(2, 1, 256, 256, 32, float); +BM_ADDN(2, 1, 128, 128, 32, float); // BM_ADDN(2, 1, 240, 240, 256, half); -BM_ADDN(4, 1, 240, 240, 256, float); +BM_ADDN(4, 1, 128, 128, 3, float); +BM_ADDN(2, 1, 256, 256, 3, float); +BM_ADDN(2, 1, 512, 512, 3, float); // BM_ADDN(4, 1, 240, 240, 256, half); } // namespace mace diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index 9404a56575483ef11264268a9210363a7b775d98..6296934dbe310fec2baa4f79da468bd5f187a40e 100644 --- a/mace/utils/tuner.h +++ b/mace/utils/tuner.h @@ -46,6 +46,8 @@ class Tuner { // tune std::vector opt_param = default_param; RetType res = Tune(param_generator, func, timer, &opt_param); + VLOG(1) << "Tuning result. " + << param_key << ": " << internal::MakeString(opt_param); param_table_[param_key] = opt_param; return res; } else { diff --git a/mace/utils/tuner_test.cc b/mace/utils/tuner_test.cc index fae12c91aa160bc8fa7ecdcc66070316cdf71ff5..0e57e929759aca6f23642ab96c158ef48baf87d1 100644 --- a/mace/utils/tuner_test.cc +++ b/mace/utils/tuner_test.cc @@ -9,10 +9,10 @@ namespace mace { -class TunerTest: public ::testing::Test { +class TunerTest : public ::testing::Test { protected: virtual void SetUp() { - remove( "/data/local/tmp/mace.config" ); + remove("/data/local/tmp/mace.config"); setenv("MACE_RUN_PARAMETER_PATH", "/data/local/tmp/mace.config", 1); setenv("MACE_TUNING", "1", 1); } @@ -20,7 +20,7 @@ class TunerTest: public ::testing::Test { TEST_F(TunerTest, SimpleRun) { int expect = 1; - auto TunerFunc = [&](const std::vector& params)->int { + auto TunerFunc = [&](const std::vector ¶ms) -> int { if (params.front() == 1) { return expect; } else { @@ -29,19 +29,27 @@ TEST_F(TunerTest, SimpleRun) { }; WallClockTimer timer; - std::vector default_params(1, 1); - int res = Tuner::Get()->template TuneOrRun("SimpleRun", default_params, nullptr, TunerFunc, &timer); + std::vector default_params(1, 1); + int res = Tuner::Get()->template TuneOrRun("SimpleRun", + default_params, + nullptr, + TunerFunc, + &timer); EXPECT_EQ(expect, res); default_params[0] = 2; - res = Tuner::Get()->template TuneOrRun("SimpleRun", default_params, nullptr, TunerFunc, &timer); - EXPECT_EQ(expect+1, res); + res = Tuner::Get()->template TuneOrRun("SimpleRun", + default_params, + nullptr, + TunerFunc, + &timer); + EXPECT_EQ(expect + 1, res); } TEST_F(TunerTest, SimpleTune) { int expect = 3; - auto TunerFunc = [&](const std::vector& params)->int { + auto TunerFunc = [&](const std::vector ¶ms) -> int { if (params.front() == expect) { return expect; } else { @@ -50,17 +58,26 @@ TEST_F(TunerTest, SimpleTune) { } }; - std::vector default_params(1, 1); - auto params_generator = []()->std::vector> { + std::vector default_params(1, 1); + auto params_generator = []() -> std::vector> { return {{1}, {2}, {3}, {4}}; }; // tune WallClockTimer timer; - int res = Tuner::Get()->template TuneOrRun("SimpleRun", default_params, *params_generator, TunerFunc, &timer); + int res = + Tuner::Get()->template TuneOrRun("SimpleRun", + default_params, + *params_generator, + TunerFunc, + &timer); EXPECT_EQ(expect, res); // run - res = Tuner::Get()->template TuneOrRun("SimpleRun", default_params, nullptr, TunerFunc, &timer); + res = Tuner::Get()->template TuneOrRun("SimpleRun", + default_params, + nullptr, + TunerFunc, + &timer); EXPECT_EQ(expect, res); } diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh index 88ed9d7b76b0a8891e8753ab8167d34a9acfef2c..8e54739dd0b063b950fb8ac1b2ace78bd6b27d79 100755 --- a/tools/validate_gcn.sh +++ b/tools/validate_gcn.sh @@ -39,6 +39,15 @@ build_and_run() PRODUCTION_MODE_BUILD_FLAGS="--define production=true" fi + if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != true ]];then + TUNING_MODE_BUILD_FLAGS="--define profiling=true" + tuning_flag=1 + round=0 # only warm up + else + tuning_flag=0 + round=100 + fi + bazel build --verbose_failures -c opt --strip always mace/examples:mace_run \ --crosstool_top=//external:android/crosstool \ --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ @@ -47,7 +56,8 @@ build_and_run() --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \ --copt="-Werror=return-type" \ --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \ - $PRODUCTION_MODE_BUILD_FLAGS || exit -1 + $PRODUCTION_MODE_BUILD_FLAGS \ + $TUNING_MODE_BUILD_FLAGS || exit -1 adb shell "mkdir -p ${PHONE_DATA_DIR}" || exit -1 if [ "$PRODUCTION_MODE" = false ]; then @@ -56,14 +66,6 @@ build_and_run() adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} || exit -1 adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR} || exit -1 - if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != true ]];then - tuning_flag=1 - round=0 # only warm up - else - tuning_flag=0 - round=2 - fi - adb