diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 42359f45e9b3cd4eded47aa4ef15efe75bccaf79..360e2ba9e5a504927c072cf4b106c5ba65022172 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -55,11 +55,13 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
     local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
     return {{local_ws[0], local_ws[1]},
             {local_ws[1], local_ws[0]},
+            {kwg_size / 4, 4},
             {kwg_size / 16, 16},
             {kwg_size / 32, 32},
             {kwg_size / 64, 64},
             {kwg_size / 128, 128},
             {kwg_size / 256, 256},
+            {kwg_size / 512, 512},
             {kwg_size, 1},
             {1, kwg_size}
     };
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 513d73665f2d16e45393464b7faa7765a73763a6..76b74906b72601dc613827ef8e88d0d5e1a135f8 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -65,9 +65,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{8, 128, 1}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
@@ -83,7 +81,9 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
             {7, 15, 9},
             {9, 7, 15},
             {15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {8, 128, 1}, //SNPE size
+    };
   };
   cl::Event event;
   auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 921b34ce0350aac8647986584d3a8f68bceb248c..8f8d38801200c862b433291bf280942222679753 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -55,8 +55,7 @@ static void Concat2(const Tensor *input0,
     local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
     local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
             {local_ws[2], local_ws[1], local_ws[0]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
@@ -73,7 +72,9 @@ static void Concat2(const Tensor *input0,
             {7, 15, 9},
             {9, 7, 15},
             {15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
   };
   cl::Event event;
   auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 49eea13d6420a066fd7fb3d8bb2cf5ba5fc7a348..e160ce6142be20a21922075b01996e2d369abb6a 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -72,9 +72,7 @@ void Conv1x1(const Tensor *input,
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size/16, 4, 4},
             {kwg_size/32, 4, 8},
             {kwg_size/32, 8, 4},
@@ -90,7 +88,9 @@ void Conv1x1(const Tensor *input,
             {7, 15, 9},
             {9, 7, 15},
             {15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
   };
   cl::Event event;
   auto func = [&](const std::vector<uint32_t>& params)->cl_int {
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index d108dea19f22ebdeda897c750b693792d6943d73..e42060527d44700e47b58be0f2b63a23b3a6d990 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -66,8 +66,7 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
             {local_ws[2], local_ws[1], local_ws[0]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
@@ -84,7 +83,9 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
             {7, 15, 9},
             {9, 7, 15},
             {15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
   };
   cl::Event event;
   auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index 89026e83b489a27bf894496adcec907e2597cb5a..6b6746f37cefd1c10800d300621de08ddf3dedf5 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -68,8 +68,7 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
             {local_ws[2], local_ws[1], local_ws[0]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
@@ -86,7 +85,9 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
             {7, 15, 9},
             {9, 7, 15},
             {15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
   };
   cl::Event event;
   auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 6835af69df39236ce545c743410cc9fcf81a0258..0d1676337e26897b3e574085bb64212525edd66f 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -67,9 +67,7 @@ static void Pooling(const Tensor *input,
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
@@ -85,7 +83,9 @@ static void Pooling(const Tensor *input,
             {7, 15, 9},
             {9, 7, 15},
             {15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
   };
   cl::Event event;
   auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc
index 831197f132d0afd8dd754c734ec881850d7c1eb7..180b7317b01a6217c0b37264aa8d3ecc757a1592 100644
--- a/mace/kernels/opencl/relu_opencl.cc
+++ b/mace/kernels/opencl/relu_opencl.cc
@@ -55,9 +55,7 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
@@ -73,7 +71,9 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
             {7, 15, 9},
             {9, 7, 15},
             {15, 7, 9},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
   };
   cl::Event event;
   auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 7d3af2233b4ae70044c687187434711072544531..3496a56332c02eb205eff48b14a4a3060d2c1f94 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -64,9 +64,7 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8}, //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
+    return {{local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
@@ -78,7 +76,9 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
             {1, kwg_size / 32, 32},
             {1, kwg_size / 64, 64},
             {1, kwg_size / 128, 128},
-            {1, kwg_size, 1}};
+            {1, kwg_size, 1},
+            {4, 15, 8}, //SNPE size
+    };
   };
   cl::Event event;
   auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
diff --git a/mace/ops/addn_benchmark.cc b/mace/ops/addn_benchmark.cc
index 2d5293188f6b67ab48c8b6e09c233c9300350fb9..41fb6e9e96a2385288b37650b3882a93aa4d26b8 100644
--- a/mace/ops/addn_benchmark.cc
+++ b/mace/ops/addn_benchmark.cc
@@ -69,9 +69,12 @@ static void AddNBenchmark(int iters, int inputs, int n, int h, int w, int c) {
   BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, CPU); \
   BM_ADDN_MACRO(INPUTS, N, H, W, C, TYPE, OPENCL);
 
-BM_ADDN(2, 1, 240, 240, 256, float);
+BM_ADDN(2, 1, 256, 256, 32, float);
+BM_ADDN(2, 1, 128, 128, 32, float);
 // BM_ADDN(2, 1, 240, 240, 256, half);
-BM_ADDN(4, 1, 240, 240, 256, float);
+BM_ADDN(4, 1, 128, 128, 3, float);
+BM_ADDN(2, 1, 256, 256, 3, float);
+BM_ADDN(2, 1, 512, 512, 3, float);
 // BM_ADDN(4, 1, 240, 240, 256, half);
 
 }  //  namespace mace
diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h
index 9404a56575483ef11264268a9210363a7b775d98..6296934dbe310fec2baa4f79da468bd5f187a40e 100644
--- a/mace/utils/tuner.h
+++ b/mace/utils/tuner.h
@@ -46,6 +46,8 @@ class Tuner {
       // tune
       std::vector<param_type> opt_param = default_param;
       RetType res = Tune<RetType>(param_generator, func, timer, &opt_param);
+      VLOG(1) << "Tuning result. "
+              << param_key << ": " << internal::MakeString(opt_param);
       param_table_[param_key] = opt_param;
       return res;
     } else {
diff --git a/mace/utils/tuner_test.cc b/mace/utils/tuner_test.cc
index fae12c91aa160bc8fa7ecdcc66070316cdf71ff5..0e57e929759aca6f23642ab96c158ef48baf87d1 100644
--- a/mace/utils/tuner_test.cc
+++ b/mace/utils/tuner_test.cc
@@ -9,10 +9,10 @@
 
 namespace mace {
 
-class TunerTest: public ::testing::Test {
+class TunerTest : public ::testing::Test {
  protected:
   virtual void SetUp() {
-    remove( "/data/local/tmp/mace.config" );
+    remove("/data/local/tmp/mace.config");
     setenv("MACE_RUN_PARAMETER_PATH", "/data/local/tmp/mace.config", 1);
     setenv("MACE_TUNING", "1", 1);
   }
@@ -20,7 +20,7 @@ class TunerTest: public ::testing::Test {
 
 TEST_F(TunerTest, SimpleRun) {
   int expect = 1;
-  auto TunerFunc = [&](const std::vector<int>& params)->int {
+  auto TunerFunc = [&](const std::vector<unsigned int> &params) -> int {
     if (params.front() == 1) {
       return expect;
     } else {
@@ -29,19 +29,27 @@ TEST_F(TunerTest, SimpleRun) {
   };
 
   WallClockTimer timer;
-  std::vector<int> default_params(1, 1);
-  int res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
+  std::vector<unsigned int> default_params(1, 1);
+  int res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                         default_params,
+                                                                         nullptr,
+                                                                         TunerFunc,
+                                                                         &timer);
 
   EXPECT_EQ(expect, res);
 
   default_params[0] = 2;
-  res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
-  EXPECT_EQ(expect+1, res);
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                     default_params,
+                                                                     nullptr,
+                                                                     TunerFunc,
+                                                                     &timer);
+  EXPECT_EQ(expect + 1, res);
 }
 
 TEST_F(TunerTest, SimpleTune) {
   int expect = 3;
-  auto TunerFunc = [&](const std::vector<int>& params)->int {
+  auto TunerFunc = [&](const std::vector<unsigned int> &params) -> int {
     if (params.front() == expect) {
       return expect;
     } else {
@@ -50,17 +58,26 @@ TEST_F(TunerTest, SimpleTune) {
     }
   };
 
-  std::vector<int> default_params(1, 1);
-  auto params_generator = []()->std::vector<std::vector<int>> {
+  std::vector<unsigned int> default_params(1, 1);
+  auto params_generator = []() -> std::vector<std::vector<unsigned int>> {
     return {{1}, {2}, {3}, {4}};
   };
   // tune
   WallClockTimer timer;
-  int res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, *params_generator, TunerFunc, &timer);
+  int res =
+      Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                   default_params,
+                                                                   *params_generator,
+                                                                   TunerFunc,
+                                                                   &timer);
   EXPECT_EQ(expect, res);
 
   // run
-  res = Tuner<int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc, &timer);
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<unsigned int>("SimpleRun",
+                                                                     default_params,
+                                                                     nullptr,
+                                                                     TunerFunc,
+                                                                     &timer);
   EXPECT_EQ(expect, res);
 }
 
diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh
index 88ed9d7b76b0a8891e8753ab8167d34a9acfef2c..8e54739dd0b063b950fb8ac1b2ace78bd6b27d79 100755
--- a/tools/validate_gcn.sh
+++ b/tools/validate_gcn.sh
@@ -39,6 +39,15 @@ build_and_run()
     PRODUCTION_MODE_BUILD_FLAGS="--define production=true"
   fi
 
+  TUNING_MODE_BUILD_FLAGS=""  # clear any value left by an earlier call
+  tuning_flag=0
+  round=100
+  if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != true ]];then
+    TUNING_MODE_BUILD_FLAGS="--define profiling=true"
+    tuning_flag=1
+    round=0 # only warm up
+  fi
+
   bazel build --verbose_failures -c opt --strip always mace/examples:mace_run \
     --crosstool_top=//external:android/crosstool \
     --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
@@ -47,7 +56,8 @@ build_and_run()
     --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
     --copt="-Werror=return-type" \
     --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
-    $PRODUCTION_MODE_BUILD_FLAGS  || exit -1
+    $PRODUCTION_MODE_BUILD_FLAGS \
+    $TUNING_MODE_BUILD_FLAGS || exit -1
 
   adb shell "mkdir -p ${PHONE_DATA_DIR}" || exit -1
   if [ "$PRODUCTION_MODE" = false ]; then
@@ -56,14 +66,6 @@ build_and_run()
   adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} || exit -1
   adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR} || exit -1
 
-  if [[ "${TUNING_OR_NOT}" != "0" && "$PRODUCTION_MODE" != true ]];then
-    tuning_flag=1
-    round=0 # only warm up
-  else
-    tuning_flag=0
-    round=2
-  fi
-
   adb </dev/null shell MACE_TUNING=${tuning_flag} \
     MACE_CPP_MIN_VLOG_LEVEL=$VLOG_LEVEL \
     MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \