as350144 / Mace (forked from Xiaomi / Mace)

Commit b1397592
Authored Dec 12, 2017 by yejianwu

fix conflix

Parents: 79d940af, ecef3596

Showing 15 changed files with 626 additions and 357 deletions (+626, -357)
mace/kernels/opencl/addn.cc                    +42   -8
mace/kernels/opencl/batch_norm_opencl.cc        +6   -1
mace/kernels/opencl/concat.cc                  +50  -13
mace/kernels/opencl/conv_2d_opencl_1x1.cc       +6   -1
mace/kernels/opencl/conv_2d_opencl_3x3.cc       +6   -1
mace/kernels/opencl/conv_2d_opencl_general.cc   +6   -1
mace/kernels/opencl/pooling_opencl.cc          +55  -18
mace/kernels/opencl/relu_opencl.cc              +6   -1
mace/kernels/opencl/resize_bilinear_opencl.cc  +46  -12
mace/python/tools/BUILD                         +1   -0
mace/python/tools/memory_optimizer.py           +4  -17
mace/python/tools/tf_converter_lib.py         +379 -268
mace/python/tools/tf_dsp_converter_lib.py       +1   -0
tools/validate.py                               +6   -2
tools/validate_gcn.sh                          +12  -14
mace/kernels/opencl/addn.cc

@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -33,8 +34,6 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
   built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
   auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);
 
-  const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
-
   uint32_t idx = 0;
   for (auto input : input_tensors) {
     addn_kernel.setArg(idx++,
@@ -42,12 +41,47 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
   }
   addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
 
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      addn_kernel, cl::NullRange,
-      cl::NDRange(width_pixels, batch_height_pixels),
-      cl::NDRange(64, 16),  // TODO fix this
-      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
+  const uint32_t gws[2] = {static_cast<uint32_t>(width_pixels),
+                           static_cast<uint32_t>(batch_height_pixels)};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
+  std::vector<uint32_t> lws = {64, 16};
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    uint32_t local_ws[2];
+    local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
+    local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
+    return {{local_ws[0], local_ws[1]},
+            {kwg_size / 16, 16},
+            {kwg_size / 32, 32},
+            {kwg_size / 64, 64},
+            {kwg_size / 128, 128},
+            {kwg_size / 256, 256},
+            {kwg_size, 1},
+            {1, kwg_size}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        addn_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1]),
+        cl::NDRange(params[0], params[1]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "addn_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
+     << "_" << output->dim(2) << "_" << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(), lws,
+                                                     params_generator, func);
 }
 
 template <typename T>
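Every kernel touched by this commit follows the same recipe, first visible here in addn.cc: compute the global work size gws, query the kernel's maximum work-group size kwg_size, enumerate candidate local work sizes in a params_generator lambda, wrap the actual enqueueNDRangeKernel call in a func lambda parameterized by one candidate, and hand all of it to Tuner<uint32_t>::Get()->TuneOrRun<cl_int>() under a shape-specific cache key. The Tuner implementation itself is not part of this diff; what follows is a minimal standalone sketch of the tune-or-run idea, where TuneOrRunSketch and its signature are names invented for this note, not MACE API.

#include <chrono>
#include <cstdint>
#include <functional>
#include <limits>
#include <vector>

// Run `func` once per candidate local work size, timing each run and
// remembering the fastest candidate; this is the core of what a
// tune-or-run helper automates (MACE's real Tuner also persists results).
template <typename RetType>
RetType TuneOrRunSketch(
    const std::vector<std::vector<uint32_t>> &candidates,
    const std::function<RetType(const std::vector<uint32_t> &)> &func,
    std::vector<uint32_t> *best_params) {
  double best_seconds = std::numeric_limits<double>::max();
  RetType last_ret{};
  for (const auto &params : candidates) {
    auto start = std::chrono::steady_clock::now();
    last_ret = func(params);  // e.g. enqueue the kernel with this local size
    double elapsed = std::chrono::duration<double>(
        std::chrono::steady_clock::now() - start).count();
    if (elapsed < best_seconds) {
      best_seconds = elapsed;
      *best_params = params;
    }
  }
  return last_ret;
}

With the lambdas from the diff above, a call would look like TuneOrRunSketch<cl_int>(params_generator(), func, &best): each candidate is timed once and the fastest local work size wins, which is exactly the decision the old hardcoded cl::NDRange(64, 16) and its "// TODO fix this" comment used to sidestep.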
mace/kernels/opencl/batch_norm_opencl.cc

@@ -48,8 +48,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{8, 128, 1},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
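All of the 3-D kernels build their first derived candidate the same way: fill each dimension greedily while keeping the product within the device limit. As a worked example with assumed numbers (not taken from the diff): if kwg_size = 512, channel_blocks = 16, width = 64, and height * batch = 128, then local_ws[0] = min(16, 512) = 16, local_ws[1] = min(64, 512 / 16) = 32, and local_ws[2] = min(128, 512 / (16 * 32)) = 1, giving {16, 32, 1} whose product is exactly the maximum work-group size.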
mace/kernels/opencl/concat.cc

@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -41,21 +42,57 @@ static void Concat2(const Tensor *input0,
   concat_kernel.setArg(idx++, static_cast<int32_t>(input0->dim(3)));
   concat_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
 
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
-  uint32_t lws[3] = {8, 16, 8};
-//  lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
-//  lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
-//  lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      concat_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(channel_blk),
-                  static_cast<uint32_t>(width),
-                  static_cast<uint32_t>(height * batch)),
-      cl::NDRange(lws[0], lws[1], lws[2]),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS);
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blk),
+      static_cast<uint32_t>(width),
+      static_cast<uint32_t>(batch * height),
+  };
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
+  std::vector<uint32_t> lws = {8, 16, 8};
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        concat_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
+     << "_" << output->dim(2) << "_" << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(), lws,
+                                                     params_generator, func);
 }
 
 template <typename T>
mace/kernels/opencl/conv_2d_opencl_1x1.cc

@@ -68,8 +68,13 @@ void Conv1x1(const Tensor *input,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 15, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
mace/kernels/opencl/conv_2d_opencl_3x3.cc

@@ -60,8 +60,13 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {4, 15, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
mace/kernels/opencl/conv_2d_opencl_general.cc

@@ -62,8 +62,13 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
mace/kernels/opencl/pooling_opencl.cc

@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -23,11 +24,6 @@ static void Pooling(const Tensor *input,
   index_t channels = output->dim(3);
   index_t channel_blocks = (channels + 3) / 4;
 
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blocks),
-      static_cast<uint32_t>(out_width),
-      static_cast<uint32_t>(batch * out_height),
-  };
 
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
@@ -44,13 +40,6 @@ static void Pooling(const Tensor *input,
   }
   auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);
 
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
-  uint32_t lws[3];
-  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
-  lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
-
   uint32_t idx = 0;
   pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
   pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
@@ -62,12 +51,60 @@ static void Pooling(const Tensor *input,
   pooling_kernel.setArg(idx++, pooling_size);
   pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
 
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      pooling_kernel, cl::NullRange,
-      cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS) << error;
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blocks),
+      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(batch * out_height),
+  };
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
+  std::vector<uint32_t> lws(3, 0);
+  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
+  lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        pooling_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
+     << "_" << output->dim(2) << "_" << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(), lws,
+                                                     params_generator, func);
 }
 
 template <typename T>
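Pooling differs slightly from the other kernels: the gws/lws computation that used to precede the setArg calls now sits next to the tuner call, and the default lws is derived from kwg_size rather than hardcoded. Judging from the call sites in this diff (the Tuner API itself is not shown), the lws argument to TuneOrRun appears to serve as the fallback local size when tuning is not active, while the shape-keyed string ("pooling_opencl_kernel_<N>_<H>_<W>_<C>") lets tuned parameters be reused across runs with the same output shape.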
mace/kernels/opencl/relu_opencl.cc

@@ -50,8 +50,13 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
mace/kernels/opencl/resize_bilinear_opencl.cc

@@ -7,6 +7,7 @@
 #include "mace/kernels/resize_bilinear.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -44,8 +45,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
 
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
-
   uint32_t idx = 0;
   rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
   rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
@@ -55,17 +54,52 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
   rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));
 
-  auto command_queue = runtime->command_queue();
-  cl_int error = command_queue.enqueueNDRangeKernel(
-      rb_kernel, cl::NullRange,
-      cl::NDRange(static_cast<int32_t>(channel_blocks),
-                  static_cast<int32_t>(out_width),
-                  static_cast<int32_t>(out_height * batch)),
-      // TODO tuning
-      cl::NDRange(1,
-                  static_cast<int32_t>(out_width > kwg_size ? kwg_size : out_width),
-                  1),
-      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS, error);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(out_width),
+                           static_cast<uint32_t>(out_height * batch)};
+  const std::vector<uint32_t> lws = {8, 16, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8},  //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        rb_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
     << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(), lws,
+                                                     params_generator, func);
 }
 
 template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
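This kernel previously picked its local size with a one-off heuristic, cl::NDRange(1, min(out_width, kwg_size), 1), flagged with "// TODO tuning"; the commit resolves that TODO by switching it to the same candidate list and TuneOrRun call as the other kernels.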
mace/python/tools/BUILD

@@ -8,6 +8,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":memory_optimizer",
         "//mace/proto:mace_py",
     ],
 )
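This one-line dependency addition is what lets the converter library import memory_optimizer as a module; see the next file, where the script's command-line entry point is replaced by a callable function.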
mace/python/tools/memory_optimizer.py

@@ -65,7 +65,7 @@ class MemoryOptimizer(object):
         raise Exception('ref count is less than 0')
 
     for mem in self.mem_block:
-      arena = net_def.mem_arena
+      arena = self.net_def.mem_arena
       block = arena.mem_block.add()
      block.mem_id = mem
       block.x = self.mem_block[mem][0]
@@ -83,20 +83,7 @@ class MemoryOptimizer(object):
     print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size)
 
-if __name__ == '__main__':
-  model_file = sys.argv[1]
-  opt_model_file = sys.argv[2]
-  with open(model_file, "rb") as f:
-    net_def = mace_pb2.NetDef()
-    net_def.ParseFromString(f.read())
-    optimizer = MemoryOptimizer(net_def)
-    optimizer.optimize()
-  with open(opt_model_file, "wb") as f:
-    f.write(net_def.SerializeToString())
-  with open(opt_model_file + '_txt', "wb") as f:
-    net_def.ClearField('tensors')
-    f.write(str(net_def))
+def optimize_memory(net_def):
+  mem_optimizer = MemoryOptimizer(net_def)
+  mem_optimizer.optimize()
\ No newline at end of file
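The __main__ block, which ran the optimizer as a separate CLI step over serialized model files, is replaced by an in-process entry point: the converter can now call optimize_memory(net_def) on the NetDef it already holds. Consistent with this, validate_gcn.sh below drops its standalone bazel-bin/.../memory_optimizer invocation and the mace_opt_model.pb intermediate file.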
mace/python/tools/tf_converter_lib.py (+379, -268)

This diff is collapsed in the page view ("click to expand"); its contents were not captured.
mace/python/tools/tf_dsp_converter_lib.py

@@ -149,6 +149,7 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
   elif is_node_flatten_reshape(first_op):
     op_def.type = 'Flatten'
+    op_def.input.extend([t.name for t in first_op.inputs])
     op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
     convert_op_outputs(op_def, first_op)
   elif dsp_ops.has_op(first_op.type):
     op_def.input.extend([t.name for t in first_op.inputs])
tools/validate.py

@@ -4,6 +4,7 @@ import os
 import os.path
 import tensorflow as tf
 import numpy as np
+from scipy import spatial
 from tensorflow import gfile
@@ -34,9 +35,12 @@ def load_data(file):
 def valid_output(out_shape, mace_out_file, tf_out_value):
   mace_out_value = load_data(mace_out_file)
   if mace_out_value.size != 0:
-    mace_out_value = mace_out_value.reshape(out_shape)
-    np.testing.assert_allclose(mace_out_value, tf_out_value, rtol=0.05)
-    print '=======================Passed! Haha======================'
+    similarity = (1 - spatial.distance.cosine(tf_out_value.flat, mace_out_value))
+    print 'MACE VS TF similarity: ', similarity
+    if similarity > 0.999:
+      print '=======================Passed! Haha======================'
   else:
     print '=======================Skip empty node==================='
@@ -62,7 +66,7 @@ def run_model(input_shape):
     input_value = input_value.reshape(input_shape)
     output_value = session.run(output_node, feed_dict={input_node: [input_value]})
-    # output_value.astype(np.float32).tofile(os.path.dirname(FLAGS.input_file) + '/tf_weight')
+    output_value.astype(np.float32).tofile(os.path.dirname(FLAGS.input_file) + '/tf_out')
     return output_value
 
 def main(unused_args):
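Validation switches from an element-wise check (reshape plus np.testing.assert_allclose with rtol=0.05) to a single scalar score: cosine similarity, computed as 1 minus SciPy's cosine distance between the flattened TensorFlow and MACE outputs, passing above 0.999. One plausible motivation, not stated in the diff, is that a half-precision (DT_HALF) GPU run can trip a per-element tolerance on individual values while still being globally correct.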
tools/validate_gcn.sh

@@ -2,10 +2,10 @@
 # Must run at root dir of mace project.
 set +x
 Usage() {
-  echo 'Usage: bash tools/validate_gcn.sh tf_model_file'
+  echo 'Usage: bash tools/validate_gcn.sh tf_model_path image_size'
 }
 
-if [ $# != 1 ]; then
+if [ $# != 2 ]; then
   Usage
   exit -1
 fi
@@ -13,18 +13,18 @@ fi
 TF_MODEL_FILE_PATH=$1
 MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH})
 MACE_MODEL_NAME='mace_model.pb'
-MACE_OPT_MODEL_NAME='mace_opt_model.pb'
 INPUT_FILE_NAME='model_input'
 OUTPUT_FILE_NAME='gcn.out'
 OUTPUT_LIST_FILE='gcn.list'
 PHONE_DATA_DIR="/data/local/tmp/${MACE_MODEL_NAME}"
 KERNEL_DIR="${PHONE_DATA_DIR}/cl/"
+IMAGE_SIZE=$2
 
 # Step 1: Generate input data
 echo "Step 1: Generate input data"
 python tools/validate.py --generate_data true --random_seed 1 \
   --input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \
-  --input_shape=512,512,3
+  --input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3"
 
 # Step 2: convert tf model to mace model
 echo "Step 2: convert tf model to mace model and optimize memory"
@@ -35,10 +35,6 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
   --output_node=GCN/br_result_2/fcn_br \
   --data_type=DT_HALF \
   --runtime=gpu
-
-bazel build mace/python/tools:memory_optimizer
-bazel-bin/mace/python/tools/memory_optimizer ${MODEL_DIR}/${MACE_MODEL_NAME} \
-  ${MODEL_DIR}/${MACE_OPT_MODEL_NAME}
 
 # Step 3: Run model on the phone
 echo "Step 3: Run model on the phone"
@@ -49,21 +45,22 @@ bazel build -c opt --strip always mace/examples:mace_run \
 adb shell "mkdir -p ${PHONE_DATA_DIR}"
 adb shell "mkdir -p ${KERNEL_DIR}"
-adb push mace/kernels/opencl/cl/ ${KERNEL_DIR}
-adb push ${MODEL_DIR}/${MACE_OPT_MODEL_NAME} ${PHONE_DATA_DIR}
+adb push mace/kernels/opencl/cl/* ${KERNEL_DIR}
+adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR}
 adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
 adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}
 
+num_threads=${1:-4}
+
-adb </dev/null shell MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
+adb </dev/null shell MACE_CPP_MIN_VLOG_LEVEL=0 \
+  MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
   MACE_KERNEL_PATH=$KERNEL_DIR \
+  OMP_NUM_THREADS=$num_threads \
   ${PHONE_DATA_DIR}/mace_run \
-  --model=${PHONE_DATA_DIR}/${MACE_OPT_MODEL_NAME} \
+  --model=${PHONE_DATA_DIR}/${MACE_MODEL_NAME} \
   --input=mace_input_node \
   --output=mace_output_node \
-  --input_shape=1,512,512,3 \
+  --input_shape="1,${IMAGE_SIZE},${IMAGE_SIZE},3" \
   --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \
   --output_file=${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME} \
   --device=OPENCL \
@@ -81,4 +78,5 @@ python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
   --mace_out_file ${MODEL_DIR}/${OUTPUT_FILE_NAME} \
   --input_node input \
   --output_node GCN/br_result_2/fcn_br \
-  --output_shape 1,512,512,2
+  --input_shape "${IMAGE_SIZE},${IMAGE_SIZE},3" \
+  --output_shape "1,${IMAGE_SIZE},${IMAGE_SIZE},2"