diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 31cd19104f43082e10fa4fdef77e6d02ceeb67cd..83e6b65b3d882bcc857f08fb0cffd87a9b84a65f 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -6,6 +6,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/utils.h" +#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -33,8 +34,6 @@ static void AddN(const std::vector &input_tensors, built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size())); auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options); - const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel); - uint32_t idx = 0; for (auto input : input_tensors) { addn_kernel.setArg(idx++, @@ -42,12 +41,47 @@ static void AddN(const std::vector &input_tensors, } addn_kernel.setArg(idx++, *(static_cast(output->buffer()))); - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - addn_kernel, cl::NullRange, - cl::NDRange(width_pixels, batch_height_pixels), - cl::NDRange(64, 16), // TODO fix this - nullptr, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS) << "error code: " << error; + const uint32_t gws[2] = { + static_cast(width_pixels), + static_cast(batch_height_pixels) + }; + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel); + std::vector lws = {64, 16}; + auto params_generator = [&]() -> std::vector> { + uint32_t local_ws[2]; + local_ws[0] = std::min(width_pixels, kwg_size); + local_ws[1] = std::min(batch_height_pixels, kwg_size / local_ws[0]); + return {{local_ws[0], local_ws[1]}, + {kwg_size / 16, 16}, + {kwg_size / 32, 32}, + {kwg_size / 64, 64}, + {kwg_size / 128, 128}, + {kwg_size / 256, 256}, + {kwg_size, 1}, + {1, kwg_size} + }; + }; + auto func = [&](const std::vector ¶ms) -> cl_int { + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + addn_kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1]), + cl::NDRange(params[0], params[1]), + NULL, OpenCLRuntime::Get()->GetDefaultEvent()); + + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + return error; + }; + std::stringstream ss; + ss << "addn_opencl_kernel_" + << output->dim(0) << "_" + << output->dim(1) << "_" + << output->dim(2) << "_" + << output->dim(3); + Tuner::Get()->template TuneOrRun(ss.str(), + lws, + params_generator, + func); + } template diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index a5362262901f8c38290e22af31e2760310c3717a..de6571ea03fba7498af4adb6cb203e5789177069 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -48,8 +48,13 @@ void BatchNormFunctor::operator()( static_cast(height * batch)}; const std::vector lws = {8, 16, 8}; const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); - auto params_generator = [&kwg_size]() -> std::vector> { + auto params_generator = [&]() -> std::vector> { + std::vector local_ws(3, 0); + local_ws[0] = std::min(channel_blocks, kwg_size); + local_ws[1] = std::min(width, kwg_size / local_ws[0]); + local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); return {{8, 128, 1}, //SNPE size + {local_ws[0], local_ws[1], local_ws[2]}, {kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8}, {kwg_size / 32, 8, 4}, diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index f80f370d3367215391f47a2e7416bb8aea3bc429..706ed8f1c8e8c7257fb8284439887ec62e39759d 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -6,6 +6,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/utils.h" +#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -41,21 +42,57 @@ static void Concat2(const Tensor *input0, concat_kernel.setArg(idx++, static_cast(input0->dim(3))); concat_kernel.setArg(idx++, *(static_cast(output->buffer()))); + const uint32_t gws[3] = { + static_cast(channel_blk), + static_cast(width), + static_cast(batch * height), + }; const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel); + std::vector lws = {8, 16, 8}; + auto params_generator = [&]() -> std::vector> { + std::vector local_ws(3, 0); + local_ws[0] = std::min(channel_blk, kwg_size); + local_ws[1] = std::min(width, kwg_size / local_ws[0]); + local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); + return {{4, 15, 8}, //SNPE size + {local_ws[0], local_ws[1], local_ws[2]}, + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, + {kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, + {1, kwg_size, 1}}; + }; + auto func = [&](const std::vector ¶ms) -> cl_int { + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + concat_kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), + NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - uint32_t lws[3] = {8, 16, 8}; -// lws[0] = std::min(channel_blk, kwg_size); -// lws[1] = std::min(width, kwg_size / lws[0]); -// lws[2] = std::min(height * batch, kwg_size / (lws[0] * lws[1])); - - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - concat_kernel, cl::NullRange, - cl::NDRange(static_cast(channel_blk), - static_cast(width), - static_cast(height * batch)), - cl::NDRange(lws[0], lws[1], lws[2]), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + return error; + }; + std::stringstream ss; + ss << "concat_opencl_kernel_" + << output->dim(0) << "_" + << output->dim(1) << "_" + << output->dim(2) << "_" + << output->dim(3); + Tuner::Get()->template TuneOrRun(ss.str(), + lws, + params_generator, + func); } template diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 1fe00494b41fde6c247cf6e0af91e91a170c652a..9eaaa3b1e053cee2b9bc1a72746f357a6adeae67 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -68,8 +68,13 @@ void Conv1x1(const Tensor *input, static_cast(height * batch)}; const std::vector lws = {8, 15, 8}; const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - auto params_generator = [&kwg_size]()->std::vector> { + auto params_generator = [&]()->std::vector> { + std::vector local_ws(3, 0); + local_ws[0] = std::min(channel_blocks, kwg_size); + local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); + local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); return {{4, 15, 8}, //SNPE size + {local_ws[0], local_ws[1], local_ws[2]}, {kwg_size/16, 4, 4}, {kwg_size/32, 4, 8}, {kwg_size/32, 8, 4}, diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 858fc5fc396515febbb43951777a02840abfa267..0b77b6c26dfc2c93c4ccdaf9232a97e06e80a046 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -60,8 +60,13 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, static_cast(height * batch)}; const std::vector lws = {4, 15, 8}; const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - auto params_generator = [&kwg_size]() -> std::vector> { + auto params_generator = [&]() -> std::vector> { + std::vector local_ws(3, 0); + local_ws[0] = std::min(channel_blocks, kwg_size); + local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); + local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); return {{4, 15, 8}, //SNPE size + {local_ws[0], local_ws[1], local_ws[2]}, {kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8}, {kwg_size / 32, 8, 4}, diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 7a74f86b84ba5d2ecd019dddcba1923d4f9b9c75..dcfbdec818c5ae00a09eb5b56dce46d8cbde4cba 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -62,8 +62,13 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter, static_cast(height * batch)}; const std::vector lws = {8, 16, 8}; const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - auto params_generator = [&kwg_size]() -> std::vector> { + auto params_generator = [&]() -> std::vector> { + std::vector local_ws(3, 0); + local_ws[0] = std::min(channel_blocks, kwg_size); + local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); + local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); return {{4, 15, 8}, //SNPE size + {local_ws[0], local_ws[1], local_ws[2]}, {kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8}, {kwg_size / 32, 8, 4}, diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 349c619574e425aea00b4521194c3ae04649942f..5a0fbadf3db02894394ab5e5c83cd2eb6a864104 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -6,6 +6,7 @@ #include "mace/core/runtime/opencl/cl2_header.h" #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" +#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -23,11 +24,6 @@ static void Pooling(const Tensor *input, index_t channels = output->dim(3); index_t channel_blocks = (channels + 3) / 4; - const uint32_t gws[3] = { - static_cast(channel_blocks), - static_cast(out_width), - static_cast(batch * out_height), - }; auto runtime = OpenCLRuntime::Get(); std::set built_options; @@ -44,13 +40,6 @@ static void Pooling(const Tensor *input, } auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel); - - uint32_t lws[3]; - lws[0] = std::min(channel_blocks, kwg_size); - lws[1] = std::min(out_width, kwg_size / lws[0]); - lws[2] = std::min(out_height * batch, kwg_size / (lws[0] * lws[1])); - uint32_t idx = 0; pooling_kernel.setArg(idx++, *(static_cast(input->buffer()))); pooling_kernel.setArg(idx++, static_cast(input->dim(1))); @@ -62,12 +51,60 @@ static void Pooling(const Tensor *input, pooling_kernel.setArg(idx++, pooling_size); pooling_kernel.setArg(idx++, *(static_cast(output->buffer()))); - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - pooling_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS) << error; + const uint32_t gws[3] = { + static_cast(channel_blocks), + static_cast(out_width), + static_cast(batch * out_height), + }; + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel); + std::vector lws(3, 0); + lws[0] = std::min(channel_blocks, kwg_size); + lws[1] = std::min(out_width, kwg_size / lws[0]); + lws[2] = std::min(out_height * batch, kwg_size / (lws[0] * lws[1])); + auto params_generator = [&]() -> std::vector> { + std::vector local_ws(3, 0); + local_ws[0] = std::min(channel_blocks, kwg_size); + local_ws[1] = std::min(out_width, kwg_size / local_ws[0]); + local_ws[2] = std::min(out_height * batch, kwg_size / (local_ws[0] * local_ws[1])); + return {{4, 15, 8}, //SNPE size + {local_ws[0], local_ws[1], local_ws[2]}, + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, + {kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, + {1, kwg_size, 1}}; + }; + auto func = [&](const std::vector ¶ms) -> cl_int { + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + pooling_kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), + NULL, OpenCLRuntime::Get()->GetDefaultEvent()); + + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + return error; + }; + std::stringstream ss; + ss << "pooling_opencl_kernel_" + << output->dim(0) << "_" + << output->dim(1) << "_" + << output->dim(2) << "_" + << output->dim(3); + Tuner::Get()->template TuneOrRun(ss.str(), + lws, + params_generator, + func); } template diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc index 28ff881b1315a98e66b74a057351d1136b555af7..483ec8d492df6a755ae8d665245912873960de2d 100644 --- a/mace/kernels/opencl/relu_opencl.cc +++ b/mace/kernels/opencl/relu_opencl.cc @@ -50,8 +50,13 @@ void ReluFunctor::operator()(const Tensor *input, static_cast(height * batch)}; const std::vector lws = {8, 16, 8}; const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel); - auto params_generator = [&kwg_size]() -> std::vector> { + auto params_generator = [&]() -> std::vector> { + std::vector local_ws(3, 0); + local_ws[0] = std::min(channel_blocks, kwg_size); + local_ws[1] = std::min(width, kwg_size / local_ws[0]); + local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); return {{4, 15, 8}, //SNPE size + {local_ws[0], local_ws[1], local_ws[2]}, {kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8}, {kwg_size / 32, 8, 4}, diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index 1ebc21f8a000668659ba49892b793df377e7141a..a3686e479f29cb76e55f08652fe385ca940e5d10 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -7,6 +7,7 @@ #include "mace/kernels/resize_bilinear.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/utils.h" +#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -44,8 +45,6 @@ void ResizeBilinearFunctor::operator()( built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel); - uint32_t idx = 0; rb_kernel.setArg(idx++, *(static_cast(input->buffer()))); rb_kernel.setArg(idx++, *(static_cast(output->buffer()))); @@ -55,17 +54,52 @@ void ResizeBilinearFunctor::operator()( rb_kernel.setArg(idx++, static_cast(in_width)); rb_kernel.setArg(idx++, static_cast(out_height)); - auto command_queue = runtime->command_queue(); + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(out_width), + static_cast(out_height * batch)}; + const std::vector lws = {8, 16, 8}; + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel); + auto params_generator = [&]() -> std::vector> { + std::vector local_ws(3, 0); + local_ws[0] = std::min(channel_blocks, kwg_size); + local_ws[1] = std::min(out_width, kwg_size / local_ws[0]); + local_ws[2] = std::min(out_height * batch, kwg_size / (local_ws[0] * local_ws[1])); + return {{4, 15, 8}, //SNPE size + {local_ws[0], local_ws[1], local_ws[2]}, + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, + {kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {1, kwg_size, 1}}; + }; + auto func = [&](const std::vector ¶ms) -> cl_int { + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + rb_kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), + NULL, OpenCLRuntime::Get()->GetDefaultEvent()); + + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + return error; + }; + std::stringstream ss; + ss << "resize_bilinear_opencl_kernel_" + << output->dim(0) << "_" + << output->dim(1) << "_" + << output->dim(2) << "_" + << output->dim(3); + Tuner::Get()->template TuneOrRun(ss.str(), + lws, + params_generator, + func); - cl_int error = command_queue.enqueueNDRangeKernel( - rb_kernel, cl::NullRange, - cl::NDRange(static_cast(channel_blocks), - static_cast(out_width), - static_cast(out_height * batch)), - // TODO tuning - cl::NDRange(1, static_cast(out_width > kwg_size ? kwg_size : out_width), 1), - nullptr, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS, error); } template struct ResizeBilinearFunctor; diff --git a/mace/python/tools/BUILD b/mace/python/tools/BUILD index b1bb214cb7153324924e05ddc81868c94f09b73a..675f12acb73ee99e810c9add14087ebc63408812 100644 --- a/mace/python/tools/BUILD +++ b/mace/python/tools/BUILD @@ -8,6 +8,7 @@ py_library( ], srcs_version = "PY2AND3", deps = [ + ":memory_optimizer", "//mace/proto:mace_py", ], ) diff --git a/mace/python/tools/memory_optimizer.py b/mace/python/tools/memory_optimizer.py index f64df5bada8f5acf1730a1f5e15227605cadad24..8841ba577c704687efb9761a5a1e65eab7f4cbda 100644 --- a/mace/python/tools/memory_optimizer.py +++ b/mace/python/tools/memory_optimizer.py @@ -65,7 +65,7 @@ class MemoryOptimizer(object): raise Exception('ref count is less than 0') for mem in self.mem_block: - arena = net_def.mem_arena + arena = self.net_def.mem_arena block = arena.mem_block.add() block.mem_id = mem block.x = self.mem_block[mem][0] @@ -83,20 +83,7 @@ class MemoryOptimizer(object): print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size) -if __name__ == '__main__': - model_file = sys.argv[1] - opt_model_file = sys.argv[2] - with open(model_file, "rb") as f: - net_def = mace_pb2.NetDef() - net_def.ParseFromString(f.read()) - optimizer = MemoryOptimizer(net_def) - optimizer.optimize() - - with open(opt_model_file, "wb") as f: - f.write(net_def.SerializeToString()) - with open(opt_model_file + '_txt', "wb") as f: - net_def.ClearField('tensors') - f.write(str(net_def)) - - +def optimize_memory(net_def): + mem_optimizer = MemoryOptimizer(net_def) + mem_optimizer.optimize() \ No newline at end of file diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py index f603d3b5c96e76ef52bee06911b467d9b0112a29..99094b886d9a89dacf881c7fbcfc2eb8c6563e8a 100644 --- a/mace/python/tools/tf_converter_lib.py +++ b/mace/python/tools/tf_converter_lib.py @@ -1,7 +1,7 @@ from mace.proto import mace_pb2 import tensorflow as tf import numpy as np -from mace.python.tools.convert_util import tf_dtype_2_mace_dtype +from mace.python.tools import memory_optimizer # TODO: support NCHW formt, now only support NHWC. padding_mode = { @@ -25,22 +25,10 @@ data_type_map = { 'DT_FLOAT': mace_pb2.DT_FLOAT } -def convert_tensor(op, tensor): - tf_tensor = op.outputs[0].eval() - tensor.name = op.outputs[0].name +BATCH_NORM_ORDER = ["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"] - shape = list(tf_tensor.shape) - tensor.dims.extend(shape) - - tf_dt = op.get_attr('dtype') - if tf_dt == tf.float32: - tensor.data_type = mace_pb2.DT_FLOAT - tensor.float_data.extend(tf_tensor.astype(float).flat) - elif tf_dt == tf.int32: - tensor.data_type = mace_pb2.DT_INT32 - tensor.int32_data.extend(tf_tensor.astype(np.int32).flat) - else: - raise Exception("Not supported tensor type: " + tf_dt.name) +MACE_INPUT_NODE_NAME = "mace_input_node" +MACE_OUTPUT_NODE_NAME = "mace_output_node" def get_input_tensor(op, index): input_tensor = op.inputs[index] @@ -48,260 +36,386 @@ def get_input_tensor(op, index): input_tensor = get_input_tensor(input_tensor.op, 0) return input_tensor -def add_buffer_to_image(input_name, input_type, dt, net_def): - output_name = input_name[:-2] + "_b2i" + input_name[-2:] - op_def = net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'BufferToImage' - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = 'buffer_type' - arg.i = buffer_type_map[input_type] - arg = op_def.arg.add() - arg.name = 'mode' - arg.i = 0 - arg = op_def.arg.add() - arg.name = 'T' - arg.i = dt - return output_name - -def add_image_to_buffer(input_name, input_type, dt, net_def): - output_name = input_name[:-2] + "_i2b" + input_name[-2:] - op_def = net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'ImageToBuffer' - op_def.input.extend([input_name]) - op_def.output.extend([output_name]) - - arg = op_def.arg.add() - arg.name = 'buffer_type' - arg.i = buffer_type_map[input_type] - arg = op_def.arg.add() - arg.name = 'T' - arg.i = dt - return output_name - -def add_input_transform(name, dt, net_def): - new_input_name = "mace_input_node:0" - op_def = net_def.op.add() - op_def.name = name - op_def.type = 'BufferToImage' - op_def.input.extend([new_input_name]) - op_def.output.extend([name+':0']) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'buffer_type' - epsilon_arg.i = buffer_type_map['IN_OUT'] - - arg = op_def.arg.add() - arg.name = 'T' - arg.i = dt - -def add_output_transform(name, net_def): - output_name = "mace_output_node:0" - op_def = net_def.op.add() - op_def.name = output_name[:-2] - op_def.type = 'ImageToBuffer' - op_def.input.extend([name+':0']) - op_def.output.extend([output_name]) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'buffer_type' - epsilon_arg.i = buffer_type_map['IN_OUT'] - - -def convert_op_outputs(mace_op_def, tf_op): - mace_op_def.output.extend([output.name for output in tf_op.outputs]) - mace_op_def.output_type.extend([tf_dtype_2_mace_dtype(output.dtype) - for output in tf_op.outputs]) - output_shapes = [] - for output in tf_op.outputs: - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(output.shape.as_list()) - output_shapes.append(output_shape) - mace_op_def.output_shape.extend(output_shapes) - - -def convert_ops(unresolved_ops, dt, net_def, device): - ops_count = len(unresolved_ops) - resolved_count = 1 - - first_op = unresolved_ops[0] - - if first_op.type in ['Placeholder', 'Reshape', 'Identity']: - pass - elif first_op.type == 'Const': - tensor = net_def.tensors.add() - convert_tensor(first_op, tensor) - else: - op_def = net_def.op.add() +class TFConverter(object): + def __init__(self, tf_ops, net_def, dt, device): + self.net_def = net_def + self.tf_ops = tf_ops + self.dt = dt + self.device = device + self.tf_graph = {} + self.resolved_ops = {} + + for op in tf_ops: + self.resolved_ops[op.name] = 0 + for input in op.inputs: + input_name = input.name[:-2] + if input_name not in self.tf_graph: + self.tf_graph[input_name] = [] + self.tf_graph[input_name].append(op) + + def add_buffer_to_image(self, input_name, input_type): + output_name = input_name[:-2] + "_b2i" + input_name[-2:] + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'BufferToImage' + op_def.input.extend([input_name]) + op_def.output.extend([output_name]) + + arg = op_def.arg.add() + arg.name = 'buffer_type' + arg.i = buffer_type_map[input_type] + arg = op_def.arg.add() + arg.name = 'mode' + arg.i = 0 arg = op_def.arg.add() arg.name = 'T' - arg.i = dt + arg.i = self.dt + return output_name - if first_op.type == 'Conv2D' or first_op.type == 'DepthwiseConv2dNative': - op_def.name = first_op.name - if first_op.type == 'DepthwiseConv2dNative': - op_def.type = 'DepthwiseConv2d' - else: - op_def.type = first_op.type - if device == 'gpu': - op_def.input.extend([first_op.inputs[0].name]) - output_name = add_buffer_to_image(first_op.inputs[1].name, "FILTER", dt, net_def) + def add_input_transform(self, name): + new_input_name = MACE_INPUT_NODE_NAME + ":0" + op_def = self.net_def.op.add() + op_def.name = name + op_def.type = 'BufferToImage' + op_def.input.extend([new_input_name]) + op_def.output.extend([name+':0']) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT'] + + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + def add_output_transform(self, name): + output_name = MACE_OUTPUT_NODE_NAME + ":0" + op_def = self.net_def.op.add() + op_def.name = output_name[:-2] + op_def.type = 'ImageToBuffer' + op_def.input.extend([name+':0']) + op_def.output.extend([output_name]) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'buffer_type' + epsilon_arg.i = buffer_type_map['IN_OUT'] + + @staticmethod + def add_output_shape(outputs, op): + output_shapes = [] + for output in outputs: + if output.shape.num_elements() is not None: + output_shape = mace_pb2.OutputShape() + output_shape.dims.extend(output.shape.as_list()) + output_shapes.append(output_shape) + op.output_shape.extend(output_shapes) + + def convert_tensor(self, op): + tensor = self.net_def.tensors.add() + tf_tensor = op.outputs[0].eval() + tensor.name = op.outputs[0].name + + shape = list(tf_tensor.shape) + tensor.dims.extend(shape) + + tf_dt = op.get_attr('dtype') + if tf_dt == tf.float32: + tensor.data_type = mace_pb2.DT_FLOAT + tensor.float_data.extend(tf_tensor.astype(np.float32).flat) + elif tf_dt == tf.int32: + tensor.data_type = mace_pb2.DT_INT32 + tensor.int32_data.extend(tf_tensor.astype(np.int32).flat) + else: + raise Exception("Not supported tensor type: " + tf_dt.name) + self.resolved_ops[op.name] = 1 + + def convert_conv2d(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + if op.type == 'DepthwiseConv2dNative': + op_def.type = 'DepthwiseConv2d' + else: + op_def.type = op.type + if self.device == 'gpu': + op_def.input.extend([op.inputs[0].name]) + output_name = self.add_buffer_to_image(op.inputs[1].name, "FILTER") + op_def.input.extend([output_name]) + else: + op_def.input.extend([input.name for input in op.inputs]) + + padding_arg = op_def.arg.add() + padding_arg.name = 'padding' + padding_arg.i = padding_mode[op.get_attr('padding')] + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend(op.get_attr('strides')[1:3]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + final_op = op + self.resolved_ops[op.name] = 1 + + if len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd' : + bias_add_op = self.tf_graph[op.name][0] + if self.device == 'gpu': + output_name = self.add_buffer_to_image(bias_add_op.inputs[1].name, "ARGUMENT") op_def.input.extend([output_name]) else: - op_def.input.extend([input.name for input in first_op.inputs]) - - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode[first_op.get_attr('padding')] - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend(first_op.get_attr('strides')[1:3]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' - final_op = first_op - - if ops_count >= 3 and unresolved_ops[1].type == 'Const' and unresolved_ops[2].type == 'BiasAdd' : - bias_tensor = unresolved_ops[1] - tensor = net_def.tensors.add() - convert_tensor(bias_tensor, tensor) - - bias_add_op = unresolved_ops[2] - if device == 'gpu': - output_name = add_buffer_to_image(bias_add_op.inputs[1].name, "ARGUMENT", dt, net_def) - op_def.input.extend([output_name]) - else: - op_def.input.extend([bias_add_op.inputs[1].name]) - final_op = bias_add_op - resolved_count = 3 - - if ops_count >= 4 and unresolved_ops[3].type == 'Relu': - relu_op = unresolved_ops[3]; - op_def.type = "FusedConv2D" - final_op = relu_op - resolved_count = 4 - - convert_op_outputs(op_def, final_op) - - elif first_op.type == 'FusedBatchNorm': - op_def.name = first_op.name - op_def.type = 'BatchNorm' - if device == 'gpu': - op_def.input.extend([first_op.inputs[0].name]) - for i in range(1, len(first_op.inputs)): - output_name = add_buffer_to_image(first_op.inputs[i].name, "ARGUMENT", dt, net_def) - op_def.input.extend([output_name]) + op_def.input.extend([bias_add_op.inputs[1].name]) + final_op = bias_add_op + self.resolved_ops[bias_add_op.name] = 1 + + if len(self.tf_graph[final_op.name]) == 1 \ + and self.tf_graph[final_op.name][0].type == 'Relu': + relu_op = self.tf_graph[final_op.name][0] + op_def.type = "FusedConv2D" + final_op = relu_op + self.resolved_ops[relu_op.name] = 1 + + op_def.output.extend([output.name for output in final_op.outputs]) + self.add_output_shape(final_op.outputs, op_def) + self.net_def.op.extend([op_def]) + + def convert_fused_batchnorm(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = 'BatchNorm' + if self.device == 'gpu': + op_def.input.extend([op.inputs[0].name]) + for i in range(1, len(op.inputs)): + output_name = self.add_buffer_to_image(op.inputs[i].name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([op.outputs[0].name]) + + self.add_output_shape(op.outputs, op_def) + + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'epsilon' + epsilon_arg.f = op.get_attr('epsilon') + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + self.resolved_ops[op.name] = 1 + self.net_def.op.extend([op_def]) + + def convert_batchnorm(self, op): + bn_ops = [] + bn_ops.append(op) + for i in range(1, 3): + if len(self.tf_graph[bn_ops[i-1].name]) == 1 \ + and self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]: + bn_ops.append(self.tf_graph[bn_ops[i-1].name][0]) else: - op_def.input.extend([input.name for input in first_op.inputs]) - op_def.output.extend([first_op.outputs[0].name]) - - output_shape = mace_pb2.OutputShape() - output_shape.dims.extend(first_op.outputs[0].shape.as_list()) - op_def.output_shape.extend([output_shape]) - - epsilon_arg = op_def.arg.add() - epsilon_arg.name = 'epsilon' - epsilon_arg.f = first_op.get_attr('epsilon') - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' - elif first_op.type == 'Add' and first_op.name.endswith( - 'batchnorm/add') and ops_count > 7: - add_op = first_op - mul_op = unresolved_ops[2] - mul_1_op = unresolved_ops[3] - mul_2_op = unresolved_ops[4] - sub_op = unresolved_ops[5] - add_1_op = unresolved_ops[6] - # print (mul_op.type, mul_2_op.type, mul_1_op.type, sub_op.type) - if mul_op.type != 'Mul' or mul_2_op.type != 'Mul' or \ - mul_1_op.type != 'Mul' or sub_op.type != 'Sub' or add_1_op.type != 'Add': raise Exception('Invalid BatchNorm Op') + if len(self.tf_graph[bn_ops[2].name]) == 2 \ + and self.tf_graph[bn_ops[2].name][0].type == BATCH_NORM_ORDER[3] \ + and self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]: + bn_ops.append(self.tf_graph[bn_ops[2].name][0]) + bn_ops.append(self.tf_graph[bn_ops[2].name][1]) + else: + raise Exception('Invalid BatchNorm Op') + bn_ops.append(self.tf_graph[bn_ops[4].name][0]) + bn_ops.append(self.tf_graph[bn_ops[3].name][0]) - get_input_tensor(mul_1_op, 0) - input_name = get_input_tensor(mul_1_op, 0).name - gamma = get_input_tensor(mul_op, 1).name - beta = get_input_tensor(sub_op, 0).name - mean = get_input_tensor(mul_2_op, 0).name - variance = get_input_tensor(add_op, 0).name - epsilon = get_input_tensor(add_op, 1).name - - op_def.name = first_op.name[:-4] # remove /add - op_def.type = 'BatchNorm' - op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon]) - convert_op_outputs(op_def, add_1_op) - - resolved_count = 7 - elif first_op.type == 'Relu6': - op_def.name = first_op.name - op_def.type = 'Relu' - op_def.input.extend([input.name for input in first_op.inputs]) - convert_op_outputs(op_def, first_op) - - max_limit_arg = op_def.arg.add() - max_limit_arg.name = 'max_limit' - max_limit_arg.f = 6 - elif first_op.type == 'AvgPool' or first_op.type == 'MaxPool': - op_def.name = first_op.name - op_def.type = 'Pooling' - op_def.input.extend([input.name for input in first_op.inputs]) - convert_op_outputs(op_def, first_op) - - pooling_type_arg = op_def.arg.add() - pooling_type_arg.name = 'pooling_type' - pooling_type_arg.i = pooling_type_mode[first_op.type] - padding_arg = op_def.arg.add() - padding_arg.name = 'padding' - padding_arg.i = padding_mode[first_op.get_attr('padding')] - strides_arg = op_def.arg.add() - strides_arg.name = 'strides' - strides_arg.ints.extend(first_op.get_attr('strides')[1:3]) - kernels_arg = op_def.arg.add() - kernels_arg.name = 'kernels' - kernels_arg.ints.extend(first_op.get_attr('ksize')[1:3]) - data_format_arg = op_def.arg.add() - data_format_arg.name = 'data_format' - data_format_arg.s = 'NHWC' - elif first_op.type == 'Add': - op_def.name = first_op.name - op_def.type = "AddN" - op_def.input.extend([input.name for input in first_op.inputs]) - convert_op_outputs(op_def, first_op) - elif first_op.type == 'ConcatV2': - op_def.name = first_op.name - op_def.type = "Concat" - op_def.input.extend([first_op.inputs[i].name for i in xrange(2)]) - axis_arg = op_def.arg.add() - axis_arg.name = 'axis' - axis_arg.i = get_input_tensor(first_op, 2).eval().astype(np.int32) - convert_op_outputs(op_def, first_op) - elif first_op.type == 'ResizeBilinear': - op_def.name = first_op.name - op_def.type = "ResizeBilinear" - op_def.input.extend([first_op.inputs[0].name]) - size_arg = op_def.arg.add() - size_arg.name = 'size' - size_arg.ints.extend(get_input_tensor(first_op, 1).eval().astype(np.int32).flat) - size_arg = op_def.arg.add() - size_arg.name = 'align_corners' - size_arg.i = first_op.get_attr('align_corners') - convert_op_outputs(op_def, first_op) - elif first_op.type in ['Relu', 'SpaceToBatchND', 'BatchToSpaceND', 'BiasAdd']: - op_def.name = first_op.name - op_def.type = first_op.type - op_def.input.extend([input.name for input in first_op.inputs]) - convert_op_outputs(op_def, first_op) + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + + input_name = get_input_tensor(bn_ops[3], 0).name + gamma = get_input_tensor(bn_ops[2], 1).name + beta = get_input_tensor(bn_ops[5], 0).name + mean = get_input_tensor(bn_ops[4], 0).name + variance = get_input_tensor(bn_ops[0], 0).name + + op_def.name = op.name[:-4] # remove /add + op_def.type = 'BatchNorm' + if self.device == 'gpu': + op_def.input.extend([input_name]) + for tensor_name in [gamma, beta, mean, variance]: + output_name = self.add_buffer_to_image(tensor_name, "ARGUMENT") + op_def.input.extend([output_name]) else: - raise Exception('Unknown Op: %s, type: %s' % (first_op.name, first_op.type)) - pass + op_def.input.extend([input_name, gamma, beta, mean, variance]) + op_def.output.extend([output.name for output in bn_ops[6].outputs]) + self.add_output_shape(bn_ops[6].outputs, op_def) + epsilon_arg = op_def.arg.add() + epsilon_arg.name = 'epsilon' + epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + + self.net_def.op.extend([op_def]) + for i in range(0, 7): + self.resolved_ops[bn_ops[i].name] = 1 + + def convert_pooling(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = 'Pooling' + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + pooling_type_arg = op_def.arg.add() + pooling_type_arg.name = 'pooling_type' + pooling_type_arg.i = pooling_type_mode[op.type] + padding_arg = op_def.arg.add() + padding_arg.name = 'padding' + padding_arg.i = padding_mode[op.get_attr('padding')] + strides_arg = op_def.arg.add() + strides_arg.name = 'strides' + strides_arg.ints.extend(op.get_attr('strides')[1:3]) + kernels_arg = op_def.arg.add() + kernels_arg.name = 'kernels' + kernels_arg.ints.extend(op.get_attr('ksize')[1:3]) + data_format_arg = op_def.arg.add() + data_format_arg.name = 'data_format' + data_format_arg.s = 'NHWC' + self.resolved_ops[op.name] = 1 + + def convert_relu6(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = 'Relu' + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + max_limit_arg = op_def.arg.add() + max_limit_arg.name = 'max_limit' + max_limit_arg.f = 6 + self.resolved_ops[op.name] = 1 + + def convert_add(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "AddN" + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert_concat(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "Concat" + op_def.input.extend([op.inputs[i].name for i in xrange(2)]) + op_def.output.extend([output.name for output in op.outputs]) + axis_arg = op_def.arg.add() + axis_arg.name = 'axis' + axis_arg.i = get_input_tensor(op, 2).eval().astype(np.int32) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert_resize_bilinear(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "ResizeBilinear" + op_def.input.extend([op.inputs[0].name]) + op_def.output.extend([output.name for output in op.outputs]) + size_arg = op_def.arg.add() + size_arg.name = 'size' + size_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat) + size_arg = op_def.arg.add() + size_arg.name = 'align_corners' + size_arg.i = op.get_attr('align_corners') + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert_bias_add(self, op): + op_def = mace_pb2.OperatorDef() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = "BiasAdd" + op_def.input.extend([op.inputs[0].name]) + if self.device == 'gpu': + output_name = self.add_buffer_to_image(op.inputs[1].name, "ARGUMENT") + op_def.input.extend([output_name]) + else: + op_def.input.extend([op.inputs[1].name]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.net_def.op.extend([op_def]) + self.resolved_ops[op.name] = 1 + + def convert_normal_op(self, op): + op_def = self.net_def.op.add() + arg = op_def.arg.add() + arg.name = 'T' + arg.i = self.dt + op_def.name = op.name + op_def.type = op.type + op_def.input.extend([input.name for input in op.inputs]) + op_def.output.extend([output.name for output in op.outputs]) + self.add_output_shape(op.outputs, op_def) + self.resolved_ops[op.name] = 1 + + def convert(self, input_node, output_node): + if self.device == 'gpu': + self.add_input_transform(input_node) + + for op in self.tf_ops: + if self.resolved_ops[op.name] == 1: + continue + if op.type in ['Placeholder', 'Reshape', 'Identity']: + self.resolved_ops[op.name] = 1 + pass + elif op.type == 'Const': + self.convert_tensor(op) + elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative': + self.convert_conv2d(op) + elif op.type == 'FusedBatchNorm': + self.convert_fused_batchnorm(op) + elif op.type == 'Add' and op.name.endswith('batchnorm/add'): + self.convert_batchnorm(op) + elif op.type == 'AvgPool' or op.type == 'MaxPool': + self.convert_pooling(op) + elif op.type == 'Relu6': + self.convert_relu6(op) + elif op.type == 'Add': + self.convert_add(op) + elif op.type == 'ConcatV2': + self.convert_concat(op) + elif op.type == 'ResizeBilinear': + self.convert_resize_bilinear(op) + elif op.type == 'BiasAdd': + self.convert_bias_add(op) + elif op.type in ['Relu', 'SpaceToBatchND', 'BatchToSpaceND']: + self.convert_normal_op(op) + else: + raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type)) - for i in range(resolved_count): - del unresolved_ops[0] + if self.device == 'gpu': + self.add_output_transform(output_node) + for key in self.resolved_ops: + if self.resolved_ops[key] != 1: + print 'Unresolve Op: %s' % key def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, device): net_def = mace_pb2.NetDef() @@ -311,14 +425,11 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, devi with session.graph.as_default() as graph: tf.import_graph_def(input_graph_def, name="") ops = graph.get_operations() - unresolved_ops = ops - if device == 'gpu': - add_input_transform(input_node, dt, net_def) - while len(unresolved_ops) > 0: - convert_ops(unresolved_ops, dt, net_def, device) - if device == 'gpu': - add_output_transform(output_node, net_def) - - print "PB Parsed." + converter = TFConverter(ops, net_def, dt, device) + converter.convert(input_node, output_node) + print "PB Converted, start optimize memory." + mem_optimizer = memory_optimizer.MemoryOptimizer(net_def) + mem_optimizer.optimize() + print "Memory optimization done." return net_def diff --git a/mace/python/tools/tf_dsp_converter_lib.py b/mace/python/tools/tf_dsp_converter_lib.py index 209173e90f8930d2fe7abbc767def55ac93e5e9a..e9eae6361362447b2297c4a1b99a6e0c6b46166b 100644 --- a/mace/python/tools/tf_dsp_converter_lib.py +++ b/mace/python/tools/tf_dsp_converter_lib.py @@ -149,6 +149,7 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops): elif is_node_flatten_reshape(first_op): op_def.type = 'Flatten' op_def.input.extend([t.name for t in first_op.inputs]) + op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs]) convert_op_outputs(op_def, first_op) elif dsp_ops.has_op(first_op.type): op_def.input.extend([t.name for t in first_op.inputs]) diff --git a/tools/validate.py b/tools/validate.py index 9edbdd2456e93c8ad89d812013c9cdea24b57aea..f322ed703b6a46c9fee577208a18fe6c00692302 100644 --- a/tools/validate.py +++ b/tools/validate.py @@ -4,6 +4,7 @@ import os import os.path import tensorflow as tf import numpy as np +from scipy import spatial from tensorflow import gfile @@ -34,9 +35,12 @@ def load_data(file): def valid_output(out_shape, mace_out_file, tf_out_value): mace_out_value = load_data(mace_out_file) if mace_out_value.size != 0: + similarity = (1 - spatial.distance.cosine(tf_out_value.flat, mace_out_value)) + print 'MACE VS TF similarity: ', similarity + if similarity > 0.999: + print '=======================Passed! Haha======================' mace_out_value = mace_out_value.reshape(out_shape) np.testing.assert_allclose(mace_out_value, tf_out_value, rtol=0.05) - print '=======================Passed! Haha======================' else: print '=======================Skip empty node===================' @@ -62,7 +66,7 @@ def run_model(input_shape): input_value = input_value.reshape(input_shape) output_value = session.run(output_node, feed_dict={input_node: [input_value]}) - # output_value.astype(np.float32).tofile( os.path.dirname(FLAGS.input_file) + '/tf_weight') + output_value.astype(np.float32).tofile( os.path.dirname(FLAGS.input_file) + '/tf_out') return output_value def main(unused_args): diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh index 9946fde358e2d06fcd46cdcb1c79067e5479bed0..275f1bfb827f7f713d8b93c892c5080ba2cbd6b6 100644 --- a/tools/validate_gcn.sh +++ b/tools/validate_gcn.sh @@ -2,10 +2,10 @@ # Must run at root dir of mace project. set +x Usage() { - echo 'Usage: bash tools/validate_gcn.sh tf_model_file' + echo 'Usage: bash tools/validate_gcn.sh tf_model_path image_size' } -if [ $# != 1 ];then +if [ $# != 2 ];then Usage exit -1 fi @@ -13,18 +13,18 @@ fi TF_MODEL_FILE_PATH=$1 MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH}) MACE_MODEL_NAME='mace_model.pb' -MACE_OPT_MODEL_NAME='mace_opt_model.pb' INPUT_FILE_NAME='model_input' OUTPUT_FILE_NAME='gcn.out' OUTPUT_LIST_FILE='gcn.list' PHONE_DATA_DIR="/data/local/tmp/${MACE_MODEL_NAME}" KERNEL_DIR="${PHONE_DATA_DIR}/cl/" +IMAGE_SIZE=$2 # Step 1: Generate input data echo "Step 1: Generate input data" python tools/validate.py --generate_data true --random_seed 1 \ --input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \ - --input_shape=512,512,3 + --input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3" # Step 2: convert tf model to mace model echo "Step 2: convert tf model to mace model and optimize memory" @@ -35,10 +35,6 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \ --output_node=GCN/br_result_2/fcn_br \ --data_type=DT_HALF \ --runtime=gpu -bazel build mace/python/tools:memory_optimizer -bazel-bin/mace/python/tools/memory_optimizer ${MODEL_DIR}/${MACE_MODEL_NAME} \ - ${MODEL_DIR}/${MACE_OPT_MODEL_NAME} - # Step 3: Run model on the phone echo "Step 3: Run model on the phone" @@ -49,21 +45,22 @@ bazel build -c opt --strip always mace/examples:mace_run \ adb shell "mkdir -p ${PHONE_DATA_DIR}" adb shell "mkdir -p ${KERNEL_DIR}" -adb push mace/kernels/opencl/cl/ ${KERNEL_DIR} -adb push ${MODEL_DIR}/${MACE_OPT_MODEL_NAME} ${PHONE_DATA_DIR} +adb push mace/kernels/opencl/cl/* ${KERNEL_DIR} +adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR} adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR} num_threads=${1:-4} -adb