Commit b1397592 authored by yejianwu

fix conflict

@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {
@@ -33,8 +34,6 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
   built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
   auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);

-  const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
-
   uint32_t idx = 0;
   for (auto input : input_tensors) {
     addn_kernel.setArg(idx++,
@@ -42,12 +41,47 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
   }
   addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));

-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      addn_kernel, cl::NullRange,
-      cl::NDRange(width_pixels, batch_height_pixels),
-      cl::NDRange(64, 16),  // TODO fix this
-      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(width_pixels),
+      static_cast<uint32_t>(batch_height_pixels)
+  };
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
+  std::vector<uint32_t> lws = {64, 16};
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    uint32_t local_ws[2];
+    local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
+    local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
+    return {{local_ws[0], local_ws[1]},
+            {kwg_size / 16, 16},
+            {kwg_size / 32, 32},
+            {kwg_size / 64, 64},
+            {kwg_size / 128, 128},
+            {kwg_size / 256, 256},
+            {kwg_size, 1},
+            {1, kwg_size}
+    };
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        addn_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1]),
+        cl::NDRange(params[0], params[1]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "addn_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 }

 template <typename T>
...
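Note: every kernel touched by this commit now funnels its launch through Tuner<uint32_t>::TuneOrRun, keyed by op name plus output shape. The tuner itself is not part of this diff, so the sketch below is only a minimal illustration of the tune-or-run contract the call sites above rely on (generate candidates, time each one, cache the winner per key); the wall-clock timing and in-memory cache here are assumptions standing in for whatever MACE does internally (the script below persists tuned parameters via MACE_RUN_PARAMETER_PATH, which this sketch does not model).

// Minimal sketch of a tune-or-run helper matching the call sites above.
// Internals (timing, caching, persistence) are assumptions, not MACE's code.
#include <chrono>
#include <functional>
#include <limits>
#include <map>
#include <string>
#include <vector>

template <typename param_type>
class Tuner {
 public:
  static Tuner *Get() {
    static Tuner tuner;
    return &tuner;
  }

  template <typename RetType>
  RetType TuneOrRun(
      const std::string &key,
      const std::vector<param_type> &default_params,
      const std::function<std::vector<std::vector<param_type>>()> &generator,
      const std::function<RetType(const std::vector<param_type> &)> &func) {
    auto it = cache_.find(key);
    if (it == cache_.end()) {
      // First sighting of this kernel/shape: run every candidate once,
      // keep the fastest. A real tuner would time via OpenCL events.
      double best_us = std::numeric_limits<double>::max();
      std::vector<param_type> best = default_params;
      for (const auto &params : generator()) {
        auto start = std::chrono::steady_clock::now();
        func(params);
        auto end = std::chrono::steady_clock::now();
        double us =
            std::chrono::duration<double, std::micro>(end - start).count();
        if (us < best_us) {
          best_us = us;
          best = params;
        }
      }
      it = cache_.emplace(key, best).first;
    }
    return func(it->second);  // run with the cached best parameters
  }

 private:
  std::map<std::string, std::vector<param_type>> cache_;
};

Under this contract, the first run of a given op/shape pays the tuning cost once and every later run reuses the cached local work-group size.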
@@ -48,8 +48,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{8, 128, 1}, //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
...
@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {
@@ -41,21 +42,57 @@ static void Concat2(const Tensor *input0,
   concat_kernel.setArg(idx++, static_cast<int32_t>(input0->dim(3)));
   concat_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));

+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blk),
+      static_cast<uint32_t>(width),
+      static_cast<uint32_t>(batch * height),
+  };
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
-  uint32_t lws[3] = {8, 16, 8};
-  // lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
-  // lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
-  // lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
-
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      concat_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(channel_blk),
-                  static_cast<uint32_t>(width),
-                  static_cast<uint32_t>(height * batch)),
-      cl::NDRange(lws[0], lws[1], lws[2]),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS);
+  std::vector<uint32_t> lws = {8, 16, 8};
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8}, //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        concat_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "concat_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 }

 template<typename T>
...
@@ -68,8 +68,13 @@ void Conv1x1(const Tensor *input,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 15, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]()->std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8}, //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size/16, 4, 4},
             {kwg_size/32, 4, 8},
             {kwg_size/32, 8, 4},
...
@@ -60,8 +60,13 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {4, 15, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8}, //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
...
@@ -62,8 +62,13 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8}, //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
...
@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {
@@ -23,11 +24,6 @@ static void Pooling(const Tensor *input,
   index_t channels = output->dim(3);
   index_t channel_blocks = (channels + 3) / 4;

-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blocks),
-      static_cast<uint32_t>(out_width),
-      static_cast<uint32_t>(batch * out_height),
-  };
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;
@@ -44,13 +40,6 @@ static void Pooling(const Tensor *input,
   }
   auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);

-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
-  uint32_t lws[3];
-  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
-  lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
-
   uint32_t idx = 0;
   pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
   pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));
@@ -62,12 +51,60 @@ static void Pooling(const Tensor *input,
   pooling_kernel.setArg(idx++, pooling_size);
   pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));

-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      pooling_kernel, cl::NullRange,
-      cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS) << error;
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blocks),
+      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(batch * out_height),
+  };
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
+  std::vector<uint32_t> lws(3, 0);
+  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
+  lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8}, //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        pooling_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "pooling_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 }

 template<typename T>
...
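Note: the fallback local_ws triple computed inside every params_generator above is built by successive clamping against the kernel's maximum work-group size, so the product of the three dimensions can never exceed that limit. A standalone demonstration of the arithmetic (the concrete sizes below are made up for illustration):

// Standalone illustration of the local-work-size clamping used by the
// params_generator lambdas above; lws[0] * lws[1] * lws[2] <= kwg_size
// always holds thanks to the integer divisions.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t kwg_size = 256;       // e.g. from GetKernelMaxWorkGroupSize
  const uint32_t channel_blocks = 16;  // example global dimensions
  const uint32_t out_width = 64;
  const uint32_t batch_height = 128;

  uint32_t lws[3];
  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);      // 16
  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);  // min(64, 16) = 16
  lws[2] = std::min<uint32_t>(batch_height,
                              kwg_size / (lws[0] * lws[1]));  // min(128, 1) = 1
  std::printf("lws = {%u, %u, %u}, product = %u <= %u\n",
              lws[0], lws[1], lws[2], lws[0] * lws[1] * lws[2], kwg_size);
  return 0;
}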
@@ -50,8 +50,13 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8}, //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
...
@@ -7,6 +7,7 @@
 #include "mace/kernels/resize_bilinear.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {
@@ -44,8 +45,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);

-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
-
   uint32_t idx = 0;
   rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
   rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
@@ -55,17 +54,52 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
   rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));

-  auto command_queue = runtime->command_queue();
-
-  cl_int error = command_queue.enqueueNDRangeKernel(
-      rb_kernel, cl::NullRange,
-      cl::NDRange(static_cast<int32_t>(channel_blocks),
-                  static_cast<int32_t>(out_width),
-                  static_cast<int32_t>(out_height * batch)),
-      // TODO tuning
-      cl::NDRange(1, static_cast<int32_t>(out_width > kwg_size ? kwg_size : out_width), 1),
-      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS, error);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(out_width),
+                           static_cast<uint32_t>(out_height * batch)};
+  const std::vector<uint32_t> lws = {8, 16, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8}, //SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        rb_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "resize_bilinear_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 }

 template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
...
@@ -8,6 +8,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":memory_optimizer",
         "//mace/proto:mace_py",
     ],
 )
...
@@ -65,7 +65,7 @@ class MemoryOptimizer(object):
       raise Exception('ref count is less than 0')

     for mem in self.mem_block:
-      arena = net_def.mem_arena
+      arena = self.net_def.mem_arena
       block = arena.mem_block.add()
       block.mem_id = mem
       block.x = self.mem_block[mem][0]
@@ -83,20 +83,7 @@ class MemoryOptimizer(object):
     print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size)

-if __name__ == '__main__':
-  model_file = sys.argv[1]
-  opt_model_file = sys.argv[2]
-  with open(model_file, "rb") as f:
-    net_def = mace_pb2.NetDef()
-    net_def.ParseFromString(f.read())
-    optimizer = MemoryOptimizer(net_def)
-    optimizer.optimize()
-  with open(opt_model_file, "wb") as f:
-    f.write(net_def.SerializeToString())
-  with open(opt_model_file + '_txt', "wb") as f:
-    net_def.ClearField('tensors')
-    f.write(str(net_def))
+def optimize_memory(net_def):
+  mem_optimizer = MemoryOptimizer(net_def)
+  mem_optimizer.optimize()
\ No newline at end of file
...
 from mace.proto import mace_pb2
 import tensorflow as tf
 import numpy as np
-from mace.python.tools.convert_util import tf_dtype_2_mace_dtype
+from mace.python.tools import memory_optimizer

 # TODO: support NCHW formt, now only support NHWC.
 padding_mode = {
@@ -25,22 +25,10 @@ data_type_map = {
     'DT_FLOAT': mace_pb2.DT_FLOAT
 }

-def convert_tensor(op, tensor):
-  tf_tensor = op.outputs[0].eval()
-  tensor.name = op.outputs[0].name
-  shape = list(tf_tensor.shape)
-  tensor.dims.extend(shape)
-  tf_dt = op.get_attr('dtype')
-  if tf_dt == tf.float32:
-    tensor.data_type = mace_pb2.DT_FLOAT
-    tensor.float_data.extend(tf_tensor.astype(float).flat)
-  elif tf_dt == tf.int32:
-    tensor.data_type = mace_pb2.DT_INT32
-    tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
-  else:
-    raise Exception("Not supported tensor type: " + tf_dt.name)
+BATCH_NORM_ORDER = ["Add", "Rsqrt", "Mul", "Mul", "Mul", "Sub", "Add"]
+
+MACE_INPUT_NODE_NAME = "mace_input_node"
+MACE_OUTPUT_NODE_NAME = "mace_output_node"

 def get_input_tensor(op, index):
   input_tensor = op.inputs[index]
@@ -48,9 +36,26 @@ def get_input_tensor(op, index):
     input_tensor = get_input_tensor(input_tensor.op, 0)
   return input_tensor

-def add_buffer_to_image(input_name, input_type, dt, net_def):
+class TFConverter(object):
+  def __init__(self, tf_ops, net_def, dt, device):
+    self.net_def = net_def
+    self.tf_ops = tf_ops
+    self.dt = dt
+    self.device = device
+    self.tf_graph = {}
+    self.resolved_ops = {}
+    for op in tf_ops:
+      self.resolved_ops[op.name] = 0
+      for input in op.inputs:
+        input_name = input.name[:-2]
+        if input_name not in self.tf_graph:
+          self.tf_graph[input_name] = []
+        self.tf_graph[input_name].append(op)
+
+  def add_buffer_to_image(self, input_name, input_type):
     output_name = input_name[:-2] + "_b2i" + input_name[-2:]
-  op_def = net_def.op.add()
+    op_def = self.net_def.op.add()
     op_def.name = output_name[:-2]
     op_def.type = 'BufferToImage'
     op_def.input.extend([input_name])
@@ -64,28 +69,12 @@ def add_buffer_to_image(input_name, input_type, dt, net_def):
     arg.i = 0
     arg = op_def.arg.add()
     arg.name = 'T'
-  arg.i = dt
+    arg.i = self.dt
     return output_name

-def add_image_to_buffer(input_name, input_type, dt, net_def):
-  output_name = input_name[:-2] + "_i2b" + input_name[-2:]
-  op_def = net_def.op.add()
-  op_def.name = output_name[:-2]
-  op_def.type = 'ImageToBuffer'
-  op_def.input.extend([input_name])
-  op_def.output.extend([output_name])
-  arg = op_def.arg.add()
-  arg.name = 'buffer_type'
-  arg.i = buffer_type_map[input_type]
-  arg = op_def.arg.add()
-  arg.name = 'T'
-  arg.i = dt
-  return output_name
-
-def add_input_transform(name, dt, net_def):
-  new_input_name = "mace_input_node:0"
-  op_def = net_def.op.add()
+  def add_input_transform(self, name):
+    new_input_name = MACE_INPUT_NODE_NAME + ":0"
+    op_def = self.net_def.op.add()
     op_def.name = name
     op_def.type = 'BufferToImage'
     op_def.input.extend([new_input_name])
@@ -97,11 +86,11 @@ def add_input_transform(name, dt, net_def):
     arg = op_def.arg.add()
     arg.name = 'T'
-  arg.i = dt
+    arg.i = self.dt

-def add_output_transform(name, net_def):
-  output_name = "mace_output_node:0"
-  op_def = net_def.op.add()
+  def add_output_transform(self, name):
+    output_name = MACE_OUTPUT_NODE_NAME + ":0"
+    op_def = self.net_def.op.add()
     op_def.name = output_name[:-2]
     op_def.type = 'ImageToBuffer'
     op_def.input.extend([name+':0'])
@@ -111,197 +100,322 @@ def add_output_transform(name, net_def):
     epsilon_arg.name = 'buffer_type'
     epsilon_arg.i = buffer_type_map['IN_OUT']

-def convert_op_outputs(mace_op_def, tf_op):
-  mace_op_def.output.extend([output.name for output in tf_op.outputs])
-  mace_op_def.output_type.extend([tf_dtype_2_mace_dtype(output.dtype)
-                                  for output in tf_op.outputs])
-  output_shapes = []
-  for output in tf_op.outputs:
-    output_shape = mace_pb2.OutputShape()
-    output_shape.dims.extend(output.shape.as_list())
-    output_shapes.append(output_shape)
-  mace_op_def.output_shape.extend(output_shapes)
-
-def convert_ops(unresolved_ops, dt, net_def, device):
-  ops_count = len(unresolved_ops)
-  resolved_count = 1
-  first_op = unresolved_ops[0]
-  if first_op.type in ['Placeholder', 'Reshape', 'Identity']:
-    pass
-  elif first_op.type == 'Const':
-    tensor = net_def.tensors.add()
-    convert_tensor(first_op, tensor)
-  else:
-    op_def = net_def.op.add()
-    arg = op_def.arg.add()
-    arg.name = 'T'
-    arg.i = dt
-    if first_op.type == 'Conv2D' or first_op.type == 'DepthwiseConv2dNative':
-      op_def.name = first_op.name
-      if first_op.type == 'DepthwiseConv2dNative':
-        op_def.type = 'DepthwiseConv2d'
-      else:
-        op_def.type = first_op.type
-      if device == 'gpu':
-        op_def.input.extend([first_op.inputs[0].name])
-        output_name = add_buffer_to_image(first_op.inputs[1].name, "FILTER", dt, net_def)
-        op_def.input.extend([output_name])
-      else:
-        op_def.input.extend([input.name for input in first_op.inputs])
-      padding_arg = op_def.arg.add()
-      padding_arg.name = 'padding'
-      padding_arg.i = padding_mode[first_op.get_attr('padding')]
-      strides_arg = op_def.arg.add()
-      strides_arg.name = 'strides'
-      strides_arg.ints.extend(first_op.get_attr('strides')[1:3])
-      data_format_arg = op_def.arg.add()
-      data_format_arg.name = 'data_format'
-      data_format_arg.s = 'NHWC'
-      final_op = first_op
-      if ops_count >= 3 and unresolved_ops[1].type == 'Const' and unresolved_ops[2].type == 'BiasAdd' :
-        bias_tensor = unresolved_ops[1]
-        tensor = net_def.tensors.add()
-        convert_tensor(bias_tensor, tensor)
-        bias_add_op = unresolved_ops[2]
-        if device == 'gpu':
-          output_name = add_buffer_to_image(bias_add_op.inputs[1].name, "ARGUMENT", dt, net_def)
-          op_def.input.extend([output_name])
-        else:
-          op_def.input.extend([bias_add_op.inputs[1].name])
-        final_op = bias_add_op
-        resolved_count = 3
-        if ops_count >= 4 and unresolved_ops[3].type == 'Relu':
-          relu_op = unresolved_ops[3];
-          op_def.type = "FusedConv2D"
-          final_op = relu_op
-          resolved_count = 4
-      convert_op_outputs(op_def, final_op)
-    elif first_op.type == 'FusedBatchNorm':
-      op_def.name = first_op.name
-      op_def.type = 'BatchNorm'
-      if device == 'gpu':
-        op_def.input.extend([first_op.inputs[0].name])
-        for i in range(1, len(first_op.inputs)):
-          output_name = add_buffer_to_image(first_op.inputs[i].name, "ARGUMENT", dt, net_def)
-          op_def.input.extend([output_name])
-      else:
-        op_def.input.extend([input.name for input in first_op.inputs])
-      op_def.output.extend([first_op.outputs[0].name])
-      output_shape = mace_pb2.OutputShape()
-      output_shape.dims.extend(first_op.outputs[0].shape.as_list())
-      op_def.output_shape.extend([output_shape])
-      epsilon_arg = op_def.arg.add()
-      epsilon_arg.name = 'epsilon'
-      epsilon_arg.f = first_op.get_attr('epsilon')
-      data_format_arg = op_def.arg.add()
-      data_format_arg.name = 'data_format'
-      data_format_arg.s = 'NHWC'
-    elif first_op.type == 'Add' and first_op.name.endswith(
-        'batchnorm/add') and ops_count > 7:
-      add_op = first_op
-      mul_op = unresolved_ops[2]
-      mul_1_op = unresolved_ops[3]
-      mul_2_op = unresolved_ops[4]
-      sub_op = unresolved_ops[5]
-      add_1_op = unresolved_ops[6]
-      # print (mul_op.type, mul_2_op.type, mul_1_op.type, sub_op.type)
-      if mul_op.type != 'Mul' or mul_2_op.type != 'Mul' or \
-          mul_1_op.type != 'Mul' or sub_op.type != 'Sub' or add_1_op.type != 'Add':
-        raise Exception('Invalid BatchNorm Op')
-      get_input_tensor(mul_1_op, 0)
-      input_name = get_input_tensor(mul_1_op, 0).name
-      gamma = get_input_tensor(mul_op, 1).name
-      beta = get_input_tensor(sub_op, 0).name
-      mean = get_input_tensor(mul_2_op, 0).name
-      variance = get_input_tensor(add_op, 0).name
-      epsilon = get_input_tensor(add_op, 1).name
-      op_def.name = first_op.name[:-4]  # remove /add
-      op_def.type = 'BatchNorm'
-      op_def.input.extend([input_name, gamma, beta, mean, variance, epsilon])
-      convert_op_outputs(op_def, add_1_op)
-      resolved_count = 7
-    elif first_op.type == 'Relu6':
-      op_def.name = first_op.name
-      op_def.type = 'Relu'
-      op_def.input.extend([input.name for input in first_op.inputs])
-      convert_op_outputs(op_def, first_op)
-      max_limit_arg = op_def.arg.add()
-      max_limit_arg.name = 'max_limit'
-      max_limit_arg.f = 6
-    elif first_op.type == 'AvgPool' or first_op.type == 'MaxPool':
-      op_def.name = first_op.name
-      op_def.type = 'Pooling'
-      op_def.input.extend([input.name for input in first_op.inputs])
-      convert_op_outputs(op_def, first_op)
-      pooling_type_arg = op_def.arg.add()
-      pooling_type_arg.name = 'pooling_type'
-      pooling_type_arg.i = pooling_type_mode[first_op.type]
-      padding_arg = op_def.arg.add()
-      padding_arg.name = 'padding'
-      padding_arg.i = padding_mode[first_op.get_attr('padding')]
-      strides_arg = op_def.arg.add()
-      strides_arg.name = 'strides'
-      strides_arg.ints.extend(first_op.get_attr('strides')[1:3])
-      kernels_arg = op_def.arg.add()
-      kernels_arg.name = 'kernels'
-      kernels_arg.ints.extend(first_op.get_attr('ksize')[1:3])
-      data_format_arg = op_def.arg.add()
-      data_format_arg.name = 'data_format'
-      data_format_arg.s = 'NHWC'
-    elif first_op.type == 'Add':
-      op_def.name = first_op.name
-      op_def.type = "AddN"
-      op_def.input.extend([input.name for input in first_op.inputs])
-      convert_op_outputs(op_def, first_op)
-    elif first_op.type == 'ConcatV2':
-      op_def.name = first_op.name
-      op_def.type = "Concat"
-      op_def.input.extend([first_op.inputs[i].name for i in xrange(2)])
-      axis_arg = op_def.arg.add()
-      axis_arg.name = 'axis'
-      axis_arg.i = get_input_tensor(first_op, 2).eval().astype(np.int32)
-      convert_op_outputs(op_def, first_op)
-    elif first_op.type == 'ResizeBilinear':
-      op_def.name = first_op.name
-      op_def.type = "ResizeBilinear"
-      op_def.input.extend([first_op.inputs[0].name])
-      size_arg = op_def.arg.add()
-      size_arg.name = 'size'
-      size_arg.ints.extend(get_input_tensor(first_op, 1).eval().astype(np.int32).flat)
-      size_arg = op_def.arg.add()
-      size_arg.name = 'align_corners'
-      size_arg.i = first_op.get_attr('align_corners')
-      convert_op_outputs(op_def, first_op)
-    elif first_op.type in ['Relu', 'SpaceToBatchND', 'BatchToSpaceND', 'BiasAdd']:
-      op_def.name = first_op.name
-      op_def.type = first_op.type
-      op_def.input.extend([input.name for input in first_op.inputs])
-      convert_op_outputs(op_def, first_op)
-    else:
-      raise Exception('Unknown Op: %s, type: %s' % (first_op.name, first_op.type))
-
-  for i in range(resolved_count):
-    del unresolved_ops[0]
+  @staticmethod
+  def add_output_shape(outputs, op):
+    output_shapes = []
+    for output in outputs:
+      if output.shape.num_elements() is not None:
+        output_shape = mace_pb2.OutputShape()
+        output_shape.dims.extend(output.shape.as_list())
+        output_shapes.append(output_shape)
+    op.output_shape.extend(output_shapes)
+
+  def convert_tensor(self, op):
+    tensor = self.net_def.tensors.add()
+    tf_tensor = op.outputs[0].eval()
+    tensor.name = op.outputs[0].name
+    shape = list(tf_tensor.shape)
+    tensor.dims.extend(shape)
+    tf_dt = op.get_attr('dtype')
+    if tf_dt == tf.float32:
+      tensor.data_type = mace_pb2.DT_FLOAT
+      tensor.float_data.extend(tf_tensor.astype(np.float32).flat)
+    elif tf_dt == tf.int32:
+      tensor.data_type = mace_pb2.DT_INT32
+      tensor.int32_data.extend(tf_tensor.astype(np.int32).flat)
+    else:
+      raise Exception("Not supported tensor type: " + tf_dt.name)
+    self.resolved_ops[op.name] = 1
+
+  def convert_conv2d(self, op):
+    op_def = mace_pb2.OperatorDef()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    op_def.name = op.name
+    if op.type == 'DepthwiseConv2dNative':
+      op_def.type = 'DepthwiseConv2d'
+    else:
+      op_def.type = op.type
+    if self.device == 'gpu':
+      op_def.input.extend([op.inputs[0].name])
+      output_name = self.add_buffer_to_image(op.inputs[1].name, "FILTER")
+      op_def.input.extend([output_name])
+    else:
+      op_def.input.extend([input.name for input in op.inputs])
+    padding_arg = op_def.arg.add()
+    padding_arg.name = 'padding'
+    padding_arg.i = padding_mode[op.get_attr('padding')]
+    strides_arg = op_def.arg.add()
+    strides_arg.name = 'strides'
+    strides_arg.ints.extend(op.get_attr('strides')[1:3])
+    data_format_arg = op_def.arg.add()
+    data_format_arg.name = 'data_format'
+    data_format_arg.s = 'NHWC'
+    final_op = op
+    self.resolved_ops[op.name] = 1
+    if len(self.tf_graph[op.name]) == 1 and self.tf_graph[op.name][0].type == 'BiasAdd' :
+      bias_add_op = self.tf_graph[op.name][0]
+      if self.device == 'gpu':
+        output_name = self.add_buffer_to_image(bias_add_op.inputs[1].name, "ARGUMENT")
+        op_def.input.extend([output_name])
+      else:
+        op_def.input.extend([bias_add_op.inputs[1].name])
+      final_op = bias_add_op
+      self.resolved_ops[bias_add_op.name] = 1
+    if len(self.tf_graph[final_op.name]) == 1 \
+        and self.tf_graph[final_op.name][0].type == 'Relu':
+      relu_op = self.tf_graph[final_op.name][0]
+      op_def.type = "FusedConv2D"
+      final_op = relu_op
+      self.resolved_ops[relu_op.name] = 1
+    op_def.output.extend([output.name for output in final_op.outputs])
+    self.add_output_shape(final_op.outputs, op_def)
+    self.net_def.op.extend([op_def])
+
+  def convert_fused_batchnorm(self, op):
+    op_def = mace_pb2.OperatorDef()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    op_def.name = op.name
+    op_def.type = 'BatchNorm'
+    if self.device == 'gpu':
+      op_def.input.extend([op.inputs[0].name])
+      for i in range(1, len(op.inputs)):
+        output_name = self.add_buffer_to_image(op.inputs[i].name, "ARGUMENT")
+        op_def.input.extend([output_name])
+    else:
+      op_def.input.extend([input.name for input in op.inputs])
+    op_def.output.extend([op.outputs[0].name])
+    self.add_output_shape(op.outputs, op_def)
+    epsilon_arg = op_def.arg.add()
+    epsilon_arg.name = 'epsilon'
+    epsilon_arg.f = op.get_attr('epsilon')
+    data_format_arg = op_def.arg.add()
+    data_format_arg.name = 'data_format'
+    data_format_arg.s = 'NHWC'
+    self.resolved_ops[op.name] = 1
+    self.net_def.op.extend([op_def])
+
+  def convert_batchnorm(self, op):
+    bn_ops = []
+    bn_ops.append(op)
+    for i in range(1, 3):
+      if len(self.tf_graph[bn_ops[i-1].name]) == 1 \
+          and self.tf_graph[bn_ops[i-1].name][0].type == BATCH_NORM_ORDER[i]:
+        bn_ops.append(self.tf_graph[bn_ops[i-1].name][0])
+      else:
+        raise Exception('Invalid BatchNorm Op')
+    if len(self.tf_graph[bn_ops[2].name]) == 2 \
+        and self.tf_graph[bn_ops[2].name][0].type == BATCH_NORM_ORDER[3] \
+        and self.tf_graph[bn_ops[2].name][1].type == BATCH_NORM_ORDER[4]:
+      bn_ops.append(self.tf_graph[bn_ops[2].name][0])
+      bn_ops.append(self.tf_graph[bn_ops[2].name][1])
+    else:
+      raise Exception('Invalid BatchNorm Op')
+    bn_ops.append(self.tf_graph[bn_ops[4].name][0])
+    bn_ops.append(self.tf_graph[bn_ops[3].name][0])
+
+    op_def = mace_pb2.OperatorDef()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    input_name = get_input_tensor(bn_ops[3], 0).name
+    gamma = get_input_tensor(bn_ops[2], 1).name
+    beta = get_input_tensor(bn_ops[5], 0).name
+    mean = get_input_tensor(bn_ops[4], 0).name
+    variance = get_input_tensor(bn_ops[0], 0).name
+
+    op_def.name = op.name[:-4]  # remove /add
+    op_def.type = 'BatchNorm'
+    if self.device == 'gpu':
+      op_def.input.extend([input_name])
+      for tensor_name in [gamma, beta, mean, variance]:
+        output_name = self.add_buffer_to_image(tensor_name, "ARGUMENT")
+        op_def.input.extend([output_name])
+    else:
+      op_def.input.extend([input_name, gamma, beta, mean, variance])
+    op_def.output.extend([output.name for output in bn_ops[6].outputs])
+    self.add_output_shape(bn_ops[6].outputs, op_def)
+    epsilon_arg = op_def.arg.add()
+    epsilon_arg.name = 'epsilon'
+    epsilon_arg.f = get_input_tensor(op, 1).eval().astype(np.float)
+    data_format_arg = op_def.arg.add()
+    data_format_arg.name = 'data_format'
+    data_format_arg.s = 'NHWC'
+    self.net_def.op.extend([op_def])
+    for i in range(0, 7):
+      self.resolved_ops[bn_ops[i].name] = 1
+
+  def convert_pooling(self, op):
+    op_def = self.net_def.op.add()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    op_def.name = op.name
+    op_def.type = 'Pooling'
+    op_def.input.extend([input.name for input in op.inputs])
+    op_def.output.extend([output.name for output in op.outputs])
+    self.add_output_shape(op.outputs, op_def)
+    pooling_type_arg = op_def.arg.add()
+    pooling_type_arg.name = 'pooling_type'
+    pooling_type_arg.i = pooling_type_mode[op.type]
+    padding_arg = op_def.arg.add()
+    padding_arg.name = 'padding'
+    padding_arg.i = padding_mode[op.get_attr('padding')]
+    strides_arg = op_def.arg.add()
+    strides_arg.name = 'strides'
+    strides_arg.ints.extend(op.get_attr('strides')[1:3])
+    kernels_arg = op_def.arg.add()
+    kernels_arg.name = 'kernels'
+    kernels_arg.ints.extend(op.get_attr('ksize')[1:3])
+    data_format_arg = op_def.arg.add()
+    data_format_arg.name = 'data_format'
+    data_format_arg.s = 'NHWC'
+    self.resolved_ops[op.name] = 1
+
+  def convert_relu6(self, op):
+    op_def = self.net_def.op.add()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    op_def.name = op.name
+    op_def.type = 'Relu'
+    op_def.input.extend([input.name for input in op.inputs])
+    op_def.output.extend([output.name for output in op.outputs])
+    self.add_output_shape(op.outputs, op_def)
+    max_limit_arg = op_def.arg.add()
+    max_limit_arg.name = 'max_limit'
+    max_limit_arg.f = 6
+    self.resolved_ops[op.name] = 1
+
+  def convert_add(self, op):
+    op_def = self.net_def.op.add()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    op_def.name = op.name
+    op_def.type = "AddN"
+    op_def.input.extend([input.name for input in op.inputs])
+    op_def.output.extend([output.name for output in op.outputs])
+    self.add_output_shape(op.outputs, op_def)
+    self.resolved_ops[op.name] = 1
+
+  def convert_concat(self, op):
+    op_def = self.net_def.op.add()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    op_def.name = op.name
+    op_def.type = "Concat"
+    op_def.input.extend([op.inputs[i].name for i in xrange(2)])
+    op_def.output.extend([output.name for output in op.outputs])
+    axis_arg = op_def.arg.add()
+    axis_arg.name = 'axis'
+    axis_arg.i = get_input_tensor(op, 2).eval().astype(np.int32)
+    self.add_output_shape(op.outputs, op_def)
+    self.resolved_ops[op.name] = 1
+
+  def convert_resize_bilinear(self, op):
+    op_def = self.net_def.op.add()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    op_def.name = op.name
+    op_def.type = "ResizeBilinear"
+    op_def.input.extend([op.inputs[0].name])
+    op_def.output.extend([output.name for output in op.outputs])
+    size_arg = op_def.arg.add()
+    size_arg.name = 'size'
+    size_arg.ints.extend(get_input_tensor(op, 1).eval().astype(np.int32).flat)
+    size_arg = op_def.arg.add()
+    size_arg.name = 'align_corners'
+    size_arg.i = op.get_attr('align_corners')
+    self.add_output_shape(op.outputs, op_def)
+    self.resolved_ops[op.name] = 1
+
+  def convert_bias_add(self, op):
+    op_def = mace_pb2.OperatorDef()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    op_def.name = op.name
+    op_def.type = "BiasAdd"
+    op_def.input.extend([op.inputs[0].name])
+    if self.device == 'gpu':
+      output_name = self.add_buffer_to_image(op.inputs[1].name, "ARGUMENT")
+      op_def.input.extend([output_name])
+    else:
+      op_def.input.extend([op.inputs[1].name])
+    op_def.output.extend([output.name for output in op.outputs])
+    self.add_output_shape(op.outputs, op_def)
+    self.net_def.op.extend([op_def])
+    self.resolved_ops[op.name] = 1
+
+  def convert_normal_op(self, op):
+    op_def = self.net_def.op.add()
+    arg = op_def.arg.add()
+    arg.name = 'T'
+    arg.i = self.dt
+    op_def.name = op.name
+    op_def.type = op.type
+    op_def.input.extend([input.name for input in op.inputs])
+    op_def.output.extend([output.name for output in op.outputs])
+    self.add_output_shape(op.outputs, op_def)
+    self.resolved_ops[op.name] = 1
+
+  def convert(self, input_node, output_node):
+    if self.device == 'gpu':
+      self.add_input_transform(input_node)
+    for op in self.tf_ops:
+      if self.resolved_ops[op.name] == 1:
+        continue
+      if op.type in ['Placeholder', 'Reshape', 'Identity']:
+        self.resolved_ops[op.name] = 1
+        pass
+      elif op.type == 'Const':
+        self.convert_tensor(op)
+      elif op.type == 'Conv2D' or op.type == 'DepthwiseConv2dNative':
+        self.convert_conv2d(op)
+      elif op.type == 'FusedBatchNorm':
+        self.convert_fused_batchnorm(op)
+      elif op.type == 'Add' and op.name.endswith('batchnorm/add'):
+        self.convert_batchnorm(op)
+      elif op.type == 'AvgPool' or op.type == 'MaxPool':
+        self.convert_pooling(op)
+      elif op.type == 'Relu6':
+        self.convert_relu6(op)
+      elif op.type == 'Add':
+        self.convert_add(op)
+      elif op.type == 'ConcatV2':
+        self.convert_concat(op)
+      elif op.type == 'ResizeBilinear':
+        self.convert_resize_bilinear(op)
+      elif op.type == 'BiasAdd':
+        self.convert_bias_add(op)
+      elif op.type in ['Relu', 'SpaceToBatchND', 'BatchToSpaceND']:
+        self.convert_normal_op(op)
+      else:
+        raise Exception('Unknown Op: %s, type: %s' % (op.name, op.type))
+    if self.device == 'gpu':
+      self.add_output_transform(output_node)
+    for key in self.resolved_ops:
+      if self.resolved_ops[key] != 1:
+        print 'Unresolve Op: %s' % key

 def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, device):
   net_def = mace_pb2.NetDef()
@@ -311,14 +425,11 @@ def convert_to_mace_pb(input_graph_def, input_node, output_node, data_type, device):
   with session.graph.as_default() as graph:
     tf.import_graph_def(input_graph_def, name="")
     ops = graph.get_operations()
-    unresolved_ops = ops
-    if device == 'gpu':
-      add_input_transform(input_node, dt, net_def)
-    while len(unresolved_ops) > 0:
-      convert_ops(unresolved_ops, dt, net_def, device)
-    if device == 'gpu':
-      add_output_transform(output_node, net_def)
-    print "PB Parsed."
+    converter = TFConverter(ops, net_def, dt, device)
+    converter.convert(input_node, output_node)
+    print "PB Converted, start optimize memory."
+    mem_optimizer = memory_optimizer.MemoryOptimizer(net_def)
+    mem_optimizer.optimize()
+    print "Memory optimization done."

   return net_def
...
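Note: the converter refactor above replaces positional pattern matching over unresolved_ops[i] with lookups in self.tf_graph, a map from each producing op's name to the ops that consume its outputs; fusion decisions (BiasAdd into Conv2D, a trailing Relu into FusedConv2D, the seven-op BATCH_NORM_ORDER chain) then follow actual graph edges instead of file order. A rough sketch of that consumer map, translated into C++ terms with hypothetical types (the diff's real structures are TensorFlow ops):

// Rough analogue of the self.tf_graph consumer map built in
// TFConverter.__init__ above: map each producing op's name to the ops
// consuming one of its outputs. Types here are hypothetical.
#include <string>
#include <unordered_map>
#include <vector>

struct Op {
  std::string name;
  std::string type;
  std::vector<std::string> input_names;  // tensor names like "conv1:0"
};

using ConsumerMap = std::unordered_map<std::string, std::vector<const Op *>>;

ConsumerMap BuildConsumerMap(const std::vector<Op> &ops) {
  ConsumerMap graph;
  for (const Op &op : ops) {
    for (const std::string &input : op.input_names) {
      // Strip the ":0" output index, mirroring input.name[:-2] in the diff.
      std::string producer = input.substr(0, input.size() - 2);
      graph[producer].push_back(&op);
    }
  }
  return graph;
}

// Fusion check mirroring convert_conv2d: a Conv2D may absorb a BiasAdd
// only when that BiasAdd is its sole consumer.
bool CanFuseBiasAdd(const ConsumerMap &graph, const std::string &conv_name) {
  auto it = graph.find(conv_name);
  return it != graph.end() && it->second.size() == 1 &&
         it->second[0]->type == "BiasAdd";
}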
@@ -149,6 +149,7 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
   elif is_node_flatten_reshape(first_op):
     op_def.type = 'Flatten'
     op_def.input.extend([t.name for t in first_op.inputs])
+    op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
     convert_op_outputs(op_def, first_op)
   elif dsp_ops.has_op(first_op.type):
     op_def.input.extend([t.name for t in first_op.inputs])
...
@@ -4,6 +4,7 @@ import os
 import os.path
 import tensorflow as tf
 import numpy as np
+from scipy import spatial
 from tensorflow import gfile
@@ -34,9 +35,12 @@ def load_data(file):
 def valid_output(out_shape, mace_out_file, tf_out_value):
   mace_out_value = load_data(mace_out_file)
   if mace_out_value.size != 0:
+    similarity = (1 - spatial.distance.cosine(tf_out_value.flat, mace_out_value))
+    print 'MACE VS TF similarity: ', similarity
+    if similarity > 0.999:
+      print '=======================Passed! Haha======================'
     mace_out_value = mace_out_value.reshape(out_shape)
     np.testing.assert_allclose(mace_out_value, tf_out_value, rtol=0.05)
-    print '=======================Passed! Haha======================'
   else:
     print '=======================Skip empty node==================='
@@ -62,7 +66,7 @@ def run_model(input_shape):
     input_value = input_value.reshape(input_shape)
     output_value = session.run(output_node, feed_dict={input_node: [input_value]})
-    # output_value.astype(np.float32).tofile( os.path.dirname(FLAGS.input_file) + '/tf_weight')
+    output_value.astype(np.float32).tofile( os.path.dirname(FLAGS.input_file) + '/tf_out')
     return output_value

 def main(unused_args):
...
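Note: validate.py now reports the cosine similarity between the TensorFlow and MACE outputs (via scipy.spatial.distance.cosine) and prints the pass banner when it exceeds 0.999, in addition to the existing assert_allclose check. The same metric written out explicitly, for reference (the example values are made up):

// The cosine-similarity metric the updated validate.py computes through
// scipy.spatial.distance.cosine, for two flat float buffers.
// validate.py treats similarity > 0.999 as a pass.
#include <cmath>
#include <cstdio>
#include <vector>

double CosineSimilarity(const std::vector<float> &a,
                        const std::vector<float> &b) {
  double dot = 0.0, na = 0.0, nb = 0.0;
  for (size_t i = 0; i < a.size() && i < b.size(); ++i) {
    dot += static_cast<double>(a[i]) * b[i];
    na += static_cast<double>(a[i]) * a[i];
    nb += static_cast<double>(b[i]) * b[i];
  }
  return dot / (std::sqrt(na) * std::sqrt(nb));
}

int main() {
  std::vector<float> tf_out = {0.1f, 0.2f, 0.3f};
  std::vector<float> mace_out = {0.1f, 0.2f, 0.31f};
  double similarity = CosineSimilarity(tf_out, mace_out);
  std::printf("similarity = %f, pass = %d\n", similarity,
              static_cast<int>(similarity > 0.999));
  return 0;
}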
@@ -2,10 +2,10 @@
 # Must run at root dir of mace project.
 set +x
 Usage() {
-  echo 'Usage: bash tools/validate_gcn.sh tf_model_file'
+  echo 'Usage: bash tools/validate_gcn.sh tf_model_path image_size'
 }

-if [ $# != 1 ];then
+if [ $# != 2 ];then
   Usage
   exit -1
 fi
@@ -13,18 +13,18 @@ fi
 TF_MODEL_FILE_PATH=$1
 MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH})
 MACE_MODEL_NAME='mace_model.pb'
-MACE_OPT_MODEL_NAME='mace_opt_model.pb'
 INPUT_FILE_NAME='model_input'
 OUTPUT_FILE_NAME='gcn.out'
 OUTPUT_LIST_FILE='gcn.list'
 PHONE_DATA_DIR="/data/local/tmp/${MACE_MODEL_NAME}"
 KERNEL_DIR="${PHONE_DATA_DIR}/cl/"
+IMAGE_SIZE=$2

 # Step 1: Generate input data
 echo "Step 1: Generate input data"
 python tools/validate.py --generate_data true --random_seed 1 \
    --input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \
-   --input_shape=512,512,3
+   --input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3"

 # Step 2: convert tf model to mace model
 echo "Step 2: convert tf model to mace model and optimize memory"
@@ -35,10 +35,6 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
     --output_node=GCN/br_result_2/fcn_br \
     --data_type=DT_HALF \
     --runtime=gpu
-
-bazel build mace/python/tools:memory_optimizer
-bazel-bin/mace/python/tools/memory_optimizer ${MODEL_DIR}/${MACE_MODEL_NAME} \
-    ${MODEL_DIR}/${MACE_OPT_MODEL_NAME}

 # Step 3: Run model on the phone
 echo "Step 3: Run model on the phone"
@@ -49,21 +45,22 @@ bazel build -c opt --strip always mace/examples:mace_run \
 adb shell "mkdir -p ${PHONE_DATA_DIR}"
 adb shell "mkdir -p ${KERNEL_DIR}"
-adb push mace/kernels/opencl/cl/ ${KERNEL_DIR}
+adb push mace/kernels/opencl/cl/* ${KERNEL_DIR}

-adb push ${MODEL_DIR}/${MACE_OPT_MODEL_NAME} ${PHONE_DATA_DIR}
+adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR}
 adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
 adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}

 num_threads=${1:-4}

-adb </dev/null shell MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
+adb </dev/null shell MACE_CPP_MIN_VLOG_LEVEL=0 \
+    MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
     MACE_KERNEL_PATH=$KERNEL_DIR \
     OMP_NUM_THREADS=$num_threads \
     ${PHONE_DATA_DIR}/mace_run \
-    --model=${PHONE_DATA_DIR}/${MACE_OPT_MODEL_NAME} \
+    --model=${PHONE_DATA_DIR}/${MACE_MODEL_NAME} \
     --input=mace_input_node \
     --output=mace_output_node \
-    --input_shape=1,512,512,3\
+    --input_shape="1,${IMAGE_SIZE},${IMAGE_SIZE},3"\
     --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \
     --output_file=${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME} \
     --device=OPENCL \
@@ -81,4 +78,5 @@ python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
     --mace_out_file ${MODEL_DIR}/${OUTPUT_FILE_NAME} \
     --input_node input \
     --output_node GCN/br_result_2/fcn_br\
-    --output_shape 1,512,512,2
+    --input_shape "${IMAGE_SIZE},${IMAGE_SIZE},3" \
+    --output_shape "1,${IMAGE_SIZE},${IMAGE_SIZE},2"