Commit b1397592 authored by yejianwu

fix conflicts

@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {

@@ -33,8 +34,6 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
   built_options.emplace("-DINPUT_NUM=" + ToString(input_tensors.size()));
   auto addn_kernel = runtime->BuildKernel("addn", "addn", built_options);
-  const uint32_t lws = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
   uint32_t idx = 0;
   for (auto input : input_tensors) {
     addn_kernel.setArg(idx++,

@@ -42,12 +41,47 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
   }
   addn_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      addn_kernel, cl::NullRange,
-      cl::NDRange(width_pixels, batch_height_pixels),
-      cl::NDRange(64, 16),  // TODO fix this
-      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS) << "error code: " << error;
+  const uint32_t gws[2] = {
+      static_cast<uint32_t>(width_pixels),
+      static_cast<uint32_t>(batch_height_pixels)
+  };
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
+  std::vector<uint32_t> lws = {64, 16};
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    uint32_t local_ws[2];
+    local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
+    local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
+    return {{local_ws[0], local_ws[1]},
+            {kwg_size / 16, 16},
+            {kwg_size / 32, 32},
+            {kwg_size / 64, 64},
+            {kwg_size / 128, 128},
+            {kwg_size / 256, 256},
+            {kwg_size, 1},
+            {1, kwg_size}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        addn_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1]),
+        cl::NDRange(params[0], params[1]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "addn_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 }

 template <typename T>
...
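The same pattern repeats in every kernel touched by this commit: a hard-coded `cl::NDRange` local size is replaced by `Tuner<uint32_t>::Get()->TuneOrRun(...)`, called with a key string encoding the output shape, a default local work-group size, a candidate generator, and a functor that enqueues the kernel with one candidate. The tuner itself (`mace/utils/tuner.h`) is not shown in this diff; the sketch below only illustrates the contract the call sites appear to rely on. The class name `SimpleTuner`, the in-memory cache, and the `std::chrono` timing are assumptions, not MACE's actual implementation (which would more plausibly time with OpenCL profiling events and persist tuned parameters to a file):

```cpp
// Minimal sketch of a TuneOrRun-style interface (assumed, not MACE's code).
// It times every candidate parameter set from the generator, caches the
// fastest one under `key`, and replays it on subsequent calls.
#include <chrono>
#include <functional>
#include <limits>
#include <map>
#include <string>
#include <vector>

template <typename param_type>
class SimpleTuner {
 public:
  template <typename RetType>
  RetType TuneOrRun(
      const std::string &key,
      const std::vector<param_type> &default_params,
      const std::function<std::vector<std::vector<param_type>>()> &param_generator,
      const std::function<RetType(const std::vector<param_type> &)> &func) {
    auto it = cache_.find(key);
    if (it != cache_.end()) return func(it->second);  // replay tuned params

    std::vector<param_type> best = default_params;
    double best_us = std::numeric_limits<double>::max();
    for (const auto &params : param_generator()) {
      auto start = std::chrono::high_resolution_clock::now();
      func(params);  // one timed run per candidate local work-group size
      auto end = std::chrono::high_resolution_clock::now();
      double us = std::chrono::duration<double, std::micro>(end - start).count();
      if (us < best_us) {
        best_us = us;
        best = params;
      }
    }
    cache_[key] = best;
    return func(best);
  }

 private:
  std::map<std::string, std::vector<param_type>> cache_;
};
```

Under that contract, a call such as `TuneOrRun("addn_opencl_kernel_1_64_64_32", {64, 16}, params_generator, func)` tries each candidate once, remembers the fastest, and reuses it whenever the same output shape recurs.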
@@ -48,8 +48,13 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{8, 128, 1},  // SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
...
@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {

@@ -41,21 +42,57 @@ static void Concat2(const Tensor *input0,
   concat_kernel.setArg(idx++, static_cast<int32_t>(input0->dim(3)));
   concat_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blk),
+      static_cast<uint32_t>(width),
+      static_cast<uint32_t>(batch * height),
+  };
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
-  uint32_t lws[3] = {8, 16, 8};
-  // lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
-  // lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
-  // lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      concat_kernel, cl::NullRange,
-      cl::NDRange(static_cast<uint32_t>(channel_blk),
-                  static_cast<uint32_t>(width),
-                  static_cast<uint32_t>(height * batch)),
-      cl::NDRange(lws[0], lws[1], lws[2]),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS);
+  std::vector<uint32_t> lws = {8, 16, 8};
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8},  // SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        concat_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "concat_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 }

 template<typename T>
...
@@ -68,8 +68,13 @@ void Conv1x1(const Tensor *input,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 15, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  // SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
...
@@ -60,8 +60,13 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {4, 15, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  // SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
...
@@ -62,8 +62,13 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  // SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
...
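Every 3-D `params_generator` in this commit seeds its candidate list with the same greedy heuristic: cap the first dimension at the kernel's max work-group size, then bound each later dimension by whatever budget remains, so the product of the three local sizes never exceeds `kwg_size`. A standalone sketch of that logic follows; the helper name `DefaultLocalWS` is hypothetical, since the commit inlines this computation at each call site:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Greedy default local work-group size: fill dimension 0 first, then bound
// each later dimension by the remaining budget, guaranteeing
// lws[0] * lws[1] * lws[2] <= kwg_size. Assumes all gws entries are nonzero.
std::vector<uint32_t> DefaultLocalWS(const uint32_t gws[3], uint32_t kwg_size) {
  std::vector<uint32_t> lws(3, 0);
  lws[0] = std::min<uint32_t>(gws[0], kwg_size);
  lws[1] = std::min<uint32_t>(gws[1], kwg_size / lws[0]);
  lws[2] = std::min<uint32_t>(gws[2], kwg_size / (lws[0] * lws[1]));
  return lws;
}
```

The fill order matters: a large first dimension shrinks the budget for the later ones, which is why the generators also enumerate fixed alternatives such as `{4, 15, 8}` (SNPE's choice) and tall or flat shapes like `{1, kwg_size, 1}`.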
@@ -6,6 +6,7 @@
 #include "mace/core/runtime/opencl/cl2_header.h"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/kernels/opencl/helper.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {

@@ -23,11 +24,6 @@ static void Pooling(const Tensor *input,
   index_t channels = output->dim(3);
   index_t channel_blocks = (channels + 3) / 4;
-  const uint32_t gws[3] = {
-      static_cast<uint32_t>(channel_blocks),
-      static_cast<uint32_t>(out_width),
-      static_cast<uint32_t>(batch * out_height),
-  };
   auto runtime = OpenCLRuntime::Get();
   std::set<std::string> built_options;

@@ -44,13 +40,6 @@ static void Pooling(const Tensor *input,
   }
   auto pooling_kernel = runtime->BuildKernel("pooling", "pooling", built_options);
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
-  uint32_t lws[3];
-  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
-  lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
   uint32_t idx = 0;
   pooling_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
   pooling_kernel.setArg(idx++, static_cast<int32_t>(input->dim(1)));

@@ -62,12 +51,60 @@ static void Pooling(const Tensor *input,
   pooling_kernel.setArg(idx++, pooling_size);
   pooling_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      pooling_kernel, cl::NullRange,
-      cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]),
-      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS) << error;
+  const uint32_t gws[3] = {
+      static_cast<uint32_t>(channel_blocks),
+      static_cast<uint32_t>(out_width),
+      static_cast<uint32_t>(batch * out_height),
+  };
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
+  std::vector<uint32_t> lws(3, 0);
+  lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+  lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
+  lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8},  // SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {3, 15, 9},
+            {7, 15, 9},
+            {9, 7, 15},
+            {15, 7, 9},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        pooling_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "pooling_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 }

 template<typename T>
...
@@ -50,8 +50,13 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                            static_cast<uint32_t>(height * batch)};
   const std::vector<uint32_t> lws = {8, 16, 8};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
-  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
     return {{4, 15, 8},  // SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
             {kwg_size / 16, 4, 4},
             {kwg_size / 32, 4, 8},
             {kwg_size / 32, 8, 4},
...
@@ -7,6 +7,7 @@
 #include "mace/kernels/resize_bilinear.h"
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"

 namespace mace {
 namespace kernels {

@@ -44,8 +45,6 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
   auto rb_kernel = runtime->BuildKernel("resize_bilinear", "resize_bilinear_nocache", built_options);
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
   uint32_t idx = 0;
   rb_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
   rb_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));

@@ -55,17 +54,52 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   rb_kernel.setArg(idx++, static_cast<int32_t>(in_width));
   rb_kernel.setArg(idx++, static_cast<int32_t>(out_height));
-  auto command_queue = runtime->command_queue();
-  cl_int error = command_queue.enqueueNDRangeKernel(
-      rb_kernel, cl::NullRange,
-      cl::NDRange(static_cast<int32_t>(channel_blocks),
-                  static_cast<int32_t>(out_width),
-                  static_cast<int32_t>(out_height * batch)),
-      // TODO tuning
-      cl::NDRange(1, static_cast<int32_t>(out_width > kwg_size ? kwg_size : out_width), 1),
-      nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
-  MACE_CHECK(error == CL_SUCCESS, error);
+  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
+                           static_cast<uint32_t>(out_width),
+                           static_cast<uint32_t>(out_height * batch)};
+  const std::vector<uint32_t> lws = {8, 16, 8};
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
+    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
+    return {{4, 15, 8},  // SNPE size
+            {local_ws[0], local_ws[1], local_ws[2]},
+            {kwg_size / 16, 4, 4},
+            {kwg_size / 32, 4, 8},
+            {kwg_size / 32, 8, 4},
+            {kwg_size / 64, 8, 8},
+            {kwg_size / 64, 16, 4},
+            {kwg_size / 128, 8, 16},
+            {kwg_size / 128, 16, 8},
+            {kwg_size / 128, 32, 4},
+            {1, kwg_size / 32, 32},
+            {1, kwg_size / 64, 64},
+            {1, kwg_size / 128, 128},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        rb_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+    return error;
+  };
+  std::stringstream ss;
+  ss << "resize_bilinear_opencl_kernel_"
+     << output->dim(0) << "_"
+     << output->dim(1) << "_"
+     << output->dim(2) << "_"
+     << output->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 }

 template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
...
@@ -8,6 +8,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":memory_optimizer",
         "//mace/proto:mace_py",
     ],
 )
...
@@ -65,7 +65,7 @@ class MemoryOptimizer(object):
       raise Exception('ref count is less than 0')
     for mem in self.mem_block:
-      arena = net_def.mem_arena
+      arena = self.net_def.mem_arena
       block = arena.mem_block.add()
       block.mem_id = mem
       block.x = self.mem_block[mem][0]

@@ -83,20 +83,7 @@ class MemoryOptimizer(object):
     print('origin mem: %d, optimized mem: %d', origin_mem_size, optimized_mem_size)

-if __name__ == '__main__':
-  model_file = sys.argv[1]
-  opt_model_file = sys.argv[2]
-  with open(model_file, "rb") as f:
-    net_def = mace_pb2.NetDef()
-    net_def.ParseFromString(f.read())
-    optimizer = MemoryOptimizer(net_def)
-    optimizer.optimize()
-  with open(opt_model_file, "wb") as f:
-    f.write(net_def.SerializeToString())
-  with open(opt_model_file + '_txt', "wb") as f:
-    net_def.ClearField('tensors')
-    f.write(str(net_def))
+def optimize_memory(net_def):
+  mem_optimizer = MemoryOptimizer(net_def)
+  mem_optimizer.optimize()
\ No newline at end of file
@@ -149,6 +149,7 @@ def convert_ops(unresolved_ops, resolved_ops, net_def, output_node, dsp_ops):
   elif is_node_flatten_reshape(first_op):
     op_def.type = 'Flatten'
     op_def.input.extend([t.name for t in first_op.inputs])
+    op_def.out_max_byte_size.extend([max_elem_size(out) for out in first_op.outputs])
     convert_op_outputs(op_def, first_op)
   elif dsp_ops.has_op(first_op.type):
     op_def.input.extend([t.name for t in first_op.inputs])
...
@@ -4,6 +4,7 @@ import os
 import os.path
 import tensorflow as tf
 import numpy as np
+from scipy import spatial
 from tensorflow import gfile

@@ -34,9 +35,12 @@ def load_data(file):
 def valid_output(out_shape, mace_out_file, tf_out_value):
   mace_out_value = load_data(mace_out_file)
   if mace_out_value.size != 0:
+    similarity = (1 - spatial.distance.cosine(tf_out_value.flat, mace_out_value))
+    print 'MACE VS TF similarity: ', similarity
+    if similarity > 0.999:
+      print '=======================Passed! Haha======================'
     mace_out_value = mace_out_value.reshape(out_shape)
     np.testing.assert_allclose(mace_out_value, tf_out_value, rtol=0.05)
-    print '=======================Passed! Haha======================'
   else:
     print '=======================Skip empty node==================='

@@ -62,7 +66,7 @@ def run_model(input_shape):
     input_value = input_value.reshape(input_shape)
     output_value = session.run(output_node, feed_dict={input_node: [input_value]})
-    # output_value.astype(np.float32).tofile( os.path.dirname(FLAGS.input_file) + '/tf_weight')
+    output_value.astype(np.float32).tofile( os.path.dirname(FLAGS.input_file) + '/tf_out')
     return output_value

 def main(unused_args):
...
@@ -2,10 +2,10 @@
 # Must run at root dir of mace project.
 set +x

 Usage() {
-  echo 'Usage: bash tools/validate_gcn.sh tf_model_file'
+  echo 'Usage: bash tools/validate_gcn.sh tf_model_path image_size'
 }

-if [ $# != 1 ];then
+if [ $# != 2 ];then
   Usage
   exit -1
 fi

@@ -13,18 +13,18 @@ fi
 TF_MODEL_FILE_PATH=$1
 MODEL_DIR=$(dirname ${TF_MODEL_FILE_PATH})
 MACE_MODEL_NAME='mace_model.pb'
-MACE_OPT_MODEL_NAME='mace_opt_model.pb'
 INPUT_FILE_NAME='model_input'
 OUTPUT_FILE_NAME='gcn.out'
 OUTPUT_LIST_FILE='gcn.list'
 PHONE_DATA_DIR="/data/local/tmp/${MACE_MODEL_NAME}"
 KERNEL_DIR="${PHONE_DATA_DIR}/cl/"
+IMAGE_SIZE=$2

 # Step 1: Generate input data
 echo "Step 1: Generate input data"
 python tools/validate.py --generate_data true --random_seed 1 \
    --input_file=${MODEL_DIR}/${INPUT_FILE_NAME} \
-   --input_shape=512,512,3
+   --input_shape="${IMAGE_SIZE},${IMAGE_SIZE},3"

 # Step 2: convert tf model to mace model
 echo "Step 2: convert tf model to mace model and optimize memory"

@@ -35,10 +35,6 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
    --output_node=GCN/br_result_2/fcn_br \
    --data_type=DT_HALF \
    --runtime=gpu
-bazel build mace/python/tools:memory_optimizer
-bazel-bin/mace/python/tools/memory_optimizer ${MODEL_DIR}/${MACE_MODEL_NAME} \
-  ${MODEL_DIR}/${MACE_OPT_MODEL_NAME}

 # Step 3: Run model on the phone
 echo "Step 3: Run model on the phone"

@@ -49,21 +45,22 @@ bazel build -c opt --strip always mace/examples:mace_run \
 adb shell "mkdir -p ${PHONE_DATA_DIR}"
 adb shell "mkdir -p ${KERNEL_DIR}"
-adb push mace/kernels/opencl/cl/ ${KERNEL_DIR}
+adb push mace/kernels/opencl/cl/* ${KERNEL_DIR}
-adb push ${MODEL_DIR}/${MACE_OPT_MODEL_NAME} ${PHONE_DATA_DIR}
+adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR}
 adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
 adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}

 num_threads=${1:-4}
-adb </dev/null shell MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
+adb </dev/null shell MACE_CPP_MIN_VLOG_LEVEL=0 \
+  MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
   MACE_KERNEL_PATH=$KERNEL_DIR \
   OMP_NUM_THREADS=$num_threads \
   ${PHONE_DATA_DIR}/mace_run \
-    --model=${PHONE_DATA_DIR}/${MACE_OPT_MODEL_NAME} \
+    --model=${PHONE_DATA_DIR}/${MACE_MODEL_NAME} \
     --input=mace_input_node \
     --output=mace_output_node \
-    --input_shape=1,512,512,3\
+    --input_shape="1,${IMAGE_SIZE},${IMAGE_SIZE},3"\
     --input_file=${PHONE_DATA_DIR}/${INPUT_FILE_NAME} \
    --output_file=${PHONE_DATA_DIR}/${OUTPUT_FILE_NAME} \
     --device=OPENCL \

@@ -81,4 +78,5 @@ python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
    --mace_out_file ${MODEL_DIR}/${OUTPUT_FILE_NAME} \
    --input_node input \
    --output_node GCN/br_result_2/fcn_br\
-   --output_shape 1,512,512,2
+   --input_shape "${IMAGE_SIZE},${IMAGE_SIZE},3" \
+   --output_shape "1,${IMAGE_SIZE},${IMAGE_SIZE},2"