diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 3c8b013ac51bfd219afef3a91ab600e195c0b99e..5b7ccdd8a7f24aec1247fe0a60b22c1b915f37eb 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -50,6 +50,21 @@ double OpenCLProfilingTimer::ElapsedMicros() { return (stop_nanos_ - start_nanos_) / 1000.0; } +double OpenCLProfilingTimer::AccumulatedMicros() { + return accumulated_micros_; +} + +void OpenCLProfilingTimer::AccumulateTiming(){ + StopTiming(); + accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0; +} + +void OpenCLProfilingTimer::ClearTiming() { + start_nanos_ = 0; + stop_nanos_ = 0; + accumulated_micros_ = 0; +} + OpenCLRuntime *OpenCLRuntime::Global() { static OpenCLRuntime instance; return &instance; diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 7245b926997459da7c52992524f635bc041d0c92..ff596459eaac19b69deb40c4a60440f9a1e484ac 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -18,16 +18,20 @@ namespace mace { class OpenCLProfilingTimer : public Timer { - public: - explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event) {}; - void StartTiming() override; - void StopTiming() override; - double ElapsedMicros() override; + public: + explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event), accumulated_micros_(0) {}; + void StartTiming() override; + void StopTiming() override; + void AccumulateTiming() override; + void ClearTiming() override; + double ElapsedMicros() override; + double AccumulatedMicros() override; - private: - const cl::Event *event_; - double start_nanos_; - double stop_nanos_; + private: + const cl::Event *event_; + double start_nanos_; + double stop_nanos_; + double accumulated_micros_; }; class OpenCLRuntime { @@ -40,15 +44,15 @@ class OpenCLRuntime { void GetCallStats(const cl::Event &event, CallStats *stats); uint32_t GetDeviceMaxWorkGroupSize(); - uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel); + uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel); cl::Kernel BuildKernel(const std::string &program_name, const std::string &kernel_name, const std::set &build_options); private: OpenCLRuntime(); ~OpenCLRuntime(); - OpenCLRuntime(const OpenCLRuntime&) = delete; - OpenCLRuntime &operator=(const OpenCLRuntime&) = delete; + OpenCLRuntime(const OpenCLRuntime &) = delete; + OpenCLRuntime &operator=(const OpenCLRuntime &) = delete; void BuildProgram(const std::string &program_file_name, const std::string &binary_file_name, diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index 44eaa47e52a9558a27f8ba70128b7c06eb457a65..5575a0b1b70868e18a859131065ad4b498b27e43 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -63,52 +63,11 @@ void ActivationFunctor::operator()(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; - const std::vector lws = {8, 16, 8}; - const uint32_t kwg_size = - runtime->GetKernelMaxWorkGroupSize(activation_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, - kwg_size / (local_ws[0] * local_ws[1])); - return { - {local_ws[0], 
local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, // SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::string tuning_key = Concat("relu_opencl_kernel_", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( - tuning_key, lws, params_generator, func, &timer); - SetFuture(future, event); + TuningOrRun3DKernel(activation_kernel, tuning_key, gws, lws, future); } template struct ActivationFunctor; diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index b4079dc39b729e589de1596651470aee84347c14..261efde071ee3b200c3a35290e685b43297ec956 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -49,56 +49,14 @@ static void AddN(const std::vector &input_tensors, static_cast(width_pixels), static_cast(batch_height_pixels) }; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel); - std::vector lws = {64, 16}; - auto params_generator = [&]() -> std::vector> { - uint32_t local_ws[2]; - local_ws[0] = std::min(width_pixels, kwg_size); - local_ws[1] = std::min(batch_height_pixels, kwg_size / local_ws[0]); - return {{local_ws[0], local_ws[1]}, - {local_ws[1], local_ws[0]}, - {kwg_size / 4, 4}, - {kwg_size / 16, 16}, - {kwg_size / 32, 32}, - {kwg_size / 64, 64}, - {kwg_size / 128, 128}, - {kwg_size / 256, 256}, - {kwg_size / 512, 512}, - {kwg_size, 1}, - {1, kwg_size} - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - addn_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1]), - cl::NDRange(params[0], params[1]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {64, 16, 1}; std::stringstream ss; ss << "addn_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun2DKernel(addn_kernel, ss.str(), gws, lws, future); } template diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 2d6c95a37963b6ffceb9d216d58017cedd01cb00..02ab76a85eedcdeb735c69937a326522fcf6b273 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -83,51 +83,11 @@ void BatchNormFunctor::operator()(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; - const std::vector lws = 
{8, 16, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, - kwg_size / (local_ws[0] * local_ws[1])); - return { - {local_ws[0], local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {8, 128, 1}, // SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::string tuning_key = Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( - tuning_key, lws, params_generator, func, &timer); - SetFuture(future, event); + TuningOrRun3DKernel(bm_kernel, tuning_key, gws, lws, future); } template struct BatchNormFunctor; diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index a02b155236c31a4f57177f51691ba7864aec91cc..b47a096efd2d2472e50b510e722e7142740fb332 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -50,65 +50,14 @@ static void Concat2(const Tensor *input0, static_cast(width), static_cast(batch * height), }; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel); - std::vector lws = {8, 16, 8}; - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blk, kwg_size); - local_ws[1] = std::min(width, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, //SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - concat_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - 
future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun3DKernel(concat_kernel, ss.str(), gws, lws, future); } template diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index a8e9192d2c410f5e5bd7c5802b8f332f50c5b400..d62fdf56535372d7fa98da2dad16395656c078bb 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -96,51 +96,11 @@ void Conv1x1(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; - const std::vector lws = {8, 15, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, - kwg_size / (local_ws[0] * local_ws[1])); - return { - {local_ws[0], local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, // SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 15, 8, 1}; std::string tuning_key = Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( - tuning_key, lws, params_generator, func, &timer); - SetFuture(future, event); + TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future); } extern void Conv2dOpenclK1x1S1(const Tensor *input, diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 9779107b4a1bd1f524d4faa2766bbb37776b603d..3875403862fd97e58f6e6279e0d4e6a92ab9a96c 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -94,52 +94,11 @@ static void Conv2d3x3S12(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; - const std::vector lws = {4, 15, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, - kwg_size / (local_ws[0] * local_ws[1])); - return { - {local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 
64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, // SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {4, 15, 8, 1}; std::string tuning_key = Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( - tuning_key, lws, params_generator, func, &timer); - SetFuture(future, event); + TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future); } void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter, diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 8929579907b006ffeaf9b3ac3bb25260077880ee..2a96d8647668ab1abd5bc61c07f1af1c4b37a99e 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -96,52 +96,11 @@ void Conv2dOpencl(const Tensor *input, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width_blocks), static_cast(height * batch)}; - const std::vector lws = {8, 16, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width_blocks, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, - kwg_size / (local_ws[0] * local_ws[1])); - return { - {local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, // SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::string tuning_key = Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun( - tuning_key, lws, params_generator, func, &timer); - SetFuture(future, event); + TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future); } } // namespace kernels diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 2c1dc264bd5ac1ddaeeaf47ea54a6e8b9e32e13a..e220d34463212887bbaaf927288a15ad9549ba32 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -4,13 +4,14 @@ #include "mace/kernels/opencl/helper.h" #include "mace/utils/utils.h" +#include "mace/utils/tuner.h" namespace mace { namespace kernels { // [(c+3)/4*W, N * H] void 
CalInOutputImageShape(const std::vector &shape, /* NHWC */ - std::vector &image_shape) { + std::vector &image_shape) { MACE_CHECK(shape.size() == 4); image_shape.resize(2); image_shape[0] = RoundUpDiv4(shape[3]) * shape[2]; @@ -39,41 +40,30 @@ void CalImage2DShape(const std::vector &shape, /* NHWC */ const BufferType type, std::vector &image_shape) { switch (type) { - case FILTER: - CalFilterImageShape(shape, image_shape); + case FILTER:CalFilterImageShape(shape, image_shape); break; - case IN_OUT: - CalInOutputImageShape(shape, image_shape); + case IN_OUT:CalInOutputImageShape(shape, image_shape); break; - case ARGUMENT: - CalArgImageShape(shape, image_shape); + case ARGUMENT:CalArgImageShape(shape, image_shape); break; - default: - LOG(FATAL) << "Mace not supported yet."; + default:LOG(FATAL) << "Mace not supported yet."; } } - std::string DtToCLDt(const DataType dt) { switch (dt) { - case DT_FLOAT: - return "float"; - case DT_HALF: - return "half"; - default: - LOG(FATAL) << "Unsupported data type"; + case DT_FLOAT:return "float"; + case DT_HALF:return "half"; + default:LOG(FATAL) << "Unsupported data type"; return ""; } } std::string DtToCLCMDDt(const DataType dt) { switch (dt) { - case DT_FLOAT: - return "f"; - case DT_HALF: - return "h"; - default: - LOG(FATAL) << "Not supported data type for opencl cmd data type"; + case DT_FLOAT:return "f"; + case DT_HALF:return "h"; + default:LOG(FATAL) << "Not supported data type for opencl cmd data type"; return ""; } } @@ -81,10 +71,8 @@ std::string DtToCLCMDDt(const DataType dt) { std::string DtToUpstreamCLDt(const DataType dt) { switch (dt) { case DT_FLOAT: - case DT_HALF: - return "float"; - default: - LOG(FATAL) << "Unsupported data type"; + case DT_HALF:return "float"; + default:LOG(FATAL) << "Unsupported data type"; return ""; } } @@ -92,13 +80,200 @@ std::string DtToUpstreamCLDt(const DataType dt) { std::string DtToUpstreamCLCMDDt(const DataType dt) { switch (dt) { case DT_FLOAT: - case DT_HALF: - return "f"; - default: - LOG(FATAL) << "Not supported data type for opencl cmd data type"; + case DT_HALF:return "f"; + default:LOG(FATAL) << "Not supported data type for opencl cmd data type"; return ""; } } +void TuningOrRun3DKernel(cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + std::vector &lws, + StatsFuture *future) { + auto runtime = OpenCLRuntime::Global(); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); + auto params_generator = [&]() -> std::vector> { + std::vector local_ws(3, 0); + local_ws[0] = std::min(gws[0], kwg_size); + local_ws[1] = std::min(gws[1], kwg_size / local_ws[0]); + local_ws[2] = std::min(gws[2], + kwg_size / (local_ws[0] * local_ws[1])); + return { + {local_ws[0], local_ws[1], local_ws[2], 1}, + {kwg_size / 16, 4, 4, 1}, + {kwg_size / 32, 4, 8, 1}, + {kwg_size / 32, 8, 4, 1}, + {kwg_size / 64, 8, 8, 1}, + {kwg_size / 64, 16, 4, 1}, + {kwg_size / 128, 8, 16, 1}, + {kwg_size / 128, 16, 8, 1}, + {kwg_size / 128, 32, 4, 1}, + {1, kwg_size / 32, 32, 1}, + {1, kwg_size / 64, 64, 1}, + {1, kwg_size / 128, 128, 1}, + {3, 15, 9, 1}, + {7, 15, 9, 1}, + {9, 7, 15, 1}, + {15, 7, 9, 1}, + {1, kwg_size, 1, 1}, + {4, 15, 8, 1}, // SNPE size + }; + }; + cl::Event event; + auto func = [&](const std::vector ¶ms, + Timer *timer, + std::vector *tuning_result) -> cl_int { + MACE_CHECK(params.size() == 4) << "Tuning parameters of 3D kernel must be 4D"; + cl_int error = CL_SUCCESS; + if (timer == nullptr) { + uint32_t num_blocks = params[3]; + const uint32_t block_size = gws[2] / 
num_blocks; + if (gws[2] % num_blocks > 0) num_blocks++; + for (uint32_t i = 0; i < num_blocks; ++i) { + uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, + cl::NDRange(0, 0, i * block_size), + cl::NDRange(gws[0], gws[1], gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + } + } else { + timer->ClearTiming(); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + timer->AccumulateTiming(); + tuning_result->assign(params.begin(), params.end()); + + if (LimitKernelTime()) { + double elapse_time = timer->AccumulatedMicros(); + timer->ClearTiming(); + uint32_t num_blocks = std::min(static_cast(elapse_time / kMaxKernelExeTime) + 1, gws[2]); + (*tuning_result)[3] = num_blocks; + const uint32_t block_size = gws[2] / num_blocks; + if (gws[2] % num_blocks > 0) num_blocks++; + for (uint32_t i = 0; i < num_blocks; ++i) { + uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, + cl::NDRange(0, 0, i * block_size), + cl::NDRange(gws[0], gws[1], gws2), + cl::NDRange(params[0], params[1], params[2]), nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + timer->AccumulateTiming(); + } + } + } + return error; + }; + OpenCLProfilingTimer timer(&event); + Tuner::Get()->template TuneOrRun( + tuning_key, lws, params_generator, func, &timer); + + if (future != nullptr) { + future->wait_fn = [event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + OpenCLRuntime::Global()->GetCallStats(event, stats); + } + }; + } +} + +void TuningOrRun2DKernel(cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + std::vector &lws, + StatsFuture *future) { + auto runtime = OpenCLRuntime::Global(); + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel); + auto params_generator = [&]() -> std::vector> { + uint32_t local_ws[2]; + local_ws[0] = std::min(gws[0], kwg_size); + local_ws[1] = std::min(gws[1], kwg_size / local_ws[0]); + return {{local_ws[0], local_ws[1], 1}, + {local_ws[1], local_ws[0], 1}, + {kwg_size / 4, 4, 1}, + {kwg_size / 16, 16, 1}, + {kwg_size / 32, 32, 1}, + {kwg_size / 64, 64, 1}, + {kwg_size / 128, 128, 1}, + {kwg_size / 256, 256, 1}, + {kwg_size / 512, 512, 1}, + {kwg_size, 1, 1}, + {1, kwg_size, 1} + }; + }; + cl::Event event; + auto func = [&](const std::vector ¶ms, + Timer *timer, + std::vector *tuning_result) -> cl_int { + MACE_CHECK(params.size() == 3) << "Tuning parameters of 2D kernel must be 3d"; + cl_int error = CL_SUCCESS; + if (timer == nullptr) { + uint32_t num_blocks = params[2]; + const uint32_t block_size = gws[1] / num_blocks; + if (gws[1] % num_blocks > 0) num_blocks++; + for (uint32_t i = 0; i < num_blocks; ++i) { + uint32_t gws1 = (i == num_blocks - 1) ? 
(gws[1] - (i * block_size)) : block_size; + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, + cl::NDRange(0, i * block_size), + cl::NDRange(gws[0], gws1), + cl::NDRange(params[0], params[1]), + nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + } + } else { + timer->ClearTiming(); + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1]), + cl::NDRange(params[0], params[1]), nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + timer->AccumulateTiming(); + tuning_result->assign(params.begin(), params.end()); + + if (LimitKernelTime()) { + double elapse_time = timer->AccumulatedMicros(); + timer->ClearTiming(); + uint32_t num_blocks = std::min(static_cast(elapse_time / kMaxKernelExeTime) + 1, gws[1]); + (*tuning_result)[2] = num_blocks; + const uint32_t block_size = gws[1] / num_blocks; + if (gws[1] % num_blocks > 0) num_blocks++; + for (uint32_t i = 0; i < num_blocks; ++i) { + uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; + error = runtime->command_queue().enqueueNDRangeKernel( + kernel, + cl::NDRange(0, i * block_size), + cl::NDRange(gws[0], gws1), + cl::NDRange(params[0], params[1]), nullptr, &event); + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + timer->AccumulateTiming(); + } + } + } + return error; + }; + OpenCLProfilingTimer timer(&event); + Tuner::Get()->template TuneOrRun(tuning_key, + lws, + params_generator, + func, + &timer); + if (future != nullptr) { + future->wait_fn = [runtime, event](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + runtime->GetCallStats(event, stats); + } + }; + } + +} + } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 2927dbfff77000166027cd377ff05dc1337bcc00..466064b6d8b6ab98a09ec001fb46cace22447b78 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -14,9 +14,11 @@ namespace mace { namespace kernels { +const float kMaxKernelExeTime = 1000.0; // microseconds + enum BufferType { FILTER = 0, - IN_OUT= 1, + IN_OUT = 1, ARGUMENT = 2 }; @@ -32,6 +34,19 @@ std::string DtToCLDt(const DataType dt); std::string DtToUpstreamCLDt(const DataType dt); +void TuningOrRun3DKernel(cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + std::vector &lws, + StatsFuture *future); + + +void TuningOrRun2DKernel(cl::Kernel &kernel, + const std::string tuning_key, + const uint32_t *gws, + std::vector &lws, + StatsFuture *future); + inline void SetFuture(StatsFuture *future, const cl::Event &event) { if (future != nullptr) { future->wait_fn = [event](CallStats *stats) { @@ -43,10 +58,15 @@ inline void SetFuture(StatsFuture *future, const cl::Event &event) { } } +inline bool LimitKernelTime() { + const char *flag = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME"); + return flag != nullptr && strlen(flag) == 1 && flag[0] == '1'; +} + namespace { template void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) { - (*ss) << v; + (*ss) << v; } template @@ -54,8 +74,8 @@ void AppendToStream(std::stringstream *ss, const std::string &delimiter, T first, Args... 
args) { - (*ss) << first << delimiter; - AppendToStream(ss, delimiter, args...); + (*ss) << first << delimiter; + AppendToStream(ss, delimiter, args...); } } // namespace diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index 79a6f102930e69cec40ce2447fa5a4dcc83bbf2b..b147c15ad1e34def84560c4fd81da2988d1b8c89 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -60,67 +60,17 @@ static void Pooling(const Tensor *input, static_cast(batch * out_height), }; const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel); - std::vector lws(3, 0); + std::vector lws(4, 1); lws[0] = std::min(channel_blocks, kwg_size); lws[1] = std::min(out_width, kwg_size / lws[0]); lws[2] = std::min(out_height * batch, kwg_size / (lws[0] * lws[1])); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(out_width, kwg_size / local_ws[0]); - local_ws[2] = std::min(out_height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}, - {4, 15, 8}, //SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - pooling_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; std::stringstream ss; ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun3DKernel(pooling_kernel, ss.str(), gws, lws, future); } template diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index dc0d8cd08cbd0eeb24a6a46c23bdb37813ebbba2..f8d3aed2a3cb232aafe54d9713dd8efd7635bddb 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -59,60 +59,14 @@ void ResizeBilinearFunctor::operator()( const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(out_width), static_cast(out_height * batch)}; - const std::vector lws = {8, 16, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(out_width, kwg_size / local_ws[0]); - local_ws[2] = std::min(out_height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 
16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {1, kwg_size, 1}, - {4, 15, 8}, //SNPE size - }; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - rb_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun3DKernel(rb_kernel, ss.str(), gws, lws, future); } template struct ResizeBilinearFunctor; diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index bfc75e73f8786a67ae7dc19723f3e5ff03d6f476..e47a4f8956397424475dd14026b205a0b698485c 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -41,64 +41,14 @@ void SoftmaxFunctor::operator()(const Tensor *logits, const uint32_t gws[3] = {static_cast(channel_blocks), static_cast(width), static_cast(height * batch)}; - const std::vector lws = {8, 16, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(channel_blocks, kwg_size); - local_ws[1] = std::min(width, kwg_size / local_ws[0]); - local_ws[2] = std::min(height * batch, kwg_size / (local_ws[0] * local_ws[1])); - return {{4, 15, 8}, //SNPE size - {local_ws[0], local_ws[1], local_ws[2]}, - {local_ws[2], local_ws[1], local_ws[0]}, - {kwg_size / 16, 4, 4}, - {kwg_size / 32, 4, 8}, - {kwg_size / 32, 8, 4}, - {kwg_size / 64, 8, 8}, - {kwg_size / 64, 16, 4}, - {kwg_size / 128, 8, 16}, - {kwg_size / 128, 16, 8}, - {kwg_size / 128, 32, 4}, - {1, kwg_size / 32, 32}, - {1, kwg_size / 64, 64}, - {1, kwg_size / 128, 128}, - {3, 15, 9}, - {7, 15, 9}, - {9, 7, 15}, - {15, 7, 9}, - {1, kwg_size, 1}}; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - softmax_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun3DKernel(softmax_kernel, ss.str(), gws, lws, future); } template diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index 
1fd5bf1a5d1199cbf1fb4139b2c4d83b7b0d9408..8ef3f7c45e4c9bd61c0d02aa6e7d0e0dfdb75d82 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -61,58 +61,14 @@ void SpaceToBatchFunctor::operator()(Tensor *space_tensor const uint32_t gws[3] = {chan_blk, static_cast(batch_tensor->dim(2)), static_cast(batch_tensor->dim(0) * batch_tensor->dim(1))}; - const std::vector lws = {8, 16, 8}; - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel); - auto params_generator = [&]() -> std::vector> { - std::vector local_ws(3, 0); - local_ws[0] = std::min(chan_blk, kwg_size); - local_ws[1] = std::min(32, kwg_size / local_ws[0]); - local_ws[2] = std::min(32, kwg_size / (local_ws[0] * local_ws[1])); - return {{local_ws[0], local_ws[1], local_ws[2]}, - {4, 32, 8}, - {4, 64, 4}, - {4, 128, 2}, - {8, 16, 8}, - {8, 32, 4}, - {8, 64, 2}, - {16, 8, 8}, - {16, 16, 4}, - {16, 32, 2}, - {32, 8, 4}, - {32, 16, 2}, - {64, 4, 4}}; - }; - cl::Event event; - auto func = [&](const std::vector ¶ms) -> cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - s2b_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, &event); - - MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; - return error; - }; + std::vector lws = {8, 16, 8, 1}; std::stringstream ss; ss << kernel_name << "_" << batch_tensor->dim(0) << "_" << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" << batch_tensor->dim(3); - OpenCLProfilingTimer timer(&event); - Tuner::Get()->template TuneOrRun(ss.str(), - lws, - params_generator, - func, - &timer); - if (future != nullptr) { - future->wait_fn = [runtime, event](CallStats *stats) { - event.wait(); - if (stats != nullptr) { - runtime->GetCallStats(event, stats); - } - }; - } + TuningOrRun3DKernel(s2b_kernel, ss.str(), gws, lws, future); } template struct SpaceToBatchFunctor; diff --git a/mace/utils/timer.h b/mace/utils/timer.h index cee4411e278abc3dce303c15f02ae8c37acfef1a..ca0c2b3ca04a8af260b9cbfacbcca2a5a02906cb 100644 --- a/mace/utils/timer.h +++ b/mace/utils/timer.h @@ -10,29 +10,50 @@ namespace mace { class Timer { - public: - virtual void StartTiming() = 0; - virtual void StopTiming() = 0; - virtual double ElapsedMicros() = 0; + public: + virtual void StartTiming() = 0; + virtual void StopTiming() = 0; + virtual void AccumulateTiming() = 0; + virtual void ClearTiming() = 0; + virtual double ElapsedMicros() = 0; + virtual double AccumulatedMicros() = 0; }; class WallClockTimer : public Timer { - public: - void StartTiming() override { - start_micros_ = mace::utils::NowMicros(); - } - - void StopTiming() override { - stop_micros_ = mace::utils::NowMicros(); - } - - double ElapsedMicros() override { - return stop_micros_ - start_micros_; - } - - private: - double start_micros_; - double stop_micros_; + public: + WallClockTimer() : accumulated_micros_(0) {} + + void StartTiming() override { + start_micros_ = mace::utils::NowMicros(); + } + + void StopTiming() override { + stop_micros_ = mace::utils::NowMicros(); + } + + void AccumulateTiming() override { + StopTiming(); + accumulated_micros_ += stop_micros_ - start_micros_; + } + + void ClearTiming() override { + start_micros_ = 0; + stop_micros_ = 0; + accumulated_micros_ = 0; + } + + double ElapsedMicros() override { + return stop_micros_ - start_micros_; + } + + double AccumulatedMicros() override { + return accumulated_micros_; + } + + private: + double start_micros_; + double 
stop_micros_; + double accumulated_micros_; }; } // namespace mace diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index b7364e66a72b5861d8d67801c79b921029b6c04a..369152819afb67c554c8c057777fc91d9b3e1349 100644 --- a/mace/utils/tuner.h +++ b/mace/utils/tuner.h @@ -41,10 +41,10 @@ class Tuner { template RetType TuneOrRun( const std::string param_key, - const std::vector &default_param, + std::vector &default_param, const std::function>()> ¶m_generator, - const std::function &)> &func, + const std::function &, Timer *, std::vector *)> &func, Timer *timer) { std::string obfucated_param_key = MACE_OBFUSCATE_SYMBOL(param_key); if (IsTuning() && param_generator != nullptr) { @@ -60,12 +60,12 @@ class Tuner { if (param_table_.find(obfucated_param_key) != param_table_.end()) { VLOG(1) << param_key << ": " << internal::MakeString(param_table_[obfucated_param_key]); - return func(param_table_[obfucated_param_key]); + return func(param_table_[obfucated_param_key], nullptr, nullptr); } else { #ifndef MACE_DISABLE_NO_TUNING_WARNING LOG(WARNING) << "Fallback to default parameter: " << param_key; #endif - return func(default_param); + return func(default_param, nullptr, nullptr); } } } @@ -119,18 +119,17 @@ class Tuner { template inline RetType Run( - const std::function &)> &func, - const std::vector ¶ms, + const std::function &, Timer *, std::vector *)> &func, + std::vector ¶ms, Timer *timer, int num_runs, - double *time_us) { + double *time_us, + std::vector *tuning_result) { RetType res; int64_t total_time_us = 0; for (int i = 0; i < num_runs; ++i) { - timer->StartTiming(); - res = func(params); - timer->StopTiming(); - total_time_us += timer->ElapsedMicros(); + res = func(params, timer, tuning_result); + total_time_us += timer->AccumulatedMicros(); } *time_us = total_time_us * 1.0 / num_runs; @@ -141,24 +140,25 @@ class Tuner { inline RetType Tune( const std::function>()> ¶m_generator, - const std::function &)> &func, + const std::function &, Timer *, std::vector *)> &func, Timer *timer, std::vector *opt_params) { RetType res; double opt_time = std::numeric_limits::max(); auto params = param_generator(); - for (const auto ¶m : params) { + std::vector tuning_result; + for (auto param : params) { double tmp_time = 0.0; // warm up - Run(func, param, timer, 2, &tmp_time); + Run(func, param, timer, 2, &tmp_time, &tuning_result); // run - RetType tmp_res = Run(func, param, timer, 10, &tmp_time); + RetType tmp_res = Run(func, param, timer, 10, &tmp_time, &tuning_result); // Check the execution time if (tmp_time < opt_time) { opt_time = tmp_time; - *opt_params = param; + *opt_params = tuning_result; res = tmp_res; } } diff --git a/tools/export_lib.sh b/tools/export_lib.sh index abcaf6145b9b8583eecf669088d8258197117adb..cb0028376b5b25da5eb4431c9d9b9633077f3d1e 100755 --- a/tools/export_lib.sh +++ b/tools/export_lib.sh @@ -68,7 +68,6 @@ build_target() --copt="-D_GLIBCXX_USE_C99_MATH_TR1" \ --copt="-Werror=return-type" \ --copt="-DMACE_OBFUSCATE_LITERALS" \ - $TUNING_MODE_BUILD_FLAGS \ $DSP_MODE_BUILD_FLAGS || exit -1 } diff --git a/tools/wino_conv.py b/tools/wino_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..a8cdf3d8e88586b10dd3256de3670978c2a2e5f2 --- /dev/null +++ b/tools/wino_conv.py @@ -0,0 +1,141 @@ +import numpy as np +import math +import tensorflow as tf + +A_T = np.array([[1, 1, 1, 0], [0, 1, -1, -1]]).astype(np.float32) +A = np.transpose(A_T) +B_T = np.array([ + [1, 0, -1, 0], + [0, 1, 1, 0], + [0, -1, 1, 0], + [0, 1, 0, -1] +]).astype(np.float32) +B = 
np.transpose(B_T) +G = np.array([ + [1, 0, 0], + [0.5, 0.5, 0.5], + [0.5, -0.5, 0.5], + [0, 0, 1], +]).astype(np.float32) +G_T = np.transpose(G) + + +def output_shape(input_shape, filter_shape): + out_shape = np.zeros(4).astype(np.int32) + out_shape[0] = input_shape[0] + out_shape[1] = filter_shape[0] + out_shape[2] = input_shape[2] - 2 + out_shape[3] = input_shape[3] - 2 + return out_shape + + +def winog_conv(input, filter): + m = 2 + r = 3 + alpha = m + r - 1 + input_shape = input.shape + filter_shape = filter.shape + out_shape = output_shape(input_shape, filter_shape) + + K = filter_shape[0] + C = input_shape[1] + U = np.zeros((K * 16, C)) + + for k in range(K): + for c in range(C): + u = np.dot(np.dot(G, filter[k, c, :, :]), G_T) + for i in range(4): + for j in range(4) : + U[(i * 4 + j) * K + k, c] = u[i, j] + + print 'filter out: ', U.shape + print U[0, 0] + U.astype(np.float32).tofile("filter_out") + + rounded_h = int(math.ceil(out_shape[2] / 2.0)) + rounded_w = int(math.ceil(out_shape[3] / 2.0)) + P = input_shape[0] * rounded_h * rounded_w + V = np.zeros((C * 16, P)) + for p in range(P): + for c in range(C): + n = p / (rounded_w * rounded_h) + t = p % (rounded_h * rounded_w) + h_idx = t / rounded_w + w_idx = t % rounded_w + h_start = h_idx * 2 + w_start = w_idx * 2 + h_end = min(h_start+4, input_shape[2]) + w_end = min(w_start+4, input_shape[3]) + d = np.zeros((4, 4)) + d[0:h_end-h_start, 0:w_end-w_start] = input[n, c, h_start:h_end, w_start:w_end] + v = np.dot(np.dot(B_T, d), B) + for i in range(4): + for j in range(4): + V[(i*4+j)*C + c, p] = v[i, j] + + tmp = V.reshape(16, C, P, 1) + print 'input out: ', tmp.shape + tmp.astype(np.float32).tofile("C") + M = np.zeros((16 * K, P)) + for i in range(alpha * alpha): + u = U[i * K : (i+1) * K, :] + v = V[i * C : (i+1) * C, :] + M[i * K : (i+1) * K, :] = np.dot(u, v) + + print 'M shape: ', M.shape + M.astype(np.float32).tofile("gemm") + res = np.zeros((out_shape[0], out_shape[2], out_shape[3], out_shape[1])) + for k in range(K): + for b in range(P): + m = np.zeros((4, 4)) + for i in range(4): + for j in range(4): + m[i][j] = M[(i*4+j) * K + k, b] + y = np.dot(np.dot(A_T, m), A) + for i in range(2): + for j in range(2): + n = b / (rounded_h * rounded_w) + t = b % (rounded_h * rounded_w) + p = (t / rounded_w) * 2 + i + q = (t % rounded_w) * 2 + j + if p >= out_shape[2] or q >= out_shape[3]: + continue + res[n, p, q, k] = y[i, j] + + print 'Res shape: ', res.shape + res.astype(np.float32).tofile("res") + + return res + +def tf_conv(input, filter): + conv_op = tf.nn.conv2d(input, filter, [1, 1, 1, 1], 'VALID') + with tf.Session() as sess: + res = sess.run(conv_op) + return res + + +def main(): + input = np.random.random([7, 61, 71, 31]).astype(np.float32) + # input = np.fromfile(file="A", dtype=np.float32) + # input = input.reshape(1, 3, 3, 5) + print 'input shape: ', input.shape + input.tofile("A") + filter = np.random.random([3, 3, 31, 31]).astype(np.float32) + tf_out = tf_conv(input, filter) + input = input.transpose((0, 3, 1, 2)) + filter = filter.transpose((3, 2, 0, 1)) + print 'filter shape: ', filter.shape + filter.tofile("filter_in") + winog_out = winog_conv(input, filter) + res = np.allclose(tf_out, winog_out) + if res: + print "=========Pass=========" + else: + print "=========Failed=========" + print "TF: ", tf_out + print "Winograd: ", winog_out + + +if __name__ == '__main__': + main() +
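The new TuningOrRun3DKernel / TuningOrRun2DKernel helpers in mace/kernels/opencl/helper.cc append a block count as the last element of each tuned parameter set: when the MACE_LIMIT_OPENCL_KERNEL_TIME environment variable is exactly "1" (see LimitKernelTime() in helper.h), a kernel whose measured time exceeds kMaxKernelExeTime (1000 microseconds) is re-enqueued as several smaller NDRange launches along the last global dimension. Below is a minimal standalone sketch of just that splitting arithmetic; SplitGlobalDim is a hypothetical name for illustration only, and the OpenCL enqueue itself is omitted.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors the block-splitting logic in TuningOrRun3DKernel: pick how many
// serialized launches keep each one under the time budget, then compute the
// global size of every launch along the split dimension (hypothetical helper,
// not part of MACE).
std::vector<uint32_t> SplitGlobalDim(uint32_t gws_last,      // e.g. gws[2]
                                     double elapsed_micros,  // measured kernel time
                                     double max_micros = 1000.0 /* kMaxKernelExeTime */) {
  uint32_t num_blocks = std::min(
      static_cast<uint32_t>(elapsed_micros / max_micros) + 1, gws_last);
  const uint32_t block_size = gws_last / num_blocks;
  if (gws_last % num_blocks > 0) num_blocks++;  // one extra launch for the remainder
  std::vector<uint32_t> launches;
  for (uint32_t i = 0; i < num_blocks; ++i) {
    launches.push_back(i == num_blocks - 1 ? gws_last - i * block_size : block_size);
  }
  return launches;
}

int main() {
  // A 4.5 ms kernel over gws[2] = 113 becomes five launches of 22 work items
  // plus a trailing launch of 3; prints "22 22 22 22 22 3".
  for (uint32_t g : SplitGlobalDim(113, 4500.0)) std::cout << g << " ";
  std::cout << "\n";
}
```

Note that the cached tuning result stores the block count computed before the remainder check, while the enqueue loop adds one extra launch whenever the split is not exact, sizing the final launch to cover the leftover work items.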
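mace/utils/timer.h extends the Timer interface with AccumulateTiming(), ClearTiming() and AccumulatedMicros() so that a tuning callback which enqueues a kernel several times (as the split launches above do) can report the total execution time rather than only the last interval; Tuner::Run now reads timer->AccumulatedMicros() after each call. The following is a small self-contained illustration of the intended call pattern, using std::chrono in place of mace::utils::NowMicros and of the OpenCL event profiling behind OpenCLProfilingTimer; AccumulatingTimer is an illustrative stand-in, not a MACE class.

```cpp
#include <chrono>
#include <iostream>
#include <thread>

// A std::chrono-based stand-in for mace::WallClockTimer, showing how
// AccumulateTiming() folds several timed intervals into one running total.
class AccumulatingTimer {
  using Clock = std::chrono::steady_clock;

 public:
  void StartTiming() { start_ = Clock::now(); }
  void StopTiming() { stop_ = Clock::now(); }
  void AccumulateTiming() {
    StopTiming();
    accumulated_micros_ += std::chrono::duration_cast<std::chrono::microseconds>(
                               stop_ - start_).count();
  }
  void ClearTiming() {
    start_ = stop_ = Clock::time_point();
    accumulated_micros_ = 0;
  }
  double ElapsedMicros() const {
    return std::chrono::duration_cast<std::chrono::microseconds>(stop_ - start_).count();
  }
  double AccumulatedMicros() const { return accumulated_micros_; }

 private:
  Clock::time_point start_, stop_;
  double accumulated_micros_ = 0;
};

int main() {
  AccumulatingTimer timer;
  timer.ClearTiming();
  for (int block = 0; block < 3; ++block) {  // e.g. three split kernel launches
    timer.StartTiming();
    std::this_thread::sleep_for(std::chrono::milliseconds(2));  // stand-in for one launch
    timer.AccumulateTiming();
  }
  // At least roughly 6000 us: every 2 ms interval is added to the total.
  std::cout << "total: " << timer.AccumulatedMicros() << " us\n";
}
```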
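The change to mace/utils/tuner.h reworks the callback contract: TuneOrRun's functor now receives a Timer pointer (nullptr when a cached parameter set is replayed) plus an output vector, and Tune() caches whatever the functor writes into that vector rather than the candidate itself, which is how the adjusted block count reaches the parameter table. A toy sketch of that contract follows; Timer here is an illustrative stand-in and KernelFunc a hypothetical alias (the real functor returns cl_int and actually launches a kernel).

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

struct Timer {};  // stand-in for mace::Timer; only its (non-)nullness matters here

// The revised callback shape used by Tuner::TuneOrRun (illustrative alias).
using KernelFunc = std::function<int(const std::vector<uint32_t> &params,
                                     Timer *timer,
                                     std::vector<uint32_t> *tuning_result)>;

int main() {
  KernelFunc func = [](const std::vector<uint32_t> &params, Timer *timer,
                       std::vector<uint32_t> *tuning_result) -> int {
    if (timer == nullptr) {
      // Replay path: params[3] already carries the tuned block count.
      return 0;
    }
    // Tuning path: time the launch, then record the parameters that should
    // actually be cached, here with a pretend two-way split.
    tuning_result->assign(params.begin(), params.end());
    (*tuning_result)[3] = 2;
    return 0;
  };

  std::vector<uint32_t> candidate = {8, 16, 8, 1};
  std::vector<uint32_t> tuned;
  Timer timer;
  func(candidate, &timer, &tuned);
  // Tuner::Tune keeps `tuned` (not `candidate`) as the optimal parameters.
  std::cout << "cached block count: " << tuned[3] << "\n";  // prints 2
}
```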
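The new tools/wino_conv.py script checks a Winograd F(2x2, 3x3) convolution against tf.nn.conv2d. For reference, the per-output-tile transform it implements with the A, B and G matrices defined at the top of the script is

\[
Y_{k} = A^{T}\left[\,\sum_{c}\left(G\,g_{k,c}\,G^{T}\right)\odot\left(B^{T}\,d_{c}\,B\right)\right]A
\]

where d_c is a 4x4 input tile of channel c, g_{k,c} the corresponding 3x3 filter, the circle denotes an element-wise product, and Y_k the resulting 2x2 output tile. The script batches these products into 16 GEMMs: with its default shapes (input 7x61x71x31 NHWC, filter 3x3x31x31) the valid output is 7x59x69x31, there are P = 7 * ceil(59/2) * ceil(69/2) = 7 * 30 * 35 = 7350 tile positions, and U, V and M have shapes 496x31, 496x7350 and 496x7350 respectively (496 = 16 * 31).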