diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index 473e5fb58b1dd958180f5c9786ccba0ece232d90..5575a0b1b70868e18a859131065ad4b498b27e43 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -64,84 +64,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                     static_cast<uint32_t>(width),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size =
-      runtime->GetKernelMaxWorkGroupSize(activation_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            activation_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            activation_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::string tuning_key =
       Concat("relu_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(activation_kernel, tuning_key, gws, lws, future);
 }
 
 template struct ActivationFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index 946e74cf8a6e2e5b7f6e0c0f2052c55ef2ce42e7..261efde071ee3b200c3a35290e685b43297ec956 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -49,89 +49,14 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
                     static_cast<uint32_t>(width_pixels),
                     static_cast<uint32_t>(batch_height_pixels)
   };
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
   std::vector<uint32_t> lws = {64, 16, 1};
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    uint32_t local_ws[2];
-    local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
-    local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
-    return {{local_ws[0], local_ws[1], 1},
-            {local_ws[1], local_ws[0], 1},
-            {kwg_size / 4, 4, 1},
-            {kwg_size / 16, 16, 1},
-            {kwg_size / 32, 32, 1},
-            {kwg_size / 64, 64, 1},
-            {kwg_size / 128, 128, 1},
-            {kwg_size / 256, 256, 1},
-            {kwg_size / 512, 512, 1},
-            {kwg_size, 1, 1},
-            {1, kwg_size, 1}
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[1] / num_blocks;
-      if (gws[1] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws1 =
-            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            addn_kernel,
-            cl::NDRange(0, i * block_size),
-            cl::NDRange(gws[0], gws1),
-            cl::NDRange(params[0], params[1]),
-            nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          addn_kernel, cl::NullRange,
-          cl::NDRange(gws[0], gws[1]),
-          cl::NDRange(params[0], params[1]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[1] / num_blocks;
-      if (gws[1] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws1 =
-            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            addn_kernel,
-            cl::NDRange(0, i * block_size),
-            cl::NDRange(gws[0], gws1),
-            cl::NDRange(params[0], params[1]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << "addn_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun2DKernel(addn_kernel, ss.str(), gws, lws, future);
 }
 
 template <typename T>
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 29a5f2fa694256e3a087a2374d65da3f8e35da9c..02ab76a85eedcdeb735c69937a326522fcf6b273 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -84,83 +84,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
                     static_cast<uint32_t>(width),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {8, 128, 1, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            bm_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            bm_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::string tuning_key =
       Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3), folded_constant_);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(bm_kernel, tuning_key, gws, lws, future);
 }
 
 template struct BatchNormFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index 23082529f94abcbb560bdc78a18c19ff4e2e612a..b47a096efd2d2472e50b510e722e7142740fb332 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -50,96 +50,14 @@ static void Concat2(const Tensor *input0,
       static_cast<uint32_t>(width),
       static_cast<uint32_t>(batch * height),
   };
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2], 1},
-            {local_ws[2], local_ws[1], local_ws[0], 1},
-            {kwg_size / 16, 4, 4, 1},
-            {kwg_size / 32, 4, 8, 1},
-            {kwg_size / 32, 8, 4, 1},
-            {kwg_size / 64, 8, 8, 1},
-            {kwg_size / 64, 16, 4, 1},
-            {kwg_size / 128, 8, 16, 1},
-            {kwg_size / 128, 16, 8, 1},
-            {kwg_size / 128, 32, 4, 1},
-            {1, kwg_size / 32, 32, 1},
-            {1, kwg_size / 64, 64, 1},
-            {1, kwg_size / 128, 128, 1},
-            {3, 15, 9, 1},
-            {7, 15, 9, 1},
-            {9, 7, 15, 1},
-            {15, 7, 9, 1},
-            {1, kwg_size, 1, 1},
-            {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            concat_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          concat_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            concat_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(concat_kernel, ss.str(), gws, lws, future);
 }
 
 template <typename T>
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index e4b4ab93942e8b2f39d1b1ff9216ae60c61fe216..d62fdf56535372d7fa98da2dad16395656c078bb 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -97,83 +97,10 @@ void Conv1x1(const Tensor *input,
                     static_cast<uint32_t>(width_blocks),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 15, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::string tuning_key =
       Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
 }
 
 extern void Conv2dOpenclK1x1S1(const Tensor *input,
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index a374ea514d603c711b70a39f8a9879e6abe81088..3875403862fd97e58f6e6279e0d4e6a92ab9a96c 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -95,83 +95,10 @@ static void Conv2d3x3S12(const Tensor *input,
                     static_cast<uint32_t>(width_blocks),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {4, 15, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::string tuning_key =
       Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
             output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
 }
 
 void Conv2dOpenclK3x3S1(const Tensor *input, const Tensor *filter,
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index d671d4d884e69f65ac8a6390d7fce4008d8422f2..2a96d8647668ab1abd5bc61c07f1af1c4b37a99e 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -97,83 +97,10 @@ void Conv2dOpencl(const Tensor *input,
                     static_cast<uint32_t>(width_blocks),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            conv_2d_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::string tuning_key =
       Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
              output->dim(1), output->dim(2), output->dim(3));
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
-      tuning_key, lws, params_generator, func, &timer);
-  SetFuture(future, event);
+  TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
 }
 
 }  // namespace kernels
diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc
index 2c1dc264bd5ac1ddaeeaf47ea54a6e8b9e32e13a..783a30243407653cc660375b542f2c8f896ac52e 100644
--- a/mace/kernels/opencl/helper.cc
+++ b/mace/kernels/opencl/helper.cc
@@ -4,6 +4,7 @@
 
 #include "mace/kernels/opencl/helper.h"
 #include "mace/utils/utils.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -100,5 +101,181 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
   }
 }
 
+
+void TuningOrRun3DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future) {
+  auto runtime = OpenCLRuntime::Global();
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    std::vector<uint32_t> local_ws(3, 0);
+    local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
+    local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
+    local_ws[2] = std::min<uint32_t>(gws[2],
+                                     kwg_size / (local_ws[0] * local_ws[1]));
+    return {
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
+    };
+  };
+  cl::Event event;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
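+      // Split the launch along gws[2] into num_blocks pieces (the tuned
+      // value carried in params.back()) so that no single enqueue runs
+      // longer than the kMaxKernelExeTime budget measured below.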
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
+    return error;
+  };
+  OpenCLProfilingTimer timer(&event);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
+      tuning_key, lws, params_generator, func, &timer);
+
+  if (future != nullptr) {
+    future->wait_fn = [event](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        OpenCLRuntime::Global()->GetCallStats(event, stats);
+      }
+    };
+  }
+}
+
+void TuningOrRun2DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future) {
+  auto runtime = OpenCLRuntime::Global();
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
+  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
+    uint32_t local_ws[2];
+    local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
+    local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
+    return {{local_ws[0], local_ws[1], 1},
+            {local_ws[1], local_ws[0], 1},
+            {kwg_size / 4, 4, 1},
+            {kwg_size / 16, 16, 1},
+            {kwg_size / 32, 32, 1},
+            {kwg_size / 64, 64, 1},
+            {kwg_size / 128, 128, 1},
+            {kwg_size / 256, 256, 1},
+            {kwg_size / 512, 512, 1},
+            {kwg_size, 1, 1},
+            {1, kwg_size, 1}
+    };
+  };
+  cl::Event event;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[1] / num_blocks;
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
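+        // The last block picks up the remainder when gws[1] is not an
+        // exact multiple of block_size.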
+        uint32_t gws1 =
+            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel,
+            cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]),
+            nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel, cl::NullRange,
+          cl::NDRange(gws[0], gws[1]),
+          cl::NDRange(params[0], params[1]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[1] / num_blocks;
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws1 =
+            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            kernel,
+            cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
+    return error;
+  };
+  OpenCLProfilingTimer timer(&event);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(tuning_key,
+                                                     lws,
+                                                     params_generator,
+                                                     func,
+                                                     &timer);
+  if (future != nullptr) {
+    future->wait_fn = [runtime, event](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        runtime->GetCallStats(event, stats);
+      }
+    };
+  }
+
+}
+
 }  // namespace kernels
 }  // namespace mace
diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h
index 34e787a8b97ee4246e7e9a61c87e0c474a2a06ec..cfbef59f7038285d462d635d172b892bd6de56a1 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -18,7 +18,7 @@ const float kMaxKernelExeTime = 1000.0;  // microseconds
 
 enum BufferType {
   FILTER = 0,
-  IN_OUT= 1,
+  IN_OUT = 1,
   ARGUMENT = 2
 };
 
@@ -34,6 +34,19 @@ std::string DtToCLDt(const DataType dt);
 
 std::string DtToUpstreamCLDt(const DataType dt);
 
+void TuningOrRun3DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future);
+
+
+void TuningOrRun2DKernel(cl::Kernel &kernel,
+                         const std::string tuning_key,
+                         const uint32_t *gws,
+                         std::vector<uint32_t> &lws,
+                         StatsFuture *future);
+
 inline void SetFuture(StatsFuture *future, const cl::Event &event) {
   if (future != nullptr) {
     future->wait_fn = [event](CallStats *stats) {
@@ -48,7 +61,7 @@ inline void SetFuture(StatsFuture *future, const cl::Event &event) {
 namespace {
 template <typename T>
 void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
-    (*ss) << v;
+  (*ss) << v;
 }
 
 template <typename T, typename... Args>
@@ -56,8 +69,8 @@ void AppendToStream(std::stringstream *ss, const std::string &delimiter,
                     T first, Args... args) {
-    (*ss) << first << delimiter;
-    AppendToStream(ss, delimiter, args...);
+  (*ss) << first << delimiter;
+  AppendToStream(ss, delimiter, args...);
 }
 
 }  // namespace
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 194ee133042bf62c3fdcc70cff97a8c910b0aeaf..b147c15ad1e34def84560c4fd81da2988d1b8c89 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -64,95 +64,13 @@ static void Pooling(const Tensor *input,
   lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
   lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
   lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(out_height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            pooling_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            pooling_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(pooling_kernel, ss.str(), gws, lws, future);
 }
 
 template <typename T>
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index 0ad87eeaf2ebcc43625411f39a76ba388be367bf..f8d3aed2a3cb232aafe54d9713dd8efd7635bddb 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -60,95 +60,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
                     static_cast<uint32_t>(out_width),
                     static_cast<uint32_t>(out_height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(out_height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            rb_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          rb_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            rb_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_"
      << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(rb_kernel, ss.str(), gws, lws, future);
 }
 
 template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index ca9c5fdbe868508e963eb737f3eeb19744c770f7..e47a4f8956397424475dd14026b205a0b698485c 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -42,95 +42,13 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
                     static_cast<uint32_t>(width),
                     static_cast<uint32_t>(height * batch)};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
-    local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(height * batch,
-                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            softmax_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          softmax_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            softmax_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1)
      << "_" << output->dim(2) << "_" << output->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(softmax_kernel, ss.str(), gws, lws, future);
 }
 
 template <typename T>
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index cf4762fc495cc0c3fa3af61bfba6ff40722cd0c7..8ef3f7c45e4c9bd61c0d02aa6e7d0e0dfdb75d82 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -62,95 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
       static_cast<uint32_t>(batch_tensor->dim(2)),
       static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
   std::vector<uint32_t> lws = {8, 16, 8, 1};
-  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel);
-  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
-    std::vector<uint32_t> local_ws(3, 0);
-    local_ws[0] = std::min<uint32_t>(chan_blk, kwg_size);
-    local_ws[1] = std::min<uint32_t>(32, kwg_size / local_ws[0]);
-    local_ws[2] = std::min<uint32_t>(32, kwg_size / (local_ws[0] * local_ws[1]));
-    return {
-        {local_ws[0], local_ws[1], local_ws[2], 1},
-        {kwg_size / 16, 4, 4, 1},
-        {kwg_size / 32, 4, 8, 1},
-        {kwg_size / 32, 8, 4, 1},
-        {kwg_size / 64, 8, 8, 1},
-        {kwg_size / 64, 16, 4, 1},
-        {kwg_size / 128, 8, 16, 1},
-        {kwg_size / 128, 16, 8, 1},
-        {kwg_size / 128, 32, 4, 1},
-        {1, kwg_size / 32, 32, 1},
-        {1, kwg_size / 64, 64, 1},
-        {1, kwg_size / 128, 128, 1},
-        {3, 15, 9, 1},
-        {7, 15, 9, 1},
-        {9, 7, 15, 1},
-        {15, 7, 9, 1},
-        {1, kwg_size, 1, 1},
-        {4, 15, 8, 1},  // SNPE size
-    };
-  };
-  cl::Event event;
-  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
-    cl_int error = CL_SUCCESS;
-    if (timer == nullptr) {
-      uint32_t num_blocks = params.back();
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            s2b_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      }
-    } else {
-      timer->StartTiming();
-      error = runtime->command_queue().enqueueNDRangeKernel(
-          s2b_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-      timer->StopTiming();
-      double elapse_time = timer->ElapsedMicros();
-      timer->ClearTiming();
-      uint32_t num_blocks = std::min(
-          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
-      params.back() = num_blocks;
-      const uint32_t block_size = gws[2] / num_blocks;
-      if (gws[2] % num_blocks > 0) num_blocks++;
-      for (uint32_t i = 0; i < num_blocks; ++i) {
-        uint32_t gws2 =
-            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
-        error = runtime->command_queue().enqueueNDRangeKernel(
-            s2b_kernel,
-            cl::NDRange(0, 0, i * block_size),
-            cl::NDRange(gws[0], gws[1], gws2),
-            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
-        timer->AccumulateTiming();
-      }
-    }
-    return error;
-  };
   std::stringstream ss;
   ss << kernel_name << "_" << batch_tensor->dim(0) << "_"
      << batch_tensor->dim(1) << "_" << batch_tensor->dim(2)
      << "_" << batch_tensor->dim(3);
-  OpenCLProfilingTimer timer(&event);
-  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
-                                                     lws,
-                                                     params_generator,
-                                                     func,
-                                                     &timer);
-  if (future != nullptr) {
-    future->wait_fn = [runtime, event](CallStats *stats) {
-      event.wait();
-      if (stats != nullptr) {
-        runtime->GetCallStats(event, stats);
-      }
-    };
-  }
+  TuningOrRun3DKernel(s2b_kernel, ss.str(), gws, lws, future);
 }
 
 template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>;
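
For reference, a minimal sketch of the call pattern every converted kernel above now follows; the kernel handle (my_kernel), the dimension variables, and the tuning-key prefix are illustrative placeholders, not code from this patch:

    // Hypothetical call site: build the global work size, pick a default
    // local work size plus a trailing kernel-split block count of 1, derive
    // a shape-specific tuning key, then let the helper tune (or just run).
    const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                             static_cast<uint32_t>(width),
                             static_cast<uint32_t>(height * batch)};
    std::vector<uint32_t> lws = {8, 16, 8, 1};
    std::string tuning_key =
        Concat("my_opencl_kernel_", output->dim(0), output->dim(1),
               output->dim(2), output->dim(3));
    TuningOrRun3DKernel(my_kernel, tuning_key, gws, lws, future);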